From 858b677fcf4d534724f11c780a989f7d005b907e Mon Sep 17 00:00:00 2001 From: sam Date: Wed, 27 May 2009 05:04:10 +0000 Subject: [PATCH] Add multiple charsets support to img2twit, and autodetect charset when decoding. git-svn-id: file:///srv/caca.zoy.org/var/lib/svn/libpipi/trunk@3526 92316355-f0b4-4df1-b90c-862c8a59935f --- examples/img2twit.cpp | 125 +++++++++++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 43 deletions(-) diff --git a/examples/img2twit.cpp b/examples/img2twit.cpp index 0afdc77..9ff776e 100644 --- a/examples/img2twit.cpp +++ b/examples/img2twit.cpp @@ -26,40 +26,40 @@ #include "../genethumb/mygetopt.h" /* - * User-definable settings. + * Format-dependent settings. Change this and you risk making all other + * generated strings unusable. */ -/* The Unicode characters at disposal - XXX: must be _ordered_ */ -static const uint32_t unichars[] = -{ - /* Printable ASCII (except space) */ - //0x0021, 0x007f, - - /* Stupid symbols and Dingbats shit */ - //0x25a0, 0x2600, /* Geometric Shapes */ - //0x2600, 0x269e, 0x26a0, 0x26bd, 0x26c0, 0x26c4, /* Misc. Symbols */ - //0x2701, 0x2705, 0x2706, 0x270a, 0x270c, 0x2728, 0x2729, 0x274c, - // 0x274d, 0x274e, 0x274f, 0x2753, 0x2756, 0x2757, 0x2758, 0x275f, - // 0x2761, 0x2795, 0x2798, 0x27b0, 0x27b1, 0x27bf, /* Dingbats */ - - /* Chinese-looking stuff */ - //0x2e80, 0x2e9a, 0x2e9b, 0x2ef4, /* CJK Radicals Supplement */ - //0x2f00, 0x2fd6, /* Kangxi Radicals */ - //0x3400, 0x4db6, /* CJK Unified Ideographs Extension A */ - 0x4e00, 0x9fa6, /* CJK Unified Ideographs */ - - /* Korean - most people don't know the difference anyway */ - //0xac00, 0xd7a4, /* Hangul Syllables */ - - /* More Chinese */ - //0xf900, 0xfa2e, 0xfa30, 0xfa6b, 0xfa70, 0xfada, /* CJK Compat. Idgphs. */ - - /* TODO: there's also the U+20000 and U+2f800 planes, but they're - * not supported by the Twitter Javascript filter (yet?). */ - - /* End of list marker - XXX: don't remove! 
*/ - 0x0000, 0x0000 -}; +/* Printable ASCII (except space) */ +#define RANGE_ASCII 0x0021, 0x007f + +/* CJK Unified Ideographs */ +#define RANGE_CJK 0x4e00, 0x9fa6 +//0x2e80, 0x2e9a, 0x2e9b, 0x2ef4, /* CJK Radicals Supplement */ +//0x2f00, 0x2fd6, /* Kangxi Radicals */ +//0x3400, 0x4db6, /* CJK Unified Ideographs Extension A */ +//0xac00, 0xd7a4, /* Hangul Syllables -- Korean, not Chinese */ +//0xf900, 0xfa2e, 0xfa30, 0xfa6b, 0xfa70, 0xfada, /* CJK Compat. Idgphs. */ +/* TODO: there's also the U+20000 and U+2f800 planes, but they're + * not supported by the Twitter Javascript filter (yet?). */ + +/* Geometric Shapes, Miscellaneous Symbols and Dingbats */ +#define RANGE_SYMBOLS 0x25a0, 0x2600, /* Geometric Shapes */ \ + 0x2600, 0x269e, 0x26a0, 0x26bd, 0x26c0, 0x26c4, /* Misc. Symbols */ \ + 0x2701, 0x2705, 0x2706, 0x270a, 0x270c, 0x2728, 0x2729, 0x274c, \ + 0x274d, 0x274e, 0x274f, 0x2753, 0x2756, 0x2757, 0x2758, 0x275f, \ + 0x2761, 0x2795, 0x2798, 0x27b0, 0x27b1, 0x27bf /* Dingbats */ + +/* End of list marker */ +#define RANGE_END 0x0, 0x0 + +/* Pre-defined character lists, one per --charset value; each list ends with RANGE_END. XXX: must be _ordered_ */ +static const uint32_t unichars_ascii[] = { RANGE_ASCII, RANGE_END }; +static const uint32_t unichars_cjk[] = { RANGE_CJK, RANGE_END }; +static const uint32_t unichars_symbols[] = { RANGE_SYMBOLS, RANGE_END }; + +/* The Unicode characters at disposal: points to one of the lists above, or stays NULL until main() has chosen a charset */ +static const uint32_t *unichars; /* The maximum image size we want to support */ #define MAX_W 4000 @@ -794,11 +794,12 @@ static void analyse(pipi_image_t *src) int main(int argc, char *argv[]) { + uint32_t unicode_data[4096]; /* FIXME: allocate this dynamically. NOTE(review): the decoding loop stores MAX_MSG_LEN entries here and no check against 4096 is visible in this patch -- confirm */ int opstats[2 * NB_OPS]; char const *srcname = NULL, *dstname = NULL; pipi_image_t *src, *tmp, *dst; double error = 1.0; - int width, height, ret = 0; + int width, height; /* Parse command-line options */ for(;;) @@ -808,12 +809,13 @@ int main(int argc, char *argv[]) { { "output", 1, NULL, 'o' }, { "length", 1, NULL, 'l' }, + { "charset", 1, NULL, 'c' }, { "quality", 1, NULL, 'q' 
}, { "debug", 0, NULL, 'd' }, { "help", 0, NULL, 'h' }, { NULL, 0, NULL, 0 }, }; - int c = mygetopt(argc, argv, "o:l:q:dh", long_options, &option_index); + int c = mygetopt(argc, argv, "o:l:c:q:dh", long_options, &option_index); if(c == -1) break; @@ -831,6 +833,20 @@ int main(int argc, char *argv[]) MAX_MSG_LEN = 16; } break; + case 'c': /* Select the character list by name */ + if(!strcmp(myoptarg, "ascii")) + unichars = unichars_ascii; + else if(!strcmp(myoptarg, "cjk")) + unichars = unichars_cjk; + else if(!strcmp(myoptarg, "symbols")) + unichars = unichars_symbols; + else + { + fprintf(stderr, "Error: invalid char block \"%s\".", myoptarg); /* NOTE(review): no space or newline separates this from the next message */ + fprintf(stderr, "Valid sets are: ascii, cjk, symbols\n"); + return EXIT_FAILURE; + } + break; case 'q': ITERATIONS_PER_POINT = 10 * atof(myoptarg); if(ITERATIONS_PER_POINT < 0) @@ -849,6 +865,7 @@ int main(int argc, char *argv[]) printf("Mandatory arguments to long options are mandatory for short options too.\n"); printf(" -o, --output output resulting image to filename\n"); printf(" -l, --length message length in characters (default 140)\n"); + printf(" -c, --charset character set to use (ascii, [cjk], symbols)\n"); printf(" -q, --quality set image quality (0 - 10) (default 5)\n"); printf(" -d, --debug print debug information\n"); printf(" -h, --help display this help and exit\n"); @@ -879,6 +896,32 @@ int main(int argc, char *argv[]) if(myoptind == argc - 1) srcname = argv[myoptind]; + /* Decoding mode: read MAX_MSG_LEN values with fread_utf8() from stdin up front; the first one drives charset autodetection below */ + if(dstname) + for(int i = 0; i < MAX_MSG_LEN; i++) + unicode_data[i] = fread_utf8(stdin); + + /* No charset chosen with -c: when decoding, autodetect it from the first value read; otherwise default to CJK. 
*/ + if(!unichars) + { + if(dstname) /* Compare the first value with the overall bounds of each set */ + { + if(unicode_data[0] >= 0x0021 && unicode_data[0] < 0x007f) + unichars = unichars_ascii; + else if(unicode_data[0] >= 0x4e00 && unicode_data[0] < 0x9fa6) + unichars = unichars_cjk; + else if(unicode_data[0] >= 0x25a0 && unicode_data[0] < 0x27bf) + unichars = unichars_symbols; + else + { + fprintf(stderr, "Error: unable to detect charset\n"); + return EXIT_FAILURE; + } + } + else + unichars = unichars_cjk; + } + pipi_set_gamma(1.0); /* Precompute bit allocation */ @@ -903,14 +946,10 @@ int main(int argc, char *argv[]) if(dstname) { - /* Decoding mode: read UTF-8 text from stdin, find each - * character's index in our character list, and push it to our - * wonderful custom bitstream. */ - uint32_t data[MAX_MSG_LEN]; - for(int i = 0; i < MAX_MSG_LEN; i++) - data[i] = uni2index(fread_utf8(stdin)); + /* Decoding mode: pass each value read earlier through uni2index() and + * push the results, last entry first, to our custom bitstream. */ for(int i = MAX_MSG_LEN; i--; ) - b.push(data[i], NUM_CHARACTERS); + b.push(uni2index(unicode_data[i]), NUM_CHARACTERS); /* Read width and height from bitstream */ src = NULL; @@ -932,7 +971,7 @@ int main(int argc, char *argv[]) height = pipi_get_image_height(src); } - /* Compute best w/h ratio */ + /* Compute "best" w/h ratio */ dw = 1; dh = TOTAL_CELLS; for(unsigned int i = 1; i <= TOTAL_CELLS; i++) { @@ -1160,6 +1199,6 @@ int main(int argc, char *argv[]) pipi_free(dst); } - return ret; + return EXIT_SUCCESS; }