From: Marc Cromme Date: Tue, 22 May 2007 21:20:10 +0000 (+0000) Subject: finished test ICU stand-alone program for benchmarking of ICU tokenization and norma... X-Git-Tag: PAZPAR2.1.0.0~113 X-Git-Url: http://git.indexdata.com/?a=commitdiff_plain;h=d060969f7c7f2a41142ae5dfdb945cda973c91ee;p=pazpar2-moved-to-github.git finished test ICU stand-alone program for benchmarking of ICU tokenization and normalization. Works quite well: benchmarking on the James English Bible from Project Gutenberg (4.5 MB of plain text consisting of 870,000 individual tokens) took 3.5 seconds on a laptop. More testing/benchmarking is needed. --- diff --git a/src/Makefile.am b/src/Makefile.am index ded9a7f..f8a4d38 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,7 +1,6 @@ -# $Id: Makefile.am,v 1.23 2007-05-22 08:26:59 marc Exp $ +# $Id: Makefile.am,v 1.24 2007-05-22 21:20:10 marc Exp $ -bin_PROGRAMS = pazpar2 -# icu_chain_test +bin_PROGRAMS = pazpar2 icu_chain_test check_PROGRAMS = test_config \ test_icu_I18N \ @@ -35,8 +34,8 @@ libpazpar2_a_SOURCES = config.c config.h eventl.c eventl.h \ pazpar2_SOURCES = pazpar2.c pazpar2_LDADD = libpazpar2.a $(YAZLIB) $(ICU_LIBS) -#icu_chain_test_SOURCES = icu_chain_test.c icu_I18N -#icu_chain_test_LDADD = $(YAZLIB) $(ICU_LIBS) +icu_chain_test_SOURCES = icu_chain_test.c icu_I18N.c +icu_chain_test_LDADD = $(YAZLIB) $(ICU_LIBS) test_config_SOURCES = test_config.c test_config_LDADD = libpazpar2.a $(YAZLIB) diff --git a/src/icu_chain_test.c b/src/icu_chain_test.c index ac749e0..ddc926e 100644 --- a/src/icu_chain_test.c +++ b/src/icu_chain_test.c @@ -5,16 +5,26 @@ #include #include +#define _GNU_SOURCE +#include +#include + +//#include +#include + +#include +#include + #include "icu_I18N.h" -/* commando line parameters */ +/* commando line and config parameters */ static struct config_t { - //char infile[1024]; - //char locale[128]; - char conffile[1024]; - //char outfile[1024]; - int verbatim; - int print; + char conffile[1024]; + char print[1024]; + 
int xmloutput; + struct icu_chain * chain; + FILE * infile; + FILE * outfile; } config; @@ -24,9 +34,15 @@ void print_option_error(const struct config_t *p_config) fprintf(stderr, "Calling error, valid options are :\n"); fprintf(stderr, "icu_chain_test\n" " [-c (path/to/config/file.xml)]\n" - " [-p (c|l|t)] print available info \n" - " [-v] verbouse output\n" - "\n"); + " [-p (a|c|l|t)] print ICU info \n" + " [-x] XML output\n" + "\n" + "Examples:\n" + "cat hugetextfile.txt | ./icu_chain_test -c config.xml \n" + "./icu_chain_test -p c\n" + "./icu_chain_test -p l -x\n" + "./icu_chain_test -p t -x\n" + ); exit(1); } @@ -36,10 +52,15 @@ void read_params(int argc, char **argv, struct config_t *p_config){ /* set default parameters */ p_config->conffile[0] = 0; - + p_config->print[0] = 0; + p_config->xmloutput = 0; + p_config->chain = 0; + p_config->infile = stdin; + p_config->outfile = stdout; + /* set up command line parameters */ - while ((ret = options("c:p:v", argv, argc, &arg)) != -2) + while ((ret = options("c:p:x", argv, argc, &arg)) != -2) { switch (ret) { @@ -49,21 +70,26 @@ void read_params(int argc, char **argv, struct config_t *p_config){ case 'p': strcpy(p_config->print, arg); break; - case 'v': - if (arg) - p_config->verbatim = atoi(arg); - else - p_config->verbatim = 1; + case 'x': + p_config->xmloutput = 1; break; default: print_option_error(p_config); } } + + //p_config->infile = fopen("/etc/passwd", "r"); + - if (! 
strlen(p_config->conffile)) - print_option_error(); -} + if ((!strlen(p_config->conffile) + && !strlen(p_config->print)) + || !config.infile + || !config.outfile) + + print_option_error(p_config); +}; + /* UConverter *conv; */ /* conv = ucnv_open("utf-8", &status); */ @@ -85,20 +111,29 @@ void read_params(int argc, char **argv, struct config_t *p_config){ static void print_icu_converters(const struct config_t *p_config) { - int32_t count; - int32_t i; - - count = ucnv_countAvailable(); - printf("Available ICU converters: %d\n", count); - - for(i=0;ixmloutput) + fprintf(config.outfile, "\n", + count, ucnv_getDefaultName()); + else { + fprintf(config.outfile, "Available ICU converters: %d\n", count); + fprintf(config.outfile, "Default ICU Converter is: '%s'\n", ucnv_getDefaultName()); + } + + for(i=0;ixmloutput) + fprintf(config.outfile, "\n", ucnv_getAvailableName(i)); + else + fprintf(config.outfile, "%s ", ucnv_getAvailableName(i)); + } + + if (p_config->xmloutput) + fprintf(config.outfile, "\n"); + else + fprintf(config.outfile, "\n"); } static void print_icu_transliterators(const struct config_t *p_config) @@ -111,27 +146,26 @@ static void print_icu_transliterators(const struct config_t *p_config) int32_t buf_cap = 128; char buf[buf_cap]; - if (1 < p_config->verbatim){ - printf("\n"); - printf("\n\n", count); - } else - printf("Available ICU transliterators: %d\n", count); + if (p_config->xmloutput) + fprintf(config.outfile, "\n", count); + else + fprintf(config.outfile, "Available ICU transliterators: %d\n", count); for(i=0;iverbatim) - printf("\n", buf); + if (p_config->xmloutput) + fprintf(config.outfile, "\n", buf); else - printf(" %s", buf); + fprintf(config.outfile, " %s", buf); } - if (1 < p_config->verbatim){ - printf("\n\n"); + if (p_config->xmloutput){ + fprintf(config.outfile, "\n"); } else { - printf("\n\nUnicode Set Patterns:\n" + fprintf(config.outfile, "\n\nUnicode Set Patterns:\n" " Pattern Description\n" " Ranges [a-z] The lower case letters a 
through z\n" " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n" @@ -173,13 +207,11 @@ static void print_icu_transliterators(const struct config_t *p_config) " http://icu.sourceforge.net/userguide/Transform.html\n" " http://icu.sourceforge.net/userguide/TransformRule.html\n" ); - } - printf("\n\n"); - - - exit(0); + fprintf(config.outfile, "\n\n"); + + } } static void print_icu_xml_locales(const struct config_t *p_config) @@ -225,10 +257,10 @@ static void print_icu_xml_locales(const struct config_t *p_config) count = uloc_countAvailable() ; - if (1 < p_config->verbatim){ - printf(""); - printf("\n\n", - count, uloc_getDefault(), ucol_countAvailable()); + if (p_config->xmloutput){ + + fprintf(config.outfile, "\n", + count, uloc_getDefault(), ucol_countAvailable()); } for(i=0;iverbatim){ - printf("xmloutput){ + fprintf(config.outfile, ""); + fprintf(config.outfile, " localname=\"%s\"", localname_str); + fprintf(config.outfile, ">"); if (strlen(localname_str)) - printf("%s", localname_str); - printf("\n"); + fprintf(config.outfile, "%s", localname_str); + fprintf(config.outfile, "\n"); } - else if (1 == p_config->verbatim){ - printf("%s", uloc_getAvailable(i)); - printf(" | "); + else if (1 == p_config->xmloutput){ + fprintf(config.outfile, "%s", uloc_getAvailable(i)); + fprintf(config.outfile, " | "); if (strlen(name_str)) - printf("%s", name_str); - printf(" | "); + fprintf(config.outfile, "%s", name_str); + fprintf(config.outfile, " | "); if (strlen(localname_str)) - printf("%s", localname_str); - printf("\n"); + fprintf(config.outfile, "%s", localname_str); + fprintf(config.outfile, "\n"); } else - printf("%s ", uloc_getAvailable(i)); + fprintf(config.outfile, "%s ", uloc_getAvailable(i)); } - if (1 < p_config->verbatim) - printf("\n\n"); + if (p_config->xmloutput) + fprintf(config.outfile, "\n"); else - printf("\n"); + fprintf(config.outfile, "\n"); if(U_FAILURE(status)) { fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status)); exit(status); 
} - exit(0); } -int main(int argc, char **argv) { +static void print_info(const struct config_t *p_config) +{ + if (p_config->xmloutput) + fprintf(config.outfile, "\n" + "\n"); + + if ('c' == config.print[0]) + print_icu_converters(&config); + else if ('l' == config.print[0]) + print_icu_xml_locales(&config); + else if ('t' == config.print[0]) + print_icu_transliterators(&config); + else { + print_icu_converters(&config); + print_icu_xml_locales(&config); + print_icu_transliterators(&config); + } + + if (p_config->xmloutput) + fprintf(config.outfile, "\n"); - //LIBXML_TEST_VERSION; + exit(0); +}; - read_params(argc, argv, &config); - if (config.debug) - print_options(&config); - if ('c' == config.print[0]) - print_icu_converters(&config); +static void process_text_file(const struct config_t *p_config) +{ + char * line = 0; + size_t line_cap = 0; + ssize_t line_len; + + xmlDoc *doc = xmlParseFile(config.conffile); + xmlNode *xml_node = xmlDocGetRootElement(doc); + + long unsigned int token_count = 0; + long unsigned int line_count = 0; + + UErrorCode status = U_ZERO_ERROR; + int success = 0; + + + config.chain = icu_chain_xml_config(xml_node, &status); - if ('l' == config.print[0]) - print_icu_xml_locales(&config); + if (config.chain && U_SUCCESS(status)) + success = 1; - if ('t' == config.print[0]) - print_icu_transliterators(&config); - - //xmlCleanupParser(); - //xmlMemoryDump(); - return(0); -} + if (p_config->xmloutput) + fprintf(config.outfile, + "\n" + "\n" + "\n"); + + // read input lines for processing + while ((line_len = getline(&line, &line_cap, config.infile)) != -1) { + success = icu_chain_assign_cstr(config.chain, line, &status); + line_count++; + + while (success && icu_chain_next_token(config.chain, &status)){ + if (U_FAILURE(status)) + success = 0; + else { + token_count++; + if (p_config->xmloutput) + fprintf(config.outfile, + "\n", + token_count, + line_count, + icu_chain_get_norm(config.chain), + icu_chain_get_display(config.chain)); + else + 
fprintf(config.outfile, "%lu %lu '%s' '%s'\n", + token_count, + line_count, + icu_chain_get_norm(config.chain), + icu_chain_get_display(config.chain)); + } + } + + } + + if (p_config->xmloutput) + fprintf(config.outfile, + "\n" + "\n"); + + icu_chain_destroy(config.chain); + xmlFreeDoc(doc); + if (line) + free(line); +}; + + +int main(int argc, char **argv) +{ + + read_params(argc, argv, &config); + if (config.conffile && strlen(config.conffile)) + process_text_file(&config); + + if (config.print && strlen(config.print)) + print_info(&config); + + + return(0); +}; + + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */