/* This file is part of the YAZ toolkit.
- * Copyright (C) 1995-2011 Index Data
+ * Copyright (C) Index Data
* See the file LICENSE for details.
*/
#include <unicode/ucnv.h>
#include <unicode/ustring.h>
-#include <unicode/ucol.h>
+#include <unicode/ucol.h>
#include <unicode/ubrk.h>
#include <unicode/utrans.h>
#include <unicode/uclean.h>
#include <yaz/wrbuf.h>
/* Command line and config parameters.
 *
 * Filled in by read_params() and then passed (by pointer) to the
 * printing and text-processing routines of this program.
 */
-struct config_t {
+struct config_t {
char conffile[1024];  /* XML configuration file name (option -c); handed to xmlParseFile() */
char print[1024];     /* argument of option -p (a|c|l|t); empty string when not given */
int xmloutput;        /* set to 1 by option -x: emit XML instead of plain text */
int sortoutput;       /* when non-zero the sort key is included in token output
                         (presumably set by option -s; the assignment is not visible here) */
+ int org_output;     /* set to 1 by option -o (default 0): append "start+len" from
+                        icu_chain_get_org_info() to each token line */
yaz_icu_chain_t chain; /* ICU chain built by icu_chain_xml_config(); 0 until created,
                          released with icu_chain_destroy() */
FILE * infile;        /* stream the input lines are read from; defaults to 0 */
FILE * outfile;       /* stream all results are written to; defaults to stdout */
};
-
+
void print_option_error(const struct config_t *p_config)
-{
+{
fprintf(stderr, "yaz-icu [options] [infile]\n"
"Options:\n"
" -c file XML configuration\n"
" -p a|c|l|t Print ICU info \n"
" -s Show sort normalization key\n"
+ " -o Show org positions\n"
" -x XML output instread of text\n"
"\n"
"Examples:\n"
}
void read_params(int argc, char **argv, struct config_t *p_config)
-{
+{
char *arg;
int ret;
-
+
/* set default parameters */
p_config->conffile[0] = 0;
p_config->print[0] = 0;
p_config->chain = 0;
p_config->infile = 0;
p_config->outfile = stdout;
-
+ p_config->org_output = 0;
+
/* set up command line parameters */
-
- while ((ret = options("c:p:xs", argv, argc, &arg)) != -2)
+
+ while ((ret = options("c:op:sx", argv, argc, &arg)) != -2)
{
switch (ret)
{
case 'x':
p_config->xmloutput = 1;
break;
+ case 'o':
+ p_config->org_output = 1;
+ break;
case 0:
if (p_config->infile)
{
fprintf(p_config->outfile, "<converters count=\"%d\" default=\"%s\">\n",
count, ucnv_getDefaultName());
else
- {
+ {
fprintf(p_config->outfile, "Available ICU converters: %d\n", count);
- fprintf(p_config->outfile, "Default ICU Converter is: '%s'\n",
+ fprintf(p_config->outfile, "Default ICU Converter is: '%s'\n",
ucnv_getDefaultName());
}
-
+
for (i = 0; i < count; i++)
{
if (p_config->xmloutput)
- fprintf(p_config->outfile, "<converter id=\"%s\"/>\n",
+ fprintf(p_config->outfile, "<converter id=\"%s\"/>\n",
ucnv_getAvailableName(i));
- else
+ else
fprintf(p_config->outfile, "%s\n", ucnv_getAvailableName(i));
}
-
+
if (p_config->xmloutput)
fprintf(p_config->outfile, "</converters>\n");
else
if (p_config->xmloutput)
fprintf(p_config->outfile, "<transliterators count=\"%d\">\n", count);
- else
+ else
fprintf(p_config->outfile, "Available ICU transliterators: %d\n", count);
while ((name = uenum_next(en, &length, &status)))
"see http://userguide.icu-project.org/transforms/general\n"
" http://www.unicode.org/reports/tr44/\n"
);
-
-
+
+
fprintf(p_config->outfile, "\n\n");
-
+
}
}
int32_t count;
int32_t i;
UErrorCode status = U_ZERO_ERROR;
-
+
UChar keyword[64];
int32_t keyword_len = 0;
char keyword_str[128];
if (p_config->xmloutput)
{
- fprintf(p_config->outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
+ fprintf(p_config->outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
count, uloc_getDefault(), ucol_countAvailable());
}
else
fprintf(p_config->outfile, "Available ICU locales: %d\n", count);
fprintf(p_config->outfile, "Default locale is: %s\n", uloc_getDefault());
}
-
- for (i = 0; i < count; i++)
+
+ for (i = 0; i < count; i++)
{
- keyword_len
- = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
- keyword, 64,
+ keyword_len
+ = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
+ keyword, 64,
&status);
u_strToUTF8(keyword_str, 128, &keyword_str_len,
keyword, keyword_len,
&status);
-
-
- language_len
- = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
- language, 64,
+
+
+ language_len
+ = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
+ language, 64,
&status);
u_strToUTF8(lang_str, 128, &lang_str_len,
&status);
- script_len
- = uloc_getDisplayScript(uloc_getAvailable(i), "en",
- script, 64,
+ script_len
+ = uloc_getDisplayScript(uloc_getAvailable(i), "en",
+ script, 64,
&status);
u_strToUTF8(script_str, 128, &script_str_len,
script, script_len,
&status);
- location_len
- = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
- location, 64,
+ location_len
+ = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
+ location, 64,
&status);
u_strToUTF8(location_str, 128, &location_str_len,
location, location_len,
&status);
- variant_len
- = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
- variant, 64,
+ variant_len
+ = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
+ variant, 64,
&status);
u_strToUTF8(variant_str, 128, &variant_str_len,
variant, variant_len,
&status);
- name_len
- = uloc_getDisplayName(uloc_getAvailable(i), "en",
- name, 64,
+ name_len
+ = uloc_getDisplayName(uloc_getAvailable(i), "en",
+ name, 64,
&status);
u_strToUTF8(name_str, 128, &name_str_len,
name, name_len,
&status);
- localname_len
- = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
- localname, 64,
+ localname_len
+ = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
+ localname, 64,
&status);
u_strToUTF8(localname_str, 128, &localname_str_len,
if (p_config->xmloutput)
{
- fprintf(p_config->outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
+ fprintf(p_config->outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
if (strlen(lang_str))
fprintf(p_config->outfile, " language=\"%s\"", lang_str);
if (strlen(script_str))
fprintf(p_config->outfile, ">");
if (strlen(localname_str))
fprintf(p_config->outfile, "%s", localname_str);
- fprintf(p_config->outfile, "</locale>\n");
+ fprintf(p_config->outfile, "</locale>\n");
}
else if (1 == p_config->xmloutput)
{
- fprintf(p_config->outfile, "%s", uloc_getAvailable(i));
+ fprintf(p_config->outfile, "%s", uloc_getAvailable(i));
fprintf(p_config->outfile, " | ");
if (strlen(name_str))
fprintf(p_config->outfile, "%s", name_str);
{
char *line = 0;
char linebuf[1024];
-
- xmlDoc *doc = xmlParseFile(p_config->conffile);
+
+ xmlDoc *doc = xmlParseFile(p_config->conffile);
xmlNode *xml_node = xmlDocGetRootElement(doc);
- long unsigned int token_count = 0;
- long unsigned int line_count = 0;
-
+ long unsigned int token_count = 0;
+ long unsigned int line_count = 0;
+
UErrorCode status = U_ZERO_ERROR;
-
+
if (!xml_node)
- {
+ {
printf("Could not parse XML config file '%s' \n",
p_config->conffile);
exit(1);
p_config->chain = icu_chain_xml_config(xml_node, 1, &status);
if (!p_config->chain || !U_SUCCESS(status))
- {
+ {
printf("Could not set up ICU chain from config file '%s' \n",
p_config->conffile);
- if (!U_SUCCESS(status))
- printf("ICU Error: %d %s\n", status, u_errorName(status));
exit(1);
}
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<icu>\n"
"<tokens>\n");
-
+
/* read input lines for processing */
while ((line=fgets(linebuf, sizeof(linebuf)-1, p_config->infile)))
{
success = 0;
else
{
+ size_t start, len;
const char *sortkey = icu_chain_token_sortkey(p_config->chain);
+
+ icu_chain_get_org_info(p_config->chain, &start, &len);
wrbuf_rewind(sw);
wrbuf_puts_escaped(sw, sortkey);
token_count++;
- if (p_config->xmloutput)
+ if (p_config->xmloutput)
{
- fprintf(p_config->outfile,
+ fprintf(p_config->outfile,
"<token id=\"%lu\" line=\"%lu\"",
token_count, line_count);
wrbuf_xmlputs(cdata, icu_chain_token_display(p_config->chain));
fprintf(p_config->outfile, " display=\"%s\"",
wrbuf_cstr(cdata));
-
+
if (p_config->sortoutput)
{
wrbuf_rewind(cdata);
{
fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
}
+ if (p_config->org_output)
+ {
+ fprintf(p_config->outfile, " %ld+%ld",
+ (long) start, (long) len);
+ }
fprintf(p_config->outfile, "\n");
}
}
fprintf(p_config->outfile,
"</tokens>\n"
"</icu>\n");
-
+
icu_chain_destroy(p_config->chain);
xmlFreeDoc(doc);
if (line)
#endif /* YAZ_HAVE_ICU */
-int main(int argc, char **argv)
+int main(int argc, char **argv)
{
+#if YAZ_HAVE_ICU
struct config_t config;
-#if YAZ_HAVE_ICU
read_params(argc, argv, &config);
if (config.conffile && strlen(config.conffile))
process_text_file(&config);
-
+
if (config.print && strlen(config.print))
print_info(&config);