Omit sort key by default in yaz-icu's output
[yaz-moved-to-github.git] / util / yaz-icu.c
index 00b390d..c510617 100644 (file)
@@ -1,8 +1,6 @@
-/*
- * Copyright (C) 1995-2007, Index Data ApS
+/* This file is part of the YAZ toolkit.
+ * Copyright (C) 1995-2009 Index Data
  * See the file LICENSE for details.
- *
- * $Id: yaz-icu.c,v 1.12 2007-11-08 18:02:04 adam Exp $
  */
 
 #if HAVE_CONFIG_H
 #include <unicode/utrans.h>
 
 #include <yaz/icu.h>
+#include <yaz/wrbuf.h>
 
 /* command line and config parameters */
 static struct config_t { 
     char conffile[1024];
     char print[1024];
     int xmloutput;
+    int sortoutput;
     yaz_icu_chain_t chain;
     FILE * infile;
     FILE * outfile;
@@ -44,6 +44,7 @@ void print_option_error(const struct config_t *p_config)
     fprintf(stderr, "yaz-icu\n"
             "   [-c (path/to/config/file.xml)]\n"
             "   [-p (a|c|l|t)] print ICU info \n"
+            "   [-s] Show sort normalization key\n"
             "   [-x] XML output\n"
             "\n"
             "Examples:\n"
@@ -53,10 +54,10 @@ void print_option_error(const struct config_t *p_config)
             "./yaz-icu -p t -x\n"
             "\n"
             "Example ICU chain XML configuration file:\n"
-            "<icu_chain id=\"en:word\" locale=\"en\">\n"
-            "  <normalize rule=\"[:Control:] Any-Remove\"/>\n"
+            "<icu_chain locale=\"en\">\n"
+            "  <transform rule=\"[:Control:] Any-Remove\"/>\n"
             "  <tokenize rule=\"l\"/>\n"
-            "  <normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
+            "  <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
             "  <casemap rule=\"l\"/>\n"
             "</icu_chain>\n"
           );
@@ -72,13 +73,14 @@ void read_params(int argc, char **argv, struct config_t *p_config)
     p_config->conffile[0] = 0;
     p_config->print[0] = 0;
     p_config->xmloutput = 0;
+    p_config->sortoutput = 0;
     p_config->chain = 0;
     p_config->infile = stdin;
     p_config->outfile = stdout;
     
     /* set up command line parameters */
     
-    while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
+    while ((ret = options("c:p:xs", argv, argc, &arg)) != -2)
     {
         switch (ret)
         {
@@ -88,10 +90,14 @@ void read_params(int argc, char **argv, struct config_t *p_config)
         case 'p':
             strcpy(p_config->print, arg);
             break;
+        case 's':
+            p_config->sortoutput = 1;
+            break;
         case 'x':
             p_config->xmloutput = 1;
             break;
         default:
+            printf("Got %d\n", ret);
             print_option_error(p_config);
         }
     }
@@ -156,7 +162,7 @@ static void print_icu_converters(const struct config_t *p_config)
 static void print_icu_transliterators(const struct config_t *p_config)
 {
     int32_t buf_cap = 128;
-    char buf[buf_cap];
+    char buf[128];
     int32_t i;
     int32_t count = utrans_countAvailableIDs();
     
@@ -394,7 +400,7 @@ static void print_icu_xml_locales(const struct config_t *p_config)
     if(U_FAILURE(status))
     {
         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
-        exit(status);
+        exit(2);
     }
 }
 
@@ -439,21 +445,21 @@ static void process_text_file(const struct config_t *p_config)
     UErrorCode status = U_ZERO_ERROR;
     int success = 0;
     
-    if (! xml_node)
+    if (!xml_node)
     {   
         printf("Could not parse XML config file '%s' \n",
                 config.conffile);
-        exit (1);
+        exit(1);
     }
 
-    config.chain = icu_chain_xml_config(xml_node, 0, &status);
+    config.chain = icu_chain_xml_config(xml_node, 1, &status);
 
     if (config.chain && U_SUCCESS(status))
         success = 1;
     else {   
         printf("Could not set up ICU chain from config file '%s' \n",
                 config.conffile);
-        exit (1);
+        exit(1);
     }
 
     if (p_config->xmloutput)
@@ -470,25 +476,46 @@ static void process_text_file(const struct config_t *p_config)
 
         while (success && icu_chain_next_token(config.chain, &status))
         {
+            WRBUF sw = wrbuf_alloc();
             if (U_FAILURE(status))
                 success = 0;
             else {
+                const char *sortkey = icu_chain_token_sortkey(config.chain);
+                wrbuf_rewind(sw);
+                wrbuf_puts_escaped(sw, sortkey);
                 token_count++;
                 if (p_config->xmloutput)                    
+                {
+                    /* TODO: the norm/display attribute values below are written without XML escaping; they should be XML-encoded. Bug #1902 */
                     fprintf(config.outfile, 
-                            "<token id=\%lu\" line=\"%lu\""
-                            " norm=\"%s\" display=\"%s\"/>\n",
+                            "<token id=\"%lu\" line=\"%lu\""
+                            " norm=\"%s\" display=\"%s\"",
                             token_count,
                             line_count,
                             icu_chain_token_norm(config.chain),
                             icu_chain_token_display(config.chain));
+                    if (p_config->sortoutput)
+                    {
+                        fprintf(config.outfile, " sortkey=\"%s\"",
+                                wrbuf_cstr(sw));
+                    }
+                    fprintf(config.outfile, "/>\n");
+                }
                 else
-                    fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
+                {
+                    fprintf(config.outfile, "%lu %lu '%s' '%s'",
                             token_count,
                             line_count,
                             icu_chain_token_norm(config.chain),
                             icu_chain_token_display(config.chain));
+                    if (p_config->sortoutput)
+                    {
+                        fprintf(config.outfile, " '%s'", wrbuf_cstr(sw));
+                    }
+                    fprintf(config.outfile, "\n");
+                }
             }
+            wrbuf_destroy(sw);
         }
         
     }
@@ -527,15 +554,17 @@ int main(int argc, char **argv)
            "re-configure and re-compile\n");
 
 
+    exit(3);
 #endif /* YAZ_HAVE_ICU */
 
-    return(0);
+    return 0;
 }
 
 
 /*
  * Local variables:
  * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
  * indent-tabs-mode: nil
  * End:
  * vim: shiftwidth=4 tabstop=8 expandtab