Init winsock. Incorporate ICU. Pazpar2 runs on Windows.
[pazpar2-moved-to-github.git] / src / icu_I18N.c
index aaa0924..f40b529 100644 (file)
@@ -1,26 +1,24 @@
-/* $Id: icu_I18N.c,v 1.18 2007-05-21 10:14:08 marc Exp $
-   Copyright (c) 2006-2007, Index Data.
+/* This file is part of Pazpar2.
+   Copyright (C) 2006-2008 Index Data
 
-   This file is part of Pazpar2.
+Pazpar2 is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
 
-   Pazpar2 is free software; you can redistribute it and/or modify it under
-   the terms of the GNU General Public License as published by the Free
-   Software Foundation; either version 2, or (at your option) any later
-   version.
+Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
 
-   Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or
-   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-   for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 
-   You should have received a copy of the GNU General Public License
-   along with Pazpar2; see the file LICENSE.  If not, write to the
-   Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
-   02111-1307, USA.
 */
 
 #if HAVE_CONFIG_H
-#include "cconfig.h"
+#include <config.h>
 #endif
 
 #define USE_TIMING 0
@@ -83,7 +81,6 @@ struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
     return buf16;
 };
 
-
 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
                                             size_t capacity)
 {
@@ -172,8 +169,6 @@ struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
                 buf8->utf8 
                     = (uint8_t *) realloc(buf8->utf8, 
                                           sizeof(uint8_t) * capacity);
-            buf8->utf8[0] = (uint8_t) 0;
-            buf8->utf8_len = 0;
             buf8->utf8_cap = capacity;
         } 
         else { 
@@ -206,6 +201,16 @@ struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8,
 };
 
 
+const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
+{   /* Return the contents of src8 as a NUL-terminated C string. */
+    if (!src8 || src8->utf8_len == 0)
+        return "";  /* NULL or empty buffer: hand back a static empty string, src8 is not touched */
+    if (src8->utf8_len == src8->utf8_cap)  /* buffer is full: no slot left for the terminator */
+        src8 = icu_buf_utf8_resize(src8, src8->utf8_len * 2 + 1);  /* NOTE(review): assumes resize keeps the existing bytes and utf8_len -- confirm */
+    src8->utf8[src8->utf8_len] = '\0';  /* terminator is written just past the data; utf8_len is left unchanged */
+    return (const char *) src8->utf8;  /* pointer into src8's own storage, not a copy */
+}
+
 
 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
 {
@@ -241,7 +246,7 @@ UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
 
     //if (*status != U_BUFFER_OVERFLOW_ERROR
     if (U_SUCCESS(*status)  
-        && utf16_len < dest16->utf16_cap)
+        && utf16_len <= dest16->utf16_cap)
         dest16->utf16_len = utf16_len;
     else {
         dest16->utf16[0] = (UChar) 0;
@@ -279,7 +284,7 @@ UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
 
     //  if (*status != U_BUFFER_OVERFLOW_ERROR
     if (U_SUCCESS(*status)  
-        && utf16_len < dest16->utf16_cap)
+        && utf16_len <= dest16->utf16_cap)
         dest16->utf16_len = utf16_len;
     else {
         dest16->utf16[0] = (UChar) 0;
@@ -316,7 +321,7 @@ UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
 
     //if (*status != U_BUFFER_OVERFLOW_ERROR
     if (U_SUCCESS(*status)  
-        && utf8_len < dest8->utf8_cap)
+        && utf8_len <= dest8->utf8_cap)
         dest8->utf8_len = utf8_len;
     else {
         dest8->utf8[0] = (uint8_t) 0;
@@ -445,7 +450,7 @@ int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
     }
     
     if (U_SUCCESS(*status)
-        && dest16_len < dest16->utf16_cap)
+        && dest16_len <= dest16->utf16_cap)
         dest16->utf16_len = dest16_len;
     else {
         dest16->utf16[0] = (UChar) 0;
@@ -687,6 +692,7 @@ struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
                            UTRANS_FORWARD,
                            0, 0, 
                            normalizer->parse_error, status);
+        // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
         break;
     case 'r':
         normalizer->trans
@@ -695,6 +701,7 @@ struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
                            UTRANS_REVERSE ,
                            0, 0,
                            normalizer->parse_error, status);
+        // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
         break;
     default:
         *status = U_UNSUPPORTED_ERROR;
@@ -716,7 +723,10 @@ void icu_normalizer_destroy(struct icu_normalizer * normalizer){
         if (normalizer->rules16) 
             icu_buf_utf16_destroy(normalizer->rules16);
         if (normalizer->trans)
+        {
+            // yaz_log(YLOG_LOG, "utrans_close %p", normalizer->trans);
             utrans_close(normalizer->trans);
+        }
         free(normalizer);
     }
 };
@@ -764,21 +774,16 @@ struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
     step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step));
 
     step->type = type;
-    step->more_tokens = 0;
-    step->need_new_token = 1;
 
-    if (buf16)
-        step->buf16 = buf16;
-    else
-        step->buf16 = 0;
+    step->buf16 = buf16;
 
     // create auxilary objects
     switch(step->type) {
     case ICU_chain_step_type_display:
         break;
-    case ICU_chain_step_type_norm:
+    case ICU_chain_step_type_index:
         break;
-    case ICU_chain_step_type_sort:
+    case ICU_chain_step_type_sortkey:
         break;
     case ICU_chain_step_type_casemap:
         step->u.casemap = icu_casemap_create((char *) chain->locale, 
@@ -809,9 +814,9 @@ void icu_chain_step_destroy(struct icu_chain_step * step){
     switch(step->type) {
     case ICU_chain_step_type_display:
         break;
-    case ICU_chain_step_type_norm:
+    case ICU_chain_step_type_index:
         break;
-    case ICU_chain_step_type_sort:
+    case ICU_chain_step_type_sortkey:
         break;
     case ICU_chain_step_type_casemap:
         icu_casemap_destroy(step->u.casemap);
@@ -828,8 +833,7 @@ void icu_chain_step_destroy(struct icu_chain_step * step){
     default:
         break;
     }
-
-
+    free(step);
 };
 
 
@@ -870,6 +874,7 @@ void icu_chain_destroy(struct icu_chain * chain)
         icu_buf_utf16_destroy(chain->src16);
     
         icu_chain_step_destroy(chain->steps);
+        free(chain);
     }
 };
 
@@ -880,6 +885,8 @@ struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
 
     xmlNode *node = 0;
     struct icu_chain * chain = 0;
+    xmlChar *xml_id = 0;
+    xmlChar *xml_locale = 0;
    
     if (!xml_node 
         ||xml_node->type != XML_ELEMENT_NODE 
@@ -887,8 +894,8 @@ struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
 
         return 0;
     
-    xmlChar *xml_id = xmlGetProp(xml_node, (xmlChar *) "id");
-    xmlChar *xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
+    xml_id = xmlGetProp(xml_node, (xmlChar *) "id");
+    xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
 
     if (!xml_id || !strlen((const char *) xml_id) 
         || !xml_locale || !strlen((const char *) xml_locale))
@@ -897,16 +904,19 @@ struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
     chain = icu_chain_create((const uint8_t *) xml_id, 
                              (const uint8_t *) xml_locale);
     
+    xmlFree(xml_id);
+    xmlFree(xml_locale);
     if (!chain)
         return 0;
         
     for (node = xml_node->children; node; node = node->next)
     {
+        xmlChar *xml_rule = 0;
+        struct icu_chain_step * step = 0;
         if (node->type != XML_ELEMENT_NODE)
             continue;
 
-        xmlChar *xml_rule = xmlGetProp(node, (xmlChar *) "rule");
-        struct icu_chain_step * step = 0;
+        xml_rule = xmlGetProp(node, (xmlChar *) "rule");
 
         if (!strcmp((const char *) node->name, 
                     (const char *) "casemap")){
@@ -929,16 +939,17 @@ struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
                                          (const uint8_t *) "", status);
         }
         else if (!strcmp((const char *) node->name,
-                         (const char *) "normal")){
-            step = icu_chain_insert_step(chain, ICU_chain_step_type_norm, 
+                         (const char *) "index")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_index, 
                                          (const uint8_t *) "", status);
         }
         else if (!strcmp((const char *) node->name,
-                         (const char *) "sort")){
-            step = icu_chain_insert_step(chain, ICU_chain_step_type_sort, 
+                         (const char *) "sortkey")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey, 
                                          (const uint8_t *) "", status);
         }
 
+        xmlFree(xml_rule);
         if (!step || U_FAILURE(*status)){
             icu_chain_destroy(chain);
             return 0;
@@ -978,10 +989,10 @@ struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
     case ICU_chain_step_type_display:
         buf16 = src16;
         break;
-    case ICU_chain_step_type_norm:
+    case ICU_chain_step_type_index:
         buf16 = src16;
         break;
-    case ICU_chain_step_type_sort:
+    case ICU_chain_step_type_sortkey:
         buf16 = src16;
         break;
     case ICU_chain_step_type_casemap:
@@ -1046,10 +1057,10 @@ int icu_chain_step_next_token(struct icu_chain * chain,
     case ICU_chain_step_type_display:
         icu_utf16_to_utf8(chain->display8, src16, status);
         break;
-    case ICU_chain_step_type_norm:
+    case ICU_chain_step_type_index:
         icu_utf16_to_utf8(chain->norm8, src16, status);
         break;
-    case ICU_chain_step_type_sort:
+    case ICU_chain_step_type_sortkey:
         icu_utf16_to_utf8(chain->sort8, src16, status);
         break;
     case ICU_chain_step_type_casemap:
@@ -1125,6 +1136,7 @@ int icu_chain_assign_cstr(struct icu_chain * chain,
 
     while (stp){
         stp->more_tokens = 1;
+        stp->need_new_token = 1;
         stp = stp->previous;
     }
     
@@ -1170,7 +1182,7 @@ int icu_chain_get_token_count(struct icu_chain * chain)
 const char * icu_chain_get_display(struct icu_chain * chain)
 {
     if (chain->display8)
-        return (const char *) chain->display8->utf8;
+        return icu_buf_utf8_to_cstr(chain->display8); /* display8 as a NUL-terminated string; "" when it holds no bytes */
     
     return 0;
 };
@@ -1178,7 +1190,7 @@ const char * icu_chain_get_display(struct icu_chain * chain)
 const char * icu_chain_get_norm(struct icu_chain * chain)
 {
     if (chain->norm8)
-        return (const char *) chain->norm8->utf8;
+        return icu_buf_utf8_to_cstr(chain->norm8); /* norm8 as a NUL-terminated string; "" when it holds no bytes */
     
     return 0;
 };
@@ -1186,7 +1198,7 @@ const char * icu_chain_get_norm(struct icu_chain * chain)
 const char * icu_chain_get_sort(struct icu_chain * chain)
 {
     if (chain->sort8)
-        return (const char *) chain->sort8->utf8;
+        return icu_buf_utf8_to_cstr(chain->sort8); /* sort8 as a NUL-terminated string; "" when it holds no bytes */
     
     return 0;
 };