Merge branch 'master' into stemming
author Dennis Schafroth <dennis@indexdata.com>
Mon, 8 Nov 2010 17:05:54 +0000 (18:05 +0100)
committer Dennis Schafroth <dennis@indexdata.com>
Mon, 8 Nov 2010 17:05:54 +0000 (18:05 +0100)
Conflicts:
src/icu_chain.c

Merge minor ICU change and test changes.

Fix includes to include config.h.
Fix Makefile: Include stemmer in libyaz_icu.

1  2 
include/yaz/stemmer.h
src/Makefile.am
src/icu_chain.c
src/stemmer.c

diff --combined include/yaz/stemmer.h
index f5d23ab,0000000..6e51cae
mode 100644,000000..100644
--- /dev/null
@@@ -1,60 -1,0 +1,61 @@@
 +/* This file is part of the YAZ toolkit.
 + * Copyright (C) 1995-2010 Index Data.
 + * All rights reserved.
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions are met:
 + *
 + *     * Redistributions of source code must retain the above copyright
 + *       notice, this list of conditions and the following disclaimer.
 + *     * Redistributions in binary form must reproduce the above copyright
 + *       notice, this list of conditions and the following disclaimer in the
 + *       documentation and/or other materials provided with the distribution.
 + *     * Neither the name of Index Data nor the names of its contributors
 + *       may be used to endorse or promote products derived from this
 + *       software without specific prior written permission.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
 + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
 + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 + */
 +
 +/**
 + * \file stemmer.h
 + * \brief Header for the stemming API. Simple wrapper to hide underlying implementation.
 + */
 +
 +#ifndef YAZ_STEMMER_H
 +#define YAZ_STEMMER_H
 +
 +#include <yaz/yconfig.h>
++#include <yaz/xmltypes.h>
 +#include <yaz/icu_I18N.h>
 +#include <yaz/log.h>
 +
 +#include <unicode/ustring.h>  /* some more string fcns*/
 +#include <unicode/uchar.h>    /* char names           */
 +
 +
 +YAZ_BEGIN_CDECL /* NOTE(review): no matching YAZ_END_CDECL appears between here and the closing #endif -- confirm whether one is required */
 +
 +typedef struct yaz_stemmer_t *yaz_stemmer_p; /* pointer handle; struct yaz_stemmer_t is not defined in this header */
 +
 +YAZ_EXPORT
 +yaz_stemmer_p yaz_stemmer_create(const char *locale, const char *rule, UErrorCode *status); /* takes a locale string and a rule string; presumably reports failure through *status -- TODO confirm against the implementation */
 +
 +YAZ_EXPORT
 +yaz_stemmer_p yaz_stemmer_clone(yaz_stemmer_p stemmer); /* takes an existing handle and returns a handle; copy semantics are not specified here */
 +
 +YAZ_EXPORT
 +void yaz_stemmer_stem(yaz_stemmer_p stemmer, struct icu_buf_utf16 *dst, struct icu_buf_utf16* src, UErrorCode *status); /* src and dst are separate UTF-16 buffers; presumably the stemmed form of src is written to dst -- TODO confirm */
 +
 +YAZ_EXPORT
 +void yaz_stemmer_destroy(yaz_stemmer_p stemmer); /* counterpart to yaz_stemmer_create; takes the handle by value and returns nothing */
 +
 +#endif
diff --combined src/Makefile.am
@@@ -77,7 -77,7 +77,7 @@@ libyaz_la_SOURCES=version.c options.c l
    wrbuf.c oid_db.c errno.c \
    nmemsdup.c xmalloc.c readconf.c tpath.c nmem.c matchstr.c atoin.c \
    siconv.c iconv-p.h utf8.c ucs4.c iso5428.c advancegreek.c \
-   odr_bool.c ber_bool.c ber_len.c ber_tag.c odr_util.c facet.c stemmer.c \
+   odr_bool.c ber_bool.c ber_len.c ber_tag.c odr_util.c facet.c \
    odr_null.c ber_null.c odr_int.c ber_int.c odr_tag.c odr_cons.c \
    odr_seq.c odr_oct.c ber_oct.c odr_bit.c ber_bit.c odr_oid.c \
    ber_oid.c odr_use.c odr_choice.c odr_any.c ber_any.c odr.c odr_mem.c \
@@@ -115,7 -115,7 +115,7 @@@ libyaz_server_la_SOURCES = statserv.c s
  
  libyaz_server_la_LDFLAGS=-version-info $(YAZ_VERSION_INFO)
  
--libyaz_icu_la_SOURCES = icu_chain.c icu_utf16.c icu_utf8.c \
++libyaz_icu_la_SOURCES = icu_chain.c icu_utf16.c icu_utf8.c stemmer.c \
        icu_transform.c icu_casemap.c icu_tokenizer.c icu_sortkey.c
  libyaz_icu_la_LDFLAGS=-version-info $(YAZ_VERSION_INFO)
  
diff --combined src/icu_chain.c
  
  #include <yaz/icu_I18N.h>
  
 +#include <yaz/stemmer.h>
 +
  #include <yaz/log.h>
+ #include <yaz/nmem.h>
+ #include <yaz/nmem_xml.h>
  #include <string.h>
  #include <stdlib.h>
  #include <stdio.h>
  
  enum icu_chain_step_type {
      ICU_chain_step_type_none,
 -    ICU_chain_step_type_display,   /* convert to utf8 display format */
 -    ICU_chain_step_type_casemap,   /* apply utf16 charmap */
 -    ICU_chain_step_type_transform, /* apply utf16 transform */
 -    ICU_chain_step_type_tokenize,  /* apply utf16 tokenization */
 -    ICU_chain_step_type_transliterate  /* apply utf16 tokenization */
 +    ICU_chain_step_type_display,        /* convert to utf8 display format */
 +    ICU_chain_step_type_casemap,        /* apply utf16 charmap */
 +    ICU_chain_step_type_transform,      /* apply utf16 transform */
 +    ICU_chain_step_type_tokenize,       /* apply utf16 tokenization */
 +    ICU_chain_step_type_transliterate,  /* apply utf16 tokenization */
 +    YAZ_chain_step_type_stemming        /* apply utf16 stemming (YAZ) */
  };
  
  struct icu_chain_step
      /* type and action object */
      enum icu_chain_step_type type;
      union {
-         struct icu_casemap   * casemap;
-         struct icu_transform * transform;
-         struct icu_tokenizer * tokenizer;
-         yaz_stemmer_p          stemmer;
 -      struct icu_casemap *casemap;
++      struct icu_casemap   *casemap;
+       struct icu_transform *transform;
+       struct icu_tokenizer *tokenizer;  
++        yaz_stemmer_p         stemmer;
      } u;
-     struct icu_chain_step * previous;
+     struct icu_chain_step *previous;
  };
  
  struct icu_chain
      char *locale;
      int sort;
  
-     UCollator * coll;
+     UCollator *coll;
      
      /* linked list of chain steps */
-     struct icu_chain_step * csteps;
+     struct icu_chain_step *csteps;
  };
  
  int icu_check_status(UErrorCode status)
      return 1;
  }
  
- static struct icu_chain_step *icu_chain_step_create(
-     struct icu_chain * chain,  enum icu_chain_step_type type,
-     const uint8_t * rule, 
-     UErrorCode *status)
+ static struct icu_chain_step *icu_chain_insert_step(
+     struct icu_chain *chain, enum icu_chain_step_type type,
+     const uint8_t *rule, UErrorCode *status)
  {
-     struct icu_chain_step * step = 0;
+     struct icu_chain_step *step = 0;
      
      if (!chain || !type || !rule)
          return 0;
          step->u.transform = icu_transform_create("custom", 'f',
                                                   (const char *) rule, status);
          break;
 +    case YAZ_chain_step_type_stemming:
 +        step->u.stemmer = yaz_stemmer_create((char *) chain->locale, (const char *) rule, status);
 +        break;
      default:
          break;
      }
+     step->previous = chain->csteps;
+     chain->csteps = step;
      return step;
  }
  
  
- static void icu_chain_step_destroy(struct icu_chain_step * step)
+ static void icu_chain_step_destroy(struct icu_chain_step *step)
  {
      if (!step)
          return;
      case ICU_chain_step_type_tokenize:
          icu_tokenizer_destroy(step->u.tokenizer);
          break;
 +    case YAZ_chain_step_type_stemming:
 +        yaz_stemmer_destroy(step->u.stemmer);
 +        break;
      default:
          break;
      }
@@@ -172,9 -165,6 +175,9 @@@ struct icu_chain_step *icu_chain_step_c
          case ICU_chain_step_type_tokenize:
              (*sp)->u.tokenizer = icu_tokenizer_clone(old->u.tokenizer);
              break;
 +        case YAZ_chain_step_type_stemming:
 +            yaz_stemmer_clone(step->u.stemmer);
 +            break;
          case ICU_chain_step_type_none:
              break;
          }
  }
  
  struct icu_chain *icu_chain_create(const char *locale, int sort,
-                                    UErrorCode * status)
+                                    UErrorCode *status)
  {
-     struct icu_chain * chain 
+     struct icu_chain *chain 
          = (struct icu_chain *) xmalloc(sizeof(*chain));
  
      *status = U_ZERO_ERROR;
      return chain;
  }
  
- void icu_chain_destroy(struct icu_chain * chain)
+ void icu_chain_destroy(struct icu_chain *chain)
  {
      if (chain)
      {
  }
  
  static struct icu_chain_step *icu_chain_insert_step(
-     struct icu_chain * chain, enum icu_chain_step_type type,
-     const uint8_t * rule, UErrorCode *status);
+     struct icu_chain *chain, enum icu_chain_step_type type,
+     const uint8_t *rule, UErrorCode *status);
  
- struct icu_chain * icu_chain_xml_config(const xmlNode *xml_node, 
-                                         int sort,
-                                         UErrorCode * status)
+ struct icu_chain *icu_chain_xml_config(const xmlNode *xml_node, 
+                                        int sort,
+                                        UErrorCode *status)
  {
      xmlNode *node = 0;
-     struct icu_chain * chain = 0;
+     int no_errors = 0;
+     struct icu_chain *chain = 0;
+     NMEM nmem = 0;
     
      *status = U_ZERO_ERROR;
  
          return 0;
      
      {
-         xmlChar * xml_locale = xmlGetProp((xmlNode *) xml_node, 
-                                           (xmlChar *) "locale");
+         xmlChar *xml_locale = xmlGetProp((xmlNode *) xml_node, 
+                                          (xmlChar *) "locale");
          
          if (xml_locale)
          {
      if (!chain)
          return 0;
  
+     nmem = nmem_create();
      for (node = xml_node->children; node; node = node->next)
      {
-         xmlChar *xml_rule;
-         struct icu_chain_step * step = 0;
+         char *rule = 0;
+         struct icu_chain_step *step = 0;
+         struct _xmlAttr *attr;
  
+         nmem_reset(nmem);
          if (node->type != XML_ELEMENT_NODE)
              continue;
  
-         xml_rule = xmlGetProp(node, (xmlChar *) "rule");
+         for (attr = node->properties; attr; attr = attr->next)
+         {
+             if (!strcmp((const char *) attr->name, "rule"))
+             {
+                 rule = nmem_text_node_cdata(attr->children, nmem);
+             }
+             else
+             {
+                 yaz_log(YLOG_WARN, "Unsupported attribute '%s' for "
+                         "element '%s'", attr->name, node->name);
+                 no_errors++;
+                 continue;
+             }
+         }
+         if (!rule && node->children)
+             rule = nmem_text_node_cdata(node->children, nmem);
+         
          if (!strcmp((const char *) node->name, "casemap"))
              step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap, 
-                                          (const uint8_t *) xml_rule, status);
+                                          (const uint8_t *) rule, status);
          else if (!strcmp((const char *) node->name, "transform"))
              step = icu_chain_insert_step(chain, ICU_chain_step_type_transform, 
-                                          (const uint8_t *) xml_rule, status);
+                                          (const uint8_t *) rule, status);
          else if (!strcmp((const char *) node->name, "transliterate"))
              step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate, 
-                                          (const uint8_t *) xml_rule, status);
+                                          (const uint8_t *) rule, status);
          else if (!strcmp((const char *) node->name, "tokenize"))
              step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, 
-                                          (const uint8_t *) xml_rule, status);
+                                          (const uint8_t *) rule, status);
          else if (!strcmp((const char *) node->name, "display"))
              step = icu_chain_insert_step(chain, ICU_chain_step_type_display, 
                                           (const uint8_t *) "", status);
 +        else if (!strcmp((const char *) node->name, "stemming"))
-             step = yaz_chain_insert_step(chain, YAZ_chain_step_type_stemming,
-                                          (const uint8_t *) xml_rule, status);
++            step = icu_chain_insert_step(chain, YAZ_chain_step_type_stemming,
++                                         (const uint8_t *) rule, status);
          else if (!strcmp((const char *) node->name, "normalize"))
          {
              yaz_log(YLOG_WARN, "Element %s is deprecated. "
                      "Use transform instead", node->name);
              step = icu_chain_insert_step(chain, ICU_chain_step_type_transform, 
-                                          (const uint8_t *) xml_rule, status);
+                                          (const uint8_t *) rule, status);
          }
          else if (!strcmp((const char *) node->name, "index")
                   || !strcmp((const char *) node->name, "sortkey"))
          else
          {
              yaz_log(YLOG_WARN, "Unknown element %s", node->name);
-             icu_chain_destroy(chain);
-             return 0;
+             no_errors++;
+             continue;
          }
-         xmlFree(xml_rule);
          if (step && U_FAILURE(*status))
          {
-             icu_chain_destroy(chain);
-             return 0;
+             no_errors++;
+             break;
          }
      }
-     return chain;
- }
- static struct icu_chain_step *icu_chain_insert_step(
-     struct icu_chain * chain, enum icu_chain_step_type type,
-     const uint8_t * rule, UErrorCode *status)
- {    
-     struct icu_chain_step * step = 0;
-     if (!chain || !type || !rule)
+     nmem_destroy(nmem);
+     if (no_errors)
+     {
+         icu_chain_destroy(chain);
          return 0;
-     /* create actual chain step with this buffer */
-     step = icu_chain_step_create(chain, type, rule,
-                                  status);
-     step->previous = chain->csteps;
-     chain->csteps = step;
-     return step;
+     }
+     return chain;
  }
  
  struct icu_iter {
@@@ -414,15 -407,6 +423,15 @@@ struct icu_buf_utf16 *icu_iter_invoke(y
              if (dst)
                  icu_utf16_to_utf8(iter->display, dst, &iter->status);
              break;
 +        case YAZ_chain_step_type_stemming:
 +            if (dst)
 +            {
 +                struct icu_buf_utf16 *src = dst;
 +                dst = icu_buf_utf16_create(0);
 +                yaz_stemmer_stem(step->u.stemmer, dst, src, &iter->status);
 +                icu_buf_utf16_destroy(src);
 +            }
 +            break;
          default:
              assert(0);
          }
@@@ -520,7 -504,7 +529,7 @@@ int icu_iter_get_token_number(yaz_icu_i
      return iter->token_count;
  }
  
- int icu_chain_assign_cstr(struct icu_chain * chain, const char * src8cstr, 
+ int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr, 
                            UErrorCode *status)
  {
      if (chain->iter)
      return 1;
  }
  
- int icu_chain_next_token(struct icu_chain * chain, UErrorCode *status)
+ int icu_chain_next_token(struct icu_chain *chain, UErrorCode *status)
  {
      *status = U_ZERO_ERROR;
      return icu_iter_next(chain->iter);
  }
  
- int icu_chain_token_number(struct icu_chain * chain)
+ int icu_chain_token_number(struct icu_chain *chain)
  {
      if (chain && chain->iter)
          return chain->iter->token_count;
      return 0;
  }
  
- const char * icu_chain_token_display(struct icu_chain * chain)
+ const char *icu_chain_token_display(struct icu_chain *chain)
  {
      if (chain->iter)
          return icu_iter_get_display(chain->iter);
      return 0;
  }
  
- const char * icu_chain_token_norm(struct icu_chain * chain)
+ const char *icu_chain_token_norm(struct icu_chain *chain)
  {
      if (chain->iter)
          return icu_iter_get_norm(chain->iter);
      return 0;
  }
  
- const char * icu_chain_token_sortkey(struct icu_chain * chain)
+ const char *icu_chain_token_sortkey(struct icu_chain *chain)
  {
      if (chain->iter)
          return icu_iter_get_sortkey(chain->iter);
diff --combined src/stemmer.c
index 5cb3267,0000000..b544ebc
mode 100644,000000..100644
--- /dev/null
@@@ -1,33 -1,0 +1,41 @@@
 +
 +
++#if HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#if YAZ_HAVE_ICU
++
++#include <yaz/yconfig.h>
 +
 +#include <yaz/stemmer.h>
 +
 +#include <unicode/ustring.h>  /* some more string fcns*/
 +#include <unicode/uchar.h>    /* char names           */
 +
 +struct yaz_stemmer_t /* structure that the yaz_stemmer_p handle type points to */
 +{
 +    int implementation; /* presumably selects which union member below is in use -- TODO confirm; nothing in this file reads or writes it */
 +    union { /* unnamed union: its members are accessed directly on the enclosing struct (C11 / common compiler extension) */
 +        struct sb_stemmer *snowballer; /* NOTE(review): looks like a Snowball stemmer handle; struct sb_stemmer is not declared anywhere in this file */
 +    };
 +};
 +
 +yaz_stemmer_p yaz_stemmer_create(const char *locale, const char *rule, UErrorCode *status) { /* stub: locale, rule and status are all ignored */
 +    return 0; /* no stemmer is allocated and *status is not written; every caller receives a null handle */
 +}
 +
 +yaz_stemmer_p yaz_stemmer_clone(yaz_stemmer_p stemmer) { /* stub: the stemmer argument is neither inspected nor copied */
 +    return 0; /* always yields a null handle, whatever was passed in */
 +}
 +
 +void yaz_stemmer_stem(yaz_stemmer_p stemmer, struct icu_buf_utf16 *dst, struct icu_buf_utf16* src, UErrorCode *status) { /* stub: no stemming is performed */
 + /* empty body: src is not read, dst is left exactly as the caller passed it, and *status is not written */
 +}
 +
 +void yaz_stemmer_destroy(yaz_stemmer_p stemmer) { /* stub: nothing is released */
 + /* empty body: the handle is not dereferenced or freed (yaz_stemmer_create in this file never allocates one) */
 +
 +}
 +
++#endif /* YAZ_HAVE_ICU */