From: Dennis Schafroth Date: Mon, 8 Nov 2010 17:05:54 +0000 (+0100) Subject: Merge branch 'master' into stemming X-Git-Tag: v4.1.4~14^2~16 X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=commitdiff_plain;h=9e04bbdbec82ea6f5795dc2d16bd9876ecc7762f;hp=29fc3c6ee42b1c1bd2200c950393326007fa0d83 Merge branch 'master' into stemming Conflicts: src/icu_chain.c Merge minor ICU change and test changes. Fix includes to include config.h. Fix Makefile: Include stemmer in libyaz_icu. --- diff --git a/include/yaz/stemmer.h b/include/yaz/stemmer.h new file mode 100644 index 0000000..6e51cae --- /dev/null +++ b/include/yaz/stemmer.h @@ -0,0 +1,61 @@ +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2010 Index Data. + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Index Data nor the names of its contributors + * may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file stemmer.h + * \brief Header for the stemming API. Simple wrapper to hide underlying implementation. + */ + +#ifndef YAZ_STEMMER_H +#define YAZ_STEMMER_H + +#include +#include +#include +#include + +#include /* some more string fcns*/ +#include /* char names */ + + +YAZ_BEGIN_CDECL + +typedef struct yaz_stemmer_t *yaz_stemmer_p; /* opaque handle; the struct is defined in stemmer.c */ + +YAZ_EXPORT +yaz_stemmer_p yaz_stemmer_create(const char *locale, const char *rule, UErrorCode *status); + +YAZ_EXPORT +yaz_stemmer_p yaz_stemmer_clone(yaz_stemmer_p stemmer); + +YAZ_EXPORT +void yaz_stemmer_stem(yaz_stemmer_p stemmer, struct icu_buf_utf16 *dst, struct icu_buf_utf16* src, UErrorCode *status); + +YAZ_EXPORT +void yaz_stemmer_destroy(yaz_stemmer_p stemmer); +YAZ_END_CDECL +#endif /* YAZ_STEMMER_H */ diff --git a/src/Makefile.am b/src/Makefile.am index d0060df..d8143f5 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -115,7 +115,7 @@ libyaz_server_la_SOURCES = statserv.c seshigh.c eventl.c \ libyaz_server_la_LDFLAGS=-version-info $(YAZ_VERSION_INFO) -libyaz_icu_la_SOURCES = icu_chain.c icu_utf16.c icu_utf8.c \ +libyaz_icu_la_SOURCES = icu_chain.c icu_utf16.c icu_utf8.c stemmer.c \ icu_transform.c icu_casemap.c icu_tokenizer.c icu_sortkey.c libyaz_icu_la_LDFLAGS=-version-info $(YAZ_VERSION_INFO) diff --git a/src/icu_chain.c b/src/icu_chain.c index d6521c0..8cc9a64 100644 --- a/src/icu_chain.c +++ b/src/icu_chain.c @@ -17,6 +17,8 @@ #include +#include + #include #include #include @@ -30,11 +32,12 @@ enum 
icu_chain_step_type { ICU_chain_step_type_none, - ICU_chain_step_type_display, /* convert to utf8 display format */ - ICU_chain_step_type_casemap, /* apply utf16 charmap */ - ICU_chain_step_type_transform, /* apply utf16 transform */ - ICU_chain_step_type_tokenize, /* apply utf16 tokenization */ - ICU_chain_step_type_transliterate /* apply utf16 tokenization */ + ICU_chain_step_type_display, /* convert to utf8 display format */ + ICU_chain_step_type_casemap, /* apply utf16 charmap */ + ICU_chain_step_type_transform, /* apply utf16 transform */ + ICU_chain_step_type_tokenize, /* apply utf16 tokenization */ + ICU_chain_step_type_transliterate, /* apply utf16 transliteration */ + YAZ_chain_step_type_stemming /* apply utf16 stemming (YAZ) */ }; struct icu_chain_step @@ -42,9 +45,10 @@ struct icu_chain_step /* type and action object */ enum icu_chain_step_type type; union { - struct icu_casemap *casemap; + struct icu_casemap *casemap; struct icu_transform *transform; struct icu_tokenizer *tokenizer; + yaz_stemmer_p stemmer; } u; struct icu_chain_step *previous; }; @@ -105,6 +109,9 @@ static struct icu_chain_step *icu_chain_insert_step( step->u.transform = icu_transform_create("custom", 'f', (const char *) rule, status); break; + case YAZ_chain_step_type_stemming: + step->u.stemmer = yaz_stemmer_create((const char *) chain->locale, (const char *) rule, status); + break; default: break; } @@ -136,6 +143,9 @@ static void icu_chain_step_destroy(struct icu_chain_step *step) case ICU_chain_step_type_tokenize: icu_tokenizer_destroy(step->u.tokenizer); break; + case YAZ_chain_step_type_stemming: + yaz_stemmer_destroy(step->u.stemmer); + break; default: break; } @@ -165,6 +175,9 @@ struct icu_chain_step *icu_chain_step_clone(struct icu_chain_step *old) case ICU_chain_step_type_tokenize: (*sp)->u.tokenizer = icu_tokenizer_clone(old->u.tokenizer); break; + case YAZ_chain_step_type_stemming: + (*sp)->u.stemmer = yaz_stemmer_clone(old->u.stemmer); /* store the copy so the cloned step owns its own stemmer */ + break; case ICU_chain_step_type_none: break; } @@ -288,6 
+301,9 @@ struct icu_chain *icu_chain_xml_config(const xmlNode *xml_node, else if (!strcmp((const char *) node->name, "display")) step = icu_chain_insert_step(chain, ICU_chain_step_type_display, (const uint8_t *) "", status); + else if (!strcmp((const char *) node->name, "stemming")) + step = icu_chain_insert_step(chain, YAZ_chain_step_type_stemming, + (const uint8_t *) rule, status); else if (!strcmp((const char *) node->name, "normalize")) { yaz_log(YLOG_WARN, "Element %s is deprecated. " @@ -407,6 +423,15 @@ struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter, if (dst) icu_utf16_to_utf8(iter->display, dst, &iter->status); break; + case YAZ_chain_step_type_stemming: + if (dst) + { + struct icu_buf_utf16 *src = dst; /* the incoming buffer becomes the stemmer input and is destroyed below */ + dst = icu_buf_utf16_create(0); + yaz_stemmer_stem(step->u.stemmer, dst, src, &iter->status); + icu_buf_utf16_destroy(src); + } + break; default: assert(0); } diff --git a/src/stemmer.c b/src/stemmer.c new file mode 100644 index 0000000..b544ebc --- /dev/null +++ b/src/stemmer.c @@ -0,0 +1,41 @@ +/* Stemmer wrapper behind the yaz_stemmer_* API; every function in this file is still an empty stub. */ + +#if HAVE_CONFIG_H +#include "config.h" +#endif + +#if YAZ_HAVE_ICU + +#include + +#include + +#include /* some more string fcns*/ +#include /* char names */ + +struct yaz_stemmer_t +{ + int implementation; + union { + struct sb_stemmer *snowballer; + }; +}; + +yaz_stemmer_p yaz_stemmer_create(const char *locale, const char *rule, UErrorCode *status) { + return 0; /* stub: no stemmer is built; locale, rule and status are ignored */ +} + +yaz_stemmer_p yaz_stemmer_clone(yaz_stemmer_p stemmer) { + return 0; /* stub: nothing is copied */ +} + +void yaz_stemmer_stem(yaz_stemmer_p stemmer, struct icu_buf_utf16 *dst, struct icu_buf_utf16* src, UErrorCode *status) { + /* stub: dst and status are left untouched */ +} + +void yaz_stemmer_destroy(yaz_stemmer_p stemmer) { + /* stub: nothing is released */ + +} + +#endif /* YAZ_HAVE_ICU */