From 7336356a8d037e2782e201bcdc48ff4ae2850ae4 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Thu, 13 Dec 2007 11:09:20 +0000 Subject: [PATCH 1/1] ICU functional for scan and snippets. --- index/Makefile.am | 4 +- index/extract.c | 39 ++++++++++++--- index/index.h | 19 +++----- index/rpnfacet.c | 139 ----------------------------------------------------- index/rpnscan.c | 57 +++++++++++++--------- index/untrans.c | 50 +++++++++++-------- test/api/t17.c | 20 +++++++- test/api/t17.idx | 7 +-- win/makefile | 3 +- 9 files changed, 128 insertions(+), 210 deletions(-) delete mode 100644 index/rpnfacet.c diff --git a/index/Makefile.am b/index/Makefile.am index 7a218e6..b08296f 100644 --- a/index/Makefile.am +++ b/index/Makefile.am @@ -1,4 +1,4 @@ -## $Id: Makefile.am,v 1.68 2007-12-03 16:54:49 adam Exp $ +## $Id: Makefile.am,v 1.69 2007-12-13 11:09:20 adam Exp $ aux_libs = \ ../rset/libidzebra-rset.la \ @@ -95,7 +95,7 @@ libidzebra_2_0_la_SOURCES = \ rank.h rank1.c ranksimilarity.c rankstatic.c \ records.c recindex.c recindex.h reckeys.c reckeys.h \ retrieve.c \ - rpnscan.c rpnsearch.c rpnfacet.c sortidx.c stream.c \ + rpnscan.c rpnsearch.c sortidx.c stream.c \ update_path.c update_file.c trunc.c untrans.c isam_methods.c \ zaptterm.c zebraapi.c zinfo.c zinfo.h zsets.c key_block.c key_block.h \ check_res.c rset_isam.c diff --git a/index/extract.c b/index/extract.c index cc54d67..aed0650 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.272 2007-12-10 17:06:08 adam Exp $ +/* $Id: extract.c,v 1.273 2007-12-13 11:09:20 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -265,6 +265,26 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm) } +static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm) +{ + struct snip_rec_info *h = p->extractCtrl->handle; + + const char *res_buf = 0; + size_t res_len = 0; + + const char *display_buf = 0; + size_t display_len = 0; + + zebra_map_tokenize_start(zm, p->term_buf, p->term_len); + while (zebra_map_tokenize_next(zm, &res_buf, &res_len, + &display_buf, &display_len)) + { + zebra_snippets_appendn(h->snippets, p->seqno, 0, ord, + display_buf, display_len); + p->seqno++; + } +} + static void snippet_token_add(RecWord *p) { struct snip_rec_info *h = p->extractCtrl->handle; @@ -277,10 +297,15 @@ static void snippet_token_add(RecWord *p) int ch = zebraExplain_lookup_attr_str( zei, zinfo_index_category_index, p->index_type, p->index_name); - if (zebra_maps_is_complete(zm)) - snippet_add_complete_field(p, ch, zm); + if (zebra_maps_is_icu(zm)) + snippet_add_icu(p, ch, zm); else - snippet_add_incomplete_field(p, ch, zm); + { + if (zebra_maps_is_complete(zm)) + snippet_add_complete_field(p, ch, zm); + else + snippet_add_incomplete_field(p, ch, zm); + } } } @@ -1456,7 +1481,7 @@ void extract_flush_record_keys2(ZebraHandle zh, zint sysno, } -ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh, +ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh, zebra_rec_keys_t reckeys, zebra_snippets *snippets) { @@ -1760,8 +1785,8 @@ static void extract_add_icu(RecWord *p, zebra_map_t zm) \param p token data to be indexed Call sequence: - extract_token - zebra_add_{in}_complete + extract_token_add + extract_add_{in}_complete extract_add_string extract_add_index_string diff --git a/index/index.h b/index/index.h index a123b30..8ef0835 100644 --- a/index/index.h +++ b/index/index.h @@ -1,4 +1,4 @@ -/* $Id: index.h,v 1.212 2007-12-03 13:04:04 adam Exp $ +/* $Id: index.h,v 1.213 2007-12-13 11:09:20 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -377,14 +377,14 @@ Dict dict_open_res(BFiles bfs, const char *name, int cache, int rw, void zebra_setError(ZebraHandle zh, int code, const char *addinfo); void zebra_setError_zint(ZebraHandle zh, int code, zint i); -void zebra_term_untrans_iconv(ZebraHandle zh, NMEM stream, - const char *index_type, - char **dst, const char *src); +int zebra_term_untrans_iconv(ZebraHandle zh, NMEM stream, + const char *index_type, + char **dst, const char *src); ZEBRA_RES zebra_get_hit_vector(ZebraHandle zh, const char *setname, zint sysno); -void zebra_term_untrans(ZebraHandle zh, const char *index_type, - char *dst, const char *src); +int zebra_term_untrans(ZebraHandle zh, const char *index_type, + char *dst, const char *src); ZEBRA_RES zebra_apt_get_ord(ZebraHandle zh, Z_AttributesPlusTerm *zapt, @@ -437,13 +437,6 @@ ZEBRA_RES zebra_term_limits_APT(ZebraHandle zh, const char **term_ref_id_str, NMEM nmem); -ZEBRA_RES rpn_facet(ZebraHandle zh, ODR stream, - Z_AttributesPlusTerm *zapt, - const Odr_oid *attributeset, - int *position, int *num_entries, - ZebraScanEntry **list, - int *is_partial, const char *set_name); - ZEBRA_RES zebra_result_recid_to_sysno(ZebraHandle zh, const char *setname, zint recid, diff --git a/index/rpnfacet.c b/index/rpnfacet.c deleted file mode 100644 index 2bebbfd..0000000 --- a/index/rpnfacet.c +++ /dev/null @@ -1,139 +0,0 @@ -/* $Id: rpnfacet.c,v 1.3 2007-11-05 11:20:39 adam Exp $ - Copyright (C) 1995-2007 - Index Data ApS - -This file is part of the Zebra server. - -Zebra is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free -Software Foundation; either version 2, or (at your option) any later -version. - -Zebra is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -*/ - -#include -#include -#if HAVE_UNISTD_H -#include -#endif -#include - -#include -#include "index.h" -#include -#include -#include -#include -#include -#include - -ZEBRA_RES rpn_facet(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, - const Odr_oid *attributeset, - int *position, int *num_entries, - ZebraScanEntry **list, int *is_partial, - const char *set_name) -{ - int ord; - int use_sort_idx = 1; - ZEBRA_RES res = zebra_attr_list_get_ord(zh, - zapt->attributes, - zinfo_index_category_sort, - 0 /* index_type */, - attributeset, &ord); - if (res != ZEBRA_OK) - return res; - else if (use_sort_idx) - { - const char *index_type = 0; - const char *db = 0; - const char *string_index = 0; - /* for each ord .. */ - /* check that sort idx exist for ord */ - /* sweep through result set and sort_idx at the same time */ - char *this_entry_buf = xmalloc(SORT_IDX_ENTRYSIZE); - char *dst_buf = xmalloc(SORT_IDX_ENTRYSIZE); - size_t sysno_mem_index = 0; - RSET rset = resultSetRef(zh, set_name); - zint p_this_sys = 0; - RSFD rfd; - TERMID termid; - struct it_key key; - - if (zebraExplain_lookup_ord(zh->reg->zei, - ord, &index_type, &db, &string_index)) - { - yaz_log(YLOG_WARN, "zebraExplain_lookup_ord failed"); - } - - if (zh->m_staticrank) - sysno_mem_index = 1; - - rfd = rset_open(rset, RSETF_READ); - while (rset_read(rfd, &key, &termid)) - { - zint sysno = key.mem[sysno_mem_index]; - if (sysno != p_this_sys) - { - p_this_sys = sysno; - zebra_sort_sysno(zh->reg->sort_index, sysno); - zebra_sort_type(zh->reg->sort_index, ord); - zebra_sort_read(zh->reg->sort_index, this_entry_buf); - - zebra_term_untrans(zh, index_type, dst_buf, this_entry_buf); - yaz_log(YLOG_LOG, "dst_buf=%s", dst_buf); - } - } - rset_close(rfd); - xfree(this_entry_buf); - xfree(dst_buf); - zebra_setError(zh, YAZ_BIB1_TEMPORARY_SYSTEM_ERROR, "facet not done1"); - return ZEBRA_FAIL; - } - else - { - int num = 100; /* to be customizable */ - int i; - - ZebraMetaRecord *meta = zebra_meta_records_create_range( - zh, set_name, 0, num); - - for (i = 0; i < num; i++) - { - zint sysno = meta[i].sysno; - Record rec = rec_get(zh->reg->records, sysno); - if (!rec) - { - yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT, - sysno); - break; - } - else - { - - - rec_free(&rec); - } - } - zebra_meta_records_destroy(zh, meta, num); - zebra_setError(zh, YAZ_BIB1_TEMPORARY_SYSTEM_ERROR, "facet not done2"); - return ZEBRA_FAIL; - } -} - -/* - * Local variables: - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - * vim: shiftwidth=4 tabstop=8 expandtab - */ - diff --git a/index/rpnscan.c b/index/rpnscan.c index 3312157..fa12bbd 100644 --- a/index/rpnscan.c +++ b/index/rpnscan.c @@ -1,4 +1,4 @@ -/* $Id: rpnscan.c,v 1.23 2007-12-03 11:49:11 adam Exp $ +/* $Id: rpnscan.c,v 1.24 2007-12-13 11:09:20 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -45,14 +45,28 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA static ZEBRA_RES trans_scan_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt, char *termz, zebra_map_t zm) { - char termz0[IT_MAX_WORD]; + char term_utf8[IT_MAX_WORD]; - if (zapt_term_to_utf8(zh, zapt, termz0) == ZEBRA_FAIL) + if (zapt_term_to_utf8(zh, zapt, term_utf8) == ZEBRA_FAIL) return ZEBRA_FAIL; /* error */ + else if (zebra_maps_is_icu(zm)) + { + const char *res_buf; + size_t res_len; + zebra_map_tokenize_start(zm, term_utf8, strlen(term_utf8)); + + if (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0)) + { + memcpy(termz, res_buf, res_len); + termz[res_len] = '\0'; + } + else + termz[0] = '\0'; + } else { const char **map; - const char *cp = (const char *) termz0; + const char *cp = (const char *) term_utf8; const char *cp_end = cp + strlen(cp); const char *src; int i = 0; @@ -218,8 +232,6 @@ static int scan_save_set(ZebraHandle zh, ODR stream, NMEM nmem, if (pos != -1) { zint sysno; - int code = -1; - zebra_snippets *rec_snippets = zebra_snippets_create(); zebra_snippets *hit_snippets = zebra_snippets_create(); glist[pos].term = 0; @@ -227,22 +239,28 @@ static int scan_save_set(ZebraHandle zh, ODR stream, NMEM nmem, get_first_snippet_from_rset(zh, rset, hit_snippets, &sysno); if (sysno) - code = zebra_get_rec_snippets(zh, sysno, rec_snippets); - - if (code == 0) { - const struct zebra_snippet_word *w = - zebra_snippets_lookup(rec_snippets, hit_snippets); - if (w) + zebra_snippets *rec_snippets = zebra_snippets_create(); + int code = zebra_get_rec_snippets(zh, sysno, rec_snippets); + if (code == 0) { - glist[pos].display_term = odr_strdup(stream, w->term); + const struct zebra_snippet_word *w = + zebra_snippets_lookup(rec_snippets, hit_snippets); + if (w) + { + glist[pos].display_term = odr_strdup(stream, w->term); + } } + zebra_snippets_destroy(rec_snippets); } - if (!glist[pos].term) - zebra_term_untrans_iconv(zh, stream->mem, index_type, - &glist[pos].term, term); + if (zebra_term_untrans_iconv(zh, stream->mem, index_type, + &glist[pos].term, term)) + { + /* failed.. use display_term instead (which could be 0) */ + glist[pos].term = glist[pos].display_term; + } + glist[pos].occurrences = count; - zebra_snippets_destroy(rec_snippets); zebra_snippets_destroy(hit_snippets); } rset_delete(rset); @@ -517,11 +535,6 @@ ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, zebra_setError(zh, YAZ_BIB1_TOO_MANY_DATABASES_SPECIFIED, 0); return ZEBRA_FAIL; } - if (sort_flag) - { - return rpn_facet(zh, stream, zapt, attributeset, position, num_entries, - list, is_partial, set_name); - } for (base_no = 0; base_no < num_bases; base_no++) { int ord; diff --git a/index/untrans.c b/index/untrans.c index 0551e3c..904e5e5 100644 --- a/index/untrans.c +++ b/index/untrans.c @@ -1,4 +1,4 @@ -/* $Id: untrans.c,v 1.5 2007-10-31 16:56:14 adam Exp $ +/* $Id: untrans.c,v 1.6 2007-12-13 11:09:20 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -28,35 +28,44 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include "index.h" #include -void zebra_term_untrans(ZebraHandle zh, const char *index_type, - char *dst, const char *src) +int zebra_term_untrans(ZebraHandle zh, const char *index_type, + char *dst, const char *src) { zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, index_type); - int len = 0; - while (*src) + if (zebra_maps_is_icu(zm)) + return -1; + else { - const char *cp = zebra_maps_output(zm, &src); - if (!cp) - { - if (len < IT_MAX_WORD-1) - dst[len++] = *src; - src++; - } - else - while (*cp && len < IT_MAX_WORD-1) - dst[len++] = *cp++; + int len = 0; + while (*src) + { + const char *cp = zebra_maps_output(zm, &src); + if (!cp) + { + if (len < IT_MAX_WORD-1) + dst[len++] = *src; + src++; + } + else + while (*cp && len < IT_MAX_WORD-1) + dst[len++] = *cp++; + } + dst[len] = '\0'; } - dst[len] = '\0'; + return 0; } -void zebra_term_untrans_iconv(ZebraHandle zh, NMEM stream, - const char *index_type, - char **dst, const char *src) +int zebra_term_untrans_iconv(ZebraHandle zh, NMEM stream, + const char *index_type, + char **dst, const char *src) { char term_src[IT_MAX_WORD]; char term_dst[IT_MAX_WORD]; + int r; - zebra_term_untrans (zh, index_type, term_src, src); + r = zebra_term_untrans (zh, index_type, term_src, src); + if (r) + return r; if (zh->iconv_from_utf8 != 0) { @@ -83,6 +92,7 @@ void zebra_term_untrans_iconv(ZebraHandle zh, NMEM stream, } else *dst = nmem_strdup(stream, term_src); + return 0; } diff --git a/test/api/t17.c b/test/api/t17.c index 0c99133..b945f3d 100644 --- a/test/api/t17.c +++ b/test/api/t17.c @@ -1,4 +1,4 @@ -/* $Id: t17.c,v 1.8 2007-12-07 14:17:37 adam Exp $ +/* $Id: t17.c,v 1.9 2007-12-13 11:09:20 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -26,10 +26,17 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include "testlib.h" +#define char_ae "\xc3\xa6" +#define char_AE "\xc3\x86" +#define char_oslash "\xc3\xb8" +#define char_Oslash "\xc3\x98" + const char *myrec[] = { "\nMy computer\n\n", "\nMy x computer\n\n", "\nMy computer x\n\n" , + + "\n" char_ae "\n\n" , 0} ; static void tst(int argc, char **argv) @@ -51,6 +58,13 @@ static void tst(int argc, char **argv) YAZ_CHECK(tl_query(zh, "@attr 1=title my", 3)); + YAZ_CHECK(tl_query(zh, "@attr 1=title mY", 3)); + + YAZ_CHECK(tl_query(zh, char_ae, 1)); +#if 0 + YAZ_CHECK(tl_query(zh, char_AE, 1)); +#endif + /* phrase search */ YAZ_CHECK(tl_query(zh, "@attr 1=title {my computer}", 2)); YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 6=1 {my computer}", 2)); @@ -59,7 +73,9 @@ static void tst(int argc, char **argv) /* complete-subfield search */ YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 6=2 {my computer}", 1)); YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 6=2 {my}", 0)); - + + /* scan */ + YAZ_CHECK(tl_close_down(zh, zs)); #endif } diff --git a/test/api/t17.idx b/test/api/t17.idx index 16e9985..922ddbe 100644 --- a/test/api/t17.idx +++ b/test/api/t17.idx @@ -1,5 +1,5 @@ # Zebra indexes as referred to from the *.abs-files. -# $Id: t17.idx,v 1.5 2007-12-07 14:17:37 adam Exp $ +# $Id: t17.idx,v 1.6 2007-12-13 11:09:20 adam Exp $ # # Traditional word index @@ -12,14 +12,15 @@ alwaysmatches 1 firstinfield 1 # simplechain dummy icuchain words-icu.xml -debug 1 +# debug 1 # Phrase index # Used if completeness is 'complete {sub}field' (@attr 6=2, @attr 6=1) # and structure is word/phrase/word-list/free-form-text/document-text index p completeness 1 -charmap phrases-icu.xml +icuchain phrases-icu.xml +# debug 1 # Sort register sort s diff --git a/win/makefile b/win/makefile index 7ac8fde..fde5806 100644 --- a/win/makefile +++ b/win/makefile @@ -1,5 +1,5 @@ # Zebra makefile for MS NMAKE -# $Id: makefile,v 1.71 2007-12-03 17:16:48 adam Exp $ +# $Id: makefile,v 1.72 2007-12-13 11:09:20 adam Exp $ ########################################################### ############### Parameters @@ -400,7 +400,6 @@ ZEBRALIB_OBJS= \ $(OBJDIR)\regxread.obj \ $(OBJDIR)\res.obj \ $(OBJDIR)\retrieve.obj \ - $(OBJDIR)\rpnfacet.obj \ $(OBJDIR)\rpnscan.obj \ $(OBJDIR)\rpnsearch.obj \ $(OBJDIR)\rsbetween.obj \ -- 1.7.10.4