From d9b85731ab7b965b1ae9bc1c283e39faf10a644a Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Wed, 28 Jul 2004 09:47:41 +0000 Subject: [PATCH] Added a new 'cut' directive to charmaps (.chr files) which specifies that only characters after the cutting char should be indexed. --- NEWS | 3 +++ include/charmap.h | 13 +++++------ include/zebramap.h | 14 ++++-------- index/extract.c | 18 ++++++++++----- test/sort/Makefile.am | 5 +++-- test/sort/default.idx | 55 ++++++++++++++++++++++++++++++++++++++++++++++ test/sort/rec3.xml | 2 +- test/sort/rec4.xml | 3 +++ test/sort/string-hat.chr | 44 +++++++++++++++++++++++++++++++++++++ test/sort/test1.sh | 10 +++++---- test/sort/zebra.cfg | 3 +-- util/charmap.c | 32 ++++++++++++++++++++++++--- 12 files changed, 167 insertions(+), 35 deletions(-) create mode 100644 test/sort/default.idx create mode 100644 test/sort/rec4.xml create mode 100644 test/sort/string-hat.chr diff --git a/NEWS b/NEWS index c1d1704..5a35531 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,7 @@ +Added a new 'cut' directive to charmaps (.chr files) which specifies that +only characters after the cutting char should be indexed. + Update Perl internals so that it matches the current Zebra API. The recordGroup structure is no longer available. A group of resources can still be referenced by setting groupName=>.. in various methods. diff --git a/include/charmap.h b/include/charmap.h index 6001b7a..365b6ab 100644 --- a/include/charmap.h +++ b/include/charmap.h @@ -1,5 +1,5 @@ -/* $Id: charmap.h,v 1.8 2002-08-02 19:26:55 adam Exp $ - Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 +/* $Id: charmap.h,v 1.9 2004-07-28 09:47:41 adam Exp $ + Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps This file is part of the Zebra server. 
@@ -27,12 +27,11 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include -#ifdef __cplusplus -extern "C" { -#endif +YAZ_BEGIN_CDECL YAZ_EXPORT extern const char *CHR_UNKNOWN; YAZ_EXPORT extern const char *CHR_SPACE; +YAZ_EXPORT extern const char *CHR_CUT; YAZ_EXPORT extern const char *CHR_BASE; struct chr_t_entry; @@ -55,8 +54,6 @@ YAZ_EXPORT const char *chr_map_output(chrmaptab t, const char **from, int len); YAZ_EXPORT unsigned char zebra_prim(char **s); -#ifdef __cplusplus -} -#endif +YAZ_END_CDECL #endif diff --git a/include/zebramap.h b/include/zebramap.h index f8e1614..62845c0 100644 --- a/include/zebramap.h +++ b/include/zebramap.h @@ -1,5 +1,5 @@ -/* $Id: zebramap.h,v 1.14 2002-08-02 19:26:55 adam Exp $ - Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 +/* $Id: zebramap.h,v 1.15 2004-07-28 09:47:41 adam Exp $ + Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps This file is part of the Zebra server. @@ -20,17 +20,13 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ - - #ifndef ZEBRAMAP_H #define ZEBRAMAP_H #include #include -#ifdef __cplusplus -extern "C" { -#endif +YAZ_BEGIN_CDECL typedef struct zebra_maps *ZebraMaps; ZebraMaps zebra_maps_open (Res res, const char *base); @@ -55,8 +51,6 @@ int zebra_maps_is_positioned (ZebraMaps zms, unsigned reg_id); WRBUF zebra_replace(ZebraMaps zms, unsigned reg_id, const char *ex_list, const char *input_str, int input_len); -#ifdef __cplusplus -} -#endif +YAZ_END_CDECL #endif diff --git a/index/extract.c b/index/extract.c index 9183bbc..593c9e8 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.156 2004-07-28 08:15:45 adam Exp $ +/* $Id: extract.c,v 1.157 2004-07-28 09:47:41 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -1655,6 +1655,7 @@ static void extract_add_complete_field (RecWord *p) while (map && *map && **map == *CHR_SPACE) { remain = p->length - (b - p->string); + if (remain > 0) map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); else @@ -1669,10 +1670,17 @@ static void extract_add_complete_field (RecWord *p) { const char *cp = *map; - if (i >= IT_MAX_WORD) - break; - while (i < IT_MAX_WORD && *cp) - buf[i++] = *(cp++); + if (**map == *CHR_CUT) + { + i = 0; + } + else + { + if (i >= IT_MAX_WORD) + break; + while (i < IT_MAX_WORD && *cp) + buf[i++] = *(cp++); + } remain = p->length - (b - p->string); if (remain > 0) map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, diff --git a/test/sort/Makefile.am b/test/sort/Makefile.am index 4d6b90f..f3f433e 100644 --- a/test/sort/Makefile.am +++ b/test/sort/Makefile.am @@ -1,9 +1,10 @@ -# $Id: Makefile.am,v 1.2 2003-05-21 14:39:22 adam Exp $ +# $Id: Makefile.am,v 1.3 2004-07-28 09:47:41 adam Exp $ check_SCRIPTS = test1.sh TESTS = $(check_SCRIPTS) -EXTRA_DIST = zebra.cfg rec1.xml rec2.xml rec3.xml zebra.cfg my.abs \ +EXTRA_DIST = zebra.cfg default.idx string-hat.chr \ + rec1.xml rec2.xml rec3.xml rec4.xml zebra.cfg my.abs \ 
$(check_SCRIPTS) diff --git a/test/sort/default.idx b/test/sort/default.idx new file mode 100644 index 0000000..705795c --- /dev/null +++ b/test/sort/default.idx @@ -0,0 +1,55 @@ +# Zebra indexes as referred to from the *.abs-files. +# $Id: default.idx,v 1.1 2004-07-28 09:47:41 adam Exp $ +# + +# Traditional word index +# Used if completeness is 'incomplete field' (@attr 6=1) and +# structure is word/phrase/word-list/free-form-text/document-text +index w +completeness 0 +position 1 +charmap string.chr + +# Phrase index +# Used if completeness is 'complete {sub}field' (@attr 6=2, @attr 6=1) +# and structure is word/phrase/word-list/free-form-text/document-text +index p +completeness 1 +charmap string.chr + +# URX (URL) index +# Used if structure=urx (@attr 4=104) +index u +completeness 0 +charmap urx.chr + +# Numeric index +# Used if structure=numeric (@attr 4=109) +index n +completeness 0 +charmap numeric.chr + +# Null map index (no mapping at all) +# Used if structure=key (@attr 4=3) +index 0 +completeness 0 +position 1 +charmap @ + +# Year +# Used if structure=year (@attr 4=4) +index y +completeness 0 +charmap @ + +# Date +# Used if structure=date (@attr 4=5) +index d +completeness 0 +charmap @ + +# Sort register as usual but specify another map : string-cut. 
+sort s +completeness 1 +charmap string-cut.chr + diff --git a/test/sort/rec3.xml b/test/sort/rec3.xml index 2b44c52..a83e15a 100644 --- a/test/sort/rec3.xml +++ b/test/sort/rec3.xml @@ -1,3 +1,3 @@ - third computer + 3rd computer diff --git a/test/sort/rec4.xml b/test/sort/rec4.xml new file mode 100644 index 0000000..c187216 --- /dev/null +++ b/test/sort/rec4.xml @@ -0,0 +1,3 @@ + + third ^computer + diff --git a/test/sort/string-hat.chr b/test/sort/string-hat.chr new file mode 100644 index 0000000..024dc06 --- /dev/null +++ b/test/sort/string-hat.chr @@ -0,0 +1,44 @@ +# Generic character map but with ^ as cut char +# +# $Id: string-hat.chr,v 1.1 2004-07-28 09:47:41 adam Exp $ + +# Define the basic value-set. *Beware* of changing this without re-indexing +# your databases. + +lowercase {0-9}{a-y}üzæäøöå +uppercase {0-9}{A-Y}ÜZÆÄØÖÅ + +cut ^ + +# Breaking characters + +space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]_`\{|}~ + +# Characters to be considered equivalent for searching purposes. 
+ +# equivalent æä(ae) +# equivalent øö(oe) +# equivalent å(aa) +# equivalent uü + +# Supplemental mappings + +#map (ä) ä +#map (æ) æ +#map (ø) ø +#map (å) å +#map (ö) ö +#map (Ä) Ä +#map (&Aelig;) Æ +#map (Ø) Ø +#map (Å) Å +#map (Ö) Ö + +#map éÉ e +#map á a +#map ó o +#map í i + +#map (Aa) (AA) + +#map (aa) a diff --git a/test/sort/test1.sh b/test/sort/test1.sh index 87973fb..7be1269 100755 --- a/test/sort/test1.sh +++ b/test/sort/test1.sh @@ -1,5 +1,5 @@ #!/bin/sh -# $Id: test1.sh,v 1.7 2004-06-15 09:43:34 adam Exp $ +# $Id: test1.sh,v 1.8 2004-07-28 09:47:41 adam Exp $ pp=${srcdir:-"."} @@ -16,10 +16,12 @@ cp $pp/rec*.xml recs ../../index/zebrasrv -c $pp/zebra.cfg -l $LOG unix:socket & sleep 1 test -f lock/zebrasrv.pid || exit 2 -../api/testclient -n3 unix:socket '@or computer @attr 7=1 @attr 1=30 0' >tmp1 -echo 'Result count: 3 +../api/testclient -n4 unix:socket '@or computer @attr 7=1 @attr 1=30 0' >tmp1 +echo 'Result count: 4 my: - title: third computer + title: 3rd computer +my: + title: third ^computer my: title: second computer dateTime: 1 diff --git a/test/sort/zebra.cfg b/test/sort/zebra.cfg index 9caecf3..390d228 100644 --- a/test/sort/zebra.cfg +++ b/test/sort/zebra.cfg @@ -1,5 +1,5 @@ # Simple Zebra configuration file -# $Id: zebra.cfg,v 1.2 2004-06-15 09:43:34 adam Exp $ +# $Id: zebra.cfg,v 1.3 2004-07-28 09:47:41 adam Exp $ # # Where the schema files, attribute files, etc are located. 
profilePath: ${srcdir:-.}:${srcdir:-.}/../../tab @@ -9,7 +9,6 @@ attset: bib1.att attset: explain.att recordtype.xml: grs.sgml -recordId: file lockdir: lock register: reg:20M isam: b diff --git a/util/charmap.c b/util/charmap.c index f4f5920..96a390a 100644 --- a/util/charmap.c +++ b/util/charmap.c @@ -1,4 +1,4 @@ -/* $Id: charmap.c,v 1.28 2004-03-09 15:12:15 adam Exp $ +/* $Id: charmap.c,v 1.29 2004-07-28 09:47:42 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -42,7 +42,8 @@ typedef unsigned ucs4_t; const char *CHR_UNKNOWN = "\001"; const char *CHR_SPACE = "\002"; -const char *CHR_BASE = "\003"; +const char *CHR_CUT = "\003"; +const char *CHR_BASE = "\005"; struct chrmaptab_info { @@ -354,6 +355,17 @@ static void fun_addspace(const char *s, void *data, int num) (char*) CHR_SPACE, 0); } +/* + * Callback function. + * Add a cut-entry to the value space. + */ +static void fun_addcut(const char *s, void *data, int num) +{ + chrmaptab tab = (chrmaptab) data; + tab->input = set_map_string(tab->input, tab->nmem, s, strlen(s), + (char*) CHR_CUT, 0); +} + /* * Create a string containing the mapped characters provided. */ @@ -612,7 +624,7 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, { if (argc != 2) { - logf(LOG_FATAL, "Syntax error in charmap"); + logf(LOG_FATAL, "Syntax error in charmap for space"); ++errors; } if (scan_string(argv[1], t_unicode, t_utf8, @@ -622,6 +634,20 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, ++errors; } } + else if (!map_only && !yaz_matchstr(argv[0], "cut")) + { + if (argc != 2) + { + logf(LOG_FATAL, "Syntax error in charmap for cut"); + ++errors; + } + if (scan_string(argv[1], t_unicode, t_utf8, + fun_addcut, res, 0) < 0) + { + logf(LOG_FATAL, "Bad cut specification"); + ++errors; + } + } else if (!yaz_matchstr(argv[0], "map")) { chrwork buf; -- 1.7.10.4