From 01ddc55fd5a59535e229c09440cfdadccadf3555 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 7 Sep 1999 07:19:21 +0000 Subject: [PATCH] Work on character mapping. Implemented replace rules. --- include/charmap.h | 32 +++-- include/recctrl.h | 11 +- include/zebramap.h | 9 +- index/extract.c | 23 +++- index/kdump.c | 12 +- index/main.c | 7 +- index/zrpn.c | 136 +++++++++++++++++++-- recctrl/recgrs.c | 11 +- recctrl/rectext.c | 7 +- recctrl/regxread.c | 12 +- tab/scan.chr | 44 +++++++ tab/string.chr | 40 +++--- util/charmap.c | 343 +++++++++++++++++++++++++++++++--------------------- util/zebramap.c | 213 ++++++++++++++++++++++++++++++-- 14 files changed, 683 insertions(+), 217 deletions(-) create mode 100644 tab/scan.chr diff --git a/include/charmap.h b/include/charmap.h index 54a90d9..6b3f6dd 100644 --- a/include/charmap.h +++ b/include/charmap.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-1997, Index Data. + * Copyright (c) 1995-1999, Index Data. * * All rights reserved. * @@ -36,7 +36,10 @@ * OF THIS SOFTWARE. * * $Log: charmap.h,v $ - * Revision 1.4 1997-10-27 14:33:04 adam + * Revision 1.5 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.4 1997/10/27 14:33:04 adam * Moved towards generic character mapping depending on "structure" * field in abstract syntax file. Fixed a few memory leaks. 
Fixed * bug with negative integers when doing searches with relational @@ -51,26 +54,35 @@ #ifndef CHARMAP_H #define CHARMAP_H +#include + #ifdef __cplusplus extern "C" { #endif -extern const char *CHR_UNKNOWN; -extern const char *CHR_SPACE; -extern const char *CHR_BASE; +YAZ_EXPORT extern const char *CHR_UNKNOWN; +YAZ_EXPORT extern const char *CHR_SPACE; +YAZ_EXPORT extern const char *CHR_BASE; struct chr_t_entry; typedef struct chr_t_entry chr_t_entry; typedef struct chrmaptab_info *chrmaptab; -chrmaptab chrmaptab_create(const char *tabpath, const char *name, - int map_only); -void chrmaptab_destroy (chrmaptab tab); +YAZ_EXPORT chrmaptab chrmaptab_create(const char *tabpath, const char *name, + int map_only); +YAZ_EXPORT void chrmaptab_destroy (chrmaptab tab); -const char **chr_map_input(chrmaptab t, const char **from, int len); +YAZ_EXPORT const char **chr_map_input(chrmaptab t, const char **from, int len); +YAZ_EXPORT const char **chr_map_input_x(chrmaptab t, + const char **from, int *len); +YAZ_EXPORT const char **chr_map_input_q(chrmaptab maptab, + const char **from, int len, + const char **qmap); + +YAZ_EXPORT const char *chr_map_output(chrmaptab t, const char **from, int len); -const char *chr_map_output(chrmaptab t, const char **from, int len); +YAZ_EXPORT unsigned char zebra_prim(char **s); #ifdef __cplusplus } diff --git a/include/recctrl.h b/include/recctrl.h index 44d4618..a6812ec 100644 --- a/include/recctrl.h +++ b/include/recctrl.h @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: recctrl.h,v $ - * Revision 1.30 1999-05-21 12:00:17 adam + * Revision 1.31 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.30 1999/05/21 12:00:17 adam * Better diagnostics for extraction process. 
* * Revision 1.29 1999/05/20 12:57:18 adam @@ -131,6 +134,7 @@ typedef struct { int length; int *seqnos; ZebraMaps zebra_maps; + struct recExtractCtrl *extractCtrl; } RecWord; /* Extract record control */ @@ -143,11 +147,12 @@ struct recExtractCtrl { off_t offset; /* start offset */ char *subType; void (*init)(struct recExtractCtrl *p, RecWord *w); - void (*addWord)(RecWord *p); + void *clientData; + void (*tokenAdd)(RecWord *w); ZebraMaps zebra_maps; int flagShowRecords; int seqno[256]; - void (*addSchema)(struct recExtractCtrl *p, Odr_oid *oid); + void (*schemaAdd)(struct recExtractCtrl *p, Odr_oid *oid); data1_handle dh; }; diff --git a/include/zebramap.h b/include/zebramap.h index 974d80c..73710eb 100644 --- a/include/zebramap.h +++ b/include/zebramap.h @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: zebramap.h,v $ - * Revision 1.8 1999-02-12 13:29:21 adam + * Revision 1.9 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.8 1999/02/12 13:29:21 adam * Implemented position-flag for registers. * * Revision 1.7 1999/02/02 14:50:46 adam @@ -64,6 +67,10 @@ int zebra_maps_sort (ZebraMaps zms, Z_SortAttributes *sortAttributes); int zebra_maps_is_complete (ZebraMaps zms, unsigned reg_id); int zebra_maps_is_sort (ZebraMaps zms, unsigned reg_id); int zebra_maps_is_positioned (ZebraMaps zms, unsigned reg_id); + +WRBUF zebra_replace(ZebraMaps zms, unsigned reg_id, const char *ex_list, + const char *input_str, int input_len); + #ifdef __cplusplus } #endif diff --git a/index/extract.c b/index/extract.c index 1e6a395..3a9c4b1 100644 --- a/index/extract.c +++ b/index/extract.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: extract.c,v $ - * Revision 1.97 1999-07-06 12:28:04 adam + * Revision 1.98 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.97 1999/07/06 12:28:04 adam * Updated record index structure. Format includes version ID. 
Compression * algorithm ID is stored for each record block. * @@ -649,6 +652,7 @@ static void wordInit (struct recExtractCtrl *p, RecWord *w) w->attrSet = VAL_BIB1; w->attrUse = 1016; w->reg_type = 'w'; + w->extractCtrl = p; } static struct sortKey { @@ -856,6 +860,13 @@ static void addCompleteField (RecWord *p) static void addRecordKey (RecWord *p) { + WRBUF wrbuf; + if ((wrbuf = zebra_replace(p->zebra_maps, p->reg_type, 0, + p->string, p->length))) + { + p->string = wrbuf_buf(wrbuf); + p->length = wrbuf_len(wrbuf); + } if (zebra_maps_is_complete (p->zebra_maps, p->reg_type)) addCompleteField (p); else @@ -1291,8 +1302,8 @@ static int recordExtract (SYSNO *sysno, const char *fname, extractCtrl.fh = fi; extractCtrl.subType = subType; extractCtrl.init = wordInit; - extractCtrl.addWord = addRecordKey; - extractCtrl.addSchema = addSchema; + extractCtrl.tokenAdd = addRecordKey; + extractCtrl.schemaAdd = addSchema; extractCtrl.dh = rGroup->dh; for (i = 0; i<256; i++) { @@ -1632,7 +1643,7 @@ int fileExtract (SYSNO *sysno, const char *fname, { if (zebraExplain_newDatabase (zti, rGroup->databaseName, rGroup->explainDatabase)) - abort (); + return 0; } if (rGroup->flagStoreData == -1) @@ -1760,8 +1771,8 @@ static int explain_extract (void *handle, Record rec, data1_node *n) reckeys.prevSeqNo = 0; extractCtrl.init = wordInit; - extractCtrl.addWord = addRecordKey; - extractCtrl.addSchema = addSchema; + extractCtrl.tokenAdd = addRecordKey; + extractCtrl.schemaAdd = addSchema; extractCtrl.dh = rGroup->dh; for (i = 0; i<256; i++) extractCtrl.seqno[i] = 0; diff --git a/index/kdump.c b/index/kdump.c index ddedacd..c9aae4a 100644 --- a/index/kdump.c +++ b/index/kdump.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: kdump.c,v $ - * Revision 1.17 1999-02-02 14:50:55 adam + * Revision 1.18 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.17 1999/02/02 14:50:55 adam * Updated WIN32 code specific sections. 
Changed header. * * Revision 1.16 1998/05/20 10:12:17 adam @@ -207,8 +210,11 @@ int main (int argc, char **argv) while (*from) { const char *res = zebra_maps_output (zm, reg_type, &from); - while (*res) - *(to++) = *(res++); + if (!res) + *to++ = *from++; + else + while (*res) + *to++ = *res++; } *to = '\0'; printf ("%c %3d %c %7d %5d %s\n", reg_type, usedb_type, op ? 'i':'d', diff --git a/index/main.c b/index/main.c index a7c3e7f..68190bb 100644 --- a/index/main.c +++ b/index/main.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: main.c,v $ - * Revision 1.69 1999-07-21 08:31:33 adam + * Revision 1.70 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.69 1999/07/21 08:31:33 adam * More version info on WIN32. * * Revision 1.68 1999/07/14 10:59:26 adam @@ -543,7 +546,7 @@ int main (int argc, char **argv) #endif #if HAVE_BZLIB_H fprintf (stderr, "libbzip2\n" - " (C) 1996-1998 Julian R Seward. All rights reserved.\n"); + " (C) 1996-1999 Julian R Seward. All rights reserved.\n"); #endif } else if (ret == 'v') diff --git a/index/zrpn.c b/index/zrpn.c index 766df2a..249e7e9 100644 --- a/index/zrpn.c +++ b/index/zrpn.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: zrpn.c,v $ - * Revision 1.94 1999-07-20 13:59:18 adam + * Revision 1.95 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.94 1999/07/20 13:59:18 adam * Fixed bug that occurred when phrases had 0 hits. * * Revision 1.93 1999/06/17 14:38:40 adam @@ -452,8 +455,11 @@ static void term_untrans (ZebraHandle zh, int reg_type, while (*src) { const char *cp = zebra_maps_output (zh->zebra_maps, reg_type, &src); - while (*cp) - *dst++ = *cp++; + if (!cp) + *dst++ = *src++; + else + while (*cp) + *dst++ = *cp++; } *dst = '\0'; } @@ -661,6 +667,54 @@ static int term_102 (ZebraMaps zebra_maps, int reg_type, const char **src, dst_term); } + +/* term_104: handle term, where trunc=Process # and ! 
*/ +static int term_104 (ZebraMaps zebra_maps, int reg_type, + const char **src, char *dst, int space_split, + char *dst_term) +{ + const char *s0, *s1; + const char **map; + int i = 0; + int j = 0; + + if (!term_pre (zebra_maps, reg_type, src, "#!", "#!")) + return 0; + s0 = *src; + while (*s0) + { + if (*s0 == '#') + { + dst[i++] = '.'; + dst[i++] = '*'; + dst_term[j++] = *s0++; + } + else if (*s0 == '!') + { + dst[i++] = '.'; + dst_term[j++] = *s0++; + } + { + s1 = s0; + map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0)); + if (space_split && **map == *CHR_SPACE) + break; + while (s1 < s0) + { + if (!isalnum (*s1)) + dst[i++] = '\\'; + dst_term[j++] = *s1; + dst[i++] = *s1++; + } + } + } + dst[i] = '\0'; + dst_term[j++] = '\0'; + *src = s0; + return i; +} + + /* gen_regular_rel - generate regular expression from relation * val: border value (inclusive) * islt: 1 if <=; 0 if >=. @@ -758,11 +812,14 @@ static void gen_regular_rel (char *dst, int val, int islt) dst[dst_p] = '\0'; if (islt) { - for (i=1; izebra_maps, reg_type, + &termp, term_dict + j, space_split, term_dst)) + return 0; + strcat (term_dict, ")"); + r = dict_lookup_grep (zh->dict, term_dict, 0, grep_info, + &max_pos, 0, grep_handle); + if (r) + logf (LOG_WARN, "dict_lookup_grep err, trunc=#/!: %d", r); + break; } } *term_sub = termp; @@ -1399,9 +1467,51 @@ static RSET rpn_prox (ZebraHandle zh, RSET *rset, int rset_no) return result; } + +char *normalize_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt, + const char *termz, NMEM stream, unsigned reg_id) +{ + WRBUF wrbuf = 0; + AttrType truncation; + int truncation_value; + char *ex_list = 0; + + attr_init (&truncation, zapt, 5); + truncation_value = attr_find (&truncation, NULL); + + switch (truncation_value) + { + default: + ex_list = ""; + break; + case 101: + ex_list = "#"; + break; + case 102: + case 103: + ex_list = 0; + break; + case 104: + ex_list = "!#"; + break; + } + if (ex_list) + wrbuf = zebra_replace(zh->zebra_maps, reg_id, 
ex_list, + termz, strlen(termz)); + if (!wrbuf) + return nmem_strdup(stream, termz); + else + { + char *buf = (char*) nmem_malloc (stream, wrbuf_len(wrbuf)+1); + memcpy (buf, wrbuf_buf(wrbuf), wrbuf_len(wrbuf)); + buf[wrbuf_len(wrbuf)] = '\0'; + return buf; + } +} + static RSET rpn_search_APT_phrase (ZebraHandle zh, Z_AttributesPlusTerm *zapt, - const char *termz, + const char *termz_org, oid_value attributeSet, NMEM stream, int reg_type, int complete_flag, @@ -1409,10 +1519,11 @@ static RSET rpn_search_APT_phrase (ZebraHandle zh, int num_bases, char **basenames) { char term_dst[IT_MAX_WORD+1]; - const char *termp = termz; RSET rset[60], result; int i, r, rset_no = 0; struct grep_info grep_info; + char *termz = normalize_term(zh, zapt, termz_org, stream, reg_type); + const char *termp = termz; #ifdef TERM_COUNT grep_info.term_no = 0; @@ -1460,7 +1571,7 @@ static RSET rpn_search_APT_phrase (ZebraHandle zh, static RSET rpn_search_APT_or_list (ZebraHandle zh, Z_AttributesPlusTerm *zapt, - const char *termz, + const char *termz_org, oid_value attributeSet, NMEM stream, int reg_type, int complete_flag, @@ -1468,11 +1579,11 @@ static RSET rpn_search_APT_or_list (ZebraHandle zh, int num_bases, char **basenames) { char term_dst[IT_MAX_WORD+1]; - const char *termp = termz; RSET rset[60], result; int i, r, rset_no = 0; struct grep_info grep_info; - + char *termz = normalize_term(zh, zapt, termz_org, stream, reg_type); + const char *termp = termz; #ifdef TERM_COUNT grep_info.term_no = 0; #endif @@ -1525,7 +1636,7 @@ static RSET rpn_search_APT_or_list (ZebraHandle zh, static RSET rpn_search_APT_and_list (ZebraHandle zh, Z_AttributesPlusTerm *zapt, - const char *termz, + const char *termz_org, oid_value attributeSet, NMEM stream, int reg_type, int complete_flag, @@ -1533,10 +1644,11 @@ static RSET rpn_search_APT_and_list (ZebraHandle zh, int num_bases, char **basenames) { char term_dst[IT_MAX_WORD+1]; - const char *termp = termz; RSET rset[60], result; int i, r, rset_no = 0; 
struct grep_info grep_info; + char *termz = normalize_term(zh, zapt, termz_org, stream, reg_type); + const char *termp = termz; #ifdef TERM_COUNT grep_info.term_no = 0; diff --git a/recctrl/recgrs.c b/recctrl/recgrs.c index 150f4a7..dc31d46 100644 --- a/recctrl/recgrs.c +++ b/recctrl/recgrs.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: recgrs.c,v $ - * Revision 1.31 1999-07-14 10:56:43 adam + * Revision 1.32 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.31 1999/07/14 10:56:43 adam * Fixed potential memory leak. * * Revision 1.30 1999/07/06 12:26:41 adam @@ -382,7 +385,7 @@ static int dumpkeys(data1_node *n, struct recExtractCtrl *p, int level) wrd.length = n->u.data.len; wrd.attrSet = (int) (tlist->att->parent->reference); wrd.attrUse = tlist->att->locals->local; - (*p->addWord)(&wrd); + (*p->tokenAdd)(&wrd); } } } @@ -404,7 +407,7 @@ int grs_extract_tree(struct recExtractCtrl *p, data1_node *n) oe.value = n->u.root.absyn->reference; if ((oid_ent_to_oid (&oe, oidtmp))) - (*p->addSchema)(p, oidtmp); + (*p->schemaAdd)(p, oidtmp); return dumpkeys(n, p, 0); } @@ -434,7 +437,7 @@ static int grs_extract_sub(struct grs_handlers *h, struct recExtractCtrl *p, oe.oclass = CLASS_SCHEMA; oe.value = n->u.root.absyn->reference; if ((oid_ent_to_oid (&oe, oidtmp))) - (*p->addSchema)(p, oidtmp); + (*p->schemaAdd)(p, oidtmp); if (dumpkeys(n, p, 0) < 0) { diff --git a/recctrl/rectext.c b/recctrl/rectext.c index bcc9d26..29f8a6b 100644 --- a/recctrl/rectext.c +++ b/recctrl/rectext.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: rectext.c,v $ - * Revision 1.12 1999-05-26 07:49:14 adam + * Revision 1.13 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.12 1999/05/26 07:49:14 adam * C++ compilation. 
* * Revision 1.11 1999/05/21 12:00:17 adam @@ -147,7 +150,7 @@ static int text_extract (void *clientData, struct recExtractCtrl *p) { recWord.string = w; recWord.length = i; - (*p->addWord)(&recWord); + (*p->tokenAdd)(&recWord); } } while (r > 0); buf_close (fi); diff --git a/recctrl/regxread.c b/recctrl/regxread.c index 0ef13aa..d24be82 100644 --- a/recctrl/regxread.c +++ b/recctrl/regxread.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: regxread.c,v $ - * Revision 1.31 1999-07-14 13:05:29 adam + * Revision 1.32 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.31 1999/07/14 13:05:29 adam * Tcl filter works with objects when TCL is version 8 or later; filter * works with strings otherwise (slow). * @@ -1026,9 +1029,9 @@ static void tagStrip (const char **tag, int *len) static void tagBegin (struct lexSpec *spec, const char *tag, int len) { - struct data1_node *parent = spec->d1_stack[spec->d1_level -1]; + struct data1_node *parent; data1_element *elem = NULL; - data1_node *partag = get_parent_tag(spec->dh, parent); + data1_node *partag; data1_node *res; data1_element *e = NULL; int localtag = 0; @@ -1039,6 +1042,9 @@ static void tagBegin (struct lexSpec *spec, return ; } tagStrip (&tag, &len); + + parent = spec->d1_stack[spec->d1_level -1]; + partag = get_parent_tag(spec->dh, parent); res = data1_mk_node (spec->dh, spec->m); res->parent = parent; diff --git a/tab/scan.chr b/tab/scan.chr new file mode 100644 index 0000000..599dd7c --- /dev/null +++ b/tab/scan.chr @@ -0,0 +1,44 @@ +# Danish/Swedish character map. +# +# $Id: scan.chr,v 1.1 1999-09-07 07:19:21 adam Exp $ + +# Define the basic value-set. *Beware* of changing this without re-indexing +# your databases. + +lowercase {0-9}{a-y}üzæäøöå +uppercase {0-9}{A-Y}ÜZÆÄØÖÅ + +# Breaking characters + +space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]^_`\{|}~ + +# Characters to be considered equivalent for searching purposes. 
+ +# equivalent æä(ae) +# equivalent øö(oe) +# equivalent å(aa) +# equivalent uü + +# Supplemental mappings + +map (ä) ä +map (æ) æ +map (ø) ø +map (å) å +map (ö) ö +map (Ä) Ä +map (&Aelig;) Æ +map (Ø) Ø +map (Å) Å +map (Ö) Ö + +map éÉ e +map á a +map ó o +map í i + +map (Aa) (AA) + +map (aa) a + +#qmap (ies) (ie) diff --git a/tab/string.chr b/tab/string.chr index ba89e2b..ccb1f78 100644 --- a/tab/string.chr +++ b/tab/string.chr @@ -1,6 +1,6 @@ -# Danish/Swedish character map. +# Generic character map. # -# $Id: string.chr,v 1.3 1998-11-29 22:45:55 quinn Exp $ +# $Id: string.chr,v 1.4 1999-09-07 07:19:21 adam Exp $ # Define the basic value-set. *Beware* of changing this without re-indexing # your databases. @@ -21,20 +21,22 @@ space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]^_`\{|}~ # Supplemental mappings -map (ä) ä -map (æ) æ -map (ø) ø -map (å) å -map (ö) ö -map (Ä) Ä -map (&Aelig;) Æ -map (Ø) Ø -map (Å) Å -map (Ö) Ö - -map éÉ e -map á a -map ó o -map í i - -map (Aa) (AA) +#map (ä) ä +#map (æ) æ +#map (ø) ø +#map (å) å +#map (ö) ö +#map (Ä) Ä +#map (&Aelig;) Æ +#map (Ø) Ø +#map (Å) Å +#map (Ö) Ö + +#map éÉ e +#map á a +#map ó o +#map í i + +#map (Aa) (AA) + +#map (aa) a diff --git a/util/charmap.c b/util/charmap.c index 5b5b81e..7b03644 100644 --- a/util/charmap.c +++ b/util/charmap.c @@ -1,10 +1,13 @@ /* - * Copyright (C) 1996-1998, Index Data + * Copyright (C) 1996-1999, Index Data * All rights reserved. * Sebastian Hammer, Adam Dickmeiss * * $Log: charmap.c,v $ - * Revision 1.15 1999-05-26 07:49:14 adam + * Revision 1.16 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.15 1999/05/26 07:49:14 adam * C++ compilation. 
* * Revision 1.14 1998/10/13 20:09:18 adam @@ -71,9 +74,6 @@ #define CHR_MAXSTR 1024 #define CHR_MAXEQUIV 32 -int chr_map_chrs(chr_t_entry *t, char **from, int len, - int *read, char **to, int max); - const char *CHR_UNKNOWN = "\001"; const char *CHR_SPACE = "\002"; const char *CHR_BASE = "\003"; @@ -81,10 +81,9 @@ const char *CHR_BASE = "\003"; struct chrmaptab_info { chr_t_entry *input; /* mapping table for input data */ - chr_t_entry *query_equiv; /* mapping table for queries */ + chr_t_entry *q_input; /* mapping table for queries */ unsigned char *output[256]; /* return mapping - for display of registers */ int base_uppercase; /* Start of upper-case ordinals */ - char **tmp_buf; NMEM nmem; }; @@ -93,9 +92,8 @@ struct chrmaptab_info */ struct chr_t_entry { - chr_t_entry **children; /* array of children */ - unsigned char *target; /* target for this node, if any */ - unsigned char *equiv; /* equivalent to, or sumthin */ + chr_t_entry **children; /* array of children */ + unsigned char **target; /* target for this node, if any */ }; /* @@ -121,11 +119,13 @@ static chr_t_entry *set_map_string(chr_t_entry *root, NMEM nmem, } if (!len) { - if (!root->target || (char*) root->target == CHR_SPACE || - (char*) root->target == CHR_UNKNOWN) - root->target = (unsigned char *) nmem_strdup(nmem, to); - else if ((char*) to != CHR_SPACE) - logf(LOG_DEBUG, "Character map overlap"); + if (!root->target || !root->target[0] || strcmp(root->target[0], to)) + { + root->target = (unsigned char **) + nmem_malloc(nmem, sizeof(*root->target)*2); + root->target[0] = (unsigned char *) nmem_strdup(nmem, to); + root->target[1] = 0; + } } else { @@ -146,32 +146,6 @@ static chr_t_entry *set_map_string(chr_t_entry *root, NMEM nmem, return root; } -int chr_map_chrs(chr_t_entry *t, char **from, int len, int *read, char **to, - int max) -{ - int i = 0; - unsigned char *s; - - while (len && t->children && t->children[(unsigned char) **from]) - { - t = t->children[(unsigned char) **from]; - 
(*from)++; - len--; - } - /* if there were no matches, we are still at the root node, - which always has a null mapping */ - for (s = t->target; *s && max; s++) - { - **to = *s; - s++; - (*to)++; - max--; - i++; - } - return i; -} - - static chr_t_entry *find_entry(chr_t_entry *t, const char **from, int len) { chr_t_entry *res; @@ -188,19 +162,59 @@ static chr_t_entry *find_entry(chr_t_entry *t, const char **from, int len) *from = pos; } /* no children match. use ourselves, if we have a target */ - return t->target ? t : 0; + return t->target ? t : 0; +} + +static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len) +{ + chr_t_entry *res; + + while (*len <= 0) + { /* switch to next buffer */ + if (*len < 0) + break; + from++; + len++; + } + if (*len > 0 && t->children && t->children[(unsigned char) **from]) + { + const char *old_from = *from; + int old_len = *len; + + (*len)--; + (*from)++; + if ((res = find_entry_x(t->children[(unsigned char) *old_from], + from, len))) + return res; + /* no match */ + *len = old_len; + *from = old_from; + } + /* no children match. use ourselves, if we have a target */ + return t->target ? 
t : 0; +} + +const char **chr_map_input_x(chrmaptab maptab, const char **from, int *len) +{ + chr_t_entry *t = maptab->input; + chr_t_entry *res; + + if (!(res = find_entry_x(t, from, len))) + abort(); + return (const char **) (res->target); } const char **chr_map_input(chrmaptab maptab, const char **from, int len) { chr_t_entry *t = maptab->input; chr_t_entry *res; + int len_tmp[2]; - if (!(res = find_entry(t, from, len))) + len_tmp[0] = len; + len_tmp[1] = -1; + if (!(res = find_entry_x(t, from, len_tmp))) abort(); - maptab->tmp_buf[0] = (char*) res->target; - maptab->tmp_buf[1] = NULL; - return (const char **) maptab->tmp_buf; + return (const char **) (res->target); } const char *chr_map_output(chrmaptab maptab, const char **from, int len) @@ -210,27 +224,28 @@ const char *chr_map_output(chrmaptab maptab, const char **from, int len) return (const char*) maptab->output[c]; } -static unsigned char prim(char **s) +unsigned char zebra_prim(char **s) { unsigned char c; unsigned int i; - + if (**s == '\\') { (*s)++; c = **s; switch (c) { - case '\\': c = '\\'; (*s)++; break; - case 'r': c = '\r'; (*s)++; break; - case 'n': c = '\n'; (*s)++; break; - case 't': c = '\t'; (*s)++; break; - case 's': c = ' '; (*s)++; break; - case 'x': sscanf(*s, "x%2x", &i); c = i; *s += 3; break; - case '{': case '[': case '(': case '}': case ']': case ')': - (*s)++; - break; - default: sscanf(*s, "%3o", &i); c = i; *s += 3; break; + case '\\': c = '\\'; (*s)++; break; + case 'r': c = '\r'; (*s)++; break; + case 'n': c = '\n'; (*s)++; break; + case 't': c = '\t'; (*s)++; break; + case 's': c = ' '; (*s)++; break; + case 'x': sscanf(*s, "x%2x", &i); c = i; *s += 3; break; + case '{': case '[': case '(': case '}': case ']': case ')': case '$': + (*s)++; + break; + default: + sscanf(*s, "%3o", &i); c = i; *s += 3; break; } return c; } @@ -247,7 +262,7 @@ static void fun_addentry(const char *s, void *data, int num) { chrmaptab tab = (chrmaptab) data; char tmp[2]; - + tmp[0] = num; tmp[1] = 
'\0'; tab->input = set_map_string(tab->input, tab->nmem, s, strlen(s), tmp); tab->output[num + tab->base_uppercase] = @@ -283,67 +298,86 @@ static void fun_mkstring(const char *s, void *data, int num) /* * Add a map to the string contained in the argument. */ -static void fun_addmap(const char *s, void *data, int num) +static void fun_add_map(const char *s, void *data, int num) { chrwork *arg = (chrwork *) data; assert(arg->map->input); + logf (LOG_LOG, "set map %.*s", (int) strlen(s), s); set_map_string(arg->map->input, arg->map->nmem, s, strlen(s), arg->string); + for (s = arg->string; *s; s++) + logf (LOG_LOG, " %3d", (unsigned char) *s); +} + +/* + * Add a query map to the string contained in the argument. + */ +static void fun_add_qmap(const char *s, void *data, int num) +{ + chrwork *arg = (chrwork *) data; + + assert(arg->map->q_input); + logf (LOG_LOG, "set qmap %.*s", (int) strlen(s), s); + set_map_string(arg->map->q_input, arg->map->nmem, s, + strlen(s), arg->string); + for (s = arg->string; *s; s++) + logf (LOG_LOG, " %3d", (unsigned char) *s); } + static int scan_string(char *s, void (*fun)(const char *c, void *data, int num), void *data, int *num) { unsigned char c, str[1024], begin, end, *p; - + while (*s) { switch (*s) { - case '{': - s++; - begin = prim(&s); - if (*s != '-') - { - logf(LOG_FATAL, "Bad range in char-map"); - return -1; - } - s++; - end = prim(&s); - if (end <= begin) - { - logf(LOG_FATAL, "Bad range in char-map"); - return -1; - } - s++; - for (c = begin; c <= end; c++) - { - str[0] = c; str[1] = '\0'; - (*fun)((char *) str, data, num ? 
(*num)++ : 0); - } - break; - case '[': s++; abort(); break; - case '(': - p = (unsigned char*) ++s; + case '{': + s++; + begin = zebra_prim(&s); + if (*s != '-') + { + logf(LOG_FATAL, "Bad range in char-map"); + return -1; + } + s++; + end = zebra_prim(&s); + if (end <= begin) + { + logf(LOG_FATAL, "Bad range in char-map"); + return -1; + } + s++; + for (c = begin; c <= end; c++) + { + str[0] = c; str[1] = '\0'; + (*fun)((char *) str, data, num ? (*num)++ : 0); + } + break; + case '[': s++; abort(); break; + case '(': + p = (unsigned char*) ++s; /* Find the end-marker, ignoring escapes */ - do + do + { + if (!(p = (unsigned char*) strchr((char*) p, ')'))) { - if (!(p = (unsigned char*) strchr((char*) p, ')'))) - { - logf(LOG_FATAL, "Missing ')' in string"); - return -1; - } + logf(LOG_FATAL, "Missing ')' in string"); + return -1; } - while (*(p - 1) == '\\'); - *p = 0; - (*fun)(s, data, num ? (*num)++ : 0); - s = (char*) p + 1; - break; - default: - c = prim(&s); - str[0] = c; str[1] = '\0'; - (*fun)((char *) str, data, num ? (*num)++ : 0); + } + while (*(p - 1) == '\\'); + *p = 0; + (*fun)(s, data, num ? (*num)++ : 0); + s = (char*) p + 1; + break; + default: + c = zebra_prim(&s); + str[0] = c; str[1] = '\0'; + (*fun)((char *) str, data, num ? 
(*num)++ : 0); } } return 0; @@ -355,20 +389,24 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only) char line[512], *argv[50]; chrmaptab res; int lineno = 0; + int errors = 0; int argc, num = (int) *CHR_BASE, i; + NMEM nmem; + logf (LOG_LOG, "maptab %s open", name); if (!(f = yaz_path_fopen(tabpath, name, "r"))) { logf(LOG_WARN|LOG_ERRNO, "%s", name); return 0; } - res = (chrmaptab) xmalloc(sizeof(*res)); - res->nmem = nmem_create (); - res->tmp_buf = (char **) - nmem_malloc (res->nmem, sizeof(*res->tmp_buf) * 100); + nmem = nmem_create (); + res = (chrmaptab) nmem_malloc(nmem, sizeof(*res)); + res->nmem = nmem; res->input = (chr_t_entry *) nmem_malloc(res->nmem, sizeof(*res->input)); - res->input->target = (unsigned char*) CHR_UNKNOWN; - res->input->equiv = 0; + res->input->target = (unsigned char **) + nmem_malloc(res->nmem, sizeof(*res->input->target) * 2); + res->input->target[0] = (unsigned char*) CHR_UNKNOWN; + res->input->target[1] = 0; res->input->children = (chr_t_entry **) nmem_malloc(res->nmem, sizeof(res->input) * 256); for (i = 0; i < 256; i++) @@ -376,38 +414,42 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only) res->input->children[i] = (chr_t_entry *) nmem_malloc(res->nmem, sizeof(*res->input)); res->input->children[i]->children = 0; + res->input->children[i]->target = (unsigned char **) + nmem_malloc (res->nmem, 2 * sizeof(unsigned char *)); + res->input->children[i]->target[1] = 0; if (map_only) { - res->input->children[i]->target = (unsigned char *) - nmem_malloc (res->nmem, 2 * sizeof(char)); - res->input->children[i]->target[0] = i; - res->input->children[i]->target[1] = 0; + res->input->children[i]->target[0] = (unsigned char *) + nmem_malloc (res->nmem, 2 * sizeof(unsigned char)); + res->input->children[i]->target[0][0] = i; + res->input->children[i]->target[0][1] = 0; } else - res->input->children[i]->target = (unsigned char*) CHR_UNKNOWN; - res->input->children[i]->equiv = 0; + 
res->input->children[i]->target[0] = (unsigned char*) CHR_UNKNOWN; } - res->query_equiv = 0; + res->q_input = (chr_t_entry *) + nmem_malloc(res->nmem, sizeof(*res->q_input)); + res->q_input->target = 0; + res->q_input->children = 0; + for (i = *CHR_BASE; i < 256; i++) res->output[i] = 0; res->output[(int) *CHR_SPACE] = (unsigned char *) " "; res->output[(int) *CHR_UNKNOWN] = (unsigned char*) "@"; res->base_uppercase = 0; - while ((argc = readconf_line(f, &lineno, line, 512, argv, 50))) + while (!errors && (argc = readconf_line(f, &lineno, line, 512, argv, 50))) if (!map_only && !yaz_matchstr(argv[0], "lowercase")) { if (argc != 2) { logf(LOG_FATAL, "Syntax error in charmap"); - fclose(f); - return 0; + ++errors; } if (scan_string(argv[1], fun_addentry, res, &num) < 0) { logf(LOG_FATAL, "Bad value-set specification"); - fclose(f); - return 0; + ++errors; } res->base_uppercase = num; res->output[(int) *CHR_SPACE + num] = (unsigned char *) " "; @@ -419,20 +461,17 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only) if (!res->base_uppercase) { logf(LOG_FATAL, "Uppercase directive with no lowercase set"); - fclose(f); - return 0; + ++errors; } if (argc != 2) { - logf(LOG_FATAL, "Syntax error in charmap"); - fclose(f); - return 0; + logf(LOG_FATAL, "Missing arg for uppercase directive"); + ++errors; } if (scan_string(argv[1], fun_addentry, res, &num) < 0) { logf(LOG_FATAL, "Bad value-set specification"); - fclose(f); - return 0; + ++errors; } } else if (!map_only && !yaz_matchstr(argv[0], "space")) @@ -440,14 +479,12 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only) if (argc != 2) { logf(LOG_FATAL, "Syntax error in charmap"); - fclose(f); - return 0; + ++errors; } if (scan_string(argv[1], fun_addspace, res, 0) < 0) { logf(LOG_FATAL, "Bad space specification"); - fclose(f); - return 0; + ++errors; } } else if (!yaz_matchstr(argv[0], "map")) @@ -456,37 +493,63 @@ chrmaptab chrmaptab_create(const char *tabpath, 
const char *name, int map_only) if (argc != 3) { - logf(LOG_FATAL, "charmap MAP directive requires 2 args"); - fclose(f); - return 0; + logf(LOG_FATAL, "charmap directive map requires 2 args"); + ++errors; } buf.map = res; buf.string[0] = '\0'; if (scan_string(argv[2], fun_mkstring, &buf, 0) < 0) { logf(LOG_FATAL, "Bad map target"); - fclose(f); - return 0; + ++errors; } - if (scan_string(argv[1], fun_addmap, &buf, 0) < 0) + if (scan_string(argv[1], fun_add_map, &buf, 0) < 0) { logf(LOG_FATAL, "Bad map source"); - fclose(f); - return 0; + ++errors; + } + } + else if (!yaz_matchstr(argv[0], "qmap")) /* qmap directive: same two-argument shape as map, but the source string is registered through fun_add_qmap */ + { + chrwork buf; + + if (argc != 3) + { + logf(LOG_FATAL, "charmap directive qmap requires 2 args"); + ++errors; + } + buf.map = res; /* NOTE(review): still reached when argc != 3, because the check above only counts the error; confirm argv[1] and argv[2] are safe to read in that case */ + buf.string[0] = '\0'; + if (scan_string(argv[2], fun_mkstring, &buf, 0) < 0) + { + logf(LOG_FATAL, "Bad qmap target"); + ++errors; + } + if (scan_string(argv[1], fun_add_qmap, &buf, 0) < 0) + { + logf(LOG_FATAL, "Bad qmap source"); + ++errors; } } else { logf(LOG_WARN, "Syntax error at '%s' in %s", line, name); } + fclose(f); /* the file is closed once here, now that the error branches above count errors instead of returning early */ + if (errors) /* any counted error discards the partially built table and makes the function return 0 */ + { + chrmaptab_destroy(res); + res = 0; + } + logf (LOG_LOG, "maptab %s close %d errors", name, errors); return res; } void chrmaptab_destroy(chrmaptab tab) { - nmem_destroy (tab->nmem); - xfree (tab); + if (tab) /* a NULL table is accepted and nothing is released */ + nmem_destroy (tab->nmem); /* the separate xfree (tab) is dropped by this change; presumably tab itself now lives in tab->nmem, TODO confirm against the allocation in chrmaptab_create */ } diff --git a/util/zebramap.c b/util/zebramap.c index 3d75a41..4e24daa 100644 --- a/util/zebramap.c +++ b/util/zebramap.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: zebramap.c,v $ - * Revision 1.15 1999-05-26 07:49:14 adam + * Revision 1.16 1999-09-07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.15 1999/05/26 07:49:14 adam * C++ compilation.
* * Revision 1.14 1999/02/19 10:37:40 adam @@ -67,6 +70,13 @@ #define ZEBRA_MAP_TYPE_SORT 1 #define ZEBRA_MAP_TYPE_INDEX 2 +struct zm_token { /* one replace rule, kept in a singly linked list headed by zebra_map.replace_tokens */ + char *token_from; /* pattern text, decoded from the first argument of a replace directive */ + char *token_to; /* replacement text decoded from the second argument, or 0 when the directive has none */ + int token_min; /* not assigned or read anywhere in the code visible here */ + struct zm_token *next; /* rule that was at the head of the list when this one was added, or 0 */ +}; + struct zebra_map { unsigned reg_id; int completeness; @@ -83,6 +93,7 @@ struct zebra_map { chrmaptab maptab; const char *maptab_name; struct zebra_map *next; + struct zm_token *replace_tokens; /* head of this map's replace rule list; 0 when there are no rules */ }; struct zebra_maps { @@ -92,6 +103,7 @@ struct zebra_maps { char temp_map_str[2]; const char *temp_map_ptr[2]; struct zebra_map **lookup_array; + WRBUF wrbuf_1, wrbuf_2; /* scratch buffers that zebra_replace alternates between */ }; void zebra_maps_close (ZebraMaps zms) @@ -103,6 +115,8 @@ void zebra_maps_close (ZebraMaps zms) chrmaptab_destroy (zm->maptab); zm = zm->next; } + wrbuf_free (zms->wrbuf_1, 1); /* release the two scratch buffers allocated in zebra_maps_open */ + wrbuf_free (zms->wrbuf_2, 1); nmem_destroy (zms->nmem); xfree (zms); } @@ -136,6 +150,7 @@ static void zebra_map_read (ZebraMaps zms, const char *name) (*zm)->type = ZEBRA_MAP_TYPE_INDEX; (*zm)->completeness = 0; (*zm)->positioned = 1; + (*zm)->replace_tokens = 0; /* a freshly declared index map starts with no replace rules */ } else if (!yaz_matchstr (argv[0], "sort") && argc == 2) { @@ -169,6 +184,43 @@ static void zebra_map_read (ZebraMaps zms, const char *name) if ((*zm)->type == ZEBRA_MAP_TYPE_SORT) (*zm)->u.sort.entry_size = atoi (argv[1]); } + else if (zm && !yaz_matchstr (argv[0], "replace") && argc >= 2) /* replace directive: argv[1] is the pattern, optional argv[2] the replacement; NOTE(review): the guard tests zm, not the map it points to, confirm that is intended */ + { + struct zm_token *token = nmem_malloc (zms->nmem, sizeof(*token)); + char *cp, *dp; + token->next = (*zm)->replace_tokens; /* push the new rule at the head of the current map's list */ + (*zm)->replace_tokens = token; + dp = token->token_from = nmem_strdup (zms->nmem, cp = argv[1]); /* dp writes into a fresh copy while cp reads the original argument */ + while (*cp) + { + if (*cp == '$') /* a dollar sign in the directive is stored as a space */ + { + *dp++ = ' '; + cp++; + } + else + *dp++ = zebra_prim(&cp); /* presumably decodes one, possibly escaped, character and advances cp; verify against zebra_prim */ + } + *dp = '\0'; + + if (argc >= 3) + { + dp = token->token_to = nmem_strdup (zms->nmem, cp = argv[2]); /* same decoding as for the pattern above */ + while (*cp) + { + if (*cp == '$') + { + *dp++ = ' '; + cp++; + } + else + *dp++ = zebra_prim(&cp); + } + *dp = '\0'; + } + else + token->token_to = 0; /* no replacement argument given: token_to stays 0 */ + } } if (zm) (*zm)->next = NULL; @@ -191,7 +243,8 @@ ZebraMaps zebra_maps_open
(Res res) int i; zms->nmem = nmem_create (); - zms->tabpath = nmem_strdup (zms->nmem, res_get_def (res, "profilePath", ".")); + zms->tabpath = nmem_strdup (zms->nmem, + res_get_def (res, "profilePath", ".")); zms->map_list = NULL; zms->temp_map_str[0] = '\0'; @@ -206,6 +259,9 @@ ZebraMaps zebra_maps_open (Res res) zms->lookup_array[i] = 0; if (!res || !res_trav (res, "index", zms, zms_map_handle)) zebra_map_read (zms, "default.idx"); + + zms->wrbuf_1 = wrbuf_alloc(); /* scratch buffers used by zebra_replace; freed in zebra_maps_close */ + zms->wrbuf_2 = wrbuf_alloc(); return zms; } @@ -261,19 +317,47 @@ const char **zebra_maps_input (ZebraMaps zms, unsigned reg_id, return zms->temp_map_ptr; } +#if 0 /* compiled out: unfinished draft, the body has no return statement after its loop */ +int zebra_maps_input_tokens (ZebraMaps zms, unsigned reg_id, + const char *input_str, int input_len, + WRBUF wrbuf) +{ + chrmaptab maptab = zebra_charmap_get (zms, reg_id); + int len[4]; + char *str[3]; + int input_i = 0; + int first = 1; + const char **out; + + if (!maptab) + { + wrbuf_write (wrbuf, input_str, input_len); + return -1; + } + str[0] = " "; + len[0] = 1; + str[1] = input_str; + len[1] = input_len; + str[2] = " "; + len[2] = 1; + len[3] = -1; + + out = chr_map_input (maptab, str, len); + while (len[1] > 0) + { + while (out && *out && **out == *CHR_SPACE) + out = chr_map_input (maptab, str, len); + } +} +#endif + const char *zebra_maps_output(ZebraMaps zms, unsigned reg_id, const char **from) { - chrmaptab maptab; - unsigned char i = (unsigned char) **from; - static char buf[2] = {0,0}; - - maptab = zebra_charmap_get (zms, reg_id); - if (maptab) - return chr_map_output (maptab, from, 1); - (*from)++; - buf[0] = i; - return buf; + chrmaptab maptab = zebra_charmap_get (zms, reg_id); + if (!maptab) /* no character map for reg_id: return 0 and leave the from pointer untouched, where the removed code passed one byte through a static buffer */ + return 0; + return chr_map_output (maptab, from, 1); /* otherwise delegate to the map, with a length argument of 1 */ } @@ -466,3 +550,108 @@ int zebra_maps_attr (ZebraMaps zms, Z_AttributesPlusTerm *zapt, } return 0; } +/* forward declaration: zebra_replace_sub is defined after zebra_replace, which calls it */ +int zebra_replace_sub(ZebraMaps zms, unsigned reg_id, const char *ex_list, + const char *input_str, int input_len, WRBUF wrbuf); +/* zebra_replace: copies input_len bytes of input_str into zms->wrbuf_1 and, if the map for reg_id has replace rules, runs zebra_replace_sub passes back and forth between wrbuf_1 and wrbuf_2 until a pass reports zero replacements. Returns whichever of the two buffers holds the final text; both belong to zms and wrbuf_1 is rewound on every call, so the result is only valid until the next call. ex_list is handed to zebra_replace_sub unchanged. */ +WRBUF zebra_replace(ZebraMaps zms, unsigned
reg_id, const char *ex_list, + const char *input_str, int input_len) +{ + struct zebra_map *zm = zebra_map_get (zms, reg_id); /* NOTE(review): zm is dereferenced below without a NULL check; confirm zebra_map_get cannot return NULL for reg_id */ + + wrbuf_rewind(zms->wrbuf_1); + wrbuf_write(zms->wrbuf_1, input_str, input_len); + if (!zm->replace_tokens) /* no replace rules: hand back the unmodified copy */ + return zms->wrbuf_1; + +#if 0 + logf (LOG_LOG, "zebra_replace"); + logf (LOG_LOG, "in:%.*s:", wrbuf_len(zms->wrbuf_1), + wrbuf_buf(zms->wrbuf_1)); +#endif + for (;;) /* alternate between the two buffers until a pass makes no replacement; NOTE(review): this would not terminate if some rule always matches its own output */ + { + if (!zebra_replace_sub(zms, reg_id, ex_list, wrbuf_buf(zms->wrbuf_1), + wrbuf_len(zms->wrbuf_1), zms->wrbuf_2)) + return zms->wrbuf_2; + if (!zebra_replace_sub(zms, reg_id, ex_list, wrbuf_buf(zms->wrbuf_2), + wrbuf_len(zms->wrbuf_2), zms->wrbuf_1)) + return zms->wrbuf_1; + } + return 0; /* not reached: the loop above only exits through its return statements */ +} +/* zebra_replace_sub: one left-to-right replacement pass. Rewinds wrbuf, then walks input_str starting at position -1 and running through input_len; positions outside the string read as a space, so a space in a pattern can match either end of the input. At each position the rules of the map for reg_id are tried in list order and the first full match wins. In a pattern, a dot matches any character other than space and that character is copied, lowercased, to the output; any other pattern character must equal the lowercased input character, and the first such literal match emits the rule's token_to. A rule is abandoned as soon as its next pattern character occurs in ex_list. Unmatched input bytes are copied through unchanged. Returns the number of replacements made in this pass. */ +int zebra_replace_sub(ZebraMaps zms, unsigned reg_id, const char *ex_list, + const char *input_str, int input_len, WRBUF wrbuf) +{ + int i = -1; + int no_replaces = 0; + struct zebra_map *zm = zebra_map_get (zms, reg_id); /* NOTE(review): used below without a NULL check */ + + wrbuf_rewind(wrbuf); + for (i = -1; i <= input_len; ) /* i is advanced inside the body: by one when nothing matched, by the pattern length after a match */ + { + struct zm_token *token; + char replace_string[128]; /* NOTE(review): filled below with no bound check on replace_out; a long token_to or many dot matches could overrun it */ + int replace_out; + int replace_in = 0; + + for (token = zm->replace_tokens; !replace_in && token; + token = token->next) + { + int j = 0; + int replace_done = 0; + replace_out = 0; /* output collected for a rule that failed part way is dropped here */ + for (;; j++) + { + int c; + if (!token->token_from[j]) + { + replace_in = j; /* end of pattern reached: j input positions are consumed by this match */ + break; + } + if (ex_list && strchr (ex_list, token->token_from[j])) /* pattern characters listed in ex_list disable the rule */ + break; + if (i+j < 0 || j+i >= input_len) + c = ' '; /* virtual space before the first and after the last input byte */ + else + c = tolower(input_str[j+i]); /* NOTE(review): a plain char is passed to tolower; bytes above 127 may be negative here, verify */ + if (token->token_from[j] == '.') + { + if (c == ' ') + break; + replace_string[replace_out++] = c; /* dot match: keep the lowercased input character */ + } + else + { + if (c != token->token_from[j]) + break; + if (!replace_done) /* token_to is emitted once, at the first literal character that matches */ + { + const char *cp = token->token_to; + replace_done = 1; + for (; cp && *cp; cp++) + replace_string[replace_out++] = *cp; + } + } + } + } + if (!replace_in) /* no rule matched at this position */ + { + if (i >= 0 && i < input_len) + wrbuf_putc(wrbuf, input_str[i]); + i++; + } + else + { + no_replaces++; + if (replace_out) + wrbuf_write(wrbuf, replace_string, replace_out); + i +=
replace_in; /* skip past the matched input */ + } + } +#if 0 + logf (LOG_LOG, "out:%.*s:", wrbuf_len(wrbuf), wrbuf_buf(wrbuf)); +#endif + return no_replaces; +} -- 1.7.10.4