From: Adam Dickmeiss Date: Tue, 24 Aug 2004 14:29:09 +0000 (+0000) Subject: Allow range to be specified in termlist, e.g. title:w:range(data,2,4) X-Git-Tag: snippet.version.1~396 X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=commitdiff_plain;h=cc4289d73c2b872c09d221d5e1188d3cdd9da438 Allow range to be specified in termlist, e.g. title:w:range(data,2,4) which indexes 4 characters from position 2 in title. --- diff --git a/data1/d1_absyn.c b/data1/d1_absyn.c index a913e4b..f2a48ec 100644 --- a/data1/d1_absyn.c +++ b/data1/d1_absyn.c @@ -1,5 +1,5 @@ -/* $Id: d1_absyn.c,v 1.10 2004-08-04 08:35:22 adam Exp $ - Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 +/* $Id: d1_absyn.c,v 1.11 2004-08-24 14:29:09 adam Exp $ + Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps This file is part of the Zebra server. @@ -362,28 +362,57 @@ const char * mk_xpath_regexp (data1_handle dh, char *expr) pop, 2002-12-13 */ static int parse_termlists (data1_handle dh, data1_termlist ***tpp, - char *p, const char *file, int lineno, + char *cp, const char *file, int lineno, const char *element_name, data1_absyn *res, int xpelement) { data1_termlist **tp = *tpp; - do + while(1) { char attname[512], structure[512]; char *source; - int r; - - if (!(r = sscanf(p, "%511[^:,]:%511[^,]", attname, - structure))) + int r, i; + int level = 0; + structure[0] = '\0'; + for (i = 0; cp[i] && inext = 0; if (!xpelement) { @@ -416,7 +445,7 @@ static int parse_termlists (data1_handle dh, data1_termlist ***tpp, nmem_strdup (data1_nmem_get (dh), structure); tp = &(*tp)->next; } - while ((p = strchr(p, ',')) && *(++p)); + *tpp = tp; return 0; } diff --git a/recctrl/recgrs.c b/recctrl/recgrs.c index 2e3a3a1..cc7e2f6 100644 --- a/recctrl/recgrs.c +++ b/recctrl/recgrs.c @@ -1,5 +1,5 @@ -/* $Id: recgrs.c,v 1.88 2004-08-06 13:36:23 adam Exp $ - Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003 +/* $Id: recgrs.c,v 1.89 2004-08-24 14:29:09 adam Exp $ + 
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps This file is part of the Zebra server. @@ -126,6 +126,138 @@ static void grs_destroy(void *clientData) xfree (h); } +struct source_parser { + int len; /* length of the current token; 0 when lookahead is a delimiter */ + const char *tok; /* start of the current token inside the source spec */ + const char *src; /* next unread character */ + int lookahead; /* 't' for a word token, else the delimiter character (0 at end of input) */ +}; + +static int sp_lex(struct source_parser *sp) +{ /* scan the next token: a run of non-delimiter characters, or one delimiter */ + while (*sp->src == ' ') + (sp->src)++; + sp->tok = sp->src; + sp->len = 0; + while (*sp->src && !strchr("<>();,-: ", *sp->src)) + { + sp->src++; + sp->len++; + } + if (sp->len) + sp->lookahead = 't'; + else + { + sp->lookahead = *sp->src; + if (*sp->src) + sp->src++; + } + return sp->lookahead; +} + + +static int sp_expr(struct source_parser *sp, data1_node *n, RecWord *wrd) +{ /* evaluate data | tag | attr(name) | range(expr,start,len) against n; returns 0 on a syntax error, 1 otherwise */ + if (sp->lookahead != 't') + return 0; + if (sp->len == 4 && !memcmp(sp->tok, "data", sp->len)) + { + if (n->which == DATA1N_data) + { + wrd->string = n->u.data.data; + wrd->length = n->u.data.len; + } + sp_lex(sp); + } + else if (sp->len == 3 && !memcmp(sp->tok, "tag", sp->len)) + { + if (n->which == DATA1N_tag) + { + wrd->string = n->u.tag.tag; + wrd->length = strlen(n->u.tag.tag); + } + sp_lex(sp); + } + else if (sp->len == 4 && !memcmp(sp->tok, "attr", sp->len)) + { + sp_lex(sp); + if (sp->lookahead != '(') + return 0; + sp_lex(sp); + if (sp->lookahead != 't') + return 0; + + if (n->which == DATA1N_tag) + { + data1_xattr *p = n->u.tag.attributes; + while (p && (strlen(p->name) != (size_t) sp->len || /* skip until length AND bytes both match */ + memcmp (p->name, sp->tok, sp->len))) + p = p->next; + if (p) + { + wrd->string = p->value; + wrd->length = strlen(p->value); + } + } + sp_lex(sp); + if (sp->lookahead != ')') + return 0; + sp_lex(sp); + } + else if (sp->len == 5 && !memcmp(sp->tok, "range", sp->len)) + { + int start, len; + sp_lex(sp); + if (sp->lookahead != '(') + return 0; + + sp_lex(sp); + sp_expr(sp, n, wrd); + if (sp->lookahead != ',') + return 0; + + sp_lex(sp); + if (sp->lookahead != 't') + return 0; + start = atoi_n(sp->tok, sp->len); + + sp_lex(sp); + if (sp->lookahead != ',') + return 0; + 
+ sp_lex(sp); + if (sp->lookahead != 't') + return 0; + len = atoi_n(sp->tok, sp->len); + + sp_lex(sp); + if (sp->lookahead != ')') + return 0; + + sp_lex(sp); + if (wrd->string && wrd->length) + { + if (start > wrd->length) /* clamp: a start beyond the data would leave a negative length */ + start = wrd->length; + wrd->string += start; + wrd->length = (wrd->length - start > len) ? len : wrd->length - start; + } + } + return 1; +} + +static int sp_parse(data1_node *n, RecWord *wrd, const char *src) +{ /* parse source spec src for node n, storing any selected text in wrd; returns sp_expr's result */ + struct source_parser sp; + sp.len = 0; + sp.tok = 0; + sp.src = src; + sp.lookahead = 0; + sp_lex(&sp); + + return sp_expr(&sp, n, wrd); +} + int d1_check_xpath_predicate(data1_node *n, struct xpath_predicate *p) { int res = 1; @@ -355,6 +487,8 @@ static void index_xpath (data1_node *n, struct recExtractCtrl *p, size_t flen = 0; data1_node *nn; int termlist_only = 1; + data1_termlist *tl; + int xpdone = 0; yaz_log(LOG_DEBUG, "index_xpath level=%d use=%d", level, use); if ((!n->root->u.root.absyn) || @@ -367,58 +501,88 @@ static void index_xpath (data1_node *n, struct recExtractCtrl *p, case DATA1N_data: wrd->string = n->u.data.data; wrd->length = n->u.data.len; - if (p->flagShowRecords) - { - printf("%*s XData:\"", (level + 1) * 4, ""); - for (i = 0; ilength && i < 8; i++) - fputc (wrd->string[i], stdout); - printf("\"\n"); - } - else { - data1_termlist *tl; - int xpdone = 0; - flen = 0; - - /* we have to fetch the whole path to the data tag */ - for (nn = n; nn; nn = nn->parent) { - if (nn->which == DATA1N_tag) { - size_t tlen = strlen(nn->u.tag.tag); - if (tlen + flen > (sizeof(tag_path_full)-2)) return; - memcpy (tag_path_full + flen, nn->u.tag.tag, tlen); - flen += tlen; - tag_path_full[flen++] = '/'; - } - else if (nn->which == DATA1N_root) break; - } - - tag_path_full[flen] = 0; + xpdone = 0; + flen = 0; - /* If we have a matching termlist... */ - if (n->root->u.root.absyn && (tl = xpath_termlist_by_tagpath(tag_path_full, n))) { - for (; tl; tl = tl->next) { - wrd->reg_type = *tl->structure; - /* this is the ! 
case, so structure is for the xpath index */ - if (!tl->att) { - wrd->attrSet = VAL_IDXPATH; - wrd->attrUse = use; - (*p->tokenAdd)(wrd); - xpdone = 1; - } else { - /* this is just the old fashioned attribute based index */ - wrd->attrSet = (int) (tl->att->parent->reference); - wrd->attrUse = tl->att->locals->local; - (*p->tokenAdd)(wrd); - } - } - } - /* xpath indexing is done, if there was no termlist given, - or no ! in the termlist, and default indexing is enabled... */ - if ((!xpdone) && (!termlist_only)) { - wrd->attrSet = VAL_IDXPATH; - wrd->attrUse = use; - wrd->reg_type = 'w'; - (*p->tokenAdd)(wrd); - } + /* we have to fetch the whole path to the data tag */ + for (nn = n; nn; nn = nn->parent) { + if (nn->which == DATA1N_tag) { + size_t tlen = strlen(nn->u.tag.tag); + if (tlen + flen > (sizeof(tag_path_full)-2)) return; + memcpy (tag_path_full + flen, nn->u.tag.tag, tlen); + flen += tlen; + tag_path_full[flen++] = '/'; + } + else if (nn->which == DATA1N_root) break; + } + + tag_path_full[flen] = 0; + + /* If we have a matching termlist... */ + if (n->root->u.root.absyn && + (tl = xpath_termlist_by_tagpath(tag_path_full, n))) + { + for (; tl; tl = tl->next) + { + /* need to copy recword because it may be changed */ + RecWord wrd_tl; + wrd->reg_type = *tl->structure; + /* this is the ! 
case, so structure is for the xpath index */ + memcpy (&wrd_tl, wrd, sizeof(*wrd)); + if (tl->source) + sp_parse(n, &wrd_tl, tl->source); + if (!tl->att) { + wrd_tl.attrSet = VAL_IDXPATH; + wrd_tl.attrUse = use; + if (p->flagShowRecords) + { + int i; + printf("%*sXPath index", (level + 1) * 4, ""); + printf (" XData:\""); + for (i = 0; i 40) + printf (" ..."); + fputc ('\n', stdout); + } + else + (*p->tokenAdd)(&wrd_tl); + xpdone = 1; + } else { + /* this is just the old fashioned attribute based index */ + wrd_tl.attrSet = (int) (tl->att->parent->reference); + wrd_tl.attrUse = tl->att->locals->local; + if (p->flagShowRecords) + { + int i; + printf("%*sIdx: [%s]", (level + 1) * 4, "", + tl->structure); + printf("%s:%s [%d] %s", + tl->att->parent->name, + tl->att->name, tl->att->value, + tl->source); + printf (" XData:\""); + for (i = 0; i 40) + printf (" ..."); + fputc ('\n', stdout); + } + else + (*p->tokenAdd)(&wrd_tl); + } + } + } + /* xpath indexing is done, if there was no termlist given, + or no ! in the termlist, and default indexing is enabled... 
*/ + if (!p->flagShowRecords && !xpdone && !termlist_only) + { + wrd->attrSet = VAL_IDXPATH; + wrd->attrUse = use; + wrd->reg_type = 'w'; + (*p->tokenAdd)(wrd); } break; case DATA1N_tag: @@ -612,33 +776,11 @@ static void index_termlist (data1_node *par, data1_node *n, for (; tlist; tlist = tlist->next) { - - char xattr[512]; /* consider source */ wrd->string = 0; + assert(tlist->source); + sp_parse(n, wrd, tlist->source); - if (!strcmp (tlist->source, "data") && n->which == DATA1N_data) - { - wrd->string = n->u.data.data; - wrd->length = n->u.data.len; - } - else if (!strcmp (tlist->source, "tag") && n->which == DATA1N_tag) - { - wrd->string = n->u.tag.tag; - wrd->length = strlen(n->u.tag.tag); - } - else if (sscanf (tlist->source, "attr(%511[^)])", xattr) == 1 && - n->which == DATA1N_tag) - { - data1_xattr *p = n->u.tag.attributes; - while (p && strcmp (p->name, xattr)) - p = p->next; - if (p) - { - wrd->string = p->value; - wrd->length = strlen(p->value); - } - } if (wrd->string) { if (p->flagShowRecords) diff --git a/test/marcxml/record.abs b/test/marcxml/record.abs index 46c5fd6..5524a11 100644 --- a/test/marcxml/record.abs +++ b/test/marcxml/record.abs @@ -1,4 +1,4 @@ -# $Id: record.abs,v 1.3 2004-01-15 13:30:32 adam Exp $ +# $Id: record.abs,v 1.4 2004-08-24 14:29:09 adam Exp $ name marcxml attset bib1.att @@ -10,6 +10,7 @@ marc usmarc.mar xpath disable +xelm /record/controlfield[@tag="008"] Code-Language:w xelm /record/datafield[@tag="100"]/subfield[@code="a"] author:w,author:s xelm /record/datafield[@tag="245"]/subfield title:w diff --git a/test/marcxml/test1.sh b/test/marcxml/test1.sh index c6bfe5a..7921101 100755 --- a/test/marcxml/test1.sh +++ b/test/marcxml/test1.sh @@ -1,5 +1,5 @@ #!/bin/sh -# $Id: test1.sh,v 1.4 2004-06-15 09:43:33 adam Exp $ +# $Id: test1.sh,v 1.5 2004-08-24 14:29:09 adam Exp $ pp=${srcdir:-"."} @@ -15,7 +15,7 @@ fi ../../index/zebraidx -c $pp/zebra.cfg -l $LOG $DBG update $pp/m*.xml ../../index/zebrasrv -c $pp/zebra.cfg -l $LOG $DBG 
unix:socket & sleep 1 -../api/testclient unix:socket '@and @attr 1=1003 jack @attr 1=4 computer' >tmp1 +../api/testclient unix:socket '@and @attr 1=54 eng @and @attr 1=1003 jack @attr 1=4 computer' >tmp1 echo 'Result count: 2' >tmp2 kill `cat zebrasrv.pid` || exit 1 diff tmp1 tmp2 || exit 2