From: Adam Dickmeiss <adam@indexdata.dk>
Date: Tue, 24 Aug 2004 14:29:09 +0000 (+0000)
Subject: Allow range to be specified in termlist, e.g. title:w:range(data,2,4)
X-Git-Tag: snippet.version.1~396
X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=commitdiff_plain;h=cc4289d73c2b872c09d221d5e1188d3cdd9da438

Allow range to be specified in termlist, e.g. title:w:range(data,2,4)
which indexes 4 characters from position 2 in title.
---

diff --git a/data1/d1_absyn.c b/data1/d1_absyn.c
index a913e4b..f2a48ec 100644
--- a/data1/d1_absyn.c
+++ b/data1/d1_absyn.c
@@ -1,5 +1,5 @@
-/* $Id: d1_absyn.c,v 1.10 2004-08-04 08:35:22 adam Exp $
-   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
+/* $Id: d1_absyn.c,v 1.11 2004-08-24 14:29:09 adam Exp $
+   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
    Index Data Aps
 
 This file is part of the Zebra server.
@@ -362,28 +362,57 @@ const char * mk_xpath_regexp (data1_handle dh, char *expr)
    pop, 2002-12-13
  */
 static int parse_termlists (data1_handle dh, data1_termlist ***tpp,
-			    char *p, const char *file, int lineno,
+			    char *cp, const char *file, int lineno,
 			    const char *element_name, data1_absyn *res,
 			    int xpelement)
 {
     data1_termlist **tp = *tpp;
-    do
+    while(1)
     {
 	char attname[512], structure[512];
 	char *source;
-	int r;
-	
-	if (!(r = sscanf(p, "%511[^:,]:%511[^,]", attname,
-			 structure)))
+	int r, i;
+	int level = 0;
+	structure[0] = '\0';
+	for (i = 0; cp[i] && i<sizeof(attname)-1; i++)
+	    if (strchr(":,", cp[i]))
+		break;
+	    else
+		attname[i] = cp[i];
+	if (i == 0)
 	{
-	    yaz_log(LOG_WARN,
-		    "%s:%d: Syntax error in termlistspec '%s'",
-		    file, lineno, p);
-	    return -1;
+	    if (*cp)
+		yaz_log(LOG_WARN,
+			"%s:%d: Syntax error in termlistspec '%s'",
+			file, lineno, cp);
+	    break;
 	}
+	attname[i] = '\0';
+	r = 1;
+	cp += i;
+	if (*cp == ':')
+	    cp++;
+
+	for (i = 0; cp[i] && i<sizeof(structure)-1; i++)
+	    if (level == 0 && strchr(",", cp[i]))
+		break;
+	    else
+	    {
+		structure[i] = cp[i];
+		if (cp[i] == '(')
+		    level++;
+		else if (cp[i] == ')')
+		    level--;
+	    }
+	structure[i] = '\0';
+	if (i)
+	    r = 2;
+	cp += i;
+	if (*cp)
+	    cp++;  /* skip , */
 
 	*tp = (data1_termlist *)
-	  nmem_malloc(data1_nmem_get(dh), sizeof(**tp));
+	    nmem_malloc(data1_nmem_get(dh), sizeof(**tp));
 	(*tp)->next = 0;
         
 	if (!xpelement) {
@@ -416,7 +445,7 @@ static int parse_termlists (data1_handle dh, data1_termlist ***tpp,
 		nmem_strdup (data1_nmem_get (dh), structure);
 	tp = &(*tp)->next;
     }
-    while ((p = strchr(p, ',')) && *(++p));
+
     *tpp = tp;
     return 0;
 }
diff --git a/recctrl/recgrs.c b/recctrl/recgrs.c
index 2e3a3a1..cc7e2f6 100644
--- a/recctrl/recgrs.c
+++ b/recctrl/recgrs.c
@@ -1,5 +1,5 @@
-/* $Id: recgrs.c,v 1.88 2004-08-06 13:36:23 adam Exp $
-   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003
+/* $Id: recgrs.c,v 1.89 2004-08-24 14:29:09 adam Exp $
+   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
    Index Data Aps
 
 This file is part of the Zebra server.
@@ -126,6 +126,138 @@ static void grs_destroy(void *clientData)
     xfree (h);
 }
 
+struct source_parser {
+    int len;              /* number of characters in the current token */
+    const char *tok;      /* start of current token; ends after len chars */
+    const char *src;      /* read cursor: next unscanned character */
+    int lookahead;        /* 't' for a word token, else the char at cursor */
+};
+
+/* Scan the next token from sp->src.  Leading blanks are skipped; sp->tok
+   and sp->len then describe the run of non-delimiter characters found.
+   A non-empty run yields lookahead 't'; otherwise lookahead is the single
+   character at the cursor (consumed unless it is the terminating NUL). */
+static int sp_lex(struct source_parser *sp)
+{
+    const char *cur = sp->src;
+
+    while (*cur == ' ')
+	cur++;
+    sp->tok = cur;
+    while (*cur && !strchr("<>();,-: ", *cur))
+	cur++;
+    sp->len = (int) (cur - sp->tok);
+    if (sp->len)
+	sp->lookahead = 't';
+    else
+	sp->lookahead = *cur ? *cur++ : 0;
+    sp->src = cur;
+    return sp->lookahead;
+}
+
+
+/* Evaluate the source expression at the current token against node n:
+   data, tag, attr(NAME) or range(EXPR,START,LEN).  Selected bytes go to
+   wrd->string/wrd->length.  Returns 0 if an expected token is missing. */
+static int sp_expr(struct source_parser *sp, data1_node *n, RecWord *wrd)
+{
+    if (sp->lookahead != 't')
+	return 0;
+    if (sp->len == 4 && !memcmp(sp->tok, "data", sp->len))
+    {
+	if (n->which == DATA1N_data)
+	{
+	    wrd->string = n->u.data.data;
+	    wrd->length = n->u.data.len;
+	}
+	sp_lex(sp);
+    }
+    else if (sp->len == 3 && !memcmp(sp->tok, "tag", sp->len))
+    {
+	if (n->which == DATA1N_tag)
+	{
+	    wrd->string = n->u.tag.tag;
+	    wrd->length = strlen(n->u.tag.tag);
+	}
+	sp_lex(sp);
+    }
+    else if (sp->len == 4 && !memcmp(sp->tok, "attr", sp->len))
+    {
+	sp_lex(sp);
+	if (sp->lookahead != '(')
+	    return 0;
+	sp_lex(sp);
+	if (sp->lookahead != 't')
+	    return 0;
+	if (n->which == DATA1N_tag)
+	{
+	    data1_xattr *p = n->u.tag.attributes;
+	    /* a match needs equal length AND equal bytes: skip on either */
+	    while (p && ((int) strlen(p->name) != sp->len ||
+			 memcmp (p->name, sp->tok, sp->len)))
+		p = p->next;
+	    if (p)
+	    {
+		wrd->string = p->value;
+		wrd->length = strlen(p->value);
+	    }
+	}
+	sp_lex(sp);
+	if (sp->lookahead != ')')
+	    return 0;
+	sp_lex(sp);
+    }
+    else if (sp->len == 5 && !memcmp(sp->tok, "range", sp->len))
+    {
+	int start, len;
+	sp_lex(sp);
+	if (sp->lookahead != '(')
+	    return 0;
+	sp_lex(sp);
+	sp_expr(sp, n, wrd);     /* operand: selects the string to slice */
+	if (sp->lookahead != ',')
+	    return 0;
+	sp_lex(sp);
+	if (sp->lookahead != 't')
+	    return 0;
+	start = atoi_n(sp->tok, sp->len);
+	sp_lex(sp);
+	if (sp->lookahead != ',')
+	    return 0;
+	sp_lex(sp);
+	if (sp->lookahead != 't')
+	    return 0;
+	len = atoi_n(sp->tok, sp->len);
+	sp_lex(sp);
+	if (sp->lookahead != ')')
+	    return 0;
+	sp_lex(sp);
+	if (wrd->string && wrd->length)
+	{
+	    /* clamp: keep the pointer inside the value and length >= 0 */
+	    if (start < 0 || start > wrd->length)
+		start = start < 0 ? 0 : wrd->length;
+	    wrd->string += start;
+	    wrd->length -= start;
+	    if (wrd->length > len)
+		wrd->length = len < 0 ? 0 : len;
+	}
+    }
+    return 1;
+}
+
+/* Evaluate source spec src against node n, filling wrd via sp_expr. */
+static int sp_parse(data1_node *n, RecWord *wrd, const char *src)
+{
+    struct source_parser sp;
+    sp.src = src;
+    sp.tok = 0;
+    sp.len = 0;
+    sp.lookahead = 0;
+    sp_lex(&sp);     /* load the first token before parsing */
+    return sp_expr(&sp, n, wrd);
+}
+
 int d1_check_xpath_predicate(data1_node *n, struct xpath_predicate *p)
 {
     int res = 1;
@@ -355,6 +487,8 @@ static void index_xpath (data1_node *n, struct recExtractCtrl *p,
     size_t flen = 0;
     data1_node *nn;
     int termlist_only = 1;
+    data1_termlist *tl;
+    int xpdone = 0;
 
     yaz_log(LOG_DEBUG, "index_xpath level=%d use=%d", level, use);
     if ((!n->root->u.root.absyn) ||
@@ -367,58 +501,88 @@ static void index_xpath (data1_node *n, struct recExtractCtrl *p,
     case DATA1N_data:
         wrd->string = n->u.data.data;
         wrd->length = n->u.data.len;
-        if (p->flagShowRecords)
-        {
-            printf("%*s XData:\"", (level + 1) * 4, "");
-            for (i = 0; i<wrd->length && i < 8; i++)
-                fputc (wrd->string[i], stdout);
-            printf("\"\n");
-        }  
-        else  {
-            data1_termlist *tl;
-            int xpdone = 0;
-            flen = 0;
-            
-            /* we have to fetch the whole path to the data tag */
-            for (nn = n; nn; nn = nn->parent) {
-                if (nn->which == DATA1N_tag) {
-                    size_t tlen = strlen(nn->u.tag.tag);
-                    if (tlen + flen > (sizeof(tag_path_full)-2)) return;
-                    memcpy (tag_path_full + flen, nn->u.tag.tag, tlen);
-                    flen += tlen;
-                    tag_path_full[flen++] = '/';
-                }
-                else if (nn->which == DATA1N_root)  break;
-            }
-
-            tag_path_full[flen] = 0;
+        xpdone = 0;
+        flen = 0;
             
-            /* If we have a matching termlist... */
-            if (n->root->u.root.absyn && (tl = xpath_termlist_by_tagpath(tag_path_full, n))) {
-                for (; tl; tl = tl->next) {
-                    wrd->reg_type = *tl->structure;
-                    /* this is the ! case, so structure is for the xpath index */
-                    if (!tl->att) {
-                        wrd->attrSet = VAL_IDXPATH;
-                        wrd->attrUse = use;
-                        (*p->tokenAdd)(wrd);
-                        xpdone = 1;
-                    } else {
-                        /* this is just the old fashioned attribute based index */
-                        wrd->attrSet = (int) (tl->att->parent->reference);
-                        wrd->attrUse = tl->att->locals->local;
-                        (*p->tokenAdd)(wrd);
-                    }
-                }
-            }
-            /* xpath indexing is done, if there was no termlist given, 
-               or no ! in the termlist, and default indexing is enabled... */
-            if ((!xpdone) && (!termlist_only)) {
-                wrd->attrSet = VAL_IDXPATH;
-                wrd->attrUse = use;
-                wrd->reg_type = 'w';
-                (*p->tokenAdd)(wrd);
-            }
+	/* we have to fetch the whole path to the data tag */
+	for (nn = n; nn; nn = nn->parent) {
+	    if (nn->which == DATA1N_tag) {
+		size_t tlen = strlen(nn->u.tag.tag);
+		if (tlen + flen > (sizeof(tag_path_full)-2)) return;
+		memcpy (tag_path_full + flen, nn->u.tag.tag, tlen);
+		flen += tlen;
+		tag_path_full[flen++] = '/';
+	    }
+	    else if (nn->which == DATA1N_root)  break;
+	}
+	
+	tag_path_full[flen] = 0;
+	
+	/* If we have a matching termlist... */
+	if (n->root->u.root.absyn && 
+	    (tl = xpath_termlist_by_tagpath(tag_path_full, n)))
+	{
+	    for (; tl; tl = tl->next)
+	    {
+		/* need to copy recword because it may be changed */
+		RecWord wrd_tl;
+		wrd->reg_type = *tl->structure;
+		/* this is the ! case, so structure is for the xpath index */
+		memcpy (&wrd_tl, wrd, sizeof(*wrd));
+		if (tl->source)
+		    sp_parse(n, &wrd_tl, tl->source);
+		if (!tl->att) {
+		    wrd_tl.attrSet = VAL_IDXPATH;
+		    wrd_tl.attrUse = use;
+		    if (p->flagShowRecords)
+		    {
+			int i;
+		        printf("%*sXPath index", (level + 1) * 4, "");
+			printf (" XData:\"");
+			for (i = 0; i<wrd_tl.length && i < 40; i++)
+			    fputc (wrd_tl.string[i], stdout);
+			fputc ('"', stdout);
+			if (wrd_tl.length > 40)
+			    printf (" ...");
+			fputc ('\n', stdout);
+		    }
+		    else
+			(*p->tokenAdd)(&wrd_tl);
+		    xpdone = 1;
+		} else {
+		    /* this is just the old fashioned attribute based index */
+		    wrd_tl.attrSet = (int) (tl->att->parent->reference);
+		    wrd_tl.attrUse = tl->att->locals->local;
+		    if (p->flagShowRecords)
+		    {
+			int i;
+			printf("%*sIdx: [%s]", (level + 1) * 4, "",
+			       tl->structure);
+			printf("%s:%s [%d] %s",
+			       tl->att->parent->name,
+			       tl->att->name, tl->att->value,
+			       tl->source);
+			printf (" XData:\"");
+			for (i = 0; i<wrd_tl.length && i < 40; i++)
+			    fputc (wrd_tl.string[i], stdout);
+			fputc ('"', stdout);
+			if (wrd_tl.length > 40)
+			    printf (" ...");
+			fputc ('\n', stdout);
+		    }
+		    else
+			(*p->tokenAdd)(&wrd_tl);
+		}
+	    }
+	}
+	/* xpath indexing is done, if there was no termlist given, 
+	   or no ! in the termlist, and default indexing is enabled... */
+	if (!p->flagShowRecords && !xpdone && !termlist_only)
+	{
+	    wrd->attrSet = VAL_IDXPATH;
+	    wrd->attrUse = use;
+	    wrd->reg_type = 'w';
+	    (*p->tokenAdd)(wrd);
 	}
         break;
     case DATA1N_tag:
@@ -612,33 +776,11 @@ static void index_termlist (data1_node *par, data1_node *n,
     
     for (; tlist; tlist = tlist->next)
     {
-
-	char xattr[512];
 	/* consider source */
 	wrd->string = 0;
+	assert(tlist->source);
+	sp_parse(n, wrd, tlist->source);
 
-	if (!strcmp (tlist->source, "data") && n->which == DATA1N_data)
-	{
-	    wrd->string = n->u.data.data;
-	    wrd->length = n->u.data.len;
-	}
-	else if (!strcmp (tlist->source, "tag") && n->which == DATA1N_tag)
-        {
-	    wrd->string = n->u.tag.tag;
-	    wrd->length = strlen(n->u.tag.tag);
-	}
-	else if (sscanf (tlist->source, "attr(%511[^)])", xattr) == 1 &&
-	    n->which == DATA1N_tag)
-	{
-	    data1_xattr *p = n->u.tag.attributes;
-	    while (p && strcmp (p->name, xattr))
-		p = p->next;
-	    if (p)
-	    {
-		wrd->string = p->value;
-		wrd->length = strlen(p->value);
-	    }
-	}
 	if (wrd->string)
 	{
 	    if (p->flagShowRecords)
diff --git a/test/marcxml/record.abs b/test/marcxml/record.abs
index 46c5fd6..5524a11 100644
--- a/test/marcxml/record.abs
+++ b/test/marcxml/record.abs
@@ -1,4 +1,4 @@
-# $Id: record.abs,v 1.3 2004-01-15 13:30:32 adam Exp $
+# $Id: record.abs,v 1.4 2004-08-24 14:29:09 adam Exp $
 
 name marcxml
 attset bib1.att
@@ -10,6 +10,7 @@ marc usmarc.mar
 
 xpath disable
 
+xelm /record/controlfield[@tag="008"]		Code-Language:w
 xelm /record/datafield[@tag="100"]/subfield[@code="a"]	author:w,author:s
 xelm /record/datafield[@tag="245"]/subfield		title:w
 
diff --git a/test/marcxml/test1.sh b/test/marcxml/test1.sh
index c6bfe5a..7921101 100755
--- a/test/marcxml/test1.sh
+++ b/test/marcxml/test1.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-# $Id: test1.sh,v 1.4 2004-06-15 09:43:33 adam Exp $
+# $Id: test1.sh,v 1.5 2004-08-24 14:29:09 adam Exp $
 
 pp=${srcdir:-"."}
 
@@ -15,7 +15,7 @@ fi
 ../../index/zebraidx -c $pp/zebra.cfg -l $LOG $DBG update $pp/m*.xml
 ../../index/zebrasrv -c $pp/zebra.cfg -l $LOG $DBG unix:socket &
 sleep 1
-../api/testclient unix:socket '@and @attr 1=1003 jack @attr 1=4 computer' >tmp1
+../api/testclient unix:socket '@and @attr 1=54 eng @and @attr 1=1003 jack @attr 1=4 computer' >tmp1
 echo 'Result count: 2' >tmp2
 kill `cat zebrasrv.pid` || exit 1
 diff tmp1 tmp2 || exit 2