From 69983aaec470e41649c4dfae8a9e7cbcf061cacf Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Wed, 29 Jun 2005 16:52:26 +0000 Subject: [PATCH] Fixed bug #317: xelm only indexes cdata at matching node (not children). --- data1/d1_absyn.c | 104 +++++++++++++++++--------------- include/d1_absyn.h | 21 +++++++ recctrl/recgrs.c | 148 +++++++++++++++++++++------------------------- test/xpath/assembled.abs | 1 + test/xpath/xpath6.c | 7 +-- 5 files changed, 150 insertions(+), 131 deletions(-) diff --git a/data1/d1_absyn.c b/data1/d1_absyn.c index 45724a9..34c4dd1 100644 --- a/data1/d1_absyn.c +++ b/data1/d1_absyn.c @@ -1,4 +1,4 @@ -/* $Id: d1_absyn.c,v 1.20 2005-06-23 06:45:46 adam Exp $ +/* $Id: d1_absyn.c,v 1.21 2005-06-29 16:52:26 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -439,65 +439,73 @@ void fix_element_ref (data1_handle dh, data1_absyn *absyn, data1_element *e) */ -const char * mk_xpath_regexp (data1_handle dh, char *expr) +static const char * mk_xpath_regexp (data1_handle dh, const char *expr) { - char *p = expr; - char *pp; - char *s; + const char *p = expr; int abs = 1; int i; - int j; - int e=0; - int is_predicate = 0; + int e = 0; + char *stack[32]; + char *res_p, *res = 0; + size_t res_size = 1; - static char *stack[32]; - static char res[1024]; - char *r = ""; - - if (*p != '/') { return (""); } + if (*p != '/') + return (""); p++; - if (*p == '/') { abs=0; p++; } - - while (*p) { - i=0; - while (*p && !strchr("/",*p)) { - i++; p++; - } - stack[e] = (char *) nmem_malloc (data1_nmem_get (dh), i+1); + if (*p == '/') + { + abs =0; + p++; + } + while (*p) + { + int is_predicate = 0; + char *s; + int j; + for (i = 0; *p && !strchr("/",*p); i++, p++) + ; + res_size += (i+3); /* we'll add / between later .. */ + stack[e] = (char *) nmem_malloc(data1_nmem_get(dh), i+1); s = stack[e]; - for (j=0; j< i; j++) { - pp = p-i+j; - if (*pp == '[') { - is_predicate=1; - } - else if (*pp == ']') { - is_predicate=0; - } - else { - if (!is_predicate) { - if (*pp == '*') - *s++ = '.'; - *s++ = *pp; + for (j = 0; j < i; j++) + { + const char *pp = p-i+j; + if (*pp == '[') + is_predicate=1; + else if (*pp == ']') + is_predicate=0; + else + { + if (!is_predicate) { + if (*pp == '*') + *s++ = '.'; + *s++ = *pp; + } } - } } *s = 0; e++; - if (*p) {p++;} + if (*p) + p++; } - e--; p = &res[0]; i=0; - sprintf (p, "^"); p++; - while (e >= 0) { - /* !!! res size is not checked !!! */ - sprintf (p, "%s/",stack[e]); - p += strlen(stack[e]) + 1; - e--; + res_p = res = nmem_malloc(data1_nmem_get(dh), res_size + 10); + + i = 0; + sprintf(res_p, ".*/"); + res_p = res_p + strlen(res_p); + while (--e >= 0) { + sprintf(res_p, "%s/", stack[e]); + res_p += strlen(stack[e]) + 1; } - if (!abs) { sprintf (p, ".*"); p+=2; } - sprintf (p, "$"); p++; - r = nmem_strdup (data1_nmem_get (dh), res); - yaz_log(YLOG_DEBUG,"Got regexp: %s",r); - return (r); + if (!abs) + { + sprintf(res_p, ".*"); + res_p += 2; + } + sprintf (res_p, "$"); + res_p++; + yaz_log(YLOG_DEBUG, "Got regexp: %s", res); + return res; } /* *ostrich* diff --git a/include/d1_absyn.h b/include/d1_absyn.h index 771911a..80556ed 100644 --- a/include/d1_absyn.h +++ b/include/d1_absyn.h @@ -1,3 +1,24 @@ +/* $Id: d1_absyn.h,v 1.2 2005-06-29 16:52:27 adam Exp $ + Copyright (C) 1995-2005 + Index Data ApS + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ #ifndef D1_ABSYN_H #define D1_ABSYN_H 1 diff --git a/recctrl/recgrs.c b/recctrl/recgrs.c index 99eb954..eb80884 100644 --- a/recctrl/recgrs.c +++ b/recctrl/recgrs.c @@ -1,4 +1,4 @@ -/* $Id: recgrs.c,v 1.102 2005-06-23 06:45:47 adam Exp $ +/* $Id: recgrs.c,v 1.103 2005-06-29 16:52:27 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -337,6 +337,33 @@ int d1_check_xpath_predicate(data1_node *n, struct xpath_predicate *p) } +static int dfa_match_first(struct DFA_state **dfaar, const char *text) +{ + struct DFA_state *s = dfaar[0]; /* start state */ + struct DFA_tran *t; + int i; + const char *p = text; + unsigned char c; + + for (c = *p++, t = s->trans, i = s->tran_no; --i >= 0; t++) + if (c >= t->ch[0] && c <= t->ch[1]) + { + while (i >= 0) + { + /* move to next state and return if we get a match */ + s = dfaar[t->to]; + if (s->rule_no) + return 1; + /* next char */ + c = *p++; + for (t = s->trans, i = s->tran_no; --i >= 0; t++) + if (c >= t->ch[0] && c <= t->ch[1]) + break; + } + } + return 0; +} + /* *ostrich* New function, looking for xpath "element" definitions in abs, by @@ -362,44 +389,20 @@ data1_termlist *xpath_termlist_by_tagpath(char *tagpath, data1_node *n) #ifdef ENHANCED_XELM struct xpath_location_step *xp; #endif - char *pexpr = xmalloc(strlen(tagpath)+2); + char *pexpr = xmalloc(strlen(tagpath)+5); int ok = 0; - sprintf (pexpr, "%s\n", tagpath); - yaz_log(YLOG_DEBUG, "Checking tagpath %s",tagpath); + sprintf (pexpr, "/%s\n", tagpath); + yaz_log(YLOG_LOG, "Checking tagpath %s", pexpr); for (; xpe; xpe = xpe->next) { - struct DFA_state **dfaar = xpe->dfa->states; - struct DFA_state *s = dfaar[0]; - struct DFA_tran *t = s->trans; - int i = s->tran_no; - unsigned char c = *pexpr++; - int start_line = 1; - - if ((c >= t->ch[0] && c <= t->ch[1]) || (!t->ch[0])) - { - const char *p = pexpr; - do - { - if ((s = dfaar[t->to])->rule_no && - (start_line || s->rule_nno)) - { - ok = 1; - break; - } - for (t=s->trans, i=s->tran_no; --i >= 0; t++) - if ((unsigned) *p >= t->ch[0] && (unsigned) *p <= t->ch[1]) - break; - p++; - } - while (i >= 0); - } + int i; + ok = dfa_match_first(xpe->dfa->states, pexpr); if (ok) - yaz_log(YLOG_DEBUG, " xpath match %s",xpe->xpath_expr); + yaz_log(YLOG_LOG, " xpath got match %s",xpe->xpath_expr); else - yaz_log(YLOG_DEBUG, " xpath no match %s",xpe->xpath_expr); + yaz_log(YLOG_LOG, " xpath no match %s",xpe->xpath_expr); - pexpr--; if (ok) { #ifdef ENHANCED_XELM /* we have to check the perdicates up to the root node */ @@ -499,6 +502,32 @@ static void index_xpath_attr (char *tag_path, char *name, char *value, } +static void mk_tag_path_full(char *tag_path_full, size_t max, data1_node *n) +{ + size_t flen = 0; + data1_node *nn; + + /* we have to fetch the whole path to the data tag */ + for (nn = n; nn; nn = nn->parent) + { + if (nn->which == DATA1N_tag) + { + size_t tlen = strlen(nn->u.tag.tag); + if (tlen + flen > (max - 2)) + break; + memcpy (tag_path_full + flen, nn->u.tag.tag, tlen); + flen += tlen; + tag_path_full[flen++] = '/'; + } + else + if (nn->which == DATA1N_root) + break; + } + tag_path_full[flen] = 0; + yaz_log(YLOG_LOG, "mk_tag_path_full=%s", tag_path_full); +} + + static void index_xpath(struct source_parser *sp, data1_node *n, struct recExtractCtrl *p, int level, RecWord *wrd, @@ -512,8 +541,6 @@ static void index_xpath(struct source_parser *sp, data1_node *n, { int i; char tag_path_full[1024]; - size_t flen = 0; - data1_node *nn; int termlist_only = 1; data1_termlist *tl; int xpdone = 0; @@ -541,26 +568,8 @@ static void index_xpath(struct source_parser *sp, data1_node *n, wrd->term_buf = n->u.data.data; wrd->term_len = n->u.data.len; xpdone = 0; - flen = 0; - - /* we have to fetch the whole path to the data tag */ - for (nn = n; nn; nn = nn->parent) - { - if (nn->which == DATA1N_tag) - { - size_t tlen = strlen(nn->u.tag.tag); - if (tlen + flen > (sizeof(tag_path_full)-2)) - break; - memcpy (tag_path_full + flen, nn->u.tag.tag, tlen); - flen += tlen; - tag_path_full[flen++] = '/'; - } - else - if (nn->which == DATA1N_root) - break; - } - - tag_path_full[flen] = 0; + + mk_tag_path_full(tag_path_full, sizeof(tag_path_full), n); /* If we have a matching termlist... */ if (n->root->u.root.absyn && @@ -651,26 +660,11 @@ static void index_xpath(struct source_parser *sp, data1_node *n, } break; case DATA1N_tag: - flen = 0; - for (nn = n; nn; nn = nn->parent) - { - if (nn->which == DATA1N_tag) - { - size_t tlen = strlen(nn->u.tag.tag); - if (tlen + flen > (sizeof(tag_path_full)-2)) - break; - memcpy (tag_path_full + flen, nn->u.tag.tag, tlen); - flen += tlen; - tag_path_full[flen++] = '/'; - } - else if (nn->which == DATA1N_root) - break; - } - + mk_tag_path_full(tag_path_full, sizeof(tag_path_full), n); wrd->index_type = '0'; wrd->term_buf = tag_path_full; - wrd->term_len = flen; + wrd->term_len = strlen(tag_path_full); #if NATTR wrd->index_name = xpath_index; #else @@ -692,8 +686,6 @@ static void index_xpath(struct source_parser *sp, data1_node *n, data1_termlist *tl; int do_xpindex; - tag_path_full[flen] = 0; - /* Add tag start/end xpath index, only when there is a ! in the apropriate xelm directive, or default xpath indexing is enabled @@ -735,11 +727,10 @@ static void index_xpath(struct source_parser *sp, data1_node *n, int do_xpindex = 1 - termlist_only; data1_termlist *tl; char attr_tag_path_full[1024]; - int int_len = flen; /* this could be cached as well */ - sprintf (attr_tag_path_full, "@%s/%.*s", - xp->name, int_len, tag_path_full); + sprintf (attr_tag_path_full, "@%s/%s", + xp->name, tag_path_full); tll[i] = xpath_termlist_by_tagpath(attr_tag_path_full,n); @@ -802,11 +793,10 @@ static void index_xpath(struct source_parser *sp, data1_node *n, for (xp = n->u.tag.attributes; xp; xp = xp->next) { data1_termlist *tl; char attr_tag_path_full[1024]; - int int_len = flen; int xpdone = 0; - sprintf (attr_tag_path_full, "@%s/%.*s", - xp->name, int_len, tag_path_full); + sprintf (attr_tag_path_full, "@%s/%s", + xp->name, tag_path_full); if ((tl = tll[i])) { diff --git a/test/xpath/assembled.abs b/test/xpath/assembled.abs index 7daa026..2c06889 100644 --- a/test/xpath/assembled.abs +++ b/test/xpath/assembled.abs @@ -4,3 +4,4 @@ xpath enable xelm /assembled/basic/names/CASno !:w,!:p,!:s,NAL-call-number:p,Title-key:s xelm /*/orgs body-of-text:w +xelm //something Title:w diff --git a/test/xpath/xpath6.c b/test/xpath/xpath6.c index c0ef3c3..9828f8d 100644 --- a/test/xpath/xpath6.c +++ b/test/xpath/xpath6.c @@ -1,4 +1,4 @@ -/* $Id: xpath6.c,v 1.3 2005-05-04 12:53:05 adam Exp $ +/* $Id: xpath6.c,v 1.4 2005-06-29 16:52:27 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -46,6 +46,7 @@ int main(int argc, char **argv) zebra_end_trans(zh); zebra_commit(zh); + do_query(__LINE__, zh, "@attr 5=1 @attr 6=3 @attr 4=1 @attr 1=/assembled/basic/names/CASno \"367-93-1\"", 2); do_query(__LINE__, zh, "@attr 5=1 @attr 6=3 @attr 4=1 @attr 1=18 \"367-93-1\"", 2); @@ -60,10 +61,8 @@ int main(int argc, char **argv) "@and @attr 1=/assembled/orgs/org 1 @attr 5=1 @attr 6=3 @attr 4=1 " "@attr 1=/assembled/basic/names/CASno 367-93-1", 2); -#if 0 /* bug #317 */ - do_query(__LINE__, zh, "@attr 1=1010 46", 1); -#endif + do_query(__LINE__, zh, "@attr 1=1010 46", 2); return close_down(zh, zs, 0); } -- 1.7.10.4