From: Adam Dickmeiss Date: Thu, 28 Sep 2006 18:38:44 +0000 (+0000) Subject: Fixed bug #685: Optimize xelm/melm matching. Indexing the Koha collection X-Git-Tag: ZEBRA.2.0.6~108 X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=commitdiff_plain;h=396e9aaedfbed7534e329b42475cd7abe2fd3814 Fixed bug #685: Optimize xelm/melm matching. Indexing the Koha collection is reduced from 6073 sec to 2578 sec (on Zebra 1.3.38 though). --- diff --git a/data1/d1_absyn.c b/data1/d1_absyn.c index 33d0dc8..5adb241 100644 --- a/data1/d1_absyn.c +++ b/data1/d1_absyn.c @@ -1,4 +1,4 @@ -/* $Id: d1_absyn.c,v 1.28 2006-08-14 10:40:06 adam Exp $ +/* $Id: d1_absyn.c,v 1.29 2006-09-28 18:38:44 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -185,9 +185,10 @@ void data1_absyn_destroy (data1_handle dh) data1_xpelement *xpe = abs->xp_elements; while (xpe) { yaz_log (YLOG_DEBUG,"Destroy xp element %s",xpe->xpath_expr); - if (xpe->dfa) { dfa_delete (&xpe->dfa); } + if (xpe->dfa) + dfa_delete (&xpe->dfa); xpe = xpe->next; - } + } } p = p->next; } @@ -307,37 +308,6 @@ data1_esetname *data1_getesetbyname(data1_handle dh, data1_absyn *a, /* we have multiple versions of data1_getelementbyname */ #define DATA1_GETELEMENTBYTAGNAME_VERSION 1 -#if DATA1_GETELEMENTBYTAGNAME_VERSION==0 -/* straight linear search */ -data1_element *data1_getelementbytagname (data1_handle dh, data1_absyn *abs, - data1_element *parent, - const char *tagname) -{ - data1_element *r; - - /* It's now possible to have a data1 tree with no abstract syntax */ - if ( !abs ) - return 0; - - if (!parent) - r = abs->main_elements; - else - r = parent->children; - - for (; r; r = r->next) - { - data1_name *n; - - for (n = r->tag->names; n; n = n->next) - if (!data1_matchstr(tagname, n->name)) - return r; - } - return 0; -} -#endif - -#if DATA1_GETELEMENTBYTAGNAME_VERSION==1 -/* using hash search */ data1_element *data1_getelementbytagname (data1_handle dh, data1_absyn *abs, data1_element *parent, const char *tagname) @@ -354,12 +324,15 @@ data1_element *data1_getelementbytagname (data1_handle dh, data1_absyn *abs, else r = parent->children; +#if DATA1_GETELEMENTBYTAGNAME_VERSION==1 + /* using hash search */ if (!r) return 0; ht = r->hash; if (!ht) { + /* build hash table (the first time) */ ht = r->hash = data1_hash_open(29, data1_nmem_get(dh)); for (; r; r = r->next) { @@ -370,8 +343,19 @@ data1_element *data1_getelementbytagname (data1_handle dh, data1_absyn *abs, } } return data1_hash_lookup(ht, tagname); -} +#else + /* using linear search */ + for (; r; r = r->next) + { + data1_name *n; + + for (n = r->tag->names; n; n = n->next) + if (!data1_matchstr(tagname, n->name)) + return r; + } + return 0; #endif +} data1_element *data1_getelementbyname (data1_handle dh, data1_absyn *absyn, const char *name) @@ -869,9 +853,10 @@ static data1_absyn *data1_read_absyn(data1_handle dh, const char *file, int i; char *p, *xpath_expr, *termlists; const char *regexp; - struct DFA *dfa = dfa = dfa_init(); + struct DFA *dfa = 0; data1_termlist **tp; char melm_xpath[128]; + data1_xpelement *xp_old = 0; if (argc < 3) { @@ -888,13 +873,24 @@ static data1_absyn *data1_read_absyn(data1_handle dh, const char *file, } termlists = argv[2]; regexp = mk_xpath_regexp(dh, xpath_expr); - i = dfa_parse (dfa, ®exp); - if (i || *regexp) { - yaz_log(YLOG_WARN, "%s:%d: Bad xpath to xelm", file, lineno); - dfa_delete (&dfa); - continue; - } - + +#if OPTIMIZE_MELM + for (xp_old = res->xp_elements; xp_old; xp_old = xp_old->next) + if (!strcmp(xp_old->regexp, regexp)) + break; +#endif + if (!xp_old) + { + const char *regexp_ptr = regexp; + + dfa = dfa_init(); + i = dfa_parse (dfa, ®exp_ptr); + if (i || *regexp_ptr) { + yaz_log(YLOG_WARN, "%s:%d: Bad xpath to xelm", file, lineno); + dfa_delete (&dfa); + continue; + } + } if (!cur_xpelement) { cur_xpelement = (data1_xpelement *) @@ -905,12 +901,16 @@ static data1_absyn *data1_read_absyn(data1_handle dh, const char *file, nmem_malloc(data1_nmem_get(dh), sizeof(*cur_xpelement)); cur_xpelement = cur_xpelement->next; } +#if OPTIMIZE_MELM + cur_xpelement->regexp = regexp; +#endif cur_xpelement->next = NULL; cur_xpelement->xpath_expr = nmem_strdup(data1_nmem_get (dh), xpath_expr); - dfa_mkstate (dfa); - cur_xpelement->dfa = dfa; + if (dfa) + dfa_mkstate (dfa); + cur_xpelement->dfa = dfa; #ifdef ENHANCED_XELM cur_xpelement->xpath_len = diff --git a/dfa/dfa.c b/dfa/dfa.c index f01453e..60c5e99 100644 --- a/dfa/dfa.c +++ b/dfa/dfa.c @@ -1,4 +1,4 @@ -/* $Id: dfa.c,v 1.37 2006-08-14 10:40:08 adam Exp $ +/* $Id: dfa.c,v 1.38 2006-09-28 18:38:45 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -1097,6 +1097,11 @@ void dfa_set_cmap (struct DFA *dfa, void *vp, dfa->parse_info->cmap_data = vp; } +int dfa_get_last_rule (struct DFA *dfa) +{ + return dfa->parse_info->rule; +} + int dfa_parse (struct DFA *dfa, const char **pattern) { struct Tnode *top; diff --git a/include/d1_absyn.h b/include/d1_absyn.h index e147400..20d5669 100644 --- a/include/d1_absyn.h +++ b/include/d1_absyn.h @@ -1,4 +1,4 @@ -/* $Id: d1_absyn.h,v 1.6 2006-08-14 10:40:12 adam Exp $ +/* $Id: d1_absyn.h,v 1.7 2006-09-28 18:38:46 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -24,6 +24,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #define D1_ABSYN_H 1 #define ENHANCED_XELM 1 +#define OPTIMIZE_MELM 1 #include #include @@ -39,6 +40,10 @@ typedef struct data1_xpelement struct DFA *dfa; data1_termlist *termlists; struct data1_xpelement *next; +#if OPTIMIZE_MELM + const char *regexp; +#endif + int match_state; } data1_xpelement; struct data1_absyn diff --git a/include/dfa.h b/include/dfa.h index 3de3594..a92d421ce 100644 --- a/include/dfa.h +++ b/include/dfa.h @@ -1,4 +1,4 @@ -/* $Id: dfa.h,v 1.14 2006-08-14 10:40:12 adam Exp $ +/* $Id: dfa.h,v 1.15 2006-09-28 18:38:46 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -66,6 +66,7 @@ void dfa_set_cmap (struct DFA *dfa, void *vp, int dfa_parse (struct DFA *, const char **); void dfa_mkstate (struct DFA *); void dfa_delete (struct DFA **); +int dfa_get_last_rule (struct DFA *); void dfa_parse_cmap_clean (struct DFA *d); void dfa_parse_cmap_new (struct DFA *d, const int *cmap); diff --git a/index/recgrs.c b/index/recgrs.c index c09f731..7da47f7 100644 --- a/index/recgrs.c +++ b/index/recgrs.c @@ -1,4 +1,4 @@ -/* $Id: recgrs.c,v 1.5 2006-08-22 13:39:27 adam Exp $ +/* $Id: recgrs.c,v 1.6 2006-09-28 18:38:47 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -392,20 +392,40 @@ pop, 2003-01-17 data1_termlist *xpath_termlist_by_tagpath(char *tagpath, data1_node *n) { data1_absyn *abs = n->root->u.root.absyn; - data1_xpelement *xpe = abs->xp_elements; + + data1_xpelement *xpe = 0; data1_node *nn; #ifdef ENHANCED_XELM struct xpath_location_step *xp; #endif char *pexpr = xmalloc(strlen(tagpath)+5); - int ok = 0; sprintf (pexpr, "/%s\n", tagpath); - for (; xpe; xpe = xpe->next) + + for (xpe = abs->xp_elements; xpe; xpe = xpe->next) + xpe->match_state = -1; /* don't know if it matches yet */ + + for (xpe = abs->xp_elements; xpe; xpe = xpe->next) { int i; - ok = dfa_match_first(xpe->dfa->states, pexpr); + int ok = xpe->match_state; + if (ok == -1) + { /* don't know whether there is a match yet */ + data1_xpelement *xpe1; + + assert(xpe->dfa); + ok = dfa_match_first(xpe->dfa->states, pexpr); +#if OPTIMIZE_MELM + /* mark this and following ones with same regexp */ + for (xpe1 = xpe; xpe1; xpe1 = xpe1->next) + { + if (!strcmp(xpe1->regexp, xpe->regexp)) + xpe1->match_state = ok; + } +#endif + } + assert (ok == 0 || ok == 1); if (ok) { #ifdef ENHANCED_XELM /* we have to check the perdicates up to the root node */ @@ -440,7 +460,7 @@ data1_termlist *xpath_termlist_by_tagpath(char *tagpath, data1_node *n) xfree(pexpr); - if (ok) { + if (xpe) { yaz_log(YLOG_DEBUG, "Got it"); return xpe->termlists; } else {