From: Adam Dickmeiss Date: Thu, 30 Sep 2004 18:30:35 +0000 (+0000) Subject: Added a version of data1_getelementbyname that uses a hash instead X-Git-Tag: snippet.version.1~310 X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=commitdiff_plain;h=ac2a5928508100836a6f98bef07f82914f4d7093 Added a version of data1_getelementbyname that uses a hash instead of linear search. It seems to speed up extract phase considerably. --- diff --git a/data1/d1_absyn.c b/data1/d1_absyn.c index 83d44ae..9c784cb 100644 --- a/data1/d1_absyn.c +++ b/data1/d1_absyn.c @@ -1,4 +1,4 @@ -/* $Id: d1_absyn.c,v 1.12 2004-09-28 10:15:02 adam Exp $ +/* $Id: d1_absyn.c,v 1.13 2004-09-30 18:30:35 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -33,8 +33,95 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #define D1_MAX_NESTING 128 -#if PRIVATE_DATA1_ABSYN -#endif +struct data1_hash_table { + NMEM nmem; + int size; + struct data1_hash_entry **ar; +}; + +struct data1_hash_entry { + void *clientData; + char *str; + struct data1_hash_entry *next; +}; + +unsigned data1_hash_calc(struct data1_hash_table *ht, const char *str) +{ + unsigned v = 0; + assert(str); + while (*str) + { + if (*str >= 'a' && *str <= 'z') + v = v*65509 + *str -'a'+10; + else if (*str >= 'A' && *str <= 'Z') + v = v*65509 + *str -'A'+10; + else if (*str >= '0' && *str <= '9') + v = v*65509 + *str -'0'; + str++; + } + return v % ht->size; +} + +struct data1_hash_table *data1_hash_open(int size, NMEM nmem) +{ + int i; + struct data1_hash_table *ht = nmem_malloc(nmem, sizeof(*ht)); + ht->nmem = nmem; + ht->size = size; + if (ht->size <= 0) + ht->size = 29; + ht->ar = nmem_malloc(nmem, sizeof(*ht->ar) * ht->size); + for (i = 0; isize; i++) + ht->ar[i] = 0; + return ht; +} + +void data1_hash_insert(struct data1_hash_table *ht, const char *str, + void *clientData, int copy) +{ + char *dstr = copy ? nmem_strdup(ht->nmem, str) : str; + if (strchr(str, '?') || strchr(str, '.')) + { + int i; + for (i = 0; isize; i++) + { + struct data1_hash_entry **he = &ht->ar[i]; + for (; *he && strcmp(str, (*he)->str); he = &(*he)->next) + ; + if (!*he) + { + *he = nmem_malloc(ht->nmem, sizeof(**he)); + (*he)->str = dstr; + (*he)->next = 0; + } + (*he)->clientData = clientData; + } + } + else + { + struct data1_hash_entry **he = &ht->ar[data1_hash_calc(ht, str)]; + for (; *he && strcmp(str, (*he)->str); he = &(*he)->next) + ; + if (!*he) + { + *he = nmem_malloc(ht->nmem, sizeof(**he)); + (*he)->str = dstr; + (*he)->next = 0; + } + (*he)->clientData = clientData; + } +} + +void *data1_hash_lookup(struct data1_hash_table *ht, const char *str) +{ + struct data1_hash_entry **he = &ht->ar[data1_hash_calc(ht, str)]; + + for (; *he && yaz_matchstr(str, (*he)->str); he = &(*he)->next) + ; + if (*he) + return (*he)->clientData; + return 0; +} struct data1_systag { char *name; @@ -56,6 +143,18 @@ struct data1_attset_cache_info data1_attset_cache next; }; +data1_element *data1_mk_element(data1_handle dh) +{ + data1_element *e = nmem_malloc(data1_nmem_get(dh), sizeof(*e)); + e->name = 0; + e->tag = 0; + e->termlists = 0; + e->next = e->children = 0; + e->sub_name = 0; + e->hash = 0; + return e; +} + data1_absyn *data1_absyn_search (data1_handle dh, const char *name) { data1_absyn_cache p = *data1_absyn_cache_get (dh); @@ -209,6 +308,11 @@ data1_esetname *data1_getesetbyname(data1_handle dh, data1_absyn *a, return 0; } +/* we have multiple versions of data1_getelementbyname */ +#define DATA1_GETELEMENTBYTAGNAME_VERSION 1 + +#if DATA1_GETELEMENTBYTAGNAME_VERSION==0 +/* straight linear search */ data1_element *data1_getelementbytagname (data1_handle dh, data1_absyn *abs, data1_element *parent, const char *tagname) @@ -234,6 +338,44 @@ data1_element *data1_getelementbytagname (data1_handle dh, data1_absyn *abs, } return 0; } +#endif + +#if DATA1_GETELEMENTBYTAGNAME_VERSION==1 +/* using hash search */ +data1_element *data1_getelementbytagname (data1_handle dh, data1_absyn *abs, + data1_element *parent, + const char *tagname) +{ + data1_element *r; + struct data1_hash_table *ht; + + /* It's now possible to have a data1 tree with no abstract syntax */ + if ( !abs ) + return 0; + + if (!parent) + r = abs->main_elements; + else + r = parent->children; + + if (!r) + return 0; + + ht = r->hash; + if (!ht) + { + ht = r->hash = data1_hash_open(29, data1_nmem_get(dh)); + for (; r; r = r->next) + { + data1_name *n; + + for (n = r->tag->names; n; n = n->next) + data1_hash_insert(ht, n->name, r, 0); + } + } + return data1_hash_lookup(ht, tagname); +} +#endif data1_element *data1_getelementbyname (data1_handle dh, data1_absyn *absyn, const char *name) @@ -617,12 +759,7 @@ data1_absyn *data1_read_absyn (data1_handle dh, const char *file, return 0; } level = i; - new_element = *ppl[level-1] = (data1_element *) - nmem_malloc(data1_nmem_get(dh), sizeof(*new_element)); - new_element->next = new_element->children = 0; - new_element->tag = 0; - new_element->termlists = 0; - new_element->sub_name = 0; + new_element = *ppl[level-1] = data1_mk_element(dh); tp = &new_element->termlists; ppl[level-1] = &new_element->next; @@ -1034,6 +1171,6 @@ data1_absyn *data1_read_absyn (data1_handle dh, const char *file, fix_element_ref (dh, res, cur_elements->elements); } *systagsp = 0; - yaz_log (LOG_DEBUG, "%s: data1_read_absyn end", file); + yaz_log(LOG_DEBUG, "%s: data1_read_absyn end", file); return res; }