X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=src%2Ftermlists.c;h=b40e5b026cf413e0edab895be20b731e282bebbb;hb=61027e68ea2f9c307a289b9cf8cc1be2cceb13d3;hp=027d057018c52cd962a9b9915d8a0fb625169387;hpb=bdf63c9b34f8a40edec11d50dc71454c0b8daf32;p=pazpar2-moved-to-github.git diff --git a/src/termlists.c b/src/termlists.c index 027d057..b40e5b0 100644 --- a/src/termlists.c +++ b/src/termlists.c @@ -1,16 +1,32 @@ -/* - * $Id: termlists.c,v 1.4 2007-01-10 10:04:23 adam Exp $ - */ +/* This file is part of Pazpar2. + Copyright (C) 2006-2010 Index Data -#include -#include -#include +Pazpar2 is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ #if HAVE_CONFIG_H -#include +#include #endif +#include +#include +#include + #include "termlists.h" +#include "jenkins_hash.h" // Discussion: // As terms are found in incoming records, they are added to (or updated in) a @@ -26,8 +42,7 @@ struct termlist_bucket struct termlist { struct termlist_bucket **hashtable; - int hashtable_size; - int hashmask; + unsigned hash_size; struct termlist_score **highscore; int highscore_size; @@ -37,42 +52,14 @@ struct termlist NMEM nmem; }; - -// Jenkins one-at-a-time hash (from wikipedia) -static unsigned int hash(const unsigned char *key) -{ - unsigned int hash = 0; - - while (*key) - { - hash += *(key++); - hash += (hash << 10); - hash ^= (hash >> 6); - } - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); - return hash; -} - -struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size) +struct termlist *termlist_create(NMEM nmem, int highscore_size) { - int hashsize = 1; - int halfnumterms; - struct termlist *res; - - // Calculate a hash size smallest power of 2 larger than 50% of expected numterms - halfnumterms = numterms >> 1; - if (halfnumterms < 0) - halfnumterms = 1; - while (hashsize < halfnumterms) - hashsize <<= 1; - res = nmem_malloc(nmem, sizeof(struct termlist)); - res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct termlist_bucket*)); - memset(res->hashtable, 0, hashsize * sizeof(struct termlist_bucket*)); - res->hashtable_size = hashsize; + struct termlist *res = nmem_malloc(nmem, sizeof(struct termlist)); + res->hash_size = 399; + res->hashtable = + nmem_malloc(nmem, res->hash_size * sizeof(struct termlist_bucket*)); + memset(res->hashtable, 0, res->hash_size * sizeof(struct termlist_bucket*)); res->nmem = nmem; - res->hashmask = hashsize - 1; // Creates a bitmask res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *)); res->highscore_size = highscore_size; @@ -124,11 +111,19 @@ void termlist_insert(struct termlist *tl, const char *term) { unsigned int bucket; struct termlist_bucket **p; + char buf[256], *cp; - bucket = hash((unsigned char *)term) & tl->hashmask; + if (strlen(term) > 255) + return; + strcpy(buf, term); + /* chop right */ + for (cp = buf + strlen(buf); cp != buf && strchr(",. -", cp[-1]); cp--) + cp[-1] = '\0'; + + bucket = jenkins_hash((unsigned char *)buf) % tl->hash_size; for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next) { - if (!strcmp(term, (*p)->term.term)) + if (!strcmp(buf, (*p)->term.term)) { (*p)->term.frequency++; update_highscore(tl, &((*p)->term)); @@ -139,7 +134,7 @@ void termlist_insert(struct termlist *tl, const char *term) { struct termlist_bucket *new = nmem_malloc(tl->nmem, sizeof(struct termlist_bucket)); - new->term.term = nmem_strdup(tl->nmem, term); + new->term.term = nmem_strdup(tl->nmem, buf); new->term.frequency = 1; new->next = 0; *p = new; @@ -163,7 +158,9 @@ struct termlist_score **termlist_highscore(struct termlist *tl, int *len) /* * Local variables: * c-basic-offset: 4 + * c-file-style: "Stroustrup" * indent-tabs-mode: nil * End: * vim: shiftwidth=4 tabstop=8 expandtab */ +