X-Git-Url: http://git.indexdata.com/?p=pazpar2-moved-to-github.git;a=blobdiff_plain;f=src%2Ftermlists.c;h=79e88ee489fd146f19d6055fac3bf62ce83bee62;hp=cd8fe72df90e59b1624405ec04535f263e2d9d5d;hb=HEAD;hpb=cc29eab9f928f6cd0f4231cb2e554e2ac7b0b1f3 diff --git a/src/termlists.c b/src/termlists.c index cd8fe72..79e88ee 100644 --- a/src/termlists.c +++ b/src/termlists.c @@ -1,7 +1,5 @@ -/* $Id: termlists.c,v 1.7 2007-04-10 08:48:56 adam Exp $ - Copyright (c) 2006-2007, Index Data. - -This file is part of Pazpar2. +/* This file is part of Pazpar2. + Copyright (C) Index Data Pazpar2 is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -14,21 +12,22 @@ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Pazpar2; see the file LICENSE. If not, write to the -Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA -02111-1307, USA. - */ +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -#include -#include -#include -#include +*/ #if HAVE_CONFIG_H -#include +#include #endif +#include +#include +#include +#include + #include "termlists.h" +#include "jenkins_hash.h" // Discussion: // As terms are found in incoming records, they are added to (or updated in) a @@ -44,120 +43,41 @@ struct termlist_bucket struct termlist { struct termlist_bucket **hashtable; - int hashtable_size; - int hashmask; - - struct termlist_score **highscore; - int highscore_size; - int highscore_num; - int highscore_min; + unsigned hash_size; + int no_entries; NMEM nmem; }; - -// Jenkins one-at-a-time hash (from wikipedia) -static unsigned int hash(const unsigned char *key) -{ - unsigned int hash = 0; - - while (*key) - { - hash += *(key++); - hash += (hash << 10); - hash ^= (hash >> 6); - } - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); - return hash; -} - -struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size) +struct termlist *termlist_create(NMEM nmem) { - int hashsize = 1; - int halfnumterms; - struct termlist *res; - - // Calculate a hash size smallest power of 2 larger than 50% of expected numterms - halfnumterms = numterms >> 1; - if (halfnumterms < 0) - halfnumterms = 1; - while (hashsize < halfnumterms) - hashsize <<= 1; - res = nmem_malloc(nmem, sizeof(struct termlist)); - res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct termlist_bucket*)); - memset(res->hashtable, 0, hashsize * sizeof(struct termlist_bucket*)); - res->hashtable_size = hashsize; + struct termlist *res = nmem_malloc(nmem, sizeof(struct termlist)); + res->hash_size = 399; + res->hashtable = + nmem_malloc(nmem, res->hash_size * sizeof(struct termlist_bucket*)); + memset(res->hashtable, 0, res->hash_size * sizeof(struct termlist_bucket*)); res->nmem = nmem; - res->hashmask = hashsize - 1; // Creates a bitmask - - res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *)); - res->highscore_size = highscore_size; - res->highscore_num = 0; - res->highscore_min = 0; - + res->no_entries = 0; return res; } -static void update_highscore(struct termlist *tl, struct termlist_score *t) -{ - int i; - int smallest; - int me = -1; - - if (tl->highscore_num > tl->highscore_size && t->frequency < tl->highscore_min) - return; - - smallest = 0; - for (i = 0; i < tl->highscore_num; i++) - { - if (tl->highscore[i]->frequency < tl->highscore[smallest]->frequency) - smallest = i; - if (tl->highscore[i] == t) - me = i; - } - if (tl->highscore_num) - tl->highscore_min = tl->highscore[smallest]->frequency; - if (t->frequency < tl->highscore_min) - tl->highscore_min = t->frequency; - if (me >= 0) - return; - if (tl->highscore_num < tl->highscore_size) - { - tl->highscore[tl->highscore_num++] = t; - if (t->frequency < tl->highscore_min) - tl->highscore_min = t->frequency; - } - else - { - if (t->frequency > tl->highscore[smallest]->frequency) - { - tl->highscore[smallest] = t; - } - } -} - -void termlist_insert(struct termlist *tl, const char *term) +void termlist_insert(struct termlist *tl, const char *display_term, + const char *norm_term, const char *id, size_t id_len, + int freq) { unsigned int bucket; struct termlist_bucket **p; - char buf[256], *cp; + char buf[256]; - if (strlen(term) > 255) + if (strlen(norm_term) > 255) return; - strcpy(buf, term); - for (cp = buf + strlen(buf) - 1; cp > buf && - (*cp == ',' || *cp == '.' || *cp == ' ' || *cp == '-'); cp--) - *cp = '\0'; - - bucket = hash((unsigned char *)buf) & tl->hashmask; + strcpy(buf, norm_term); + bucket = jenkins_hash((unsigned char *)buf) % tl->hash_size; for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next) { - if (!strcmp(buf, (*p)->term.term)) + if (!strcmp(buf, (*p)->term.norm_term)) { - (*p)->term.frequency++; - update_highscore(tl, &((*p)->term)); + (*p)->term.frequency += freq; break; } } @@ -165,31 +85,54 @@ void termlist_insert(struct termlist *tl, const char *term) { struct termlist_bucket *new = nmem_malloc(tl->nmem, sizeof(struct termlist_bucket)); - new->term.term = nmem_strdup(tl->nmem, buf); - new->term.frequency = 1; + new->term.norm_term = nmem_strdup(tl->nmem, buf); + new->term.display_term = *display_term ? + nmem_strdup(tl->nmem, display_term) : new->term.norm_term; + new->term.id = id ? nmem_strdupn(tl->nmem, id, id_len) : 0; + new->term.frequency = freq; new->next = 0; *p = new; - update_highscore(tl, &new->term); + tl->no_entries++; } } static int compare(const void *s1, const void *s2) { - struct termlist_score **p1 = (struct termlist_score**) s1, **p2 = (struct termlist_score **) s2; - return (*p2)->frequency - (*p1)->frequency; + struct termlist_score **p1 = (struct termlist_score **) s1; + struct termlist_score **p2 = (struct termlist_score **) s2; + int d = (*p2)->frequency - (*p1)->frequency; + if (d) + return d; + return strcmp((*p1)->display_term, (*p2)->display_term); } -struct termlist_score **termlist_highscore(struct termlist *tl, int *len) +struct termlist_score **termlist_highscore(struct termlist *tl, int *len, + NMEM nmem) { - qsort(tl->highscore, tl->highscore_num, sizeof(struct termlist_score*), compare); - *len = tl->highscore_num; - return tl->highscore; + struct termlist_score **highscore = + (struct termlist_score **) + nmem_malloc(nmem, tl->no_entries * sizeof(*highscore)); + + int no = 0; + unsigned bucket; + for (bucket = 0; bucket < tl->hash_size; bucket++) + { + struct termlist_bucket *p; + for (p = tl->hashtable[bucket]; p; p = p->next) + highscore[no++] = &p->term; + } + assert(no == tl->no_entries); + qsort(highscore, tl->no_entries, sizeof(struct termlist_score*), compare); + *len = tl->no_entries; + return highscore; } /* * Local variables: * c-basic-offset: 4 + * c-file-style: "Stroustrup" * indent-tabs-mode: nil * End: * vim: shiftwidth=4 tabstop=8 expandtab */ +