src/termlists.c

   1 /* This file is part of Pazpar2.
   2    Copyright (C) 2006-2009 Index Data
   3
   4 Pazpar2 is free software; you can redistribute it and/or modify it under
   5 the terms of the GNU General Public License as published by the Free
   6 Software Foundation; either version 2, or (at your option) any later
   7 version.
   8
   9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 for more details.
  13
  14 You should have received a copy of the GNU General Public License
  15 along with this program; if not, write to the Free Software
  16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 */
  19
  20 #if HAVE_CONFIG_H
  21 #include <config.h>
  22 #endif
  23
  24 #include <stdlib.h>
  25 #include <string.h>
  26 #include <yaz/yaz-util.h>
  27
  28 #include "termlists.h"
  29 #include "jenkins_hash.h"
  30
// Discussion:
// As terms are found in incoming records, they are either added to a hash
// table or, if already present, have their frequency count incremented. At
// the same time, a highscore list of the most frequent terms is maintained.
  35
  36 struct termlist_bucket
  37 {
  38     struct termlist_score term;
  39     struct termlist_bucket *next;
  40 };
  41
  42 struct termlist
  43 {
  44     struct termlist_bucket **hashtable;
  45     int hashtable_size;
  46     int hashmask;
  47
  48     struct termlist_score **highscore;
  49     int highscore_size;
  50     int highscore_num;
  51     int highscore_min;
  52
  53     NMEM nmem;
  54 };
  55
  56 struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size)
  57 {
  58     int hashsize = 1;
  59     int halfnumterms;
  60     struct termlist *res;
  61
  62     // Calculate a hash size smallest power of 2 larger than 50% of expected numterms
  63     halfnumterms = numterms >> 1;
  64     if (halfnumterms < 0)
  65         halfnumterms = 1;
  66     while (hashsize < halfnumterms)
  67         hashsize <<= 1;
  68     res = nmem_malloc(nmem, sizeof(struct termlist));
  69     res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct termlist_bucket*));
  70     memset(res->hashtable, 0, hashsize * sizeof(struct termlist_bucket*));
  71     res->hashtable_size = hashsize;
  72     res->nmem = nmem;
  73     res->hashmask = hashsize - 1; // Creates a bitmask
  74
  75     res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *));
  76     res->highscore_size = highscore_size;
  77     res->highscore_num = 0;
  78     res->highscore_min = 0;
  79
  80     return res;
  81 }
  82
  83 static void update_highscore(struct termlist *tl, struct termlist_score *t)
  84 {
  85     int i;
  86     int smallest;
  87     int me = -1;
  88
  89     if (tl->highscore_num > tl->highscore_size && t->frequency < tl->highscore_min)
  90         return;
  91
  92     smallest = 0;
  93     for (i = 0; i < tl->highscore_num; i++)
  94     {
  95         if (tl->highscore[i]->frequency < tl->highscore[smallest]->frequency)
  96             smallest = i;
  97         if (tl->highscore[i] == t)
  98             me = i;
  99     }
 100     if (tl->highscore_num)
 101         tl->highscore_min = tl->highscore[smallest]->frequency;
 102     if (t->frequency < tl->highscore_min)
 103         tl->highscore_min = t->frequency;
 104     if (me >= 0)
 105         return;
 106     if (tl->highscore_num < tl->highscore_size)
 107     {
 108         tl->highscore[tl->highscore_num++] = t;
 109         if (t->frequency < tl->highscore_min)
 110             tl->highscore_min = t->frequency;
 111     }
 112     else
 113     {
 114         if (t->frequency > tl->highscore[smallest]->frequency)
 115         {
 116             tl->highscore[smallest] = t;
 117         }
 118     }
 119 }
 120
 121 void termlist_insert(struct termlist *tl, const char *term)
 122 {
 123     unsigned int bucket;
 124     struct termlist_bucket **p;
 125     char buf[256], *cp;
 126
 127     if (strlen(term) > 255)
 128         return;
 129     strcpy(buf, term);
 130     /* chop right */
 131     for (cp = buf + strlen(buf); cp != buf && strchr(",. -", cp[-1]); cp--)
 132         cp[-1] = '\0';
 133
 134     bucket = jenkins_hash((unsigned char *)buf) & tl->hashmask;
 135     for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
 136     {
 137         if (!strcmp(buf, (*p)->term.term))
 138         {
 139             (*p)->term.frequency++;
 140             update_highscore(tl, &((*p)->term));
 141             break;
 142         }
 143     }
 144     if (!*p) // We made it to the end of the bucket without finding match
 145     {
 146         struct termlist_bucket *new = nmem_malloc(tl->nmem,
 147                 sizeof(struct termlist_bucket));
 148         new->term.term = nmem_strdup(tl->nmem, buf);
 149         new->term.frequency = 1;
 150         new->next = 0;
 151         *p = new;
 152         update_highscore(tl, &new->term);
 153     }
 154 }
 155
 156 static int compare(const void *s1, const void *s2)
 157 {
 158     struct termlist_score **p1 = (struct termlist_score**) s1, **p2 = (struct termlist_score **) s2;
 159     return (*p2)->frequency - (*p1)->frequency;
 160 }
 161
 162 struct termlist_score **termlist_highscore(struct termlist *tl, int *len)
 163 {
 164     qsort(tl->highscore, tl->highscore_num, sizeof(struct termlist_score*), compare);
 165     *len = tl->highscore_num;
 166     return tl->highscore;
 167 }
 168
 169 /*
 170  * Local variables:
 171  * c-basic-offset: 4
 172  * c-file-style: "Stroustrup"
 173  * indent-tabs-mode: nil
 174  * End:
 175  * vim: shiftwidth=4 tabstop=8 expandtab
 176  */
 177