-/* $Id: termlists.c,v 1.7 2007-04-10 08:48:56 adam Exp $
- Copyright (c) 2006-2007, Index Data.
-
-This file is part of Pazpar2.
+/* This file is part of Pazpar2.
+ Copyright (C) 2006-2010 Index Data
Pazpar2 is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
for more details.
You should have received a copy of the GNU General Public License
-along with Pazpar2; see the file LICENSE. If not, write to the
-Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
-02111-1307, USA.
- */
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <yaz/yaz-util.h>
+*/
#if HAVE_CONFIG_H
-#include <cconfig.h>
+#include <config.h>
#endif
+#include <stdlib.h>
+#include <string.h>
+#include <yaz/yaz-util.h>
+
#include "termlists.h"
+#include "jenkins_hash.h"
// Discussion:
// As terms are found in incoming records, they are added to (or updated in) a
struct termlist /* term container: a chained hash table plus a highscore array; the struct and both arrays are nmem-allocated */
{
struct termlist_bucket **hashtable; /* hash_size slots, zero-filled at creation; each slot heads a chain linked through ->next */
- int hashtable_size;
- int hashmask;
+ unsigned hash_size; /* number of slots in hashtable; a term's slot is jenkins_hash(term) % hash_size */
struct termlist_score **highscore; /* array with room for highscore_size termlist_score pointers */
int highscore_size; /* capacity of highscore, as passed to termlist_create */
NMEM nmem; /* NMEM handle passed to termlist_create; used there for the allocations above */
};
-
-// Jenkins one-at-a-time hash (from wikipedia)
-static unsigned int hash(const unsigned char *key)
+struct termlist *termlist_create(NMEM nmem, int highscore_size)
{
- unsigned int hash = 0;
-
- while (*key)
- {
- hash += *(key++);
- hash += (hash << 10);
- hash ^= (hash >> 6);
- }
- hash += (hash << 3);
- hash ^= (hash >> 11);
- hash += (hash << 15);
- return hash;
-}
-
-struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size)
-{
- int hashsize = 1;
- int halfnumterms;
- struct termlist *res;
-
- // Calculate a hash size smallest power of 2 larger than 50% of expected numterms
- halfnumterms = numterms >> 1;
- if (halfnumterms < 0)
- halfnumterms = 1;
- while (hashsize < halfnumterms)
- hashsize <<= 1;
- res = nmem_malloc(nmem, sizeof(struct termlist));
- res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct termlist_bucket*));
- memset(res->hashtable, 0, hashsize * sizeof(struct termlist_bucket*));
- res->hashtable_size = hashsize;
+ struct termlist *res = nmem_malloc(nmem, sizeof(struct termlist));
+ res->hash_size = 399;
+ res->hashtable =
+ nmem_malloc(nmem, res->hash_size * sizeof(struct termlist_bucket*));
+ memset(res->hashtable, 0, res->hash_size * sizeof(struct termlist_bucket*));
res->nmem = nmem;
- res->hashmask = hashsize - 1; // Creates a bitmask
res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *));
res->highscore_size = highscore_size;
if (strlen(term) > 255)
return;
strcpy(buf, term);
- for (cp = buf + strlen(buf) - 1; cp > buf &&
- (*cp == ',' || *cp == '.' || *cp == ' ' || *cp == '-'); cp--)
- *cp = '\0';
-
- bucket = hash((unsigned char *)buf) & tl->hashmask;
+ /* chop right */
+ for (cp = buf + strlen(buf); cp != buf && strchr(",. -", cp[-1]); cp--)
+ cp[-1] = '\0';
+
+ bucket = jenkins_hash((unsigned char *)buf) % tl->hash_size;
for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
{
if (!strcmp(buf, (*p)->term.term))
/*
* Local variables:
* c-basic-offset: 4
+ * c-file-style: "Stroustrup"
* indent-tabs-mode: nil
* End:
* vim: shiftwidth=4 tabstop=8 expandtab
*/
+