Added subject facet browsing, beginning of relevance ranking
authorSebastian Hammer <quinn@indexdata.com>
Fri, 24 Nov 2006 20:29:07 +0000 (20:29 +0000)
committerSebastian Hammer <quinn@indexdata.com>
Fri, 24 Nov 2006 20:29:07 +0000 (20:29 +0000)
12 files changed:
Makefile
http.c
http.h
http_command.c
pazpar2.c
pazpar2.h
reclists.c [new file with mode: 0644]
reclists.h [new file with mode: 0644]
relevance.c [new file with mode: 0644]
relevance.h [new file with mode: 0644]
termlists.c [new file with mode: 0644]
termlists.h [new file with mode: 0644]

index 8e2b9a5..935ab70 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # ParaZ. Copyright (C) 2000-2004, Index Data ApS
 # All rights reserved.
-# $Id: Makefile,v 1.2 2006-11-21 18:46:43 quinn Exp $
+# $Id: Makefile,v 1.3 2006-11-24 20:29:07 quinn Exp $
 
 SHELL=/bin/sh
 
@@ -11,7 +11,8 @@ YAZLIBS=`$(YAZCONF) --libs`
 YAZCFLAGS=`$(YAZCONF) --cflags`
 
 PROG=pazpar2
-PROGO=pazpar2.o eventl.o util.o command.o http.o http_command.o
+PROGO=pazpar2.o eventl.o util.o command.o http.o http_command.o termlists.o \
+               reclists.o relevance.o
 
 all: $(PROG)
 
diff --git a/http.c b/http.c
index 9de5c05..f2d6906 100644 (file)
--- a/http.c
+++ b/http.c
@@ -1,5 +1,5 @@
 /*
- * $Id: http.c,v 1.1 2006-11-21 18:46:43 quinn Exp $
+ * $Id: http.c,v 1.2 2006-11-24 20:29:07 quinn Exp $
  */
 
 #include <stdio.h>
@@ -11,6 +11,8 @@
 #include <strings.h>
 #include <ctype.h>
 #include <fcntl.h>
+#include <netdb.h>
+#include <errno.h>
 
 #include <yaz/yaz-util.h>
 #include <yaz/comstack.h>
 #include "http.h"
 #include "http_command.h"
 
+static void proxy_io(IOCHAN i, int event);
+
 extern IOCHAN channel_list;
 
+static struct sockaddr_in *proxy_addr = 0; // If this is set, we proxy normal HTTP requests
+static char proxy_url[256] = "";
+static struct http_buf *http_buf_freelist = 0;
+
+static struct http_buf *http_buf_create()
+{
+    struct http_buf *r;
+
+    if (http_buf_freelist)
+    {
+        r = http_buf_freelist;
+        http_buf_freelist = http_buf_freelist->next;
+    }
+    else
+        r = xmalloc(sizeof(struct http_buf));
+    r->offset = 0;
+    r->len = 0;
+    r->next = 0;
+    return r;
+}
+
+static void http_buf_destroy(struct http_buf *b)
+{
+    b->next = http_buf_freelist;
+    http_buf_freelist = b;
+}
+
+static void http_buf_destroy_queue(struct http_buf *b)
+{
+    struct http_buf *p;
+    while (b)
+    {
+        p = b->next;
+        http_buf_destroy(b);
+        b = p;
+    }
+}
+
+#ifdef GAGA
+// Calculate length of chain
+static int http_buf_len(struct http_buf *b)
+{
+    int sum = 0;
+    for (; b; b = b->next)
+        sum += b->len;
+    return sum;
+}
+#endif
+
+static struct http_buf *http_buf_bybuf(char *b, int len)
+{
+    struct http_buf *res = 0;
+    struct http_buf **p = &res;
+
+    while (len)
+    {
+        *p = http_buf_create();
+        int tocopy = len;
+        if (tocopy > HTTP_BUF_SIZE)
+            tocopy = HTTP_BUF_SIZE;
+        memcpy((*p)->buf, b, tocopy);
+        (*p)->len = tocopy;
+        len -= tocopy;
+        b += tocopy;
+        p = &(*p)->next;
+    }
+    return res;
+}
+
+// Add a (chain of) buffers to the end of an existing queue.
+static void http_buf_enqueue(struct http_buf **queue, struct http_buf *b)
+{
+    while (*queue)
+        queue = &(*queue)->next;
+    *queue = b;
+}
+
+static struct http_buf *http_buf_bywrbuf(WRBUF wrbuf)
+{
+    return http_buf_bybuf(wrbuf_buf(wrbuf), wrbuf_len(wrbuf));
+}
+
+// Non-destructively collapse chain of buffers into a string (max *len)
+// Return
+static int http_buf_peek(struct http_buf *b, char *buf, int len)
+{
+    int rd = 0;
+    while (b && rd < len)
+    {
+        int toread = len - rd;
+        if (toread > b->len)
+            toread = b->len;
+        memcpy(buf + rd, b->buf + b->offset, toread);
+        rd += toread;
+        b = b->next;
+    }
+    buf[rd] = '\0';
+    return rd;
+}
+
+// Ddestructively munch up to len  from head of queue.
+static int http_buf_read(struct http_buf **b, char *buf, int len)
+{
+    int rd = 0;
+    while ((*b) && rd < len)
+    {
+        int toread = len - rd;
+        if (toread > (*b)->len)
+            toread = (*b)->len;
+        memcpy(buf + rd, (*b)->buf + (*b)->offset, toread);
+        rd += toread;
+        if (toread < (*b)->len)
+        {
+            (*b)->len -= toread;
+            (*b)->offset += toread;
+            break;
+        }
+        else
+        {
+            struct http_buf *n = (*b)->next;
+            http_buf_destroy(*b);
+            *b = n;
+        }
+    }
+    buf[rd] = '\0';
+    return rd;
+}
+
 void http_addheader(struct http_response *r, const char *name, const char *value)
 {
     struct http_channel *c = r->channel;
@@ -35,16 +167,18 @@ void http_addheader(struct http_response *r, const char *name, const char *value
     r->headers = h;
 }
 
-char *argbyname(struct http_request *r, char *name)
+char *http_argbyname(struct http_request *r, char *name)
 {
     struct http_argument *p;
+    if (!name)
+        return 0;
     for (p = r->arguments; p; p = p->next)
         if (!strcmp(p->name, name))
             return p->value;
     return 0;
 }
 
-char *headerbyname(struct http_request *r, char *name)
+char *http_headerbyname(struct http_request *r, char *name)
 {
     struct http_header *p;
     for (p = r->headers; p; p = p->next)
@@ -67,10 +201,13 @@ struct http_response *http_create_response(struct http_channel *c)
 // Check if we have a complete request. Return 0 or length (including trailing newline)
 // FIXME: Does not deal gracefully with requests carrying payload
 // but this is kind of OK since we will reject anything other than an empty GET
-static int request_check(const char *buf)
+static int request_check(struct http_buf *queue)
 {
+    char tmp[4096];
     int len = 0;
+    char *buf = tmp;
 
+    http_buf_peek(queue, tmp, 4096);
     while (*buf) // Check if we have a sequence of lines terminated by an empty line
     {
         char *b = strstr(buf, "\r\n");
@@ -86,20 +223,32 @@ static int request_check(const char *buf)
     return 0;
 }
 
-struct http_request *http_parse_request(struct http_channel *c, char *buf)
+struct http_request *http_parse_request(struct http_channel *c, struct http_buf **queue,
+        int len)
 {
     struct http_request *r = nmem_malloc(c->nmem, sizeof(*r));
     char *p, *p2;
+    char tmp[4096];
+    char *buf = tmp;
+
+    if (len > 4096)
+        return 0;
+    if (http_buf_read(queue, buf, len) < len)
+        return 0;
 
     r->channel = c;
+    r->arguments = 0;
+    r->headers = 0;
     // Parse first line
-    if (!strncmp(buf, "GET ", 4))
-        r->method = Method_GET;
-    else
+    for (p = buf, p2 = r->method; *p && *p != ' ' && p - buf < 19; p++)
+        *(p2++) = *p;
+    if (*p != ' ')
     {
         yaz_log(YLOG_WARN, "Unexpected HTTP method in request");
         return 0;
     }
+    *p2 = '\0';
+
     if (!(buf = strchr(buf, ' ')))
     {
         yaz_log(YLOG_WARN, "Syntax error in request (1)");
@@ -151,18 +300,47 @@ struct http_request *http_parse_request(struct http_channel *c, char *buf)
         if (!(p = strstr(buf, "\r\n")))
             return 0;
         *(p++) = '\0';
+        p++;
         strcpy(r->http_version, buf);
         buf = p;
     }
     strcpy(c->version, r->http_version);
 
-    r->headers = 0; // We might want to parse these someday
+    r->headers = 0;
+    while (*buf)
+    {
+        if (!(p = strstr(buf, "\r\n")))
+            return 0;
+        if (p == buf)
+            break;
+        else
+        {
+            struct http_header *h = nmem_malloc(c->nmem, sizeof(*h));
+            if (!(p2 = strchr(buf, ':')))
+                return 0;
+            *(p2++) = '\0';
+            h->name = nmem_strdup(c->nmem, buf);
+            while (isspace(*p2))
+                p2++;
+            if (p2 >= p) // Empty header?
+            {
+                buf = p + 2;
+                continue;
+            }
+            *p = '\0';
+            h->value = nmem_strdup(c->nmem, p2);
+            h->next = r->headers;
+            r->headers = h;
+            buf = p + 2;
+        }
+    }
 
     return r;
 }
 
 
-static char *http_serialize_response(struct http_channel *c, struct http_response *r)
+static struct http_buf *http_serialize_response(struct http_channel *c,
+        struct http_response *r)
 {
     wrbuf_rewind(c->wrbuf);
     struct http_header *h;
@@ -177,16 +355,59 @@ static char *http_serialize_response(struct http_channel *c, struct http_respons
     if (r->payload)
         wrbuf_puts(c->wrbuf, r->payload);
 
-    wrbuf_putc(c->wrbuf, '\0');
-    return wrbuf_buf(c->wrbuf);
+    return http_buf_bywrbuf(c->wrbuf);
+}
+
+// Serialize a HTTP request
+static struct http_buf *http_serialize_request(struct http_request *r)
+{
+    struct http_channel *c = r->channel;
+    wrbuf_rewind(c->wrbuf);
+    struct http_header *h;
+    struct http_argument *a;
+
+    wrbuf_printf(c->wrbuf, "%s %s", r->method, r->path);
+
+    if (r->arguments)
+    {
+        wrbuf_putc(c->wrbuf, '?');
+        for (a = r->arguments; a; a = a->next) {
+            if (a != r->arguments)
+                wrbuf_putc(c->wrbuf, '&');
+            wrbuf_printf(c->wrbuf, "%s=%s", a->name, a->value);
+        }
+    }
+
+    wrbuf_printf(c->wrbuf, " HTTP/%s\r\n", r->http_version);
+
+    for (h = r->headers; h; h = h->next)
+        wrbuf_printf(c->wrbuf, "%s: %s\r\n", h->name, h->value);
+
+    wrbuf_puts(c->wrbuf, "\r\n");
+    
+    return http_buf_bywrbuf(c->wrbuf);
 }
 
+
 // Cleanup
 static void http_destroy(IOCHAN i)
 {
     struct http_channel *s = iochan_getdata(i);
 
     yaz_log(YLOG_DEBUG, "Destroying http channel");
+    if (s->proxy)
+    {
+        yaz_log(YLOG_DEBUG, "Destroying Proxy channel");
+        if (s->proxy->iochan)
+        {
+            close(iochan_getfd(s->proxy->iochan));
+            iochan_destroy(s->proxy->iochan);
+        }
+        http_buf_destroy_queue(s->proxy->oqueue);
+        xfree(s->proxy);
+    }
+    http_buf_destroy_queue(s->iqueue);
+    http_buf_destroy_queue(s->oqueue);
     nmem_destroy(s->nmem);
     wrbuf_free(s->wrbuf, 1);
     xfree(s);
@@ -194,6 +415,79 @@ static void http_destroy(IOCHAN i)
     iochan_destroy(i);
 }
 
+static int http_weshouldproxy(struct http_request *rq)
+{
+    if (proxy_addr && !strstr(rq->path, "search.pz2"))
+        return 1;
+    return 0;
+}
+
+static int http_proxy(struct http_request *rq)
+{
+    struct http_channel *c = rq->channel;
+    struct http_proxy *p = c->proxy;
+    struct http_header *hp;
+    struct http_buf *requestbuf;
+
+    yaz_log(YLOG_DEBUG, "Proxy request");
+
+    if (!p) // This is a new connection. Create a proxy channel
+    {
+        int sock;
+        struct protoent *pe;
+        int one = 1;
+        int flags;
+
+        yaz_log(YLOG_DEBUG, "Creating a new proxy channel");
+        if (!(pe = getprotobyname("tcp"))) {
+            abort();
+        }
+        if ((sock = socket(PF_INET, SOCK_STREAM, pe->p_proto)) < 0)
+        {
+            yaz_log(YLOG_WARN|YLOG_ERRNO, "socket");
+            return -1;
+        }
+        if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char*)
+                        &one, sizeof(one)) < 0)
+            abort();
+        if ((flags = fcntl(sock, F_GETFL, 0)) < 0) 
+            yaz_log(YLOG_FATAL|YLOG_ERRNO, "fcntl");
+        if (fcntl(sock, F_SETFL, flags | O_NONBLOCK) < 0)
+            yaz_log(YLOG_FATAL|YLOG_ERRNO, "fcntl2");
+        if (connect(sock, (struct sockaddr *) proxy_addr, sizeof(*proxy_addr)) < 0)
+            if (errno != EINPROGRESS)
+            {
+                yaz_log(YLOG_WARN|YLOG_ERRNO, "Proxy connect");
+                return -1;
+            }
+
+        p = xmalloc(sizeof(struct http_proxy));
+        p->oqueue = 0;
+        p->channel = c;
+        c->proxy = p;
+        // We will add EVENT_OUTPUT below
+        p->iochan = iochan_create(sock, proxy_io, EVENT_INPUT);
+        iochan_setdata(p->iochan, p);
+        p->iochan->next = channel_list;
+        channel_list = p->iochan;
+    }
+
+    // Modify Host: header
+    for (hp = rq->headers; hp; hp = hp->next)
+        if (!strcmp(hp->name, "Host"))
+            break;
+    if (!hp)
+    {
+        yaz_log(YLOG_WARN, "Failed to find Host header in proxy");
+        return -1;
+    }
+    hp->value = nmem_strdup(c->nmem, proxy_url);
+    requestbuf = http_serialize_request(rq);
+    http_buf_enqueue(&p->oqueue, requestbuf);
+    iochan_setflag(p->iochan, EVENT_OUTPUT);
+    return 0;
+}
+
 static void http_io(IOCHAN i, int event)
 {
     struct http_channel *hc = iochan_getdata(i);
@@ -202,74 +496,111 @@ static void http_io(IOCHAN i, int event)
 
     switch (event)
     {
-        int res;
+        int res, reqlen;
+        struct http_buf *htbuf;
 
         case EVENT_INPUT:
             yaz_log(YLOG_DEBUG, "HTTP Input event");
 
-            res = read(iochan_getfd(i), hc->ibuf + hc->read, IBUF_SIZE - (hc->read + 1));
-            if (res <= 0)
+            htbuf = http_buf_create();
+            res = read(iochan_getfd(i), htbuf->buf, HTTP_BUF_SIZE -1);
+            if (res <= 0 && errno != EAGAIN)
             {
                 yaz_log(YLOG_WARN|YLOG_ERRNO, "HTTP read");
+                http_buf_destroy(htbuf);
                 http_destroy(i);
                 return;
             }
-            yaz_log(YLOG_DEBUG, "HTTP read %d octets", res);
-            hc->read += res;
-            hc->ibuf[hc->read] = '\0';
+            if (res > 0)
+            {
+                htbuf->buf[res] = '\0';
+                htbuf->len = res;
+                http_buf_enqueue(&hc->iqueue, htbuf);
+            }
 
-            if ((res = request_check(hc->ibuf)) <= 2)
+            if ((reqlen = request_check(hc->iqueue)) <= 2)
             {
                 yaz_log(YLOG_DEBUG, "We don't have a complete HTTP request yet");
                 return;
             }
-            yaz_log(YLOG_DEBUG, "We think we have a complete HTTP request (len %d): \n%s", res,  hc->ibuf);
+            yaz_log(YLOG_DEBUG, "We think we have a complete HTTP request (len %d)", reqlen);
+
             nmem_reset(hc->nmem);
-            if (!(request = http_parse_request(hc, hc->ibuf)))
+            if (!(request = http_parse_request(hc, &hc->iqueue, reqlen)))
             {
                 yaz_log(YLOG_WARN, "Failed to parse request");
                 http_destroy(i);
                 return;
             }
-            response = http_command(request);
-            if (!response)
+            yaz_log(YLOG_LOG, "Request: %s %s v %s", request->method,  request->path,
+                    request->http_version);
+            if (http_weshouldproxy(request))
+                http_proxy(request);
+            else
             {
-                http_destroy(i);
-                return;
+                struct http_buf *hb;
+                // Execute our business logic!
+                response = http_command(request);
+                if (!response)
+                {
+                    http_destroy(i);
+                    return;
+                }
+                if (!(hb =  http_serialize_response(hc, response)))
+                {
+                    http_destroy(i);
+                    return;
+                }
+                http_buf_enqueue(&hc->oqueue, hb);
+                yaz_log(YLOG_DEBUG, "Response ready");
+                iochan_setflags(i, EVENT_OUTPUT); // Turns off input selecting
             }
-            // FIXME -- do something to cause the response to be sent to the client
-            if (!(hc->obuf = http_serialize_response(hc, response)))
+            if (hc->iqueue)
             {
-                http_destroy(i);
-                return;
+                yaz_log(YLOG_DEBUG, "We think we have more input to read. Forcing event");
+                iochan_setevent(i, EVENT_INPUT);
             }
-            yaz_log(YLOG_DEBUG, "Response ready:\n%s", hc->obuf);
-            hc->writ = 0;
-            hc->read = 0;
-            iochan_setflags(i, EVENT_OUTPUT); // Turns off input selecting
+
             break;
 
         case EVENT_OUTPUT:
             yaz_log(YLOG_DEBUG, "HTTP output event");
-            res = write(iochan_getfd(hc->iochan), hc->obuf + hc->writ,
-                        strlen(hc->obuf + hc->writ));
-            if (res <= 0)
+            if (hc->oqueue)
             {
-                yaz_log(YLOG_WARN|YLOG_ERRNO, "write");
-                http_destroy(i);
-                return;
-            }
-            hc->writ += res;
-            if (!hc->obuf[hc->writ]) {
-                yaz_log(YLOG_DEBUG, "Writing finished");
-                if (!strcmp(hc->version, "1.0"))
+                struct http_buf *wb = hc->oqueue;
+                res = write(iochan_getfd(hc->iochan), wb->buf + wb->offset, wb->len);
+                if (res <= 0)
                 {
-                    yaz_log(YLOG_DEBUG, "Closing 1.0 connection");
+                    yaz_log(YLOG_WARN|YLOG_ERRNO, "write");
                     http_destroy(i);
+                    return;
+                }
+                yaz_log(YLOG_DEBUG, "HTTP Wrote %d octets", res);
+                if (res == wb->len)
+                {
+                    hc->oqueue = hc->oqueue->next;
+                    http_buf_destroy(wb);
                 }
                 else
-                    iochan_setflags(i, EVENT_INPUT); // Turns off output flag
+                {
+                    wb->len -= res;
+                    wb->offset += res;
+                }
+                if (!hc->oqueue) {
+                    yaz_log(YLOG_DEBUG, "Writing finished");
+                    if (!strcmp(hc->version, "1.0"))
+                    {
+                        yaz_log(YLOG_DEBUG, "Closing 1.0 connection");
+                        http_destroy(i);
+                        return;
+                    }
+                    else
+                        iochan_setflags(i, EVENT_INPUT); // Turns off output flag
+                }
             }
+
+            if (!hc->oqueue && hc->proxy && !hc->proxy->iochan) 
+                http_destroy(i); // Server closed; we're done
             break;
         default:
             yaz_log(YLOG_WARN, "Unexpected event on connection");
@@ -277,6 +608,83 @@ static void http_io(IOCHAN i, int event)
     }
 }
 
+// Handles I/O on a client connection to a backend web server (proxy mode)
+static void proxy_io(IOCHAN pi, int event)
+{
+    struct http_proxy *pc = iochan_getdata(pi);
+    struct http_channel *hc = pc->channel;
+
+    switch (event)
+    {
+        int res;
+        struct http_buf *htbuf;
+
+        case EVENT_INPUT:
+            yaz_log(YLOG_DEBUG, "Proxy input event");
+            htbuf = http_buf_create();
+            res = read(iochan_getfd(pi), htbuf->buf, HTTP_BUF_SIZE -1);
+            yaz_log(YLOG_DEBUG, "Proxy read %d bytes.", res);
+            if (res == 0 || (res < 0 && errno != EINPROGRESS))
+            {
+                if (hc->oqueue)
+                {
+                    yaz_log(YLOG_WARN, "Proxy read came up short");
+                    // Close channel and alert client HTTP channel that we're gone
+                    http_buf_destroy(htbuf);
+                    close(iochan_getfd(pi));
+                    iochan_destroy(pi);
+                    pc->iochan = 0;
+                }
+                else
+                {
+                    http_destroy(hc->iochan);
+                    return;
+                }
+            }
+            else
+            {
+                htbuf->buf[res] = '\0';
+                htbuf->len = res;
+                http_buf_enqueue(&hc->oqueue, htbuf);
+            }
+            iochan_setflag(hc->iochan, EVENT_OUTPUT);
+            break;
+        case EVENT_OUTPUT:
+            yaz_log(YLOG_DEBUG, "Proxy output event");
+            if (!(htbuf = pc->oqueue))
+            {
+                iochan_clearflag(pi, EVENT_OUTPUT);
+                return;
+            }
+            res = write(iochan_getfd(pi), htbuf->buf + htbuf->offset, htbuf->len);
+            if (res <= 0)
+            {
+                yaz_log(YLOG_WARN|YLOG_ERRNO, "write");
+                http_destroy(hc->iochan);
+                return;
+            }
+            if (res == htbuf->len)
+            {
+                struct http_buf *np = htbuf->next;
+                http_buf_destroy(htbuf);
+                pc->oqueue = np;
+            }
+            else
+            {
+                htbuf->len -= res;
+                htbuf->offset += res;
+            }
+
+            if (!pc->oqueue) {
+                iochan_setflags(pi, EVENT_INPUT); // Turns off output flag
+            }
+            break;
+        default:
+            yaz_log(YLOG_WARN, "Unexpected event on connection");
+            http_destroy(hc->iochan);
+    }
+}
+
 /* Accept a new command connection */
 static void http_accept(IOCHAN i, int event)
 {
@@ -303,17 +711,17 @@ static void http_accept(IOCHAN i, int event)
     c = iochan_create(s, http_io, EVENT_INPUT | EVENT_EXCEPT);
 
     ch = xmalloc(sizeof(*ch));
-    ch->read = 0;
+    ch->proxy = 0;
     ch->nmem = nmem_create();
     ch->wrbuf = wrbuf_alloc();
     ch->iochan = c;
+    ch->iqueue = ch->oqueue = 0;
     iochan_setdata(c, ch);
 
     c->next = channel_list;
     channel_list = c;
 }
 
-
 /* Create a http-channel listener */
 void http_init(int port)
 {
@@ -347,6 +755,31 @@ void http_init(int port)
     channel_list = c;
 }
 
+void http_set_proxyaddr(char *host)
+{
+    char *p;
+    int port;
+    struct hostent *he;
+
+    strcpy(proxy_url, host);
+    p = strchr(host, ':');
+    yaz_log(YLOG_DEBUG, "Proxying for %s", host);
+    if (p) {
+        port = atoi(p + 1);
+        *p = '\0';
+    }
+    else
+        port = 80;
+    if (!(he = gethostbyname(host))) 
+    {
+        fprintf(stderr, "Failed to lookup '%s'\n", host);
+        exit(1);
+    }
+    proxy_addr = xmalloc(sizeof(struct sockaddr_in));
+    proxy_addr->sin_family = he->h_addrtype;
+    memcpy(&proxy_addr->sin_addr.s_addr, he->h_addr_list[0], he->h_length);
+    proxy_addr->sin_port = htons(port);
+}
 
 /*
  * Local variables:
diff --git a/http.h b/http.h
index bb15ac7..907a19d 100644 (file)
--- a/http.h
+++ b/http.h
@@ -1,19 +1,34 @@
 #ifndef HTTP_H
 #define HTTP_H
 
+// Generic I/O buffer
+struct http_buf
+{
+#define HTTP_BUF_SIZE 4096
+    char buf[4096];
+    int offset;
+    int len;
+    struct http_buf *next;
+};
+
 struct http_channel
 {
     IOCHAN iochan;
-#define IBUF_SIZE 10240
-    char ibuf[IBUF_SIZE];
+    struct http_buf *iqueue;
+    struct http_buf *oqueue;
     char version[10];
-    int read;
-    char *obuf;
-    int writ;
+    struct http_proxy *proxy;
     NMEM nmem;
     WRBUF wrbuf;
 };
 
+struct http_proxy //  attached to iochan for proxy connection
+{
+    IOCHAN iochan;
+    struct http_channel *channel;
+    struct http_buf *oqueue;
+};
+
 struct http_header
 {
     char *name;
@@ -32,11 +47,7 @@ struct http_request
 {
     struct http_channel *channel;
     char http_version[20];
-    enum
-    {
-        Method_GET,
-        Method_other
-    } method;
+    char method[20];
     char *path;
     struct http_header *headers;
     struct http_argument *arguments;
@@ -51,10 +62,18 @@ struct http_response
     char *payload;
 };
 
+void http_set_proxyaddr(char *url);
 void http_init(int port);
 void http_addheader(struct http_response *r, const char *name, const char *value);
-char *argbyname(struct http_request *r, char *name);
-char *headerbyname(struct http_request *r, char *name);
+char *http_argbyname(struct http_request *r, char *name);
+char *http_headerbyname(struct http_request *r, char *name);
 struct http_response *http_create_response(struct http_channel *c);
 
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
 #endif
index 07210ec..0affacd 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * $Id: http_command.c,v 1.1 2006-11-21 18:46:43 quinn Exp $
+ * $Id: http_command.c,v 1.2 2006-11-24 20:29:07 quinn Exp $
  */
 
 #include <stdio.h>
@@ -9,6 +9,7 @@
 #include <stdlib.h>
 #include <strings.h>
 #include <ctype.h>
+#include <sys/time.h>
 
 #include <yaz/yaz-util.h>
 
@@ -21,7 +22,7 @@
 
 struct http_session {
     struct session *psession;
-    char session_id[128];
+    int session_id;
     int timestamp;
     struct http_session *next;
 };
@@ -31,8 +32,8 @@ static struct http_session *session_list = 0;
 struct http_session *http_session_create()
 {
     struct http_session *r = xmalloc(sizeof(*r));
-    r->psession = 0;
-    *r->session_id = '\0';
+    r->psession = new_session();
+    r->session_id = 0;
     r->timestamp = 0;
     r->next = session_list;
     session_list = r;
@@ -66,16 +67,192 @@ static void error(struct http_response *rs, char *code, char *msg, char *txt)
     rs->payload = nmem_strdup(c->nmem, tmp);
 }
 
+int  make_sessionid()
+{
+    struct timeval t;
+    int res;
+    static int seq = 0;
+
+    seq++;
+    if (gettimeofday(&t, 0) < 0)
+        abort();
+    res = t.tv_sec;
+    res = (res << 8) | (seq & 0xff);
+    return res;
+}
+
+static struct http_session *locate_session(struct http_request *rq, struct http_response *rs)
+{
+    struct http_session *p;
+    char *session = http_argbyname(rq, "session");
+    int id;
+
+    if (!session)
+    {
+        error(rs, "417", "Must supply session", 0);
+        return 0;
+    }
+    id = atoi(session);
+    for (p = session_list; p; p = p->next)
+        if (id == p->session_id)
+            return p;
+    error(rs, "417", "Session does not exist, or it has expired", 0);
+    return 0;
+}
+
+
 static void cmd_init(struct http_request *rq, struct http_response *rs)
 {
+    int sesid;
+    char buf[1024];
+    struct http_session *s = http_session_create();
+
+    // FIXME create a pazpar2 session
+    yaz_log(YLOG_DEBUG, "HTTP Session init");
+    sesid = make_sessionid();
+    s->session_id = sesid;
+    sprintf(buf, "<init><status>OK</status><session>%d</session></init>", sesid);
+    rs->payload = nmem_strdup(rq->channel->nmem, buf);
 }
 
+static void cmd_termlist(struct http_request *rq, struct http_response *rs)
+{
+    struct http_session *s = locate_session(rq, rs);
+    struct http_channel *c = rq->channel;
+    struct termlist_score **p;
+    int len;
+    int i;
+
+    if (!s)
+        return;
+    wrbuf_rewind(c->wrbuf);
+
+    wrbuf_puts(c->wrbuf, "<termlist>");
+    p = termlist(s->psession, &len);
+    if (p)
+        for (i = 0; i < len; i++)
+        {
+            wrbuf_puts(c->wrbuf, "\n<term>");
+            wrbuf_printf(c->wrbuf, "<name>%s</name>", p[i]->term);
+            wrbuf_printf(c->wrbuf, "<frequency>%d</frequency>", p[i]->frequency);
+            wrbuf_puts(c->wrbuf, "</term>");
+        }
+    wrbuf_puts(c->wrbuf, "</termlist>");
+    rs->payload = nmem_strdup(rq->channel->nmem, wrbuf_buf(c->wrbuf));
+}
+
+
+static void cmd_bytarget(struct http_request *rq, struct http_response *rs)
+{
+    struct http_session *s = locate_session(rq, rs);
+    struct http_channel *c = rq->channel;
+    struct hitsbytarget *ht;
+    int count, i;
+
+    if (!s)
+        return;
+    if (!(ht = hitsbytarget(s->psession, &count)))
+    {
+        error(rs, "500", "Failed to retrieve hitcounts", 0);
+        return;
+    }
+    wrbuf_rewind(c->wrbuf);
+    wrbuf_puts(c->wrbuf, "<bytarget><status>OK</status>");
+
+    for (i = 0; i < count; i++)
+    {
+        wrbuf_puts(c->wrbuf, "\n<target>");
+        wrbuf_printf(c->wrbuf, "<id>%s</id>\n", ht[i].id);
+        wrbuf_printf(c->wrbuf, "<hits>%d</hits>\n", ht[i].hits);
+        wrbuf_printf(c->wrbuf, "<diagnostic>%d</diagnostic>\n", ht[i].diagnostic);
+        wrbuf_printf(c->wrbuf, "<records>%d</records>\n", ht[i].records);
+        wrbuf_printf(c->wrbuf, "<state>%s</state>\n", ht[i].state);
+        wrbuf_puts(c->wrbuf, "</target>");
+    }
+
+    wrbuf_puts(c->wrbuf, "</bytarget>");
+    rs->payload = nmem_strdup(c->nmem, wrbuf_buf(c->wrbuf));
+}
+
+static void cmd_show(struct http_request *rq, struct http_response *rs)
+{
+    struct http_session *s = locate_session(rq, rs);
+    struct http_channel *c = rq->channel;
+    struct record **rl;
+    char *start = http_argbyname(rq, "start");
+    char *num = http_argbyname(rq, "num");
+    int startn = 0;
+    int numn = 20;
+    int i;
+
+    if (!s)
+        return;
+
+    if (start)
+        startn = atoi(start);
+    if (num)
+        numn = atoi(num);
+
+    rl = show(s->psession, startn, &numn);
+
+    wrbuf_rewind(c->wrbuf);
+    wrbuf_puts(c->wrbuf, "<show>\n<status>OK</status>\n");
+
+    for (i = 0; i < numn; i++)
+    {
+        int ccount;
+        struct record *p;
+
+        wrbuf_puts(c->wrbuf, "<hit>\n");
+        wrbuf_printf(c->wrbuf, "<merge_key>%s</merge_key>\n", rl[i]->merge_key);
+        for (ccount = 1, p = rl[i]->next_cluster; p;  p = p->next_cluster, ccount++)
+            ;
+        if (ccount > 1)
+            wrbuf_printf(c->wrbuf, "<count>%d</count>\n", ccount);
+        wrbuf_puts(c->wrbuf, "</hit>\n");
+    }
+
+    wrbuf_puts(c->wrbuf, "</show>\n");
+    rs->payload = nmem_strdup(c->nmem, wrbuf_buf(c->wrbuf));
+}
+
+static void cmd_search(struct http_request *rq, struct http_response *rs)
+{
+    struct http_session *s = locate_session(rq, rs);
+    char *query = http_argbyname(rq, "query");
+
+    if (!s)
+        return;
+    if (!query)
+    {
+        error(rs, "417", "Must supply query", 0);
+        return;
+    }
+    search(s->psession, query);
+    rs->payload = "<search><status>OK</status></search>";
+}
+
+
 static void cmd_stat(struct http_request *rq, struct http_response *rs)
 {
 }
 
 static void cmd_load(struct http_request *rq, struct http_response *rs)
 {
+    struct http_session *s = locate_session(rq, rs);
+    char *fn = http_argbyname(rq, "name");
+
+    if (!s)
+        return;
+    if (!fn)
+    {
+        error(rs, "417", "Must suppply name", 0);
+        return;
+    }
+    if (load_targets(s->psession, fn) < 0)
+        error(rs, "417", "Failed to find targets", "Possibly wrong filename");
+    else
+        rs->payload = "<load><status>OK</status></load>";
 }
 
 struct {
@@ -85,12 +262,16 @@ struct {
     { "init", cmd_init },
     { "stat", cmd_stat },
     { "load", cmd_load },
+    { "bytarget", cmd_bytarget },
+    { "show", cmd_show },
+    { "search", cmd_search },
+    { "termlist", cmd_termlist },
     {0,0}
 };
 
 struct http_response *http_command(struct http_request *rq)
 {
-    char *command = argbyname(rq, "command");
+    char *command = http_argbyname(rq, "command");
     struct http_channel *c = rq->channel;
     struct http_response *rs = http_create_response(c);
     int i;
index b4ca296..c2d4fbf 100644 (file)
--- a/pazpar2.c
+++ b/pazpar2.c
@@ -1,4 +1,4 @@
-/* $Id: pazpar2.c,v 1.3 2006-11-21 18:46:43 quinn Exp $ */
+/* $Id: pazpar2.c,v 1.4 2006-11-24 20:29:07 quinn Exp $ */
 
 #include <stdlib.h>
 #include <stdio.h>
@@ -21,6 +21,9 @@
 #include "eventl.h"
 #include "command.h"
 #include "http.h"
+#include "termlists.h"
+#include "reclists.h"
+#include "relevance.h"
 
 #define PAZPAR2_VERSION "0.1"
 #define MAX_DATABASES 512
@@ -80,6 +83,7 @@ static struct parameters {
     struct timeval base_time;
     int toget;
     int chunk;
+    void *ccl_filter;
 } global_parameters = 
 {
     30,
@@ -88,7 +92,8 @@ static struct parameters {
     PAZPAR2_VERSION,
     {0,0},
     100,
-    MAX_CHUNK
+    MAX_CHUNK,
+    0
 };
 
 
@@ -289,7 +294,11 @@ const char *find_subfield(const char *field, char subfield)
             p++;
         if (*p == '\t' && *(++p) == subfield) {
             if (*(++p) == ' ')
-                return ++p;
+            {
+                while (isspace(*p))
+                    p++;
+                return p;
+            }
         }
     }
     return 0;
@@ -358,6 +367,7 @@ char *extract_mergekey(struct session *s, const char *rec)
     return out;
 }
 
+#ifdef RECHEAP
 static void push_record(struct session *s, struct record *r)
 {
     int p;
@@ -387,13 +397,15 @@ static struct record *top_record(struct session *s)
 
 static struct record *pop_record(struct session *s)
 {
-    struct record *res = s->recheap[0];
+    struct record *res;
     int p = 0;
     int lastnonleaf = (s->recheap_max - 1) >> 1;
 
     if (s->recheap_max < 0)
         return 0;
 
+    res = s->recheap[0];
+
     s->recheap[p] = s->recheap[s->recheap_max--];
 
     while (p <= lastnonleaf)
@@ -472,6 +484,36 @@ static void rewind_recheap(struct session *s)
     }
 }
 
+#endif
+
+// FIXME needs to be generalized. Should flexibly generate X lists per search
+static void extract_subject(struct session *s, const char *rec)
+{
+    const char *field, *subfield;
+
+    while ((field = find_field(rec, "650")))
+    {
+        rec = field + 1; // Crude way to cause a loop through repeating fields
+        if ((subfield = find_subfield(field, 'a')))
+        {
+            char *e, *ef;
+            char buf[1024];
+            int len;
+
+            ef = index(subfield, '\n');
+            if ((e = index(subfield, '\t')) && e < ef)
+                ef = e;
+            while (ef > subfield && !isalpha(*(ef - 1)) && *(ef - 1) != ')')
+                ef--;
+            len = ef - subfield;
+            assert(len < 1023);
+            memcpy(buf, subfield, len);
+            buf[len] = '\0';
+            termlist_insert(s->termlist, buf);
+        }
+    }
+}
+
 struct record *ingest_record(struct target *t, char *buf, int len)
 {
     struct session *s = t->session;
@@ -490,6 +532,8 @@ struct record *ingest_record(struct target *t, char *buf, int len)
 
     res = nmem_malloc(s->nmem, sizeof(struct record));
 
+    extract_subject(s, recbuf);
+
     res->merge_key = extract_mergekey(s, recbuf);
     if (!res->merge_key)
         return 0;
@@ -500,7 +544,7 @@ struct record *ingest_record(struct target *t, char *buf, int len)
 
     yaz_log(YLOG_DEBUG, "Key: %s", res->merge_key);
 
-    push_record(s, res);
+    reclist_insert(s->reclist, res);
 
     return res;
 }
@@ -812,20 +856,12 @@ void search(struct session *s, char *query)
     }
     if (live_channels)
     {
+        const char *t[] = { "aa", "ab", 0 };
         int maxrecs = live_channels * global_parameters.toget;
-        if (!s->recheap_size)
-        {
-            s->recheap = xmalloc(maxrecs * sizeof(struct record *));
-            s->recheap_size = maxrecs;
-        }
-        else if (s->recheap_size < maxrecs)
-        {
-            s->recheap = xrealloc(s->recheap, maxrecs * sizeof(struct record*));
-            s->recheap_size = maxrecs;
-        }
+        s->termlist = termlist_create(s->nmem, maxrecs, 15);
+        s->reclist = reclist_create(s->nmem, maxrecs);
+        relevance_create(s->nmem, t, 1000);
     }
-    s->recheap_max = -1;
-    s->recheap_scratch = -1;
 }
 
 struct session *new_session() 
@@ -834,6 +870,8 @@ struct session *new_session()
 
     yaz_log(YLOG_DEBUG, "New pazpar2 session");
     
+    session->termlist = 0;
+    session->reclist = 0;
     session->requestid = -1;
     session->targets = 0;
     session->pqf_parser = yaz_pqf_create();
@@ -842,8 +880,6 @@ struct session *new_session()
     session->yaz_marc = yaz_marc_create();
     yaz_marc_subfield_str(session->yaz_marc, "\t");
     session->wrbuf = wrbuf_alloc();
-    session->recheap = 0;
-    session->recheap_size = 0;
 
     return session;
 }
@@ -877,6 +913,11 @@ struct hitsbytarget *hitsbytarget(struct session *s, int *count)
     return res;
 }
 
+struct termlist_score **termlist(struct session *s, int *num)
+{
+    return termlist_highscore(s->termlist, num);
+}
+
 struct record **show(struct session *s, int start, int *num)
 {
     struct record **recs = nmem_malloc(s->nmem, *num * sizeof(struct record *));
@@ -884,16 +925,16 @@ struct record **show(struct session *s, int start, int *num)
 
     // FIXME -- skip initial records
 
+    reclist_rewind(s->reclist);
     for (i = 0; i < *num; i++)
     {
-        recs[i] = read_recheap(s);
+        recs[i] = reclist_read_record(s->reclist);
         if (!recs[i])
         {
             *num = i;
             break;
         }
     }
-    rewind_recheap(s);
     return recs;
 }
 
@@ -926,6 +967,11 @@ void statistics(struct session *s, struct statistics *stat)
     stat->num_connections = i;
 }
 
+static void *load_cclfile(const char *fn)
+{
+    return 0;
+}
+
 int main(int argc, char **argv)
 {
     int ret;
@@ -936,7 +982,7 @@ int main(int argc, char **argv)
 
     yaz_log_init(YLOG_DEFAULT_LEVEL|YLOG_DEBUG, "pazpar2", 0);
 
-    while ((ret = options("c:h:", argv, argc, &arg)) != -2)
+    while ((ret = options("c:h:p:C:", argv, argc, &arg)) != -2)
     {
        switch (ret) {
            case 0:
@@ -944,9 +990,15 @@ int main(int argc, char **argv)
            case 'c':
                command_init(atoi(arg));
                break;
+            case 'C':
+                global_parameters.ccl_filter = load_cclfile(arg);
+                break;
             case 'h':
                 http_init(atoi(arg));
                 break;
+            case 'p':
+                http_set_proxyaddr(arg);
+                break;
            default:
                fprintf(stderr, "Usage: pazpar2 -d comport");
                exit(1);
index 0a737ac..aa33a65 100644 (file)
--- a/pazpar2.h
+++ b/pazpar2.h
@@ -2,6 +2,7 @@
 #define PAZPAR2_H
 
 #include <yaz/pquery.h>
+#include "termlists.h"
 
 struct record {
     struct target *target;
@@ -18,10 +19,8 @@ struct session {
     char query[1024];
     NMEM nmem;
     WRBUF wrbuf;
-    struct record **recheap;
-    int recheap_size;
-    int recheap_max;
-    int recheap_scratch;
+    struct termlist *termlist;
+    struct reclist *reclist;
     yaz_marc_t yaz_marc;
 };
 
@@ -52,6 +51,7 @@ int load_targets(struct session *s, const char *fn);
 void statistics(struct session *s, struct statistics *stat);
 void search(struct session *s, char *query);
 struct record **show(struct session *s, int start, int *num);
+struct termlist_score **termlist(struct session *s, int *num);
 
 #endif
 
diff --git a/reclists.c b/reclists.c
new file mode 100644 (file)
index 0000000..004efcd
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ * $Id: reclists.c,v 1.1 2006-11-24 20:29:07 quinn Exp $
+ */
+
+#include <assert.h>
+
+#include <yaz/yaz-util.h>
+
+#include "pazpar2.h"
+#include "reclists.h"
+
+struct reclist_bucket
+{
+    struct record *record;
+    struct reclist_bucket *next;
+};
+
+struct reclist
+{
+    struct reclist_bucket **hashtable;
+    int hashtable_size;
+    int hashmask;
+
+    struct record **flatlist;
+    int flatlist_size;
+    int num_records;
+    int pointer;
+
+    NMEM nmem;
+};
+
+struct record *reclist_read_record(struct reclist *l)
+{
+    if (l->pointer < l->num_records)
+        return l->flatlist[l->pointer++];
+    else
+        return 0;
+}
+
+void reclist_rewind(struct reclist *l)
+{
+    l->pointer = 0;
+}
+
+// Jenkins one-at-a-time hash (from wikipedia)
+static unsigned int hash(const unsigned char *key)
+{
+    unsigned int hash = 0;
+
+    while (*key)
+    {
+        hash += *(key++);
+        hash += (hash << 10);
+        hash ^= (hash >> 6);
+    }
+    hash += (hash << 3);
+    hash ^= (hash >> 11);
+    hash += (hash << 15);
+    return hash;
+}
+
+struct reclist *reclist_create(NMEM nmem, int numrecs)
+{
+    int hashsize = 1;
+    struct reclist *res;
+
+    assert(numrecs);
+    while (hashsize < numrecs)
+        hashsize <<= 1;
+    res = nmem_malloc(nmem, sizeof(struct reclist));
+    res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct reclist_bucket*));
+    bzero(res->hashtable, hashsize * sizeof(struct reclist_bucket*));
+    res->hashtable_size = hashsize;
+    res->nmem = nmem;
+    res->hashmask = hashsize - 1; // Creates a bitmask
+
+    res->num_records = 0;
+    res->flatlist = nmem_malloc(nmem, numrecs * sizeof(struct record*));
+    res->flatlist_size = numrecs;
+
+    return res;
+}
+
+void reclist_insert(struct reclist *l, struct record  *record)
+{
+    unsigned int bucket;
+    struct reclist_bucket **p;
+
+    bucket = hash(record->merge_key) & l->hashmask;
+    for (p = &l->hashtable[bucket]; *p; p = &(*p)->next)
+    {
+        // We found a matching record. Merge them
+        if (!strcmp(record->merge_key, (*p)->record->merge_key))
+        {
+            struct record *existing = (*p)->record;
+            yaz_log(YLOG_LOG, "Found a matching record: %s", record->merge_key);
+            record->next_cluster = existing->next_cluster;
+            existing->next_cluster = record;
+            break;
+        }
+    }
+    if (!*p) // We made it to the end of the bucket without finding match
+    {
+        yaz_log(YLOG_DEBUG, "Added a new record: %s", record->merge_key);
+        struct reclist_bucket *new = nmem_malloc(l->nmem,
+                sizeof(struct reclist_bucket));
+        new->record = record;
+        record->next_cluster = 0;
+        new->next = 0;
+        *p = new;
+        l->flatlist[l->num_records++] = record;
+    }
+}
+
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
diff --git a/reclists.h b/reclists.h
new file mode 100644 (file)
index 0000000..b80522e
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef RECLISTS_H
+#define RECLISTS_H
+
+struct reclist;
+
+struct reclist *reclist_create(NMEM, int numrecs);
+void reclist_insert(struct reclist *tl, struct record  *record);
+struct record *reclist_read_record(struct reclist *l);
+void reclist_rewind(struct reclist *l);
+
+#endif
diff --git a/relevance.c b/relevance.c
new file mode 100644 (file)
index 0000000..fc85e38
--- /dev/null
@@ -0,0 +1,136 @@
+/*
+ * $Id: relevance.c,v 1.1 2006-11-24 20:29:07 quinn Exp $
+ */
+
+#include <ctype.h>
+
+#include "relevance.h"
+#include "pazpar2.h"
+
+struct relevance
+{
+    struct relevance_record *records;
+    int num_records;
+    int *doc_frequency_vec;
+    int vec_len;
+    struct word_trie *wt;
+    NMEM nmem;
+};
+
+struct relevance_record
+{
+    struct record *record;
+    int *term_frequency_vec;
+};
+
+// We use this data structure to recognize terms in input records,
+// and map them to record term vectors for counting.
+struct word_trie
+{
+    struct
+    {
+        struct word_trie *child;
+        int termno;
+    } list[26];
+};
+
+static struct word_trie *create_word_trie_node(NMEM nmem)
+{
+    struct word_trie *res = nmem_malloc(nmem, sizeof(struct word_trie));
+    int i;
+    for (i = 0; i < 26; i++)
+    {
+        res->list[i].child = 0;
+        res->list[i].termno = -1;
+    }
+    return res;
+}
+
+static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, int num)
+{
+    while (*term) {
+        int c = tolower(*term);
+        if (c < 'a' || c > 'z')
+            term++;
+        else
+        {
+            c -= 'a';
+            if (!n->list[c].child)
+            {
+                struct word_trie *new = create_word_trie_node(nmem);
+                n->list[c].child = new;
+            }
+            if (!*(++term))
+                n->list[c].termno = num;
+            else
+                word_trie_addterm(nmem, n->list[c].child, term, num);
+            break;
+        }
+    }
+
+}
+
+static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
+{
+    struct word_trie *res = create_word_trie_node(nmem);
+    const char **p;
+    int i;
+
+    for (i = 1, p = terms; *p; p++, i++)
+        word_trie_addterm(nmem, res, *p, i);
+    return res;
+}
+
+struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs)
+{
+    struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
+    const char **p;
+    int i;
+
+    for (p = terms, i = 0; *p; p++, i++)
+        ;
+    res->vec_len = ++i;
+    res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
+    bzero(res->doc_frequency_vec, res->vec_len * sizeof(int));
+    res->nmem = nmem;
+    res->num_records = 0;
+    res->records = nmem_malloc(nmem, numrecs * sizeof(struct relevance_record *));
+    res->wt = build_word_trie(nmem, terms);
+    return res;
+}
+
+struct relevance_record *relevance_newrec(struct relevance *r, struct record *rec)
+{
+    struct relevance_record *res = nmem_malloc(r->nmem,
+            sizeof(struct relevance_record));
+    res->record = rec;
+    res->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
+    bzero(res->term_frequency_vec, r->vec_len * sizeof(int));
+    return res;
+}
+
+void relevance_countwords(struct relevance_record *rec, const char *words, int len)
+{
+}
+
+void relevance_donerecord(struct relevance_record *rec)
+{
+}
+
+// Prepare for a relevance-sorted read of up to num entries
+void relevance_prepare_read(struct relevance *r, int num)
+{
+}
+
+struct record *relevance_read(struct relevance *r)
+{
+    return 0;
+}
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
diff --git a/relevance.h b/relevance.h
new file mode 100644 (file)
index 0000000..3a2bc15
--- /dev/null
@@ -0,0 +1,27 @@
+#ifndef RELEVANCE_H
+#define RELEVANCE_H
+
+#include <yaz/yaz-util.h>
+
+#include "pazpar2.h"
+
+struct relevance;
+struct relevance_record;
+
+struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs);
+struct relevance_record *relevance_newrec(struct relevance *r, struct record *rec);
+void relevance_countwords(struct relevance_record *rec, const char *words, int len);
+void relevance_donerecord(struct relevance_record *rec);
+
+void relevance_prepare_read(struct relevance *r, int num);
+struct record *relevance_read(struct relevance *r);
+
+#endif
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
diff --git a/termlists.c b/termlists.c
new file mode 100644 (file)
index 0000000..fbf346e
--- /dev/null
@@ -0,0 +1,165 @@
+/*
+ * $Id: termlists.c,v 1.1 2006-11-24 20:29:07 quinn Exp $
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <yaz/yaz-util.h>
+
+#include "termlists.h"
+
+// Discussion:
+// As terms are found in incoming records, they are added to (or updated in) a
+// Hash table. When term records are updated, a frequency value is updated. At
+// the same time, a highscore is maintained for the most frequent terms.
+
+struct termlist_bucket
+{
+    struct termlist_score term;
+    struct termlist_bucket *next;
+};
+
+struct termlist
+{
+    struct termlist_bucket **hashtable;
+    int hashtable_size;
+    int hashmask;
+
+    struct termlist_score **highscore;
+    int highscore_size;
+    int highscore_num;
+    int highscore_min;
+
+    NMEM nmem;
+};
+
+
+// Jenkins one-at-a-time hash (from wikipedia)
+static unsigned int hash(const unsigned char *key)
+{
+    unsigned int hash = 0;
+
+    while (*key)
+    {
+        hash += *(key++);
+        hash += (hash << 10);
+        hash ^= (hash >> 6);
+    }
+    hash += (hash << 3);
+    hash ^= (hash >> 11);
+    hash += (hash << 15);
+    return hash;
+}
+
+struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size)
+{
+    int hashsize = 1;
+    int halfnumterms;
+    struct termlist *res;
+
+    // Calculate a hash size smallest power of 2 larger than 50% of expected numterms
+    halfnumterms = numterms >> 1;
+    if (halfnumterms < 0)
+        halfnumterms = 1;
+    while (hashsize < halfnumterms)
+        hashsize <<= 1;
+    res = nmem_malloc(nmem, sizeof(struct termlist));
+    res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct termlist_bucket*));
+    bzero(res->hashtable, hashsize * sizeof(struct termlist_bucket*));
+    res->hashtable_size = hashsize;
+    res->nmem = nmem;
+    res->hashmask = hashsize - 1; // Creates a bitmask
+
+    res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *));
+    res->highscore_size = highscore_size;
+    res->highscore_num = 0;
+    res->highscore_min = 0;
+
+    return res;
+}
+
+static void update_highscore(struct termlist *tl, struct termlist_score *t)
+{
+    int i;
+    int smallest;
+    int me = -1;
+
+    if (t->frequency < tl->highscore_min)
+        return;
+
+    smallest = 0;
+    for (i = 0; i < tl->highscore_num; i++)
+    {
+        if (tl->highscore[i]->frequency < tl->highscore[smallest]->frequency)
+            smallest = i;
+        if (tl->highscore[i] == t)
+            me = i;
+    }
+    if (tl->highscore_num)
+        tl->highscore_min = tl->highscore[smallest]->frequency;
+    if (me >= 0)
+        return;
+    if (tl->highscore_num < tl->highscore_size)
+    {
+        tl->highscore[tl->highscore_num++] = t;
+        if (t->frequency < tl->highscore_min)
+            tl->highscore_min = t->frequency;
+    }
+    else
+    {
+        if (t->frequency > tl->highscore[smallest]->frequency)
+        {
+            tl->highscore[smallest] = t;
+        }
+    }
+}
+
+void termlist_insert(struct termlist *tl, const char *term)
+{
+    unsigned int bucket;
+    struct termlist_bucket **p;
+
+    bucket = hash(term) & tl->hashmask;
+    for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
+    {
+        if (!strcmp(term, (*p)->term.term))
+        {
+            yaz_log(YLOG_LOG, "Found a matching term: %s", term);
+            (*p)->term.frequency++;
+            update_highscore(tl, &((*p)->term));
+            break;
+        }
+    }
+    if (!*p) // We made it to the end of the bucket without finding match
+    {
+        yaz_log(YLOG_DEBUG, "Added a new term: %s", term);
+        struct termlist_bucket *new = nmem_malloc(tl->nmem,
+                sizeof(struct termlist_bucket));
+        new->term.term = nmem_strdup(tl->nmem, term);
+        new->term.frequency = 1;
+        new->next = 0;
+        *p = new;
+        update_highscore(tl, &new->term);
+    }
+}
+
+static int compare(const void *s1, const void *s2)
+{
+    struct termlist_score **p1 = (struct termlist_score**) s1, **p2 = (struct termlist_score **) s2;
+    return (*p2)->frequency - (*p1)->frequency;
+}
+
+struct termlist_score **termlist_highscore(struct termlist *tl, int *len)
+{
+    qsort(tl->highscore, tl->highscore_num, sizeof(struct termlist_score*), compare);
+    *len = tl->highscore_num;
+    return tl->highscore;
+}
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
diff --git a/termlists.h b/termlists.h
new file mode 100644 (file)
index 0000000..2dbee2d
--- /dev/null
@@ -0,0 +1,26 @@
+#ifndef TERMLISTS_H
+#define TERMLISTS_H
+
+#include <yaz/nmem.h>
+
+struct termlist_score
+{
+    char *term;
+    int frequency;
+};
+
+struct termlist;
+
+struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size);
+void termlist_insert(struct termlist *tl, const char *term);
+struct termlist_score **termlist_highscore(struct termlist *tl, int *len);
+
+#endif
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */