More work on multi-map for sort
author Adam Dickmeiss <adam@indexdata.dk>
Tue, 9 Sep 2008 09:08:11 +0000 (11:08 +0200)
committer Adam Dickmeiss <adam@indexdata.dk>
Tue, 9 Sep 2008 09:08:11 +0000 (11:08 +0200)
Added a new function zebra_sort_add_ent to eventually replace
zebra_sort_add.

include/sortidx.h
index/extract.c
index/reckeys.c
index/sortidx.c

index f944ba7..5800382 100644 (file)
@@ -38,6 +38,10 @@ typedef struct zebra_sort_index *zebra_sort_index_t;
 #define ZEBRA_SORT_TYPE_ISAMB 2
 #define ZEBRA_SORT_TYPE_MULTI 3
 
+struct zebra_sort_ent { /* multi-map sort value passed to zebra_sort_add_ent */
+    int num;            /* number of values that have been appended to wrbuf */
+    WRBUF wrbuf;        /* the values back to back, each followed by a '\0' */
+};
 
 /** \brief creates sort handle
     \param bfs block files handle
@@ -73,6 +77,15 @@ void zebra_sort_sysno(zebra_sort_index_t si, zint sysno);
 void zebra_sort_add(zebra_sort_index_t si, const char *buf, int len);
 
 
+/** \brief adds multi-map content to sort file
+    \param si sort index handle
+    \param ent multi-map value; ent->wrbuf holds the 0-terminated entries
+
+    zebra_sort_type and zebra_sort_sysno must be called prior to this
+*/
+void zebra_sort_add_ent(zebra_sort_index_t si, struct zebra_sort_ent *ent);
+
+
 /** \brief delete sort entry
     \param si sort index handle
 
index 89f072a..0d648a6 100644 (file)
@@ -1867,8 +1867,56 @@ void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
        const char *str;
        struct it_key key_in;
 
+#define USE_SORT_ENT 1
+#if USE_SORT_ENT
+        NMEM nmem = nmem_create();
+        struct sort_add_ent {
+            int ord;
+            int cmd;
+            struct sort_add_ent *next;
+            struct zebra_sort_ent sort_ent;
+        };
+        struct sort_add_ent *sort_ent_list = 0;
+#endif
         zebra_sort_sysno(si, sysno);
 
+#if USE_SORT_ENT
+       while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
+        {
+            int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
+
+            struct sort_add_ent **e = &sort_ent_list;
+            while (*e && (*e)->ord != ord)
+                e = &(*e)->next;
+            if (!*e)
+            {
+                *e = nmem_malloc(nmem, sizeof(**e));
+                (*e)->next = 0;
+                (*e)->sort_ent.wrbuf = wrbuf_alloc();
+                (*e)->sort_ent.num = 0;
+                (*e)->ord = ord;
+                (*e)->cmd = cmd;
+            }
+            
+            wrbuf_write((*e)->sort_ent.wrbuf, str, slen);
+            wrbuf_putc((*e)->sort_ent.wrbuf, '\0');
+            (*e)->sort_ent.num++;
+        }
+        if (sort_ent_list)
+        {
+            struct sort_add_ent *e = sort_ent_list;
+            for (; e; e = e->next)
+            {
+                zebra_sort_type(si, e->ord);
+                if (e->cmd == 1)
+                    zebra_sort_add_ent(si, &e->sort_ent);
+                else
+                    zebra_sort_delete(si);
+                wrbuf_destroy(e->sort_ent.wrbuf);
+            }
+        }
+        nmem_destroy(nmem);
+#else
        while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
         {
             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
@@ -1879,6 +1927,7 @@ void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
             else
                 zebra_sort_delete(si);
         }
+#endif
     }
 }
 
index dec3e05..adea244 100644 (file)
@@ -27,31 +27,12 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #include <yaz/nmem.h>
 #include <yaz/xmalloc.h>
 
-#define NEW 0
-
-#if NEW
-struct zebra_rec_word_entry {
-    char *buf;
-    size_t len;
-    int ord;
-    int max_seq;
-    struct zebra_rec_word_entry *next;
-    struct zebra_rec_key_entry *keys;
-    struct zebra_rec_key_entry **last_key;
-};
-
-struct zebra_rec_key_entry {
-    struct it_key key;
-    struct zebra_rec_key_entry *next;
-};
-#else
 struct zebra_rec_key_entry {
     char *buf;
     size_t len;
     struct it_key key;
     struct zebra_rec_key_entry *next;
 };
-#endif
 
 struct zebra_rec_keys_t_ {
     size_t buf_used;
@@ -65,28 +46,10 @@ struct zebra_rec_keys_t_ {
 
     NMEM nmem;
     size_t hash_size;
-#if NEW
-    struct zebra_rec_word_entry **entries;
-#else
     struct zebra_rec_key_entry **entries;
-#endif
 };
 
 
-#if NEW
-struct zebra_rec_word_entry **zebra_rec_keys_mk_hash(zebra_rec_keys_t p,
-                                                     const char *buf,
-                                                     size_t len,
-                                                     int ord)
-{
-    int i;
-    unsigned h = ord;
-
-    for (i = 0; i<len; i++)
-       h = h * 65509 + buf[i];
-    return &p->entries[h % (unsigned) p->hash_size];
-}
-#else
 struct zebra_rec_key_entry **zebra_rec_keys_mk_hash(zebra_rec_keys_t p,
                                                    const char *buf,
                                                    size_t len,
@@ -105,7 +68,6 @@ struct zebra_rec_key_entry **zebra_rec_keys_mk_hash(zebra_rec_keys_t p,
 #endif
     return &p->entries[h % (unsigned) p->hash_size];
 }
-#endif
 
 static void init_hash(zebra_rec_keys_t p)
 {
@@ -190,65 +152,6 @@ void zebra_rec_keys_close(zebra_rec_keys_t p)
     xfree(p);
 }
 
-#if NEW
-void zebra_rec_keys_write(zebra_rec_keys_t keys, 
-                         const char *str, size_t slen,
-                         const struct it_key *key)
-{
-    char *dst;
-    const char *src = (char*) key;
-    
-    struct zebra_rec_word_entry **wep;
-    struct zebra_rec_key_entry **kep;
-    int ord = key->mem[0];
-    int seq = key->mem[key->len-1];
-    
-    assert(keys->owner_of_buffer);
-
-    wep = zebra_rec_keys_mk_hash(keys, str, slen, ord);
-
-    while (*wep)
-    {
-       struct zebra_rec_word_entry *e = *wep;
-       if (ord == e->ord && slen == e->len && !memcmp(str, e->buf, slen))
-            break;
-       wep = &(*wep)->next;
-    }
-    
-    if (!*wep)
-    {
-        *wep = nmem_malloc(keys->nmem, sizeof(**wep));
-        (*wep)->buf = nmem_malloc(keys->nmem, slen);
-        memcpy((*wep)->buf, str, slen);
-        (*wep)->len = slen;
-        (*wep)->ord = ord;
-        (*wep)->next = 0;
-        (*wep)->keys = 0;
-        (*wep)->max_seq = 0;
-        (*wep)->last_key = &(*wep)->keys;
-    }
-    if (seq > (*wep)->max_seq)
-        kep = (*wep)->last_key;
-    else
-    {
-        kep = &(*wep)->keys;
-        while (*kep)
-        {
-            if (!key_compare(key, &(*kep)->key))
-                return;
-            kep = &(*kep)->next;
-        }
-    }
-    *kep = nmem_malloc(keys->nmem, sizeof(**kep));
-    (*kep)->next = 0;
-    (*wep)->last_key = &(*kep)->next;
-    memcpy(&(*kep)->key, key, sizeof(*key));
-    if (seq > (*wep)->max_seq)
-    {
-        (*wep)->max_seq = seq;
-    }
-}
-#else
 int zebra_rec_keys_add_hash(zebra_rec_keys_t keys, 
                            const char *str, size_t slen,
                            const struct it_key *key)
@@ -320,7 +223,6 @@ void zebra_rec_keys_write(zebra_rec_keys_t keys,
     *dst++ = '\0';
     keys->buf_used = dst - keys->buf;
 }
-#endif
 
 void zebra_rec_keys_reset(zebra_rec_keys_t keys)
 {
@@ -337,42 +239,6 @@ int zebra_rec_keys_rewind(zebra_rec_keys_t keys)
     assert(keys);
     iscz1_reset(keys->decode_handle);
 
-#if NEW
-    if (keys->buf_used == 0)
-    {
-        size_t i;
-        for (i = 0; i<keys->hash_size; i++)
-        {
-            struct zebra_rec_word_entry *we = keys->entries[i];
-            for (; we; we = we->next)
-            {
-                struct zebra_rec_key_entry *ke = we->keys;
-                for (; ke; ke = ke->next)
-                {
-                    const char *src = (char*) &ke->key;
-                    char *dst;
-                    if (keys->buf_used+1024 > keys->buf_max)
-                    {
-                        char *b = (char *) xmalloc (keys->buf_max += 128000);
-                        if (keys->buf_used > 0)
-                            memcpy (b, keys->buf, keys->buf_used);
-                        xfree (keys->buf);
-                        keys->buf = b;
-                    }
-                    
-                    dst = keys->buf + keys->buf_used;
-                    
-                    iscz1_encode(keys->encode_handle, &dst, &src);
-                    
-                    memcpy (dst, we->buf, we->len);
-                    dst += we->len;
-                    *dst++ = '\0';
-                    keys->buf_used = dst - keys->buf;
-                }
-            }
-        }
-    }
-#endif
 
     keys->fetch_offset = 0;
     if (keys->buf_used == 0)
index 65bff6b..ef15209 100644 (file)
@@ -374,6 +374,77 @@ void zebra_sort_delete(zebra_sort_index_t si)
     }
 }
 
+void zebra_sort_add_ent(zebra_sort_index_t si, struct zebra_sort_ent *ent)
+{
+    struct sortFile *sf = si->current_file;
+    int len;
+
+    if (!sf || !sf->u.bf)
+        return;
+    switch(si->type)
+    {
+    case ZEBRA_SORT_TYPE_FLAT:
+        /* take first entry from wrbuf - itself is 0-terminated */
+        len = strlen(wrbuf_buf(ent->wrbuf));
+        if (len > SORT_IDX_ENTRYSIZE)
+            len = SORT_IDX_ENTRYSIZE;
+        /* clear the rest of entry_buf so bytes of an earlier entry are not written */
+        memcpy(si->entry_buf, wrbuf_buf(ent->wrbuf), len);
+        if (len < SORT_IDX_ENTRYSIZE)
+            memset(si->entry_buf+len, 0, SORT_IDX_ENTRYSIZE-len);
+        bf_write(sf->u.bf, si->sysno+1, 0, 0, si->entry_buf);
+        break;
+    case ZEBRA_SORT_TYPE_ISAMB:
+        assert(sf->u.isamb);
+
+        assert(sf->no_inserted == 0);
+        if (sf->no_inserted == 0)
+        {
+            struct sort_term_stream s;
+            ISAMC_I isamc_i;
+            /* take first entry from wrbuf - itself is 0-terminated */
+            len = strlen(wrbuf_buf(ent->wrbuf)); 
+
+            s.st.sysno = si->sysno;
+            if (len >= SORT_MAX_TERM)
+                len = SORT_MAX_TERM-1;
+            memcpy(s.st.term, wrbuf_buf(ent->wrbuf), len);
+            s.st.term[len] = '\0';
+            s.st.length = len;
+            s.no = 1;
+            s.insert_flag = 1;
+            isamc_i.clientData = &s;
+            isamc_i.read_item = sort_term_code_read;
+            /* s is handed to isamb_merge through the read_item callback */
+            isamb_merge(sf->u.isamb, &sf->isam_p, &isamc_i);
+            sf->no_inserted++;
+        }
+        break;
+    case ZEBRA_SORT_TYPE_MULTI:
+        assert(sf->u.isamb);
+        if (sf->no_inserted == 0)
+        {
+            struct sort_term_stream s;
+            ISAMC_I isamc_i;
+            len = wrbuf_len(ent->wrbuf); /* whole buffer, not only first entry */
+
+            s.st.sysno = si->sysno;
+            if (len >= SORT_MAX_MULTI)
+                len = SORT_MAX_MULTI-1;
+            memcpy(s.st.term, wrbuf_buf(ent->wrbuf), len);
+            s.st.length = len;
+            s.no = 1;
+            s.insert_flag = 1;
+            isamc_i.clientData = &s;
+            isamc_i.read_item = sort_term_code_read;
+            /* NOTE(review): a truncated buffer may end without '\0' - confirm */
+            isamb_merge(sf->u.isamb, &sf->isam_p, &isamc_i);
+            sf->no_inserted++;
+        }
+        break;
+    }
+}
+
 void zebra_sort_add(zebra_sort_index_t si, const char *buf, int len)
 {
     struct sortFile *sf = si->current_file;