ICU term lists are handled for search.
[idzebra-moved-to-github.git] / index / extract.c
index 66eaff1..48dd978 100644 (file)
@@ -1,4 +1,4 @@
-/* $Id: extract.c,v 1.266 2007-10-30 19:17:15 adam Exp $
+/* $Id: extract.c,v 1.271 2007-12-07 14:09:09 adam Exp $
    Copyright (C) 1995-2007
    Index Data ApS
 
@@ -67,14 +67,10 @@ static void zebra_init_log_level(void)
     }
 }
 
-static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
-                                      int cmd, zebra_rec_keys_t reckeys,
-                                      zint staticrank);
 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
                                     int cmd, zebra_rec_keys_t skp);
 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
 static void extract_token_add(RecWord *p);
-static void extract_token_add2(RecWord *p);
 
 static void check_log_limit(ZebraHandle zh)
 {
@@ -100,15 +96,6 @@ static void logRecord(ZebraHandle zh)
 
 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
 {
-    int i;
-    for (i = 0; i<256; i++)
-    {
-        zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, i);
-       if (zebra_maps_is_positioned(zm))
-           ctrl->seqno[i] = 1;
-       else
-           ctrl->seqno[i] = 0;
-    }
     ctrl->flagShowRecords = !zh->m_flag_rw;
 }
 
@@ -282,7 +269,7 @@ static void snippet_token_add(RecWord *p)
 {
     struct snip_rec_info *h = p->extractCtrl->handle;
     ZebraHandle zh = h->zh;
-    zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, *p->index_type);
+    zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, p->index_type);
 
     if (zm && zebra_maps_is_index(zm))
     {
@@ -533,13 +520,29 @@ struct recordLogInfo {
     struct recordGroup *rGroup;
 };
 
-static void all_matches_add(struct recExtractCtrl *ctrl)
+/** \brief add the always-matches index entry and map to real record ID
+    \param ctrl record control
+    \param record_id custom record ID
+    \param sysno system record ID
+    
+    This function serves two purposes: it adds the always-matches
+    entry, and it makes a pointer from the custom record ID (if defined)
+    back to the system record ID (sysno).
+    See zebra_recid_to_sysno.
+  */
+static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
+                            zint sysno)
 {
     RecWord word;
     extract_init(ctrl, &word);
+    word.record_id = record_id;
+    /* We use seqno as a placeholder that provides a way to get back to
+       the record database from _ALLRECORDS. This is used if a custom
+       RECORD was defined. */
+    word.seqno = sysno;
     word.index_name = "_ALLRECORDS";
     word.index_type = "w";
-    word.seqno = 1;
+
     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
                               "", 0);
 }
@@ -833,14 +836,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
         stream->endf(stream, &null_offset);;
 
         extractCtrl.init = extract_init;
-        if (zh->reg->index_types)
-        {
-            extractCtrl.tokenAdd = extract_token_add2;
-        }
-        else
-        {
-            extractCtrl.tokenAdd = extract_token_add;
-        }
+        extractCtrl.tokenAdd = extract_token_add;
         extractCtrl.schemaAdd = extract_schema_add;
         extractCtrl.dh = zh->reg->dh;
         extractCtrl.handle = zh;
@@ -894,8 +890,6 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
         else
             end_offset = stream->tellf(stream);
 
-        all_matches_add(&extractCtrl);
-        
         if (extractCtrl.match_criteria[0])
             match_criteria = extractCtrl.match_criteria;
     }
@@ -939,6 +933,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
            }
        }
     }
+
     if (zebra_rec_keys_empty(zh->reg->keys))
     {
        /* the extraction process returned no information - the record
@@ -971,6 +966,15 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
 
         *sysno = rec->sysno;
 
+
+        if (stream)
+        {
+            all_matches_add(&extractCtrl,
+                            zebra_rec_keys_get_custom_record_id(zh->reg->keys),
+                            *sysno);
+        }
+
+
        recordAttr = rec_init_attr(zh->reg->zei, rec);
        if (extractCtrl.staticrank < 0)
         {
@@ -1012,6 +1016,13 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
 
         rec = rec_get(zh->reg->records, *sysno);
         assert(rec);
+
+        if (stream)
+        {
+            all_matches_add(&extractCtrl,
+                            zebra_rec_keys_get_custom_record_id(zh->reg->keys),
+                            *sysno);
+        }
        
        recordAttr = rec_init_attr(zh->reg->zei, rec);
 
@@ -1277,7 +1288,7 @@ void extract_rec_keys_log(ZebraHandle zh, int is_insert,
             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
                                     0/* db */, &string_index);
             assert(index_type);
-            zebra_term_untrans_iconv(zh, nmem, *index_type,
+            zebra_term_untrans_iconv(zh, nmem, index_type,
                                      &dst_term, str);
             *keystr = '\0';
             for (i = 0; i<key.len; i++)
@@ -1444,47 +1455,6 @@ void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
 }
 
-void extract_flush_record_keys(ZebraHandle zh, zint sysno, int cmd,
-                               zebra_rec_keys_t reckeys,
-                               zint staticrank)
-{
-    ZebraExplainInfo zei = zh->reg->zei;
-
-    extract_rec_keys_adjust(zh, cmd, reckeys);
-
-    if (log_level_details)
-    {
-        yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
-                sysno, cmd ? "insert" : "delete");
-        extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
-    }
-
-    if (!zh->reg->key_block)
-    {
-       int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
-        const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
-        int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
-        zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
-    }
-    zebraExplain_recordCountIncrement(zei, cmd ? 1 : -1);
-
-#if 0
-    yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
-    print_rec_keys(zh, reckeys);
-#endif
-    if (zebra_rec_keys_rewind(reckeys))
-    {
-       size_t slen;
-       const char *str;
-       struct it_key key_in;
-       while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
-       {
-            key_block_write(zh->reg->key_block, sysno, 
-                            &key_in, cmd, str, slen,
-                            staticrank, zh->m_staticrank);
-       }
-    }
-}
 
 ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh,
                                      zebra_rec_keys_t reckeys,
@@ -1510,7 +1480,7 @@ ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh,
            zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
                                    0/* db */, 0 /* string_index */);
            assert(index_type);
-           zebra_term_untrans_iconv(zh, nmem, *index_type,
+           zebra_term_untrans_iconv(zh, nmem, index_type,
                                     &dst_term, str);
            zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
            nmem_reset(nmem);
@@ -1541,7 +1511,7 @@ void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
            
            seqno = key.mem[key.len-1];
            
-           zebra_term_untrans(zh, *index_type, dst_buf, str);
+           zebra_term_untrans(zh, index_type, dst_buf, str);
            
            yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
                     " term=%s", ord, seqno, dst_buf); 
@@ -1753,19 +1723,21 @@ static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
     extract_add_string(p, zm, buf, i);
 }
 
-static void extract_token_add2_index(ZebraHandle zh, zebra_index_type_t type,
-                                     RecWord *p)
+static void extract_add_icu(RecWord *p, zebra_map_t zm)
 {
     struct it_key key;
     const char *res_buf = 0;
     size_t res_len = 0;
-    int r = zebra_index_type_tokenize(type, p->term_buf, p->term_len,
-                                      &res_buf, &res_len);
+    ZebraHandle zh = p->extractCtrl->handle;
+    
     int cat = zinfo_index_category_index;
-    int ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
+    int ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, 
+                                          p->index_type, p->index_name);
     if (ch < 0)
-        ch = zebraExplain_add_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
-    while (r)
+        ch = zebraExplain_add_attr_str(zh->reg->zei, cat, 
+                                       p->index_type, p->index_name);
+    zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
+    while (zebra_map_tokenize_next(zm, &res_buf, &res_len))
     {
         int i = 0;
         key.mem[i++] = ch;
@@ -1777,31 +1749,12 @@ static void extract_token_add2_index(ZebraHandle zh, zebra_index_type_t type,
         key.mem[i++] = p->seqno;
         key.len = i;
 
-        yaz_log(YLOG_LOG, "keys_write %.*s", (int) res_len, res_buf);
         zebra_rec_keys_write(zh->reg->keys, res_buf, res_len, &key);
         
         p->seqno++;
-        r = zebra_index_type_tokenize(type, 0, 0, &res_buf, &res_len);
     }
 }
 
-static void extract_token_add2(RecWord *p)
-{
-    ZebraHandle zh = p->extractCtrl->handle;
-    zebra_index_type_t type = zebra_index_type_get(zh->reg->index_types, p->index_type);
-    if (type)
-    {
-        if (zebra_index_type_is_index(type))
-        {
-            extract_token_add2_index(zh, type, p);
-        }
-        else if (zebra_index_type_is_sort(type))
-        {
-            ;
-            
-        }
-    }
-}
 
 /** \brief top-level indexing handler for recctrl system
     \param p token data to be indexed
@@ -1821,7 +1774,7 @@ static void extract_token_add2(RecWord *p)
 static void extract_token_add(RecWord *p)
 {
     ZebraHandle zh = p->extractCtrl->handle;
-    zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, *p->index_type);
+    zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
     WRBUF wrbuf;
 
     if (log_level_details)
@@ -1833,13 +1786,20 @@ static void extract_token_add(RecWord *p)
     }
     if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
     {
-       p->term_buf = wrbuf_buf(wrbuf);
-       p->term_len = wrbuf_len(wrbuf);
+        p->term_buf = wrbuf_buf(wrbuf);
+        p->term_len = wrbuf_len(wrbuf);
+    }
+    if (zebra_maps_is_icu(zm))
+    {
+        extract_add_icu(p, zm);
     }
-    if (zebra_maps_is_complete(zm))
-       extract_add_complete_field(p, zm);
     else
-       extract_add_incomplete_field(p, zm);
+    {
+        if (zebra_maps_is_complete(zm))
+            extract_add_complete_field(p, zm);
+        else
+            extract_add_incomplete_field(p, zm);
+    }
 }
 
 static void extract_set_store_data_cb(struct recExtractCtrl *p,