Happy new year
[idzebra-moved-to-github.git] / index / extract.c
index 7e81792..5071fd7 100644 (file)
@@ -1,5 +1,5 @@
 /* This file is part of the Zebra server.
-   Copyright (C) 1994-2011 Index Data
+   Copyright (C) Index Data
 
 Zebra is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free
@@ -110,8 +110,8 @@ static void logRecord(ZebraHandle zh)
     if (!(zh->records_processed % 1000))
     {
         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
-                ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
-                zh->records_processed, zh->records_inserted, 
+                ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT,
+                zh->records_processed, zh->records_inserted,
                 zh->records_updated, zh->records_deleted);
     }
 }
@@ -122,7 +122,7 @@ static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
 }
 
 
-static void extract_add_index_string(RecWord *p, 
+static void extract_add_index_string(RecWord *p,
                                       zinfo_index_category_t cat,
                                       const char *str, int length);
 
@@ -144,11 +144,71 @@ struct snip_rec_info {
     zebra_snippets *snippets;
 };
 
+static int parse_complete_field(RecWord *p, zebra_map_t zm,  /* run p's term through zm, collecting the result in buf */
+                                char *buf)  /* out: receives at most IT_MAX_WORD bytes; no NUL is written here */
+{   /* returns the number of bytes stored in buf (0 if nothing was collected) */
+    const char *b = p->term_buf;  /* read cursor, passed to zebra_maps_input by address */
+    const char **map = 0;  /* latest zebra_maps_input result; reset to 0 when no input remains */
+    int i = 0, remain = p->term_len;  /* i: bytes written to buf so far; remain: input bytes left */
+
+    if (remain > 0)
+       map = zebra_maps_input(zm, &b, remain, 1);  /* initial token, requested with last argument 1 */
+    while (remain > 0 && i < IT_MAX_WORD)
+    {
+       while (map && *map && **map == *CHR_SPACE)  /* skip a run of space tokens */
+       {
+           remain = p->term_len - (b - p->term_buf);  /* recompute what is left from the cursor position */
+
+           if (remain > 0)
+           {
+               int first = i ? 0 : 1;  /* first position: 1 while nothing has been stored in buf yet */
+               map = zebra_maps_input(zm, &b, remain, first);
+           }
+           else
+               map = 0;
+       }
+       if (!map)
+           break;  /* input ended while skipping spaces: no trailing separator is emitted */
+       /* NOTE(review): map != 0 with *map == 0 would pass both inner loops without consuming input - confirm it cannot occur */
+       if (i && i < IT_MAX_WORD)
+           buf[i++] = *CHR_SPACE;  /* one separator between groups, never before the first one */
+       while (map && *map && **map != *CHR_SPACE)  /* copy tokens until the next space token */
+       {
+           const char *cp = *map;  /* bytes of the current mapped token */
+
+           if (**map == *CHR_CUT)
+           {
+               i = 0;  /* cut marker: discard everything collected so far */
+           }
+           else
+           {
+               if (i >= IT_MAX_WORD)
+                   break;  /* buf is full; the outer loop condition then ends the scan */
+               while (i < IT_MAX_WORD && *cp)
+                   buf[i++] = *(cp++);  /* bounded copy; a long token is truncated at IT_MAX_WORD */
+           }
+           remain = p->term_len  - (b - p->term_buf);
+           if (remain > 0)
+           {
+               map = zebra_maps_input(zm, &b, remain, 0);  /* following tokens are requested with last argument 0 */
+           }
+           else
+               map = 0;
+       }
+    }
+    return i;
+}
 
 static void snippet_add_complete_field(RecWord *p, int ord,
                                        zebra_map_t zm)
 {
     struct snip_rec_info *h = p->extractCtrl->handle;
+    char buf[IT_MAX_WORD+1];
+    int i = parse_complete_field(p, zm, buf);
+
+    if (!i)
+        return;
+
     if (p->term_len && p->term_buf && zebra_maps_is_index(zm))
         zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                                p->term_buf, p->term_len);
@@ -170,8 +230,7 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
 
     while (map)
     {
-       char buf[IT_MAX_WORD+1];
-       int i, remain;
+       int remain;
 
        /* Skip spaces */
        while (map && *map && **map == *CHR_SPACE)
@@ -189,17 +248,10 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
         {
             zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
                                    start, last - start);
-
         }
         start = last;
-
-       i = 0;
        while (map && *map && **map != *CHR_SPACE)
        {
-           const char *cp = *map;
-
-           while (i < IT_MAX_WORD && *cp)
-               buf[i++] = *(cp++);
            remain = p->term_len - (b - p->term_buf);
             last = b;
            if (remain > 0)
@@ -207,11 +259,11 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
            else
                map = 0;
        }
-       if (!i)
-           return;
+        if (start == last)
+            return ;
 
         if (first)
-        {   
+        {
             first = 0;
             if (zebra_maps_is_first_in_field(zm))
             {
@@ -285,7 +337,6 @@ void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
 {
     struct recExtractCtrl extractCtrl;
     struct snip_rec_info info;
-    int r;
 
     extractCtrl.stream = stream;
     extractCtrl.first_record = 1;
@@ -296,20 +347,19 @@ void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
     assert(zh->reg->dh);
 
     extractCtrl.dh = zh->reg->dh;
-    
+
     info.zh = zh;
     info.snippets = sn;
     extractCtrl.handle = &info;
     extractCtrl.match_criteria[0] = '\0';
     extractCtrl.staticrank = 0;
     extractCtrl.action = action_insert;
-    
+
     init_extractCtrl(zh, &extractCtrl);
 
     extractCtrl.setStoreData = 0;
 
-    r = (*rt->extract)(recTypeClientData, &extractCtrl);
-
+    (*rt->extract)(recTypeClientData, &extractCtrl);
 }
 
 static void searchRecordKey(ZebraHandle zh,
@@ -346,11 +396,11 @@ static void searchRecordKey(ZebraHandle zh,
            assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
 
            seqno = key.mem[key.len-1];
-           
+
            if (key.mem[0] == ch)
            {
                zint woff;
-               
+
                if (startSeq == -1)
                    startSeq = seqno;
                woff = seqno - startSeq;
@@ -383,15 +433,15 @@ static char *get_match_from_spec(ZebraHandle zh,
            char attset_str[64], attname_str[64];
            int i;
             int first = 1;
-           
+
            for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
                ;
-           for (i = 0; *s && *s != ',' && *s != ')' && 
+           for (i = 0; *s && *s != ',' && *s != ')' &&
                     !strchr(FILE_MATCH_BLANK, *s); s++)
                if (i+1 < sizeof(attset_str))
                    attset_str[i++] = *s;
            attset_str[i] = '\0';
-           
+
            for (; strchr(FILE_MATCH_BLANK, *s); s++)
                ;
            if (*s != ',')
@@ -400,7 +450,7 @@ static char *get_match_from_spec(ZebraHandle zh,
            {
                for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
                    ;
-               for (i = 0; *s && *s != ')' && 
+               for (i = 0; *s && *s != ')' &&
                         !strchr(FILE_MATCH_BLANK, *s); s++)
                    if (i+1 < sizeof(attname_str))
                        attname_str[i++] = *s;
@@ -416,7 +466,7 @@ static char *get_match_from_spec(ZebraHandle zh,
 
             searchRecordKey(zh, reckeys, attname_str, ws, 32);
             if (0) /* for debugging */
-            {   
+            {
                 for (i = 0; i<32; i++)
                 {
                     if (ws[i])
@@ -471,7 +521,7 @@ static char *get_match_from_spec(ZebraHandle zh,
            }
             else if (!strcmp(special, "type"))
                 spec_src = zh->m_record_type;
-            else 
+            else
                 spec_src = NULL;
             if (spec_src)
             {
@@ -532,7 +582,7 @@ struct recordLogInfo {
     \param ctrl record control
     \param record_id custom record ID
     \param sysno system record ID
-    
+
     This function serves two purposes.. It adds the always matches
     entry and makes a pointer from the custom record ID (if defined)
     back to the system record ID (sysno)
@@ -556,7 +606,7 @@ static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
 }
 
 /* forward declaration */
-ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
+ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
                                        struct ZebraRecStream *stream,
                                        enum zebra_recctrl_action_t action,
                                        const char *recordType,
@@ -567,7 +617,7 @@ ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
                                        void *recTypeClientData);
 
 
-ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
+ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname,
                              enum zebra_recctrl_action_t action)
 {
     ZEBRA_RES r = ZEBRA_OK;
@@ -575,7 +625,6 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname,
     char gprefix[128];
     char ext[128];
     char ext_res[128];
-    struct file_read_info *fi = 0;
     const char *original_record_type = 0;
     RecType recType;
     void *recTypeClientData;
@@ -587,7 +636,7 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname,
         *gprefix = '\0';
     else
         sprintf(gprefix, "%s.", zh->m_group);
-    
+
     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
 
     /* determine file extension */
@@ -641,7 +690,6 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname,
     if (sysno && (action == action_delete || action == action_a_delete))
     {
         streamp = 0;
-        fi = 0;
     }
     else
     {
@@ -655,7 +703,7 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname,
         }
         else
             strcpy(full_rep, fname);
-        
+
         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
         {
             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
@@ -682,10 +730,10 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname,
   If sysno is provided, then it's used to identify the reocord.
   If not, and match_criteria is provided, then sysno is guessed
   If not, and a record is provided, then sysno is got from there
-  
+
  */
 
-ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
+ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh,
                                       const char *buf, size_t buf_size,
                                       enum zebra_recctrl_action_t action,
                                       const char *recordType,
@@ -704,7 +752,7 @@ ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh,
                 "Record type explicitly specified: %s", recordType);
         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
                                   &clientData);
-    } 
+    }
     else
     {
         if (!(zh->m_record_type))
@@ -718,7 +766,7 @@ ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh,
                                  zh->m_record_type, &clientData);
         recordType = zh->m_record_type;
     }
-    
+
     if (!recType)
     {
         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
@@ -738,7 +786,7 @@ ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh,
     return res;
 }
 
-static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
+static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
                                              struct ZebraRecStream *stream,
                                              enum zebra_recctrl_action_t action,
                                              const char *recordType,
@@ -748,7 +796,7 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
                                              RecType recType,
                                              void *recTypeClientData,
                                              int *more)
-    
+
 {
     zint sysno0 = 0;
     RecordAttr *recordAttr;
@@ -758,7 +806,7 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
     Record rec;
     off_t start_offset = 0, end_offset = 0;
     const char *pr_fname = fname;  /* filename to print .. */
-    int show_progress = zh->records_processed + zh->records_skipped 
+    int show_progress = zh->records_processed + zh->records_skipped
         < zh->m_file_verbose_limit ? 1:0;
 
     zebra_init_log_level();
@@ -771,7 +819,7 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
 
     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
     {
-        if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
+        if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0],
                                      zh->m_explain_database))
             return ZEBRA_FAIL;
     }
@@ -784,7 +832,7 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
         start_offset = stream->tellf(stream);
 
         extractCtrl.first_record = start_offset ? 0 : 1;
-        
+
         stream->endf(stream, &null_offset);;
 
         extractCtrl.init = extract_init;
@@ -799,14 +847,14 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
         init_extractCtrl(zh, &extractCtrl);
 
         extract_set_store_data_prepare(&extractCtrl);
-        
+
         r = (*recType->extract)(recTypeClientData, &extractCtrl);
 
         if (action == action_update)
         {
             action = extractCtrl.action;
         }
-        
+
         switch (r)
         {
         case RECCTRL_EXTRACT_EOF:
@@ -824,7 +872,7 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
                          recordType, pr_fname, (zint) start_offset);
             *more = 1;
-            
+
             end_offset = stream->endf(stream, 0);
             if (end_offset)
                 stream->seekf(stream, end_offset);
@@ -855,18 +903,18 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
         /* test mode .. Do not perform match */
         return ZEBRA_OK;
     }
-        
+
     if (!sysno)
     {
        sysno = &sysno0;
-        
+
         if (match_criteria && *match_criteria)
             matchStr = match_criteria;
         else
         {
             if (zh->m_record_id && *zh->m_record_id)
             {
-                matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
+                matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname,
                                                zh->m_record_id);
                if (!matchStr)
                 {
@@ -887,13 +935,13 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
                 }
             }
         }
-        if (matchStr) 
+        if (matchStr)
        {
            int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
            char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
                                          matchStr);
 
-            
+
             if (log_level_extract)
             {
                 WRBUF w = wrbuf_hex_str(matchStr);
@@ -973,7 +1021,7 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
 #endif
        recordAttr->staticrank = extractCtrl.staticrank;
         zh->records_inserted++;
-    } 
+    }
     else
     {
         /* record already exists */
@@ -981,7 +1029,7 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
        zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
        if (action == action_insert)
        {
-           yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
+           yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT,
                         recordType, pr_fname, (zint) start_offset);
            logRecord(zh);
            return ZEBRA_FAIL;
@@ -996,7 +1044,7 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
                             *sysno);
         }
-       
+
        recordAttr = rec_init_attr(zh->reg->zei, rec);
 
         /* decrease total size */
@@ -1023,7 +1071,7 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
 #if FLUSH2
             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
                                        delkeys, recordAttr->staticrank);
-#endif       
+#endif
             if (zebra_rec_keys_empty(delkeys))
             {
                yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
@@ -1062,7 +1110,7 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
                                        zh->reg->keys, extractCtrl.staticrank,
                                        delkeys, recordAttr->staticrank);
 #else
-            extract_flush_record_keys(zh, *sysno, 1, 
+            extract_flush_record_keys(zh, *sysno, 1,
                                       zh->reg->keys, extractCtrl.staticrank);
 #endif
            recordAttr->staticrank = extractCtrl.staticrank;
@@ -1143,11 +1191,11 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
     /* update database name */
     xfree(rec->info[recInfo_databaseName]);
     rec->info[recInfo_databaseName] =
-        rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
+        rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]);
 
     /* update offset */
     recordAttr->recordOffset = start_offset;
-    
+
     /* commit this record */
     rec_put(zh->reg->records, &rec);
     logRecord(zh);
@@ -1166,7 +1214,7 @@ static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
     \param recTypeClientData client data for record type
     \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
 */
-ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
+ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
                                        struct ZebraRecStream *stream,
                                        enum zebra_recctrl_action_t action,
                                        const char *recordType,
@@ -1231,7 +1279,7 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
 
     extractCtrl.handle = handle;
     extractCtrl.first_record = 1;
-    
+
     extract_set_store_data_prepare(&extractCtrl);
 
     if (n)
@@ -1240,14 +1288,14 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
     if (rec->size[recInfo_delKeys])
     {
        zebra_rec_keys_t delkeys = zebra_rec_keys_open();
-       
+
        zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
 
        zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
                               rec->size[recInfo_delKeys],
                               0);
 #if FLUSH2
-       extract_flush_record_keys2(zh, rec->sysno, 
+       extract_flush_record_keys2(zh, rec->sysno,
                                    zh->reg->keys, 0, delkeys, 0);
 #else
        extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
@@ -1267,14 +1315,14 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
 #if FLUSH2
        extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
 #else
-        extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
+        extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
 #endif
     }
     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
-    
+
     xfree(rec->info[recInfo_delKeys]);
     zebra_rec_keys_get_buf(zh->reg->keys,
-                          &rec->info[recInfo_delKeys], 
+                          &rec->info[recInfo_delKeys],
                           &rec->size[recInfo_delKeys]);
 
     xfree(rec->info[recInfo_sortKeys]);
@@ -1288,30 +1336,27 @@ void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
                            const char *str, size_t slen, NMEM nmem, int level)
 {
     char keystr[200]; /* room for zints to print */
-    char *dst_term = 0;
     int ord = CAST_ZINT_TO_INT(key->mem[0]);
     const char *index_type;
     int i;
     const char *string_index;
-    
+
     zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
                             0/* db */, &string_index);
     assert(index_type);
-    zebra_term_untrans_iconv(zh, nmem, index_type,
-                             &dst_term, str);
     *keystr = '\0';
     for (i = 0; i < key->len; i++)
     {
         sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key->mem[i]);
     }
-    
+
     if (*str < CHR_BASE_CHAR)
     {
         int i;
         char dst_buf[200]; /* room for special chars */
-        
+
         strcpy(dst_buf , "?");
-        
+
         if (!strcmp(str, ""))
             strcpy(dst_buf, "alwaysmatches");
         if (!strcmp(str, FIRST_IN_FIELD_STR))
@@ -1320,18 +1365,30 @@ void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
             strcpy(dst_buf, "unknown");
         else if (!strcmp(str, CHR_SPACE))
             strcpy(dst_buf, "space");
-        
+
         for (i = 0; i<slen; i++)
         {
             sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
         }
         yaz_log(level, "%s%s %s %s", keystr, index_type,
                 string_index, dst_buf);
-        
     }
     else
-        yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
-                string_index, dst_term);
+    {
+        char *dst_term = 0;
+        zebra_term_untrans_iconv(zh, nmem, index_type, &dst_term, str);
+        if (dst_term)
+            yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
+                    string_index, dst_term);
+        else
+        {
+            WRBUF w = wrbuf_alloc();
+            wrbuf_write_escaped(w, str, strlen(str));
+            yaz_log(level, "%s%s %s %s", keystr, index_type,
+                    string_index, wrbuf_cstr(w));
+            wrbuf_destroy(w);
+        }
+    }
 }
 
 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
@@ -1459,7 +1516,7 @@ static void extract_flush_record_keys2(
                                       &ins_key_in);
 
         if (del && ins && ins_rank == del_rank
-            && !key_compare(&del_key_in, &ins_key_in) 
+            && !key_compare(&del_key_in, &ins_key_in)
             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
         {
             optimized++;
@@ -1467,14 +1524,14 @@ static void extract_flush_record_keys2(
         }
         if (!del && !ins)
             break;
-        
+
         normal++;
         if (del)
-            key_block_write(zh->reg->key_block, sysno, 
+            key_block_write(zh->reg->key_block, sysno,
                             &del_key_in, 0, del_str, del_slen,
                             del_rank, zh->m_staticrank);
         if (ins)
-            key_block_write(zh->reg->key_block, sysno, 
+            key_block_write(zh->reg->key_block, sysno,
                             &ins_key_in, 1, ins_str, ins_slen,
                             ins_rank, zh->m_staticrank);
     }
@@ -1517,7 +1574,7 @@ static void extract_flush_record_keys(
         struct it_key key_in;
         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
         {
-            key_block_write(zh->reg->key_block, sysno, 
+            key_block_write(zh->reg->key_block, sysno,
                             &key_in, cmd, str, slen,
                             staticrank, zh->m_staticrank);
         }
@@ -1530,7 +1587,7 @@ ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
                                      zebra_snippets *snippets)
 {
     NMEM nmem = nmem_create();
-    if (zebra_rec_keys_rewind(reckeys)) 
+    if (zebra_rec_keys_rewind(reckeys))
     {
        const char *str;
        size_t slen;
@@ -1545,7 +1602,7 @@ ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
            assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
            seqno = key.mem[key.len-1];
            ord = CAST_ZINT_TO_INT(key.mem[0]);
-           
+
            zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
                                    0/* db */, 0 /* string_index */);
            assert(index_type);
@@ -1577,13 +1634,13 @@ void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
            assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
 
            zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
-           
+
            seqno = key.mem[key.len-1];
-           
+
            zebra_term_untrans(zh, index_type, dst_buf, str);
-           
-           yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
-                    " term=%s", ord, seqno, dst_buf); 
+
+           yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT
+                    " term=%s", ord, seqno, dst_buf);
        }
     }
 }
@@ -1657,7 +1714,7 @@ static void extract_add_string(RecWord *p, zebra_map_t zm,
     {
 
         WRBUF w = wrbuf_alloc();
-        
+
         wrbuf_write_escaped(w, string, length);
         yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w));
         wrbuf_destroy(w);
@@ -1692,7 +1749,7 @@ static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
     int remain = p->term_len;
     int first = 1;
     const char **map = 0;
-    
+
     if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 0);
 
@@ -1729,7 +1786,7 @@ static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
            return;
 
         if (first)
-        {   
+        {
             first = 0;
             if (zebra_maps_is_first_in_field(zm))
             {
@@ -1745,57 +1802,8 @@ static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
 
 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
 {
-    const char *b = p->term_buf;
     char buf[IT_MAX_WORD+1];
-    const char **map = 0;
-    int i = 0, remain = p->term_len;
-
-    if (remain > 0)
-       map = zebra_maps_input(zm, &b, remain, 1);
-
-    while (remain > 0 && i < IT_MAX_WORD)
-    {
-       while (map && *map && **map == *CHR_SPACE)
-       {
-           remain = p->term_len - (b - p->term_buf);
-
-           if (remain > 0)
-           {
-               int first = i ? 0 : 1;  /* first position */
-               map = zebra_maps_input(zm, &b, remain, first);
-           }
-           else
-               map = 0;
-       }
-       if (!map)
-           break;
-
-       if (i && i < IT_MAX_WORD)
-           buf[i++] = *CHR_SPACE;
-       while (map && *map && **map != *CHR_SPACE)
-       {
-           const char *cp = *map;
-
-           if (**map == *CHR_CUT)
-           {
-               i = 0;
-           }
-           else
-           {
-               if (i >= IT_MAX_WORD)
-                   break;
-               while (i < IT_MAX_WORD && *cp)
-                   buf[i++] = *(cp++);
-           }
-           remain = p->term_len  - (b - p->term_buf);
-           if (remain > 0)
-           {
-               map = zebra_maps_input(zm, &b, remain, 0);
-           }
-           else
-               map = 0;
-       }
-    }
+    int i = parse_complete_field(p, zm, buf);
     if (!i)
        return;
     extract_add_string(p, zm, buf, i);
@@ -1810,6 +1818,11 @@ static void extract_add_icu(RecWord *p, zebra_map_t zm)
     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
     while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
     {
+        if (res_len > IT_MAX_WORD)
+        {
+            yaz_log(YLOG_LOG, "Truncating long term %ld", (long) res_len);
+            res_len = IT_MAX_WORD;
+        }
         extract_add_string(p, zm, res_buf, res_len);
         p->seqno++;
     }
@@ -1823,32 +1836,26 @@ static void extract_add_icu(RecWord *p, zebra_map_t zm)
     extract_token_add
     extract_add_{in}_complete / extract_add_icu
     extract_add_string
-    
+
     extract_add_index_string
     or
     extract_add_sort_string
     or
     extract_add_staticrank_string
-    
+
 */
 static void extract_token_add(RecWord *p)
 {
     ZebraHandle zh = p->extractCtrl->handle;
     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
-    WRBUF wrbuf;
 
     if (log_level_details)
     {
         yaz_log(log_level_details, "extract_token_add "
                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
-                p->index_type, p->index_name, 
+                p->index_type, p->index_name,
                 p->seqno, p->term_len, p->term_buf);
     }
-    if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
-    {
-        p->term_buf = wrbuf_buf(wrbuf);
-        p->term_len = wrbuf_len(wrbuf);
-    }
     if (zebra_maps_is_icu(zm))
     {
         extract_add_icu(p, zm);
@@ -1940,7 +1947,7 @@ void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
                 (*e)->sysno = filter_sysno ? filter_sysno : sysno;
                 (*e)->section_id = section_id;
             }
-            
+
             wrbuf_write((*e)->wrbuf, str, slen);
             wrbuf_putc((*e)->wrbuf, '\0');
         }