Start work on ICU based regexp searches
[idzebra-moved-to-github.git] / index / rpnsearch.c
index 69a7e9a..8b474a9 100644 (file)
@@ -1,5 +1,5 @@
 /* This file is part of the Zebra server.
-   Copyright (C) 1994-2010 Index Data
+   Copyright (C) 1994-2011 Index Data
 
 Zebra is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free
@@ -17,6 +17,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 
 */
 
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
 #include <stdio.h>
 #include <assert.h>
 #ifdef WIN32
@@ -154,7 +157,7 @@ static int grep_handle(char *name, const char *info, void *p)
 }
 
 static int term_pre(zebra_map_t zm, const char **src,
-                   const char *ct1, const char *ct2, int first)
+                   const char *ct1, int first)
 {
     const char *s1, *s0 = *src;
     const char **map;
@@ -164,8 +167,6 @@ static int term_pre(zebra_map_t zm, const char **src,
     {
         if (ct1 && strchr(ct1, *s0))
             break;
-        if (ct2 && strchr(ct2, *s0))
-            break;
         s1 = s0;
         map = zebra_maps_input(zm, &s1, strlen(s1), first);
         if (**map != *CHR_SPACE)
@@ -233,24 +234,113 @@ static void add_non_space(const char *start, const char *end,
 }
 
 
+static int term_102_icu(zebra_map_t zm,
+                        const char **src, WRBUF term_dict, int space_split,
+                        WRBUF display_term)
+{
+    int no_terms = 0;
+    const char *s0 = *src, *s1;
+    while (*s0 == ' ')
+        s0++;
+    s1 = s0;
+    for (;;)
+    {
+        if (*s1 == ' ' && space_split)
+            break;
+        else if (*s1 && !strchr(REGEX_CHARS "-", *s1))
+            s1++;
+        else
+        {
+            /* EOF or regex reserved char */
+            if (s0 != s1)
+            {
+                const char *res_buf = 0;
+                size_t res_len = 0;
+                const char *display_buf;
+                size_t display_len;
+
+                zebra_map_tokenize_start(zm, s0, s1 - s0);
+
+                if (zebra_map_tokenize_next(zm, &res_buf, &res_len,
+                                            &display_buf, &display_len))
+                {
+                    size_t i = res_len;
+                    while (--i >= 0 && res_buf[i] != '\x01')
+                        ;
+                    if (i > 0)
+                    {
+                        while (--i >= 0 && res_buf[i] != '\x01')
+                            ;
+                    }
+                    res_len = i; /* reduce res_len */
+                    for (i = 0; i < res_len; i++)
+                    {
+                        if (strchr(REGEX_CHARS "\\", res_buf[i]))
+                            wrbuf_putc(term_dict, '\\');
+                        if (res_buf[i] < 32)
+                            wrbuf_putc(term_dict, '\x01');
+                        
+                        wrbuf_putc(term_dict, res_buf[i]);
+                    }
+                    wrbuf_write(display_term, display_buf, display_len);
+
+                    no_terms++;
+                }
+            }
+            if (*s1 == '\0')
+                break;
+            
+            wrbuf_putc(term_dict, *s1);
+            wrbuf_putc(display_term, *s1);
+
+            s1++;
+            s0 = s1;
+        }
+    }
+    if (no_terms)
+        wrbuf_puts(term_dict, "\x01\x01.*");
+    *src = s1;
+    return no_terms;
+}
+
 static int term_100_icu(zebra_map_t zm,
                         const char **src, WRBUF term_dict, int space_split,
                         WRBUF display_term,
-                        int right_trunc)
+                        int mode)
 {
-    int i;
+    size_t i;
     const char *res_buf = 0;
     size_t res_len = 0;
     const char *display_buf;
     size_t display_len;
+    const char *s0 = *src, *s1;
+
+    while (*s0 == ' ')
+        s0++;
+    
+    if (*s0 == '\0')
+        return 0;
+    
+    if (space_split)
+    {
+        s1 = s0;
+        while (*s1 && *s1 != ' ')
+            s1++;
+    }
+    else
+        s1 = s0 + strlen(s0);
+
+    *src = s1;
+
+    zebra_map_tokenize_start(zm, s0, s1 - s0);
+    
     if (!zebra_map_tokenize_next(zm, &res_buf, &res_len,
                                  &display_buf, &display_len))
     {
-        *src += strlen(*src);
         return 0;
     }
     wrbuf_write(display_term, display_buf, display_len);
-    if (right_trunc)
+    if (mode)
     {
         /* ICU sort keys seem to be of the form
            basechars \x01 accents \x01 length
@@ -271,17 +361,21 @@ static int term_100_icu(zebra_map_t zm,
         }
         res_len = i; /* reduce res_len */
     }
+    if (mode & 2)
+        wrbuf_puts(term_dict, ".*");
     for (i = 0; i < res_len; i++)
     {
         if (strchr(REGEX_CHARS "\\", res_buf[i]))
             wrbuf_putc(term_dict, '\\');
         if (res_buf[i] < 32)
-            wrbuf_putc(term_dict, 1);
-            
+            wrbuf_putc(term_dict, '\x01');
+        
         wrbuf_putc(term_dict, res_buf[i]);
     }
-    if (right_trunc)
+    if (mode & 1)
         wrbuf_puts(term_dict, ".*");
+    else if (mode)
+        wrbuf_puts(term_dict, "\x01\x01.*");
     return 1;
 }
 
@@ -297,7 +391,7 @@ static int term_100(zebra_map_t zm,
     const char *space_start = 0;
     const char *space_end = 0;
 
-    if (!term_pre(zm, src, NULL, NULL, !space_split))
+    if (!term_pre(zm, src, 0, !space_split))
         return 0;
     s0 = *src;
     while (*s0)
@@ -350,7 +444,7 @@ static int term_101(zebra_map_t zm,
     const char **map;
     int i = 0;
 
-    if (!term_pre(zm, src, "#", "#", !space_split))
+    if (!term_pre(zm, src, "#", !space_split))
         return 0;
     s0 = *src;
     while (*s0)
@@ -387,7 +481,7 @@ static int term_103(zebra_map_t zm, const char **src,
     const char *s0;
     const char **map;
 
-    if (!term_pre(zm, src, "^\\()[].*+?|", "(", !space_split))
+    if (!term_pre(zm, src, "^\\()[].*+?|", !space_split))
         return 0;
     s0 = *src;
     if (errors && *s0 == '+' && s0[1] && s0[2] == '+' && s0[3] &&
@@ -432,7 +526,7 @@ static int term_102(zebra_map_t zm, const char **src,
 }
 
 
-/* term_104: handle term, process # and ! */
+/* term_104: handle term, process ?n * # */
 static int term_104(zebra_map_t zm, const char **src, 
                     WRBUF term_dict, int space_split, WRBUF display_term)
 {
@@ -440,7 +534,7 @@ static int term_104(zebra_map_t zm, const char **src,
     const char **map;
     int i = 0;
 
-    if (!term_pre(zm, src, "?*#", "?*#", !space_split))
+    if (!term_pre(zm, src, "?*#", !space_split))
         return 0;
     s0 = *src;
     while (*s0)
@@ -501,7 +595,7 @@ static int term_104(zebra_map_t zm, const char **src,
     return i;
 }
 
-/* term_105/106: handle term, where trunc = Process * and ! and right trunc */
+/* term_105/106: handle term, process * ! and possibly right_truncate */
 static int term_105(zebra_map_t zm, const char **src, 
                     WRBUF term_dict, int space_split,
                    WRBUF display_term, int right_truncate)
@@ -510,7 +604,7 @@ static int term_105(zebra_map_t zm, const char **src,
     const char **map;
     int i = 0;
 
-    if (!term_pre(zm, src, "*!", "*!", !space_split))
+    if (!term_pre(zm, src, "\\*!", !space_split))
         return 0;
     s0 = *src;
     while (*s0)
@@ -529,6 +623,13 @@ static int term_105(zebra_map_t zm, const char **src,
             wrbuf_putc(display_term, *s0);
             s0++;
         }
+        else if (*s0 == '\\')
+        {
+            i++;
+            wrbuf_puts(term_dict, "\\\\");
+            wrbuf_putc(display_term, *s0);
+            s0++;
+        }
        else
         {
            const char *s1 = s0;
@@ -1040,6 +1141,13 @@ static ZEBRA_RES string_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
                     return ZEBRA_OK;
                 }
                 break;
+            case 102:
+                if (!term_102_icu(zm, &termp, term_dict, space_split, display_term))
+                {
+                    *term_sub = 0;
+                    return ZEBRA_OK;
+                }
+                break;                
             case 1:          /* right truncation */
                 if (!term_100_icu(zm, &termp, term_dict, space_split, display_term, 1))
                 {
@@ -1047,6 +1155,20 @@ static ZEBRA_RES string_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
                     return ZEBRA_OK;
                 }
                 break;
+            case 2:
+                if (!term_100_icu(zm, &termp, term_dict, space_split, display_term, 2))
+                {
+                    *term_sub = 0;
+                    return ZEBRA_OK;
+                }
+                break;
+            case 3:
+                if (!term_100_icu(zm, &termp, term_dict, space_split, display_term, 3))
+                {
+                    *term_sub = 0;
+                    return ZEBRA_OK;
+                }
+                break;
             default:
                 zebra_setError_zint(zh,
                                     YAZ_BIB1_UNSUPP_TRUNCATION_ATTRIBUTE,
@@ -1139,7 +1261,7 @@ static ZEBRA_RES string_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
             }
             wrbuf_putc(term_dict, ')');
             break;
-        case 104:        /* process # and ! in term */
+        case 104:        /* process ?n * # term */
             wrbuf_putc(term_dict, '(');
             if (!term_104(zm, &termp, term_dict, space_split, display_term))
             {
@@ -1148,7 +1270,7 @@ static ZEBRA_RES string_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
             }
             wrbuf_putc(term_dict, ')');
             break;
-        case 105:        /* process * and ! in term */
+        case 105:        /* process * ! in term and right truncate */
             wrbuf_putc(term_dict, '(');
             if (!term_105(zm, &termp, term_dict, space_split, display_term, 1))
             {
@@ -1157,7 +1279,7 @@ static ZEBRA_RES string_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
             }
             wrbuf_putc(term_dict, ')');
             break;
-        case 106:        /* process * and ! in term */
+        case 106:        /* process * ! in term */
             wrbuf_putc(term_dict, '(');
             if (!term_105(zm, &termp, term_dict, space_split, display_term, 0))
             {
@@ -1362,8 +1484,6 @@ static ZEBRA_RES search_terms_list(ZebraHandle zh,
                                    struct rset_key_control *kc)
 {
     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, index_type);
-    if (zebra_maps_is_icu(zm))
-        zebra_map_tokenize_start(zm, termz, strlen(termz));
     return search_terms_chrmap(zh, zapt, termz, attributeSet, hits_limit,
                                stream, index_type, complete_flag,
                                rank_type, xpath_use,
@@ -1803,6 +1923,7 @@ static ZEBRA_RES rpn_search_APT_numeric(ZebraHandle zh,
                                        Z_AttributesPlusTerm *zapt,
                                        const char *termz,
                                        const Odr_oid *attributeSet,
+                                        zint hits_limit,
                                        NMEM stream,
                                        const char *index_type, 
                                         int complete_flag,
@@ -1818,7 +1939,7 @@ static ZEBRA_RES rpn_search_APT_numeric(ZebraHandle zh,
     ZEBRA_RES res;
     struct grep_info grep_info;
     int alloc_sets = 0;
-    zint hits_limit_value;
+    zint hits_limit_value = hits_limit;
     const char *term_ref_id_str = 0;
 
     zebra_term_limits_APT(zh, zapt, &hits_limit_value, &term_ref_id_str,
@@ -1977,7 +2098,7 @@ static ZEBRA_RES rpn_sort_spec(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
     sk->u.sortAttributes->id = odr_oiddup_nmem(stream, attributeSet);
     sk->u.sortAttributes->list = zapt->attributes;
 
-    sks->sortRelation = (int *)
+    sks->sortRelation = (Odr_int *)
         nmem_malloc(stream, sizeof(*sks->sortRelation));
     if (sort_relation_value == 1)
         *sks->sortRelation = Z_SortKeySpec_ascending;
@@ -1986,7 +2107,7 @@ static ZEBRA_RES rpn_sort_spec(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
     else 
         *sks->sortRelation = Z_SortKeySpec_ascending;
 
-    sks->caseSensitivity = (int *)
+    sks->caseSensitivity = (Odr_int *)
         nmem_malloc(stream, sizeof(*sks->caseSensitivity));
     *sks->caseSensitivity = 0;
 
@@ -2035,7 +2156,7 @@ static RSET xpath_trunc(ZebraHandle zh, NMEM stream,
         return rset_create_null(rset_nmem, kc, 0);
     else
     {
-        int i, r, max_pos;
+        int i, max_pos;
         char ord_buf[32];
         RSET rset;
         WRBUF term_dict = wrbuf_alloc();
@@ -2053,8 +2174,8 @@ static RSET xpath_trunc(ZebraHandle zh, NMEM stream,
         wrbuf_puts(term_dict, term);
         
         grep_info.isam_p_indx = 0;
-        r = dict_lookup_grep(zh->reg->dict, wrbuf_cstr(term_dict), 0,
-                             &grep_info, &max_pos, 0, grep_handle);
+        dict_lookup_grep(zh->reg->dict, wrbuf_cstr(term_dict), 0,
+                         &grep_info, &max_pos, 0, grep_handle);
         yaz_log(YLOG_DEBUG, "%s %d positions", term,
                 grep_info.isam_p_indx);
         rset = rset_trunc(zh, grep_info.isam_p_buf,
@@ -2375,7 +2496,8 @@ static ZEBRA_RES rpn_search_database(ZebraHandle zh,
     }
     else if (!strcmp(search_type, "numeric"))
     {
-        res = rpn_search_APT_numeric(zh, zapt, termz, attributeSet, stream,
+        res = rpn_search_APT_numeric(zh, zapt, termz, attributeSet, hits_limit,
+                                     stream,
                                     index_type, complete_flag, rank_type,
                                     xpath_use,
                                     rset_nmem,