Handle right-truncation for ICU normalized terms.

author Adam Dickmeiss <adam@indexdata.dk>

Wed, 26 Mar 2008 14:03:45 +0000 (15:03 +0100)

committer Adam Dickmeiss <adam@indexdata.dk>

Wed, 26 Mar 2008 14:03:45 +0000 (15:03 +0100)
author Adam Dickmeiss <adam@indexdata.dk>
Wed, 26 Mar 2008 14:03:45 +0000 (15:03 +0100)
committer Adam Dickmeiss <adam@indexdata.dk>
Wed, 26 Mar 2008 14:03:45 +0000 (15:03 +0100)
diff --git a/dfa/dfa.c b/dfa/dfa.c

index 8450d2a..f909f2e 100644 (file)
--- a/dfa/dfa.c
+++ b/dfa/dfa.c
@@ -1032,9 +1032,12 @@ static struct DFA_parse *dfa_parse_init (void)
      parse_info->rule = 0;
      parse_info->root = NULL;
  
+    /* initialize the anyset which by default does not include \n */
      parse_info->anyset = mk_BSet (&parse_info->charset);
      res_BSet (parse_info->charset, parse_info->anyset);
+    add_BSet (parse_info->charset, parse_info->anyset, '\n');
      com_BSet (parse_info->charset, parse_info->anyset);
+
      parse_info->use_Tnode = parse_info->max_Tnode = 0;
      parse_info->start = parse_info->end = NULL;
      parse_info->charMap = NULL;
@@ -1096,6 +1099,11 @@ struct DFA *dfa_init (void)
      return dfa;
  }
  
+void dfa_anyset_includes_nl(struct DFA *dfa)
+{
+    add_BSet (dfa->parse_info->charset, dfa->parse_info->anyset, '\n');
+}
+
  void dfa_set_cmap (struct DFA *dfa, void *vp,
                    const char **(*cmap)(void *vp, const char **from, int len))
  {
@@ -1117,12 +1125,6 @@ int dfa_parse (struct DFA *dfa, const char **pattern)
      assert (dfa->parse_info);
      parse_info = dfa->parse_info;
  
-    if (!parse_info->cmap)
-    {
-       res_BSet (parse_info->charset, parse_info->anyset);
-       add_BSet (parse_info->charset, parse_info->anyset, '\n');
-       com_BSet (parse_info->charset, parse_info->anyset);
-    }
      do_parse (parse_info, pattern, &top);
      if (parse_info->err_code)
          return parse_info->err_code;
diff --git a/dict/lookgrep.c b/dict/lookgrep.c

index b0ad4ac..b3de2af 100644 (file)
--- a/dict/lookgrep.c
+++ b/dict/lookgrep.c
@@ -390,6 +390,8 @@ int dict_lookup_grep(Dict dict, const char *pattern, int range, void *client,
      dfa_verbose = 1;
  #endif
  
+    dfa_anyset_includes_nl(dfa);
+
      yaz_log(YLOG_DEBUG, "dict_lookup_grep range=%d", range);
      for (i = 0; pattern[i]; i++)
      {
diff --git a/include/dfa.h b/include/dfa.h

index 045cd69..eceb0a6 100644 (file)
--- a/include/dfa.h
+++ b/include/dfa.h
@@ -61,6 +61,7 @@ struct DFA {
  };
  
  struct DFA *dfa_init (void);
+void dfa_anyset_includes_nl(struct DFA *dfa);
  void dfa_set_cmap (struct DFA *dfa, void *vp,
                     const char **(*cmap)(void *vp, const char **from, int len));
  int dfa_parse (struct DFA *, const char **);
diff --git a/index/rpnsearch.c b/index/rpnsearch.c

index a117ad2..8b63758 100644 (file)
--- a/index/rpnsearch.c
+++ b/index/rpnsearch.c
@@ -238,7 +238,8 @@ static void add_non_space(const char *start, const char *end,
  
  static int term_100_icu(zebra_map_t zm,
                          const char **src, WRBUF term_dict, int space_split,
-                        WRBUF display_term)
+                        WRBUF display_term,
+                        int right_trunc)
  {
      int i;
      const char *res_buf = 0;
@@ -252,14 +253,38 @@ static int term_100_icu(zebra_map_t zm,
          return 0;
      }
      wrbuf_write(display_term, display_buf, display_len);
+    if (right_trunc)
+    {
+        /* ICU sort keys seem to be of the form
+           basechars \x01 accents \x01 length
+           For now we'll just right truncate from basechars . This 
+           may give false hits due to accents not being used.
+        */
+        i = res_len;
+        while (--i >= 0 && res_buf[i] != '\x01')
+            ;
+        if (i > 0)
+        {
+            while (--i >= 0 && res_buf[i] != '\x01')
+                ;
+        }
+        if (i == 0)
+        {  /* did not find base chars at all. Throw error */
+            return -1;
+        }
+        res_len = i; /* reduce res_len */
+    }
      for (i = 0; i < res_len; i++)
      {
          if (strchr(REGEX_CHARS "\\", res_buf[i]))
              wrbuf_putc(term_dict, '\\');
          if (res_buf[i] < 32)
              wrbuf_putc(term_dict, 1);
+            
          wrbuf_putc(term_dict, res_buf[i]);
      }
+    if (right_trunc)
+        wrbuf_puts(term_dict, ".*");
      return 1;
  }
  
@@ -275,9 +300,6 @@ static int term_100(zebra_map_t zm,
      const char *space_start = 0;
      const char *space_end = 0;
  
-    if (zebra_maps_is_icu(zm))
-        return term_100_icu(zm, src, term_dict, space_split, display_term);
-
      if (!term_pre(zm, src, NULL, NULL, !space_split))
          return 0;
      s0 = *src;
@@ -1013,113 +1035,144 @@ static ZEBRA_RES string_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
      wrbuf_putc(term_dict, ')');
      
      prefix_len = wrbuf_len(term_dict);
-    
-    switch (truncation_value)
-    {
-    case -1:         /* not specified */
-    case 100:        /* do not truncate */
-        if (!string_relation(zh, zapt, &termp, term_dict,
-                             attributeSet,
-                             zm, space_split, display_term,
-                             &relation_error))
+
+    if (zebra_maps_is_icu(zm))
+    {
+        /* ICU case */
+        switch (truncation_value)
          {
-            if (relation_error)
+        case -1:         /* not specified */
+        case 100:        /* do not truncate */
+            if (!term_100_icu(zm, &termp, term_dict, space_split, display_term, 0))
              {
-                zebra_setError(zh, relation_error, 0);
-                return ZEBRA_FAIL;
+                *term_sub = 0;
+                return ZEBRA_OK;
              }
-            *term_sub = 0;
-            return ZEBRA_OK;
-        }
-        break;
-    case 1:          /* right truncation */
-        wrbuf_putc(term_dict, '(');
-        if (!term_100(zm, &termp, term_dict, space_split, display_term))
-        {
-            *term_sub = 0;
-            return ZEBRA_OK;
-        }
-        wrbuf_puts(term_dict, ".*)");
-        break;
-    case 2:          /* keft truncation */
-        wrbuf_puts(term_dict, "(.*");
-        if (!term_100(zm, &termp, term_dict, space_split, display_term))
-        {
-            *term_sub = 0;
-            return ZEBRA_OK;
-        }
-        wrbuf_putc(term_dict, ')');
-        break;
-    case 3:          /* left&right truncation */
-        wrbuf_puts(term_dict, "(.*");
-        if (!term_100(zm, &termp, term_dict, space_split, display_term))
-        {
-            *term_sub = 0;
-            return ZEBRA_OK;
-        }
-        wrbuf_puts(term_dict, ".*)");
-        break;
-    case 101:        /* process # in term */
-        wrbuf_putc(term_dict, '(');
-        if (!term_101(zm, &termp, term_dict, space_split, display_term))
-        {
-            *term_sub = 0;
-            return ZEBRA_OK;
-        }
-        wrbuf_puts(term_dict, ")");
-        break;
-    case 102:        /* Regexp-1 */
-        wrbuf_putc(term_dict, '(');
-        if (!term_102(zm, &termp, term_dict, space_split, display_term))
-        {
-            *term_sub = 0;
-            return ZEBRA_OK;
-        }
-        wrbuf_putc(term_dict, ')');
-        break;
-    case 103:       /* Regexp-2 */
-        regex_range = 1;
-        wrbuf_putc(term_dict, '(');
-        if (!term_103(zm, &termp, term_dict, &regex_range,
-                      space_split, display_term))
-        {
-            *term_sub = 0;
-            return ZEBRA_OK;
-        }
-        wrbuf_putc(term_dict, ')');
-        break;
-    case 104:        /* process # and ! in term */
-        wrbuf_putc(term_dict, '(');
-        if (!term_104(zm, &termp, term_dict, space_split, display_term))
-        {
-            *term_sub = 0;
-            return ZEBRA_OK;
-        }
-        wrbuf_putc(term_dict, ')');
-        break;
-    case 105:        /* process * and ! in term */
-        wrbuf_putc(term_dict, '(');
-        if (!term_105(zm, &termp, term_dict, space_split, display_term, 1))
-        {
-            *term_sub = 0;
-            return ZEBRA_OK;
+            break;
+        case 1:          /* right truncation */
+            if (!term_100_icu(zm, &termp, term_dict, space_split, display_term, 1))
+            {
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            break;
+        default:
+            zebra_setError_zint(zh,
+                                YAZ_BIB1_UNSUPP_TRUNCATION_ATTRIBUTE,
+                                truncation_value);
+            return ZEBRA_FAIL;
          }
-        wrbuf_putc(term_dict, ')');
-        break;
-    case 106:        /* process * and ! in term */
-        wrbuf_putc(term_dict, '(');
-        if (!term_105(zm, &termp, term_dict, space_split, display_term, 0))
+    }
+    else
+    {
+        /* non-ICU case. using string.chr and friends */
+        switch (truncation_value)
          {
-            *term_sub = 0;
-            return ZEBRA_OK;
+        case -1:         /* not specified */
+        case 100:        /* do not truncate */
+            if (!string_relation(zh, zapt, &termp, term_dict,
+                                 attributeSet,
+                                 zm, space_split, display_term,
+                                 &relation_error))
+            {
+                if (relation_error)
+                {
+                    zebra_setError(zh, relation_error, 0);
+                    return ZEBRA_FAIL;
+                }
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            break;
+        case 1:          /* right truncation */
+            wrbuf_putc(term_dict, '(');
+            if (!term_100(zm, &termp, term_dict, space_split, display_term))
+            {
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            wrbuf_puts(term_dict, ".*)");
+            break;
+        case 2:          /* left truncation */
+            wrbuf_puts(term_dict, "(.*");
+            if (!term_100(zm, &termp, term_dict, space_split, display_term))
+            {
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            wrbuf_putc(term_dict, ')');
+            break;
+        case 3:          /* left&right truncation */
+            wrbuf_puts(term_dict, "(.*");
+            if (!term_100(zm, &termp, term_dict, space_split, display_term))
+            {
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            wrbuf_puts(term_dict, ".*)");
+            break;
+        case 101:        /* process # in term */
+            wrbuf_putc(term_dict, '(');
+            if (!term_101(zm, &termp, term_dict, space_split, display_term))
+            {
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            wrbuf_puts(term_dict, ")");
+            break;
+        case 102:        /* Regexp-1 */
+            wrbuf_putc(term_dict, '(');
+            if (!term_102(zm, &termp, term_dict, space_split, display_term))
+            {
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            wrbuf_putc(term_dict, ')');
+            break;
+        case 103:       /* Regexp-2 */
+            regex_range = 1;
+            wrbuf_putc(term_dict, '(');
+            if (!term_103(zm, &termp, term_dict, &regex_range,
+                          space_split, display_term))
+            {
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            wrbuf_putc(term_dict, ')');
+            break;
+        case 104:        /* process # and ! in term */
+            wrbuf_putc(term_dict, '(');
+            if (!term_104(zm, &termp, term_dict, space_split, display_term))
+            {
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            wrbuf_putc(term_dict, ')');
+            break;
+        case 105:        /* process * and ! in term */
+            wrbuf_putc(term_dict, '(');
+            if (!term_105(zm, &termp, term_dict, space_split, display_term, 1))
+            {
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            wrbuf_putc(term_dict, ')');
+            break;
+        case 106:        /* process * and ! in term */
+            wrbuf_putc(term_dict, '(');
+            if (!term_105(zm, &termp, term_dict, space_split, display_term, 0))
+            {
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            wrbuf_putc(term_dict, ')');
+            break;
+        default:
+            zebra_setError_zint(zh,
+                                YAZ_BIB1_UNSUPP_TRUNCATION_ATTRIBUTE,
+                                truncation_value);
+            return ZEBRA_FAIL;
          }
-        wrbuf_putc(term_dict, ')');
-        break;
-    default:
-        zebra_setError_zint(zh,
-                            YAZ_BIB1_UNSUPP_TRUNCATION_ATTRIBUTE,
-                            truncation_value);
-        return ZEBRA_FAIL;
      }
      if (1)
      {
diff --git a/test/api/t17.c b/test/api/t17.c

index 107204d..57d57f2 100644 (file)
--- a/test/api/t17.c
+++ b/test/api/t17.c
@@ -60,9 +60,11 @@ static void tst(int argc, char **argv)
  
      /* simple term */
      YAZ_CHECK(tl_query(zh, "@attr 1=title notfound", 0));
-
-    YAZ_CHECK(tl_query(zh, "@attr 1=title computer", 3));
   
+    YAZ_CHECK(tl_query(zh, "@attr 1=title computer", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 5=1 comput", 3));
+
      YAZ_CHECK(tl_query(zh, "@attr 1=title .computer.", 3));
  
      YAZ_CHECK(tl_query(zh, "@attr 1=title x", 2));
@@ -84,9 +86,16 @@ static void tst(int argc, char **argv)
      YAZ_CHECK(tl_query(zh, "@attr 1=abstract צביה", 1));
      YAZ_CHECK(tl_query(zh, "@attr 1=abstract הגדול", 1));
      YAZ_CHECK(tl_query(zh, "@attr 1=abstract בסיפור", 1));
+    YAZ_CHECK(tl_query(zh, "@attr 1=abstract בסיפ", 0));
      YAZ_CHECK(tl_query(zh, "@attr 1=abstract 点", 1));
      YAZ_CHECK(tl_query(zh, "@attr 1=abstract wet", 1));
  
+    YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 בסיפ", 1));
+    YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 סיפ", 0));
+    YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 בסי", 1));
+    YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 בס", 1));
+    YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 ב", 1));
+
      /* phrase search */
      YAZ_CHECK(tl_query(zh, "@attr 1=title {my computer}", 2));
      YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 6=1 {my computer}", 2));
@@ -112,7 +121,6 @@ static void tst(int argc, char **argv)
          const char *ent[] = { char_ae, "B" char_aring "d", "My computer" };
          YAZ_CHECK(tl_scan(zh, "@attr 1=title @attr 6=2 0", 1, 3, 1, 3, 0, ent));
      }
-
      
      YAZ_CHECK(tl_close_down(zh, zs));
  #endif
author	Adam Dickmeiss <adam@indexdata.dk>
	Wed, 26 Mar 2008 14:03:45 +0000 (15:03 +0100)
committer	Adam Dickmeiss <adam@indexdata.dk>
	Wed, 26 Mar 2008 14:03:45 +0000 (15:03 +0100)
dfa/dfa.c		patch \| blob \| history
dict/lookgrep.c		patch \| blob \| history
include/dfa.h		patch \| blob \| history
index/rpnsearch.c		patch \| blob \| history
test/api/t17.c		patch \| blob \| history