First indexing using index_types system (ICU).

author Adam Dickmeiss <adam@indexdata.dk>

Mon, 29 Oct 2007 13:43:57 +0000 (13:43 +0000)

committer Adam Dickmeiss <adam@indexdata.dk>

Mon, 29 Oct 2007 13:43:57 +0000 (13:43 +0000)
author Adam Dickmeiss <adam@indexdata.dk>
Mon, 29 Oct 2007 13:43:57 +0000 (13:43 +0000)
committer Adam Dickmeiss <adam@indexdata.dk>
Mon, 29 Oct 2007 13:43:57 +0000 (13:43 +0000)
diff --git a/include/idzebra/recctrl.h b/include/idzebra/recctrl.h

index c7c92cc..92f46e4 100644 (file)
--- a/include/idzebra/recctrl.h
+++ b/include/idzebra/recctrl.h
@@ -1,4 +1,4 @@
-/* $Id: recctrl.h,v 1.35 2007-05-08 12:50:04 adam Exp $
+/* $Id: recctrl.h,v 1.36 2007-10-29 13:43:57 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
diff --git a/index/extract.c b/index/extract.c

index 24e5b9e..0eec47e 100644 (file)
--- a/index/extract.c
+++ b/index/extract.c
@@ -1,4 +1,4 @@
-/* $Id: extract.c,v 1.263 2007-10-29 09:25:40 adam Exp $
+/* $Id: extract.c,v 1.264 2007-10-29 13:43:57 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -20,6 +20,10 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  
  */
  
+/** \file
+    \brief indexes records and extracts tokens for indexing and sorting
+*/
+
  #include <stdio.h>
  #include <assert.h>
  #include <ctype.h>
@@ -31,10 +35,12 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  #endif
  #include <fcntl.h>
  
+
  #include "index.h"
  #include "orddict.h"
  #include <direntz.h>
  #include <charmap.h>
+#include <yaz/snprintf.h>
  
  static int log_level_extract = 0;
  static int log_level_details = 0;
@@ -68,6 +74,7 @@ static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
                                      int cmd, zebra_rec_keys_t skp);
  static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
  static void extract_token_add(RecWord *p);
+static void extract_token_add2(RecWord *p);
  
  static void check_log_limit(ZebraHandle zh)
  {
@@ -828,7 +835,14 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
          stream->endf(stream, &null_offset);;
  
          extractCtrl.init = extract_init;
-        extractCtrl.tokenAdd = extract_token_add;
+        if (zh->reg->index_types)
+        {
+            extractCtrl.tokenAdd = extract_token_add2;
+        }
+        else
+        {
+            extractCtrl.tokenAdd = extract_token_add;
+        }
          extractCtrl.schemaAdd = extract_schema_add;
          extractCtrl.dh = zh->reg->dh;
          extractCtrl.handle = zh;
@@ -1744,6 +1758,75 @@ static void extract_add_complete_field(RecWord *p)
      extract_add_string(p, buf, i);
  }
  
+static void extract_token_add2_index(ZebraHandle zh, zebra_index_type_t type,
+                                     RecWord *p)
+{ /* Tokenizes p's term with the given index type and writes one record key per token. */
+    struct it_key key;
+    const char *res_buf = 0;
+    size_t res_len = 0;
+    int r = zebra_index_type_tokenize(type, p->term_buf, p->term_len,
+                                      &res_buf, &res_len);
+    int cat = zinfo_index_category_index;
+    int ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
+    if (ch < 0) /* negative lookup result: fall back to adding the attribute */
+        ch = zebraExplain_add_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
+    while (r) /* one iteration per token; stops when the tokenizer returns 0 */
+    {
+        int i = 0;
+        key.mem[i++] = ch;
+        key.mem[i++] = p->record_id;
+        key.mem[i++] = p->section_id;
+        /* the segment component is part of the key only when segment indexing is on */
+        if (zh->m_segment_indexing)
+            key.mem[i++] = p->segment;
+        key.mem[i++] = p->seqno;
+        key.len = i; /* number of key components filled in above */
+        /* log the token text, then store it together with the key just built */
+        yaz_log(YLOG_LOG, "keys_write %.*s", (int) res_len, res_buf);
+        zebra_rec_keys_write(zh->reg->keys, res_buf, res_len, &key);
+        /* NOTE(review): the 0, 0 input presumably asks for the next token of the same term - confirm */
+        p->seqno++; /* every token written gets its own sequence number */
+        r = zebra_index_type_tokenize(type, 0, 0, &res_buf, &res_len);
+    }
+}
+
+static void extract_token_add2(RecWord *p)
+{ /* Token handler installed when zh->reg->index_types is set: resolves p->index_type and indexes p through it. */
+    zebra_index_type_t type;
+    ZebraHandle zh = p->extractCtrl->handle;
+    char type_tmp[2]; /* the single-character index type as a NUL-terminated string */
+    type_tmp[0] = p->index_type;
+    type_tmp[1] = '\0';
+    type = zebra_index_type_get(zh->reg->index_types, type_tmp);
+    if (type) /* a null lookup result is ignored: nothing is indexed for this token */
+    {
+        if (zebra_index_type_is_index(type))
+        {
+            extract_token_add2_index(zh, type, p);
+        }
+        else if (zebra_index_type_is_sort(type))
+        {
+            ;
+            /* sort types are recognized here, but nothing is emitted for them */
+        }
+    }
+}
+
+/** \brief top-level indexing handler for recctrl system
+    \param p token data to be indexed
+
+    Call sequence:
+    extract_token
+    zebra_add_{in}_complete
+    extract_add_string
+    
+    extract_add_index_string
+    or
+    extract_add_sort_string
+    or
+    extract_add_staticrank_string
+    
+*/
  static void extract_token_add(RecWord *p)
  {
      ZebraHandle zh = p->extractCtrl->handle;
diff --git a/test/api/Makefile.am b/test/api/Makefile.am

index 3c5f376..bec97ea 100644 (file)
--- a/test/api/Makefile.am
+++ b/test/api/Makefile.am
@@ -1,15 +1,15 @@
-# $Id: Makefile.am,v 1.40 2006-11-16 12:48:28 adam Exp $
+# $Id: Makefile.am,v 1.41 2007-10-29 13:43:58 adam Exp $
  
  noinst_PROGRAMS = testclient
  testclient_SOURCES = testclient.c 
  
-simpletests = t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 t16
+simpletests = t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 t16 t17
  safaritests = safari1
  check_PROGRAMS = $(simpletests) $(safaritests)
  TESTS = $(check_PROGRAMS)
  
  EXTRA_DIST=zebra.cfg zebra6.cfg zebra8.cfg zebra10.cfg zebra15.cfg safari.cfg \
-       t10.att t10.abs
+       t10.att t10.abs zebra17.cfg indextypes17.xml
  
  noinst_LIBRARIES = libtestlib.a
  
@@ -32,6 +32,7 @@ t13_SOURCES = t13.c
  t14_SOURCES = t14.c
  t15_SOURCES = t15.c
  t16_SOURCES = t16.c
+t17_SOURCES = t17.c
  
  safari1_SOURCES = safari1.c testlib.c
  
diff --git a/test/api/indextypes17.xml b/test/api/indextypes17.xml

new file mode 100644 (file)

index 0000000..49b4d21
--- /dev/null
+++ b/test/api/indextypes17.xml
@@ -0,0 +1,18 @@
+<indextypes>
+  <indextype id="w:el" position="1" alwaysmatches="1" firstinfield="1"
+            locale="el">
+    <simple/>
+  </indextype>
+  <indextype id="w" position="1" alwaysmatches="1" firstinfield="1"
+            locale="en">
+    <simple/>
+  </indextype>
+  <indextype id="p" position="0" alwaysmatches="0" firstinfield="0"
+            locale="en">
+    <simple/>
+  </indextype>
+  <indextype id="s" sort="1" 
+            locale="en">
+    <simple/>
+  </indextype>
+</indextypes>
diff --git a/test/api/t17.c b/test/api/t17.c

new file mode 100644 (file)

index 0000000..84b1ae3
--- /dev/null
+++ b/test/api/t17.c
@@ -0,0 +1,66 @@
+/* $Id: t17.c,v 1.1 2007-10-29 13:43:58 adam Exp $
+   Copyright (C) 1995-2007
+   Index Data ApS
+
+This file is part of the Zebra server.
+
+Zebra is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+*/
+
+/** \file
+    \brief tests unicode enabled searching (index_types)
+*/
+#include <yaz/test.h>
+#include "testlib.h"
+
+const char *myrec[] = { /* three GILS records whose <title> contains the word "title"; the list ends with 0 */
+        "<gils>\n<title>My title</title>\n</gils>\n",
+        "<gils>\n<title>My x title</title>\n</gils>\n",
+        "<gils>\n<title>My title x</title>\n</gils>\n" ,
+       0} ;
+       
+static void tst(int argc, char **argv)
+{
+    ZebraService zs = tl_start_up("zebra17.cfg", argc, argv);
+    ZebraHandle zh = zebra_open(zs, 0);
+    /* load the records in myrec before running any query */
+    YAZ_CHECK(tl_init_data(zh, myrec));
+
+    /* simple term that occurs in none of the records: expect 0 hits */
+    YAZ_CHECK(tl_query(zh, "@attr 1=title notfound", 0));
+
+    /* "title" occurs in all 3 records, so we should get 3 hits; the check is pinned to 0 for now */
+#if 0
+    /* intended check, currently disabled */
+    YAZ_CHECK(tl_query(zh, "@attr 1=title title", 3));
+#else
+    YAZ_CHECK(tl_query(zh, "@attr 1=title title", 0));
+#endif
+
+ 
+    YAZ_CHECK(tl_close_down(zh, zs));
+}
+
+TL_MAIN
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+
diff --git a/test/api/zebra17.cfg b/test/api/zebra17.cfg

new file mode 100644 (file)

index 0000000..9c2fb1e
--- /dev/null
+++ b/test/api/zebra17.cfg
@@ -0,0 +1,12 @@
+# $Id: zebra17.cfg,v 1.1 2007-10-29 13:43:58 adam Exp $
+profilepath: ${srcdir:-.}:${srcdir:-.}/../../tab
+
+attset: bib1.att
+attset: explain.att
+
+recordType: grs.sgml
+
+indextypes: indextypes17.xml
+
+isam: b 
+
author	Adam Dickmeiss <adam@indexdata.dk>
	Mon, 29 Oct 2007 13:43:57 +0000 (13:43 +0000)
committer	Adam Dickmeiss <adam@indexdata.dk>
	Mon, 29 Oct 2007 13:43:57 +0000 (13:43 +0000)
include/idzebra/recctrl.h		patch \| blob \| history
index/extract.c		patch \| blob \| history
test/api/Makefile.am		patch \| blob \| history
test/api/indextypes17.xml	[new file with mode: 0644]	patch \| blob
test/api/t17.c	[new file with mode: 0644]	patch \| blob
test/api/zebra17.cfg	[new file with mode: 0644]	patch \| blob