Fixed problem with 'encoding' directive for .chr files. Added \LXXXX
authorAdam Dickmeiss <adam@indexdata.dk>
Tue, 9 Mar 2004 15:12:14 +0000 (15:12 +0000)
committerAdam Dickmeiss <adam@indexdata.dk>
Tue, 9 Mar 2004 15:12:14 +0000 (15:12 +0000)
hex translation for .chr files. Added test/charmap test cases.

CHANGELOG
configure.in
test/Makefile.am
test/charmap/Makefile.am [new file with mode: 0644]
test/charmap/default.idx [new file with mode: 0644]
test/charmap/string.utf8.chr [new file with mode: 0644]
test/charmap/test1.sh [new file with mode: 0755]
test/charmap/test2.sh [new file with mode: 0755]
test/charmap/x.xml [new file with mode: 0644]
test/charmap/zebra.cfg [new file with mode: 0644]
util/charmap.c

index 7758dcc..cfd3b4a 100644 (file)
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,9 @@
+
+Added feature for charmaps (.chr): characters may now be specified in
+\LXXXX hex notation.
+
+Fixed problem with encoding directive for charmap(.chr) files.
+
 --- 1.3.15 2004/01/15
 
 Fix bug. X-Path attribute expressions with spaces in them now works.
index be5024b..e1223d1 100644 (file)
@@ -1,5 +1,5 @@
 dnl Zebra, Index Data Aps, 1995-2004
-dnl $Id: configure.in,v 1.88 2004-01-15 14:22:21 adam Exp $
+dnl $Id: configure.in,v 1.89 2004-03-09 15:12:14 adam Exp $
 dnl
 AC_INIT(include/zebraver.h)
 AM_INIT_AUTOMAKE(idzebra,1.3.15)
@@ -390,7 +390,7 @@ AC_OUTPUT([
   test/config/Makefile
   perl/Makefile.PL test/xelm/Makefile
   test/dmoz/Makefile test/xpath/Makefile test/sort/Makefile test/zsh/Makefile
-  test/marcxml/Makefile
+  test/marcxml/Makefile test/charmap/Makefile
   examples/Makefile examples/gils/Makefile examples/zthes/Makefile
   idzebra.spec
 ])
index f587ffe..670f307 100644 (file)
@@ -1,3 +1,4 @@
 
-SUBDIRS=api gils malxml config usmarc dmoz xpath sort xelm cddb rusmarc zsh marcxml
+SUBDIRS=api gils malxml config usmarc dmoz xpath sort xelm cddb \
+ rusmarc zsh marcxml charmap
 
diff --git a/test/charmap/Makefile.am b/test/charmap/Makefile.am
new file mode 100644 (file)
index 0000000..37b79de
--- /dev/null
@@ -0,0 +1,9 @@
+# $Id: Makefile.am,v 1.1 2004-03-09 15:12:15 adam Exp $
+
+check_SCRIPTS = test1.sh test2.sh
+
+TESTS = $(check_SCRIPTS)
+
+EXTRA_DIST = zebra.cfg x.xml default.idx string.utf8.chr \
+ $(check_SCRIPTS)
+
diff --git a/test/charmap/default.idx b/test/charmap/default.idx
new file mode 100644 (file)
index 0000000..5e6eb11
--- /dev/null
@@ -0,0 +1,38 @@
+# Zebra indexes as referred to from the *.abs-files.
+#  $Id: default.idx,v 1.1 2004-03-09 15:12:15 adam Exp $
+#
+# Traditional word index
+index w
+completeness 0
+position 1
+charmap string.utf8.chr
+
+# Phrase index
+index p
+completeness 1
+charmap string.chr
+
+# URX (URL) index
+index u
+completeness 0
+charmap urx.chr
+
+# Numeric index (integer only)
+index n
+completeness 0
+charmap numeric.chr
+
+# Null map index (no mapping at all)
+index 0
+completeness 0
+position 1
+charmap @
+
+# Sort register (mapped with string.chr)
+sort s
+completeness 1
+charmap string.chr
+
+index y
+completeness 0
+charmap @
diff --git a/test/charmap/string.utf8.chr b/test/charmap/string.utf8.chr
new file mode 100644 (file)
index 0000000..d67402f
--- /dev/null
@@ -0,0 +1,28 @@
+# $Id: string.utf8.chr,v 1.1 2004-03-09 15:12:15 adam Exp $
+
+# Define the basic value-set. *Beware* of changing this without re-indexing
+# your databases.
+
+# This specifies that _this_ file is in UTF-8.
+encoding utf-8
+
+lowercase {0-9}{a-y}üzæäøöå
+uppercase {0-9}{A-Y}ÜZÆÄØÖÅ
+
+# Breaking characters
+
+space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]^_`\{|}~
+
+# Characters to be considered equivalent for searching purposes.
+
+# equivalent æä(ae)
+# equivalent øö(oe)
+# equivalent å(aa)
+# equivalent uü
+
+# Supplemental mappings
+
+# Latin small letter h with dot below (U+1E25)
+map \L1E25 h
+# Latin capital letter H with dot below (U+1E24)
+map \L1E24 h
diff --git a/test/charmap/test1.sh b/test/charmap/test1.sh
new file mode 100755 (executable)
index 0000000..501057d
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+LOG=test1.log
+rm -f $LOG
+if ../../index/zebraidx -l $LOG -V|grep Expat >/dev/null; then
+       ../../index/zebraidx -l$LOG init
+else
+       exit 0
+fi
+../../index/zebraidx -l$LOG update *.xml
+../../index/zebrasrv -l$LOG unix:socket &
+sleep 1
+../api/testclient unix:socket '@term string æ' >tmp1
+echo 'Result count: 1' >tmp2
+kill `cat zebrasrv.pid` || exit 1
+diff tmp1 tmp2 || exit 2
+rm -f tmp1 tmp2
diff --git a/test/charmap/test2.sh b/test/charmap/test2.sh
new file mode 100755 (executable)
index 0000000..2b7861c
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+LOG=test2.log
+rm -f $LOG
+if ../../index/zebraidx -l $LOG -V|grep Expat >/dev/null; then
+       ../../index/zebraidx -l$LOG init
+else
+       exit 0
+fi
+../../index/zebraidx -l$LOG update *.xml
+../../index/zebrasrv -l$LOG unix:socket &
+sleep 1
+# search for UNICODE 1E25 - letter h with dot below
+../api/testclient unix:socket '@term string ḥ' >tmp1
+echo 'Result count: 1' >tmp2
+kill `cat zebrasrv.pid` || exit 1
+diff tmp1 tmp2 || exit 2
+rm -f tmp1 tmp2
diff --git a/test/charmap/x.xml b/test/charmap/x.xml
new file mode 100644 (file)
index 0000000..cf5970f
--- /dev/null
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<gils>
+  <Title>
+    h æ
+    <Acronym>
+      UUCCSEIS
+    </Acronym>
+  </Title>
+</gils>
diff --git a/test/charmap/zebra.cfg b/test/charmap/zebra.cfg
new file mode 100644 (file)
index 0000000..07d0695
--- /dev/null
@@ -0,0 +1,14 @@
+# Simple Zebra configuration file
+# $Id: zebra.cfg,v 1.1 2004-03-09 15:12:15 adam Exp $
+#
+# Where the schema files, attribute files, etc are located.
+profilePath: .:../../tab
+
+# Files that describe the attribute sets supported.
+attset: bib1.att
+attset: gils.att
+attset: explain.att
+
+recordtype: grs.xml
+
+isam: b
index 31642bc..f4f5920 100644 (file)
@@ -1,5 +1,5 @@
-/* $Id: charmap.c,v 1.27 2003-01-13 10:53:16 oleg Exp $
-   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
+/* $Id: charmap.c,v 1.28 2004-03-09 15:12:15 adam Exp $
+   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
    Index Data Aps
 
 This file is part of the Zebra server.
@@ -244,6 +244,14 @@ unsigned char zebra_prim(char **s)
     return c;
 }
 
+static int zebra_ucs4_strlen(ucs4_t *s) /* returns the number of elements in s before the first zero element */
+{
+    int i = 0; /* elements counted so far */
+    while (*s++) /* loop ends at the first zero element, which is not counted */
+       i++;
+    return i;
+}
+
 ucs4_t zebra_prim_w(ucs4_t **s)
 {
     ucs4_t c;
@@ -263,13 +271,16 @@ ucs4_t zebra_prim_w(ucs4_t **s)
        case 't': c = '\t'; (*s)++; break;
        case 's': c = ' '; (*s)++; break;
        case 'x': 
-            fmtstr[0] = (*s)[0];
-            fmtstr[1] = (*s)[1];
-            fmtstr[2] = (*s)[2];
-            fmtstr[3] = 0;
-            sscanf(fmtstr, "x%2x", &i);
-            c = i;
-            *s += 3; break;
+           if (zebra_ucs4_strlen(*s) >= 3)
+           {
+               fmtstr[0] = (*s)[1];
+               fmtstr[1] = (*s)[2];
+               fmtstr[2] = 0;
+               sscanf(fmtstr, "%x", &i);
+               c = i;
+               *s += 3;
+           }
+           break;
         case '0':
         case '1':
         case '2':
@@ -280,14 +291,30 @@ ucs4_t zebra_prim_w(ucs4_t **s)
         case '7':
         case '8':
         case '9':
-            fmtstr[0] = (*s)[0];
-            fmtstr[1] = (*s)[1];
-            fmtstr[2] = (*s)[2];
-            fmtstr[3] = 0;
-           sscanf(fmtstr, "%3o", &i);
-            c = i;
-            *s += 3;
+           if (zebra_ucs4_strlen(*s) >= 3)
+           {
+               fmtstr[0] = (*s)[0];
+               fmtstr[1] = (*s)[1];
+               fmtstr[2] = (*s)[2];
+               fmtstr[3] = 0;
+               sscanf(fmtstr, "%o", &i);
+               c = i;
+               *s += 3;
+           }
             break;
+       case 'L':
+           if (zebra_ucs4_strlen(*s) >= 5)
+           {
+               fmtstr[0] = (*s)[1];
+               fmtstr[1] = (*s)[2];
+               fmtstr[2] = (*s)[3];
+               fmtstr[3] = (*s)[4];
+               fmtstr[4] = 0;
+               sscanf(fmtstr, "%x", &i);
+               c = i;
+               *s += 5;
+           }
+           break;
         default:
             (*s)++;
        }
@@ -386,6 +413,8 @@ static int scan_to_utf8 (yaz_iconv_t t, ucs4_t *from, size_t inlen,
         ret = yaz_iconv (t, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
         if (ret == (size_t) (-1))
         {
+           yaz_log(LOG_LOG, "from: %2X %2X %2X %2X",
+                   from[0], from[1], from[2], from[3]);
             yaz_log (LOG_WARN|LOG_ERRNO, "bad unicode sequence");
             return -1;
         }
@@ -648,20 +677,23 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only,
             * zebra need to comment next part of code.
             */
 
-           /*
+           /* Original code */
+#if 1
             if (t_unicode != 0)
                 yaz_iconv_close (t_unicode);
             t_unicode = yaz_iconv_open (ucs4_native, argv[1]);
-           */
-           
+#endif
            /*
             * Fix me. It is additional staff for conversion of characters from local encoding
             * of *.chr file to UTF-8 (internal encoding).
             * NOTE: The derective encoding must be first directive in *.chr file.
             */
+           /* For whatever reason Oleg enabled this.. */
+#if 0
            if (t_utf8 != 0)
                yaz_iconv_close(t_utf8);
            t_utf8 = yaz_iconv_open ("UTF-8", argv[1]);
+#endif
         }
        else
        {