From 2dd8bfd275211ec1c984b93562c2085ae87040e0 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 9 Mar 2004 15:12:14 +0000 Subject: [PATCH] Fixed problem with 'encoding' directive for .chr files. Added \LXXXX hex translation for .chr files. Added test/charmap test cases. --- CHANGELOG | 6 ++++ configure.in | 4 +-- test/Makefile.am | 3 +- test/charmap/Makefile.am | 9 ++++++ test/charmap/default.idx | 38 +++++++++++++++++++++++ test/charmap/string.utf8.chr | 28 +++++++++++++++++ test/charmap/test1.sh | 16 ++++++++++ test/charmap/test2.sh | 17 ++++++++++ test/charmap/x.xml | 9 ++++++ test/charmap/zebra.cfg | 14 +++++++++ util/charmap.c | 70 ++++++++++++++++++++++++++++++------------ 11 files changed, 192 insertions(+), 22 deletions(-) create mode 100644 test/charmap/Makefile.am create mode 100644 test/charmap/default.idx create mode 100644 test/charmap/string.utf8.chr create mode 100755 test/charmap/test1.sh create mode 100755 test/charmap/test2.sh create mode 100644 test/charmap/x.xml create mode 100644 test/charmap/zebra.cfg diff --git a/CHANGELOG b/CHANGELOG index 7758dcc..cfd3b4a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,9 @@ + +Added feature charmaps (.chr) so that characters may be specified in +\LXXXX HEX notation. + +Fixed problem with encoding directive for charmap(.chr) files. + --- 1.3.15 2004/01/15 Fix bug. X-Path attribute expressions with spaces in them now works. diff --git a/configure.in b/configure.in index be5024b..e1223d1 100644 --- a/configure.in +++ b/configure.in @@ -1,5 +1,5 @@ dnl Zebra, Index Data Aps, 1995-2004 -dnl $Id: configure.in,v 1.88 2004-01-15 14:22:21 adam Exp $ +dnl $Id: configure.in,v 1.89 2004-03-09 15:12:14 adam Exp $ dnl AC_INIT(include/zebraver.h) AM_INIT_AUTOMAKE(idzebra,1.3.15) @@ -390,7 +390,7 @@ AC_OUTPUT([ test/config/Makefile perl/Makefile.PL test/xelm/Makefile test/dmoz/Makefile test/xpath/Makefile test/sort/Makefile test/zsh/Makefile - test/marcxml/Makefile + test/marcxml/Makefile test/charmap/Makefile examples/Makefile examples/gils/Makefile examples/zthes/Makefile idzebra.spec ]) diff --git a/test/Makefile.am b/test/Makefile.am index f587ffe..670f307 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1,3 +1,4 @@ -SUBDIRS=api gils malxml config usmarc dmoz xpath sort xelm cddb rusmarc zsh marcxml +SUBDIRS=api gils malxml config usmarc dmoz xpath sort xelm cddb \ + rusmarc zsh marcxml charmap diff --git a/test/charmap/Makefile.am b/test/charmap/Makefile.am new file mode 100644 index 0000000..37b79de --- /dev/null +++ b/test/charmap/Makefile.am @@ -0,0 +1,9 @@ +# $Id: Makefile.am,v 1.1 2004-03-09 15:12:15 adam Exp $ + +check_SCRIPTS = test1.sh test2.sh + +TESTS = $(check_SCRIPTS) + +EXTRA_DIST = zebra.cfg x.xml default.idx string.utf8.chr \ + $(check_SCRIPTS) + diff --git a/test/charmap/default.idx b/test/charmap/default.idx new file mode 100644 index 0000000..5e6eb11 --- /dev/null +++ b/test/charmap/default.idx @@ -0,0 +1,38 @@ +# Zebra indexes as referred to from the *.abs-files. +# $Id: default.idx,v 1.1 2004-03-09 15:12:15 adam Exp $ +# +# Traditional word index +index w +completeness 0 +position 1 +charmap string.utf8.chr + +# Phrase index +index p +completeness 1 +charmap string.chr + +# URX (URL) index +index u +completeness 0 +charmap urx.chr + +# Numeric index (integer only) +index n +completeness 0 +charmap numeric.chr + +# Null map index (no mapping at all) +index 0 +completeness 0 +position 1 +charmap @ + +# Sort register (no mapping at all) +sort s +completeness 1 +charmap string.chr + +index y +completeness 0 +charmap @ diff --git a/test/charmap/string.utf8.chr b/test/charmap/string.utf8.chr new file mode 100644 index 0000000..d67402f --- /dev/null +++ b/test/charmap/string.utf8.chr @@ -0,0 +1,28 @@ +# $Id: string.utf8.chr,v 1.1 2004-03-09 15:12:15 adam Exp $ + +# Define the basic value-set. *Beware* of changing this without re-indexing +# your databases. + +# This specifies that _this_ file is in UTF-8. +encoding utf-8 + +lowercase {0-9}{a-y}üzæäøöå +uppercase {0-9}{A-Y}ÜZÆÄØÖÅ + +# Breaking characters + +space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]^_`\{|}~ + +# Characters to be considered equivalent for searching purposes. + +# equivalent æä(ae) +# equivalent øö(oe) +# equivalent å(aa) +# equivalent uü + +# Supplemental mappings + +# Latin letter with h with dot below +map \L1E25 h +# Latin letter with H with dot below +map \L1E24 h diff --git a/test/charmap/test1.sh b/test/charmap/test1.sh new file mode 100755 index 0000000..501057d --- /dev/null +++ b/test/charmap/test1.sh @@ -0,0 +1,16 @@ +#!/bin/sh +LOG=test1.log +rm -f $LOG +if ../../index/zebraidx -l $LOG -V|grep Expat >/dev/null; then + ../../index/zebraidx -l$LOG init +else + exit 0 +fi +../../index/zebraidx -l$LOG update *.xml +../../index/zebrasrv -l$LOG unix:socket & +sleep 1 +../api/testclient unix:socket '@term string æ' >tmp1 +echo 'Result count: 1' >tmp2 +kill `cat zebrasrv.pid` || exit 1 +diff tmp1 tmp2 || exit 2 +rm -f tmp1 tmp2 diff --git a/test/charmap/test2.sh b/test/charmap/test2.sh new file mode 100755 index 0000000..2b7861c --- /dev/null +++ b/test/charmap/test2.sh @@ -0,0 +1,17 @@ +#!/bin/sh +LOG=test2.log +rm -f $LOG +if ../../index/zebraidx -l $LOG -V|grep Expat >/dev/null; then + ../../index/zebraidx -l$LOG init +else + exit 0 +fi +../../index/zebraidx -l$LOG update *.xml +../../index/zebrasrv -l$LOG unix:socket & +sleep 1 +# search for UNICODE 1E25 - letter h with dot below +../api/testclient unix:socket '@term string ḥ' >tmp1 +echo 'Result count: 1' >tmp2 +kill `cat zebrasrv.pid` || exit 1 +diff tmp1 tmp2 || exit 2 +rm -f tmp1 tmp2 diff --git a/test/charmap/x.xml b/test/charmap/x.xml new file mode 100644 index 0000000..cf5970f --- /dev/null +++ b/test/charmap/x.xml @@ -0,0 +1,9 @@ + + + + h æ + <Acronym> + UUCCSEIS + </Acronym> + + diff --git a/test/charmap/zebra.cfg b/test/charmap/zebra.cfg new file mode 100644 index 0000000..07d0695 --- /dev/null +++ b/test/charmap/zebra.cfg @@ -0,0 +1,14 @@ +# Simple Zebra configuration file +# $Id: zebra.cfg,v 1.1 2004-03-09 15:12:15 adam Exp $ +# +# Where the schema files, attribute files, etc are located. +profilePath: .:../../tab + +# Files that describe the attribute sets supported. +attset: bib1.att +attset: gils.att +attset: explain.att + +recordtype: grs.xml + +isam: b diff --git a/util/charmap.c b/util/charmap.c index 31642bc..f4f5920 100644 --- a/util/charmap.c +++ b/util/charmap.c @@ -1,5 +1,5 @@ -/* $Id: charmap.c,v 1.27 2003-01-13 10:53:16 oleg Exp $ - Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 +/* $Id: charmap.c,v 1.28 2004-03-09 15:12:15 adam Exp $ + Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps This file is part of the Zebra server. @@ -244,6 +244,14 @@ unsigned char zebra_prim(char **s) return c; } +static int zebra_ucs4_strlen(ucs4_t *s) +{ + int i = 0; + while (*s++) + i++; + return i; +} + ucs4_t zebra_prim_w(ucs4_t **s) { ucs4_t c; @@ -263,13 +271,16 @@ ucs4_t zebra_prim_w(ucs4_t **s) case 't': c = '\t'; (*s)++; break; case 's': c = ' '; (*s)++; break; case 'x': - fmtstr[0] = (*s)[0]; - fmtstr[1] = (*s)[1]; - fmtstr[2] = (*s)[2]; - fmtstr[3] = 0; - sscanf(fmtstr, "x%2x", &i); - c = i; - *s += 3; break; + if (zebra_ucs4_strlen(*s) >= 3) + { + fmtstr[0] = (*s)[1]; + fmtstr[1] = (*s)[2]; + fmtstr[2] = 0; + sscanf(fmtstr, "%x", &i); + c = i; + *s += 3; + } + break; case '0': case '1': case '2': @@ -280,14 +291,30 @@ ucs4_t zebra_prim_w(ucs4_t **s) case '7': case '8': case '9': - fmtstr[0] = (*s)[0]; - fmtstr[1] = (*s)[1]; - fmtstr[2] = (*s)[2]; - fmtstr[3] = 0; - sscanf(fmtstr, "%3o", &i); - c = i; - *s += 3; + if (zebra_ucs4_strlen(*s) >= 3) + { + fmtstr[0] = (*s)[0]; + fmtstr[1] = (*s)[1]; + fmtstr[2] = (*s)[2]; + fmtstr[3] = 0; + sscanf(fmtstr, "%o", &i); + c = i; + *s += 3; + } break; + case 'L': + if (zebra_ucs4_strlen(*s) >= 5) + { + fmtstr[0] = (*s)[1]; + fmtstr[1] = (*s)[2]; + fmtstr[2] = (*s)[3]; + fmtstr[3] = (*s)[4]; + fmtstr[4] = 0; + sscanf(fmtstr, "%x", &i); + c = i; + *s += 5; + } + break; default: (*s)++; } @@ -386,6 +413,8 @@ static int scan_to_utf8 (yaz_iconv_t t, ucs4_t *from, size_t inlen, ret = yaz_iconv (t, &inbuf, &inbytesleft, &outbuf, &outbytesleft); if (ret == (size_t) (-1)) { + yaz_log(LOG_LOG, "from: %2X %2X %2X %2X", + from[0], from[1], from[2], from[3]); yaz_log (LOG_WARN|LOG_ERRNO, "bad unicode sequence"); return -1; } @@ -648,20 +677,23 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, * zebra need to comment next part of code. */ - /* + /* Original code */ +#if 1 if (t_unicode != 0) yaz_iconv_close (t_unicode); t_unicode = yaz_iconv_open (ucs4_native, argv[1]); - */ - +#endif /* * Fix me. It is additional staff for conversion of characters from local encoding * of *.chr file to UTF-8 (internal encoding). * NOTE: The derective encoding must be first directive in *.chr file. */ + /* For whatever reason Oleg enabled this.. */ +#if 0 if (t_utf8 != 0) yaz_iconv_close(t_utf8); t_utf8 = yaz_iconv_open ("UTF-8", argv[1]); +#endif } else { -- 1.7.10.4