From b6d34be5a1131d2bb7ff367b7d5e498599f791a8 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Sat, 7 Aug 2004 08:18:19 +0000 Subject: [PATCH] Fixes for MARC-8 in yaz_iconv character set utilities. The MARC-8 to UTF-8/UCS conversion is now only based on codetables.xml. Thanks to Larry Dixson for reporting this error. --- NEWS | 5 +++ src/Makefile.am | 8 ++-- src/siconv.c | 70 ++++++++++++++++++++++--------- test/tsticonv.c | 35 +++++++++------- util/Makefile.am | 6 +-- util/marcdump.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- win/makefile | 6 +-- win/yaz.nsi | 3 +- 8 files changed, 203 insertions(+), 50 deletions(-) diff --git a/NEWS b/NEWS index c578b8d..a00dae8 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,10 @@ Possible compatibility problems with earlier versions marked with '*'. +Fixes for MARC-8 in yaz_iconv character set utilies. The MARC-8 +to UTF-8/UCS conversion is now only based on codetables.xml. + +yaz_marc_decode_buf sets leader pos 9 to "a" for MARCXML output. + --- 2.0.22 2004/08/06 Add support for more "commit changes" in ZOOM (uses Extended Services). diff --git a/src/Makefile.am b/src/Makefile.am index 8ba3ba7..6a9ebce 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,6 +1,6 @@ ## Copyright (C) 1994-2004, Index Data ## All rights reserved. 
-## $Id: Makefile.am,v 1.14 2004-08-07 08:06:57 adam Exp $ +## $Id: Makefile.am,v 1.15 2004-08-07 08:18:19 adam Exp $ if ISTHR thrlib=libyazthread.la @@ -20,7 +20,7 @@ illdatadir=$(pkgdatadir)/ill illdata_DATA=ill9702.asn item-req.asn ill.tcl EXTRA_DIST=$(tabdata_DATA) $(illdata_DATA) \ - charconv.tcl codetables.xml charconv.sgm + charconv.tcl codetables.xml YAZCOMP = $(top_srcdir)/util/yaz-asncomp YAZCOMPLINE = $(YAZCOMP) -d z.tcl -i yaz -I../include $(YCFLAGS) @@ -29,8 +29,8 @@ AM_CPPFLAGS=-I$(top_srcdir)/include $(XML2_CFLAGS) AM_YFLAGS=-p cql_ THREADED_FLAGS = @CFLAGSTHREADS@ -# MARC8 conversion is generated from charconv.sgm + codetables.xml -marc8.c: charconv.tcl charconv.sgm codetables.xml +# MARC8 conversion is generated from codetables.xml +marc8.c: charconv.tcl codetables.xml cd $(srcdir); ./charconv.tcl -p marc8 codetables.xml -o marc8.c libyaz_la_SOURCES=version.c options.c log.c marcdisp.c oid.c wrbuf.c \ diff --git a/src/siconv.c b/src/siconv.c index 1eb66c4..7d31a00 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -2,7 +2,7 @@ * Copyright (c) 1997-2004, Index Data * See the file LICENSE for details. 
* - * $Id: siconv.c,v 1.5 2004-03-16 13:12:43 adam Exp $ + * $Id: siconv.c,v 1.6 2004-08-07 08:18:19 adam Exp $ */ /* mini iconv and wrapper for system iconv library (if present) */ @@ -24,24 +24,24 @@ #include -unsigned long yaz_marc8_conv (unsigned char *inp, size_t inbytesleft, - size_t *no_read); +unsigned long yaz_marc8_1_conv (unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); unsigned long yaz_marc8_2_conv (unsigned char *inp, size_t inbytesleft, - size_t *no_read); + size_t *no_read, int *combining); unsigned long yaz_marc8_3_conv (unsigned char *inp, size_t inbytesleft, - size_t *no_read); + size_t *no_read, int *combining); unsigned long yaz_marc8_4_conv (unsigned char *inp, size_t inbytesleft, - size_t *no_read); + size_t *no_read, int *combining); unsigned long yaz_marc8_5_conv (unsigned char *inp, size_t inbytesleft, - size_t *no_read); + size_t *no_read, int *combining); unsigned long yaz_marc8_6_conv (unsigned char *inp, size_t inbytesleft, - size_t *no_read); + size_t *no_read, int *combining); unsigned long yaz_marc8_7_conv (unsigned char *inp, size_t inbytesleft, - size_t *no_read); + size_t *no_read, int *combining); unsigned long yaz_marc8_8_conv (unsigned char *inp, size_t inbytesleft, - size_t *no_read); + size_t *no_read, int *combining); unsigned long yaz_marc8_9_conv (unsigned char *inp, size_t inbytesleft, - size_t *no_read); + size_t *no_read, int *combining); struct yaz_iconv_struct { int my_errno; @@ -53,6 +53,8 @@ struct yaz_iconv_struct { size_t (*write_handle)(yaz_iconv_t cd, unsigned long x, char **outbuf, size_t *outbytesleft); int marc8_esc_mode; + int marc8_comb_x; + int marc8_comb_no_read; #if HAVE_ICONV_H iconv_t iconv_cd; #endif @@ -233,6 +235,13 @@ static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp, static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp, size_t inbytesleft, size_t *no_read) { + if (cd->marc8_comb_x) + { + unsigned long x = cd->marc8_comb_x; + 
*no_read = cd->marc8_comb_no_read; + cd->marc8_comb_x = 0; + return x; + } *no_read = 0; while(inbytesleft >= 1 && inp[0] == 27) { @@ -259,6 +268,7 @@ static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp, else { unsigned long x; + int comb = 0; size_t no_read_sub = 0; switch(cd->marc8_esc_mode) @@ -266,40 +276,59 @@ static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp, case 'B': /* Basic ASCII */ case 'E': /* ANSEL */ case 's': /* ASCII */ - x = yaz_marc8_conv(inp, inbytesleft, &no_read_sub); + x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, &comb); break; case 'g': /* Greek */ - x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub); + x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, &comb); break; case 'b': /* Subscripts */ - x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub); + x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, &comb); break; case 'p': /* Superscripts */ - x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub); + x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, &comb); break; case '2': /* Basic Hebrew */ - x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub); + x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, &comb); break; case 'N': /* Basic Cyrillic */ case 'Q': /* Extended Cyrillic */ - x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub); + x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, &comb); break; case '3': /* Basic Arabic */ case '4': /* Extended Arabic */ - x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub); + x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, &comb); break; case 'S': /* Greek */ - x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub); + x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, &comb); break; case '1': /* Chinese, Japanese, Korean (EACC) */ - x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub); + x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, &comb); break; default: *no_read = 0; cd->my_errno = YAZ_ICONV_EILSEQ; return 0; } +#if 0 + 
printf ("esc mode=%c x=%04lX comb=%d\n", cd->marc8_esc_mode, x, comb); +#endif *no_read += no_read_sub; + + if (comb && cd->marc8_comb_x == 0) + { + size_t tmp_read = 0; + unsigned long next_x; + + /* read next char .. */ + next_x = yaz_read_marc8(cd, inp + *no_read, + inbytesleft - *no_read, &tmp_read); + /* save this x for later .. */ + cd->marc8_comb_x = x; + /* save next read for later .. */ + cd->marc8_comb_no_read = tmp_read; + /* return next x - thereby swap */ + x = next_x; + } return x; } } @@ -465,6 +494,7 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) cd->init_handle = 0; cd->my_errno = YAZ_ICONV_UNKNOWN; cd->marc8_esc_mode = 'B'; + cd->marc8_comb_x = 0; /* a useful hack: if fromcode has leading @, the library not use YAZ's own conversions .. */ diff --git a/test/tsticonv.c b/test/tsticonv.c index 1799002..15c869d 100644 --- a/test/tsticonv.c +++ b/test/tsticonv.c @@ -2,7 +2,7 @@ * Copyright (c) 2002-2004, Index Data * See the file LICENSE for details. 
* - * $Id: tsticonv.c,v 1.2 2004-03-15 21:39:06 adam Exp $ + * $Id: tsticonv.c,v 1.3 2004-08-07 08:18:19 adam Exp $ */ #if HAVE_CONFIG_H @@ -20,9 +20,6 @@ static const char *iso_8859_1_a[] = { "ax" , "\330", "eneb\346r", - "\xfc", - "\xfb", - "\xfbr", 0 }; /* same test strings in MARC-8 format */ @@ -30,9 +27,6 @@ static const char *marc8_a[] = { "ax", "\xa2", /* latin capital letter o with stroke */ "eneb\xb5r", /* latin small letter ae */ - "\xe8\x75", /* latin small letter u with umlaut */ - "\xe3\x75", /* latin small letter u with circumflex */ - "\xe3\x75r", /* latin small letter u with circumflex */ 0 }; @@ -79,16 +73,25 @@ static void marc8_tst_a() static void marc8_tst_b() { static const char *marc8_b[] = { + /* 0 */ "\033$1" "\x21\x2B\x3B" /* FF1F */ "\033(B" "o", + /* 1 */ "\033$1" "\x6F\x77\x29" /* AE0E */ "\x6F\x52\x7C" /* c0F4 */ "\033(B", + /* 2 */ "\033$1" - "\x21\x50\x6E" /* 7CFB */ - "\x21\x51\x31" /* 7D71 */ - "\x21\x3A\x67" /* 5B89 */ - "\x21\x33\x22" /* 5168 */ - "\x21\x33\x53" /* 5206 */ - "\x21\x44\x2B" /* 6790 */ + "\x21\x50\x6E" /* UCS 7CFB */ + "\x21\x51\x31" /* UCS 7D71 */ + "\x21\x3A\x67" /* UCS 5B89 */ + "\x21\x33\x22" /* UCS 5168 */ + "\x21\x33\x53" /* UCS 5206 */ + "\x21\x44\x2B" /* UCS 6790 */ "\033(B", + /* 3 */ + "\xB0\xB2", /* AYN and oSLASH */ + /* 4 */ + "\xF6\x61", /* a underscore */ + /* 5 */ + "\x61\xC2", /* a, phonorecord mark */ 0 }; static const char *ucs4_b[] = { @@ -100,6 +103,9 @@ static void marc8_tst_b() "\x00\x00\x51\x68" "\x00\x00\x52\x06" "\x00\x00\x67\x90", + "\x00\x00\x02\xBB" "\x00\x00\x00\xF8", + "\x00\x00\x00\x61" "\x00\x00\x03\x32", + "\x00\x00\x00\x61" "\x00\x00\x21\x17", 0 }; int i; @@ -115,7 +121,7 @@ static void marc8_tst_b() { size_t r; size_t len; - size_t expect_len = (i == 2 ? 24 : 8); + size_t expect_len = i == 2 ? 
24 : 8; char *inbuf= (char*) marc8_b[i]; size_t inbytesleft = strlen(inbuf); char outbuf0[24]; @@ -257,6 +263,7 @@ static void dconvert(int mandatory, const char *tmpcode) int main (int argc, char **argv) { + yaz_log_init_file("tsticonv.log"); dconvert(1, "UTF-8"); dconvert(1, "ISO-8859-1"); dconvert(1, "UCS4"); diff --git a/util/Makefile.am b/util/Makefile.am index af3919f..eca8ee3 100644 --- a/util/Makefile.am +++ b/util/Makefile.am @@ -1,8 +1,6 @@ ## Copyright (C) 1994-2004, Index Data ## All rights reserved. -## $Id: Makefile.am,v 1.27 2004-05-01 23:32:20 adam Exp $ - -TESTS = $(check_PROGRAMS) +## $Id: Makefile.am,v 1.28 2004-08-07 08:18:20 adam Exp $ bin_SCRIPTS = yaz-asncomp yaz-config @@ -10,7 +8,7 @@ EXTRA_DIST = yaz-asncomp DISTCLEANFILES = yaz-config -AM_CPPFLAGS=-I$(top_srcdir)/include +AM_CPPFLAGS=-I$(top_srcdir)/include $(XML2_CFLAGS) bin_PROGRAMS = yaz-marcdump yaz-iconv noinst_PROGRAMS = cclsh cql2pqf cql2xcql srwtst yaz-benchmark diff --git a/util/marcdump.c b/util/marcdump.c index 3e97bad..d441855 100644 --- a/util/marcdump.c +++ b/util/marcdump.c @@ -2,17 +2,27 @@ * Copyright (c) 1995-2004, Index Data * See the file LICENSE for details. * - * $Id: marcdump.c,v 1.24 2004-08-04 09:30:30 adam Exp $ + * $Id: marcdump.c,v 1.25 2004-08-07 08:18:20 adam Exp $ */ #if HAVE_CONFIG_H #include #endif +#if HAVE_XML2 +#include +#include + +#include +#include + +#endif + #include #include #include #include +#include #if HAVE_LOCALE_H #include @@ -39,9 +49,60 @@ static void usage(const char *prog) prog); } +#if HAVE_XML2 +void print_xpath_nodes(xmlNodeSetPtr nodes, FILE* output) { + xmlNodePtr cur; + int size; + int i; + + assert(output); + size = (nodes) ? 
nodes->nodeNr : 0; + + fprintf(output, "Result (%d nodes):\n", size); + for(i = 0; i < size; ++i) { + assert(nodes->nodeTab[i]); + + if(nodes->nodeTab[i]->type == XML_NAMESPACE_DECL) + { + xmlNsPtr ns; + + ns = (xmlNsPtr)nodes->nodeTab[i]; + cur = (xmlNodePtr)ns->next; + if(cur->ns) { + fprintf(output, "= namespace \"%s\"=\"%s\" for node %s:%s\n", + ns->prefix, ns->href, cur->ns->href, cur->name); + } else { + fprintf(output, "= namespace \"%s\"=\"%s\" for node %s\n", + ns->prefix, ns->href, cur->name); + } + } + else if(nodes->nodeTab[i]->type == XML_ELEMENT_NODE) + { + cur = nodes->nodeTab[i]; + if(cur->ns) { + fprintf(output, "= element node \"%s:%s\"\n", + cur->ns->href, cur->name); + } + else + { + fprintf(output, "= element node \"%s\"\n", + cur->name); + } + } + else + { + cur = nodes->nodeTab[i]; + fprintf(output, "= node \"%s\": type %d\n", cur->name, cur->type); + } + } +} +#endif + int main (int argc, char **argv) { int r; + int libxml_dom_test = 0; + int print_offset = 0; char *arg; int verbose = 0; FILE *inf; @@ -51,7 +112,7 @@ int main (int argc, char **argv) int xml = 0; FILE *cfile = 0; char *from = 0, *to = 0; - + int num = 1; #if HAVE_LOCALE_H setlocale(LC_CTYPE, ""); @@ -62,7 +123,7 @@ int main (int argc, char **argv) #endif #endif - while ((r = options("vc:xOXIf:t:", argv, argc, &arg)) != -2) + while ((r = options("pvc:xOXIf:t:2", argv, argc, &arg)) != -2) { int count; no++; @@ -91,6 +152,12 @@ int main (int argc, char **argv) case 'I': xml = YAZ_MARC_ISO2709; break; + case 'p': + print_offset = 1; + break; + case '2': + libxml_dom_test = 1; + break; case 0: inf = fopen (arg, "rb"); count = 0; @@ -128,7 +195,16 @@ int main (int argc, char **argv) r = fread (buf, 1, 5, inf); if (r < 5) + { + if (r && print_offset) + printf ("Extra %d bytes", r); break; + } + if (print_offset) + { + long off = ftell(inf); + printf ("Record %d offset %ld\n", num, (long) off); + } len = atoi_n(buf, 5); if (len < 25 || len > 100000) break; @@ -140,6 +216,43 @@ int 
main (int argc, char **argv) if (r <= 0) break; fwrite (result, rlen, 1, stdout); +#if HAVE_XML2 + if (libxml_dom_test) + { + xmlDocPtr doc = xmlParseMemory(result, rlen); + if (!doc) + fprintf(stderr, "xmLParseMemory failed\n"); + else + { + int i; + xmlXPathContextPtr xpathCtx; + xmlXPathObjectPtr xpathObj; + static const char *xpathExpr[] = { + "/record/datafield[@tag='245']/subfield[@code='a']", + "/record/datafield[@tag='100']/subfield", + "/record/datafield[@tag='245']/subfield[@code='a']", + "/record/datafield[@tag='650']/subfield", + "/record/datafield[@tag='650']", + 0}; + + xpathCtx = xmlXPathNewContext(doc); + + for (i = 0; xpathExpr[i]; i++) { + xpathObj = xmlXPathEvalExpression(xpathExpr[i], xpathCtx); + if(xpathObj == NULL) { + fprintf(stderr,"Error: unable to evaluate xpath expression \"%s\"\n", xpathExpr[i]); + } + else + { + print_xpath_nodes(xpathObj->nodesetval, stdout); + xmlXPathFreeObject(xpathObj); + } + } + xmlXPathFreeContext(xpathCtx); + xmlFreeDoc(doc); + } + } +#endif if (cfile) { char *p = buf; @@ -159,6 +272,7 @@ int main (int argc, char **argv) } fprintf (cfile, "\"\n"); } + num++; } count++; if (cd) diff --git a/win/makefile b/win/makefile index e8cf92a..40868dd 100644 --- a/win/makefile +++ b/win/makefile @@ -1,6 +1,6 @@ # Copyright (C) 1994-2004, Index Data # All rights reserved. 
-# $Id: makefile,v 1.78 2004-05-10 11:56:33 adam Exp $ +# $Id: makefile,v 1.79 2004-08-07 08:18:20 adam Exp $ # # Programmed by # HL: Heikki Levanto, Index Data @@ -620,9 +620,9 @@ $(ITEM_REQ_FILES): $(SRCDIR)\item-req.asn $(TCL) $(TCLOPT) -d ill.tcl item-req.asn @cd $(WINDIR) -$(SRCDIR)\marc8.c: $(SRCDIR)\charconv.sgm $(SRCDIR)\codetables.xml $(SRCDIR)\charconv.tcl +$(SRCDIR)\marc8.c: $(SRCDIR)\codetables.xml $(SRCDIR)\charconv.tcl @cd $(SRCDIR) - $(TCL) charconv.tcl -O 1 -p marc8 charconv.sgm codetables.xml -o marc8.c + $(TCL) charconv.tcl -p marc8 codetables.xml -o marc8.c !endif diff --git a/win/yaz.nsi b/win/yaz.nsi index 0fd503a..591fd7f 100644 --- a/win/yaz.nsi +++ b/win/yaz.nsi @@ -1,4 +1,4 @@ -; $Id: yaz.nsi,v 1.49 2004-08-06 08:31:03 adam Exp $ +; $Id: yaz.nsi,v 1.50 2004-08-07 08:18:20 adam Exp $ !define VERSION "2.0.22" @@ -132,7 +132,6 @@ Section "YAZ Source" YAZ_Source File ..\src\*.y File ..\src\*.tcl File ..\src\*.asn - File ..\src\charconv.sgm File ..\src\codetables.xml SetOutPath $INSTDIR\zoom File ..\zoom\*.c -- 1.7.10.4