From: Adam Dickmeiss Date: Wed, 28 Aug 2002 12:47:09 +0000 (+0000) Subject: Zebra uses own XML reader (was part of YAZ before) X-Git-Tag: ZEBRA.1.3.2~46 X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=commitdiff_plain;h=c63292356fbdef7b575efbf027ba8574482b0320 Zebra uses own XML reader (was part of YAZ before) --- diff --git a/configure.in b/configure.in index 45d256c..e8966a3 100644 --- a/configure.in +++ b/configure.in @@ -1,5 +1,5 @@ dnl Zebra, Index Data Aps, 1995-2002 -dnl $Id: configure.in,v 1.41 2002-08-23 13:42:59 adam Exp $ +dnl $Id: configure.in,v 1.42 2002-08-28 12:47:09 adam Exp $ dnl AC_INIT(include/zebraver.h) AM_INIT_AUTOMAKE(zebra,1.3.1) @@ -81,6 +81,14 @@ if test "x$tclconfig" = xNONE; then AC_PREFIX_PROGRAM(tclsh) tclconfig=${prefix}/lib prefix=${saveprefix} + if test ! -r ${tclconfig}/tclConfig.sh; then + # Not found, try search for Tcl on Debian systems. + for d in /usr/lib/tcl*; do + if test -d $d; then + tclconfig=$d + fi + done + fi fi AC_MSG_CHECKING(for Tcl) if test -r ${tclconfig}/tclConfig.sh; then @@ -121,6 +129,27 @@ else AC_CHECK_HEADERS(bzlib.h) fi fi +dnl +dnl ------ EXPAT +expat=yes +AC_ARG_WITH(expat, [ --with-expat[=DIR] EXPAT library in DIR],[expat=$withval]) +if test "$expat" != "no"; then + xLIBS="$LIBS"; + xCFLAGS="$CFLAGS"; + if test "$expat" != "yes"; then + EXPAT_CFLAGS="-I$expat/include" + EXPAT_LIBS="-L$expat/lib" + CFLAGS="$EXPAT_CFLAGS $CFLAGS" + LIBS="$EXPAT_LIBS $LIBS" + fi + AC_CHECK_LIB(expat,XML_ParserCreate,[LIBS="$LIBS -lexpat"]) + if test "$ac_cv_lib_expat_XML_ParserCreate" = "yes"; then + AC_CHECK_HEADERS(expat.h) + else + LIBS="$xLIBS" + CFLAGS="$xCFLAGS" + fi +fi dnl ------- 64 bit files AC_MSG_CHECKING(for LFS) AC_TRY_RUN([#define _FILE_OFFSET_BITS 64 diff --git a/include/recctrl.h b/include/recctrl.h index 71680b3..8083710 100644 --- a/include/recctrl.h +++ b/include/recctrl.h @@ -1,4 +1,4 @@ -/* $Id: recctrl.h,v 1.38 2002-08-02 19:26:55 adam Exp $ +/* $Id: recctrl.h,v 1.39 2002-08-28 12:47:10 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -109,7 +109,8 @@ struct recType #define RECCTRL_EXTRACT_OK 0 #define RECCTRL_EXTRACT_EOF 1 -#define RECCTRL_EXTRACT_ERROR 2 +#define RECCTRL_EXTRACT_ERROR_GENERIC 2 +#define RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER 3 typedef struct recTypes *RecTypes; diff --git a/index/extract.c b/index/extract.c index baaa341..3138c88 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.120 2002-08-02 19:26:55 adam Exp $ +/* $Id: extract.c,v 1.121 2002-08-28 12:47:10 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -488,7 +488,7 @@ static int recordExtract (ZebraHandle zh, if (r == RECCTRL_EXTRACT_EOF) return 0; - else if (r == RECCTRL_EXTRACT_ERROR) + else if (r == RECCTRL_EXTRACT_ERROR_GENERIC) { /* error occured during extraction ... */ if (rGroup->flagRw && @@ -499,6 +499,18 @@ static int recordExtract (ZebraHandle zh, } return 0; } + else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER) + { + /* error occured during extraction ... */ + if (rGroup->flagRw && + zh->records_processed < rGroup->fileVerboseLimit) + { + logf (LOG_WARN, "no filter for %s %s " + PRINTF_OFF_T, rGroup->recordType, + fname, recordOffset); + } + return 0; + } if (zh->reg->keys.buf_used == 0) { /* the extraction process returned no information - the record @@ -955,19 +967,16 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, if (r == RECCTRL_EXTRACT_EOF) return 0; - else if (r == RECCTRL_EXTRACT_ERROR) + else if (r == RECCTRL_EXTRACT_ERROR_GENERIC) { /* error occured during extraction ... */ -#if 1 - yaz_log (LOG_WARN, "extract error"); -#else - if (rGroup->flagRw && - zh->records_processed < rGroup->fileVerboseLimit) - { - logf (LOG_WARN, "fail %s %s %ld", rGroup->recordType, - fname, (long) recordOffset); - } -#endif + yaz_log (LOG_WARN, "extract error: generic"); + return 0; + } + else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER) + { + /* error occured during extraction ... */ + yaz_log (LOG_WARN, "extract error: no such filter"); return 0; } if (zh->reg->keys.buf_used == 0) diff --git a/recctrl/recgrs.c b/recctrl/recgrs.c index 8d9e3d5..6399224 100644 --- a/recctrl/recgrs.c +++ b/recctrl/recgrs.c @@ -1,4 +1,4 @@ -/* $Id: recgrs.c,v 1.61 2002-08-23 14:29:58 adam Exp $ +/* $Id: recgrs.c,v 1.62 2002-08-28 12:47:10 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -20,8 +20,6 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ - - #include #include #include @@ -101,7 +99,7 @@ static void *grs_init(RecType recType) grs_add_handler (h, recTypeGrs_tcl); #endif grs_add_handler (h, recTypeGrs_marc); -#if YAZ_HAVE_EXPAT +#if HAVE_EXPAT_H grs_add_handler (h, recTypeGrs_xml); #endif return h; @@ -472,7 +470,7 @@ static int grs_extract_sub(struct grs_handlers *h, struct recExtractCtrl *p, gri.dh = p->dh; if (read_grs_type (h, &gri, p->subType, &n)) - return RECCTRL_EXTRACT_ERROR; + return RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER; if (!n) return RECCTRL_EXTRACT_EOF; oe.proto = PROTO_Z3950; @@ -499,7 +497,7 @@ static int grs_extract_sub(struct grs_handlers *h, struct recExtractCtrl *p, if (dumpkeys(n, p, 0, &wrd) < 0) { data1_free_tree(p->dh, n); - return RECCTRL_EXTRACT_ERROR; + return RECCTRL_EXTRACT_ERROR_GENERIC; } data1_free_tree(p->dh, n); return RECCTRL_EXTRACT_OK; @@ -655,7 +653,7 @@ static int grs_retrieve(void *clientData, struct recRetrieveCtrl *p) /* ensure our data1 tree is UTF-8 */ data1_iconv (p->dh, mem, node, "UTF-8", data1_get_encoding(p->dh, node)); -#if 0 +#if 1 data1_pr_tree (p->dh, node, stdout); #endif top = data1_get_root_tag (p->dh, node); diff --git a/recctrl/xmlread.c b/recctrl/xmlread.c index 54e0296..90b17d3 100644 --- a/recctrl/xmlread.c +++ b/recctrl/xmlread.c @@ -1,4 +1,4 @@ -/* $Id: xmlread.c,v 1.2 2002-08-02 19:26:56 adam Exp $ +/* $Id: xmlread.c,v 1.3 2002-08-28 12:47:10 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -20,15 +20,419 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ - - -#if YAZ_HAVE_EXPAT +#if HAVE_EXPAT_H #include +#include +#include +#if HAVE_ICONV_H +#include +#include +#endif + #include #include "grsread.h" +#include +#include +#include + +#include + +#define XML_CHUNK 1024 + +struct user_info { + data1_node *d1_stack[256]; + int level; + data1_handle dh; + NMEM nmem; + int loglevel; +}; + +static void cb_start (void *user, const char *el, const char **attr) +{ + struct user_info *ui = (struct user_info*) user; + if (ui->level == 1) + data1_set_root (ui->dh, ui->d1_stack[0], ui->nmem, el); + ui->d1_stack[ui->level] = data1_mk_tag (ui->dh, ui->nmem, el, attr, + ui->d1_stack[ui->level-1]); + ui->level++; + yaz_log (ui->loglevel, "cb_start %s", el); +} + +static void cb_end (void *user, const char *el) +{ + struct user_info *ui = (struct user_info*) user; + + ui->level--; + yaz_log (ui->loglevel, "cb_end %s", el); +} + +static void cb_chardata (void *user, const char *s, int len) +{ + struct user_info *ui = (struct user_info*) user; +#if 0 + yaz_log (ui->loglevel, "cb_chardata %.*s", len, s); +#endif + ui->d1_stack[ui->level] = data1_mk_text_n (ui->dh, ui->nmem, s, len, + ui->d1_stack[ui->level -1]); +} + +static void cb_decl (void *user, const char *version, const char*encoding, + int standalone) +{ + struct user_info *ui = (struct user_info*) user; + const char *attr_list[7]; + + attr_list[0] = "version"; + attr_list[1] = version; + + attr_list[2] = "encoding"; + attr_list[3] = "UTF-8"; /* encoding */ + + attr_list[4] = "standalone"; + attr_list[5] = standalone ? "yes" : "no"; + + attr_list[6] = 0; + + data1_mk_preprocess (ui->dh, ui->nmem, "xml", attr_list, + ui->d1_stack[ui->level-1]); + yaz_log (ui->loglevel, "decl version=%s encoding=%s", + version ? version : "null", + encoding ? encoding : "null"); +} + +static void cb_processing (void *user, const char *target, + const char *data) +{ + struct user_info *ui = (struct user_info*) user; + data1_node *res = + data1_mk_preprocess (ui->dh, ui->nmem, target, 0, + ui->d1_stack[ui->level-1]); + data1_mk_text_nf (ui->dh, ui->nmem, data, strlen(data), res); + + yaz_log (ui->loglevel, "decl processing target=%s data=%s", + target ? target : "null", + data ? data : "null"); + + +} + +static void cb_comment (void *user, const char *data) +{ + struct user_info *ui = (struct user_info*) user; + yaz_log (ui->loglevel, "decl comment data=%s", data ? data : "null"); + data1_mk_comment (ui->dh, ui->nmem, data, ui->d1_stack[ui->level-1]); +} + +static void cb_doctype_start (void *userData, const char *doctypeName, + const char *sysid, const char *pubid, + int has_internal_subset) +{ + struct user_info *ui = (struct user_info*) userData; + yaz_log (ui->loglevel, "doctype start doctype=%s sysid=%s pubid=%s", + doctypeName, sysid, pubid); +} + +static void cb_doctype_end (void *userData) +{ + struct user_info *ui = (struct user_info*) userData; + yaz_log (ui->loglevel, "doctype end"); +} + + +static void cb_entity_decl (void *userData, const char *entityName, + int is_parameter_entity, + const char *value, int value_length, + const char *base, const char *systemId, + const char *publicId, const char *notationName) +{ + struct user_info *ui = (struct user_info*) userData; + yaz_log (ui->loglevel, + "entity decl %s is_para_entry=%d value=%.*s base=%s systemId=%s" + " publicId=%s notationName=%s", + entityName, is_parameter_entity, value_length, value, + base, systemId, publicId, notationName); + +} + +static int cb_external_entity (XML_Parser pparser, + const char *context, + const char *base, + const char *systemId, + const char *publicId) +{ + struct user_info *ui = (struct user_info*) XML_GetUserData(pparser); + FILE *inf; + int done = 0; + XML_Parser parser; + + yaz_log (ui->loglevel, + "external entity context=%s base=%s systemid=%s publicid=%s", + context, base, systemId, publicId); + if (!systemId) + return 1; + + if (!(inf = fopen (systemId, "rb"))) + { + yaz_log (LOG_WARN|LOG_ERRNO, "fopen %s", systemId); + return 0; + } + + parser = XML_ExternalEntityParserCreate (pparser, "", 0); + while (!done) + { + int r; + void *buf = XML_GetBuffer (parser, XML_CHUNK); + if (!buf) + { + yaz_log (LOG_WARN, "XML_GetBuffer fail"); + break; + } + r = fread (buf, 1, XML_CHUNK, inf); + if (r == 0) + { + if (ferror(inf)) + { + yaz_log (LOG_WARN|LOG_ERRNO, "fread %s", systemId); + break; + } + done = 1; + } + if (!XML_ParseBuffer (parser, r, done)) + { + yaz_log (LOG_WARN, "XML_ParseBuffer failed %s", + XML_ErrorString(XML_GetErrorCode(parser))); + } + } + fclose (inf); + XML_ParserFree (parser); + return done; +} + + +#if HAVE_ICONV_H +static int cb_encoding_convert (void *data, const char *s) +{ + iconv_t t = (iconv_t) data; + size_t ret; + size_t outleft = 2; + char outbuf_[2], *outbuf = outbuf_; + size_t inleft = 4; + char *inbuf = (char *) s; + unsigned short code; + + ret = iconv (t, &inbuf, &inleft, &outbuf, &outleft); + if (ret == (size_t) (-1) && errno != E2BIG) + { + iconv (t, 0, 0, 0, 0); + return -1; + } + if (outleft != 0) + return -1; + memcpy (&code, outbuf_, sizeof(short)); + return code; +} + +static void cb_encoding_release (void *data) +{ + iconv_t t = (iconv_t) data; + iconv_close (t); +} + +static int cb_encoding_handler (void *userData, const char *name, + XML_Encoding *info) +{ + int i = 0; + int no_ok = 0; + struct user_info *ui = (struct user_info*) userData; + + iconv_t t = iconv_open ("UNICODE", name); + if (t == (iconv_t) (-1)) + return 0; + + info->data = 0; /* signal that multibyte is not in use */ + yaz_log (ui->loglevel, "Encoding handler of %s", name); + for (i = 0; i<256; i++) + { + size_t ret; + char outbuf_[5]; + char inbuf_[5]; + char *inbuf = inbuf_; + char *outbuf = outbuf_; + size_t inleft = 1; + size_t outleft = 2; + inbuf_[0] = i; + + iconv (t, 0, 0, 0, 0); /* reset iconv */ + + ret = iconv(t, &inbuf, &inleft, &outbuf, &outleft); + if (ret == (size_t) (-1)) + { + if (errno == EILSEQ) + { + yaz_log (ui->loglevel, "Encoding %d: invalid sequence", i); + info->map[i] = -1; /* invalid sequence */ + } + if (errno == EINVAL) + { /* multi byte input */ + int len = 2; + int j = 0; + info->map[i] = -1; + + while (len <= 4) + { + char sbuf[80]; + int k; + inbuf = inbuf_; + inleft = len; + outbuf = outbuf_; + outleft = 2; + + inbuf_[len-1] = j; + iconv (t, 0,0,0,0); + + assert (i >= 0 && i<255); + + *sbuf = 0; + for (k = 0; k 255) + break; + } + else if (errno == EINVAL) + { + len++; + j = 7; + } + } + else if (outleft == 0) + { + info->map[i] = -len; + info->data = t; /* signal that multibyte is in use */ + break; + } + else + { + break; + } + } + if (info->map[i] < -1) + yaz_log (ui->loglevel, "Encoding %d: multibyte input %d", + i, -info->map[i]); + else + yaz_log (ui->loglevel, "Encoding %d: multibyte input failed", + i); + } + if (errno == E2BIG) + { + info->map[i] = -1; /* no room for output */ + yaz_log (LOG_WARN, "Encoding %d: no room for output", + i); + } + } + else if (outleft == 0) + { + unsigned short code; + memcpy (&code, outbuf_, sizeof(short)); + info->map[i] = code; + no_ok++; + } + else + { /* should never happen */ + info->map[i] = -1; + yaz_log (LOG_DEBUG, "Encoding %d: bad state", i); + } + } + if (info->data) + { /* at least one multi byte */ + info->convert = cb_encoding_convert; + info->release = cb_encoding_release; + } + else + { + /* no multi byte - we no longer need iconv handler */ + iconv_close(t); + info->convert = 0; + info->release = 0; + } + if (!no_ok) + return 0; + return 1; +} +/* HAVE_ICONV_H */ +#endif + + +data1_node *zebra_read_xml (data1_handle dh, + int (*rf)(void *, char *, size_t), void *fh, + NMEM m) +{ + XML_Parser parser; + struct user_info uinfo; + int done = 0; + + uinfo.loglevel = LOG_LOG; + uinfo.level = 1; + uinfo.dh = dh; + uinfo.nmem = m; + uinfo.d1_stack[0] = data1_mk_node2 (dh, m, DATA1N_root, 0); + uinfo.d1_stack[1] = 0; /* indicate no children (see end of routine) */ + + parser = XML_ParserCreate (0 /* encoding */); + + XML_SetElementHandler (parser, cb_start, cb_end); + XML_SetCharacterDataHandler (parser, cb_chardata); + XML_SetXmlDeclHandler (parser, cb_decl); + XML_SetProcessingInstructionHandler (parser, cb_processing); + XML_SetUserData (parser, &uinfo); + XML_SetCommentHandler (parser, cb_comment); + XML_SetDoctypeDeclHandler (parser, cb_doctype_start, cb_doctype_end); + XML_SetEntityDeclHandler (parser, cb_entity_decl); + XML_SetExternalEntityRefHandler (parser, cb_external_entity); +#if HAVE_ICONV_H + XML_SetUnknownEncodingHandler (parser, cb_encoding_handler, &uinfo); +#endif + while (!done) + { + int r; + void *buf = XML_GetBuffer (parser, XML_CHUNK); + if (!buf) + { + /* error */ + yaz_log (LOG_WARN, "XML_GetBuffer fail"); + break; + } + r = (*rf)(fh, buf, XML_CHUNK); + if (r < 0) + { + /* error */ + yaz_log (LOG_WARN, "XML read fail"); + break; + } + else if (r == 0) + done = 1; + if (!XML_ParseBuffer (parser, r, done)) + { + yaz_log (LOG_WARN, "XML_ParseBuffer (1) failed %s", + XML_ErrorString(XML_GetErrorCode(parser))); + } + } + XML_ParserFree (parser); + if (!uinfo.d1_stack[1] || !done) + return 0; + return uinfo.d1_stack[0]; +} + struct xml_info { int dummy; }; @@ -41,7 +445,7 @@ static void *grs_init_xml(void) static data1_node *grs_read_xml (struct grs_read_info *p) { - return data1_read_xml (p->dh, p->readf, p->fh, p->mem); + return zebra_read_xml (p->dh, p->readf, p->fh, p->mem); } static void grs_destroy_xml(void *clientData) @@ -60,4 +464,6 @@ static struct recTypeGrs xml_type = { RecTypeGrs recTypeGrs_xml = &xml_type; +/* HAVE_EXPAT_H */ #endif +