From 0f3b8bcc6fe2e3beeec7c834d9a64dca48a4f1b7 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Thu, 28 Apr 2005 08:20:39 +0000 Subject: [PATCH] Added new fundamental filter 'xslt'. This filter reads XML records and uses LibXSLT for both indexing (extract) and retrieval (present). During indexing the filter generates a Zebra indexing record via XSLT which describes how Zebra is to index the record. Because the driver is XSLT driven it can use any X-Path plus logic behind the scenes and is thus more powerful than xelm/elm in .abs. The XSLT can accept parameters from Zebra. For example, if a date is received the filter could make a date index. The filter also allows splitting of XML records during indexing, so that MARC collections can be indexed directly (but it is quite limited and takes place before XSLT is invoked: XSLT requires a DOM structure in memory). Refer to the example test case in test/xslt. --- configure.in | 37 +++++- include/idzebra/recctrl.h | 4 +- index/extract.c | 88 ++++++++++--- index/index.h | 5 +- index/zebraapi.c | 10 +- recctrl/Makefile.am | 19 ++- recctrl/alvis.c | 79 +++++------ recctrl/recctrl.c | 13 +- recctrl/xslt.c | 322 +++++++++++++++++++++++++++++++++++++++++++++ test/Makefile.am | 4 +- test/marcxml/Makefile.am | 4 +- test/marcxml/t1.c | 4 +- test/marcxml/t2.c | 4 +- test/xslt/Makefile.am | 26 ++++ test/xslt/id.xsl | 16 +++ test/xslt/marc-col.xml | 139 +++++++++++++++++++ test/xslt/marc1.xsl | 23 ++++ test/xslt/zebra.cfg | 5 + 18 files changed, 714 insertions(+), 88 deletions(-) create mode 100644 recctrl/xslt.c create mode 100644 test/xslt/Makefile.am create mode 100644 test/xslt/id.xsl create mode 100644 test/xslt/marc-col.xml create mode 100644 test/xslt/marc1.xsl create mode 100644 test/xslt/zebra.cfg diff --git a/configure.in b/configure.in index 860823b..9ab20c5 100644 --- a/configure.in +++ b/configure.in @@ -1,5 +1,5 @@ dnl Zebra, Index Data ApS, 1995-2005 -dnl $Id: configure.in,v 1.118 2005-04-26 08:11:22 adam Exp $ 
+dnl $Id: configure.in,v 1.119 2005-04-28 08:20:39 adam Exp $ dnl AC_INIT(include/idzebra/version.h) AM_INIT_AUTOMAKE(idzebra,1.4.0) @@ -28,6 +28,30 @@ dnl dnl ------ YAZ YAZ_INIT($yazflag,2.1.3) YAZ_DOC +dnl ----- libXSLT +AC_SUBST(XSLT_LIBS) +AC_SUBST(XSLT_CFLAGS) +xsltdir=yes +AC_ARG_WITH(xslt,[[ --with-xslt[=PREFIX] use libxslt in PREFIX]],xsltdir=$withval) +if test "$xsltdir" = "yes"; then + for d in /usr /usr/local; do + if test -x $d/bin/xslt-config; then + xsltdir=$d + fi + done +fi +if test "$xsltdir" != "no"; then + AC_MSG_CHECKING(for libXSLT) + if test -x $xsltdir/bin/xslt-config; then + XSLT_LIBS=`$xsltdir/bin/xslt-config --libs` + XSLT_CFLAGS=`$xsltdir/bin/xslt-config --cflags` + XSLT_VER=`$xsltdir/bin/xslt-config --version` + AC_MSG_RESULT($XSLT_VER) + AC_DEFINE(HAVE_XSLT) + else + AC_MSG_RESULT(Not found) + fi +fi dnl ------ Look for Tcl dnl See if user has specified location of tclConfig.sh; otherwise dnl see if tclConfig.sh exists in same prefix lcoation as tclsh; otherwise @@ -256,14 +280,20 @@ ZEBRA_MODULE(grs-sgml,static,[ --enable-mod-grs-sgml Simple SGML/XML filter]) ZEBRA_MODULE(grs-regx,shared,[ --enable-mod-grs-regx REGX/TCL filter]) ZEBRA_MODULE(grs-marc,shared,[ --enable-mod-grs-marc MARC filter]) ZEBRA_MODULE(grs-danbib,shared,[ --enable-mod-grs-danbib DanBib filter (DBC)]) -ZEBRA_MODULE(safari,shared,[ --enable-mod-safari Safari filter (DBC)]) +ZEBRA_MODULE(safari,shared, [ --enable-mod-safari Safari filter (DBC)]) if test "$ac_cv_header_expat_h" = "yes"; then def="shared" else def="no" fi ZEBRA_MODULE(grs-xml,[$def], [ --enable-mod-grs-xml XML filter (Expat based)]) -ZEBRA_MODULE(alvis,shared, [ --enable-mod-alvis ALVIS XML filter]) +if test "$XSLT_VER"; then + def="shared" +else + def="no" +fi +ZEBRA_MODULE(xslt,[$def], [ --enable-mod-xslt XSLT filter]) +ZEBRA_MODULE(alvis,shared, [ --enable-mod-alvis ALVIS filter]) dnl ------ ANSI C Header files AC_STDC_HEADERS if test "$ac_cv_header_stdc" = "no"; then @@ -296,6 +326,7 @@ 
AC_OUTPUT([ doc/zebraphp.dsl doc/tkl.xsl test/Makefile test/gils/Makefile test/usmarc/Makefile test/api/Makefile + test/xslt/Makefile test/xpath/Makefile test/rusmarc/Makefile test/cddb/Makefile test/malxml/Makefile test/mbox/Makefile diff --git a/include/idzebra/recctrl.h b/include/idzebra/recctrl.h index abc8357..0a39e70 100644 --- a/include/idzebra/recctrl.h +++ b/include/idzebra/recctrl.h @@ -1,4 +1,4 @@ -/* $Id: recctrl.h,v 1.9 2005-03-31 12:42:06 adam Exp $ +/* $Id: recctrl.h,v 1.10 2005-04-28 08:20:39 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -59,7 +59,9 @@ struct recExtractCtrl { void (*init)(struct recExtractCtrl *p, RecWord *w); void *clientData; void (*tokenAdd)(RecWord *w); + void (*setStoreData)(struct recExtractCtrl *p, void *buf, size_t size); ZebraMaps zebra_maps; + int first_record; int flagShowRecords; int seqno[256]; char match_criteria[256]; diff --git a/index/extract.c b/index/extract.c index 8f0cf0b..f32e575 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.178 2005-04-15 10:47:48 adam Exp $ +/* $Id: extract.c,v 1.179 2005-04-28 08:20:39 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -79,6 +79,8 @@ static void logRecord (ZebraHandle zh) } } +static void extract_set_store_data_prepare(struct recExtractCtrl *p); + static void extract_init (struct recExtractCtrl *p, RecWord *w) { w->zebra_maps = p->zebra_maps; @@ -393,7 +395,9 @@ static int file_extract_record(ZebraHandle zh, SYSNO *sysno, const char *fname, int deleteFlag, struct file_read_info *fi, - int force_update) + int force_update, + RecType recType, + void *recTypeClientData) { RecordAttr *recordAttr; int r; @@ -401,17 +405,7 @@ static int file_extract_record(ZebraHandle zh, SYSNO sysnotmp; Record rec; off_t recordOffset = 0; - RecType recType; - void *clientData; - if (!(recType = - recType_byName (zh->reg->recTypes, zh->res, zh->m_record_type, - &clientData))) - { - yaz_log (YLOG_WARN, "No such record type: %s", 
zh->m_record_type); - return 0; - } - /* announce database */ if (zebraExplain_curDatabase (zh->reg->zei, zh->basenames[0])) { @@ -430,8 +424,10 @@ static int file_extract_record(ZebraHandle zh, create_rec_keys_codec(&zh->reg->keys); zh->reg->sortKeys.buf_used = 0; + recordOffset = fi->file_moffset; + extractCtrl.handle = zh; extractCtrl.offset = fi->file_moffset; extractCtrl.readf = file_read; extractCtrl.seekf = file_seek; @@ -443,7 +439,10 @@ static int file_extract_record(ZebraHandle zh, extractCtrl.schemaAdd = extract_schema_add; extractCtrl.dh = zh->reg->dh; extractCtrl.match_criteria[0] = '\0'; - extractCtrl.handle = zh; + extractCtrl.first_record = fi->file_offset ? 0 : 1; + + extract_set_store_data_prepare(&extractCtrl); + for (i = 0; i<256; i++) { if (zebra_maps_is_positioned(zh->reg->zebra_maps, i)) @@ -463,7 +462,7 @@ static int file_extract_record(ZebraHandle zh, yaz_log_init_prefix2 (msg); } - r = (*recType->extract)(clientData, &extractCtrl); + r = (*recType->extract)(recTypeClientData, &extractCtrl); yaz_log_init_prefix2 (0); if (r == RECCTRL_EXTRACT_EOF) @@ -685,7 +684,14 @@ static int file_extract_record(ZebraHandle zh, /* update store data */ xfree (rec->info[recInfo_storeData]); - if (zh->m_store_data) + if (zh->store_data_buf) + { + rec->size[recInfo_storeData] = zh->store_data_size; + rec->info[recInfo_storeData] = zh->store_data_buf; + zh->store_data_buf = 0; + file_end(fi, fi->file_offset); + } + else if (zh->m_store_data) { rec->size[recInfo_storeData] = recordAttr->recordSize; rec->info[recInfo_storeData] = (char *) @@ -732,6 +738,8 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, char ext_res[128]; struct file_read_info *fi; const char *original_record_type = 0; + RecType recType; + void *recTypeClientData; if (!zh->m_group || !*zh->m_group) *gprefix = '\0'; @@ -770,6 +778,21 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, zh->m_record_id = res_get (zh->res, ext_res); } + if (!(recType = + 
recType_byName (zh->reg->recTypes, zh->res, zh->m_record_type, + &recTypeClientData))) + { + yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type); + return 0; + } + + switch(recType->version) + { + case 0: + break; + default: + yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type); + } if (sysno && deleteFlag) fd = -1; else @@ -797,7 +820,8 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, do { file_begin (fi); - r = file_extract_record (zh, sysno, fname, deleteFlag, fi, 1); + r = file_extract_record (zh, sysno, fname, deleteFlag, fi, 1, + recType, recTypeClientData); } while (r && !sysno && fi->file_more); file_read_stop (fi); if (fd != -1) @@ -850,6 +874,7 @@ ZEBRA_RES buffer_extract_record (ZebraHandle zh, extractCtrl.seekf = zebra_record_int_seek; extractCtrl.tellf = zebra_record_int_tell; extractCtrl.endf = zebra_record_int_end; + extractCtrl.first_record = 1; extractCtrl.fh = &fc; create_rec_keys_codec(&zh->reg->keys); @@ -903,6 +928,7 @@ ZEBRA_RES buffer_extract_record (ZebraHandle zh, else extractCtrl.seqno[i] = 0; } + extract_set_store_data_prepare(&extractCtrl); r = (*recType->extract)(clientData, &extractCtrl); @@ -1187,6 +1213,9 @@ int explain_extract (void *handle, Record rec, data1_node *n) extractCtrl.flagShowRecords = 0; extractCtrl.match_criteria[0] = '\0'; extractCtrl.handle = handle; + extractCtrl.first_record = 1; + + extract_set_store_data_prepare(&extractCtrl); if (n) grs_extract_tree(&extractCtrl, n); @@ -1669,9 +1698,34 @@ void extract_token_add (RecWord *p) extract_add_incomplete_field(p); } +static void extract_set_store_data_cb(struct recExtractCtrl *p, + void *buf, size_t sz) +{ + ZebraHandle zh = (ZebraHandle) p->handle; + + xfree(zh->store_data_buf); + zh->store_data_buf = 0; + zh->store_data_size = 0; + if (buf && sz) + { + zh->store_data_buf = xmalloc(sz); + zh->store_data_size = sz; + memcpy(zh->store_data_buf, buf, sz); + } +} + +static void extract_set_store_data_prepare(struct 
recExtractCtrl *p) +{ + ZebraHandle zh = (ZebraHandle) p->handle; + xfree(zh->store_data_buf); + zh->store_data_buf = 0; + zh->store_data_size = 0; + p->setStoreData = extract_set_store_data_cb; +} + void extract_schema_add (struct recExtractCtrl *p, Odr_oid *oid) { - ZebraHandle zh = (ZebraHandle) (p->handle); + ZebraHandle zh = (ZebraHandle) p->handle; zebraExplain_addSchema (zh->reg->zei, oid); } diff --git a/index/index.h b/index/index.h index c50a1b2..2c9c981 100644 --- a/index/index.h +++ b/index/index.h @@ -1,4 +1,4 @@ -/* $Id: index.h,v 1.134 2005-04-25 11:54:08 adam Exp $ +/* $Id: index.h,v 1.135 2005-04-28 08:20:40 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -306,6 +306,9 @@ struct zebra_session { int m_explain_database; int m_flag_rw; int m_file_verbose_limit; + + void *store_data_buf; + size_t store_data_size; }; struct rank_control { diff --git a/index/zebraapi.c b/index/zebraapi.c index f2d73c6..0be396f 100644 --- a/index/zebraapi.c +++ b/index/zebraapi.c @@ -1,4 +1,4 @@ -/* $Id: zebraapi.c,v 1.162 2005-04-26 08:11:22 adam Exp $ +/* $Id: zebraapi.c,v 1.163 2005-04-28 08:20:40 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -145,6 +145,8 @@ ZebraHandle zebra_open (ZebraService zs) zebra_mutex_cond_unlock (&zs->session_lock); + zh->store_data_buf = 0; + return zh; } @@ -1928,7 +1930,7 @@ void zebra_set_resource(ZebraHandle zh, const char *name, const char *value) ASSERTZH; assert(name); assert(value); - yaz_log(log_level, "zebra_set_resource %s:%s",name,value); + yaz_log(log_level, "zebra_set_resource %s:%s", name, value); zh->errCode = 0; res_set(zh->res, name, value); } @@ -1940,9 +1942,9 @@ const char *zebra_get_resource(ZebraHandle zh, ASSERTZH; assert(name); assert(defaultvalue); - v= res_get_def( zh->res, name, (char *)defaultvalue); + v = res_get_def (zh->res, name, (char *)defaultvalue); zh->errCode = 0; - yaz_log(log_level, "zebra_get_resource %s:%s",name,v); + yaz_log(log_level, "zebra_get_resource %s:%s", name, v); return v; } 
diff --git a/recctrl/Makefile.am b/recctrl/Makefile.am index 4647cf9..32191f4 100644 --- a/recctrl/Makefile.am +++ b/recctrl/Makefile.am @@ -1,4 +1,4 @@ -## $Id: Makefile.am,v 1.16 2005-03-31 12:42:06 adam Exp $ +## $Id: Makefile.am,v 1.17 2005-04-28 08:20:40 adam Exp $ common_libs = libidzebra-recctrl.la \ ../data1/libidzebra-data1.la \ @@ -34,9 +34,14 @@ mod_safari_la_LIBADD = $(common_libs) $(mod_safari_la_LADD) mod_alvis_la_SOURCES = alvis.c mod_alvis_la_LDFLAGS = -rpath $(pkglibdir) -module -avoid-version -mod_alvis_la_LADD = +mod_alvis_la_LADD = $(XSLT_LIBS) mod_alvis_la_LIBADD = $(common_libs) $(mod_alvis_la_LADD) +mod_xslt_la_SOURCES = xslt.c +mod_xslt_la_LDFLAGS = -rpath $(pkglibdir) -module -avoid-version +mod_xslt_la_LADD = $(XSLT_LIBS) +mod_xslt_la_LIBADD = $(common_libs) $(mod_alvis_la_LADD) + pkglib_LTLIBRARIES = $(SHARED_MODULE_LA) EXTRA_LTLIBRARIES = \ mod-grs-regx.la \ @@ -44,7 +49,8 @@ EXTRA_LTLIBRARIES = \ mod-grs-marc.la \ mod-grs-danbib.la \ mod-safari.la \ - mod-alvis.la + mod-alvis.la \ + mod-xslt.la # The common library lib_LTLIBRARIES = libidzebra-recctrl.la @@ -62,6 +68,9 @@ EXTRA_libidzebra_recctrl_la_SOURCES = \ $(mod_grs_xml_la_SOURCES) \ $(mod_grs_marc_la_SOURCES) \ $(mod_grs_danbib_la_SOURCES) \ - $(mod_safari_la_SOURCES) + $(mod_safari_la_SOURCES) \ + $(mod_alvis_la_SOURCES) \ + $(mod_xslt_la_SOURCES) -AM_CPPFLAGS = -I$(srcdir)/../include $(YAZINC) $(TCL_INCLUDE) -DDEFAULT_MODULE_PATH=\"$(pkglibdir)\" +AM_CPPFLAGS = -I$(srcdir)/../include $(YAZINC) $(XSLT_CFLAGS) \ + $(TCL_INCLUDE) -DDEFAULT_MODULE_PATH=\"$(pkglibdir)\" diff --git a/recctrl/alvis.c b/recctrl/alvis.c index f86fd25..907cb06 100644 --- a/recctrl/alvis.c +++ b/recctrl/alvis.c @@ -1,4 +1,4 @@ -/* $Id: alvis.c,v 1.1 2005-03-31 12:42:06 adam Exp $ +/* $Id: alvis.c,v 1.2 2005-04-28 08:20:40 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -44,23 +44,23 @@ static void filter_config(void *clientData, Res res, const char *args) } -static void filter_destroy(void 
*clientData) +static void filter_destroy (void *clientData) { struct filter_info *tinfo = clientData; xfree (tinfo->sep); xfree (tinfo); } -struct fi_info { +struct buf_info { struct recExtractCtrl *p; char *buf; int offset; int max; }; -static struct fi_info *fi_open(struct recExtractCtrl *p) +static struct buf_info *buf_open (struct recExtractCtrl *p) { - struct fi_info *fi = (struct fi_info *) xmalloc (sizeof(*fi)); + struct buf_info *fi = (struct buf_info *) xmalloc (sizeof(*fi)); fi->p = p; fi->buf = (char *) xmalloc (4096); @@ -69,7 +69,7 @@ static struct fi_info *fi_open(struct recExtractCtrl *p) return fi; } -static int fi_getchar(struct fi_info *fi, char *dst) +static int buf_read (struct filter_info *tinfo, struct buf_info *fi, char *dst) { if (fi->offset >= fi->max) { @@ -81,35 +81,28 @@ static int fi_getchar(struct fi_info *fi, char *dst) return 0; } *dst = fi->buf[(fi->offset)++]; - return 1; -} - -static int fi_gets(struct fi_info *fi, char *dst, int max) -{ - int l; - for (l = 0; l < max; l++) + if (tinfo->sep && *dst == *tinfo->sep) { - if (!fi_getchar(fi, dst+l)) - return 0; - if (dst[l] == '\n') - break; + off_t off = (*fi->p->tellf)(fi->p->fh); + (*fi->p->endf)(fi->p->fh, off - (fi->max - fi->offset)); + return 0; } - dst[l] = '\0'; return 1; } -static void fi_close (struct fi_info *fi) +static void buf_close (struct buf_info *fi) { xfree (fi->buf); xfree (fi); } -static int filter_extract(void *clientData, struct recExtractCtrl *p) +static int filter_extract (void *clientData, struct recExtractCtrl *p) { struct filter_info *tinfo = clientData; - char line[512]; + char w[512]; RecWord recWord; - struct fi_info *fi = fi_open(p); + int r; + struct buf_info *fi = buf_open (p); #if 0 yaz_log(YLOG_LOG, "filter_extract off=%ld", @@ -118,35 +111,25 @@ static int filter_extract(void *clientData, struct recExtractCtrl *p) xfree(tinfo->sep); tinfo->sep = 0; (*p->init)(p, &recWord); - - if (!fi_gets(fi, line, sizeof(line)-1)) - return 
RECCTRL_EXTRACT_ERROR_GENERIC; - sscanf(line, "%255s", p->match_criteria); - recWord.reg_type = 'w'; - while (fi_gets(fi, line, sizeof(line)-1)) + do { - int nor = 0; - char field[40]; - char *cp; -#if 0 - yaz_log(YLOG_LOG, "safari line: %s", line); -#endif - if (sscanf(line, ZINT_FORMAT " " ZINT_FORMAT " " ZINT_FORMAT " %39s %n", - &recWord.record_id, &recWord.section_id, &recWord.seqno, - field, &nor) < 4) - { - yaz_log(YLOG_WARN, "Bad safari record line: %s", line); - return RECCTRL_EXTRACT_ERROR_GENERIC; + int i = 0; + + r = buf_read (tinfo, fi, w); + while (r > 0 && i < 511 && w[i] != '\n' && w[i] != '\r') + { + i++; + r = buf_read (tinfo, fi, w + i); } - for (cp = line + nor; *cp == ' '; cp++) - ; - recWord.attrStr = field; - recWord.term_buf = cp; - recWord.term_len = strlen(cp); - (*p->tokenAdd)(&recWord); - } - fi_close(fi); + if (i) + { + recWord.term_buf = w; + recWord.term_len = i; + (*p->tokenAdd)(&recWord); + } + } while (r > 0); + buf_close (fi); return RECCTRL_EXTRACT_OK; } diff --git a/recctrl/recctrl.c b/recctrl/recctrl.c index e3f9746..5709d8c 100644 --- a/recctrl/recctrl.c +++ b/recctrl/recctrl.c @@ -1,4 +1,4 @@ -/* $Id: recctrl.c,v 1.19 2005-03-31 12:42:07 adam Exp $ +/* $Id: recctrl.c,v 1.20 2005-04-28 08:20:40 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -111,12 +111,23 @@ RecTypeClass recTypeClass_create (Res res, NMEM nmem) } #endif #ifdef IDZEBRA_STATIC_ALVIS +#if HAVE_XSLT if (1) { extern RecType idzebra_filter_alvis[]; recTypeClass_add (&rts, idzebra_filter_alvis, nmem, 0); } #endif +#endif +#ifdef IDZEBRA_STATIC_XSLT +#if HAVE_XSLT + if (1) + { + extern RecType idzebra_filter_xslt[]; + recTypeClass_add (&rts, idzebra_filter_xslt, nmem, 0); + } +#endif +#endif #if HAVE_DLFCN_H if (module_path) diff --git a/recctrl/xslt.c b/recctrl/xslt.c new file mode 100644 index 0000000..62c8dd0 --- /dev/null +++ b/recctrl/xslt.c @@ -0,0 +1,322 @@ +/* $Id: xslt.c,v 1.1 2005-04-28 08:20:40 adam Exp $ + Copyright (C) 1995-2005 + Index Data ApS + 
+This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ + +#include +#include +#include + +#include +#include +#include + +#include +#include + +struct filter_info { + xsltStylesheetPtr stylesheet_xsp; + xmlTextReaderPtr reader; + char *fname; + int split_depth; +}; + +static const char *zebra_index_ns = "http://indexdata.dk/zebra/indexing/1"; + +static void *filter_init (Res res, RecType recType) +{ + struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo)); + tinfo->stylesheet_xsp = 0; + tinfo->reader = 0; + tinfo->fname = 0; + tinfo->split_depth = 1; + return tinfo; +} + +static void filter_config(void *clientData, Res res, const char *args) +{ + struct filter_info *tinfo = clientData; + if (!args || !*args) + args = "default.xsl"; + if (!tinfo->fname || strcmp(args, tinfo->fname)) + { + /* different filename so must reread stylesheet */ + xfree(tinfo->fname); + tinfo->fname = xstrdup(args); + if (tinfo->stylesheet_xsp) + xsltFreeStylesheet(tinfo->stylesheet_xsp); + tinfo->stylesheet_xsp = + xsltParseStylesheetFile((const xmlChar*) tinfo->fname); + } +} + +static void filter_destroy(void *clientData) +{ + struct filter_info *tinfo = clientData; + if (tinfo->stylesheet_xsp) + xsltFreeStylesheet(tinfo->stylesheet_xsp); + xfree(tinfo->fname); + xfree(tinfo); +} + +static int ioread_ex(void 
*context, char *buffer, int len) +{ + struct recExtractCtrl *p = context; + return (*p->readf)(p->fh, buffer, len); +} + +static int ioclose_ex(void *context) +{ + return 0; +} + +static void index_field(struct filter_info *tinfo, struct recExtractCtrl *ctrl, + xmlNodePtr ptr, RecWord *recWord) +{ + for(; ptr; ptr = ptr->next) + { + index_field(tinfo, ctrl, ptr->children, recWord); + if (ptr->type != XML_TEXT_NODE) + continue; + recWord->term_buf = ptr->content; + recWord->term_len = strlen(ptr->content); + (*ctrl->tokenAdd)(recWord); + } +} + +static void index_node(struct filter_info *tinfo, struct recExtractCtrl *ctrl, + xmlNodePtr ptr, RecWord *recWord) +{ + for(; ptr; ptr = ptr->next) + { + index_node(tinfo, ctrl, ptr->children, recWord); + if (ptr->type != XML_ELEMENT_NODE || !ptr->ns || + strcmp(ptr->ns->href, zebra_index_ns)) + continue; + if (!strcmp(ptr->name, "index")) + { + char *field_str = 0; + const char *xpath_str = 0; + struct _xmlAttr *attr; + for (attr = ptr->properties; attr; attr = attr->next) + { + if (!strcmp(attr->name, "field") + && attr->children && attr->children->type == XML_TEXT_NODE) + field_str = attr->children->content; + if (!strcmp(attr->name, "xpath") + && attr->children && attr->children->type == XML_TEXT_NODE) + xpath_str = attr->children->content; + } + if (field_str) + { + recWord->attrStr = field_str; + index_field(tinfo, ctrl, ptr->children, recWord); + } + } + } +} + +static int filter_extract(void *clientData, struct recExtractCtrl *p) +{ + static const char *params[] = { + "schema", "'http://indexdata.dk/zebra/indexing/1'", + 0 + }; + struct filter_info *tinfo = clientData; + RecWord recWord; + int ret; + + if (p->first_record) + { + if (tinfo->reader) + xmlFreeTextReader(tinfo->reader); + tinfo->reader = xmlReaderForIO(ioread_ex, ioclose_ex, + p /* I/O handler */, + 0 /* URL */, + 0 /* encoding */, + XML_PARSE_XINCLUDE); + } + if (!tinfo->reader) + return RECCTRL_EXTRACT_ERROR_GENERIC; + + if (!tinfo->stylesheet_xsp) + 
return RECCTRL_EXTRACT_ERROR_GENERIC; + + (*p->init)(p, &recWord); + recWord.reg_type = 'w'; + + ret = xmlTextReaderRead(tinfo->reader); + while (ret == 1) { + int type = xmlTextReaderNodeType(tinfo->reader); + int depth = xmlTextReaderDepth(tinfo->reader); + if (tinfo->split_depth == 0 || + (type == XML_READER_TYPE_ELEMENT && tinfo->split_depth == depth)) + { + xmlChar *buf_out; + int len_out; + + xmlNodePtr ptr = xmlTextReaderExpand(tinfo->reader); + xmlNodePtr ptr2 = xmlCopyNode(ptr, 1); + xmlDocPtr doc = xmlNewDoc("1.0"); + + xmlDocSetRootElement(doc, ptr2); + + if (tinfo->stylesheet_xsp) + { + xmlDocPtr resDoc = + xsltApplyStylesheet(tinfo->stylesheet_xsp, + doc, params); + if (p->flagShowRecords) + { + xmlDocDumpMemory(resDoc, &buf_out, &len_out); + fwrite(buf_out, len_out, 1, stdout); + xmlFree(buf_out); + } + index_node(tinfo, p, xmlDocGetRootElement(resDoc), &recWord); + xmlFreeDoc(resDoc); + } + xmlDocDumpMemory(doc, &buf_out, &len_out); + if (p->flagShowRecords) + fwrite(buf_out, len_out, 1, stdout); + (*p->setStoreData)(p, buf_out, len_out); + xmlFree(buf_out); + + xmlFreeDoc(doc); + return RECCTRL_EXTRACT_OK; + } + ret = xmlTextReaderRead(tinfo->reader); + } + xmlFreeTextReader(tinfo->reader); + tinfo->reader = 0; + return RECCTRL_EXTRACT_EOF; +} + +static int ioread_ret(void *context, char *buffer, int len) +{ + struct recRetrieveCtrl *p = context; + return (*p->readf)(p->fh, buffer, len); +} + +static int ioclose_ret(void *context) +{ + return 0; +} + +static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) +{ + static const char *params[] = { + "schema", "'F'", + 0 + }; + struct filter_info *tinfo = clientData; + xmlDocPtr resDoc; + xmlDocPtr doc; + + if (p->comp) + { + const char *esn; + char *esn_quoted; + if (p->comp->which != Z_RecordComp_simple + || p->comp->u.simple->which != Z_ElementSetNames_generic) + { + p->diagnostic = YAZ_BIB1_PRESENT_COMP_SPEC_PARAMETER_UNSUPP; + return 0; + } + esn = p->comp->u.simple->u.generic; + 
esn_quoted = odr_malloc(p->odr, 3 + strlen(esn)); + sprintf(esn_quoted, "'%s'", esn); + params[1] = esn_quoted; + } + if (!tinfo->stylesheet_xsp) + { + p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; + return 0; + } + doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */, + 0 /* URL */, + 0 /* encoding */, + XML_PARSE_XINCLUDE); + if (!doc) + { + p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; + return 0; + } + resDoc = xsltApplyStylesheet(tinfo->stylesheet_xsp, + doc, params); + if (!resDoc) + { + p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; + } + else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML) + { + xmlChar *buf_out; + int len_out; + xmlDocDumpMemory(resDoc, &buf_out, &len_out); + + p->output_format = VAL_TEXT_XML; + p->rec_len = len_out; + p->rec_buf = odr_malloc(p->odr, p->rec_len); + memcpy(p->rec_buf, buf_out, p->rec_len); + + xmlFree(buf_out); + } + else if (p->output_format == VAL_SUTRS) + { + xmlChar *buf_out; + int len_out; + xmlDocDumpMemory(resDoc, &buf_out, &len_out); + + p->output_format = VAL_SUTRS; + p->rec_len = len_out; + p->rec_buf = odr_malloc(p->odr, p->rec_len); + memcpy(p->rec_buf, buf_out, p->rec_len); + + xmlFree(buf_out); + } + else + { + p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP; + } + xmlFreeDoc(resDoc); + xmlFreeDoc(doc); + return 0; +} + +static struct recType filter_type = { + 0, + "xslt", + filter_init, + filter_config, + filter_destroy, + filter_extract, + filter_retrieve +}; + +RecType +#ifdef IDZEBRA_STATIC_XSLT +idzebra_filter_xslt +#else +idzebra_filter +#endif + +[] = { + &filter_type, + 0, +}; diff --git a/test/Makefile.am b/test/Makefile.am index 9bfea70..49d3e4a 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS=codec api xpath gils malxml config usmarc dmoz sort sort2 xelm cddb \ - rusmarc zsh marcxml charmap mbox espec +SUBDIRS=codec api xslt xpath gils malxml config usmarc dmoz sort \ + sort2 xelm cddb rusmarc zsh 
marcxml charmap mbox espec diff --git a/test/marcxml/Makefile.am b/test/marcxml/Makefile.am index f5f28ec..3ad42ff 100644 --- a/test/marcxml/Makefile.am +++ b/test/marcxml/Makefile.am @@ -1,4 +1,4 @@ -# $Id: Makefile.am,v 1.3 2004-12-02 12:04:49 adam Exp $ +# $Id: Makefile.am,v 1.4 2005-04-28 08:20:41 adam Exp $ check_PROGRAMS = t1 t2 @@ -9,7 +9,7 @@ EXTRA_DIST = zebra.cfg record.abs m1.xml m2.xml m3.xml sample-marc t1_SOURCES = t1.c t2_SOURCES = t2.c -AM_CPPFLAGS = -I$(top_srcdir)/include $(YAZINC) +AM_CPPFLAGS = -I$(top_srcdir)/include -I$(srcdir)/../api $(YAZINC) zebralibs = \ ../../index/libidzebra-api.la \ diff --git a/test/marcxml/t1.c b/test/marcxml/t1.c index 303bf3b..36a9c4d 100644 --- a/test/marcxml/t1.c +++ b/test/marcxml/t1.c @@ -1,4 +1,4 @@ -/* $Id: t1.c,v 1.4 2005-01-15 19:38:36 adam Exp $ +/* $Id: t1.c,v 1.5 2005-04-28 08:20:41 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -20,7 +20,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#include "../api/testlib.h" +#include "testlib.h" int main(int argc, char **argv) { diff --git a/test/marcxml/t2.c b/test/marcxml/t2.c index 0d5e070..e947f51 100644 --- a/test/marcxml/t2.c +++ b/test/marcxml/t2.c @@ -1,4 +1,4 @@ -/* $Id: t2.c,v 1.3 2005-01-15 19:38:37 adam Exp $ +/* $Id: t2.c,v 1.4 2005-04-28 08:20:41 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -20,7 +20,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ -#include "../api/testlib.h" +#include "testlib.h" int main(int argc, char **argv) { diff --git a/test/xslt/Makefile.am b/test/xslt/Makefile.am new file mode 100644 index 0000000..606ca28 --- /dev/null +++ b/test/xslt/Makefile.am @@ -0,0 +1,26 @@ +# $Id: Makefile.am,v 1.1 2005-04-28 08:20:41 adam Exp $ + +check_PROGRAMS = xslt1 +TESTS = $(check_PROGRAMS) + +EXTRA_DIST=zebra.cfg marc-col.xml marc1.xsl + +xslt1_SOURCES = xslt1.c + +AM_CPPFLAGS = -I$(srcdir)/../api -I$(top_srcdir)/include $(YAZINC) + +zebralibs = \ + ../../index/libidzebra-api.la \ + ../../rset/libidzebra-rset.la \ + ../../recctrl/libidzebra-recctrl.la \ + ../../dict/libidzebra-dict.la \ + ../../isams/libidzebra-isams.la \ + ../../isamc/libidzebra-isamc.la \ + ../../isamb/libidzebra-isamb.la \ + ../../data1/libidzebra-data1.la \ + ../../bfile/libidzebra-bfile.la \ + ../../dfa/libidzebra-dfa.la \ + ../../util/libidzebra-util.la + +LDADD = ../api/libtestlib.a $(zebralibs) $(YAZLALIB) + diff --git a/test/xslt/id.xsl b/test/xslt/id.xsl new file mode 100644 index 0000000..21d6d6b --- /dev/null +++ b/test/xslt/id.xsl @@ -0,0 +1,16 @@ + + + + + + + + + + + + diff --git a/test/xslt/marc-col.xml b/test/xslt/marc-col.xml new file mode 100644 index 0000000..195be74 --- /dev/null +++ b/test/xslt/marc-col.xml @@ -0,0 +1,139 @@ + + + 00366nam 22001698a 4500 + 11224466 + DLC + 00000000000000.0 + 910710c19910701nju 00010 eng + + 11224466 + + + DLC + DLC + + + 123-xyz + + + Jack Collins + + + How to program a computer + + + Penguin + + + 8710 + + + p. cm. + + + + 00366nam 22001698a 4500 + 11224467 + DLC + 00000000000000.0 + 910710c19910701nju 00010 eng + + 11224467 + + + DLC + DLC + + + 123-xyz + + + Jack Collins + + + How to program a computer + + + Penguin + + + 8710 + + + p. cm. 
+ + + + 01369cam 2200265 i 4500 + 73090924 //r82 + DLC + 19820524000000.0 + 760609s1974 nyua b 10110 eng + + 73090924 //r82 + + + DLC + DLC + DLC + + + RC71.3 + .W67 1971 + + + 616.07/575/02854 + + + Workshop on Computer Processing of Dynamic Images from an Anger Scintillation Camera, + Washington University, + 1971. + + + Computer processing of dynamic images from an Anger scintillation camera : + the proceedings of a workshop / + cosponsored by the Biomedical Computer Laboratory and the Nuclear Medicine Division, Department of Radiology, School of Medicine, Washington University, St. Louis, January 18-22, 1971 ; edited by Kenneth B. Larson, Jerome R. Cox, Jr. + + + New York : + Society of Nuclear Medicine, + [c1974] + + + xiv, p. : + ill. ; + 24 cm. + + + Includes bibliographical references and index. + + + Radioisotope scanning + Data processing + Congresses. + + + Scintillation cameras + Congresses. + + + Imaging systems in medicine + Data processing + Congresses. + + + Larson, Kenneth B. + + + Cox, Jerome R. + + + Washington University, St. Louis. + Biomedical Computer Laboratory. + + + Washington University, St. Louis. + Nuclear Medicine Division. + + + diff --git a/test/xslt/marc1.xsl b/test/xslt/marc1.xsl new file mode 100644 index 0000000..87d4e4c --- /dev/null +++ b/test/xslt/marc1.xsl @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + diff --git a/test/xslt/zebra.cfg b/test/xslt/zebra.cfg new file mode 100644 index 0000000..743434f --- /dev/null +++ b/test/xslt/zebra.cfg @@ -0,0 +1,5 @@ +profilePath: ${srcdir:-.}/../../tab + +modulePath: ../../recctrl/.libs + +recordType: xslt.marc1.xsl -- 1.7.10.4