From acb63831cbd1595325151f0829b8dc9ae3eb8fbe Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Mon, 5 Dec 2005 12:18:40 +0000 Subject: [PATCH] added first version of CSV GRS filter still missing is - build of GRS data1 node tree to be given to indexer - control of how to call back zebra such that each record per line gets indexed on its own - configuration file reading and config parameter passing --- configure.in | 3 +- recctrl/Makefile.am | 10 +- recctrl/csvread.c | 330 +++++++++++++++++++++++++++++++++++++++++++++++++++ recctrl/recctrl.c | 9 +- 4 files changed, 349 insertions(+), 3 deletions(-) create mode 100644 recctrl/csvread.c diff --git a/configure.in b/configure.in index 9382e5c..ea84476 100644 --- a/configure.in +++ b/configure.in @@ -1,5 +1,5 @@ dnl Zebra, Index Data ApS, 1995-2005 -dnl $Id: configure.in,v 1.128 2005-08-19 21:42:17 adam Exp $ +dnl $Id: configure.in,v 1.129 2005-12-05 12:18:40 marc Exp $ dnl AC_INIT(include/idzebra/version.h) AM_INIT_AUTOMAKE(idzebra,1.4.0) @@ -292,6 +292,7 @@ ZEBRA_MODULE(text,static, [ --enable-mod-text Text filter]) ZEBRA_MODULE(grs-sgml,static,[ --enable-mod-grs-sgml Simple SGML/XML filter]) ZEBRA_MODULE(grs-regx,shared,[ --enable-mod-grs-regx REGX/TCL filter]) ZEBRA_MODULE(grs-marc,shared,[ --enable-mod-grs-marc MARC filter]) +ZEBRA_MODULE(grs-csv,shared, [ --enable-mod-grs-csv CSV filter]) ZEBRA_MODULE(grs-danbib,shared,[ --enable-mod-grs-danbib DanBib filter (DBC)]) ZEBRA_MODULE(safari,shared, [ --enable-mod-safari Safari filter (DBC)]) if test "$ac_cv_header_expat_h" = "yes"; then diff --git a/recctrl/Makefile.am b/recctrl/Makefile.am index 32191f4..522ed9d 100644 --- a/recctrl/Makefile.am +++ b/recctrl/Makefile.am @@ -1,4 +1,4 @@ -## $Id: Makefile.am,v 1.17 2005-04-28 08:20:40 adam Exp $ +## $Id: Makefile.am,v 1.18 2005-12-05 12:18:40 marc Exp $ common_libs = libidzebra-recctrl.la \ ../data1/libidzebra-data1.la \ @@ -22,6 +22,11 @@ mod_grs_marc_la_LDFLAGS = -rpath $(pkglibdir) -module -avoid-version 
mod_grs_marc_la_LADD = mod_grs_marc_la_LIBADD = $(common_libs) $(mod_grs_marc_la_LADD) +mod_grs_csv_la_SOURCES = csvread.c +mod_grs_csv_la_LDFLAGS = -rpath $(pkglibdir) -module -avoid-version +mod_grs_csv_la_LADD = +mod_grs_csv_la_LIBADD = $(common_libs) $(mod_grs_csv_la_LADD) + mod_grs_danbib_la_SOURCES = danbibr.c mod_grs_danbib_la_LDFLAGS = -rpath $(pkglibdir) -module -avoid-version mod_grs_danbib_la_LADD = @@ -47,6 +52,7 @@ EXTRA_LTLIBRARIES = \ mod-grs-regx.la \ mod-grs-xml.la \ mod-grs-marc.la \ + mod-grs-csv.la \ mod-grs-danbib.la \ mod-safari.la \ mod-alvis.la \ @@ -61,12 +67,14 @@ libidzebra_recctrl_la_LIBADD = $(STATIC_MODULE_OBJ) \ ../dfa/libidzebra-dfa.la \ ../util/libidzebra-util.la \ $(STATIC_MODULE_LADD) + libidzebra_recctrl_la_DEPENDENCIES = $(STATIC_MODULE_OBJ) EXTRA_libidzebra_recctrl_la_SOURCES = \ $(mod_grs_regx_la_SOURCES) \ $(mod_grs_xml_la_SOURCES) \ $(mod_grs_marc_la_SOURCES) \ + $(mod_grs_csv_la_SOURCES) \ $(mod_grs_danbib_la_SOURCES) \ $(mod_safari_la_SOURCES) \ $(mod_alvis_la_SOURCES) \ diff --git a/recctrl/csvread.c b/recctrl/csvread.c new file mode 100644 index 0000000..0703d42 --- /dev/null +++ b/recctrl/csvread.c @@ -0,0 +1,330 @@ +/* $Id: csvread.c,v 1.1 2005-12-05 12:18:41 marc Exp $ + Copyright (C) 1995-2005 + Index Data ApS + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. 
+*/
+
+
+/* NOTE(review): the header name after each #include below is missing in this copy of the patch. */
+#include
+#include
+#include
+
+/* #include */
+#include
+#include
+
+/* #include */
+#include
+
+/* Unused draft of a reader-state struct, left commented out:
+struct csv_getc_info {
+    char *buf;
+    int buf_size;
+    int size;
+    int off;
+    off_t moffset;
+    void *fh;
+    int (*readf)(void *, char *, size_t);
+    WRBUF wrbuf;
+};
+*/
+/* State of one grs.csv filter instance: created by grs_init_csv, filled in by grs_config_csv. */
+static struct csv_t {       /* NOTE(review): static does nothing here, no object is declared */
+    NMEM nmem;              /* pool holding this struct and every buffer below */
+    int buf_size;           /* size of buf in bytes (set to 64) */
+    char *buf;              /* chunk most recently read from the input */
+    int name_size;          /* set to 256; only referenced by the disabled (#if 0) code */
+    int value_size;         /* capacity of value in bytes (set to 4096) */
+    char *value;            /* text of the field currently being collected */
+    char field_char;        /* field separator (set to the pipe character) */
+    char record_char;       /* record terminator (set to newline) */
+    char string_char;       /* set to 0 and not used yet; presumably a quote character - TODO confirm */
+    char *root_element;     /* set to csv; so far only used in the warning of grs_read_csv */
+    int field_line;         /* non-zero: the first line of the input is a header line */
+    int lower_case;         /* non-zero: header line characters are lower-cased */
+    int max_nr_fields;      /* number of slots in field_name (set to 512) */
+    int nr_fields;          /* fields counted on the header line */
+    /* char *field_names; */
+    char **field_name;      /* max_nr_fields pointers, all set to 0; names are not stored yet */
+};
+/* Config callback: stores hard-coded defaults in the struct csv_t passed as clientData and */
+/* allocates its buffers from csvp->nmem. res is unused; args is logged but not parsed. */
+static void grs_config_csv(void *clientData, Res res, const char *args)
+{
+    int i;
+    struct csv_t *csvp = (struct csv_t*) clientData;
+
+    yaz_log (YLOG_LOG, "Called CSV filter grs_config_csv");
+    yaz_log (YLOG_LOG, "'%s'", args);
+
+    csvp->buf_size = 64;        /* bytes requested per readf call in grs_read_csv */
+    csvp->buf = nmem_malloc(csvp->nmem, csvp->buf_size);
+    csvp->name_size = 256;
+    csvp->value_size = 4096;
+    csvp->value = nmem_malloc(csvp->nmem, csvp->value_size);
+
+    csvp->field_char = '|';
+    csvp->record_char = '\n';
+    csvp->string_char = 0;
+    csvp->root_element = nmem_strdup(csvp->nmem, "csv");
+    csvp->field_line = 1;
+    csvp->lower_case = 1;
+    csvp->max_nr_fields = 512;
+    csvp->nr_fields = 0;
+    /* csvp->field_names = 0; */ /*nmem_strdup(csvp->nmem, "a|b|c|d|e");*/
+
+    csvp->field_name
+        = nmem_malloc(csvp->nmem,
+                      sizeof(*(csvp->field_name)) * csvp->max_nr_fields);
+    for (i = 0; i < csvp->max_nr_fields; i++){    /* start with no field names known */
+        csvp->field_name[i] = 0;
+    }
+
+    /* know field names from config file */
+    /*if (strlen(csvp->field_names))
+      yaz_log (YLOG_LOG, "CSV filter grs_config_csv field names");
+    */
+
+    yaz_log (YLOG_LOG, "Ended CSV filter grs_config_csv");
+}
+/* Read callback passed to zebra_grs_extract and zebra_grs_retrieve: parses input in buf_size chunks until a record ends. */
+/* NOTE(review): root_node is never assigned (the data1 calls are commented out), so 0 is always returned. */
+static data1_node *grs_read_csv (struct grs_read_info *gri)
+{
+    data1_node *root_node = 0;
+    data1_node *node = 0;       /* only used by the commented-out code near the end */
+    struct csv_t *csvp = (struct csv_t *)gri->clientData;
+    int field_nr = 0;           /* index of the field being collected on the current line */
+    int end_of_record = 0;
+    int read_header = 0;        /* non-zero while the header line is being parsed */
+    int read_bytes = 0;         /* result of the latest readf call */
+    char *cb = csvp->buf;       /* read cursor in buf */
+    char *cv = csvp->value;     /* write cursor in value */
+
+    yaz_log (YLOG_LOG, "Called CSV filter grs_read_csv");
+
+    /* if on start of first line, read header line for dynamic configure */
+    if(csvp->field_line && gri->offset == 0)
+        read_header = 1;
+
+    while (!end_of_record){
+        /* The #if 0 block is disabled draft code; it uses old_nr_fields, field_value and offset, none of which are declared. */
+#if 0
+        /* configure grs.csv filter with first line in file containing field
+           name information */
+        if (read_header){
+            yaz_log (YLOG_LOG, "CSV filter grs_read_csv reading header line");
+
+            /* create new memory for fieldname and value */
+            if (old_nr_fields < csvp->nr_fields){
+                yaz_log(YLOG_LOG,
+                        "CSV filter grs_read_csv name:'%d' ", csvp->nr_fields);
+                old_nr_fields = csvp->nr_fields;
+                csvp->field_name[csvp->nr_fields]
+                    = nmem_malloc(csvp->nmem, csvp->name_size);
+                csvp->field_value[csvp->nr_fields]
+                    = nmem_malloc(csvp->nmem, csvp->value_size);
+
+                /* read buf and copy values to field_name[] */
+                read_bytes = (*gri->readf)(gri->fh, csvp->buf, csvp->buf_size);
+                gri-> offset = (*gri->tellf)(gri->fh);
+                /* yaz_log(YLOG_LOG, "CSV filter grs_read_csv offset:'%d' ", offset); */
+                read_header = 0;
+            }
+        } else {
+            /* read buf and copy values to field_value[] */
+            read_bytes = (*gri->readf)(gri->fh, csvp->buf, csvp->buf_size);
+            gri->offset = (*gri->tellf)(gri->fh);
+            yaz_log(YLOG_LOG, "CSV filter grs_read_csv offset:'%d' ", offset);
+        }
+
+#endif
+        /* NOTE(review): end_of_record is set only when a record_char is met in the inner loop below. */
+        /* If readf returns 0 or less, or a field fills value completely, this outer loop never exits. */
+        /* read new buffer from file */
+        read_bytes = (*gri->readf)(gri->fh, csvp->buf, csvp->buf_size);
+
+        yaz_log (YLOG_LOG, "CSV filter grs_read_csv read_bytes %d", read_bytes);
+        yaz_log (YLOG_LOG, "CSV filter grs_read_csv csvp->buf %s", csvp->buf);
+        /* NOTE(review): nothing here NUL-terminates buf, so the %s above can read past the data just read. */
+        gri->offset = (*gri->tellf)(gri->fh);
+        yaz_log(YLOG_LOG, "CSV filter grs_read_csv gri->offset:'%d' ",
+                (int)gri->offset);
+
+        /* work on buffer */
+        cb = csvp->buf;
+        while ((cb - csvp->buf < read_bytes)
+               && (cv - csvp->value < csvp->value_size)
+               && !end_of_record){
+
+            if (*cb == csvp->field_char){
+                /* if field finished */
+                *cv = '\0';
+                if (read_header){
+                    /* read field names from header line; NOTE(review): names are only counted, never stored in field_name */
+                    if (csvp->nr_fields < csvp->max_nr_fields){
+                        csvp->nr_fields++;
+                        yaz_log (YLOG_LOG, "CSV filter grs_read_csv header %d %s",
+                                 field_nr, csvp->value);
+                    } else {
+                        yaz_log (YLOG_WARN, "CSV filter grs_read_csv header %d %s "
+                                 "exceeds configured max number of fields %d",
+                                 field_nr, csvp->value, csvp->max_nr_fields);
+                    }
+                } else {
+                    /* process following value line fields */
+                    if (field_nr < csvp->nr_fields){
+                        /* field number is within the number of header fields */
+                        yaz_log (YLOG_LOG, "CSV filter grs_read_csv value %d %s",
+                                 field_nr, csvp->value);
+                    } else {
+                        /* too many fields */
+                        yaz_log (YLOG_WARN, "CSV filter grs_read_csv value %d %s "
+                                 "exceeds dynamic configured number of fields %d",
+                                 field_nr, csvp->value, csvp->nr_fields);
+                    }
+
+                }
+                cb++;
+                cv = csvp->value;
+                field_nr++;
+            } else if (*cb == csvp->record_char){
+                /* if record finished; NOTE(review): the last field of a line ends here and is neither counted nor logged */
+                *cv = '\0';
+                cb++;
+                cv = csvp->value;
+                field_nr = 0;
+                if (read_header){
+                    read_header = 0;
+                    yaz_log (YLOG_LOG, "CSV filter grs_read_csv header end");
+                } else {
+                    end_of_record = 1;
+                    yaz_log (YLOG_LOG, "CSV filter grs_read_csv record end");
+                }
+            } else {
+                /* just plain char to be stored in value */
+                if (csvp->lower_case && read_header){
+                    *cv = tolower(*cb);     /* NOTE(review): plain char argument; a negative value is undefined for tolower */
+                } else {
+                    *cv = *cb;
+                }
+                cb++;
+                cv++;
+            }
+        }
+        /* NOTE(review): bytes left in buf after the record_char are not looked at again by this function; */
+        /* confirm how the caller repositions the input (the endf call below is commented out). */
+        /* if (gri->endf)
+           (*gri->endf)(gri->fh, offset - 1); */
+    }
+
+    /* try to build GRS node and document */
+    /*
+    root_node = data1_mk_root(gri->dh, gri->mem, cvsp->root_name);
+    node = data1_mk_node2(gri->dh, gri->mem, DATA1N_data, root_node);
+    node = data1_mk_tag(gri->dh, gri->mem, "pr_name_gn", 0, node);
+    data1_mk_text_n(gri->dh, gri->mem, csvp->buf, read_bytes, node);
+    */
+    if (!root_node){
+        yaz_log (YLOG_WARN, "empty CSV record of type '%s' "
+                 "near file offset %d "
+                 "or missing abstract syntax file '%s.abs'",
+                 csvp->root_element, (int)gri->offset, csvp->root_element);
+        return 0;
+    }
+
+    yaz_log (YLOG_LOG, "Ended CSV filter grs_read_csv");
+    return root_node;
+}
+/* Init callback: creates an NMEM pool and allocates the struct csv_t from it; only nmem is set here. res and recType are unused. */
+static void *grs_init_csv(Res res, RecType recType)
+{
+    yaz_log (YLOG_LOG, "Called CSV filter grs_init_csv");
+    NMEM m = nmem_create();     /* NOTE(review): declaration after a statement requires C99 */
+    struct csv_t *csvp = (struct csv_t *) nmem_malloc(m, sizeof(*csvp));
+    csvp->nmem = m;
+    yaz_log (YLOG_LOG, "Ended CSV filter grs_init_csv");
+    return csvp;
+}
+/* Destroy callback: destroys the NMEM pool, which also holds the struct csv_t itself (see grs_init_csv). */
+static void grs_destroy_csv(void *clientData)
+{
+    struct csv_t *csvp = (struct csv_t*) clientData;
+
+    yaz_log (YLOG_LOG, "Called CSV filter grs_destroy_csv");
+
+    nmem_destroy(csvp->nmem);
+    clientData = 0;             /* NOTE(review): no effect, only the local parameter copy changes */
+
+    yaz_log (YLOG_LOG, "Ended CSV filter grs_destroy_csv");
+}
+/* Extract callback: logs fh and offset, delegates to zebra_grs_extract with grs_read_csv as reader, returns its result. */
+static int grs_extract_csv(void *clientData, struct recExtractCtrl *ctrl)
+{
+    int res;
+    /* struct csv_t *csvp = (struct csv_t*) clientData; */
+
+    yaz_log (YLOG_LOG, "Called CSV filter grs_extract_csv");
+    yaz_log (YLOG_LOG, "recExtractCtr fh %d", (int)ctrl->fh);   /* NOTE(review): confirm the type of fh; a pointer would be truncated by this cast */
+    yaz_log (YLOG_LOG, "recExtractCtr offset %d", (int)ctrl->offset);
+
+    res = zebra_grs_extract(clientData, ctrl, grs_read_csv);
+
+    yaz_log (YLOG_LOG, "recExtractCtr fh %d", (int)ctrl->fh);
+    yaz_log (YLOG_LOG, "recExtractCtr offset %d", (int)ctrl->offset);
+    yaz_log (YLOG_LOG, "Ended CSV filter grs_extract_csv");
+
+    return res;
+}
+/* Retrieve callback: delegates to zebra_grs_retrieve with grs_read_csv as reader and returns its result. */
+static int grs_retrieve_csv(void *clientData, struct recRetrieveCtrl *ctrl)
+{
+    int res;
+    /* struct csv_t *csvp = (struct csv_t*) clientData; */
+
+    yaz_log (YLOG_LOG, "Called CSV filter grs_retrieve_csv");
+    res = zebra_grs_retrieve(clientData, ctrl, grs_read_csv);
+    yaz_log (YLOG_LOG, "Ended CSV filter grs_retrieve_csv");
+
+    return res;
+}
+/* Record type descriptor for grs.csv: a leading 0, the filter name, and the five callbacks defined above. */
+static struct recType grs_type_csv =
+{
+    0,
+    "grs.csv",
+    grs_init_csv,
+    grs_config_csv,
+    grs_destroy_csv,
+    grs_extract_csv,
+    grs_retrieve_csv
+};
+/* Exported 0-terminated table of descriptors; its name depends on IDZEBRA_STATIC_GRS_CSV (static vs. loadable build). */
+RecType
+#ifdef IDZEBRA_STATIC_GRS_CSV
+idzebra_filter_grs_csv
+#else
+idzebra_filter
+#endif
+
+[] = {
+    &grs_type_csv,
+    0,
+};
diff --git a/recctrl/recctrl.c b/recctrl/recctrl.c
index 98c4076..e1d949b 100644
--- a/recctrl/recctrl.c
+++ b/recctrl/recctrl.c
@@ -1,4 +1,4 @@
-/* $Id: recctrl.c,v 1.21 2005-08-30 12:27:18
adam Exp $ +/* $Id: recctrl.c,v 1.22 2005-12-05 12:18:41 marc Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -98,6 +98,13 @@ RecTypeClass recTypeClass_create (Res res, NMEM nmem) recTypeClass_add (&rts, idzebra_filter_grs_marc, nmem, 0); } #endif +#ifdef IDZEBRA_STATIC_GRS_CSV + if (1) + { + extern RecType idzebra_filter_grs_csv[]; + recTypeClass_add (&rts, idzebra_filter_grs_csv, nmem, 0); + } +#endif #ifdef IDZEBRA_STATIC_GRS_DANBIB if (1) { -- 1.7.10.4