From acb63831cbd1595325151f0829b8dc9ae3eb8fbe Mon Sep 17 00:00:00 2001
From: Marc Cromme <marc@indexdata.dk>
Date: Mon, 5 Dec 2005 12:18:40 +0000
Subject: [PATCH] added first version of CSV GRS filter; still missing are -
 build of GRS data1 node tree to be given to indexer -
 control of how to call back zebra such that each record per
 line gets indexed on its own - configuration file
 reading and config parameter passing

---
 configure.in        |    3 +-
 recctrl/Makefile.am |   10 +-
 recctrl/csvread.c   |  330 +++++++++++++++++++++++++++++++++++++++++++++++++++
 recctrl/recctrl.c   |    9 +-
 4 files changed, 349 insertions(+), 3 deletions(-)
 create mode 100644 recctrl/csvread.c

diff --git a/configure.in b/configure.in
index 9382e5c..ea84476 100644
--- a/configure.in
+++ b/configure.in
@@ -1,5 +1,5 @@
 dnl Zebra, Index Data ApS, 1995-2005
-dnl $Id: configure.in,v 1.128 2005-08-19 21:42:17 adam Exp $
+dnl $Id: configure.in,v 1.129 2005-12-05 12:18:40 marc Exp $
 dnl
 AC_INIT(include/idzebra/version.h)
 AM_INIT_AUTOMAKE(idzebra,1.4.0)
@@ -292,6 +292,7 @@ ZEBRA_MODULE(text,static,    [  --enable-mod-text       Text filter])
 ZEBRA_MODULE(grs-sgml,static,[  --enable-mod-grs-sgml   Simple SGML/XML filter])
 ZEBRA_MODULE(grs-regx,shared,[  --enable-mod-grs-regx   REGX/TCL filter])
 ZEBRA_MODULE(grs-marc,shared,[  --enable-mod-grs-marc   MARC filter])
+ZEBRA_MODULE(grs-csv,shared, [  --enable-mod-grs-csv    CSV filter])
 ZEBRA_MODULE(grs-danbib,shared,[  --enable-mod-grs-danbib DanBib filter (DBC)])
 ZEBRA_MODULE(safari,shared,  [  --enable-mod-safari Safari filter (DBC)])
 if test "$ac_cv_header_expat_h" = "yes"; then
diff --git a/recctrl/Makefile.am b/recctrl/Makefile.am
index 32191f4..522ed9d 100644
--- a/recctrl/Makefile.am
+++ b/recctrl/Makefile.am
@@ -1,4 +1,4 @@
-## $Id: Makefile.am,v 1.17 2005-04-28 08:20:40 adam Exp $
+## $Id: Makefile.am,v 1.18 2005-12-05 12:18:40 marc Exp $
 
 common_libs = libidzebra-recctrl.la \
  ../data1/libidzebra-data1.la \
@@ -22,6 +22,11 @@ mod_grs_marc_la_LDFLAGS = -rpath $(pkglibdir) -module -avoid-version
 mod_grs_marc_la_LADD =
 mod_grs_marc_la_LIBADD = $(common_libs) $(mod_grs_marc_la_LADD)
 
+mod_grs_csv_la_SOURCES = csvread.c
+mod_grs_csv_la_LDFLAGS = -rpath $(pkglibdir) -module -avoid-version
+mod_grs_csv_la_LADD =
+mod_grs_csv_la_LIBADD = $(common_libs) $(mod_grs_csv_la_LADD)
+
 mod_grs_danbib_la_SOURCES = danbibr.c
 mod_grs_danbib_la_LDFLAGS = -rpath $(pkglibdir) -module -avoid-version
 mod_grs_danbib_la_LADD =
@@ -47,6 +52,7 @@ EXTRA_LTLIBRARIES = \
 	mod-grs-regx.la \
 	mod-grs-xml.la \
 	mod-grs-marc.la  \
+	mod-grs-csv.la  \
 	mod-grs-danbib.la \
 	mod-safari.la \
 	mod-alvis.la \
@@ -61,12 +67,14 @@ libidzebra_recctrl_la_LIBADD = $(STATIC_MODULE_OBJ) \
  ../dfa/libidzebra-dfa.la  \
  ../util/libidzebra-util.la \
  $(STATIC_MODULE_LADD)
+
 libidzebra_recctrl_la_DEPENDENCIES = $(STATIC_MODULE_OBJ)
 
 EXTRA_libidzebra_recctrl_la_SOURCES = \
  $(mod_grs_regx_la_SOURCES) \
  $(mod_grs_xml_la_SOURCES) \
  $(mod_grs_marc_la_SOURCES) \
+ $(mod_grs_csv_la_SOURCES) \
  $(mod_grs_danbib_la_SOURCES) \
  $(mod_safari_la_SOURCES) \
  $(mod_alvis_la_SOURCES) \
diff --git a/recctrl/csvread.c b/recctrl/csvread.c
new file mode 100644
index 0000000..0703d42
--- /dev/null
+++ b/recctrl/csvread.c
@@ -0,0 +1,330 @@
+/* $Id: csvread.c,v 1.1 2005-12-05 12:18:41 marc Exp $
+   Copyright (C) 1995-2005
+   Index Data ApS
+
+This file is part of the Zebra server.
+
+Zebra is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with Zebra; see the file LICENSE.zebra.  If not, write to the
+Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+02111-1307, USA.
+*/
+
+
+
+#include <yaz/log.h>
+#include <yaz/nmem.h>
+#include <yaz/yaz-util.h>
+
+/* #include <d1_absyn.h> */
+#include <idzebra/data1.h>
+#include <idzebra/recgrs.h>
+
+/* #include <assert.h> */
+#include <ctype.h>
+
+/*
+struct csv_getc_info {
+    char *buf;
+    int buf_size;
+    int size;
+    int off;
+    off_t moffset;
+    void *fh;
+    int (*readf)(void *, char *, size_t);
+    WRBUF wrbuf;
+};
+*/
+
+/* State of one grs.csv filter instance; lives in the NMEM pool `nmem`. */
+struct csv_t {
+  NMEM nmem;           /* pool this struct and its buffers are allocated from */
+  int buf_size;        /* size of buf in bytes */
+  char *buf;           /* raw buffer filled by the read callback */
+  int name_size;       /* size for a field name buffer (no active user yet) */
+  int value_size;      /* size of value in bytes */
+  char *value;         /* scratch buffer for the field being parsed */
+  char field_char;     /* character that ends a field */
+  char record_char;    /* character that ends a record */
+  char string_char;    /* presumably a string quote char; not used yet */
+  char *root_element;  /* record type name, reported in warnings */
+  int field_line;      /* non-zero: first line of a file is the header line */
+  int lower_case;      /* non-zero: header chars are lower-cased */
+  int max_nr_fields;   /* number of slots in field_name */
+  int nr_fields;       /* number of fields counted on the header line */
+  char **field_name;   /* array of max_nr_fields field name pointers */
+};
+
+
+/* Config callback: loads built-in defaults into the csv_t in clientData. */
+static void grs_config_csv(void *clientData, Res res, const char *args)
+{
+  struct csv_t *cfg = (struct csv_t*) clientData;
+  int slot;
+
+  yaz_log (YLOG_LOG, "Called CSV filter grs_config_csv");
+  yaz_log (YLOG_LOG, "'%s'", args);
+
+  /* raw read buffer and scratch buffer for the field being parsed */
+  cfg->buf_size = 64;
+  cfg->buf = nmem_malloc(cfg->nmem, cfg->buf_size);
+  cfg->name_size = 256;
+  cfg->value_size = 4096;
+  cfg->value = nmem_malloc(cfg->nmem, cfg->value_size);
+
+  /* separators: '|' ends a field, newline ends a record */
+  cfg->field_char = '|';
+  cfg->record_char = '\n';
+  cfg->string_char = 0;
+  cfg->root_element = nmem_strdup(cfg->nmem, "csv");
+
+  /* header line handling and field bookkeeping */
+  cfg->field_line = 1;
+  cfg->lower_case = 1;
+  cfg->max_nr_fields = 512;
+  cfg->nr_fields = 0;
+
+  /* table of field name slots, all empty to begin with */
+  cfg->field_name = nmem_malloc(cfg->nmem,
+                                sizeof(*cfg->field_name) * cfg->max_nr_fields);
+  for (slot = 0; slot < cfg->max_nr_fields; slot++)
+    cfg->field_name[slot] = 0;
+
+  /* TODO: take the field names from the config file (res/args unused) */
+
+  yaz_log (YLOG_LOG, "Ended CSV filter grs_config_csv");
+}
+
+
+/* Log one completed field, held NUL terminated in csvp->value.  On the
+   header line the field is counted in csvp->nr_fields, up to
+   csvp->max_nr_fields; on a value line its index is checked against the
+   number of fields counted in the header. */
+static void csv_field_done (struct csv_t *csvp, int read_header, int field_nr)
+{
+  if (read_header){
+    /* read field names from header line */
+    if (csvp->nr_fields < csvp->max_nr_fields){
+      csvp->nr_fields++;
+      yaz_log (YLOG_LOG, "CSV filter grs_read_csv header %d %s",
+               field_nr, csvp->value);
+    } else {
+      yaz_log (YLOG_WARN, "CSV filter grs_read_csv header %d %s "
+               "exceeds configured max number of fields %d",
+               field_nr, csvp->value, csvp->max_nr_fields);
+    }
+  } else if (field_nr < csvp->nr_fields){
+    /* process following value line fields */
+    yaz_log (YLOG_LOG, "CSV filter grs_read_csv value %d %s",
+             field_nr, csvp->value);
+  } else {
+    /* too many fields */
+    yaz_log (YLOG_WARN, "CSV filter grs_read_csv value %d %s "
+             "exceeds dynamic configured number of fields %d",
+             field_nr, csvp->value, csvp->nr_fields);
+  }
+}
+
+
+/* Reader callback handed to zebra_grs_extract() and zebra_grs_retrieve():
+   parses one CSV record from gri->fh, using the csv_t in gri->clientData.
+   When csvp->field_line is set and gri->offset is 0, the first line is
+   consumed as the header line and its fields are counted.
+
+   Returns the root of the record's data1 tree, or 0 if there is none.
+   Building the tree is not implemented yet, so for now every call logs a
+   warning and returns 0.
+
+   NOTE(review): input is read in chunks of csvp->buf_size bytes and the
+   bytes following the record separator in the last chunk are dropped,
+   although the file position is already past them - the next call
+   therefore resumes in the middle of a record.  Needs a seek back or a
+   carry-over buffer once the callback protocol is settled.
+
+   NOTE(review): a field that ends at the record separator is neither
+   counted nor logged; only fields ending in csvp->field_char are.  Confirm
+   whether the input is expected to carry a trailing field separator. */
+static data1_node *grs_read_csv (struct grs_read_info *gri)
+{
+  data1_node *root_node = 0;
+  struct csv_t *csvp = (struct csv_t *)gri->clientData;
+  int field_nr = 0;
+  int end_of_record = 0;
+  int read_header = 0;
+  int read_bytes = 0;
+  char *cb = csvp->buf;
+  char *cv = csvp->value;
+  /* last slot of value, kept free for the terminating NUL */
+  char *cv_last = csvp->value + csvp->value_size - 1;
+
+  yaz_log (YLOG_LOG, "Called CSV filter grs_read_csv");
+
+  /* if on start of first line, read header line for dynamic configure */
+  if (csvp->field_line && gri->offset == 0){
+    read_header = 1;
+    /* count afresh so a csv_t reused for another file drops the old count */
+    csvp->nr_fields = 0;
+  }
+
+  while (!end_of_record){
+
+    /* read new buffer from file */
+    read_bytes = (*gri->readf)(gri->fh, csvp->buf, csvp->buf_size);
+    yaz_log (YLOG_LOG, "CSV filter grs_read_csv read_bytes  %d", read_bytes);
+
+    gri->offset = (*gri->tellf)(gri->fh);
+    yaz_log(YLOG_LOG, "CSV filter grs_read_csv gri->offset:'%d' ",
+            (int)gri->offset);
+
+    /* nothing read (end of input or error): without this the loop would
+       never terminate, since only a record separator ends it */
+    if (read_bytes <= 0)
+      break;
+
+    /* buf is not NUL terminated, so limit the output to read_bytes chars */
+    yaz_log (YLOG_LOG, "CSV filter grs_read_csv csvp->buf %.*s",
+             read_bytes, csvp->buf);
+
+    /* work on buffer */
+    cb = csvp->buf;
+    while ((cb - csvp->buf < read_bytes) && !end_of_record){
+
+      if (*cb == csvp->field_char){
+        /* if field finished */
+        *cv = '\0';
+        csv_field_done(csvp, read_header, field_nr);
+        cv = csvp->value;
+        field_nr++;
+      } else if (*cb == csvp->record_char){
+        /* if record finished */
+        *cv = '\0';
+        cv = csvp->value;
+        field_nr = 0;
+        if (read_header){
+          read_header = 0;
+          yaz_log (YLOG_LOG, "CSV filter grs_read_csv header end");
+        } else {
+          end_of_record = 1;
+          yaz_log (YLOG_LOG, "CSV filter grs_read_csv record end");
+        }
+      } else if (cv < cv_last){
+        /* just plain char to be stored in value; the cast keeps tolower()
+           defined for chars with a negative value */
+        if (csvp->lower_case && read_header){
+          *cv = tolower((unsigned char)*cb);
+        } else {
+          *cv = *cb;
+        }
+        cv++;
+      }
+      /* else: value is full; the char is dropped so that the field is
+         truncated rather than overrun */
+      cb++;
+    }
+
+  }
+
+  /* try to build GRS node and document - sketch only, not yet enabled;
+     it needs a local data1_node *node */
+  /*
+  root_node = data1_mk_root(gri->dh, gri->mem, cvsp->root_name);
+  node = data1_mk_node2(gri->dh, gri->mem, DATA1N_data, root_node);
+  node = data1_mk_tag(gri->dh, gri->mem, "pr_name_gn", 0, node);
+  data1_mk_text_n(gri->dh, gri->mem, csvp->buf, read_bytes, node);
+  */
+  if (!root_node){
+    yaz_log (YLOG_WARN, "empty CSV record of type '%s' "
+             "near file offset %d "
+             "or missing abstract syntax file '%s.abs'",
+             csvp->root_element, (int)gri->offset, csvp->root_element);
+    return 0;
+  }
+
+  yaz_log (YLOG_LOG, "Ended CSV filter grs_read_csv");
+  return root_node;
+}
+
+static void *grs_init_csv(Res res, RecType recType)
+{ /* Init callback: returns a csv_t allocated from its own NMEM pool. */
+  NMEM m = nmem_create();   /* declared first: C89 has no mixed decls */
+  struct csv_t *csvp = (struct csv_t *) nmem_malloc(m, sizeof(*csvp));
+  yaz_log (YLOG_LOG, "Called CSV filter grs_init_csv");
+  csvp->nmem = m;   /* kept so grs_destroy_csv can release the pool */
+  yaz_log (YLOG_LOG, "Ended CSV filter grs_init_csv");
+  return csvp;
+}
+
+/* Destroy callback: releases the filter state created by grs_init_csv. */
+static void grs_destroy_csv(void *clientData)
+{
+  struct csv_t *csvp = (struct csv_t*) clientData;
+
+  yaz_log (YLOG_LOG, "Called CSV filter grs_destroy_csv");
+  /* csvp itself lives in this pool, so it must not be touched afterwards */
+  nmem_destroy(csvp->nmem);
+
+  yaz_log (YLOG_LOG, "Ended CSV filter grs_destroy_csv");
+}
+
+/* Extract callback: runs zebra_grs_extract with grs_read_csv as reader,
+   logging ctrl->fh and ctrl->offset before and after the call. */
+static int grs_extract_csv(void *clientData, struct recExtractCtrl *ctrl)
+{
+  int status;
+
+  yaz_log (YLOG_LOG, "Called CSV filter grs_extract_csv");
+  yaz_log (YLOG_LOG, "recExtractCtr fh     %d", (int)ctrl->fh);
+  yaz_log (YLOG_LOG, "recExtractCtr offset %d", (int)ctrl->offset);
+
+  status = zebra_grs_extract(clientData, ctrl, grs_read_csv);
+
+  yaz_log (YLOG_LOG, "recExtractCtr fh     %d", (int)ctrl->fh);
+  yaz_log (YLOG_LOG, "recExtractCtr offset %d", (int)ctrl->offset);
+  yaz_log (YLOG_LOG, "Ended CSV filter grs_extract_csv");
+  return status;
+}
+
+/* Retrieve callback: delegates to zebra_grs_retrieve with grs_read_csv
+   as the record reader and passes its result through. */
+static int grs_retrieve_csv(void *clientData, struct recRetrieveCtrl *ctrl)
+{
+  int status;
+
+  yaz_log (YLOG_LOG, "Called CSV filter grs_retrieve_csv");
+  status = zebra_grs_retrieve(clientData, ctrl, grs_read_csv);
+  yaz_log (YLOG_LOG, "Ended CSV filter grs_retrieve_csv");
+  return status;
+}
+
+static struct recType grs_type_csv = /* listed in the RecType array below */
+{
+    0,                /* NOTE(review): meaning not visible here - confirm */
+    "grs.csv",        /* presumably the filter's name - confirm */
+    grs_init_csv,     /* allocates the csv_t filter state */
+    grs_config_csv,   /* fills the csv_t with defaults */
+    grs_destroy_csv,  /* destroys the csv_t's memory pool */
+    grs_extract_csv,  /* wraps zebra_grs_extract */
+    grs_retrieve_csv  /* wraps zebra_grs_retrieve */
+};
+
+RecType
+#ifdef IDZEBRA_STATIC_GRS_CSV
+idzebra_filter_grs_csv  /* name recTypeClass_create() declares extern */
+#else
+idzebra_filter  /* NOTE(review): presumably looked up in the module - confirm */
+#endif
+/* 0-terminated list of the record types defined in this file */
+[] = {
+    &grs_type_csv,
+    0,
+};
diff --git a/recctrl/recctrl.c b/recctrl/recctrl.c
index 98c4076..e1d949b 100644
--- a/recctrl/recctrl.c
+++ b/recctrl/recctrl.c
@@ -1,4 +1,4 @@
-/* $Id: recctrl.c,v 1.21 2005-08-30 12:27:18 adam Exp $
+/* $Id: recctrl.c,v 1.22 2005-12-05 12:18:41 marc Exp $
    Copyright (C) 1995-2005
    Index Data ApS
 
@@ -98,6 +98,13 @@ RecTypeClass recTypeClass_create (Res res, NMEM nmem)
 	recTypeClass_add (&rts, idzebra_filter_grs_marc, nmem, 0);
     }
 #endif
+#ifdef IDZEBRA_STATIC_GRS_CSV
+    if (1)
+    {
+	extern RecType idzebra_filter_grs_csv[];
+	recTypeClass_add (&rts, idzebra_filter_grs_csv, nmem, 0);
+    }
+#endif
 #ifdef IDZEBRA_STATIC_GRS_DANBIB
     if (1)
     {
-- 
1.7.10.4