Utility yaz-marcdump gained an option to display the YAZ version (-V)
[yaz-moved-to-github.git] / util / marcdump.c
index 3de8d76..43ff281 100644 (file)
@@ -1,21 +1,31 @@
-/*
- * Copyright (C) 1995-2005, Index Data ApS
+/* This file is part of the YAZ toolkit.
+ * Copyright (C) 1995-2009 Index Data
  * See the file LICENSE for details.
- *
- * $Id: marcdump.c,v 1.26 2005-01-15 19:47:15 adam Exp $
  */
 
+#define _FILE_OFFSET_BITS 64
+
 #if HAVE_CONFIG_H
 #include <config.h>
 #endif
 
-#if HAVE_XML2
+#if YAZ_HAVE_XML2
 #include <libxml/parser.h>
 #include <libxml/tree.h>
-
 #include <libxml/xpath.h>
 #include <libxml/xpathInternals.h>
 
+/* xmlreader is not reliable/present in Libxml2 versions before 2.6.15 */
+#if LIBXML_VERSION < 20615
+#define USE_XMLREADER 0
+#else
+#define USE_XMLREADER 1
+#endif
+
+#if USE_XMLREADER
+#include <libxml/xmlreader.h>
+#endif
+
 #endif
 
 #include <stdio.h>
 #define SEEK_END 2
 #endif
 
+
+static char *prog;
+
+/* Prints the command-line synopsis, prefixed with prog, to stderr. */
 static void usage(const char *prog)
 {
-    fprintf (stderr, "Usage: %s [-c cfile] [-f from] [-t to] [-x] [-O] [-X] [-I] [-v] file...\n",
+    fprintf (stderr, "Usage: %s [-i format] [-o format] [-f from] [-t to] "
+             "[-l pos=value] [-c cfile] [-s prefix] [-C size] [-n] "
+             "[-p] [-v] [-V] file...\n",
              prog);
 } 
 
-#if HAVE_XML2
-void print_xpath_nodes(xmlNodeSetPtr nodes, FILE* output) {
-    xmlNodePtr cur;
-    int size;
-    int i;
-    
-    assert(output);
-    size = (nodes) ? nodes->nodeNr : 0;
+/* Prints the YAZ version this program was compiled against and, when the
+ * SHA1 reported by yaz_version() differs from the compiled-in one, the
+ * version reported by yaz_version() as well.  Terminates with status 0. */
+static void show_version(void)
+{
+    char lib_version[20];
+    char lib_sha1[41];
+    int same_build;
+
+    yaz_version(lib_version, lib_sha1);
+    same_build = (strcmp(lib_sha1, YAZ_VERSION_SHA1) == 0);
+
+    printf("YAZ version: %s %s\n", YAZ_VERSION, YAZ_VERSION_SHA1);
+    if (!same_build)
+        printf("YAZ DLL/SO: %s %s\n", lib_version, lib_sha1);
+    exit(0);
+}
+
+/* Byte-source callback: fetches the next byte from the FILE handle passed
+ * as client_data.  An EOF result from fgetc() is reported as 0. */
+static int getbyte_stream(void *client_data)
+{
+    FILE *stream = (FILE *) client_data;
+    int ch = fgetc(stream);
+
+    return ch == EOF ? 0 : ch;
+}
+
+/* Push-back callback: returns byte c to the FILE handle passed as
+ * client_data.  A value of 0 is translated to EOF before ungetc() is
+ * called. */
+static void ungetbyte_stream(int c, void *client_data)
+{
+    FILE *stream = (FILE *) client_data;
+
+    ungetc(c == 0 ? EOF : c, stream);
+}
+
+/* Opens fname and calls yaz_marc_read_line() repeatedly for as long as it
+ * returns 0; after each successful call the record is rendered with
+ * yaz_marc_write_mode() into a fresh WRBUF and written to stdout.
+ * If the file cannot be opened a message is printed to stderr and the
+ * program exits with status 1. */
+static void marcdump_read_line(yaz_marc_t mt, const char *fname)
+{
+    FILE *inf = fopen(fname, "rb");
+    if (!inf)
+    {
+        fprintf (stderr, "%s: cannot open %s:%s\n",
+                 prog, fname, strerror (errno));
+        exit(1);
+    }
     
-    fprintf(output, "Result (%d nodes):\n", size);
-    for(i = 0; i < size; ++i) {
-       assert(nodes->nodeTab[i]);
-       
-       if(nodes->nodeTab[i]->type == XML_NAMESPACE_DECL)
-       {
-           xmlNsPtr ns;
-           
-           ns = (xmlNsPtr)nodes->nodeTab[i];
-           cur = (xmlNodePtr)ns->next;
-           if(cur->ns) { 
-               fprintf(output, "= namespace \"%s\"=\"%s\" for node %s:%s\n", 
-                   ns->prefix, ns->href, cur->ns->href, cur->name);
-           } else {
-               fprintf(output, "= namespace \"%s\"=\"%s\" for node %s\n", 
-                   ns->prefix, ns->href, cur->name);
-           }
-       } 
-       else if(nodes->nodeTab[i]->type == XML_ELEMENT_NODE)
-       {
-           cur = nodes->nodeTab[i];        
-           if(cur->ns) { 
-               fprintf(output, "= element node \"%s:%s\"\n", 
-                   cur->ns->href, cur->name);
-           } 
-           else
-           {
-               fprintf(output, "= element node \"%s\"\n", 
-                   cur->name);
-           }
-       }
-       else
-       {
-           cur = nodes->nodeTab[i];    
-           fprintf(output, "= node \"%s\": type %d\n", cur->name, cur->type);
-       }
+    while (yaz_marc_read_line(mt, getbyte_stream,
+                              ungetbyte_stream, inf) == 0)
+    {
+        WRBUF wrbuf = wrbuf_alloc();
+        yaz_marc_write_mode(mt, wrbuf);
+        fputs(wrbuf_cstr(wrbuf), stdout);
+        wrbuf_destroy(wrbuf);
     }
+    fclose(inf);
+}
+
+#if YAZ_HAVE_XML2
+/* Reads XML input from fname and, for every element named "record" that
+ * it visits, decodes it with yaz_marc_read_xml() and writes it to stdout
+ * using yaz_marc_write_mode().  Records that fail to decode are reported
+ * on stderr and skipped.  Nothing is written by this function when the
+ * file cannot be opened/parsed.
+ *
+ * With USE_XMLREADER the file is streamed through xmlTextReader;
+ * otherwise the whole document is parsed and the root element (and the
+ * children of a "collection" element) are walked. */
+static void marcdump_read_xml(yaz_marc_t mt, const char *fname)
+{
+    WRBUF wrbuf = wrbuf_alloc();
+#if USE_XMLREADER
+    xmlTextReaderPtr reader = xmlReaderForFile(fname, 0 /* encoding */,
+                                               0 /* options */);
+
+    if (reader)
+    {
+        int ret;
+        while ((ret = xmlTextReaderRead(reader)) == 1)
+        {
+            int type = xmlTextReaderNodeType(reader);
+            if (type == XML_READER_TYPE_ELEMENT)
+            {
+                /* xmlTextReaderLocalName returns a copy owned by the
+                   caller; it is released with xmlFree below */
+                xmlChar *name = xmlTextReaderLocalName(reader);
+                if (name && !strcmp((const char *) name, "record"))
+                {
+                    xmlNodePtr ptr = xmlTextReaderExpand(reader);
+                    /* a failed expand is reported like a failed read */
+                    int r = ptr ? yaz_marc_read_xml(mt, ptr) : -1;
+                    if (r)
+                        fprintf(stderr, "yaz_marc_read_xml failed\n");
+                    else
+                    {
+                        yaz_marc_write_mode(mt, wrbuf);
+
+                        fputs(wrbuf_cstr(wrbuf), stdout);
+                        wrbuf_rewind(wrbuf);
+                    }
+                }
+                if (name)
+                    xmlFree(name);
+            }
+        }
+        xmlFreeTextReader(reader);
+    }
+#else
+    xmlDocPtr doc = xmlParseFile(fname);
+    if (doc)
+    {
+        xmlNodePtr ptr = xmlDocGetRootElement(doc);
+        while (ptr)
+        {
+            if (ptr->type == XML_ELEMENT_NODE)
+            {
+                if (!strcmp((const char *) ptr->name, "collection"))
+                {
+                    /* descend and examine the first child itself; do not
+                       advance to its sibling, and stop cleanly when the
+                       collection has no children */
+                    ptr = ptr->children;
+                    continue;
+                }
+                if (!strcmp((const char *) ptr->name, "record"))
+                {
+                    int r = yaz_marc_read_xml(mt, ptr);
+                    if (r)
+                        fprintf(stderr, "yaz_marc_read_xml failed\n");
+                    else
+                    {
+                        yaz_marc_write_mode(mt, wrbuf);
+
+                        fputs(wrbuf_cstr(wrbuf), stdout);
+                        wrbuf_rewind(wrbuf);
+                    }
+                }
+            }
+            ptr = ptr->next;
+        }
+        xmlFreeDoc(doc);
+    }
+#endif
+    fputs(wrbuf_cstr(wrbuf), stdout);
+    wrbuf_destroy(wrbuf);
 }
 #endif
 
+/* Converts the records of one input file and writes them to stdout.
+ *
+ *   fname                input file name
+ *   from, to             character sets; conversion is only set up when
+ *                        both are given
+ *   input_format         YAZ_MARC_* value selecting the reader; MARCXML and
+ *                        MarcXchange are only read when YAZ_HAVE_XML2 is set
+ *   output_format        passed to yaz_marc_xml()
+ *   write_using_libxml2  passed to yaz_marc_write_using_libxml2()
+ *   print_offset         non-zero: print record offsets (ISO2709 input)
+ *   split_fname          if non-NULL, prefix of the files that the raw
+ *                        ISO2709 records are copied to
+ *   split_chunk          records per split file; values below 1 are
+ *                        treated as 1
+ *   verbose              passed to yaz_marc_debug(); also enables extra
+ *                        diagnostics on stdout
+ *   cfile                if non-NULL, ISO2709 records are additionally
+ *                        written there as a C string array
+ *   leader_spec          passed to yaz_marc_leader_spec()
+ *
+ * Exits with status 2 on a bad leader spec or an unsupported character
+ * set conversion, and with status 1 when an ISO2709 input file cannot be
+ * opened. */
+static void dump(const char *fname, const char *from, const char *to,
+                 int input_format, int output_format,
+                 int write_using_libxml2,
+                 int print_offset, const char *split_fname, int split_chunk,
+                 int verbose, FILE *cfile, const char *leader_spec)
+{
+    yaz_marc_t mt = yaz_marc_create();
+    yaz_iconv_t cd = 0;
+
+    /* split_chunk is used as a modulus below; 0 would divide by zero */
+    if (split_chunk < 1)
+        split_chunk = 1;
+
+    if (yaz_marc_leader_spec(mt, leader_spec))
+    {
+        fprintf(stderr, "bad leader spec: %s\n", leader_spec);
+        yaz_marc_destroy(mt);
+        exit(2);
+    }
+    if (from && to)
+    {
+        cd = yaz_iconv_open(to, from);
+        if (!cd)
+        {
+            fprintf(stderr, "conversion from %s to %s "
+                    "unsupported\n", from, to);
+            yaz_marc_destroy(mt);
+            exit(2);
+        }
+        yaz_marc_iconv(mt, cd);
+    }
+    yaz_marc_xml(mt, output_format);
+    yaz_marc_enable_collection(mt);
+    yaz_marc_write_using_libxml2(mt, write_using_libxml2);
+    yaz_marc_debug(mt, verbose);
+
+    if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_XCHANGE)
+    {
+#if YAZ_HAVE_XML2
+        marcdump_read_xml(mt, fname);
+#endif
+    }
+    else if (input_format == YAZ_MARC_LINE)
+    {
+        marcdump_read_line(mt, fname);
+    }
+    else if (input_format == YAZ_MARC_ISO2709)
+    {
+        FILE *inf = fopen(fname, "rb");
+        int num = 1;
+        int marc_no = 0;
+        int split_file_no = -1;
+        if (!inf)
+        {
+            fprintf (stderr, "%s: cannot open %s:%s\n",
+                     prog, fname, strerror (errno));
+            exit(1);
+        }
+        if (cfile)
+            fprintf (cfile, "char *marc_records[] = {\n");
+        for(;; marc_no++)
+        {
+            const char *result = 0;
+            size_t len;
+            size_t rlen;
+            size_t len_result;
+            size_t r;
+            int decoded;
+            char buf[100001];
+
+            /* the first 5 bytes of a record hold its length in decimal */
+            r = fread (buf, 1, 5, inf);
+            if (r < 5)
+            {
+                if (r && print_offset && verbose)
+                    printf ("<!-- Extra %ld bytes at end of file -->\n",
+                            (long) r);
+                break;
+            }
+            /* resynchronise: drop leading bytes until buf starts with a
+               digit, shifting one new byte in for each one dropped */
+            while (*buf < '0' || *buf > '9')
+            {
+                int i;
+                long off = ftell(inf) - 5;
+                if (verbose || print_offset)
+                    printf("<!-- Skipping bad byte %d (0x%02X) at offset "
+                           "%ld (0x%lx) -->\n",
+                           *buf & 0xff, *buf & 0xff,
+                           off, off);
+                for (i = 0; i<4; i++)
+                    buf[i] = buf[i+1];
+                r = fread(buf+4, 1, 1, inf);
+                if (r < 1)
+                    break;
+            }
+            if (r < 1)
+            {
+                if (verbose || print_offset)
+                    printf ("<!-- End of file with data -->\n");
+                break;
+            }
+            if (print_offset)
+            {
+                long off = ftell(inf) - 5;
+                printf ("<!-- Record %d offset %ld (0x%lx) -->\n",
+                        num, off, off);
+            }
+            len = atoi_n(buf, 5);
+            if (len < 25 || len > 100000)
+            {
+                long off = ftell(inf) - 5;
+                printf("Bad Length %ld read at offset %ld (%lx)\n",
+                       (long)len, (long) off, (long) off);
+                break;
+            }
+            rlen = len - 5;
+            r = fread (buf + 5, 1, rlen, inf);
+            if (r < rlen)
+                break;
+            /* the stated length may be too short: keep reading until the
+               record separator is seen or buf is full */
+            while (buf[len-1] != ISO2709_RS)
+            {
+                if (len > sizeof(buf)-2)
+                    break;
+                r = fread (buf + len, 1, 1, inf);
+                if (r != 1)
+                    break;
+                len++;
+            }
+            if (split_fname)
+            {
+                char split_path[256];
+                const char *mode = 0;
+                FILE *sf;
+                /* start a new split file every split_chunk records,
+                   otherwise append to the current one */
+                if ((marc_no % split_chunk) == 0)
+                {
+                    mode = "wb";
+                    split_file_no++;
+                }
+                else
+                    mode = "ab";
+                sprintf(split_path, "%.200s%07d", split_fname,
+                        split_file_no);
+                sf = fopen(split_path, mode);
+                if (!sf)
+                {
+                    fprintf(stderr, "Could not open %s\n", split_path);
+                    split_fname = 0;
+                }
+                else
+                {
+                    if (fwrite(buf, 1, len, sf) != len)
+                    {
+                        fprintf(stderr, "Could not write content to %s\n",
+                                split_path);
+                        split_fname = 0;
+                    }
+                    fclose(sf);
+                }
+            }
+            len_result = rlen;
+            /* yaz_marc_decode_buf returns int; keep it signed so that a
+               negative (error) result is not mistaken for a huge count */
+            decoded = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
+            /* fwrite returns 0 for a zero-sized item, so an empty result
+               must be skipped rather than reported as a write failure */
+            if (decoded > 0 && result && len_result)
+            {
+                if (fwrite(result, len_result, 1, stdout) != 1)
+                {
+                    fprintf(stderr, "Write to stdout failed\n");
+                    break;
+                }
+            }
+            if (decoded > 0 && cfile)
+            {
+                const char *p = buf;
+                size_t n = (size_t) decoded;
+                size_t i;
+                if (marc_no)
+                    fprintf (cfile, ",");
+                fprintf (cfile, "\n");
+                /* 16 escaped bytes per string literal line */
+                for (i = 0; i < n; i++)
+                {
+                    if ((i & 15) == 0)
+                        fprintf (cfile, "  \"");
+                    fprintf (cfile, "\\x%02X", p[i] & 255);
+
+                    if (i < n - 1 && (i & 15) == 15)
+                        fprintf (cfile, "\"\n");
+
+                }
+                fprintf (cfile, "\"\n");
+            }
+            num++;
+            if (verbose)
+                printf("\n");
+        }
+        if (cfile)
+            fprintf (cfile, "};\n");
+        fclose(inf);
+    }
+    {
+        WRBUF wrbuf = wrbuf_alloc();
+        yaz_marc_write_trailer(mt, wrbuf);
+        fputs(wrbuf_cstr(wrbuf), stdout);
+        wrbuf_destroy(wrbuf);
+    }
+    if (cd)
+        yaz_iconv_close(cd);
+    yaz_marc_destroy(mt);
+}
+
 int main (int argc, char **argv)
 {
     int r;
-    int libxml_dom_test = 0;
     int print_offset = 0;
     char *arg;
     int verbose = 0;
-    FILE *inf;
-    char buf[100001];
-    char *prog = *argv;
     int no = 0;
-    int xml = 0;
+    int output_format = YAZ_MARC_LINE;
     FILE *cfile = 0;
     char *from = 0, *to = 0;
-    int num = 1;
-    
+    int input_format = YAZ_MARC_ISO2709;
+    int split_chunk = 1;
+    const char *split_fname = 0;
+    const char *leader_spec = 0;
+    int write_using_libxml2 = 0;
+
 #if HAVE_LOCALE_H
     setlocale(LC_CTYPE, "");
 #endif
@@ -123,180 +412,126 @@ int main (int argc, char **argv)
 #endif
 #endif
 
-    while ((r = options("pvc:xOXIf:t:2", argv, argc, &arg)) != -2)
+    prog = *argv;
+    while ((r = options("i:o:C:npc:xOeXIf:t:s:l:Vv", argv, argc, &arg)) != -2)
     {
-       int count;
-       no++;
+        no++;
         switch (r)
         {
+        case 'i':
+            input_format = yaz_marc_decode_formatstr(arg);
+            if (input_format == -1)
+            {
+                fprintf(stderr, "%s: bad input format: %s\n", prog, arg);
+                exit(1);
+            }
+#if YAZ_HAVE_XML2
+#else
+            if (input_format == YAZ_MARC_MARCXML 
+                || input_format == YAZ_MARC_XCHANGE)
+            {
+                fprintf(stderr, "%s: Libxml2 support not enabled\n", prog);
+                exit(3);
+            }
+#endif
+            break;
+        case 'o':
+            /* dirty hack so we can make Libxml2 do the writing ..
+               rather than WRBUF */
+            if (strlen(arg) > 4 && strncmp(arg, "xml,", 4) == 0)
+            {
+                arg = arg + 4;
+                write_using_libxml2 = 1;
+            }
+            output_format = yaz_marc_decode_formatstr(arg);
+            if (output_format == -1)
+            {
+                fprintf(stderr, "%s: bad output format: %s\n", prog, arg);
+                exit(1);
+            }
+            break;
+        case 'l':
+            leader_spec = arg;
+            break;
         case 'f':
             from = arg;
             break;
         case 't':
             to = arg;
             break;
-       case 'c':
-           if (cfile)
-               fclose (cfile);
-           cfile = fopen (arg, "w");
-           break;
+        case 'c':
+            if (cfile)
+                fclose (cfile);
+            cfile = fopen(arg, "w");
+            break;
         case 'x':
-            xml = YAZ_MARC_SIMPLEXML;
+            fprintf(stderr, "%s: -x no longer supported. "
+                    "Use -i marcxml instead\n", prog);
+            exit(1);
             break;
         case 'O':
-            xml = YAZ_MARC_OAIMARC;
+            fprintf(stderr, "%s: OAI MARC no longer supported."
+                    " Use MARCXML instead.\n", prog);
+            exit(1);
+            break;
+        case 'e':
+            fprintf(stderr, "%s: -e no longer supported. "
+                    "Use -o marcxchange instead\n", prog);
+            exit(1);
             break;
         case 'X':
-            xml = YAZ_MARC_MARCXML;
+            fprintf(stderr, "%s: -X no longer supported. "
+                    "Use -o marcxml instead\n", prog);
+            exit(1);
+            break;
+        case 'I':
+            fprintf(stderr, "%s: -I no longer supported. "
+                    "Use -o marc instead\n", prog);
+            exit(1);
+            break;
+        case 'n':
+            output_format = YAZ_MARC_CHECK;
+            break;
+        case 'p':
+            print_offset = 1;
+            break;
+        case 's':
+            split_fname = arg;
+            break;
+        case 'C':
+            split_chunk = atoi(arg);
             break;
-       case 'I':
-           xml = YAZ_MARC_ISO2709;
-           break;
-       case 'p':
-           print_offset = 1;
-           break;
-       case '2':
-           libxml_dom_test = 1;
-           break;
         case 0:
-           inf = fopen (arg, "rb");
-           count = 0;
-           if (!inf)
-           {
-               fprintf (stderr, "%s: cannot open %s:%s\n",
-                        prog, arg, strerror (errno));
-               exit(1);
-           }
-           if (cfile)
-               fprintf (cfile, "char *marc_records[] = {\n");
-            if (1)
-            {
-                yaz_marc_t mt = yaz_marc_create();
-                yaz_iconv_t cd = 0;
-
-                if (from && to)
-                {
-                    cd = yaz_iconv_open(to, from);
-                    if (!cd)
-                    {
-                        fprintf(stderr, "conversion from %s to %s "
-                                "unsupported\n", from, to);
-                        exit(2);
-                    }
-                   yaz_marc_iconv(mt, cd);
-                }
-                yaz_marc_xml(mt, xml);
-                yaz_marc_debug(mt, verbose);
-                while (1)
-                {
-                    int len;
-                    char *result;
-                    int rlen;
-                    
-                    r = fread (buf, 1, 5, inf);
-                    if (r < 5)
-                   {
-                       if (r && print_offset)
-                           printf ("Extra %d bytes", r);
-                        break;
-                   }
-                   if (print_offset)
-                   {
-                       long off = ftell(inf);
-                       printf ("Record %d offset %ld\n", num, (long) off);
-                   }
-                    len = atoi_n(buf, 5);
-                    if (len < 25 || len > 100000)
-                        break;
-                    len = len - 5;
-                    r = fread (buf + 5, 1, len, inf);
-                    if (r < len)
-                        break;
-                    r = yaz_marc_decode_buf (mt, buf, -1, &result, &rlen);
-                    if (r <= 0)
-                        break;
-                   fwrite (result, rlen, 1, stdout);
-#if HAVE_XML2
-                   if (libxml_dom_test)
-                   {
-                       xmlDocPtr doc = xmlParseMemory(result, rlen);
-                       if (!doc)
-                           fprintf(stderr, "xmLParseMemory failed\n");
-                       else
-                       {
-                           int i;
-                           xmlXPathContextPtr xpathCtx; 
-                           xmlXPathObjectPtr xpathObj; 
-                           static const char *xpathExpr[] = {
-                               "/record/datafield[@tag='245']/subfield[@code='a']",
-                               "/record/datafield[@tag='100']/subfield",
-                               "/record/datafield[@tag='245']/subfield[@code='a']",
-                               "/record/datafield[@tag='650']/subfield",
-                               "/record/datafield[@tag='650']",
-                               0};
-                           
-                           xpathCtx = xmlXPathNewContext(doc);
-
-                           for (i = 0; xpathExpr[i]; i++) {
-                               xpathObj = xmlXPathEvalExpression(xpathExpr[i], xpathCtx);
-                               if(xpathObj == NULL) {
-                                   fprintf(stderr,"Error: unable to evaluate xpath expression \"%s\"\n", xpathExpr[i]);
-                               }
-                               else
-                               {
-                                   print_xpath_nodes(xpathObj->nodesetval, stdout);
-                                   xmlXPathFreeObject(xpathObj);
-                               }
-                           }
-                           xmlXPathFreeContext(xpathCtx); 
-                           xmlFreeDoc(doc);
-                       }
-                   }
-#endif
-                    if (cfile)
-                    {
-                        char *p = buf;
-                        int i;
-                        if (count)
-                            fprintf (cfile, ",");
-                        fprintf (cfile, "\n");
-                        for (i = 0; i < r; i++)
-                        {
-                            if ((i & 15) == 0)
-                                fprintf (cfile, "  \"");
-                            fprintf (cfile, "\\x%02X", p[i] & 255);
-                            
-                            if (i < r - 1 && (i & 15) == 15)
-                                fprintf (cfile, "\"\n");
-                            
-                       }
-                        fprintf (cfile, "\"\n");
-                    }
-                   num++;
-                }
-                count++;
-                if (cd)
-                    yaz_iconv_close(cd);
-                yaz_marc_destroy(mt);
-           }
-           if (cfile)
-               fprintf (cfile, "};\n");
-           fclose(inf);
+            dump(arg, from, to, input_format, output_format,
+                 write_using_libxml2,
+                 print_offset, split_fname, split_chunk,
+                 verbose, cfile, leader_spec);
             break;
         case 'v':
-           verbose++;
+            verbose++;
+            break;
+        case 'V': 
+            show_version();
             break;
         default:
             usage(prog);
-            exit (1);
+            exit(1);
         }
     }
     if (cfile)
-       fclose (cfile);
+        fclose (cfile);
     if (!no)
     {
         usage(prog);
-       exit (1);
+        exit (1);
     }
     exit (0);
 }
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+