Do not use Libxml2's reader if libxml2 version < 2.6.15.
[yaz-moved-to-github.git] / util / marcdump.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: marcdump.c,v 1.55 2007-12-18 21:13:06 adam Exp $
6  */
7
8 #define _FILE_OFFSET_BITS 64
9
10 #if HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #if YAZ_HAVE_XML2
15 #include <libxml/parser.h>
16 #include <libxml/tree.h>
17 #include <libxml/xpath.h>
18 #include <libxml/xpathInternals.h>
19
20 /* Libxml2 version < 2.6.15. xmlreader not reliable/present */
21 #if LIBXML_VERSION < 20615
22 #define USE_XMLREADER 0
23 #else
24 #define USE_XMLREADER 1
25 #endif
26
27 #if USE_XMLREADER
28 #include <libxml/xmlreader.h>
29 #endif
30
31 #endif
32
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <errno.h>
37 #include <assert.h>
38
39 #if HAVE_LOCALE_H
40 #include <locale.h>
41 #endif
42 #if HAVE_LANGINFO_H
43 #include <langinfo.h>
44 #endif
45
46 #include <yaz/marcdisp.h>
47 #include <yaz/yaz-util.h>
48 #include <yaz/xmalloc.h>
49 #include <yaz/options.h>
50
51 #ifndef SEEK_SET
52 #define SEEK_SET 0
53 #endif
54 #ifndef SEEK_END
55 #define SEEK_END 2
56 #endif
57
58
59 static char *prog;
60
/*
 * Print a one-line synopsis of the command line options on stderr.
 *
 * prog: program name shown at the start of the synopsis.
 *
 * The function only prints; callers decide whether to exit afterwards.
 */
static void usage(const char *prog)
{
    fprintf (stderr, "Usage: %s [-c cfile] [-f from] [-t to] "
             "[-i format] [-o format] "
             "[-n] [-l pos=value] [-v] [-C chunk] [-s splitfname] [-p] file...\n",
             prog);
}
68
/*
 * Byte-source callback handed to yaz_marc_read_line(): fetch the next byte
 * from the stdio stream passed as client_data.
 *
 * Returns the byte read, or 0 when fgetc() reports EOF.
 */
static int getbyte_stream(void *client_data)
{
    FILE *stream = (FILE *) client_data;
    const int ch = fgetc(stream);

    return ch == EOF ? 0 : ch;
}
78
/*
 * Push-back callback handed to yaz_marc_read_line(): return byte c to the
 * stdio stream passed as client_data.
 *
 * A value of 0 is translated to EOF before being given to ungetc(), so
 * pushing back 0 leaves the stream untouched.
 */
static void ungetbyte_stream(int c, void *client_data)
{
    FILE *stream = (FILE *) client_data;

    ungetc(c == 0 ? EOF : c, stream);
}
87
88 static void marcdump_read_line(yaz_marc_t mt, const char *fname)
89 {
90     FILE *inf = fopen(fname, "rb");
91     if (!inf)
92     {
93         fprintf (stderr, "%s: cannot open %s:%s\n",
94                  prog, fname, strerror (errno));
95         exit(1);
96     }
97     
98     while (yaz_marc_read_line(mt, getbyte_stream,
99                               ungetbyte_stream, inf) == 0)
100     {
101         WRBUF wrbuf = wrbuf_alloc();
102         yaz_marc_write_mode(mt, wrbuf);
103         fputs(wrbuf_cstr(wrbuf), stdout);
104         wrbuf_destroy(wrbuf);
105     }
106     {
107         WRBUF wrbuf = wrbuf_alloc();
108         yaz_marc_write_trailer(mt, wrbuf);
109         fputs(wrbuf_cstr(wrbuf), stdout);
110         wrbuf_destroy(wrbuf);
111     }
112     fclose(inf);
113 }
114
115 #if YAZ_HAVE_XML2
116 static void marcdump_read_xml(yaz_marc_t mt, const char *fname)
117 {
118     WRBUF wrbuf = wrbuf_alloc();
119 #if USE_XMLREADER
120     xmlTextReaderPtr reader = xmlReaderForFile(fname, 0 /* encoding */,
121                                                0 /* options */);
122
123     if (reader)
124     {
125         int ret;
126         while ((ret = xmlTextReaderRead(reader)) == 1)
127         {
128             int type = xmlTextReaderNodeType(reader);
129             if (type == XML_READER_TYPE_ELEMENT)
130             {
131                 const char *name = (const char *) 
132                     xmlTextReaderConstName(reader);
133                 if (!strcmp(name, "record"))
134                 {
135                     xmlNodePtr ptr = xmlTextReaderExpand(reader);
136         
137                     int r = yaz_marc_read_xml(mt, ptr);
138                     if (r)
139                         fprintf(stderr, "yaz_marc_read_xml failed\n");
140                     else
141                     {
142                         yaz_marc_write_mode(mt, wrbuf);
143                         
144                         fputs(wrbuf_cstr(wrbuf), stdout);
145                         wrbuf_rewind(wrbuf);
146                     }
147                 }
148             }
149         }
150         yaz_marc_write_trailer(mt, wrbuf);
151         fputs(wrbuf_cstr(wrbuf), stdout);
152     }
153 #else
154     xmlDocPtr doc = xmlParseFile(fname);
155     if (doc)
156     {
157         xmlNodePtr ptr = xmlDocGetRootElement(doc);
158         for (; ptr; ptr = ptr->next)
159         {
160             if (ptr->type == XML_ELEMENT_NODE)
161             {
162                 if (!strcmp((const char *) ptr->name, "collection"))
163                 {
164                     ptr = ptr->children;
165                     continue;
166                 }
167                 if (!strcmp((const char *) ptr->name, "record"))
168                 {
169                     int r = yaz_marc_read_xml(mt, ptr);
170                     if (r)
171                         fprintf(stderr, "yaz_marc_read_xml failed\n");
172                     else
173                     {
174                         yaz_marc_write_mode(mt, wrbuf);
175                         
176                         fputs(wrbuf_cstr(wrbuf), stdout);
177                         wrbuf_rewind(wrbuf);
178                     }
179                 }
180             }
181         }
182         xmlFreeDoc(doc);
183     }
184 #endif
185     yaz_marc_write_trailer(mt, wrbuf);
186     fputs(wrbuf_cstr(wrbuf), stdout);
187     wrbuf_destroy(wrbuf);
188 }
189 #endif
190
191 static void dump(const char *fname, const char *from, const char *to,
192                  int input_format, int output_format,
193                  int write_using_libxml2,
194                  int print_offset, const char *split_fname, int split_chunk,
195                  int verbose, FILE *cfile, const char *leader_spec)
196 {
197     yaz_marc_t mt = yaz_marc_create();
198     yaz_iconv_t cd = 0;
199
200     if (yaz_marc_leader_spec(mt, leader_spec))
201     {
202         fprintf(stderr, "bad leader spec: %s\n", leader_spec);
203         yaz_marc_destroy(mt);
204         exit(2);
205     }
206     if (from && to)
207     {
208         cd = yaz_iconv_open(to, from);
209         if (!cd)
210         {
211             fprintf(stderr, "conversion from %s to %s "
212                     "unsupported\n", from, to);
213             yaz_marc_destroy(mt);
214             exit(2);
215         }
216         yaz_marc_iconv(mt, cd);
217     }
218     yaz_marc_xml(mt, output_format);
219     yaz_marc_enable_collection(mt);
220     yaz_marc_write_using_libxml2(mt, write_using_libxml2);
221     yaz_marc_debug(mt, verbose);
222
223     if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_XCHANGE)
224     {
225 #if YAZ_HAVE_XML2
226         marcdump_read_xml(mt, fname);
227 #endif
228     }
229     else if (input_format == YAZ_MARC_LINE)
230     {
231         marcdump_read_line(mt, fname);
232     }
233     else if (input_format == YAZ_MARC_ISO2709)
234     {
235         FILE *inf = fopen(fname, "rb");
236         int num = 1;
237         int marc_no = 0;
238         int split_file_no = -1;
239         if (!inf)
240         {
241             fprintf (stderr, "%s: cannot open %s:%s\n",
242                      prog, fname, strerror (errno));
243             exit(1);
244         }
245         if (cfile)
246             fprintf (cfile, "char *marc_records[] = {\n");
247         for(;; marc_no++)
248         {
249             const char *result = 0;
250             size_t len;
251             size_t rlen;
252             size_t len_result;
253             size_t r;
254             char buf[100001];
255             
256             r = fread (buf, 1, 5, inf);
257             if (r < 5)
258             {
259                 if (r && print_offset && verbose)
260                     printf ("<!-- Extra %ld bytes at end of file -->\n",
261                             (long) r);
262                 break;
263             }
264             while (*buf < '0' || *buf > '9')
265             {
266                 int i;
267                 long off = ftell(inf) - 5;
268                 if (verbose || print_offset)
269                     printf("<!-- Skipping bad byte %d (0x%02X) at offset "
270                            "%ld (0x%lx) -->\n", 
271                            *buf & 0xff, *buf & 0xff,
272                            off, off);
273                 for (i = 0; i<4; i++)
274                     buf[i] = buf[i+1];
275                 r = fread(buf+4, 1, 1, inf);
276                 if (r < 1)
277                     break;
278             }
279             if (r < 1)
280             {
281                 if (verbose || print_offset)
282                     printf ("<!-- End of file with data -->\n");
283                 break;
284             }
285             if (print_offset)
286             {
287                 long off = ftell(inf) - 5;
288                 printf ("<!-- Record %d offset %ld (0x%lx) -->\n",
289                         num, off, off);
290             }
291             len = atoi_n(buf, 5);
292             if (len < 25 || len > 100000)
293             {
294                 long off = ftell(inf) - 5;
295                 printf("Bad Length %ld read at offset %ld (%lx)\n",
296                        (long)len, (long) off, (long) off);
297                 break;
298             }
299             rlen = len - 5;
300             r = fread (buf + 5, 1, rlen, inf);
301             if (r < rlen)
302                 break;
303             while (buf[len-1] != ISO2709_RS)
304             {
305                 if (len > sizeof(buf)-2)
306                     break;
307                 r = fread (buf + len, 1, 1, inf);
308                 if (r != 1)
309                     break;
310                 len++;
311             }
312             if (split_fname)
313             {
314                 char fname[256];
315                 const char *mode = 0;
316                 FILE *sf;
317                 if ((marc_no % split_chunk) == 0)
318                 {
319                     mode = "wb";
320                     split_file_no++;
321                 }
322                 else
323                     mode = "ab";
324                 sprintf(fname, "%.200s%07d", split_fname, split_file_no);
325                 sf = fopen(fname, mode);
326                 if (!sf)
327                 {
328                     fprintf(stderr, "Could not open %s\n", fname);
329                     split_fname = 0;
330                 }
331                 else
332                 {
333                     if (fwrite(buf, 1, len, sf) != len)
334                     {
335                         fprintf(stderr, "Could write content to %s\n",
336                                 fname);
337                         split_fname = 0;
338                     }
339                     fclose(sf);
340                 }
341             }
342             len_result = rlen;
343             r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
344             if (r > 0 && result)
345             {
346                 fwrite (result, len_result, 1, stdout);
347             }
348             if (r > 0 && cfile)
349             {
350                 char *p = buf;
351                 size_t i;
352                 if (marc_no)
353                     fprintf (cfile, ",");
354                 fprintf (cfile, "\n");
355                 for (i = 0; i < r; i++)
356                 {
357                     if ((i & 15) == 0)
358                         fprintf (cfile, "  \"");
359                     fprintf (cfile, "\\x%02X", p[i] & 255);
360                     
361                     if (i < r - 1 && (i & 15) == 15)
362                         fprintf (cfile, "\"\n");
363                     
364                 }
365                 fprintf (cfile, "\"\n");
366             }
367             num++;
368             if (verbose)
369                 printf("\n");
370         }
371         if (cfile)
372             fprintf (cfile, "};\n");
373         fclose(inf);
374     }
375     {
376         WRBUF wrbuf = wrbuf_alloc();
377         yaz_marc_write_trailer(mt, wrbuf);
378         fputs(wrbuf_cstr(wrbuf), stdout);
379         wrbuf_destroy(wrbuf);
380     }
381     if (cd)
382         yaz_iconv_close(cd);
383     yaz_marc_destroy(mt);
384 }
385
386 int main (int argc, char **argv)
387 {
388     int r;
389     int print_offset = 0;
390     char *arg;
391     int verbose = 0;
392     int no = 0;
393     int output_format = YAZ_MARC_LINE;
394     FILE *cfile = 0;
395     char *from = 0, *to = 0;
396     int input_format = YAZ_MARC_ISO2709;
397     int split_chunk = 1;
398     const char *split_fname = 0;
399     const char *leader_spec = 0;
400     int write_using_libxml2 = 0;
401
402 #if HAVE_LOCALE_H
403     setlocale(LC_CTYPE, "");
404 #endif
405 #if HAVE_LANGINFO_H
406 #ifdef CODESET
407     to = nl_langinfo(CODESET);
408 #endif
409 #endif
410
411     prog = *argv;
412     while ((r = options("i:o:C:npvc:xOeXIf:t:s:l:", argv, argc, &arg)) != -2)
413     {
414         no++;
415         switch (r)
416         {
417         case 'i':
418             input_format = yaz_marc_decode_formatstr(arg);
419             if (input_format == -1)
420             {
421                 fprintf(stderr, "%s: bad input format: %s\n", prog, arg);
422                 exit(1);
423             }
424 #if YAZ_HAVE_XML2
425 #else
426             if (input_format == YAZ_MARC_MARCXML 
427                 || input_format == YAZ_MARC_XCHANGE)
428             {
429                 fprintf(stderr, "%s: Libxml2 support not enabled\n", prog);
430                 exit(3);
431             }
432 #endif
433             break;
434         case 'o':
435             /* dirty hack so we can make Libxml2 do the writing ..
436                rather than WRBUF */
437             if (strlen(arg) > 4 && strncmp(arg, "xml,", 4) == 0)
438             {
439                 arg = arg + 4;
440                 write_using_libxml2 = 1;
441             }
442             output_format = yaz_marc_decode_formatstr(arg);
443             if (output_format == -1)
444             {
445                 fprintf(stderr, "%s: bad output format: %s\n", prog, arg);
446                 exit(1);
447             }
448             break;
449         case 'l':
450             leader_spec = arg;
451             break;
452         case 'f':
453             from = arg;
454             break;
455         case 't':
456             to = arg;
457             break;
458         case 'c':
459             if (cfile)
460                 fclose (cfile);
461             cfile = fopen(arg, "w");
462             break;
463         case 'x':
464             fprintf(stderr, "%s: -x no longer supported. "
465                     "Use -i marcxml instead\n", prog);
466             exit(1);
467             break;
468         case 'O':
469             fprintf(stderr, "%s: OAI MARC no longer supported."
470                     " Use MARCXML instead.\n", prog);
471             exit(1);
472             break;
473         case 'e':
474             fprintf(stderr, "%s: -e no longer supported. "
475                     "Use -o marcxchange instead\n", prog);
476             exit(1);
477             break;
478         case 'X':
479             fprintf(stderr, "%s: -X no longer supported. "
480                     "Use -o marcxml instead\n", prog);
481             exit(1);
482             break;
483         case 'I':
484             fprintf(stderr, "%s: -I no longer supported. "
485                     "Use -o marc instead\n", prog);
486             exit(1);
487             break;
488         case 'n':
489             output_format = YAZ_MARC_CHECK;
490             break;
491         case 'p':
492             print_offset = 1;
493             break;
494         case 's':
495             split_fname = arg;
496             break;
497         case 'C':
498             split_chunk = atoi(arg);
499             break;
500         case 0:
501             dump(arg, from, to, input_format, output_format,
502                  write_using_libxml2,
503                  print_offset, split_fname, split_chunk,
504                  verbose, cfile, leader_spec);
505             break;
506         case 'v':
507             verbose++;
508             break;
509         default:
510             usage(prog);
511             exit(1);
512         }
513     }
514     if (cfile)
515         fclose (cfile);
516     if (!no)
517     {
518         usage(prog);
519         exit (1);
520     }
521     exit (0);
522 }
523 /*
524  * Local variables:
525  * c-basic-offset: 4
526  * indent-tabs-mode: nil
527  * End:
528  * vim: shiftwidth=4 tabstop=8 expandtab
529  */
530