adfc0151f0f2388d5bf845c63c1c08f1c05d5092
[yaz-moved-to-github.git] / util / marcdump.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
5
6 #define _FILE_OFFSET_BITS 64
7
8 #if HAVE_CONFIG_H
9 #include <config.h>
10 #endif
11
12 #if YAZ_HAVE_XML2
13 #include <libxml/parser.h>
14 #include <libxml/tree.h>
15 #include <libxml/xpath.h>
16 #include <libxml/xpathInternals.h>
17
18 /* Libxml2 version < 2.6.15. xmlreader not reliable/present */
19 #if LIBXML_VERSION < 20615
20 #define USE_XMLREADER 0
21 #else
22 #define USE_XMLREADER 1
23 #endif
24
25 #if USE_XMLREADER
26 #include <libxml/xmlreader.h>
27 #endif
28
29 #endif
30
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <errno.h>
35 #include <assert.h>
36
37 #if HAVE_LOCALE_H
38 #include <locale.h>
39 #endif
40 #if HAVE_LANGINFO_H
41 #include <langinfo.h>
42 #endif
43
44 #include <yaz/marcdisp.h>
45 #include <yaz/json.h>
46 #include <yaz/yaz-util.h>
47 #include <yaz/xmalloc.h>
48 #include <yaz/options.h>
49 #include <yaz/backtrace.h>
50
51 #ifndef SEEK_SET
52 #define SEEK_SET 0
53 #endif
54 #ifndef SEEK_END
55 #define SEEK_END 2
56 #endif
57
58
/* Name this program was started as (argv[0]); used to prefix diagnostics. */
static char *prog;

/* Number of errors counted while processing input; when it is non-zero
 * at the end of main() the process exits with status 5. */
static int no_errors = 0;
62
/* Write the one-line command synopsis for program name `prog` to stderr. */
static void usage(const char *prog)
{
    static const char synopsis[] =
        "[-i format] [-o format] [-f from] [-t to] "
        "[-l pos=value] [-c cfile] [-s prefix] [-C size] [-n] "
        "[-p] [-v] [-V] file...";

    fprintf(stderr, "Usage: %s %s\n", prog, synopsis);
}
70
71 static void show_version(void)
72 {
73     char vstr[20], sha1_str[41];
74
75     yaz_version(vstr, sha1_str);
76     printf("YAZ version: %s %s\n", YAZ_VERSION, YAZ_VERSION_SHA1);
77     if (strcmp(sha1_str, YAZ_VERSION_SHA1))
78         printf("YAZ DLL/SO: %s %s\n", vstr, sha1_str);
79     exit(0);
80 }
81
/*
 * Byte-reader callback handed to yaz_marc_read_line().
 * client_data is the FILE to read from.  Returns the next byte, or 0 when
 * fgetc() reports EOF.
 */
static int getbyte_stream(void *client_data)
{
    int ch = fgetc((FILE *) client_data);

    return ch == EOF ? 0 : ch;
}
91
/*
 * Push-back callback handed to yaz_marc_read_line().
 * client_data is the FILE to push back onto.  A value of 0 (what
 * getbyte_stream() returns at EOF) is translated to EOF before it is
 * passed to ungetc().
 */
static void ungetbyte_stream(int c, void *client_data)
{
    FILE *stream = (FILE *) client_data;

    ungetc(c == 0 ? EOF : c, stream);
}
100
101 static void marcdump_read_line(yaz_marc_t mt, const char *fname)
102 {
103     FILE *inf = fopen(fname, "rb");
104     if (!inf)
105     {
106         fprintf(stderr, "%s: cannot open %s:%s\n",
107                 prog, fname, strerror(errno));
108         exit(1);
109     }
110
111     while (yaz_marc_read_line(mt, getbyte_stream,
112                               ungetbyte_stream, inf) == 0)
113     {
114         WRBUF wrbuf = wrbuf_alloc();
115         yaz_marc_write_mode(mt, wrbuf);
116         fputs(wrbuf_cstr(wrbuf), stdout);
117         wrbuf_destroy(wrbuf);
118     }
119     fclose(inf);
120 }
121
122 static void marcdump_read_json(yaz_marc_t mt, const char *fname)
123 {
124     FILE *inf = fopen(fname, "rb");
125     if (!inf)
126     {
127         fprintf(stderr, "%s: cannot open %s:%s\n",
128                 prog, fname, strerror(errno));
129         exit(1);
130     }
131     else
132     {
133         const char *errmsg;
134         size_t errpos;
135         WRBUF w = wrbuf_alloc();
136         struct json_node *n;
137         int c;
138
139         while ((c = getc(inf)) != EOF)
140             wrbuf_putc(w, c);
141         n = json_parse2(wrbuf_cstr(w), &errmsg, &errpos);
142         if (n)
143         {
144             int r = yaz_marc_read_json_node(mt, n);
145             if (r == 0)
146             {
147                 wrbuf_rewind(w);
148                 yaz_marc_write_mode(mt, w);
149                 fputs(wrbuf_cstr(w), stdout);
150                 wrbuf_rewind(w);
151             }
152             else
153             {
154                 fprintf(stderr, "%s: JSON MARC parsing failed ret=%d\n", fname,
155                         r);
156             }
157         }
158         else
159         {
160             fprintf(stderr, "%s: JSON parse error: %s . pos=%ld\n", fname,
161                     errmsg, (long) errpos);
162         }
163         wrbuf_destroy(w);
164         fclose(inf);
165     }
166 }
167
168
169 #if YAZ_HAVE_XML2
170 static void marcdump_read_xml(yaz_marc_t mt, const char *fname)
171 {
172     WRBUF wrbuf = wrbuf_alloc();
173 #if USE_XMLREADER
174     xmlTextReaderPtr reader = xmlReaderForFile(fname, 0 /* encoding */,
175                                                0 /* options */);
176
177     if (reader)
178     {
179         int ret;
180         while ((ret = xmlTextReaderRead(reader)) == 1)
181         {
182             int type = xmlTextReaderNodeType(reader);
183             if (type == XML_READER_TYPE_ELEMENT)
184             {
185                 char *name = (char *) xmlTextReaderLocalName(reader);
186                 if (!strcmp(name, "record") || !strcmp(name, "r"))
187                 {
188                     xmlNodePtr ptr = xmlTextReaderExpand(reader);
189
190                     int r = yaz_marc_read_xml(mt, ptr);
191                     if (r)
192                     {
193                         no_errors++;
194                         fprintf(stderr, "yaz_marc_read_xml failed\n");
195                     }
196                     else
197                     {
198                         int write_rc = yaz_marc_write_mode(mt, wrbuf);
199                         if (write_rc)
200                         {
201                             yaz_log(YLOG_WARN, "yaz_marc_write_mode: "
202                                     "write error: %d", write_rc);
203                             no_errors++;
204                         }
205                         fputs(wrbuf_cstr(wrbuf), stdout);
206                         wrbuf_rewind(wrbuf);
207                     }
208                 }
209                 xmlFree(name);
210             }
211         }
212         xmlFreeTextReader(reader);
213     }
214 #else
215     xmlDocPtr doc = xmlParseFile(fname);
216     if (doc)
217     {
218         xmlNodePtr ptr = xmlDocGetRootElement(doc);
219         for (; ptr; ptr = ptr->next)
220         {
221             if (ptr->type == XML_ELEMENT_NODE)
222             {
223                 if (!strcmp((const char *) ptr->name, "collection"))
224                 {
225                     ptr = ptr->children;
226                     continue;
227                 }
228                 if (!strcmp((const char *) ptr->name, "record") ||
229                     !strcmp((const char *) ptr->name, "r"))
230                 {
231                     int r = yaz_marc_read_xml(mt, ptr);
232                     if (r)
233                     {
234                         no_errors++;
235                         fprintf(stderr, "yaz_marc_read_xml failed\n");
236                     }
237                     else
238                     {
239                         yaz_marc_write_mode(mt, wrbuf);
240
241                         fputs(wrbuf_cstr(wrbuf), stdout);
242                         wrbuf_rewind(wrbuf);
243                     }
244                 }
245             }
246         }
247         xmlFreeDoc(doc);
248     }
249 #endif
250     fputs(wrbuf_cstr(wrbuf), stdout);
251     wrbuf_destroy(wrbuf);
252 }
253 #endif
254
255 static void dump(const char *fname, const char *from, const char *to,
256                  int input_format, int output_format,
257                  int write_using_libxml2,
258                  int print_offset, const char *split_fname, int split_chunk,
259                  int verbose, FILE *cfile, const char *leader_spec)
260 {
261     yaz_marc_t mt = yaz_marc_create();
262     yaz_iconv_t cd = 0;
263
264     if (yaz_marc_leader_spec(mt, leader_spec))
265     {
266         fprintf(stderr, "bad leader spec: %s\n", leader_spec);
267         yaz_marc_destroy(mt);
268         exit(2);
269     }
270     if (from && to)
271     {
272         cd = yaz_iconv_open(to, from);
273         if (!cd)
274         {
275             fprintf(stderr, "conversion from %s to %s "
276                     "unsupported\n", from, to);
277             yaz_marc_destroy(mt);
278             exit(2);
279         }
280         yaz_marc_iconv(mt, cd);
281     }
282     yaz_marc_enable_collection(mt);
283     yaz_marc_xml(mt, output_format);
284     yaz_marc_write_using_libxml2(mt, write_using_libxml2);
285     yaz_marc_debug(mt, verbose);
286
287     if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_TURBOMARC || input_format == YAZ_MARC_XCHANGE)
288     {
289 #if YAZ_HAVE_XML2
290         marcdump_read_xml(mt, fname);
291 #endif
292     }
293     else if (input_format == YAZ_MARC_LINE)
294     {
295         marcdump_read_line(mt, fname);
296     }
297     else if (input_format == YAZ_MARC_JSON)
298     {
299         marcdump_read_json(mt, fname);
300     }
301     else if (input_format == YAZ_MARC_ISO2709)
302     {
303         FILE *inf = fopen(fname, "rb");
304         int num = 1;
305         int marc_no = 0;
306         int split_file_no = -1;
307         if (!inf)
308         {
309             fprintf(stderr, "%s: cannot open %s:%s\n",
310                     prog, fname, strerror(errno));
311             exit(1);
312         }
313         if (cfile)
314             fprintf(cfile, "char *marc_records[] = {\n");
315         for(;; marc_no++)
316         {
317             const char *result = 0;
318             size_t len;
319             size_t rlen;
320             size_t len_result;
321             size_t r;
322             char buf[100001];
323             yaz_iconv_t cd1 = 0;
324
325             r = fread(buf, 1, 5, inf);
326             if (r < 5)
327             {
328                 if (r == 0) /* normal EOF, all good */
329                     break;
330                 if (print_offset && verbose)
331                 {
332                     printf("<!-- Extra %ld bytes at end of file -->\n",
333                            (long) r);
334                 }
335                 break;
336             }
337             while (*buf < '0' || *buf > '9')
338             {
339                 int i;
340                 long off = ftell(inf) - 5;
341                 printf("<!-- Skipping bad byte %d (0x%02X) at offset "
342                        "%ld (0x%lx) -->\n",
343                        *buf & 0xff, *buf & 0xff,
344                        off, off);
345                 for (i = 0; i<4; i++)
346                     buf[i] = buf[i+1];
347                 r = fread(buf+4, 1, 1, inf);
348                 no_errors++;
349                 if (r < 1)
350                     break;
351             }
352             if (r < 1)
353             {
354                 if (verbose || print_offset)
355                     printf("<!-- End of file with data -->\n");
356                 break;
357             }
358             if (print_offset)
359             {
360                 long off = ftell(inf) - 5;
361                 printf("<!-- Record %d offset %ld (0x%lx) -->\n",
362                        num, off, off);
363             }
364             len = atoi_n(buf, 5);
365             if (len < 25 || len > 100000)
366             {
367                 long off = ftell(inf) - 5;
368                 printf("<!-- Bad Length %ld read at offset %ld (%lx) -->\n",
369                        (long)len, (long) off, (long) off);
370                 no_errors++;
371                 break;
372             }
373             rlen = len - 5;
374             r = fread(buf + 5, 1, rlen, inf);
375             if (r < rlen)
376             {
377                 long off = ftell(inf);
378                 printf("<!-- Premature EOF at offset %ld (%lx) -->\n",
379                        (long) off, (long) off);
380                 no_errors++;
381                 break;
382             }
383             while (buf[len-1] != ISO2709_RS)
384             {
385                 if (len > sizeof(buf)-2)
386                 {
387                     r = 0;
388                     break;
389                 }
390                 r = fread(buf + len, 1, 1, inf);
391                 if (r != 1)
392                     break;
393                 len++;
394             }
395             if (r < 1)
396             {
397                 printf("<!-- EOF while searching for RS -->\n");
398                 no_errors++;
399                 break;
400             }
401             if (split_fname)
402             {
403                 char fname[256];
404                 const char *mode = 0;
405                 FILE *sf;
406                 if ((marc_no % split_chunk) == 0)
407                 {
408                     mode = "wb";
409                     split_file_no++;
410                 }
411                 else
412                     mode = "ab";
413                 sprintf(fname, "%.200s%07d", split_fname, split_file_no);
414                 sf = fopen(fname, mode);
415                 if (!sf)
416                 {
417                     fprintf(stderr, "Could not open %s\n", fname);
418                     split_fname = 0;
419                 }
420                 else
421                 {
422                     if (fwrite(buf, 1, len, sf) != len)
423                     {
424                         fprintf(stderr, "Could write content to %s\n",
425                                 fname);
426                         split_fname = 0;
427                         no_errors++;
428                     }
429                     fclose(sf);
430                 }
431             }
432             len_result = rlen;
433
434             if (yaz_marc_check_marc21_coding(from, buf, 26))
435             {
436                 cd1 = yaz_iconv_open(to, "utf-8");
437                 if (cd1)
438                     yaz_marc_iconv(mt, cd1);
439             }
440             r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
441
442             if (cd1)
443             {
444                 yaz_iconv_close(cd1);
445                 yaz_marc_iconv(mt, cd);
446             }
447
448             if (r == -1)
449                 no_errors++;
450             if (r > 0 && result && len_result)
451             {
452                 if (fwrite(result, len_result, 1, stdout) != 1)
453                 {
454                     fprintf(stderr, "Write to stdout failed\n");
455                     no_errors++;
456                     break;
457                 }
458             }
459             if (r > 0 && cfile)
460             {
461                 char *p = buf;
462                 size_t i;
463                 if (marc_no)
464                     fprintf(cfile, ",");
465                 fprintf(cfile, "\n");
466                 for (i = 0; i < r; i++)
467                 {
468                     if ((i & 15) == 0)
469                         fprintf(cfile, "  \"");
470                     if (p[i] < 32 || p[i] > 126)
471                         fprintf(cfile, "\" \"\\x%02X\" \"", p[i] & 255);
472                     else
473                         fputc(p[i], cfile);
474
475                     if (i < r - 1 && (i & 15) == 15)
476                         fprintf(cfile, "\"\n");
477
478                 }
479                 fprintf(cfile, "\"\n");
480             }
481             num++;
482             if (verbose)
483                 printf("\n");
484         }
485         if (cfile)
486             fprintf(cfile, "};\n");
487         fclose(inf);
488     }
489     {
490         WRBUF wrbuf = wrbuf_alloc();
491         yaz_marc_write_trailer(mt, wrbuf);
492         fputs(wrbuf_cstr(wrbuf), stdout);
493         wrbuf_destroy(wrbuf);
494     }
495     if (cd)
496         yaz_iconv_close(cd);
497     yaz_marc_destroy(mt);
498 }
499
500 int main (int argc, char **argv)
501 {
502     int r;
503     int print_offset = 0;
504     char *arg;
505     int verbose = 0;
506     int no = 0;
507     int output_format = YAZ_MARC_LINE;
508     FILE *cfile = 0;
509     char *from = 0, *to = 0;
510     int input_format = YAZ_MARC_ISO2709;
511     int split_chunk = 1;
512     const char *split_fname = 0;
513     const char *leader_spec = 0;
514     int write_using_libxml2 = 0;
515
516 #if HAVE_LOCALE_H
517     setlocale(LC_CTYPE, "");
518 #endif
519 #if HAVE_LANGINFO_H
520 #ifdef CODESET
521     to = nl_langinfo(CODESET);
522 #endif
523 #endif
524
525     prog = *argv;
526     yaz_enable_panic_backtrace(prog);
527     while ((r = options("i:o:C:npc:xOeXIf:t:s:l:Vv", argv, argc, &arg)) != -2)
528     {
529         no++;
530         switch (r)
531         {
532         case 'i':
533             input_format = yaz_marc_decode_formatstr(arg);
534             if (input_format == -1)
535             {
536                 fprintf(stderr, "%s: bad input format: %s\n", prog, arg);
537                 exit(1);
538             }
539 #if YAZ_HAVE_XML2
540 #else
541             if (input_format == YAZ_MARC_MARCXML
542                 || input_format == YAZ_MARC_XCHANGE)
543             {
544                 fprintf(stderr, "%s: Libxml2 support not enabled\n", prog);
545                 exit(3);
546             }
547 #endif
548             break;
549         case 'o':
550             /* dirty hack so we can make Libxml2 do the writing ..
551                rather than WRBUF */
552             if (strlen(arg) > 4 && strncmp(arg, "xml,", 4) == 0)
553             {
554                 /* Only supported for Libxml2 2.6.0 or later */
555 #if LIBXML_VERSION >= 20600
556                 arg = arg + 4;
557                 write_using_libxml2 = 1;
558 #else
559                 fprintf(stderr, "%s: output using Libxml2 unsupported\n", prog);
560                 exit(4);
561 #endif
562             }
563             output_format = yaz_marc_decode_formatstr(arg);
564             if (output_format == -1)
565             {
566                 fprintf(stderr, "%s: bad output format: %s\n", prog, arg);
567                 exit(1);
568             }
569             break;
570         case 'l':
571             leader_spec = arg;
572             break;
573         case 'f':
574             from = arg;
575             break;
576         case 't':
577             to = arg;
578             break;
579         case 'c':
580             if (cfile)
581                 fclose(cfile);
582             cfile = fopen(arg, "w");
583             break;
584         case 'x':
585             fprintf(stderr, "%s: -x no longer supported. "
586                     "Use -i marcxml instead\n", prog);
587             exit(1);
588             break;
589         case 'O':
590             fprintf(stderr, "%s: OAI MARC no longer supported."
591                     " Use MARCXML instead.\n", prog);
592             exit(1);
593             break;
594         case 'e':
595             fprintf(stderr, "%s: -e no longer supported. "
596                     "Use -o marcxchange instead\n", prog);
597             exit(1);
598             break;
599         case 'X':
600             fprintf(stderr, "%s: -X no longer supported. "
601                     "Use -o marcxml instead\n", prog);
602             exit(1);
603             break;
604         case 'I':
605             fprintf(stderr, "%s: -I no longer supported. "
606                     "Use -o marc instead\n", prog);
607             exit(1);
608             break;
609         case 'n':
610             output_format = YAZ_MARC_CHECK;
611             break;
612         case 'p':
613             print_offset = 1;
614             break;
615         case 's':
616             split_fname = arg;
617             break;
618         case 'C':
619             split_chunk = atoi(arg);
620             break;
621         case 0:
622             dump(arg, from, to, input_format, output_format,
623                  write_using_libxml2,
624                  print_offset, split_fname, split_chunk,
625                  verbose, cfile, leader_spec);
626             break;
627         case 'v':
628             verbose++;
629             break;
630         case 'V':
631             show_version();
632             break;
633         default:
634             usage(prog);
635             exit(1);
636         }
637     }
638     if (cfile)
639         fclose(cfile);
640     if (!no)
641     {
642         usage(prog);
643         exit(1);
644     }
645     if (no_errors)
646         exit(5);
647     exit(0);
648 }
649 /*
650  * Local variables:
651  * c-basic-offset: 4
652  * c-file-style: "Stroustrup"
653  * indent-tabs-mode: nil
654  * End:
655  * vim: shiftwidth=4 tabstop=8 expandtab
656  */
657