891174b3bc2e8885b1cd995b6d4219fd047cf875
[yaz-moved-to-github.git] / util / marcdump.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
5
6 #define _FILE_OFFSET_BITS 64
7
8 #if HAVE_CONFIG_H
9 #include <config.h>
10 #endif
11
12 #if YAZ_HAVE_XML2
13 #include <libxml/parser.h>
14 #include <libxml/tree.h>
15 #include <libxml/xpath.h>
16 #include <libxml/xpathInternals.h>
17
18 /* Libxml2 version < 2.6.15. xmlreader not reliable/present */
19 #if LIBXML_VERSION < 20615
20 #define USE_XMLREADER 0
21 #else
22 #define USE_XMLREADER 1
23 #endif
24
25 #if USE_XMLREADER
26 #include <libxml/xmlreader.h>
27 #endif
28
29 #endif
30
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <errno.h>
35 #include <assert.h>
36
37 #if HAVE_LOCALE_H
38 #include <locale.h>
39 #endif
40 #if HAVE_LANGINFO_H
41 #include <langinfo.h>
42 #endif
43
44 #include <yaz/marcdisp.h>
45 #include <yaz/yaz-util.h>
46 #include <yaz/xmalloc.h>
47 #include <yaz/options.h>
48
49 #ifndef SEEK_SET
50 #define SEEK_SET 0
51 #endif
52 #ifndef SEEK_END
53 #define SEEK_END 2
54 #endif
55
56
/* Program name as invoked (argv[0]); assigned in main() and used as the
   prefix of diagnostics written to stderr. */
static char *prog = 0;

/* Number of errors encountered so far while reading/converting records.
   main() exits with status 5 when this is non-zero. */
static int no_errors;
60
/*
 * Write a one-line command synopsis to stderr.
 *
 * prog  program name to show in the synopsis (this parameter shadows the
 *       file-scope variable of the same name).
 */
static void usage(const char *prog)
{
    fprintf(stderr,
            "Usage: %s [-i format] [-o format] [-f from] [-t to] "
            "[-l pos=value] [-c cfile] [-s prefix] [-C size] [-n] "
            "[-p] [-v] [-V] file...\n",
            prog);
}
68
69 static void show_version(void)
70 {
71     char vstr[20], sha1_str[41];
72
73     yaz_version(vstr, sha1_str);
74     printf("YAZ version: %s %s\n", YAZ_VERSION, YAZ_VERSION_SHA1);
75     if (strcmp(sha1_str, YAZ_VERSION_SHA1))
76         printf("YAZ DLL/SO: %s %s\n", vstr, sha1_str);
77     exit(0);
78 }
79
/*
 * Byte-reader callback handed to yaz_marc_read_line().
 *
 * client_data  the FILE* to read from.
 *
 * Returns the next byte of the stream, or 0 when fgetc() reports EOF
 * (so end of input and a literal NUL byte are indistinguishable here).
 */
static int getbyte_stream(void *client_data)
{
    int ch = fgetc((FILE *) client_data);

    return ch == EOF ? 0 : ch;
}
89
/*
 * Push-back callback handed to yaz_marc_read_line().
 *
 * c            byte to push back; 0 is the end-of-input marker produced by
 *              getbyte_stream() and is translated back to EOF, which makes
 *              ungetc() leave the stream unchanged.
 * client_data  the FILE* the byte was read from.
 */
static void ungetbyte_stream(int c, void *client_data)
{
    FILE *stream = (FILE *) client_data;

    ungetc(c == 0 ? EOF : c, stream);
}
98
99 static void marcdump_read_line(yaz_marc_t mt, const char *fname)
100 {
101     FILE *inf = fopen(fname, "rb");
102     if (!inf)
103     {
104         fprintf(stderr, "%s: cannot open %s:%s\n",
105                 prog, fname, strerror(errno));
106         exit(1);
107     }
108
109     while (yaz_marc_read_line(mt, getbyte_stream,
110                               ungetbyte_stream, inf) == 0)
111     {
112         WRBUF wrbuf = wrbuf_alloc();
113         yaz_marc_write_mode(mt, wrbuf);
114         fputs(wrbuf_cstr(wrbuf), stdout);
115         wrbuf_destroy(wrbuf);
116     }
117     fclose(inf);
118 }
119
120 #if YAZ_HAVE_XML2
121 static void marcdump_read_xml(yaz_marc_t mt, const char *fname)
122 {
123     WRBUF wrbuf = wrbuf_alloc();
124 #if USE_XMLREADER
125     xmlTextReaderPtr reader = xmlReaderForFile(fname, 0 /* encoding */,
126                                                0 /* options */);
127
128     if (reader)
129     {
130         int ret;
131         while ((ret = xmlTextReaderRead(reader)) == 1)
132         {
133             int type = xmlTextReaderNodeType(reader);
134             if (type == XML_READER_TYPE_ELEMENT)
135             {
136                 char *name = (char *) xmlTextReaderLocalName(reader);
137                 if (!strcmp(name, "record") || !strcmp(name, "r"))
138                 {
139                     xmlNodePtr ptr = xmlTextReaderExpand(reader);
140
141                     int r = yaz_marc_read_xml(mt, ptr);
142                     if (r)
143                     {
144                         no_errors++;
145                         fprintf(stderr, "yaz_marc_read_xml failed\n");
146                     }
147                     else
148                     {
149                         int write_rc = yaz_marc_write_mode(mt, wrbuf);
150                         if (write_rc)
151                         {
152                             yaz_log(YLOG_WARN, "yaz_marc_write_mode: "
153                                     "write error: %d", write_rc);
154                             no_errors++;
155                         }
156                         fputs(wrbuf_cstr(wrbuf), stdout);
157                         wrbuf_rewind(wrbuf);
158                     }
159                 }
160                 xmlFree(name);
161             }
162         }
163         xmlFreeTextReader(reader);
164     }
165 #else
166     xmlDocPtr doc = xmlParseFile(fname);
167     if (doc)
168     {
169         xmlNodePtr ptr = xmlDocGetRootElement(doc);
170         for (; ptr; ptr = ptr->next)
171         {
172             if (ptr->type == XML_ELEMENT_NODE)
173             {
174                 if (!strcmp((const char *) ptr->name, "collection"))
175                 {
176                     ptr = ptr->children;
177                     continue;
178                 }
179                 if (!strcmp((const char *) ptr->name, "record") ||
180                     !strcmp((const char *) ptr->name, "r"))
181                 {
182                     int r = yaz_marc_read_xml(mt, ptr);
183                     if (r)
184                     {
185                         no_errors++;
186                         fprintf(stderr, "yaz_marc_read_xml failed\n");
187                     }
188                     else
189                     {
190                         yaz_marc_write_mode(mt, wrbuf);
191
192                         fputs(wrbuf_cstr(wrbuf), stdout);
193                         wrbuf_rewind(wrbuf);
194                     }
195                 }
196             }
197         }
198         xmlFreeDoc(doc);
199     }
200 #endif
201     fputs(wrbuf_cstr(wrbuf), stdout);
202     wrbuf_destroy(wrbuf);
203 }
204 #endif
205
206 static void dump(const char *fname, const char *from, const char *to,
207                  int input_format, int output_format,
208                  int write_using_libxml2,
209                  int print_offset, const char *split_fname, int split_chunk,
210                  int verbose, FILE *cfile, const char *leader_spec)
211 {
212     yaz_marc_t mt = yaz_marc_create();
213     yaz_iconv_t cd = 0;
214
215     if (yaz_marc_leader_spec(mt, leader_spec))
216     {
217         fprintf(stderr, "bad leader spec: %s\n", leader_spec);
218         yaz_marc_destroy(mt);
219         exit(2);
220     }
221     if (from && to)
222     {
223         cd = yaz_iconv_open(to, from);
224         if (!cd)
225         {
226             fprintf(stderr, "conversion from %s to %s "
227                     "unsupported\n", from, to);
228             yaz_marc_destroy(mt);
229             exit(2);
230         }
231         yaz_marc_iconv(mt, cd);
232     }
233     yaz_marc_enable_collection(mt);
234     yaz_marc_xml(mt, output_format);
235     yaz_marc_write_using_libxml2(mt, write_using_libxml2);
236     yaz_marc_debug(mt, verbose);
237
238     if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_TURBOMARC || input_format == YAZ_MARC_XCHANGE)
239     {
240 #if YAZ_HAVE_XML2
241         marcdump_read_xml(mt, fname);
242 #endif
243     }
244     else if (input_format == YAZ_MARC_LINE)
245     {
246         marcdump_read_line(mt, fname);
247     }
248     else if (input_format == YAZ_MARC_ISO2709)
249     {
250         FILE *inf = fopen(fname, "rb");
251         int num = 1;
252         int marc_no = 0;
253         int split_file_no = -1;
254         if (!inf)
255         {
256             fprintf(stderr, "%s: cannot open %s:%s\n",
257                     prog, fname, strerror(errno));
258             exit(1);
259         }
260         if (cfile)
261             fprintf(cfile, "char *marc_records[] = {\n");
262         for(;; marc_no++)
263         {
264             const char *result = 0;
265             size_t len;
266             size_t rlen;
267             size_t len_result;
268             size_t r;
269             char buf[100001];
270
271             r = fread(buf, 1, 5, inf);
272             if (r < 5)
273             {
274                 if (r == 0) /* normal EOF, all good */
275                     break;
276                 if (print_offset && verbose)
277                 {
278                     printf("<!-- Extra %ld bytes at end of file -->\n",
279                            (long) r);
280                 }
281                 break;
282             }
283             while (*buf < '0' || *buf > '9')
284             {
285                 int i;
286                 long off = ftell(inf) - 5;
287                 printf("<!-- Skipping bad byte %d (0x%02X) at offset "
288                        "%ld (0x%lx) -->\n",
289                        *buf & 0xff, *buf & 0xff,
290                        off, off);
291                 for (i = 0; i<4; i++)
292                     buf[i] = buf[i+1];
293                 r = fread(buf+4, 1, 1, inf);
294                 no_errors++;
295                 if (r < 1)
296                     break;
297             }
298             if (r < 1)
299             {
300                 if (verbose || print_offset)
301                     printf("<!-- End of file with data -->\n");
302                 break;
303             }
304             if (print_offset)
305             {
306                 long off = ftell(inf) - 5;
307                 printf("<!-- Record %d offset %ld (0x%lx) -->\n",
308                        num, off, off);
309             }
310             len = atoi_n(buf, 5);
311             if (len < 25 || len > 100000)
312             {
313                 long off = ftell(inf) - 5;
314                 printf("<!-- Bad Length %ld read at offset %ld (%lx) -->\n",
315                        (long)len, (long) off, (long) off);
316                 no_errors++;
317                 break;
318             }
319             rlen = len - 5;
320             r = fread(buf + 5, 1, rlen, inf);
321             if (r < rlen)
322             {
323                 long off = ftell(inf);
324                 printf("<!-- Premature EOF at offset %ld (%lx) -->\n",
325                        (long) off, (long) off);
326                 no_errors++;
327                 break;
328             }
329             while (buf[len-1] != ISO2709_RS)
330             {
331                 if (len > sizeof(buf)-2)
332                 {
333                     r = 0;
334                     break;
335                 }
336                 r = fread(buf + len, 1, 1, inf);
337                 if (r != 1)
338                     break;
339                 len++;
340             }
341             if (r < 1)
342             {
343                 printf("<!-- EOF while searching for RS -->\n");
344                 no_errors++;
345                 break;
346             }
347             if (split_fname)
348             {
349                 char fname[256];
350                 const char *mode = 0;
351                 FILE *sf;
352                 if ((marc_no % split_chunk) == 0)
353                 {
354                     mode = "wb";
355                     split_file_no++;
356                 }
357                 else
358                     mode = "ab";
359                 sprintf(fname, "%.200s%07d", split_fname, split_file_no);
360                 sf = fopen(fname, mode);
361                 if (!sf)
362                 {
363                     fprintf(stderr, "Could not open %s\n", fname);
364                     split_fname = 0;
365                 }
366                 else
367                 {
368                     if (fwrite(buf, 1, len, sf) != len)
369                     {
370                         fprintf(stderr, "Could write content to %s\n",
371                                 fname);
372                         split_fname = 0;
373                         no_errors++;
374                     }
375                     fclose(sf);
376                 }
377             }
378             len_result = rlen;
379             r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
380             if (r == -1)
381                 no_errors++;
382             if (r > 0 && result && len_result)
383             {
384                 if (fwrite(result, len_result, 1, stdout) != 1)
385                 {
386                     fprintf(stderr, "Write to stdout failed\n");
387                     no_errors++;
388                     break;
389                 }
390             }
391             if (r > 0 && cfile)
392             {
393                 char *p = buf;
394                 size_t i;
395                 if (marc_no)
396                     fprintf(cfile, ",");
397                 fprintf(cfile, "\n");
398                 for (i = 0; i < r; i++)
399                 {
400                     if ((i & 15) == 0)
401                         fprintf(cfile, "  \"");
402                     if (p[i] < 32 || p[i] > 126)
403                         fprintf(cfile, "\" \"\\x%02X\" \"", p[i] & 255);
404                     else
405                         fputc(p[i], cfile);
406
407                     if (i < r - 1 && (i & 15) == 15)
408                         fprintf(cfile, "\"\n");
409
410                 }
411                 fprintf(cfile, "\"\n");
412             }
413             num++;
414             if (verbose)
415                 printf("\n");
416         }
417         if (cfile)
418             fprintf(cfile, "};\n");
419         fclose(inf);
420     }
421     {
422         WRBUF wrbuf = wrbuf_alloc();
423         yaz_marc_write_trailer(mt, wrbuf);
424         fputs(wrbuf_cstr(wrbuf), stdout);
425         wrbuf_destroy(wrbuf);
426     }
427     if (cd)
428         yaz_iconv_close(cd);
429     yaz_marc_destroy(mt);
430 }
431
432 int main (int argc, char **argv)
433 {
434     int r;
435     int print_offset = 0;
436     char *arg;
437     int verbose = 0;
438     int no = 0;
439     int output_format = YAZ_MARC_LINE;
440     FILE *cfile = 0;
441     char *from = 0, *to = 0;
442     int input_format = YAZ_MARC_ISO2709;
443     int split_chunk = 1;
444     const char *split_fname = 0;
445     const char *leader_spec = 0;
446     int write_using_libxml2 = 0;
447
448 #if HAVE_LOCALE_H
449     setlocale(LC_CTYPE, "");
450 #endif
451 #if HAVE_LANGINFO_H
452 #ifdef CODESET
453     to = nl_langinfo(CODESET);
454 #endif
455 #endif
456
457     prog = *argv;
458     while ((r = options("i:o:C:npc:xOeXIf:t:s:l:Vv", argv, argc, &arg)) != -2)
459     {
460         no++;
461         switch (r)
462         {
463         case 'i':
464             input_format = yaz_marc_decode_formatstr(arg);
465             if (input_format == -1)
466             {
467                 fprintf(stderr, "%s: bad input format: %s\n", prog, arg);
468                 exit(1);
469             }
470 #if YAZ_HAVE_XML2
471 #else
472             if (input_format == YAZ_MARC_MARCXML
473                 || input_format == YAZ_MARC_XCHANGE)
474             {
475                 fprintf(stderr, "%s: Libxml2 support not enabled\n", prog);
476                 exit(3);
477             }
478 #endif
479             break;
480         case 'o':
481             /* dirty hack so we can make Libxml2 do the writing ..
482                rather than WRBUF */
483             if (strlen(arg) > 4 && strncmp(arg, "xml,", 4) == 0)
484             {
485                 /* Only supported for Libxml2 2.6.0 or later */
486 #if LIBXML_VERSION >= 20600
487                 arg = arg + 4;
488                 write_using_libxml2 = 1;
489 #else
490                 fprintf(stderr, "%s: output using Libxml2 unsupported\n", prog);
491                 exit(4);
492 #endif
493             }
494             output_format = yaz_marc_decode_formatstr(arg);
495             if (output_format == -1)
496             {
497                 fprintf(stderr, "%s: bad output format: %s\n", prog, arg);
498                 exit(1);
499             }
500             break;
501         case 'l':
502             leader_spec = arg;
503             break;
504         case 'f':
505             from = arg;
506             break;
507         case 't':
508             to = arg;
509             break;
510         case 'c':
511             if (cfile)
512                 fclose(cfile);
513             cfile = fopen(arg, "w");
514             break;
515         case 'x':
516             fprintf(stderr, "%s: -x no longer supported. "
517                     "Use -i marcxml instead\n", prog);
518             exit(1);
519             break;
520         case 'O':
521             fprintf(stderr, "%s: OAI MARC no longer supported."
522                     " Use MARCXML instead.\n", prog);
523             exit(1);
524             break;
525         case 'e':
526             fprintf(stderr, "%s: -e no longer supported. "
527                     "Use -o marcxchange instead\n", prog);
528             exit(1);
529             break;
530         case 'X':
531             fprintf(stderr, "%s: -X no longer supported. "
532                     "Use -o marcxml instead\n", prog);
533             exit(1);
534             break;
535         case 'I':
536             fprintf(stderr, "%s: -I no longer supported. "
537                     "Use -o marc instead\n", prog);
538             exit(1);
539             break;
540         case 'n':
541             output_format = YAZ_MARC_CHECK;
542             break;
543         case 'p':
544             print_offset = 1;
545             break;
546         case 's':
547             split_fname = arg;
548             break;
549         case 'C':
550             split_chunk = atoi(arg);
551             break;
552         case 0:
553             dump(arg, from, to, input_format, output_format,
554                  write_using_libxml2,
555                  print_offset, split_fname, split_chunk,
556                  verbose, cfile, leader_spec);
557             break;
558         case 'v':
559             verbose++;
560             break;
561         case 'V':
562             show_version();
563             break;
564         default:
565             usage(prog);
566             exit(1);
567         }
568     }
569     if (cfile)
570         fclose(cfile);
571     if (!no)
572     {
573         usage(prog);
574         exit(1);
575     }
576     if (no_errors)
577         exit(5);
578     exit(0);
579 }
580 /*
581  * Local variables:
582  * c-basic-offset: 4
583  * c-file-style: "Stroustrup"
584  * indent-tabs-mode: nil
585  * End:
586  * vim: shiftwidth=4 tabstop=8 expandtab
587  */
588