f92204e386431f044f06dddd8baa1c9db08d69c9
[yaz-moved-to-github.git] / util / marcdump.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
5
6 #define _FILE_OFFSET_BITS 64
7
8 #if HAVE_CONFIG_H
9 #include <config.h>
10 #endif
11
12 #if YAZ_HAVE_XML2
13 #include <libxml/parser.h>
14 #include <libxml/tree.h>
15 #include <libxml/xpath.h>
16 #include <libxml/xpathInternals.h>
17
18 /* Libxml2 version < 2.6.15. xmlreader not reliable/present */
19 #if LIBXML_VERSION < 20615
20 #define USE_XMLREADER 0
21 #else
22 #define USE_XMLREADER 1
23 #endif
24
25 #if USE_XMLREADER
26 #include <libxml/xmlreader.h>
27 #endif
28
29 #endif
30
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <errno.h>
35 #include <assert.h>
36
37 #if HAVE_LOCALE_H
38 #include <locale.h>
39 #endif
40 #if HAVE_LANGINFO_H
41 #include <langinfo.h>
42 #endif
43
44 #include <yaz/marcdisp.h>
45 #include <yaz/yaz-util.h>
46 #include <yaz/xmalloc.h>
47 #include <yaz/options.h>
48
49 #ifndef SEEK_SET
50 #define SEEK_SET 0
51 #endif
52 #ifndef SEEK_END
53 #define SEEK_END 2
54 #endif
55
56
/* Program name used as a prefix in diagnostics; main() sets it from
 * argv[0].  It is only ever read, hence the const qualifier. */
static const char *prog;

/* Running count of problems detected while reading, converting or writing
 * records; main() exits with status 5 when it is non-zero. */
static int no_errors = 0;
60
/* Write a one-line synopsis of the accepted command-line options to
 * stderr.  prog is the program name shown at the start of the line. */
static void usage(const char *prog)
{
    fprintf(stderr, "Usage: %s %s\n", prog,
            "[-i format] [-o format] [-f from] [-t to] "
            "[-l pos=value] [-c cfile] [-s prefix] [-C size] [-n] "
            "[-p] [-v] [-V] file...");
}
68
69 static void show_version(void)
70 {
71     char vstr[20], sha1_str[41];
72
73     yaz_version(vstr, sha1_str);
74     printf("YAZ version: %s %s\n", YAZ_VERSION, YAZ_VERSION_SHA1);
75     if (strcmp(sha1_str, YAZ_VERSION_SHA1))
76         printf("YAZ DLL/SO: %s %s\n", vstr, sha1_str);
77     exit(0);
78 }
79
/* Byte-source callback handed to yaz_marc_read_line().
 * client_data is the FILE * to read from.  Returns the next byte of the
 * stream, or 0 once fgetc() reports EOF. */
static int getbyte_stream(void *client_data)
{
    int ch = fgetc((FILE *) client_data);

    return ch == EOF ? 0 : ch;
}
89
/* Push-back callback handed to yaz_marc_read_line().
 * client_data is the FILE * the byte came from.  A value of 0 (what
 * getbyte_stream() returns at end of input) is turned back into EOF
 * before being passed to ungetc(). */
static void ungetbyte_stream(int c, void *client_data)
{
    FILE *stream = (FILE *) client_data;

    ungetc(c != 0 ? c : EOF, stream);
}
98
99 static void marcdump_read_line(yaz_marc_t mt, const char *fname)
100 {
101     FILE *inf = fopen(fname, "rb");
102     if (!inf)
103     {
104         fprintf(stderr, "%s: cannot open %s:%s\n",
105                 prog, fname, strerror(errno));
106         exit(1);
107     }
108
109     while (yaz_marc_read_line(mt, getbyte_stream,
110                               ungetbyte_stream, inf) == 0)
111     {
112         WRBUF wrbuf = wrbuf_alloc();
113         yaz_marc_write_mode(mt, wrbuf);
114         fputs(wrbuf_cstr(wrbuf), stdout);
115         wrbuf_destroy(wrbuf);
116     }
117     fclose(inf);
118 }
119
120 #if YAZ_HAVE_XML2
121 static void marcdump_read_xml(yaz_marc_t mt, const char *fname)
122 {
123     WRBUF wrbuf = wrbuf_alloc();
124 #if USE_XMLREADER
125     xmlTextReaderPtr reader = xmlReaderForFile(fname, 0 /* encoding */,
126                                                0 /* options */);
127
128     if (reader)
129     {
130         int ret;
131         while ((ret = xmlTextReaderRead(reader)) == 1)
132         {
133             int type = xmlTextReaderNodeType(reader);
134             if (type == XML_READER_TYPE_ELEMENT)
135             {
136                 const char *name = (const char *)
137                     xmlTextReaderLocalName(reader);
138                 if (!strcmp(name, "record") || !strcmp(name, "r"))
139                 {
140                     xmlNodePtr ptr = xmlTextReaderExpand(reader);
141
142                     int r = yaz_marc_read_xml(mt, ptr);
143                     if (r)
144                     {
145                         no_errors++;
146                         fprintf(stderr, "yaz_marc_read_xml failed\n");
147                     }
148                     else
149                     {
150                         int write_rc = yaz_marc_write_mode(mt, wrbuf);
151                         if (write_rc)
152                         {
153                             yaz_log(YLOG_WARN, "yaz_marc_write_mode: "
154                                     "write error: %d", write_rc);
155                             no_errors++;
156                         }
157                         fputs(wrbuf_cstr(wrbuf), stdout);
158                         wrbuf_rewind(wrbuf);
159                     }
160                 }
161             }
162         }
163     }
164 #else
165     xmlDocPtr doc = xmlParseFile(fname);
166     if (doc)
167     {
168         xmlNodePtr ptr = xmlDocGetRootElement(doc);
169         for (; ptr; ptr = ptr->next)
170         {
171             if (ptr->type == XML_ELEMENT_NODE)
172             {
173                 if (!strcmp((const char *) ptr->name, "collection"))
174                 {
175                     ptr = ptr->children;
176                     continue;
177                 }
178                 if (!strcmp((const char *) ptr->name, "record") ||
179                     !strcmp((const char *) ptr->name, "r"))
180                 {
181                     int r = yaz_marc_read_xml(mt, ptr);
182                     if (r)
183                     {
184                         no_errors++;
185                         fprintf(stderr, "yaz_marc_read_xml failed\n");
186                     }
187                     else
188                     {
189                         yaz_marc_write_mode(mt, wrbuf);
190
191                         fputs(wrbuf_cstr(wrbuf), stdout);
192                         wrbuf_rewind(wrbuf);
193                     }
194                 }
195             }
196         }
197         xmlFreeDoc(doc);
198     }
199 #endif
200     fputs(wrbuf_cstr(wrbuf), stdout);
201     wrbuf_destroy(wrbuf);
202 }
203 #endif
204
205 static void dump(const char *fname, const char *from, const char *to,
206                  int input_format, int output_format,
207                  int write_using_libxml2,
208                  int print_offset, const char *split_fname, int split_chunk,
209                  int verbose, FILE *cfile, const char *leader_spec)
210 {
211     yaz_marc_t mt = yaz_marc_create();
212     yaz_iconv_t cd = 0;
213
214     if (yaz_marc_leader_spec(mt, leader_spec))
215     {
216         fprintf(stderr, "bad leader spec: %s\n", leader_spec);
217         yaz_marc_destroy(mt);
218         exit(2);
219     }
220     if (from && to)
221     {
222         cd = yaz_iconv_open(to, from);
223         if (!cd)
224         {
225             fprintf(stderr, "conversion from %s to %s "
226                     "unsupported\n", from, to);
227             yaz_marc_destroy(mt);
228             exit(2);
229         }
230         yaz_marc_iconv(mt, cd);
231     }
232     yaz_marc_enable_collection(mt);
233     yaz_marc_xml(mt, output_format);
234     yaz_marc_write_using_libxml2(mt, write_using_libxml2);
235     yaz_marc_debug(mt, verbose);
236
237     if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_TURBOMARC || input_format == YAZ_MARC_XCHANGE)
238     {
239 #if YAZ_HAVE_XML2
240         marcdump_read_xml(mt, fname);
241 #endif
242     }
243     else if (input_format == YAZ_MARC_LINE)
244     {
245         marcdump_read_line(mt, fname);
246     }
247     else if (input_format == YAZ_MARC_ISO2709)
248     {
249         FILE *inf = fopen(fname, "rb");
250         int num = 1;
251         int marc_no = 0;
252         int split_file_no = -1;
253         if (!inf)
254         {
255             fprintf(stderr, "%s: cannot open %s:%s\n",
256                     prog, fname, strerror(errno));
257             exit(1);
258         }
259         if (cfile)
260             fprintf(cfile, "char *marc_records[] = {\n");
261         for(;; marc_no++)
262         {
263             const char *result = 0;
264             size_t len;
265             size_t rlen;
266             size_t len_result;
267             size_t r;
268             char buf[100001];
269
270             r = fread(buf, 1, 5, inf);
271             if (r < 5)
272             {
273                 if (r == 0) /* normal EOF, all good */
274                     break;
275                 if (print_offset && verbose)
276                 {
277                     printf("<!-- Extra %ld bytes at end of file -->\n",
278                            (long) r);
279                 }
280                 break;
281             }
282             while (*buf < '0' || *buf > '9')
283             {
284                 int i;
285                 long off = ftell(inf) - 5;
286                 printf("<!-- Skipping bad byte %d (0x%02X) at offset "
287                        "%ld (0x%lx) -->\n",
288                        *buf & 0xff, *buf & 0xff,
289                        off, off);
290                 for (i = 0; i<4; i++)
291                     buf[i] = buf[i+1];
292                 r = fread(buf+4, 1, 1, inf);
293                 no_errors++;
294                 if (r < 1)
295                     break;
296             }
297             if (r < 1)
298             {
299                 if (verbose || print_offset)
300                     printf("<!-- End of file with data -->\n");
301                 break;
302             }
303             if (print_offset)
304             {
305                 long off = ftell(inf) - 5;
306                 printf("<!-- Record %d offset %ld (0x%lx) -->\n",
307                        num, off, off);
308             }
309             len = atoi_n(buf, 5);
310             if (len < 25 || len > 100000)
311             {
312                 long off = ftell(inf) - 5;
313                 printf("<!-- Bad Length %ld read at offset %ld (%lx) -->\n",
314                        (long)len, (long) off, (long) off);
315                 no_errors++;
316                 break;
317             }
318             rlen = len - 5;
319             r = fread(buf + 5, 1, rlen, inf);
320             if (r < rlen)
321             {
322                 long off = ftell(inf);
323                 printf("<!-- Premature EOF at offset %ld (%lx) -->\n",
324                        (long) off, (long) off);
325                 no_errors++;
326                 break;
327             }
328             while (buf[len-1] != ISO2709_RS)
329             {
330                 if (len > sizeof(buf)-2)
331                 {
332                     r = 0;
333                     break;
334                 }
335                 r = fread(buf + len, 1, 1, inf);
336                 if (r != 1)
337                     break;
338                 len++;
339             }
340             if (r < 1)
341             {
342                 printf("<!-- EOF while searching for RS -->\n");
343                 no_errors++;
344                 break;
345             }
346             if (split_fname)
347             {
348                 char fname[256];
349                 const char *mode = 0;
350                 FILE *sf;
351                 if ((marc_no % split_chunk) == 0)
352                 {
353                     mode = "wb";
354                     split_file_no++;
355                 }
356                 else
357                     mode = "ab";
358                 sprintf(fname, "%.200s%07d", split_fname, split_file_no);
359                 sf = fopen(fname, mode);
360                 if (!sf)
361                 {
362                     fprintf(stderr, "Could not open %s\n", fname);
363                     split_fname = 0;
364                 }
365                 else
366                 {
367                     if (fwrite(buf, 1, len, sf) != len)
368                     {
369                         fprintf(stderr, "Could write content to %s\n",
370                                 fname);
371                         split_fname = 0;
372                         no_errors++;
373                     }
374                     fclose(sf);
375                 }
376             }
377             len_result = rlen;
378             r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
379             if (r == -1)
380                 no_errors++;
381             if (r > 0 && result && len_result)
382             {
383                 if (fwrite(result, len_result, 1, stdout) != 1)
384                 {
385                     fprintf(stderr, "Write to stdout failed\n");
386                     no_errors++;
387                     break;
388                 }
389             }
390             if (r > 0 && cfile)
391             {
392                 char *p = buf;
393                 size_t i;
394                 if (marc_no)
395                     fprintf(cfile, ",");
396                 fprintf(cfile, "\n");
397                 for (i = 0; i < r; i++)
398                 {
399                     if ((i & 15) == 0)
400                         fprintf(cfile, "  \"");
401                     if (p[i] < 32 || p[i] > 126)
402                         fprintf(cfile, "\" \"\\x%02X\" \"", p[i] & 255);
403                     else
404                         fputc(p[i], cfile);
405
406                     if (i < r - 1 && (i & 15) == 15)
407                         fprintf(cfile, "\"\n");
408
409                 }
410                 fprintf(cfile, "\"\n");
411             }
412             num++;
413             if (verbose)
414                 printf("\n");
415         }
416         if (cfile)
417             fprintf(cfile, "};\n");
418         fclose(inf);
419     }
420     {
421         WRBUF wrbuf = wrbuf_alloc();
422         yaz_marc_write_trailer(mt, wrbuf);
423         fputs(wrbuf_cstr(wrbuf), stdout);
424         wrbuf_destroy(wrbuf);
425     }
426     if (cd)
427         yaz_iconv_close(cd);
428     yaz_marc_destroy(mt);
429 }
430
431 int main (int argc, char **argv)
432 {
433     int r;
434     int print_offset = 0;
435     char *arg;
436     int verbose = 0;
437     int no = 0;
438     int output_format = YAZ_MARC_LINE;
439     FILE *cfile = 0;
440     char *from = 0, *to = 0;
441     int input_format = YAZ_MARC_ISO2709;
442     int split_chunk = 1;
443     const char *split_fname = 0;
444     const char *leader_spec = 0;
445     int write_using_libxml2 = 0;
446
447 #if HAVE_LOCALE_H
448     setlocale(LC_CTYPE, "");
449 #endif
450 #if HAVE_LANGINFO_H
451 #ifdef CODESET
452     to = nl_langinfo(CODESET);
453 #endif
454 #endif
455
456     prog = *argv;
457     while ((r = options("i:o:C:npc:xOeXIf:t:s:l:Vv", argv, argc, &arg)) != -2)
458     {
459         no++;
460         switch (r)
461         {
462         case 'i':
463             input_format = yaz_marc_decode_formatstr(arg);
464             if (input_format == -1)
465             {
466                 fprintf(stderr, "%s: bad input format: %s\n", prog, arg);
467                 exit(1);
468             }
469 #if YAZ_HAVE_XML2
470 #else
471             if (input_format == YAZ_MARC_MARCXML
472                 || input_format == YAZ_MARC_XCHANGE)
473             {
474                 fprintf(stderr, "%s: Libxml2 support not enabled\n", prog);
475                 exit(3);
476             }
477 #endif
478             break;
479         case 'o':
480             /* dirty hack so we can make Libxml2 do the writing ..
481                rather than WRBUF */
482             if (strlen(arg) > 4 && strncmp(arg, "xml,", 4) == 0)
483             {
484                 /* Only supported for Libxml2 2.6.0 or later */
485 #if LIBXML_VERSION >= 20600
486                 arg = arg + 4;
487                 write_using_libxml2 = 1;
488 #else
489                 fprintf(stderr, "%s: output using Libxml2 unsupported\n", prog);
490                 exit(4);
491 #endif
492             }
493             output_format = yaz_marc_decode_formatstr(arg);
494             if (output_format == -1)
495             {
496                 fprintf(stderr, "%s: bad output format: %s\n", prog, arg);
497                 exit(1);
498             }
499             break;
500         case 'l':
501             leader_spec = arg;
502             break;
503         case 'f':
504             from = arg;
505             break;
506         case 't':
507             to = arg;
508             break;
509         case 'c':
510             if (cfile)
511                 fclose(cfile);
512             cfile = fopen(arg, "w");
513             break;
514         case 'x':
515             fprintf(stderr, "%s: -x no longer supported. "
516                     "Use -i marcxml instead\n", prog);
517             exit(1);
518             break;
519         case 'O':
520             fprintf(stderr, "%s: OAI MARC no longer supported."
521                     " Use MARCXML instead.\n", prog);
522             exit(1);
523             break;
524         case 'e':
525             fprintf(stderr, "%s: -e no longer supported. "
526                     "Use -o marcxchange instead\n", prog);
527             exit(1);
528             break;
529         case 'X':
530             fprintf(stderr, "%s: -X no longer supported. "
531                     "Use -o marcxml instead\n", prog);
532             exit(1);
533             break;
534         case 'I':
535             fprintf(stderr, "%s: -I no longer supported. "
536                     "Use -o marc instead\n", prog);
537             exit(1);
538             break;
539         case 'n':
540             output_format = YAZ_MARC_CHECK;
541             break;
542         case 'p':
543             print_offset = 1;
544             break;
545         case 's':
546             split_fname = arg;
547             break;
548         case 'C':
549             split_chunk = atoi(arg);
550             break;
551         case 0:
552             dump(arg, from, to, input_format, output_format,
553                  write_using_libxml2,
554                  print_offset, split_fname, split_chunk,
555                  verbose, cfile, leader_spec);
556             break;
557         case 'v':
558             verbose++;
559             break;
560         case 'V':
561             show_version();
562             break;
563         default:
564             usage(prog);
565             exit(1);
566         }
567     }
568     if (cfile)
569         fclose(cfile);
570     if (!no)
571     {
572         usage(prog);
573         exit(1);
574     }
575     if (no_errors)
576         exit(5);
577     exit(0);
578 }
579 /*
580  * Local variables:
581  * c-basic-offset: 4
582  * c-file-style: "Stroustrup"
583  * indent-tabs-mode: nil
584  * End:
585  * vim: shiftwidth=4 tabstop=8 expandtab
586  */
587