b963320d376919ba34ed082b7b76855274ab4578
[yaz-moved-to-github.git] / util / marcdump.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2011 Index Data
3  * See the file LICENSE for details.
4  */
5
6 #define _FILE_OFFSET_BITS 64
7
8 #if HAVE_CONFIG_H
9 #include <config.h>
10 #endif
11
12 #if YAZ_HAVE_XML2
13 #include <libxml/parser.h>
14 #include <libxml/tree.h>
15 #include <libxml/xpath.h>
16 #include <libxml/xpathInternals.h>
17
18 /* Libxml2 version < 2.6.15. xmlreader not reliable/present */
19 #if LIBXML_VERSION < 20615
20 #define USE_XMLREADER 0
21 #else
22 #define USE_XMLREADER 1
23 #endif
24
25 #if USE_XMLREADER
26 #include <libxml/xmlreader.h>
27 #endif
28
29 #endif
30
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <errno.h>
35 #include <assert.h>
36
37 #if HAVE_LOCALE_H
38 #include <locale.h>
39 #endif
40 #if HAVE_LANGINFO_H
41 #include <langinfo.h>
42 #endif
43
44 #include <yaz/marcdisp.h>
45 #include <yaz/yaz-util.h>
46 #include <yaz/xmalloc.h>
47 #include <yaz/options.h>
48
49 #ifndef SEEK_SET
50 #define SEEK_SET 0
51 #endif
52 #ifndef SEEK_END
53 #define SEEK_END 2
54 #endif
55
56
57 static char *prog;
58
/*
 * Print a one-line command synopsis to stderr.
 *
 * prog is the program name shown at the start of the synopsis.  The
 * function only prints; callers decide whether to exit afterwards.
 */
static void usage(const char *prog)
{
    fprintf(stderr, "Usage: %s [-i format] [-o format] [-f from] [-t to] "
            "[-l pos=value] [-c cfile] [-s prefix] [-C size] [-n] "
            "[-p] [-v] [-V] file...\n",
            prog);
}
66
67 static void show_version(void)
68 {
69     char vstr[20], sha1_str[41];
70
71     yaz_version(vstr, sha1_str);
72     printf("YAZ version: %s %s\n", YAZ_VERSION, YAZ_VERSION_SHA1);
73     if (strcmp(sha1_str, YAZ_VERSION_SHA1))
74         printf("YAZ DLL/SO: %s %s\n", vstr, sha1_str);
75     exit(0);
76 }
77
/*
 * Byte-reader callback backed by a stdio stream.
 *
 * client_data is the FILE * to read from.  Returns the next byte of the
 * stream, or 0 once fgetc() reports EOF.
 */
static int getbyte_stream(void *client_data)
{
    int ch = fgetc((FILE *) client_data);

    return ch == EOF ? 0 : ch;
}
87
/*
 * Push-back callback backed by a stdio stream.
 *
 * client_data is the FILE * to push the byte c back onto.  A value of 0
 * is translated to EOF before calling ungetc(), which makes the call a
 * no-op on the stream.
 */
static void ungetbyte_stream(int c, void *client_data)
{
    FILE *stream = (FILE *) client_data;

    ungetc(c == 0 ? EOF : c, stream);
}
96
97 static void marcdump_read_line(yaz_marc_t mt, const char *fname)
98 {
99     FILE *inf = fopen(fname, "rb");
100     if (!inf)
101     {
102         fprintf(stderr, "%s: cannot open %s:%s\n",
103                 prog, fname, strerror(errno));
104         exit(1);
105     }
106     
107     while (yaz_marc_read_line(mt, getbyte_stream,
108                               ungetbyte_stream, inf) == 0)
109     {
110         WRBUF wrbuf = wrbuf_alloc();
111         yaz_marc_write_mode(mt, wrbuf);
112         fputs(wrbuf_cstr(wrbuf), stdout);
113         wrbuf_destroy(wrbuf);
114     }
115     fclose(inf);
116 }
117
118 #if YAZ_HAVE_XML2
119 static void marcdump_read_xml(yaz_marc_t mt, const char *fname)
120 {
121     WRBUF wrbuf = wrbuf_alloc();
122 #if USE_XMLREADER
123     xmlTextReaderPtr reader = xmlReaderForFile(fname, 0 /* encoding */,
124                                                0 /* options */);
125
126     if (reader)
127     {
128         int ret;
129         while ((ret = xmlTextReaderRead(reader)) == 1)
130         {
131             int type = xmlTextReaderNodeType(reader);
132             if (type == XML_READER_TYPE_ELEMENT)
133             {
134                 const char *name = (const char *) 
135                     xmlTextReaderLocalName(reader);
136                 if (!strcmp(name, "record") || !strcmp(name, "r"))
137                 {
138                     xmlNodePtr ptr = xmlTextReaderExpand(reader);
139         
140                     int r = yaz_marc_read_xml(mt, ptr);
141                     if (r)
142                         fprintf(stderr, "yaz_marc_read_xml failed\n");
143                     else
144                     {
145                         int write_rc = yaz_marc_write_mode(mt, wrbuf);
146                         if (write_rc)
147                             yaz_log(YLOG_WARN, "yaz_marc_write_mode: write error: %d", write_rc);
148                         
149                         fputs(wrbuf_cstr(wrbuf), stdout);
150                         wrbuf_rewind(wrbuf);
151                     }
152                 }
153             }
154         }
155     }
156 #else
157     xmlDocPtr doc = xmlParseFile(fname);
158     if (doc)
159     {
160         xmlNodePtr ptr = xmlDocGetRootElement(doc);
161         for (; ptr; ptr = ptr->next)
162         {
163             if (ptr->type == XML_ELEMENT_NODE)
164             {
165                 if (!strcmp((const char *) ptr->name, "collection"))
166                 {
167                     ptr = ptr->children;
168                     continue;
169                 }
170                 if (!strcmp((const char *) ptr->name, "record") ||
171                     !strcmp((const char *) ptr->name, "r"))
172                 {
173                     int r = yaz_marc_read_xml(mt, ptr);
174                     if (r)
175                         fprintf(stderr, "yaz_marc_read_xml failed\n");
176                     else
177                     {
178                         yaz_marc_write_mode(mt, wrbuf);
179                         
180                         fputs(wrbuf_cstr(wrbuf), stdout);
181                         wrbuf_rewind(wrbuf);
182                     }
183                 }
184             }
185         }
186         xmlFreeDoc(doc);
187     }
188 #endif
189     fputs(wrbuf_cstr(wrbuf), stdout);
190     wrbuf_destroy(wrbuf);
191 }
192 #endif
193
194 static void dump(const char *fname, const char *from, const char *to,
195                  int input_format, int output_format,
196                  int write_using_libxml2,
197                  int print_offset, const char *split_fname, int split_chunk,
198                  int verbose, FILE *cfile, const char *leader_spec)
199 {
200     yaz_marc_t mt = yaz_marc_create();
201     yaz_iconv_t cd = 0;
202
203     if (yaz_marc_leader_spec(mt, leader_spec))
204     {
205         fprintf(stderr, "bad leader spec: %s\n", leader_spec);
206         yaz_marc_destroy(mt);
207         exit(2);
208     }
209     if (from && to)
210     {
211         cd = yaz_iconv_open(to, from);
212         if (!cd)
213         {
214             fprintf(stderr, "conversion from %s to %s "
215                     "unsupported\n", from, to);
216             yaz_marc_destroy(mt);
217             exit(2);
218         }
219         yaz_marc_iconv(mt, cd);
220     }
221     yaz_marc_enable_collection(mt);
222     yaz_marc_xml(mt, output_format);
223     yaz_marc_write_using_libxml2(mt, write_using_libxml2);
224     yaz_marc_debug(mt, verbose);
225
226     if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_TURBOMARC || input_format == YAZ_MARC_XCHANGE)
227     {
228 #if YAZ_HAVE_XML2
229         marcdump_read_xml(mt, fname);
230 #endif
231     }
232     else if (input_format == YAZ_MARC_LINE)
233     {
234         marcdump_read_line(mt, fname);
235     }
236     else if (input_format == YAZ_MARC_ISO2709)
237     {
238         FILE *inf = fopen(fname, "rb");
239         int num = 1;
240         int marc_no = 0;
241         int split_file_no = -1;
242         if (!inf)
243         {
244             fprintf(stderr, "%s: cannot open %s:%s\n",
245                     prog, fname, strerror(errno));
246             exit(1);
247         }
248         if (cfile)
249             fprintf(cfile, "char *marc_records[] = {\n");
250         for(;; marc_no++)
251         {
252             const char *result = 0;
253             size_t len;
254             size_t rlen;
255             size_t len_result;
256             size_t r;
257             char buf[100001];
258             
259             r = fread(buf, 1, 5, inf);
260             if (r < 5)
261             {
262                 if (r && print_offset && verbose)
263                     printf("<!-- Extra %ld bytes at end of file -->\n",
264                            (long) r);
265                 break;
266             }
267             while (*buf < '0' || *buf > '9')
268             {
269                 int i;
270                 long off = ftell(inf) - 5;
271                 if (verbose || print_offset)
272                     printf("<!-- Skipping bad byte %d (0x%02X) at offset "
273                            "%ld (0x%lx) -->\n", 
274                            *buf & 0xff, *buf & 0xff,
275                            off, off);
276                 for (i = 0; i<4; i++)
277                     buf[i] = buf[i+1];
278                 r = fread(buf+4, 1, 1, inf);
279                 if (r < 1)
280                     break;
281             }
282             if (r < 1)
283             {
284                 if (verbose || print_offset)
285                     printf("<!-- End of file with data -->\n");
286                 break;
287             }
288             if (print_offset)
289             {
290                 long off = ftell(inf) - 5;
291                 printf("<!-- Record %d offset %ld (0x%lx) -->\n",
292                        num, off, off);
293             }
294             len = atoi_n(buf, 5);
295             if (len < 25 || len > 100000)
296             {
297                 long off = ftell(inf) - 5;
298                 printf("Bad Length %ld read at offset %ld (%lx)\n",
299                        (long)len, (long) off, (long) off);
300                 break;
301             }
302             rlen = len - 5;
303             r = fread(buf + 5, 1, rlen, inf);
304             if (r < rlen)
305                 break;
306             while (buf[len-1] != ISO2709_RS)
307             {
308                 if (len > sizeof(buf)-2)
309                     break;
310                 r = fread(buf + len, 1, 1, inf);
311                 if (r != 1)
312                     break;
313                 len++;
314             }
315             if (split_fname)
316             {
317                 char fname[256];
318                 const char *mode = 0;
319                 FILE *sf;
320                 if ((marc_no % split_chunk) == 0)
321                 {
322                     mode = "wb";
323                     split_file_no++;
324                 }
325                 else
326                     mode = "ab";
327                 sprintf(fname, "%.200s%07d", split_fname, split_file_no);
328                 sf = fopen(fname, mode);
329                 if (!sf)
330                 {
331                     fprintf(stderr, "Could not open %s\n", fname);
332                     split_fname = 0;
333                 }
334                 else
335                 {
336                     if (fwrite(buf, 1, len, sf) != len)
337                     {
338                         fprintf(stderr, "Could write content to %s\n",
339                                 fname);
340                         split_fname = 0;
341                     }
342                     fclose(sf);
343                 }
344             }
345             len_result = rlen;
346             r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
347             if (r > 0 && result && len_result)
348             {
349                 if (fwrite(result, len_result, 1, stdout) != 1)
350                 {
351                     fprintf(stderr, "Write to stdout failed\n");
352                     break;
353                 }
354             }
355             if (r > 0 && cfile)
356             {
357                 char *p = buf;
358                 size_t i;
359                 if (marc_no)
360                     fprintf(cfile, ",");
361                 fprintf(cfile, "\n");
362                 for (i = 0; i < r; i++)
363                 {
364                     if ((i & 15) == 0)
365                         fprintf(cfile, "  \"");
366                     fprintf(cfile, "\\x%02X", p[i] & 255);
367                     
368                     if (i < r - 1 && (i & 15) == 15)
369                         fprintf(cfile, "\"\n");
370                     
371                 }
372                 fprintf(cfile, "\"\n");
373             }
374             num++;
375             if (verbose)
376                 printf("\n");
377         }
378         if (cfile)
379             fprintf(cfile, "};\n");
380         fclose(inf);
381     }
382     {
383         WRBUF wrbuf = wrbuf_alloc();
384         yaz_marc_write_trailer(mt, wrbuf);
385         fputs(wrbuf_cstr(wrbuf), stdout);
386         wrbuf_destroy(wrbuf);
387     }
388     if (cd)
389         yaz_iconv_close(cd);
390     yaz_marc_destroy(mt);
391 }
392
393 int main (int argc, char **argv)
394 {
395     int r;
396     int print_offset = 0;
397     char *arg;
398     int verbose = 0;
399     int no = 0;
400     int output_format = YAZ_MARC_LINE;
401     FILE *cfile = 0;
402     char *from = 0, *to = 0;
403     int input_format = YAZ_MARC_ISO2709;
404     int split_chunk = 1;
405     const char *split_fname = 0;
406     const char *leader_spec = 0;
407     int write_using_libxml2 = 0;
408
409 #if HAVE_LOCALE_H
410     setlocale(LC_CTYPE, "");
411 #endif
412 #if HAVE_LANGINFO_H
413 #ifdef CODESET
414     to = nl_langinfo(CODESET);
415 #endif
416 #endif
417
418     prog = *argv;
419     while ((r = options("i:o:C:npc:xOeXIf:t:s:l:Vv", argv, argc, &arg)) != -2)
420     {
421         no++;
422         switch (r)
423         {
424         case 'i':
425             input_format = yaz_marc_decode_formatstr(arg);
426             if (input_format == -1)
427             {
428                 fprintf(stderr, "%s: bad input format: %s\n", prog, arg);
429                 exit(1);
430             }
431 #if YAZ_HAVE_XML2
432 #else
433             if (input_format == YAZ_MARC_MARCXML 
434                 || input_format == YAZ_MARC_XCHANGE)
435             {
436                 fprintf(stderr, "%s: Libxml2 support not enabled\n", prog);
437                 exit(3);
438             }
439 #endif
440             break;
441         case 'o':
442             /* dirty hack so we can make Libxml2 do the writing ..
443                rather than WRBUF */
444             if (strlen(arg) > 4 && strncmp(arg, "xml,", 4) == 0)
445             {
446                 /* Only supported for Libxml2 2.6.0 or later */
447 #if LIBXML_VERSION >= 20600
448                 arg = arg + 4;
449                 write_using_libxml2 = 1;
450 #else
451                 fprintf(stderr, "%s: output using Libxml2 unsupported\n", prog);
452                 exit(4);
453 #endif
454             }
455             output_format = yaz_marc_decode_formatstr(arg);
456             if (output_format == -1)
457             {
458                 fprintf(stderr, "%s: bad output format: %s\n", prog, arg);
459                 exit(1);
460             }
461             break;
462         case 'l':
463             leader_spec = arg;
464             break;
465         case 'f':
466             from = arg;
467             break;
468         case 't':
469             to = arg;
470             break;
471         case 'c':
472             if (cfile)
473                 fclose(cfile);
474             cfile = fopen(arg, "w");
475             break;
476         case 'x':
477             fprintf(stderr, "%s: -x no longer supported. "
478                     "Use -i marcxml instead\n", prog);
479             exit(1);
480             break;
481         case 'O':
482             fprintf(stderr, "%s: OAI MARC no longer supported."
483                     " Use MARCXML instead.\n", prog);
484             exit(1);
485             break;
486         case 'e':
487             fprintf(stderr, "%s: -e no longer supported. "
488                     "Use -o marcxchange instead\n", prog);
489             exit(1);
490             break;
491         case 'X':
492             fprintf(stderr, "%s: -X no longer supported. "
493                     "Use -o marcxml instead\n", prog);
494             exit(1);
495             break;
496         case 'I':
497             fprintf(stderr, "%s: -I no longer supported. "
498                     "Use -o marc instead\n", prog);
499             exit(1);
500             break;
501         case 'n':
502             output_format = YAZ_MARC_CHECK;
503             break;
504         case 'p':
505             print_offset = 1;
506             break;
507         case 's':
508             split_fname = arg;
509             break;
510         case 'C':
511             split_chunk = atoi(arg);
512             break;
513         case 0:
514             dump(arg, from, to, input_format, output_format,
515                  write_using_libxml2,
516                  print_offset, split_fname, split_chunk,
517                  verbose, cfile, leader_spec);
518             break;
519         case 'v':
520             verbose++;
521             break;
522         case 'V': 
523             show_version();
524             break;
525         default:
526             usage(prog);
527             exit(1);
528         }
529     }
530     if (cfile)
531         fclose(cfile);
532     if (!no)
533     {
534         usage(prog);
535         exit(1);
536     }
537     exit(0);
538 }
539 /*
540  * Local variables:
541  * c-basic-offset: 4
542  * c-file-style: "Stroustrup"
543  * indent-tabs-mode: nil
544  * End:
545  * vim: shiftwidth=4 tabstop=8 expandtab
546  */
547