Happy new year
[yaz-moved-to-github.git] / util / marcdump.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2009 Index Data
3  * See the file LICENSE for details.
4  */
5
6 #define _FILE_OFFSET_BITS 64
7
8 #if HAVE_CONFIG_H
9 #include <config.h>
10 #endif
11
12 #if YAZ_HAVE_XML2
13 #include <libxml/parser.h>
14 #include <libxml/tree.h>
15 #include <libxml/xpath.h>
16 #include <libxml/xpathInternals.h>
17
18 /* Libxml2 version < 2.6.15. xmlreader not reliable/present */
19 #if LIBXML_VERSION < 20615
20 #define USE_XMLREADER 0
21 #else
22 #define USE_XMLREADER 1
23 #endif
24
25 #if USE_XMLREADER
26 #include <libxml/xmlreader.h>
27 #endif
28
29 #endif
30
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <errno.h>
35 #include <assert.h>
36
37 #if HAVE_LOCALE_H
38 #include <locale.h>
39 #endif
40 #if HAVE_LANGINFO_H
41 #include <langinfo.h>
42 #endif
43
44 #include <yaz/marcdisp.h>
45 #include <yaz/yaz-util.h>
46 #include <yaz/xmalloc.h>
47 #include <yaz/options.h>
48
49 #ifndef SEEK_SET
50 #define SEEK_SET 0
51 #endif
52 #ifndef SEEK_END
53 #define SEEK_END 2
54 #endif
55
56
/* Program name: assigned from argv[0] in main() and used as the prefix
   of diagnostics printed to stderr. */
static char *prog = 0;
58
/*
 * Print a one-line synopsis of the command-line options to stderr.
 *
 * prog: program name shown at the start of the synopsis.
 */
static void usage(const char *prog)
{
    fprintf (stderr, "Usage: %s [-c cfile] [-f from] [-t to] "
             "[-i format] [-o format] "
             "[-n] [-l pos=value] [-v] [-C chunk] [-s splitfname] [-p] file...\n",
             prog);
}
66
/*
 * Byte-reader callback: fetch the next byte from the stdio stream passed
 * as client_data.  When fgetc() reports EOF the function returns 0
 * instead, so callers see 0 as the end-of-input marker.
 */
static int getbyte_stream(void *client_data)
{
    FILE *stream = client_data;
    int ch = fgetc(stream);

    return ch == EOF ? 0 : ch;
}
76
/*
 * Push-back callback: return byte c to the stdio stream passed as
 * client_data.  A value of 0 is translated to EOF before the call to
 * ungetc(), which leaves the stream unchanged for EOF.
 */
static void ungetbyte_stream(int c, void *client_data)
{
    FILE *stream = client_data;

    ungetc(c != 0 ? c : EOF, stream);
}
85
86 static void marcdump_read_line(yaz_marc_t mt, const char *fname)
87 {
88     FILE *inf = fopen(fname, "rb");
89     if (!inf)
90     {
91         fprintf (stderr, "%s: cannot open %s:%s\n",
92                  prog, fname, strerror (errno));
93         exit(1);
94     }
95     
96     while (yaz_marc_read_line(mt, getbyte_stream,
97                               ungetbyte_stream, inf) == 0)
98     {
99         WRBUF wrbuf = wrbuf_alloc();
100         yaz_marc_write_mode(mt, wrbuf);
101         fputs(wrbuf_cstr(wrbuf), stdout);
102         wrbuf_destroy(wrbuf);
103     }
104     fclose(inf);
105 }
106
107 #if YAZ_HAVE_XML2
108 static void marcdump_read_xml(yaz_marc_t mt, const char *fname)
109 {
110     WRBUF wrbuf = wrbuf_alloc();
111 #if USE_XMLREADER
112     xmlTextReaderPtr reader = xmlReaderForFile(fname, 0 /* encoding */,
113                                                0 /* options */);
114
115     if (reader)
116     {
117         int ret;
118         while ((ret = xmlTextReaderRead(reader)) == 1)
119         {
120             int type = xmlTextReaderNodeType(reader);
121             if (type == XML_READER_TYPE_ELEMENT)
122             {
123                 const char *name = (const char *) 
124                     xmlTextReaderLocalName(reader);
125                 if (!strcmp(name, "record"))
126                 {
127                     xmlNodePtr ptr = xmlTextReaderExpand(reader);
128         
129                     int r = yaz_marc_read_xml(mt, ptr);
130                     if (r)
131                         fprintf(stderr, "yaz_marc_read_xml failed\n");
132                     else
133                     {
134                         yaz_marc_write_mode(mt, wrbuf);
135                         
136                         fputs(wrbuf_cstr(wrbuf), stdout);
137                         wrbuf_rewind(wrbuf);
138                     }
139                 }
140             }
141         }
142     }
143 #else
144     xmlDocPtr doc = xmlParseFile(fname);
145     if (doc)
146     {
147         xmlNodePtr ptr = xmlDocGetRootElement(doc);
148         for (; ptr; ptr = ptr->next)
149         {
150             if (ptr->type == XML_ELEMENT_NODE)
151             {
152                 if (!strcmp((const char *) ptr->name, "collection"))
153                 {
154                     ptr = ptr->children;
155                     continue;
156                 }
157                 if (!strcmp((const char *) ptr->name, "record"))
158                 {
159                     int r = yaz_marc_read_xml(mt, ptr);
160                     if (r)
161                         fprintf(stderr, "yaz_marc_read_xml failed\n");
162                     else
163                     {
164                         yaz_marc_write_mode(mt, wrbuf);
165                         
166                         fputs(wrbuf_cstr(wrbuf), stdout);
167                         wrbuf_rewind(wrbuf);
168                     }
169                 }
170             }
171         }
172         xmlFreeDoc(doc);
173     }
174 #endif
175     fputs(wrbuf_cstr(wrbuf), stdout);
176     wrbuf_destroy(wrbuf);
177 }
178 #endif
179
180 static void dump(const char *fname, const char *from, const char *to,
181                  int input_format, int output_format,
182                  int write_using_libxml2,
183                  int print_offset, const char *split_fname, int split_chunk,
184                  int verbose, FILE *cfile, const char *leader_spec)
185 {
186     yaz_marc_t mt = yaz_marc_create();
187     yaz_iconv_t cd = 0;
188
189     if (yaz_marc_leader_spec(mt, leader_spec))
190     {
191         fprintf(stderr, "bad leader spec: %s\n", leader_spec);
192         yaz_marc_destroy(mt);
193         exit(2);
194     }
195     if (from && to)
196     {
197         cd = yaz_iconv_open(to, from);
198         if (!cd)
199         {
200             fprintf(stderr, "conversion from %s to %s "
201                     "unsupported\n", from, to);
202             yaz_marc_destroy(mt);
203             exit(2);
204         }
205         yaz_marc_iconv(mt, cd);
206     }
207     yaz_marc_xml(mt, output_format);
208     yaz_marc_enable_collection(mt);
209     yaz_marc_write_using_libxml2(mt, write_using_libxml2);
210     yaz_marc_debug(mt, verbose);
211
212     if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_XCHANGE)
213     {
214 #if YAZ_HAVE_XML2
215         marcdump_read_xml(mt, fname);
216 #endif
217     }
218     else if (input_format == YAZ_MARC_LINE)
219     {
220         marcdump_read_line(mt, fname);
221     }
222     else if (input_format == YAZ_MARC_ISO2709)
223     {
224         FILE *inf = fopen(fname, "rb");
225         int num = 1;
226         int marc_no = 0;
227         int split_file_no = -1;
228         if (!inf)
229         {
230             fprintf (stderr, "%s: cannot open %s:%s\n",
231                      prog, fname, strerror (errno));
232             exit(1);
233         }
234         if (cfile)
235             fprintf (cfile, "char *marc_records[] = {\n");
236         for(;; marc_no++)
237         {
238             const char *result = 0;
239             size_t len;
240             size_t rlen;
241             size_t len_result;
242             size_t r;
243             char buf[100001];
244             
245             r = fread (buf, 1, 5, inf);
246             if (r < 5)
247             {
248                 if (r && print_offset && verbose)
249                     printf ("<!-- Extra %ld bytes at end of file -->\n",
250                             (long) r);
251                 break;
252             }
253             while (*buf < '0' || *buf > '9')
254             {
255                 int i;
256                 long off = ftell(inf) - 5;
257                 if (verbose || print_offset)
258                     printf("<!-- Skipping bad byte %d (0x%02X) at offset "
259                            "%ld (0x%lx) -->\n", 
260                            *buf & 0xff, *buf & 0xff,
261                            off, off);
262                 for (i = 0; i<4; i++)
263                     buf[i] = buf[i+1];
264                 r = fread(buf+4, 1, 1, inf);
265                 if (r < 1)
266                     break;
267             }
268             if (r < 1)
269             {
270                 if (verbose || print_offset)
271                     printf ("<!-- End of file with data -->\n");
272                 break;
273             }
274             if (print_offset)
275             {
276                 long off = ftell(inf) - 5;
277                 printf ("<!-- Record %d offset %ld (0x%lx) -->\n",
278                         num, off, off);
279             }
280             len = atoi_n(buf, 5);
281             if (len < 25 || len > 100000)
282             {
283                 long off = ftell(inf) - 5;
284                 printf("Bad Length %ld read at offset %ld (%lx)\n",
285                        (long)len, (long) off, (long) off);
286                 break;
287             }
288             rlen = len - 5;
289             r = fread (buf + 5, 1, rlen, inf);
290             if (r < rlen)
291                 break;
292             while (buf[len-1] != ISO2709_RS)
293             {
294                 if (len > sizeof(buf)-2)
295                     break;
296                 r = fread (buf + len, 1, 1, inf);
297                 if (r != 1)
298                     break;
299                 len++;
300             }
301             if (split_fname)
302             {
303                 char fname[256];
304                 const char *mode = 0;
305                 FILE *sf;
306                 if ((marc_no % split_chunk) == 0)
307                 {
308                     mode = "wb";
309                     split_file_no++;
310                 }
311                 else
312                     mode = "ab";
313                 sprintf(fname, "%.200s%07d", split_fname, split_file_no);
314                 sf = fopen(fname, mode);
315                 if (!sf)
316                 {
317                     fprintf(stderr, "Could not open %s\n", fname);
318                     split_fname = 0;
319                 }
320                 else
321                 {
322                     if (fwrite(buf, 1, len, sf) != len)
323                     {
324                         fprintf(stderr, "Could write content to %s\n",
325                                 fname);
326                         split_fname = 0;
327                     }
328                     fclose(sf);
329                 }
330             }
331             len_result = rlen;
332             r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
333             if (r > 0 && result)
334             {
335                 if (fwrite(result, len_result, 1, stdout) != 1)
336                 {
337                     fprintf(stderr, "Write to stdout failed\n");
338                     break;
339                 }
340             }
341             if (r > 0 && cfile)
342             {
343                 char *p = buf;
344                 size_t i;
345                 if (marc_no)
346                     fprintf (cfile, ",");
347                 fprintf (cfile, "\n");
348                 for (i = 0; i < r; i++)
349                 {
350                     if ((i & 15) == 0)
351                         fprintf (cfile, "  \"");
352                     fprintf (cfile, "\\x%02X", p[i] & 255);
353                     
354                     if (i < r - 1 && (i & 15) == 15)
355                         fprintf (cfile, "\"\n");
356                     
357                 }
358                 fprintf (cfile, "\"\n");
359             }
360             num++;
361             if (verbose)
362                 printf("\n");
363         }
364         if (cfile)
365             fprintf (cfile, "};\n");
366         fclose(inf);
367     }
368     {
369         WRBUF wrbuf = wrbuf_alloc();
370         yaz_marc_write_trailer(mt, wrbuf);
371         fputs(wrbuf_cstr(wrbuf), stdout);
372         wrbuf_destroy(wrbuf);
373     }
374     if (cd)
375         yaz_iconv_close(cd);
376     yaz_marc_destroy(mt);
377 }
378
379 int main (int argc, char **argv)
380 {
381     int r;
382     int print_offset = 0;
383     char *arg;
384     int verbose = 0;
385     int no = 0;
386     int output_format = YAZ_MARC_LINE;
387     FILE *cfile = 0;
388     char *from = 0, *to = 0;
389     int input_format = YAZ_MARC_ISO2709;
390     int split_chunk = 1;
391     const char *split_fname = 0;
392     const char *leader_spec = 0;
393     int write_using_libxml2 = 0;
394
395 #if HAVE_LOCALE_H
396     setlocale(LC_CTYPE, "");
397 #endif
398 #if HAVE_LANGINFO_H
399 #ifdef CODESET
400     to = nl_langinfo(CODESET);
401 #endif
402 #endif
403
404     prog = *argv;
405     while ((r = options("i:o:C:npvc:xOeXIf:t:s:l:", argv, argc, &arg)) != -2)
406     {
407         no++;
408         switch (r)
409         {
410         case 'i':
411             input_format = yaz_marc_decode_formatstr(arg);
412             if (input_format == -1)
413             {
414                 fprintf(stderr, "%s: bad input format: %s\n", prog, arg);
415                 exit(1);
416             }
417 #if YAZ_HAVE_XML2
418 #else
419             if (input_format == YAZ_MARC_MARCXML 
420                 || input_format == YAZ_MARC_XCHANGE)
421             {
422                 fprintf(stderr, "%s: Libxml2 support not enabled\n", prog);
423                 exit(3);
424             }
425 #endif
426             break;
427         case 'o':
428             /* dirty hack so we can make Libxml2 do the writing ..
429                rather than WRBUF */
430             if (strlen(arg) > 4 && strncmp(arg, "xml,", 4) == 0)
431             {
432                 arg = arg + 4;
433                 write_using_libxml2 = 1;
434             }
435             output_format = yaz_marc_decode_formatstr(arg);
436             if (output_format == -1)
437             {
438                 fprintf(stderr, "%s: bad output format: %s\n", prog, arg);
439                 exit(1);
440             }
441             break;
442         case 'l':
443             leader_spec = arg;
444             break;
445         case 'f':
446             from = arg;
447             break;
448         case 't':
449             to = arg;
450             break;
451         case 'c':
452             if (cfile)
453                 fclose (cfile);
454             cfile = fopen(arg, "w");
455             break;
456         case 'x':
457             fprintf(stderr, "%s: -x no longer supported. "
458                     "Use -i marcxml instead\n", prog);
459             exit(1);
460             break;
461         case 'O':
462             fprintf(stderr, "%s: OAI MARC no longer supported."
463                     " Use MARCXML instead.\n", prog);
464             exit(1);
465             break;
466         case 'e':
467             fprintf(stderr, "%s: -e no longer supported. "
468                     "Use -o marcxchange instead\n", prog);
469             exit(1);
470             break;
471         case 'X':
472             fprintf(stderr, "%s: -X no longer supported. "
473                     "Use -o marcxml instead\n", prog);
474             exit(1);
475             break;
476         case 'I':
477             fprintf(stderr, "%s: -I no longer supported. "
478                     "Use -o marc instead\n", prog);
479             exit(1);
480             break;
481         case 'n':
482             output_format = YAZ_MARC_CHECK;
483             break;
484         case 'p':
485             print_offset = 1;
486             break;
487         case 's':
488             split_fname = arg;
489             break;
490         case 'C':
491             split_chunk = atoi(arg);
492             break;
493         case 0:
494             dump(arg, from, to, input_format, output_format,
495                  write_using_libxml2,
496                  print_offset, split_fname, split_chunk,
497                  verbose, cfile, leader_spec);
498             break;
499         case 'v':
500             verbose++;
501             break;
502         default:
503             usage(prog);
504             exit(1);
505         }
506     }
507     if (cfile)
508         fclose (cfile);
509     if (!no)
510     {
511         usage(prog);
512         exit (1);
513     }
514     exit (0);
515 }
516 /*
517  * Local variables:
518  * c-basic-offset: 4
519  * indent-tabs-mode: nil
520  * End:
521  * vim: shiftwidth=4 tabstop=8 expandtab
522  */
523