ed2096aa838fb881ea0d9e6c5a27da208e9e9358
[yaz-moved-to-github.git] / util / marcdump.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2008 Index Data
3  * See the file LICENSE for details.
4  */
5
6 #define _FILE_OFFSET_BITS 64
7
8 #if HAVE_CONFIG_H
9 #include <config.h>
10 #endif
11
12 #if YAZ_HAVE_XML2
13 #include <libxml/parser.h>
14 #include <libxml/tree.h>
15 #include <libxml/xpath.h>
16 #include <libxml/xpathInternals.h>
17
18 /* Libxml2 version < 2.6.15. xmlreader not reliable/present */
19 #if LIBXML_VERSION < 20615
20 #define USE_XMLREADER 0
21 #else
22 #define USE_XMLREADER 1
23 #endif
24
25 #if USE_XMLREADER
26 #include <libxml/xmlreader.h>
27 #endif
28
29 #endif
30
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <errno.h>
35 #include <assert.h>
36
37 #if HAVE_LOCALE_H
38 #include <locale.h>
39 #endif
40 #if HAVE_LANGINFO_H
41 #include <langinfo.h>
42 #endif
43
44 #include <yaz/marcdisp.h>
45 #include <yaz/yaz-util.h>
46 #include <yaz/xmalloc.h>
47 #include <yaz/options.h>
48
49 #ifndef SEEK_SET
50 #define SEEK_SET 0
51 #endif
52 #ifndef SEEK_END
53 #define SEEK_END 2
54 #endif
55
56
/* Program name taken from argv[0] in main(); prefixes diagnostics on stderr. */
static char *prog;
58
/*
 * Print a one-line synopsis of the command-line options to stderr.
 *
 * prog: program name shown at the start of the message.
 */
static void usage(const char *prog)
{
    fprintf (stderr, "Usage: %s [-c cfile] [-f from] [-t to] "
             "[-i format] [-o format] "
             "[-n] [-l pos=value] [-v] [-C chunk] [-s splitfname] [-p] file...\n",
             prog);
}
66
/*
 * Byte-reader callback; client_data is the FILE* to read from.
 *
 * Returns the next byte of the stream, or 0 when fgetc() reports EOF
 * (end of file or read error).  A literal NUL byte in the input is
 * therefore indistinguishable from end of input.
 */
static int getbyte_stream(void *client_data)
{
    FILE *stream = (FILE *) client_data;
    int ch = fgetc(stream);

    return ch == EOF ? 0 : ch;
}
76
/*
 * Push-back callback paired with getbyte_stream(); client_data is the
 * FILE* the byte came from.
 *
 * The value 0 (what getbyte_stream() returns at end of input) is mapped
 * to EOF before calling ungetc(); ungetc(EOF, ...) leaves the stream
 * unchanged, so pushing back the end-of-input marker is a no-op.
 */
static void ungetbyte_stream(int c, void *client_data)
{
    FILE *stream = (FILE *) client_data;

    ungetc(c == 0 ? EOF : c, stream);
}
85
86 static void marcdump_read_line(yaz_marc_t mt, const char *fname)
87 {
88     FILE *inf = fopen(fname, "rb");
89     if (!inf)
90     {
91         fprintf (stderr, "%s: cannot open %s:%s\n",
92                  prog, fname, strerror (errno));
93         exit(1);
94     }
95     
96     while (yaz_marc_read_line(mt, getbyte_stream,
97                               ungetbyte_stream, inf) == 0)
98     {
99         WRBUF wrbuf = wrbuf_alloc();
100         yaz_marc_write_mode(mt, wrbuf);
101         fputs(wrbuf_cstr(wrbuf), stdout);
102         wrbuf_destroy(wrbuf);
103     }
104     fclose(inf);
105 }
106
107 #if YAZ_HAVE_XML2
108 static void marcdump_read_xml(yaz_marc_t mt, const char *fname)
109 {
110     WRBUF wrbuf = wrbuf_alloc();
111 #if USE_XMLREADER
112     xmlTextReaderPtr reader = xmlReaderForFile(fname, 0 /* encoding */,
113                                                0 /* options */);
114
115     if (reader)
116     {
117         int ret;
118         while ((ret = xmlTextReaderRead(reader)) == 1)
119         {
120             int type = xmlTextReaderNodeType(reader);
121             if (type == XML_READER_TYPE_ELEMENT)
122             {
123                 const char *name = (const char *) 
124                     xmlTextReaderLocalName(reader);
125                 if (!strcmp(name, "record"))
126                 {
127                     xmlNodePtr ptr = xmlTextReaderExpand(reader);
128         
129                     int r = yaz_marc_read_xml(mt, ptr);
130                     if (r)
131                         fprintf(stderr, "yaz_marc_read_xml failed\n");
132                     else
133                     {
134                         yaz_marc_write_mode(mt, wrbuf);
135                         
136                         fputs(wrbuf_cstr(wrbuf), stdout);
137                         wrbuf_rewind(wrbuf);
138                     }
139                 }
140             }
141         }
142     }
143 #else
144     xmlDocPtr doc = xmlParseFile(fname);
145     if (doc)
146     {
147         xmlNodePtr ptr = xmlDocGetRootElement(doc);
148         for (; ptr; ptr = ptr->next)
149         {
150             if (ptr->type == XML_ELEMENT_NODE)
151             {
152                 if (!strcmp((const char *) ptr->name, "collection"))
153                 {
154                     ptr = ptr->children;
155                     continue;
156                 }
157                 if (!strcmp((const char *) ptr->name, "record"))
158                 {
159                     int r = yaz_marc_read_xml(mt, ptr);
160                     if (r)
161                         fprintf(stderr, "yaz_marc_read_xml failed\n");
162                     else
163                     {
164                         yaz_marc_write_mode(mt, wrbuf);
165                         
166                         fputs(wrbuf_cstr(wrbuf), stdout);
167                         wrbuf_rewind(wrbuf);
168                     }
169                 }
170             }
171         }
172         xmlFreeDoc(doc);
173     }
174 #endif
175     fputs(wrbuf_cstr(wrbuf), stdout);
176     wrbuf_destroy(wrbuf);
177 }
178 #endif
179
180 static void dump(const char *fname, const char *from, const char *to,
181                  int input_format, int output_format,
182                  int write_using_libxml2,
183                  int print_offset, const char *split_fname, int split_chunk,
184                  int verbose, FILE *cfile, const char *leader_spec)
185 {
186     yaz_marc_t mt = yaz_marc_create();
187     yaz_iconv_t cd = 0;
188
189     if (yaz_marc_leader_spec(mt, leader_spec))
190     {
191         fprintf(stderr, "bad leader spec: %s\n", leader_spec);
192         yaz_marc_destroy(mt);
193         exit(2);
194     }
195     if (from && to)
196     {
197         cd = yaz_iconv_open(to, from);
198         if (!cd)
199         {
200             fprintf(stderr, "conversion from %s to %s "
201                     "unsupported\n", from, to);
202             yaz_marc_destroy(mt);
203             exit(2);
204         }
205         yaz_marc_iconv(mt, cd);
206     }
207     yaz_marc_xml(mt, output_format);
208     yaz_marc_enable_collection(mt);
209     yaz_marc_write_using_libxml2(mt, write_using_libxml2);
210     yaz_marc_debug(mt, verbose);
211
212     if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_XCHANGE)
213     {
214 #if YAZ_HAVE_XML2
215         marcdump_read_xml(mt, fname);
216 #endif
217     }
218     else if (input_format == YAZ_MARC_LINE)
219     {
220         marcdump_read_line(mt, fname);
221     }
222     else if (input_format == YAZ_MARC_ISO2709)
223     {
224         FILE *inf = fopen(fname, "rb");
225         int num = 1;
226         int marc_no = 0;
227         int split_file_no = -1;
228         if (!inf)
229         {
230             fprintf (stderr, "%s: cannot open %s:%s\n",
231                      prog, fname, strerror (errno));
232             exit(1);
233         }
234         if (cfile)
235             fprintf (cfile, "char *marc_records[] = {\n");
236         for(;; marc_no++)
237         {
238             const char *result = 0;
239             size_t len;
240             size_t rlen;
241             size_t len_result;
242             size_t r;
243             char buf[100001];
244             
245             r = fread (buf, 1, 5, inf);
246             if (r < 5)
247             {
248                 if (r && print_offset && verbose)
249                     printf ("<!-- Extra %ld bytes at end of file -->\n",
250                             (long) r);
251                 break;
252             }
253             while (*buf < '0' || *buf > '9')
254             {
255                 int i;
256                 long off = ftell(inf) - 5;
257                 if (verbose || print_offset)
258                     printf("<!-- Skipping bad byte %d (0x%02X) at offset "
259                            "%ld (0x%lx) -->\n", 
260                            *buf & 0xff, *buf & 0xff,
261                            off, off);
262                 for (i = 0; i<4; i++)
263                     buf[i] = buf[i+1];
264                 r = fread(buf+4, 1, 1, inf);
265                 if (r < 1)
266                     break;
267             }
268             if (r < 1)
269             {
270                 if (verbose || print_offset)
271                     printf ("<!-- End of file with data -->\n");
272                 break;
273             }
274             if (print_offset)
275             {
276                 long off = ftell(inf) - 5;
277                 printf ("<!-- Record %d offset %ld (0x%lx) -->\n",
278                         num, off, off);
279             }
280             len = atoi_n(buf, 5);
281             if (len < 25 || len > 100000)
282             {
283                 long off = ftell(inf) - 5;
284                 printf("Bad Length %ld read at offset %ld (%lx)\n",
285                        (long)len, (long) off, (long) off);
286                 break;
287             }
288             rlen = len - 5;
289             r = fread (buf + 5, 1, rlen, inf);
290             if (r < rlen)
291                 break;
292             while (buf[len-1] != ISO2709_RS)
293             {
294                 if (len > sizeof(buf)-2)
295                     break;
296                 r = fread (buf + len, 1, 1, inf);
297                 if (r != 1)
298                     break;
299                 len++;
300             }
301             if (split_fname)
302             {
303                 char fname[256];
304                 const char *mode = 0;
305                 FILE *sf;
306                 if ((marc_no % split_chunk) == 0)
307                 {
308                     mode = "wb";
309                     split_file_no++;
310                 }
311                 else
312                     mode = "ab";
313                 sprintf(fname, "%.200s%07d", split_fname, split_file_no);
314                 sf = fopen(fname, mode);
315                 if (!sf)
316                 {
317                     fprintf(stderr, "Could not open %s\n", fname);
318                     split_fname = 0;
319                 }
320                 else
321                 {
322                     if (fwrite(buf, 1, len, sf) != len)
323                     {
324                         fprintf(stderr, "Could write content to %s\n",
325                                 fname);
326                         split_fname = 0;
327                     }
328                     fclose(sf);
329                 }
330             }
331             len_result = rlen;
332             r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
333             if (r > 0 && result)
334             {
335                 fwrite (result, len_result, 1, stdout);
336             }
337             if (r > 0 && cfile)
338             {
339                 char *p = buf;
340                 size_t i;
341                 if (marc_no)
342                     fprintf (cfile, ",");
343                 fprintf (cfile, "\n");
344                 for (i = 0; i < r; i++)
345                 {
346                     if ((i & 15) == 0)
347                         fprintf (cfile, "  \"");
348                     fprintf (cfile, "\\x%02X", p[i] & 255);
349                     
350                     if (i < r - 1 && (i & 15) == 15)
351                         fprintf (cfile, "\"\n");
352                     
353                 }
354                 fprintf (cfile, "\"\n");
355             }
356             num++;
357             if (verbose)
358                 printf("\n");
359         }
360         if (cfile)
361             fprintf (cfile, "};\n");
362         fclose(inf);
363     }
364     {
365         WRBUF wrbuf = wrbuf_alloc();
366         yaz_marc_write_trailer(mt, wrbuf);
367         fputs(wrbuf_cstr(wrbuf), stdout);
368         wrbuf_destroy(wrbuf);
369     }
370     if (cd)
371         yaz_iconv_close(cd);
372     yaz_marc_destroy(mt);
373 }
374
375 int main (int argc, char **argv)
376 {
377     int r;
378     int print_offset = 0;
379     char *arg;
380     int verbose = 0;
381     int no = 0;
382     int output_format = YAZ_MARC_LINE;
383     FILE *cfile = 0;
384     char *from = 0, *to = 0;
385     int input_format = YAZ_MARC_ISO2709;
386     int split_chunk = 1;
387     const char *split_fname = 0;
388     const char *leader_spec = 0;
389     int write_using_libxml2 = 0;
390
391 #if HAVE_LOCALE_H
392     setlocale(LC_CTYPE, "");
393 #endif
394 #if HAVE_LANGINFO_H
395 #ifdef CODESET
396     to = nl_langinfo(CODESET);
397 #endif
398 #endif
399
400     prog = *argv;
401     while ((r = options("i:o:C:npvc:xOeXIf:t:s:l:", argv, argc, &arg)) != -2)
402     {
403         no++;
404         switch (r)
405         {
406         case 'i':
407             input_format = yaz_marc_decode_formatstr(arg);
408             if (input_format == -1)
409             {
410                 fprintf(stderr, "%s: bad input format: %s\n", prog, arg);
411                 exit(1);
412             }
413 #if YAZ_HAVE_XML2
414 #else
415             if (input_format == YAZ_MARC_MARCXML 
416                 || input_format == YAZ_MARC_XCHANGE)
417             {
418                 fprintf(stderr, "%s: Libxml2 support not enabled\n", prog);
419                 exit(3);
420             }
421 #endif
422             break;
423         case 'o':
424             /* dirty hack so we can make Libxml2 do the writing ..
425                rather than WRBUF */
426             if (strlen(arg) > 4 && strncmp(arg, "xml,", 4) == 0)
427             {
428                 arg = arg + 4;
429                 write_using_libxml2 = 1;
430             }
431             output_format = yaz_marc_decode_formatstr(arg);
432             if (output_format == -1)
433             {
434                 fprintf(stderr, "%s: bad output format: %s\n", prog, arg);
435                 exit(1);
436             }
437             break;
438         case 'l':
439             leader_spec = arg;
440             break;
441         case 'f':
442             from = arg;
443             break;
444         case 't':
445             to = arg;
446             break;
447         case 'c':
448             if (cfile)
449                 fclose (cfile);
450             cfile = fopen(arg, "w");
451             break;
452         case 'x':
453             fprintf(stderr, "%s: -x no longer supported. "
454                     "Use -i marcxml instead\n", prog);
455             exit(1);
456             break;
457         case 'O':
458             fprintf(stderr, "%s: OAI MARC no longer supported."
459                     " Use MARCXML instead.\n", prog);
460             exit(1);
461             break;
462         case 'e':
463             fprintf(stderr, "%s: -e no longer supported. "
464                     "Use -o marcxchange instead\n", prog);
465             exit(1);
466             break;
467         case 'X':
468             fprintf(stderr, "%s: -X no longer supported. "
469                     "Use -o marcxml instead\n", prog);
470             exit(1);
471             break;
472         case 'I':
473             fprintf(stderr, "%s: -I no longer supported. "
474                     "Use -o marc instead\n", prog);
475             exit(1);
476             break;
477         case 'n':
478             output_format = YAZ_MARC_CHECK;
479             break;
480         case 'p':
481             print_offset = 1;
482             break;
483         case 's':
484             split_fname = arg;
485             break;
486         case 'C':
487             split_chunk = atoi(arg);
488             break;
489         case 0:
490             dump(arg, from, to, input_format, output_format,
491                  write_using_libxml2,
492                  print_offset, split_fname, split_chunk,
493                  verbose, cfile, leader_spec);
494             break;
495         case 'v':
496             verbose++;
497             break;
498         default:
499             usage(prog);
500             exit(1);
501         }
502     }
503     if (cfile)
504         fclose (cfile);
505     if (!no)
506     {
507         usage(prog);
508         exit (1);
509     }
510     exit (0);
511 }
512 /*
513  * Local variables:
514  * c-basic-offset: 4
515  * indent-tabs-mode: nil
516  * End:
517  * vim: shiftwidth=4 tabstop=8 expandtab
518  */
519