Produce </collection> only once in MARCXML printing.
[yaz-moved-to-github.git] / util / marcdump.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: marcdump.c,v 1.55 2007-12-18 21:13:06 adam Exp $
6  */
7
8 #define _FILE_OFFSET_BITS 64
9
10 #if HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #if YAZ_HAVE_XML2
15 #include <libxml/parser.h>
16 #include <libxml/tree.h>
17 #include <libxml/xpath.h>
18 #include <libxml/xpathInternals.h>
19
20 /* Libxml2 version < 2.6.15. xmlreader not reliable/present */
21 #if LIBXML_VERSION < 20615
22 #define USE_XMLREADER 0
23 #else
24 #define USE_XMLREADER 1
25 #endif
26
27 #if USE_XMLREADER
28 #include <libxml/xmlreader.h>
29 #endif
30
31 #endif
32
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <errno.h>
37 #include <assert.h>
38
39 #if HAVE_LOCALE_H
40 #include <locale.h>
41 #endif
42 #if HAVE_LANGINFO_H
43 #include <langinfo.h>
44 #endif
45
46 #include <yaz/marcdisp.h>
47 #include <yaz/yaz-util.h>
48 #include <yaz/xmalloc.h>
49 #include <yaz/options.h>
50
51 #ifndef SEEK_SET
52 #define SEEK_SET 0
53 #endif
54 #ifndef SEEK_END
55 #define SEEK_END 2
56 #endif
57
58
59 static char *prog;
60
/*
 * Print a one-line synopsis of the command-line options to stderr.
 *
 * prog: program name shown at the start of the synopsis.
 */
static void usage(const char *prog)
{
    fprintf (stderr, "Usage: %s [-c cfile] [-f from] [-t to] "
             "[-i format] [-o format] "
             "[-n] [-l pos=value] [-v] [-C chunk] [-s splitfname] [-p] file...\n",
             prog);
}
68
/*
 * Byte-source callback backed by a stdio stream.
 *
 * client_data: the FILE * to read from.
 * Returns the next byte of the stream, or 0 once fgetc() reports EOF.
 */
static int getbyte_stream(void *client_data)
{
    int ch = fgetc((FILE *) client_data);

    return ch == EOF ? 0 : ch;
}
78
/*
 * Push-back callback backed by a stdio stream.
 *
 * c:           byte to push back; 0 is translated to EOF before the
 *              call to ungetc().
 * client_data: the FILE * the byte is pushed back onto.
 */
static void ungetbyte_stream(int c, void *client_data)
{
    ungetc(c != 0 ? c : EOF, (FILE *) client_data);
}
87
88 static void marcdump_read_line(yaz_marc_t mt, const char *fname)
89 {
90     FILE *inf = fopen(fname, "rb");
91     if (!inf)
92     {
93         fprintf (stderr, "%s: cannot open %s:%s\n",
94                  prog, fname, strerror (errno));
95         exit(1);
96     }
97     
98     while (yaz_marc_read_line(mt, getbyte_stream,
99                               ungetbyte_stream, inf) == 0)
100     {
101         WRBUF wrbuf = wrbuf_alloc();
102         yaz_marc_write_mode(mt, wrbuf);
103         fputs(wrbuf_cstr(wrbuf), stdout);
104         wrbuf_destroy(wrbuf);
105     }
106     fclose(inf);
107 }
108
109 #if YAZ_HAVE_XML2
110 static void marcdump_read_xml(yaz_marc_t mt, const char *fname)
111 {
112     WRBUF wrbuf = wrbuf_alloc();
113 #if USE_XMLREADER
114     xmlTextReaderPtr reader = xmlReaderForFile(fname, 0 /* encoding */,
115                                                0 /* options */);
116
117     if (reader)
118     {
119         int ret;
120         while ((ret = xmlTextReaderRead(reader)) == 1)
121         {
122             int type = xmlTextReaderNodeType(reader);
123             if (type == XML_READER_TYPE_ELEMENT)
124             {
125                 const char *name = (const char *) 
126                     xmlTextReaderLocalName(reader);
127                 if (!strcmp(name, "record"))
128                 {
129                     xmlNodePtr ptr = xmlTextReaderExpand(reader);
130         
131                     int r = yaz_marc_read_xml(mt, ptr);
132                     if (r)
133                         fprintf(stderr, "yaz_marc_read_xml failed\n");
134                     else
135                     {
136                         yaz_marc_write_mode(mt, wrbuf);
137                         
138                         fputs(wrbuf_cstr(wrbuf), stdout);
139                         wrbuf_rewind(wrbuf);
140                     }
141                 }
142             }
143         }
144     }
145 #else
146     xmlDocPtr doc = xmlParseFile(fname);
147     if (doc)
148     {
149         xmlNodePtr ptr = xmlDocGetRootElement(doc);
150         for (; ptr; ptr = ptr->next)
151         {
152             if (ptr->type == XML_ELEMENT_NODE)
153             {
154                 if (!strcmp((const char *) ptr->name, "collection"))
155                 {
156                     ptr = ptr->children;
157                     continue;
158                 }
159                 if (!strcmp((const char *) ptr->name, "record"))
160                 {
161                     int r = yaz_marc_read_xml(mt, ptr);
162                     if (r)
163                         fprintf(stderr, "yaz_marc_read_xml failed\n");
164                     else
165                     {
166                         yaz_marc_write_mode(mt, wrbuf);
167                         
168                         fputs(wrbuf_cstr(wrbuf), stdout);
169                         wrbuf_rewind(wrbuf);
170                     }
171                 }
172             }
173         }
174         xmlFreeDoc(doc);
175     }
176 #endif
177     fputs(wrbuf_cstr(wrbuf), stdout);
178     wrbuf_destroy(wrbuf);
179 }
180 #endif
181
182 static void dump(const char *fname, const char *from, const char *to,
183                  int input_format, int output_format,
184                  int write_using_libxml2,
185                  int print_offset, const char *split_fname, int split_chunk,
186                  int verbose, FILE *cfile, const char *leader_spec)
187 {
188     yaz_marc_t mt = yaz_marc_create();
189     yaz_iconv_t cd = 0;
190
191     if (yaz_marc_leader_spec(mt, leader_spec))
192     {
193         fprintf(stderr, "bad leader spec: %s\n", leader_spec);
194         yaz_marc_destroy(mt);
195         exit(2);
196     }
197     if (from && to)
198     {
199         cd = yaz_iconv_open(to, from);
200         if (!cd)
201         {
202             fprintf(stderr, "conversion from %s to %s "
203                     "unsupported\n", from, to);
204             yaz_marc_destroy(mt);
205             exit(2);
206         }
207         yaz_marc_iconv(mt, cd);
208     }
209     yaz_marc_xml(mt, output_format);
210     yaz_marc_enable_collection(mt);
211     yaz_marc_write_using_libxml2(mt, write_using_libxml2);
212     yaz_marc_debug(mt, verbose);
213
214     if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_XCHANGE)
215     {
216 #if YAZ_HAVE_XML2
217         marcdump_read_xml(mt, fname);
218 #endif
219     }
220     else if (input_format == YAZ_MARC_LINE)
221     {
222         marcdump_read_line(mt, fname);
223     }
224     else if (input_format == YAZ_MARC_ISO2709)
225     {
226         FILE *inf = fopen(fname, "rb");
227         int num = 1;
228         int marc_no = 0;
229         int split_file_no = -1;
230         if (!inf)
231         {
232             fprintf (stderr, "%s: cannot open %s:%s\n",
233                      prog, fname, strerror (errno));
234             exit(1);
235         }
236         if (cfile)
237             fprintf (cfile, "char *marc_records[] = {\n");
238         for(;; marc_no++)
239         {
240             const char *result = 0;
241             size_t len;
242             size_t rlen;
243             size_t len_result;
244             size_t r;
245             char buf[100001];
246             
247             r = fread (buf, 1, 5, inf);
248             if (r < 5)
249             {
250                 if (r && print_offset && verbose)
251                     printf ("<!-- Extra %ld bytes at end of file -->\n",
252                             (long) r);
253                 break;
254             }
255             while (*buf < '0' || *buf > '9')
256             {
257                 int i;
258                 long off = ftell(inf) - 5;
259                 if (verbose || print_offset)
260                     printf("<!-- Skipping bad byte %d (0x%02X) at offset "
261                            "%ld (0x%lx) -->\n", 
262                            *buf & 0xff, *buf & 0xff,
263                            off, off);
264                 for (i = 0; i<4; i++)
265                     buf[i] = buf[i+1];
266                 r = fread(buf+4, 1, 1, inf);
267                 if (r < 1)
268                     break;
269             }
270             if (r < 1)
271             {
272                 if (verbose || print_offset)
273                     printf ("<!-- End of file with data -->\n");
274                 break;
275             }
276             if (print_offset)
277             {
278                 long off = ftell(inf) - 5;
279                 printf ("<!-- Record %d offset %ld (0x%lx) -->\n",
280                         num, off, off);
281             }
282             len = atoi_n(buf, 5);
283             if (len < 25 || len > 100000)
284             {
285                 long off = ftell(inf) - 5;
286                 printf("Bad Length %ld read at offset %ld (%lx)\n",
287                        (long)len, (long) off, (long) off);
288                 break;
289             }
290             rlen = len - 5;
291             r = fread (buf + 5, 1, rlen, inf);
292             if (r < rlen)
293                 break;
294             while (buf[len-1] != ISO2709_RS)
295             {
296                 if (len > sizeof(buf)-2)
297                     break;
298                 r = fread (buf + len, 1, 1, inf);
299                 if (r != 1)
300                     break;
301                 len++;
302             }
303             if (split_fname)
304             {
305                 char fname[256];
306                 const char *mode = 0;
307                 FILE *sf;
308                 if ((marc_no % split_chunk) == 0)
309                 {
310                     mode = "wb";
311                     split_file_no++;
312                 }
313                 else
314                     mode = "ab";
315                 sprintf(fname, "%.200s%07d", split_fname, split_file_no);
316                 sf = fopen(fname, mode);
317                 if (!sf)
318                 {
319                     fprintf(stderr, "Could not open %s\n", fname);
320                     split_fname = 0;
321                 }
322                 else
323                 {
324                     if (fwrite(buf, 1, len, sf) != len)
325                     {
326                         fprintf(stderr, "Could write content to %s\n",
327                                 fname);
328                         split_fname = 0;
329                     }
330                     fclose(sf);
331                 }
332             }
333             len_result = rlen;
334             r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
335             if (r > 0 && result)
336             {
337                 fwrite (result, len_result, 1, stdout);
338             }
339             if (r > 0 && cfile)
340             {
341                 char *p = buf;
342                 size_t i;
343                 if (marc_no)
344                     fprintf (cfile, ",");
345                 fprintf (cfile, "\n");
346                 for (i = 0; i < r; i++)
347                 {
348                     if ((i & 15) == 0)
349                         fprintf (cfile, "  \"");
350                     fprintf (cfile, "\\x%02X", p[i] & 255);
351                     
352                     if (i < r - 1 && (i & 15) == 15)
353                         fprintf (cfile, "\"\n");
354                     
355                 }
356                 fprintf (cfile, "\"\n");
357             }
358             num++;
359             if (verbose)
360                 printf("\n");
361         }
362         if (cfile)
363             fprintf (cfile, "};\n");
364         fclose(inf);
365     }
366     {
367         WRBUF wrbuf = wrbuf_alloc();
368         yaz_marc_write_trailer(mt, wrbuf);
369         fputs(wrbuf_cstr(wrbuf), stdout);
370         wrbuf_destroy(wrbuf);
371     }
372     if (cd)
373         yaz_iconv_close(cd);
374     yaz_marc_destroy(mt);
375 }
376
377 int main (int argc, char **argv)
378 {
379     int r;
380     int print_offset = 0;
381     char *arg;
382     int verbose = 0;
383     int no = 0;
384     int output_format = YAZ_MARC_LINE;
385     FILE *cfile = 0;
386     char *from = 0, *to = 0;
387     int input_format = YAZ_MARC_ISO2709;
388     int split_chunk = 1;
389     const char *split_fname = 0;
390     const char *leader_spec = 0;
391     int write_using_libxml2 = 0;
392
393 #if HAVE_LOCALE_H
394     setlocale(LC_CTYPE, "");
395 #endif
396 #if HAVE_LANGINFO_H
397 #ifdef CODESET
398     to = nl_langinfo(CODESET);
399 #endif
400 #endif
401
402     prog = *argv;
403     while ((r = options("i:o:C:npvc:xOeXIf:t:s:l:", argv, argc, &arg)) != -2)
404     {
405         no++;
406         switch (r)
407         {
408         case 'i':
409             input_format = yaz_marc_decode_formatstr(arg);
410             if (input_format == -1)
411             {
412                 fprintf(stderr, "%s: bad input format: %s\n", prog, arg);
413                 exit(1);
414             }
415 #if YAZ_HAVE_XML2
416 #else
417             if (input_format == YAZ_MARC_MARCXML 
418                 || input_format == YAZ_MARC_XCHANGE)
419             {
420                 fprintf(stderr, "%s: Libxml2 support not enabled\n", prog);
421                 exit(3);
422             }
423 #endif
424             break;
425         case 'o':
426             /* dirty hack so we can make Libxml2 do the writing ..
427                rather than WRBUF */
428             if (strlen(arg) > 4 && strncmp(arg, "xml,", 4) == 0)
429             {
430                 arg = arg + 4;
431                 write_using_libxml2 = 1;
432             }
433             output_format = yaz_marc_decode_formatstr(arg);
434             if (output_format == -1)
435             {
436                 fprintf(stderr, "%s: bad output format: %s\n", prog, arg);
437                 exit(1);
438             }
439             break;
440         case 'l':
441             leader_spec = arg;
442             break;
443         case 'f':
444             from = arg;
445             break;
446         case 't':
447             to = arg;
448             break;
449         case 'c':
450             if (cfile)
451                 fclose (cfile);
452             cfile = fopen(arg, "w");
453             break;
454         case 'x':
455             fprintf(stderr, "%s: -x no longer supported. "
456                     "Use -i marcxml instead\n", prog);
457             exit(1);
458             break;
459         case 'O':
460             fprintf(stderr, "%s: OAI MARC no longer supported."
461                     " Use MARCXML instead.\n", prog);
462             exit(1);
463             break;
464         case 'e':
465             fprintf(stderr, "%s: -e no longer supported. "
466                     "Use -o marcxchange instead\n", prog);
467             exit(1);
468             break;
469         case 'X':
470             fprintf(stderr, "%s: -X no longer supported. "
471                     "Use -o marcxml instead\n", prog);
472             exit(1);
473             break;
474         case 'I':
475             fprintf(stderr, "%s: -I no longer supported. "
476                     "Use -o marc instead\n", prog);
477             exit(1);
478             break;
479         case 'n':
480             output_format = YAZ_MARC_CHECK;
481             break;
482         case 'p':
483             print_offset = 1;
484             break;
485         case 's':
486             split_fname = arg;
487             break;
488         case 'C':
489             split_chunk = atoi(arg);
490             break;
491         case 0:
492             dump(arg, from, to, input_format, output_format,
493                  write_using_libxml2,
494                  print_offset, split_fname, split_chunk,
495                  verbose, cfile, leader_spec);
496             break;
497         case 'v':
498             verbose++;
499             break;
500         default:
501             usage(prog);
502             exit(1);
503         }
504     }
505     if (cfile)
506         fclose (cfile);
507     if (!no)
508     {
509         usage(prog);
510         exit (1);
511     }
512     exit (0);
513 }
514 /*
515  * Local variables:
516  * c-basic-offset: 4
517  * indent-tabs-mode: nil
518  * End:
519  * vim: shiftwidth=4 tabstop=8 expandtab
520  */
521