Support read/write MARCXML collections.
[yaz-moved-to-github.git] / util / marcdump.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: marcdump.c,v 1.54 2007-12-17 20:59:32 adam Exp $
6  */
7
8 #define _FILE_OFFSET_BITS 64
9
#if HAVE_CONFIG_H
#include <config.h>
#endif

#if YAZ_HAVE_XML2
#include <libxml/parser.h>
#include <libxml/tree.h>
#include <libxml/xmlreader.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>

#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <assert.h>

#if HAVE_LOCALE_H
#include <locale.h>
#endif
#if HAVE_LANGINFO_H
#include <langinfo.h>
#endif

#include <yaz/marcdisp.h>
#include <yaz/yaz-util.h>
#include <yaz/xmalloc.h>
#include <yaz/options.h>
40
41 #ifndef SEEK_SET
42 #define SEEK_SET 0
43 #endif
44 #ifndef SEEK_END
45 #define SEEK_END 2
46 #endif
47
48
49 static char *prog;
50
/*
 * Print the command-line synopsis to stderr.
 *
 * prog: program name shown at the start of the usage line.
 */
static void usage(const char *prog)
{
    fprintf (stderr, "Usage: %s [-c cfile] [-f from] [-t to] "
             "[-i format] [-o format] "
             "[-n] [-l pos=value] [-v] [-C chunk] [-s splitfname] [-p] file...\n",
             prog);
}
58
/*
 * Byte-reader callback: fetch the next byte from the stdio stream
 * passed as client_data.  When fgetc() reports EOF the caller gets 0
 * instead.
 */
static int getbyte_stream(void *client_data)
{
    int ch = fgetc((FILE *) client_data);

    return ch == EOF ? 0 : ch;
}
68
/*
 * Push-back callback matching getbyte_stream(): return byte c to the
 * stdio stream passed as client_data.  A value of 0 (what
 * getbyte_stream() hands out at end of input) is mapped back to EOF
 * before being given to ungetc().
 */
static void ungetbyte_stream(int c, void *client_data)
{
    FILE *stream = (FILE *) client_data;

    ungetc(c == 0 ? EOF : c, stream);
}
77
78 static void marcdump_read_line(yaz_marc_t mt, const char *fname)
79 {
80     FILE *inf = fopen(fname, "rb");
81     if (!inf)
82     {
83         fprintf (stderr, "%s: cannot open %s:%s\n",
84                  prog, fname, strerror (errno));
85         exit(1);
86     }
87     
88     while (yaz_marc_read_line(mt, getbyte_stream,
89                               ungetbyte_stream, inf) == 0)
90     {
91         WRBUF wrbuf = wrbuf_alloc();
92         yaz_marc_write_mode(mt, wrbuf);
93         fputs(wrbuf_cstr(wrbuf), stdout);
94         wrbuf_destroy(wrbuf);
95     }
96     {
97         WRBUF wrbuf = wrbuf_alloc();
98         yaz_marc_write_trailer(mt, wrbuf);
99         fputs(wrbuf_cstr(wrbuf), stdout);
100         wrbuf_destroy(wrbuf);
101     }
102     fclose(inf);
103 }
104
#if YAZ_HAVE_XML2
/*
 * Stream through the XML document in fname and, for every element
 * named "record", decode it with yaz_marc_read_xml() and write it to
 * stdout in the output mode configured on mt.
 *
 * mt:    MARC handle used both for reading and for writing.
 * fname: path of the XML input; the program exits with status 1 if a
 *        reader cannot be created for it (same policy as the other
 *        input readers in this file).
 *
 * Records that cannot be expanded or decoded are reported on stderr
 * and skipped.  No collection trailer is written here: dump() calls
 * yaz_marc_write_trailer() itself once this function has returned, so
 * writing it here as well would emit it twice.
 */
static void marcdump_read_xml(yaz_marc_t mt, const char *fname)
{
    int ret;
    WRBUF wrbuf;
    xmlTextReaderPtr reader = xmlReaderForFile(fname, 0 /* encoding */,
                                               0 /* options */);

    if (!reader)
    {
        fprintf(stderr, "%s: cannot open %s\n", prog, fname);
        exit(1);
    }

    wrbuf = wrbuf_alloc();
    while ((ret = xmlTextReaderRead(reader)) == 1)
    {
        const char *name;
        xmlNodePtr ptr;

        if (xmlTextReaderNodeType(reader) != XML_READER_TYPE_ELEMENT)
            continue;
        name = (const char *) xmlTextReaderConstName(reader);
        if (!name || strcmp(name, "record"))
            continue;

        /* build the subtree for this record so it can be decoded */
        ptr = xmlTextReaderExpand(reader);
        if (!ptr || yaz_marc_read_xml(mt, ptr))
        {
            fprintf(stderr, "yaz_marc_read_xml failed\n");
            continue;
        }
        yaz_marc_write_mode(mt, wrbuf);
        fputs(wrbuf_cstr(wrbuf), stdout);
        wrbuf_rewind(wrbuf);
    }
    /* the loop ends with 0 at end of document and a negative value on error */
    if (ret < 0)
        fprintf(stderr, "%s: XML parse error in %s\n", prog, fname);

    wrbuf_destroy(wrbuf);
    xmlFreeTextReader(reader);
}
#endif
146
147 static void dump(const char *fname, const char *from, const char *to,
148                  int input_format, int output_format,
149                  int write_using_libxml2,
150                  int print_offset, const char *split_fname, int split_chunk,
151                  int verbose, FILE *cfile, const char *leader_spec)
152 {
153     yaz_marc_t mt = yaz_marc_create();
154     yaz_iconv_t cd = 0;
155
156     if (yaz_marc_leader_spec(mt, leader_spec))
157     {
158         fprintf(stderr, "bad leader spec: %s\n", leader_spec);
159         yaz_marc_destroy(mt);
160         exit(2);
161     }
162     if (from && to)
163     {
164         cd = yaz_iconv_open(to, from);
165         if (!cd)
166         {
167             fprintf(stderr, "conversion from %s to %s "
168                     "unsupported\n", from, to);
169             yaz_marc_destroy(mt);
170             exit(2);
171         }
172         yaz_marc_iconv(mt, cd);
173     }
174     yaz_marc_xml(mt, output_format);
175     yaz_marc_enable_collection(mt);
176     yaz_marc_write_using_libxml2(mt, write_using_libxml2);
177     yaz_marc_debug(mt, verbose);
178
179     if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_XCHANGE)
180     {
181 #if YAZ_HAVE_XML2
182         marcdump_read_xml(mt, fname);
183 #endif
184     }
185     else if (input_format == YAZ_MARC_LINE)
186     {
187         marcdump_read_line(mt, fname);
188     }
189     else if (input_format == YAZ_MARC_ISO2709)
190     {
191         FILE *inf = fopen(fname, "rb");
192         int num = 1;
193         int marc_no = 0;
194         int split_file_no = -1;
195         if (!inf)
196         {
197             fprintf (stderr, "%s: cannot open %s:%s\n",
198                      prog, fname, strerror (errno));
199             exit(1);
200         }
201         if (cfile)
202             fprintf (cfile, "char *marc_records[] = {\n");
203         for(;; marc_no++)
204         {
205             const char *result = 0;
206             size_t len;
207             size_t rlen;
208             size_t len_result;
209             size_t r;
210             char buf[100001];
211             
212             r = fread (buf, 1, 5, inf);
213             if (r < 5)
214             {
215                 if (r && print_offset && verbose)
216                     printf ("<!-- Extra %ld bytes at end of file -->\n",
217                             (long) r);
218                 break;
219             }
220             while (*buf < '0' || *buf > '9')
221             {
222                 int i;
223                 long off = ftell(inf) - 5;
224                 if (verbose || print_offset)
225                     printf("<!-- Skipping bad byte %d (0x%02X) at offset "
226                            "%ld (0x%lx) -->\n", 
227                            *buf & 0xff, *buf & 0xff,
228                            off, off);
229                 for (i = 0; i<4; i++)
230                     buf[i] = buf[i+1];
231                 r = fread(buf+4, 1, 1, inf);
232                 if (r < 1)
233                     break;
234             }
235             if (r < 1)
236             {
237                 if (verbose || print_offset)
238                     printf ("<!-- End of file with data -->\n");
239                 break;
240             }
241             if (print_offset)
242             {
243                 long off = ftell(inf) - 5;
244                 printf ("<!-- Record %d offset %ld (0x%lx) -->\n",
245                         num, off, off);
246             }
247             len = atoi_n(buf, 5);
248             if (len < 25 || len > 100000)
249             {
250                 long off = ftell(inf) - 5;
251                 printf("Bad Length %ld read at offset %ld (%lx)\n",
252                        (long)len, (long) off, (long) off);
253                 break;
254             }
255             rlen = len - 5;
256             r = fread (buf + 5, 1, rlen, inf);
257             if (r < rlen)
258                 break;
259             while (buf[len-1] != ISO2709_RS)
260             {
261                 if (len > sizeof(buf)-2)
262                     break;
263                 r = fread (buf + len, 1, 1, inf);
264                 if (r != 1)
265                     break;
266                 len++;
267             }
268             if (split_fname)
269             {
270                 char fname[256];
271                 const char *mode = 0;
272                 FILE *sf;
273                 if ((marc_no % split_chunk) == 0)
274                 {
275                     mode = "wb";
276                     split_file_no++;
277                 }
278                 else
279                     mode = "ab";
280                 sprintf(fname, "%.200s%07d", split_fname, split_file_no);
281                 sf = fopen(fname, mode);
282                 if (!sf)
283                 {
284                     fprintf(stderr, "Could not open %s\n", fname);
285                     split_fname = 0;
286                 }
287                 else
288                 {
289                     if (fwrite(buf, 1, len, sf) != len)
290                     {
291                         fprintf(stderr, "Could write content to %s\n",
292                                 fname);
293                         split_fname = 0;
294                     }
295                     fclose(sf);
296                 }
297             }
298             len_result = rlen;
299             r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
300             if (r > 0 && result)
301             {
302                 fwrite (result, len_result, 1, stdout);
303             }
304             if (r > 0 && cfile)
305             {
306                 char *p = buf;
307                 size_t i;
308                 if (marc_no)
309                     fprintf (cfile, ",");
310                 fprintf (cfile, "\n");
311                 for (i = 0; i < r; i++)
312                 {
313                     if ((i & 15) == 0)
314                         fprintf (cfile, "  \"");
315                     fprintf (cfile, "\\x%02X", p[i] & 255);
316                     
317                     if (i < r - 1 && (i & 15) == 15)
318                         fprintf (cfile, "\"\n");
319                     
320                 }
321                 fprintf (cfile, "\"\n");
322             }
323             num++;
324             if (verbose)
325                 printf("\n");
326         }
327         if (cfile)
328             fprintf (cfile, "};\n");
329         fclose(inf);
330     }
331     {
332         WRBUF wrbuf = wrbuf_alloc();
333         yaz_marc_write_trailer(mt, wrbuf);
334         fputs(wrbuf_cstr(wrbuf), stdout);
335         wrbuf_destroy(wrbuf);
336     }
337     if (cd)
338         yaz_iconv_close(cd);
339     yaz_marc_destroy(mt);
340 }
341
342 int main (int argc, char **argv)
343 {
344     int r;
345     int print_offset = 0;
346     char *arg;
347     int verbose = 0;
348     int no = 0;
349     int output_format = YAZ_MARC_LINE;
350     FILE *cfile = 0;
351     char *from = 0, *to = 0;
352     int input_format = YAZ_MARC_ISO2709;
353     int split_chunk = 1;
354     const char *split_fname = 0;
355     const char *leader_spec = 0;
356     int write_using_libxml2 = 0;
357
358 #if HAVE_LOCALE_H
359     setlocale(LC_CTYPE, "");
360 #endif
361 #if HAVE_LANGINFO_H
362 #ifdef CODESET
363     to = nl_langinfo(CODESET);
364 #endif
365 #endif
366
367     prog = *argv;
368     while ((r = options("i:o:C:npvc:xOeXIf:t:s:l:", argv, argc, &arg)) != -2)
369     {
370         no++;
371         switch (r)
372         {
373         case 'i':
374             input_format = yaz_marc_decode_formatstr(arg);
375             if (input_format == -1)
376             {
377                 fprintf(stderr, "%s: bad input format: %s\n", prog, arg);
378                 exit(1);
379             }
380 #if YAZ_HAVE_XML2
381 #else
382             if (input_format == YAZ_MARC_MARCXML 
383                 || input_format == YAZ_MARC_XCHANGE)
384             {
385                 fprintf(stderr, "%s: Libxml2 support not enabled\n", prog);
386                 exit(3);
387             }
388 #endif
389             break;
390         case 'o':
391             /* dirty hack so we can make Libxml2 do the writing ..
392                rather than WRBUF */
393             if (strlen(arg) > 4 && strncmp(arg, "xml,", 4) == 0)
394             {
395                 arg = arg + 4;
396                 write_using_libxml2 = 1;
397             }
398             output_format = yaz_marc_decode_formatstr(arg);
399             if (output_format == -1)
400             {
401                 fprintf(stderr, "%s: bad output format: %s\n", prog, arg);
402                 exit(1);
403             }
404             break;
405         case 'l':
406             leader_spec = arg;
407             break;
408         case 'f':
409             from = arg;
410             break;
411         case 't':
412             to = arg;
413             break;
414         case 'c':
415             if (cfile)
416                 fclose (cfile);
417             cfile = fopen(arg, "w");
418             break;
419         case 'x':
420             fprintf(stderr, "%s: -x no longer supported. "
421                     "Use -i marcxml instead\n", prog);
422             exit(1);
423             break;
424         case 'O':
425             fprintf(stderr, "%s: OAI MARC no longer supported."
426                     " Use MARCXML instead.\n", prog);
427             exit(1);
428             break;
429         case 'e':
430             fprintf(stderr, "%s: -e no longer supported. "
431                     "Use -o marcxchange instead\n", prog);
432             exit(1);
433             break;
434         case 'X':
435             fprintf(stderr, "%s: -X no longer supported. "
436                     "Use -o marcxml instead\n", prog);
437             exit(1);
438             break;
439         case 'I':
440             fprintf(stderr, "%s: -I no longer supported. "
441                     "Use -o marc instead\n", prog);
442             exit(1);
443             break;
444         case 'n':
445             output_format = YAZ_MARC_CHECK;
446             break;
447         case 'p':
448             print_offset = 1;
449             break;
450         case 's':
451             split_fname = arg;
452             break;
453         case 'C':
454             split_chunk = atoi(arg);
455             break;
456         case 0:
457             dump(arg, from, to, input_format, output_format,
458                  write_using_libxml2,
459                  print_offset, split_fname, split_chunk,
460                  verbose, cfile, leader_spec);
461             break;
462         case 'v':
463             verbose++;
464             break;
465         default:
466             usage(prog);
467             exit(1);
468         }
469     }
470     if (cfile)
471         fclose (cfile);
472     if (!no)
473     {
474         usage(prog);
475         exit (1);
476     }
477     exit (0);
478 }
479 /*
480  * Local variables:
481  * c-basic-offset: 4
482  * indent-tabs-mode: nil
483  * End:
484  * vim: shiftwidth=4 tabstop=8 expandtab
485  */
486