Added continuation line support for MARC line format reader.
[yaz-moved-to-github.git] / src / marc_read_line.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: marc_read_line.c,v 1.6 2007-03-18 13:00:37 adam Exp $
6  */
7
8 /**
9  * \file marc_read_line.c
10  * \brief Implements reading of MARC in line format
11  */
12
13 #if HAVE_CONFIG_H
14 #include <config.h>
15 #endif
16
17 #ifdef WIN32
18 #include <windows.h>
19 #endif
20
21 #include <assert.h>
22 #include <stdio.h>
23 #include <string.h>
24 #include <ctype.h>
25
26 #include <yaz/marcdisp.h>
27 #include <yaz/wrbuf.h>
28 #include <yaz/yaz-util.h>
29
30 int yaz_gets(int (*getbyte)(void *client_data),
31              void (*ungetbyte)(int b, void *client_data),
32              void *client_data,
33              WRBUF w)
34 {
35     size_t sz = 0;
36     int ch = getbyte(client_data);
37
38     while (ch != '\0' && ch != '\r' && ch != '\n')
39     {
40         wrbuf_putc(w, ch);
41         sz++;
42         ch = getbyte(client_data);
43     }
44     if (ch == '\r')
45     {
46         ch = getbyte(client_data);
47         if (ch != '\n' && ch != '\0')
48             ungetbyte(ch, client_data);
49     }
50     else if (ch == '\n')
51     {
52         ch = getbyte(client_data);
53         if (ch != '\r' && ch != '\0')
54             ungetbyte(ch, client_data);
55     }
56     if (sz)
57     {
58         return 1;
59     }
60     return 0;
61 }
62
63 static int yaz_marc_line_gets(int (*getbyte)(void *client_data),
64                               void (*ungetbyte)(int b, void *client_data),
65                               void *client_data,
66                               WRBUF w)
67 {
68     int more;
69
70     wrbuf_rewind(w);
71     more = yaz_gets(getbyte, ungetbyte, client_data, w);
72     if (!more)
73         return 0;
74
75     while (more)
76     {
77         int i;
78         for (i = 0; i<4; i++)
79         {
80             int ch = getbyte(client_data);
81             if (ch != ' ')
82             {
83                 if (ch)
84                     ungetbyte(ch, client_data);
85                 return 1;
86             }
87         }
88         if (wrbuf_len(w) > 60 && wrbuf_buf(w)[wrbuf_len(w)-1] == '=')
89             wrbuf_cut_right(w, 1);
90         else
91             wrbuf_puts(w, " ");
92         more = yaz_gets(getbyte, ungetbyte, client_data, w);
93     }
94     return 1;
95 }
96
97     
98 int yaz_marc_read_line(yaz_marc_t mt,
99                        int (*getbyte)(void *client_data),
100                        void (*ungetbyte)(int b, void *client_data),
101                        void *client_data)
102 {
103     int indicator_length;
104     int identifier_length;
105     int base_address;
106     int length_data_entry;
107     int length_starting;
108     int length_implementation;
109     int marker_ch = 0;
110     int marker_skip = 0;
111     int header_created = 0;
112     WRBUF wrbuf_line = wrbuf_alloc();
113
114     yaz_marc_reset(mt);
115
116     while (yaz_marc_line_gets(getbyte, ungetbyte, client_data, wrbuf_line))
117     {
118         const char *line = wrbuf_cstr(wrbuf_line);
119         int val;
120         size_t line_len = strlen(line);
121         if (line_len == 0)       /* empty line indicates end of record */
122         {
123             if (header_created)
124                 break;
125         }
126         else if (line[0] == '$') /* indicates beginning/end of record */
127         {
128             if (header_created)
129                 break;
130         }
131         else if (line[0] == '(') /* annotation, skip it */
132             ;
133         else if (line_len == 24 && atoi_n_check(line, 5, &val) && val >= 24)
134         {
135             /* deal with header lines:  00366nam  22001698a 4500
136             */
137
138             if (header_created)
139                 break;
140             yaz_marc_set_leader(mt, line,
141                                 &indicator_length,
142                                 &identifier_length,
143                                 &base_address,
144                                 &length_data_entry,
145                                 &length_starting,
146                                 &length_implementation);
147             header_created = 1;
148         }
149         else if (line_len > 5 && line[0] != ' ' && line[1] != ' ' 
150                  && line[2] != ' ' && line[3] == ' ' )
151         {
152             /* deal with data/control lines: 245 12 ........ */
153             char tag[4];
154             const char *datafield_start = line+6;
155             marker_ch = 0;
156             marker_skip = 0;
157
158             memcpy(tag, line, 3);
159             tag[3] = '\0';
160             if (line_len >= 8) /* control - or datafield ? */
161             {
162                 if (*datafield_start == ' ')
163                     datafield_start++;  /* skip blank after indicator */
164
165                 if (strchr("$_*", *datafield_start))
166                 {
167                     marker_ch = *datafield_start;
168                     if (datafield_start[2] == ' ')
169                         marker_skip = 1; /* subfields has blank before data */
170                 }
171             }
172             if (!header_created)
173             {
174                 const char *leader = "01000cam  2200265 i 4500";
175
176                 yaz_marc_set_leader(mt, leader,
177                                     &indicator_length,
178                                     &identifier_length,
179                                     &base_address,
180                                     &length_data_entry,
181                                     &length_starting,
182                                     &length_implementation);
183                 header_created = 1;
184             }
185
186             if (marker_ch == 0)
187             {   /* control field */
188                 yaz_marc_add_controlfield(mt, tag, line+4, strlen(line+4));
189             }
190             else
191             {   /* data field */
192                 const char *indicator = line+4;
193                 int indicator_len = 2;
194                 const char *cp = datafield_start;
195
196                 yaz_marc_add_datafield(mt, tag, indicator, indicator_len);
197                 for (;;)
198                 {
199                     const char *next;
200                     size_t len;
201                     
202                     assert(cp[0] == marker_ch);
203                     cp++;
204                     next = cp;
205                     while ((next = strchr(next, marker_ch)))
206                     {
207                         if ((next[1] >= 'A' && next[1] <= 'Z')
208                             ||(next[1] >= 'a' && next[1] <= 'z'))
209                         {
210                             if (!marker_skip)
211                                 break;
212                             else if (next[2] == ' ')
213                                 break;
214                         }
215                         next++;
216                     }
217                     len = strlen(cp);
218                     if (next)
219                         len = next - cp - marker_skip;
220
221                     if (marker_skip)
222                     {
223                         /* remove ' ' after subfield marker */
224                         char *cp_blank = strchr(cp, ' ');
225                         if (cp_blank)
226                         {
227                             len--;
228                             while (cp_blank != cp)
229                             {
230                                 cp_blank[0] = cp_blank[-1];
231                                 cp_blank--;
232                             }
233                             cp++;
234                         }
235                     }
236                     assert(len >= 0);
237                     yaz_marc_add_subfield(mt, cp, len);
238                     if (!next)
239                         break;
240                     cp = next;
241                 }
242             }
243         }
244         else
245         {
246             yaz_marc_cprintf(mt, "Ignoring line: %s", line);
247         }
248     }
249     wrbuf_destroy(wrbuf_line);
250     if (!header_created)
251         return -1;
252     return 0;
253 }
254
255 /*
256  * Local variables:
257  * c-basic-offset: 4
258  * indent-tabs-mode: nil
259  * End:
260  * vim: shiftwidth=4 tabstop=8 expandtab
261  */
262