Merge branch 'master' into yaz-728
[yaz-moved-to-github.git] / src / iconv_decode_marc8.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file
7  * \brief MARC-8 decoding
8  *
9  * MARC-8 reference:
10  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
11  */
12
13 #if HAVE_CONFIG_H
14 #include <config.h>
15 #endif
16
17 #include <assert.h>
18 #include <errno.h>
19 #include <string.h>
20
21 #include <yaz/xmalloc.h>
22 #include "iconv-p.h"
23
/** \brief Per-conversion state of the MARC-8 decoder */
struct decoder_data {
    /* final byte of the escape sequence that last designated G0 / G1;
       selects the conversion handler for bytes < 128 / >= 128 */
    int g0_mode, g1_mode;

    /* queue of combining characters waiting to be handed out after
       their base character: entries [comb_offset, comb_size) are pending */
    int comb_offset, comb_size;
    unsigned long comb_x[8];     /* queued combining characters */
    size_t comb_no_read[8];      /* input bytes consumed by each queued one */

    /* non-zero: bytes below ' ' are returned as-is (set by init_marc8c) */
    int control_mode;
};
34
35 yaz_conv_func_t yaz_marc8_42_conv;
36 yaz_conv_func_t yaz_marc8_45_conv;
37 yaz_conv_func_t yaz_marc8_67_conv;
38 yaz_conv_func_t yaz_marc8_62_conv;
39 yaz_conv_func_t yaz_marc8_70_conv;
40 yaz_conv_func_t yaz_marc8_32_conv;
41 yaz_conv_func_t yaz_marc8_4E_conv;
42 yaz_conv_func_t yaz_marc8_51_conv;
43 yaz_conv_func_t yaz_marc8_33_conv;
44 yaz_conv_func_t yaz_marc8_34_conv;
45 yaz_conv_func_t yaz_marc8_53_conv;
46 yaz_conv_func_t yaz_marc8_31_conv;
47
48
49 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd,
50                                          struct decoder_data *data,
51                                          unsigned char *inp,
52                                          size_t inbytesleft, size_t *no_read,
53                                          int *comb);
54
55 static unsigned long read_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
56                                unsigned char *inp,
57                                size_t inbytesleft, size_t *no_read)
58 {
59     struct decoder_data *data = (struct decoder_data *) d->data;
60     unsigned long x;
61     if (data->comb_offset < data->comb_size)
62     {
63         *no_read = data->comb_no_read[data->comb_offset];
64         x = data->comb_x[data->comb_offset];
65
66         /* special case for double-diacritic combining characters,
67            INVERTED BREVE and DOUBLE TILDE.
68            We'll increment the no_read counter by 1, since we want to skip over
69            the processing of the closing ligature character
70         */
71         /* this code is no longer necessary.. our handlers code in
72            yaz_marc8_?_conv (generated by charconv.tcl) now returns
73            0 and no_read=1 when a sequence does not match the input.
74            The SECOND HALFs in codetables.xml produces a non-existant
75            entry in the conversion trie.. Hence when met, the input byte is
76            skipped as it should (in yaz_iconv)
77         */
78 #if 0
79         if (x == 0x0361 || x == 0x0360)
80             *no_read += 1;
81 #endif
82         data->comb_offset++;
83         return x;
84     }
85
86     data->comb_offset = 0;
87     for (data->comb_size = 0; data->comb_size < 8; data->comb_size++)
88     {
89         int comb = 0;
90
91         if (inbytesleft == 0 && data->comb_size)
92         {
93             yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL);
94             x = 0;
95             *no_read = 0;
96             break;
97         }
98         x = yaz_read_marc8_comb(cd, data, inp, inbytesleft, no_read, &comb);
99         if (!comb || !x)
100             break;
101         data->comb_x[data->comb_size] = x;
102         data->comb_no_read[data->comb_size] = *no_read;
103         inp += *no_read;
104         inbytesleft = inbytesleft - *no_read;
105     }
106     return x;
107 }
108
109 static unsigned long read_marc8s(yaz_iconv_t cd, yaz_iconv_decoder_t d,
110                                  unsigned char *inp,
111                                  size_t inbytesleft, size_t *no_read)
112 {
113     struct decoder_data *data = (struct decoder_data *) d->data;
114     unsigned long x = read_marc8(cd, d, inp, inbytesleft, no_read);
115     if (x && data->comb_size == 1)
116     {
117         if (yaz_iso_8859_1_lookup_x12(x, data->comb_x[0], &x))
118         {
119             *no_read += data->comb_no_read[0];
120             data->comb_size = 0;
121         }
122     }
123     return x;
124 }
125
126 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd,
127                                          struct decoder_data *data,
128                                          unsigned char *inp,
129                                          size_t inbytesleft, size_t *no_read,
130                                          int *comb)
131 {
132     *no_read = 0;
133     while (inbytesleft > 0 && *inp == 27)
134     {
135         int *modep = &data->g0_mode;
136         size_t inbytesleft0 = inbytesleft;
137
138         inbytesleft--;
139         inp++;
140         if (inbytesleft == 0)
141             goto incomplete;
142         if (*inp == '$') /* set with multiple bytes */
143         {
144             inbytesleft--;
145             inp++;
146         }
147         if (inbytesleft == 0)
148             goto incomplete;
149         if (*inp == '(' || *inp == ',')  /* G0 */
150         {
151             inbytesleft--;
152             inp++;
153         }
154         else if (*inp == ')' || *inp == '-') /* G1 */
155         {
156             inbytesleft--;
157             inp++;
158             modep = &data->g1_mode;
159         }
160         if (inbytesleft == 0)
161             goto incomplete;
162         if (*inp == '!') /* ANSEL is a special case */
163         {
164             inbytesleft--;
165             inp++;
166         }
167         if (inbytesleft == 0)
168             goto incomplete;
169         *modep = *inp++; /* Final character */
170         inbytesleft--;
171
172         (*no_read) += inbytesleft0 - inbytesleft;
173     }
174     if (inbytesleft == 0)
175         return 0;
176     else if (*inp == ' ')
177     {
178         *no_read += 1;
179         return ' ';
180     }
181     else if (*inp < ' ' && data->control_mode)
182     {
183         *no_read += 1;
184         return *inp;
185     }
186     else
187     {
188         unsigned long x;
189         size_t no_read_sub = 0;
190         int mode = *inp < 128 ? data->g0_mode : data->g1_mode;
191         *comb = 0;
192
193         switch(mode)
194         {
195         case 'B':  /* Basic ASCII */
196         case 's':  /* ASCII */
197             x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
198             break;
199         case 'E':  /* ANSEL */
200             x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
201             break;
202         case 'g':  /* Greek */
203             x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
204             break;
205         case 'b':  /* Subscripts */
206             x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
207             break;
208         case 'p':  /* Superscripts */
209             x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
210             break;
211         case '2':  /* Basic Hebrew */
212             x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
213             break;
214         case 'N':  /* Basic Cyrillic */
215             x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
216             break;
217         case 'Q':  /* Extended Cyrillic */
218             x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
219             break;
220         case '3':  /* Basic Arabic */
221             x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
222             break;
223         case '4':  /* Extended Arabic */
224             x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
225             break;
226         case 'S':  /* Greek */
227             x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
228             break;
229         case '1':  /* Chinese, Japanese, Korean (EACC) */
230             x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
231             break;
232         default:
233             *no_read = 0;
234             yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
235             return 0;
236         }
237         *no_read += no_read_sub;
238         return x;
239     }
240 incomplete:
241     *no_read = 0;
242     yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL);
243     return 0;
244 }
245
246
247 static size_t init_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
248                          unsigned char *inp,
249                          size_t inbytesleft, size_t *no_read)
250 {
251     struct decoder_data *data = (struct decoder_data *) d->data;
252     data->g0_mode = 'B';
253     data->g1_mode = 'E';
254     data->comb_offset = data->comb_size = 0;
255     data->control_mode = 0;
256     return 0;
257 }
258
259 static size_t init_marc8c(yaz_iconv_t cd, yaz_iconv_decoder_t d,
260                          unsigned char *inp,
261                          size_t inbytesleft, size_t *no_read)
262 {
263     struct decoder_data *data = (struct decoder_data *) d->data;
264
265     init_marc8(cd, d, inp, inbytesleft, no_read);
266     data->control_mode = 1;
267     return 0;
268 }
269
270 void destroy_marc8(yaz_iconv_decoder_t d)
271 {
272     struct decoder_data *data = (struct decoder_data *) d->data;
273     xfree(data);
274 }
275
276 yaz_iconv_decoder_t yaz_marc8_decoder(const char *fromcode,
277                                       yaz_iconv_decoder_t d)
278 {
279     if (!yaz_matchstr(fromcode, "MARC8") || !yaz_matchstr(fromcode, "ANSEL"))
280     {
281         d->read_handle = read_marc8;
282         d->init_handle = init_marc8;
283     }
284     else if (!yaz_matchstr(fromcode, "MARC8s"))
285     {
286         d->read_handle = read_marc8s;
287         d->init_handle = init_marc8;
288     }
289     else if (!yaz_matchstr(fromcode, "MARC8c"))
290     {
291         d->read_handle = read_marc8;
292         d->init_handle = init_marc8c;
293     }
294     else
295         return 0;
296     {
297         struct decoder_data *data = (struct decoder_data *)
298             xmalloc(sizeof(*data));
299         d->data = data;
300         d->destroy_handle = destroy_marc8;
301     }
302     return d;
303 }
304
305
306 /*
307  * Local variables:
308  * c-basic-offset: 4
309  * c-file-style: "Stroustrup"
310  * indent-tabs-mode: nil
311  * End:
312  * vim: shiftwidth=4 tabstop=8 expandtab
313  */
314