Bump year
[yaz-moved-to-github.git] / src / iconv_decode_marc8.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file
7  * \brief MARC-8 decoding
8  *
9  * MARC-8 reference:
10  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
11  */
12
13 #if HAVE_CONFIG_H
14 #include <config.h>
15 #endif
16
17 #include <assert.h>
18 #include <errno.h>
19 #include <string.h>
20
21 #include <yaz/xmalloc.h>
22 #include "iconv-p.h"
23
/** \brief Per-conversion state of the MARC-8 decoder */
struct decoder_data {
    /* Final character of the escape sequence that last selected the
       character set for bytes below 128 (G0) */
    int g0_mode;
    /* Same, for bytes of 128 and above (G1) */
    int g1_mode;

    /* Buffered combining characters: entries [comb_offset, comb_size)
       are still waiting to be handed out by read_marc8 */
    int comb_offset;
    int comb_size;
    /* Code points of the buffered combining characters */
    unsigned long comb_x[8];
    /* Number of input bytes each buffered character occupied */
    size_t comb_no_read[8];
};
33
34 yaz_conv_func_t yaz_marc8_42_conv;
35 yaz_conv_func_t yaz_marc8_45_conv;
36 yaz_conv_func_t yaz_marc8_67_conv;
37 yaz_conv_func_t yaz_marc8_62_conv;
38 yaz_conv_func_t yaz_marc8_70_conv;
39 yaz_conv_func_t yaz_marc8_32_conv;
40 yaz_conv_func_t yaz_marc8_4E_conv;
41 yaz_conv_func_t yaz_marc8_51_conv;
42 yaz_conv_func_t yaz_marc8_33_conv;
43 yaz_conv_func_t yaz_marc8_34_conv;
44 yaz_conv_func_t yaz_marc8_53_conv;
45 yaz_conv_func_t yaz_marc8_31_conv;
46
47
48 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd,
49                                          struct decoder_data *data,
50                                          unsigned char *inp,
51                                          size_t inbytesleft, size_t *no_read,
52                                          int *comb);
53
54 static unsigned long read_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
55                                unsigned char *inp,
56                                size_t inbytesleft, size_t *no_read)
57 {
58     struct decoder_data *data = (struct decoder_data *) d->data;
59     unsigned long x;
60     if (data->comb_offset < data->comb_size)
61     {
62         *no_read = data->comb_no_read[data->comb_offset];
63         x = data->comb_x[data->comb_offset];
64
65         /* special case for double-diacritic combining characters,
66            INVERTED BREVE and DOUBLE TILDE.
67            We'll increment the no_read counter by 1, since we want to skip over
68            the processing of the closing ligature character
69         */
70         /* this code is no longer necessary.. our handlers code in
71            yaz_marc8_?_conv (generated by charconv.tcl) now returns
72            0 and no_read=1 when a sequence does not match the input.
73            The SECOND HALFs in codetables.xml produces a non-existant
74            entry in the conversion trie.. Hence when met, the input byte is
75            skipped as it should (in yaz_iconv)
76         */
77 #if 0
78         if (x == 0x0361 || x == 0x0360)
79             *no_read += 1;
80 #endif
81         data->comb_offset++;
82         return x;
83     }
84
85     data->comb_offset = 0;
86     for (data->comb_size = 0; data->comb_size < 8; data->comb_size++)
87     {
88         int comb = 0;
89
90         if (inbytesleft == 0 && data->comb_size)
91         {
92             yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL);
93             x = 0;
94             *no_read = 0;
95             break;
96         }
97         x = yaz_read_marc8_comb(cd, data, inp, inbytesleft, no_read, &comb);
98         if (!comb || !x)
99             break;
100         data->comb_x[data->comb_size] = x;
101         data->comb_no_read[data->comb_size] = *no_read;
102         inp += *no_read;
103         inbytesleft = inbytesleft - *no_read;
104     }
105     return x;
106 }
107
108 static unsigned long read_marc8s(yaz_iconv_t cd, yaz_iconv_decoder_t d,
109                                  unsigned char *inp,
110                                  size_t inbytesleft, size_t *no_read)
111 {
112     struct decoder_data *data = (struct decoder_data *) d->data;
113     unsigned long x = read_marc8(cd, d, inp, inbytesleft, no_read);
114     if (x && data->comb_size == 1)
115     {
116         if (yaz_iso_8859_1_lookup_x12(x, data->comb_x[0], &x))
117         {
118             *no_read += data->comb_no_read[0];
119             data->comb_size = 0;
120         }
121     }
122     return x;
123 }
124
125 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd,
126                                          struct decoder_data *data,
127                                          unsigned char *inp,
128                                          size_t inbytesleft, size_t *no_read,
129                                          int *comb)
130 {
131     *no_read = 0;
132     while (inbytesleft > 0 && *inp == 27)
133     {
134         int *modep = &data->g0_mode;
135         size_t inbytesleft0 = inbytesleft;
136
137         inbytesleft--;
138         inp++;
139         if (inbytesleft == 0)
140             goto incomplete;
141         if (*inp == '$') /* set with multiple bytes */
142         {
143             inbytesleft--;
144             inp++;
145         }
146         if (inbytesleft == 0)
147             goto incomplete;
148         if (*inp == '(' || *inp == ',')  /* G0 */
149         {
150             inbytesleft--;
151             inp++;
152         }
153         else if (*inp == ')' || *inp == '-') /* G1 */
154         {
155             inbytesleft--;
156             inp++;
157             modep = &data->g1_mode;
158         }
159         if (inbytesleft == 0)
160             goto incomplete;
161         if (*inp == '!') /* ANSEL is a special case */
162         {
163             inbytesleft--;
164             inp++;
165         }
166         if (inbytesleft == 0)
167             goto incomplete;
168         *modep = *inp++; /* Final character */
169         inbytesleft--;
170
171         (*no_read) += inbytesleft0 - inbytesleft;
172     }
173     if (inbytesleft == 0)
174         return 0;
175     else if (*inp == ' ')
176     {
177         *no_read += 1;
178         return ' ';
179     }
180     else
181     {
182         unsigned long x;
183         size_t no_read_sub = 0;
184         int mode = *inp < 128 ? data->g0_mode : data->g1_mode;
185         *comb = 0;
186
187         switch(mode)
188         {
189         case 'B':  /* Basic ASCII */
190         case 's':  /* ASCII */
191             x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
192             break;
193         case 'E':  /* ANSEL */
194             x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
195             break;
196         case 'g':  /* Greek */
197             x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
198             break;
199         case 'b':  /* Subscripts */
200             x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
201             break;
202         case 'p':  /* Superscripts */
203             x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
204             break;
205         case '2':  /* Basic Hebrew */
206             x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
207             break;
208         case 'N':  /* Basic Cyrillic */
209             x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
210             break;
211         case 'Q':  /* Extended Cyrillic */
212             x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
213             break;
214         case '3':  /* Basic Arabic */
215             x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
216             break;
217         case '4':  /* Extended Arabic */
218             x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
219             break;
220         case 'S':  /* Greek */
221             x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
222             break;
223         case '1':  /* Chinese, Japanese, Korean (EACC) */
224             x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
225             break;
226         default:
227             *no_read = 0;
228             yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
229             return 0;
230         }
231         *no_read += no_read_sub;
232         return x;
233     }
234 incomplete:
235     *no_read = 0;
236     yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL);
237     return 0;
238 }
239
240
241 static size_t init_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
242                          unsigned char *inp,
243                          size_t inbytesleft, size_t *no_read)
244 {
245     struct decoder_data *data = (struct decoder_data *) d->data;
246     data->g0_mode = 'B';
247     data->g1_mode = 'E';
248     data->comb_offset = data->comb_size = 0;
249     return 0;
250 }
251
252 void destroy_marc8(yaz_iconv_decoder_t d)
253 {
254     struct decoder_data *data = (struct decoder_data *) d->data;
255     xfree(data);
256 }
257
258 yaz_iconv_decoder_t yaz_marc8_decoder(const char *fromcode,
259                                       yaz_iconv_decoder_t d)
260 {
261     if (!yaz_matchstr(fromcode, "MARC8") || !yaz_matchstr(fromcode, "ANSEL"))
262         d->read_handle = read_marc8;
263     else if (!yaz_matchstr(fromcode, "MARC8s"))
264         d->read_handle = read_marc8s;
265     else
266         return 0;
267     {
268         struct decoder_data *data = (struct decoder_data *)
269             xmalloc(sizeof(*data));
270         d->data = data;
271         d->init_handle = init_marc8;
272         d->destroy_handle = destroy_marc8;
273     }
274     return d;
275 }
276
277
278 /*
279  * Local variables:
280  * c-basic-offset: 4
281  * c-file-style: "Stroustrup"
282  * indent-tabs-mode: nil
283  * End:
284  * vim: shiftwidth=4 tabstop=8 expandtab
285  */
286