MARC-8: allow all characters 0x01-0x20 YAZ-650
[yaz-moved-to-github.git] / src / iconv_encode_marc8.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file
7  * \brief MARC-8 encoding
8  *
9  * MARC-8 reference:
10  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
11  */
12
13 #if HAVE_CONFIG_H
14 #include <config.h>
15 #endif
16
17 #include <assert.h>
18 #include <errno.h>
19 #include <string.h>
20
21 #include <yaz/xmalloc.h>
22 #include <yaz/snprintf.h>
23 #include "iconv-p.h"
24
/*
 * Per-character-set lookup functions. They are only declared here; their
 * definitions are not part of this file. lookup_marc8 hands each of them
 * the UTF-8 form of a character and treats a non-zero result as the MARC-8
 * code of that character.
 *
 * The hexadecimal suffix of a name is the ASCII code of the last character
 * of the escape sequence that lookup_marc8 pairs with the function
 * (62 = 'b' for ESC "b", 31 = '1' for ESC "$1", ...). The exception is the
 * 45 function, whose results lookup_marc8 reports with the ESC "(B" page.
 */
yaz_conv_func_t yaz_marc8r_42_conv;
yaz_conv_func_t yaz_marc8r_45_conv;
/* declared, but not called by lookup_marc8 */
yaz_conv_func_t yaz_marc8r_67_conv;
yaz_conv_func_t yaz_marc8r_62_conv;
yaz_conv_func_t yaz_marc8r_70_conv;
yaz_conv_func_t yaz_marc8r_32_conv;
yaz_conv_func_t yaz_marc8r_4E_conv;
yaz_conv_func_t yaz_marc8r_51_conv;
yaz_conv_func_t yaz_marc8r_33_conv;
yaz_conv_func_t yaz_marc8r_34_conv;
yaz_conv_func_t yaz_marc8r_53_conv;
yaz_conv_func_t yaz_marc8r_31_conv;

/* Escape character (octal 033); every page escape sequence in this file
   is written as ESC followed by a string literal. */
#define ESC "\033"
39
/** \brief private state of the MARC-8 encoder (kept in the encoder's data) */
struct encoder_data
{
    /** byte that flush_combos appends after the pending character;
        0 means none */
    unsigned int write_marc8_second_half_char;
    /** pending character that has been accepted but not yet written;
        0 means none */
    unsigned long write_marc8_last;
    /** non-zero if the pending character is to be written as "&#x....;" */
    int write_marc8_ncr;
    /** escape sequence of the page the pending character belongs to */
    const char *write_marc8_lpage;
    /** escape sequence most recently selected for G0 */
    const char *write_marc8_g0;
    /** escape sequence most recently selected for G1; 0 means none */
    const char *write_marc8_g1;
};
49
50 static void init_marc8(yaz_iconv_encoder_t w)
51 {
52     struct encoder_data *data = (struct encoder_data *) w->data;
53     data->write_marc8_second_half_char = 0;
54     data->write_marc8_last = 0;
55     data->write_marc8_ncr = 0;
56     data->write_marc8_lpage = 0;
57     data->write_marc8_g0 = ESC "(B";
58     data->write_marc8_g1 = 0;
59 }
60
/* Forward declaration: flush_combos calls this function before its
   definition further down in the file. */
static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
                                       struct encoder_data *w,
                                       char **outbuf, size_t *outbytesleft,
                                       const char *page_chr);
65
66 static unsigned long lookup_marc8(yaz_iconv_t cd,
67                                   unsigned long x, int *comb,
68                                   const char **page_chr)
69 {
70     char utf8_buf[7];
71     char *utf8_outbuf = utf8_buf;
72     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
73     int error_code;
74
75     if (x <= ' ')
76     {
77         *page_chr = ESC "(B";
78         return x;
79     }
80     r = yaz_write_UTF8_char(x, &utf8_outbuf, &utf8_outbytesleft, &error_code);
81     if (r == (size_t)(-1))
82     {
83         yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
84         return 0;
85     }
86     else
87     {
88         unsigned char *inp;
89         size_t inbytesleft, no_read_sub = 0;
90         unsigned long x;
91
92         *utf8_outbuf = '\0';
93         inp = (unsigned char *) utf8_buf;
94         inbytesleft = strlen(utf8_buf);
95
96         x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
97         if (x)
98         {
99             *page_chr = ESC "(B";
100             return x;
101         }
102         x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
103         if (x)
104         {
105             *page_chr = ESC "(B";
106             return x;
107         }
108         x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
109         if (x)
110         {
111             *page_chr = ESC "b";
112             return x;
113         }
114         x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
115         if (x)
116         {
117             *page_chr = ESC "p";
118             return x;
119         }
120         x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
121         if (x)
122         {
123             *page_chr = ESC "(2";
124             return x;
125         }
126         x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
127         if (x)
128         {
129             *page_chr = ESC "(N";
130             return x;
131         }
132         x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
133         if (x)
134         {
135             *page_chr = ESC "(Q";
136             return x;
137         }
138         x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
139         if (x)
140         {
141             *page_chr = ESC "(3";
142             return x;
143         }
144         x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
145         if (x)
146         {
147             *page_chr = ESC "(4";
148             return x;
149         }
150         x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
151         if (x)
152         {
153             *page_chr = ESC "(S";
154             return x;
155         }
156         x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
157         if (x)
158         {
159             *page_chr = ESC "$1";
160             return x;
161         }
162         yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
163         return x;
164     }
165 }
166
167 static size_t flush_combos(yaz_iconv_t cd,
168                            struct encoder_data *w,
169                            char **outbuf, size_t *outbytesleft)
170 {
171     unsigned long y = w->write_marc8_last;
172
173     if (!y)
174         return 0;
175
176     assert(w->write_marc8_lpage);
177     if (w->write_marc8_lpage)
178     {
179         size_t r = yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft,
180                                             w->write_marc8_lpage);
181         if (r)
182             return r;
183     }
184
185     if (9 >= *outbytesleft)
186     {
187         yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
188         return (size_t) (-1);
189     }
190     if (w->write_marc8_ncr)
191     {
192         yaz_snprintf(*outbuf, 9, "&#x%04x;", y);
193         (*outbytesleft) -= 8;
194         (*outbuf) += 8;
195     }
196     else
197     {
198         size_t out_no = 0;
199         unsigned char byte;
200
201         byte = (unsigned char )((y>>16) & 0xff);
202         if (byte)
203             (*outbuf)[out_no++] = byte;
204         byte = (unsigned char)((y>>8) & 0xff);
205         if (byte)
206             (*outbuf)[out_no++] = byte;
207         byte = (unsigned char )(y & 0xff);
208         if (byte)
209             (*outbuf)[out_no++] = byte;
210         *outbuf += out_no;
211         (*outbytesleft) -= out_no;
212     }
213
214     if (w->write_marc8_second_half_char)
215     {
216         *(*outbuf)++ = w->write_marc8_second_half_char;
217         (*outbytesleft)--;
218     }
219
220     w->write_marc8_last = 0;
221     w->write_marc8_ncr = 0;
222     w->write_marc8_lpage = 0;
223     w->write_marc8_second_half_char = 0;
224     return 0;
225 }
226
227 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
228                                        struct encoder_data *w,
229                                        char **outbuf, size_t *outbytesleft,
230                                        const char *page_chr)
231 {
232     const char **old_page_chr = &w->write_marc8_g0;
233
234     /* are we going to a G1-set (such as such as ESC ")!E") */
235     if (page_chr && page_chr[1] == ')')
236         old_page_chr = &w->write_marc8_g1;
237
238     if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
239     {
240         size_t plen = 0;
241         const char *page_out = page_chr;
242
243         if (*outbytesleft < 8)
244         {
245             yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
246
247             return (size_t) (-1);
248         }
249
250         if (*old_page_chr)
251         {
252             if (!strcmp(*old_page_chr, ESC "p")
253                 || !strcmp(*old_page_chr, ESC "g")
254                 || !strcmp(*old_page_chr, ESC "b"))
255             {
256                 page_out = ESC "s";
257                 /* Technique 1 leave */
258                 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
259                 {
260                     /* Must leave script + enter new page */
261                     plen = strlen(page_out);
262                     memcpy(*outbuf, page_out, plen);
263                     (*outbuf) += plen;
264                     (*outbytesleft) -= plen;
265                     page_out = ESC "(B";
266                 }
267             }
268         }
269         *old_page_chr = page_chr;
270         plen = strlen(page_out);
271         memcpy(*outbuf, page_out, plen);
272         (*outbuf) += plen;
273         (*outbytesleft) -= plen;
274     }
275     return 0;
276 }
277
278
279 static size_t yaz_write_marc8_2(yaz_iconv_t cd, struct encoder_data *w,
280                                 unsigned long x,
281                                 char **outbuf, size_t *outbytesleft,
282                                 int loss_mode)
283 {
284     int comb = 0;
285     int enable_ncr = 0;
286     const char *page_chr = 0;
287     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
288
289     if (!y)
290     {
291         if (loss_mode == 0)
292             return (size_t) (-1);
293         page_chr = ESC "(B";
294         if (loss_mode == 1)
295             y = '|';
296         else
297         {
298             y = x;
299             enable_ncr = 1;
300         }
301     }
302
303     if (comb)
304     {
305         if (page_chr)
306         {
307             size_t r = yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft,
308                                                 page_chr);
309             if (r)
310                 return r;
311         }
312         if (x == 0x0361)
313             w->write_marc8_second_half_char = 0xEC;
314         else if (x == 0x0360)
315             w->write_marc8_second_half_char = 0xFB;
316
317         if (*outbytesleft <= 1)
318         {
319             yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
320             return (size_t) (-1);
321         }
322         *(*outbuf)++ = y;
323         (*outbytesleft)--;
324     }
325     else
326     {
327         size_t r = flush_combos(cd, w, outbuf, outbytesleft);
328         if (r)
329             return r;
330
331         w->write_marc8_last = y;
332         w->write_marc8_lpage = page_chr;
333         w->write_marc8_ncr = enable_ncr;
334     }
335     return 0;
336 }
337
338 static size_t flush_marc8(yaz_iconv_t cd, yaz_iconv_encoder_t en,
339                            char **outbuf, size_t *outbytesleft)
340 {
341     struct encoder_data *w = (struct encoder_data *) en->data;
342     size_t r = flush_combos(cd, w, outbuf, outbytesleft);
343     if (r)
344         return r;
345     w->write_marc8_g1 = 0;
346     return yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft, ESC "(B");
347 }
348
349 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, struct encoder_data *w,
350                                       unsigned long x,
351                                       char **outbuf, size_t *outbytesleft,
352                                       int loss_mode)
353 {
354     unsigned long x1, x2;
355     if (yaz_iso_8859_1_lookup_y(x, &x1, &x2))
356     {
357         /* save the output pointers .. */
358         char *outbuf0 = *outbuf;
359         size_t outbytesleft0 = *outbytesleft;
360         int last_ch = w->write_marc8_last;
361         int ncr = w->write_marc8_ncr;
362         const char *lpage = w->write_marc8_lpage;
363         size_t r;
364
365         r = yaz_write_marc8_2(cd, w, x1,
366                               outbuf, outbytesleft, loss_mode);
367         if (r)
368             return r;
369         r = yaz_write_marc8_2(cd, w, x2,
370                               outbuf, outbytesleft, loss_mode);
371         if (r && yaz_iconv_error(cd) == YAZ_ICONV_E2BIG)
372         {
373             /* not enough room. reset output to original values */
374             *outbuf = outbuf0;
375             *outbytesleft = outbytesleft0;
376             w->write_marc8_last = last_ch;
377             w->write_marc8_ncr = ncr;
378             w->write_marc8_lpage = lpage;
379         }
380         return r;
381     }
382     return yaz_write_marc8_2(cd, w, x, outbuf, outbytesleft, loss_mode);
383 }
384
385 static size_t write_marc8_normal(yaz_iconv_t cd, yaz_iconv_encoder_t e,
386                                  unsigned long x,
387                                  char **outbuf, size_t *outbytesleft)
388 {
389     return yaz_write_marc8_generic(cd, (struct encoder_data *) e->data,
390                                    x, outbuf, outbytesleft, 0);
391 }
392
393 static size_t write_marc8_lossy(yaz_iconv_t cd, yaz_iconv_encoder_t e,
394                                 unsigned long x,
395                                 char **outbuf, size_t *outbytesleft)
396 {
397     return yaz_write_marc8_generic(cd, (struct encoder_data *) e->data,
398                                    x, outbuf, outbytesleft, 1);
399 }
400
401 static size_t write_marc8_lossless(yaz_iconv_t cd, yaz_iconv_encoder_t e,
402                                    unsigned long x,
403                                    char **outbuf, size_t *outbytesleft)
404 {
405     return yaz_write_marc8_generic(cd, (struct encoder_data *) e->data,
406                                    x, outbuf, outbytesleft, 2);
407 }
408
409 static void destroy_marc8(yaz_iconv_encoder_t e)
410 {
411     xfree(e->data);
412 }
413
414 yaz_iconv_encoder_t yaz_marc8_encoder(const char *tocode,
415                                       yaz_iconv_encoder_t e)
416
417 {
418     if (!yaz_matchstr(tocode, "MARC8"))
419         e->write_handle = write_marc8_normal;
420     else if (!yaz_matchstr(tocode, "MARC8s"))
421         e->write_handle = write_marc8_normal;
422     else if (!yaz_matchstr(tocode, "MARC8lossy"))
423         e->write_handle = write_marc8_lossy;
424     else if (!yaz_matchstr(tocode, "MARC8lossless"))
425         e->write_handle = write_marc8_lossless;
426     else
427         return 0;
428
429     {
430         struct encoder_data *data = (struct encoder_data *)
431             xmalloc(sizeof(*data));
432         e->data = data;
433         e->destroy_handle = destroy_marc8;
434         e->flush_handle = flush_marc8;
435         e->init_handle = init_marc8;
436     }
437     return e;
438 }
439
440
441 /*
442  * Local variables:
443  * c-basic-offset: 4
444  * c-file-style: "Stroustrup"
445  * indent-tabs-mode: nil
446  * End:
447  * vim: shiftwidth=4 tabstop=8 expandtab
448  */
449