Merge branch 'master' into yaz-744
[yaz-moved-to-github.git] / src / iconv_encode_marc8.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file
7  * \brief MARC-8 encoding
8  *
9  * MARC-8 reference:
10  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
11  */
12
13 #if HAVE_CONFIG_H
14 #include <config.h>
15 #endif
16
17 #include <assert.h>
18 #include <errno.h>
19 #include <string.h>
20
21 #include <yaz/xmalloc.h>
22 #include <yaz/snprintf.h>
23 #include "iconv-p.h"
24
/* Conversion routines used by lookup_marc8(). They are only declared in
 * this file.
 * NOTE(review): the hexadecimal suffix looks like the final byte of the
 * escape sequence of the corresponding set (0x42 = 'B', 0x4E = 'N', ...)
 * - confirm against the table sources.
 * NOTE(review): yaz_marc8r_67_conv is declared but not called by
 * lookup_marc8(). */
yaz_conv_func_t yaz_marc8r_42_conv;
yaz_conv_func_t yaz_marc8r_45_conv;
yaz_conv_func_t yaz_marc8r_67_conv;
yaz_conv_func_t yaz_marc8r_62_conv;
yaz_conv_func_t yaz_marc8r_70_conv;
yaz_conv_func_t yaz_marc8r_32_conv;
yaz_conv_func_t yaz_marc8r_4E_conv;
yaz_conv_func_t yaz_marc8r_51_conv;
yaz_conv_func_t yaz_marc8r_33_conv;
yaz_conv_func_t yaz_marc8r_34_conv;
yaz_conv_func_t yaz_marc8r_53_conv;
yaz_conv_func_t yaz_marc8r_31_conv;
37
38 #define ESC "\033"
39
/** \brief per-encoder state of the MARC-8 writer
 *
 * The writer keeps one base character pending (write_marc8_last) and only
 * emits it from flush_combos(), which runs when the next non-combining
 * character arrives or when the encoder is flushed.
 */
struct encoder_data
{
    /* byte written right after the pending character by flush_combos();
       set to 0xEC / 0xFB when U+0361 / U+0360 is seen, 0 when unused */
    unsigned write_marc8_second_half_char;
    /* pending character that has not been written yet; 0 means none */
    unsigned long write_marc8_last;
    /* non-zero if the pending character is to be written as "&#x....;" */
    int write_marc8_ncr;
    /* escape sequence of the page the pending character belongs to */
    const char *write_marc8_lpage;
    /* escape sequence last recorded for G0 by yaz_write_marc8_page_chr() */
    const char *write_marc8_g0;
    /* escape sequence last recorded for G1; 0 if none */
    const char *write_marc8_g1;
};
49
50 static void init_marc8(yaz_iconv_encoder_t w)
51 {
52     struct encoder_data *data = (struct encoder_data *) w->data;
53     data->write_marc8_second_half_char = 0;
54     data->write_marc8_last = 0;
55     data->write_marc8_ncr = 0;
56     data->write_marc8_lpage = 0;
57     data->write_marc8_g0 = ESC "(B";
58     data->write_marc8_g1 = 0;
59 }
60
/* Forward declaration: flush_combos() calls this function before its
   definition appears further down in the file. */
static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
                                       struct encoder_data *w,
                                       char **outbuf, size_t *outbytesleft,
                                       const char *page_chr);
65
66 static unsigned long lookup_marc8(yaz_iconv_t cd,
67                                   unsigned long x, int *comb,
68                                   const char **page_chr)
69 {
70     char utf8_buf[7];
71     char *utf8_outbuf = utf8_buf;
72     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
73     int error_code;
74
75     r = yaz_write_UTF8_char(x, &utf8_outbuf, &utf8_outbytesleft, &error_code);
76     if (r == (size_t)(-1))
77     {
78         yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
79         return 0;
80     }
81     else
82     {
83         unsigned char *inp;
84         size_t inbytesleft, no_read_sub = 0;
85         unsigned long x;
86
87         *utf8_outbuf = '\0';
88         inp = (unsigned char *) utf8_buf;
89         inbytesleft = strlen(utf8_buf);
90
91         x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
92         if (x)
93         {
94             *page_chr = ESC "(B";
95             return x;
96         }
97         x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
98         if (x)
99         {
100             *page_chr = ESC "(B";
101             return x;
102         }
103         x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
104         if (x)
105         {
106             *page_chr = ESC "b";
107             return x;
108         }
109         x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
110         if (x)
111         {
112             *page_chr = ESC "p";
113             return x;
114         }
115         x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
116         if (x)
117         {
118             *page_chr = ESC "(2";
119             return x;
120         }
121         x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
122         if (x)
123         {
124             *page_chr = ESC "(N";
125             return x;
126         }
127         x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
128         if (x)
129         {
130             *page_chr = ESC "(Q";
131             return x;
132         }
133         x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
134         if (x)
135         {
136             *page_chr = ESC "(3";
137             return x;
138         }
139         x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
140         if (x)
141         {
142             *page_chr = ESC "(4";
143             return x;
144         }
145         x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
146         if (x)
147         {
148             *page_chr = ESC "(S";
149             return x;
150         }
151         x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
152         if (x)
153         {
154             *page_chr = ESC "$1";
155             return x;
156         }
157         yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
158         return x;
159     }
160 }
161
162 static size_t flush_combos(yaz_iconv_t cd,
163                            struct encoder_data *w,
164                            char **outbuf, size_t *outbytesleft)
165 {
166     unsigned long y = w->write_marc8_last;
167
168     if (!y)
169         return 0;
170
171     assert(w->write_marc8_lpage);
172     if (w->write_marc8_lpage)
173     {
174         size_t r = yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft,
175                                             w->write_marc8_lpage);
176         if (r)
177             return r;
178     }
179
180     if (9 >= *outbytesleft)
181     {
182         yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
183         return (size_t) (-1);
184     }
185     if (w->write_marc8_ncr)
186     {
187         yaz_snprintf(*outbuf, 9, "&#x%04x;", y);
188         (*outbytesleft) -= 8;
189         (*outbuf) += 8;
190     }
191     else
192     {
193         size_t out_no = 0;
194         unsigned char byte;
195
196         byte = (unsigned char )((y>>16) & 0xff);
197         if (byte)
198             (*outbuf)[out_no++] = byte;
199         byte = (unsigned char)((y>>8) & 0xff);
200         if (byte)
201             (*outbuf)[out_no++] = byte;
202         byte = (unsigned char )(y & 0xff);
203         if (byte)
204             (*outbuf)[out_no++] = byte;
205         *outbuf += out_no;
206         (*outbytesleft) -= out_no;
207     }
208
209     if (w->write_marc8_second_half_char)
210     {
211         *(*outbuf)++ = w->write_marc8_second_half_char;
212         (*outbytesleft)--;
213     }
214
215     w->write_marc8_last = 0;
216     w->write_marc8_ncr = 0;
217     w->write_marc8_lpage = 0;
218     w->write_marc8_second_half_char = 0;
219     return 0;
220 }
221
222 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
223                                        struct encoder_data *w,
224                                        char **outbuf, size_t *outbytesleft,
225                                        const char *page_chr)
226 {
227     const char **old_page_chr = &w->write_marc8_g0;
228
229     /* are we going to a G1-set (such as such as ESC ")!E") */
230     if (page_chr && page_chr[1] == ')')
231         old_page_chr = &w->write_marc8_g1;
232
233     if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
234     {
235         size_t plen = 0;
236         const char *page_out = page_chr;
237
238         if (*outbytesleft < 8)
239         {
240             yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
241
242             return (size_t) (-1);
243         }
244
245         if (*old_page_chr)
246         {
247             if (!strcmp(*old_page_chr, ESC "p")
248                 || !strcmp(*old_page_chr, ESC "g")
249                 || !strcmp(*old_page_chr, ESC "b"))
250             {
251                 page_out = ESC "s";
252                 /* Technique 1 leave */
253                 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
254                 {
255                     /* Must leave script + enter new page */
256                     plen = strlen(page_out);
257                     memcpy(*outbuf, page_out, plen);
258                     (*outbuf) += plen;
259                     (*outbytesleft) -= plen;
260                     page_out = ESC "(B";
261                 }
262             }
263         }
264         *old_page_chr = page_chr;
265         plen = strlen(page_out);
266         memcpy(*outbuf, page_out, plen);
267         (*outbuf) += plen;
268         (*outbytesleft) -= plen;
269     }
270     return 0;
271 }
272
273
274 static size_t yaz_write_marc8_2(yaz_iconv_t cd, struct encoder_data *w,
275                                 unsigned long x,
276                                 char **outbuf, size_t *outbytesleft,
277                                 int loss_mode)
278 {
279     int comb = 0;
280     int enable_ncr = 0;
281     const char *page_chr = 0;
282     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
283
284     if (!y)
285     {
286         page_chr = ESC "(B";
287         switch (loss_mode)
288         {
289         case 0:
290             return (size_t) (-1);
291         case 1:
292             y = '|';
293             break;
294         case 2:
295             y = x;
296             enable_ncr = 1;
297             break;
298         case 3:
299             if (x < 32 && x != 27)
300                 y = x;
301             else
302                 return (size_t) (-1);
303         }
304     }
305
306     if (comb)
307     {
308         if (page_chr)
309         {
310             size_t r = yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft,
311                                                 page_chr);
312             if (r)
313                 return r;
314         }
315         if (x == 0x0361)
316             w->write_marc8_second_half_char = 0xEC;
317         else if (x == 0x0360)
318             w->write_marc8_second_half_char = 0xFB;
319
320         if (*outbytesleft <= 1)
321         {
322             yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
323             return (size_t) (-1);
324         }
325         *(*outbuf)++ = y;
326         (*outbytesleft)--;
327     }
328     else
329     {
330         size_t r = flush_combos(cd, w, outbuf, outbytesleft);
331         if (r)
332             return r;
333
334         w->write_marc8_last = y;
335         w->write_marc8_lpage = page_chr;
336         w->write_marc8_ncr = enable_ncr;
337     }
338     return 0;
339 }
340
341 static size_t flush_marc8(yaz_iconv_t cd, yaz_iconv_encoder_t en,
342                            char **outbuf, size_t *outbytesleft)
343 {
344     struct encoder_data *w = (struct encoder_data *) en->data;
345     size_t r = flush_combos(cd, w, outbuf, outbytesleft);
346     if (r)
347         return r;
348     w->write_marc8_g1 = 0;
349     return yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft, ESC "(B");
350 }
351
352 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, struct encoder_data *w,
353                                       unsigned long x,
354                                       char **outbuf, size_t *outbytesleft,
355                                       int loss_mode)
356 {
357     unsigned long x1, x2;
358     if (yaz_iso_8859_1_lookup_y(x, &x1, &x2))
359     {
360         /* save the output pointers .. */
361         char *outbuf0 = *outbuf;
362         size_t outbytesleft0 = *outbytesleft;
363         int last_ch = w->write_marc8_last;
364         int ncr = w->write_marc8_ncr;
365         const char *lpage = w->write_marc8_lpage;
366         size_t r;
367
368         r = yaz_write_marc8_2(cd, w, x1,
369                               outbuf, outbytesleft, loss_mode);
370         if (r)
371             return r;
372         r = yaz_write_marc8_2(cd, w, x2,
373                               outbuf, outbytesleft, loss_mode);
374         if (r && yaz_iconv_error(cd) == YAZ_ICONV_E2BIG)
375         {
376             /* not enough room. reset output to original values */
377             *outbuf = outbuf0;
378             *outbytesleft = outbytesleft0;
379             w->write_marc8_last = last_ch;
380             w->write_marc8_ncr = ncr;
381             w->write_marc8_lpage = lpage;
382         }
383         return r;
384     }
385     return yaz_write_marc8_2(cd, w, x, outbuf, outbytesleft, loss_mode);
386 }
387
388 static size_t write_marc8_normal(yaz_iconv_t cd, yaz_iconv_encoder_t e,
389                                  unsigned long x,
390                                  char **outbuf, size_t *outbytesleft)
391 {
392     return yaz_write_marc8_generic(cd, (struct encoder_data *) e->data,
393                                    x, outbuf, outbytesleft, 0);
394 }
395
396 static size_t write_marc8_lossy(yaz_iconv_t cd, yaz_iconv_encoder_t e,
397                                 unsigned long x,
398                                 char **outbuf, size_t *outbytesleft)
399 {
400     return yaz_write_marc8_generic(cd, (struct encoder_data *) e->data,
401                                    x, outbuf, outbytesleft, 1);
402 }
403
404 static size_t write_marc8_lossless(yaz_iconv_t cd, yaz_iconv_encoder_t e,
405                                    unsigned long x,
406                                    char **outbuf, size_t *outbytesleft)
407 {
408     return yaz_write_marc8_generic(cd, (struct encoder_data *) e->data,
409                                    x, outbuf, outbytesleft, 2);
410 }
411
412 static size_t write_marc8_control(yaz_iconv_t cd, yaz_iconv_encoder_t e,
413                                    unsigned long x,
414                                    char **outbuf, size_t *outbytesleft)
415 {
416     return yaz_write_marc8_generic(cd, (struct encoder_data *) e->data,
417                                    x, outbuf, outbytesleft, 3);
418 }
419
420 static void destroy_marc8(yaz_iconv_encoder_t e)
421 {
422     xfree(e->data);
423 }
424
425 yaz_iconv_encoder_t yaz_marc8_encoder(const char *tocode,
426                                       yaz_iconv_encoder_t e)
427
428 {
429     if (!yaz_matchstr(tocode, "MARC8"))
430         e->write_handle = write_marc8_normal;
431     else if (!yaz_matchstr(tocode, "MARC8s"))
432         e->write_handle = write_marc8_normal;
433     else if (!yaz_matchstr(tocode, "MARC8lossy"))
434         e->write_handle = write_marc8_lossy;
435     else if (!yaz_matchstr(tocode, "MARC8lossless"))
436         e->write_handle = write_marc8_lossless;
437     else if (!yaz_matchstr(tocode, "MARC8c"))
438         e->write_handle = write_marc8_control;
439     else
440         return 0;
441
442     {
443         struct encoder_data *data = (struct encoder_data *)
444             xmalloc(sizeof(*data));
445         e->data = data;
446         e->destroy_handle = destroy_marc8;
447         e->flush_handle = flush_marc8;
448         e->init_handle = init_marc8;
449     }
450     return e;
451 }
452
453
454 /*
455  * Local variables:
456  * c-basic-offset: 4
457  * c-file-style: "Stroustrup"
458  * indent-tabs-mode: nil
459  * End:
460  * vim: shiftwidth=4 tabstop=8 expandtab
461  */
462