Factored character encoders out to separate files (iconv system).
[yaz-moved-to-github.git] / src / iconv_encode_marc8.c
1 /*
2  * Copyright (C) 1995-2008, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  */
6 /**
7  * \file
8  * \brief MARC-8 encoding
9  *
10  * MARC-8 reference:
11  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
12  */
13
14 #if HAVE_CONFIG_H
15 #include <config.h>
16 #endif
17
18 #include <assert.h>
19 #include <errno.h>
20 #include <string.h>
21 #include <ctype.h>
22
23 #include <yaz/xmalloc.h>
24 #include <yaz/nmem.h>
25 #include <yaz/snprintf.h>
26 #include "iconv-p.h"
27
28 yaz_conv_func_t yaz_marc8r_42_conv;
29 yaz_conv_func_t yaz_marc8r_45_conv;
30 yaz_conv_func_t yaz_marc8r_67_conv;
31 yaz_conv_func_t yaz_marc8r_62_conv;
32 yaz_conv_func_t yaz_marc8r_70_conv;
33 yaz_conv_func_t yaz_marc8r_32_conv;
34 yaz_conv_func_t yaz_marc8r_4E_conv;
35 yaz_conv_func_t yaz_marc8r_51_conv;
36 yaz_conv_func_t yaz_marc8r_33_conv;
37 yaz_conv_func_t yaz_marc8r_34_conv;
38 yaz_conv_func_t yaz_marc8r_53_conv;
39 yaz_conv_func_t yaz_marc8r_31_conv;
40
41 #define ESC "\033"
42
/* State that the MARC-8 writer keeps between calls. */
struct encoder_data
{
    /* byte written right after the pending character on flush; 0 = none */
    unsigned int write_marc8_second_half_char;
    /* pending character, held back until flush_combos emits it; 0 = none */
    unsigned long write_marc8_last;
    /* non-zero: the pending character is emitted as a "&#x....;" reference */
    int write_marc8_ncr;
    /* page escape sequence to select before emitting the pending character */
    const char *write_marc8_lpage;
    /* page escape sequence currently recorded for G0 */
    const char *write_marc8_g0;
    /* page escape sequence currently recorded for G1; 0 = none */
    const char *write_marc8_g1;
};
52
53 static void init_marc8(yaz_iconv_encoder_t w)
54 {
55     struct encoder_data *data = w->data;
56     data->write_marc8_second_half_char = 0;
57     data->write_marc8_last = 0;
58     data->write_marc8_ncr = 0;
59     data->write_marc8_lpage = 0;
60     data->write_marc8_g0 = ESC "(B";
61     data->write_marc8_g1 = 0;
62 }
63
64 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
65                                        struct encoder_data *w,
66                                        char **outbuf, size_t *outbytesleft,
67                                        const char *page_chr);
68
69 static unsigned long lookup_marc8(yaz_iconv_t cd,
70                                   unsigned long x, int *comb,
71                                   const char **page_chr)
72 {
73     char utf8_buf[7];
74     char *utf8_outbuf = utf8_buf;
75     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
76     int error_code;
77
78     r = yaz_write_UTF8_char(x, &utf8_outbuf, &utf8_outbytesleft, &error_code);
79     if (r == (size_t)(-1))
80     {
81         yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
82         return 0;
83     }
84     else
85     {
86         unsigned char *inp;
87         size_t inbytesleft, no_read_sub = 0;
88         unsigned long x;
89
90         *utf8_outbuf = '\0';        
91         inp = (unsigned char *) utf8_buf;
92         inbytesleft = strlen(utf8_buf);
93
94         x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
95         if (x)
96         {
97             *page_chr = ESC "(B";
98             return x;
99         }
100         x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
101         if (x)
102         {
103             *page_chr = ESC "(B";
104             return x;
105         }
106         x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
107         if (x)
108         {
109             *page_chr = ESC "b";
110             return x;
111         }
112         x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
113         if (x)
114         {
115             *page_chr = ESC "p";
116             return x;
117         }
118         x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
119         if (x)
120         {
121             *page_chr = ESC "(2";
122             return x;
123         }
124         x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
125         if (x)
126         {
127             *page_chr = ESC "(N";
128             return x;
129         }
130         x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
131         if (x)
132         {
133             *page_chr = ESC "(Q";
134             return x;
135         }
136         x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
137         if (x)
138         {
139             *page_chr = ESC "(3";
140             return x;
141         }
142         x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
143         if (x)
144         {
145             *page_chr = ESC "(4";
146             return x;
147         }
148         x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
149         if (x)
150         {
151             *page_chr = ESC "(S";
152             return x;
153         }
154         x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
155         if (x)
156         {
157             *page_chr = ESC "$1";
158             return x;
159         }
160         yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
161         return x;
162     }
163 }
164
165 static size_t flush_combos(yaz_iconv_t cd,
166                            struct encoder_data *w,
167                            char **outbuf, size_t *outbytesleft)
168 {
169     unsigned long y = w->write_marc8_last;
170
171     if (!y)
172         return 0;
173
174     assert(w->write_marc8_lpage);
175     if (w->write_marc8_lpage)
176     {
177         size_t r = yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft,
178                                             w->write_marc8_lpage);
179         if (r)
180             return r;
181     }
182
183     if (9 >= *outbytesleft)
184     {
185         yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
186         return (size_t) (-1);
187     }
188     if (w->write_marc8_ncr)
189     {
190         yaz_snprintf(*outbuf, 9, "&#x%04x;", y);
191         (*outbytesleft) -= 8;
192         (*outbuf) += 8;
193     }
194     else
195     {
196         size_t out_no = 0;
197         unsigned char byte;
198
199         byte = (unsigned char )((y>>16) & 0xff);
200         if (byte)
201             (*outbuf)[out_no++] = byte;
202         byte = (unsigned char)((y>>8) & 0xff);
203         if (byte)
204             (*outbuf)[out_no++] = byte;
205         byte = (unsigned char )(y & 0xff);
206         if (byte)
207             (*outbuf)[out_no++] = byte;
208         *outbuf += out_no;
209         (*outbytesleft) -= out_no;
210     }
211
212     if (w->write_marc8_second_half_char)
213     {
214         *(*outbuf)++ = w->write_marc8_second_half_char;
215         (*outbytesleft)--;
216     }        
217
218     w->write_marc8_last = 0;
219     w->write_marc8_ncr = 0;
220     w->write_marc8_lpage = 0;
221     w->write_marc8_second_half_char = 0;
222     return 0;
223 }
224
225 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
226                                        struct encoder_data *w,
227                                        char **outbuf, size_t *outbytesleft,
228                                        const char *page_chr)
229 {
230     const char **old_page_chr = &w->write_marc8_g0;
231
232     /* are we going to a G1-set (such as such as ESC ")!E") */
233     if (page_chr && page_chr[1] == ')')
234         old_page_chr = &w->write_marc8_g1;
235
236     if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
237     {
238         size_t plen = 0;
239         const char *page_out = page_chr;
240         
241         if (*outbytesleft < 8)
242         {
243             yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
244             
245             return (size_t) (-1);
246         }
247
248         if (*old_page_chr)
249         {
250             if (!strcmp(*old_page_chr, ESC "p") 
251                 || !strcmp(*old_page_chr, ESC "g")
252                 || !strcmp(*old_page_chr, ESC "b"))
253             {
254                 page_out = ESC "s";
255                 /* Technique 1 leave */
256                 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
257                 {
258                     /* Must leave script + enter new page */
259                     plen = strlen(page_out);
260                     memcpy(*outbuf, page_out, plen);
261                     (*outbuf) += plen;
262                     (*outbytesleft) -= plen;
263                     page_out = ESC "(B";
264                 }
265             }
266         }
267         *old_page_chr = page_chr;
268         plen = strlen(page_out);
269         memcpy(*outbuf, page_out, plen);
270         (*outbuf) += plen;
271         (*outbytesleft) -= plen;
272     }
273     return 0;
274 }
275
276
277 static size_t yaz_write_marc8_2(yaz_iconv_t cd, struct encoder_data *w,
278                                 unsigned long x,
279                                 char **outbuf, size_t *outbytesleft,
280                                 int loss_mode)
281 {
282     int comb = 0;
283     int enable_ncr = 0;
284     const char *page_chr = 0;
285     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
286
287     if (!y)
288     {
289         if (loss_mode == 0)
290             return (size_t) (-1);
291         page_chr = ESC "(B";
292         if (loss_mode == 1)
293             y = '|';
294         else
295         {
296             y = x; 
297             enable_ncr = 1;
298         }
299     }
300
301     if (comb)
302     {
303         if (page_chr)
304         {
305             size_t r = yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft,
306                                                 page_chr);
307             if (r)
308                 return r;
309         }
310         if (x == 0x0361)
311             w->write_marc8_second_half_char = 0xEC;
312         else if (x == 0x0360)
313             w->write_marc8_second_half_char = 0xFB;
314
315         if (*outbytesleft <= 1)
316         {
317             yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
318             return (size_t) (-1);
319         }
320         *(*outbuf)++ = y;
321         (*outbytesleft)--;
322     }
323     else
324     {
325         size_t r = flush_combos(cd, w, outbuf, outbytesleft);
326         if (r)
327             return r;
328
329         w->write_marc8_last = y;
330         w->write_marc8_lpage = page_chr;
331         w->write_marc8_ncr = enable_ncr;
332     }
333     return 0;
334 }
335
336 static size_t flush_marc8(yaz_iconv_t cd, yaz_iconv_encoder_t en,
337                            char **outbuf, size_t *outbytesleft)
338 {
339     struct encoder_data *w = en->data;
340     size_t r = flush_combos(cd, w, outbuf, outbytesleft);
341     if (r)
342         return r;
343     w->write_marc8_g1 = 0;
344     return yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft, ESC "(B");
345 }
346
347 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, struct encoder_data *w,
348                                       unsigned long x,
349                                       char **outbuf, size_t *outbytesleft,
350                                       int loss_mode)
351 {
352     unsigned long x1, x2;
353     if (yaz_iso_8859_1_lookup_y(x, &x1, &x2))
354     {
355         /* save the output pointers .. */
356         char *outbuf0 = *outbuf;
357         size_t outbytesleft0 = *outbytesleft;
358         int last_ch = w->write_marc8_last;
359         int ncr = w->write_marc8_ncr;
360         const char *lpage = w->write_marc8_lpage;
361         size_t r;
362         
363         r = yaz_write_marc8_2(cd, w, x1,
364                               outbuf, outbytesleft, loss_mode);
365         if (r)
366             return r;
367         r = yaz_write_marc8_2(cd, w, x2,
368                               outbuf, outbytesleft, loss_mode);
369         if (r && yaz_iconv_error(cd) == YAZ_ICONV_E2BIG)
370         {
371             /* not enough room. reset output to original values */
372             *outbuf = outbuf0;
373             *outbytesleft = outbytesleft0;
374             w->write_marc8_last = last_ch;
375             w->write_marc8_ncr = ncr;
376             w->write_marc8_lpage = lpage;
377         }
378         return r;
379     }
380     return yaz_write_marc8_2(cd, w, x, outbuf, outbytesleft, loss_mode);
381 }
382
383 static size_t write_marc8_normal(yaz_iconv_t cd, yaz_iconv_encoder_t e,
384                                  unsigned long x,
385                                  char **outbuf, size_t *outbytesleft)
386 {
387     return yaz_write_marc8_generic(cd, e->data, x, outbuf, outbytesleft, 0);
388 }
389
390 static size_t write_marc8_lossy(yaz_iconv_t cd, yaz_iconv_encoder_t e,
391                                 unsigned long x,
392                                 char **outbuf, size_t *outbytesleft)
393 {
394     return yaz_write_marc8_generic(cd, e->data, x, outbuf, outbytesleft, 1);
395 }
396
397 static size_t write_marc8_lossless(yaz_iconv_t cd, yaz_iconv_encoder_t e,
398                                    unsigned long x,
399                                    char **outbuf, size_t *outbytesleft)
400 {
401     return yaz_write_marc8_generic(cd, e->data, x, outbuf, outbytesleft, 2);
402 }
403
404 static void destroy_marc8(yaz_iconv_encoder_t e)
405 {
406     xfree(e->data);
407 }
408
409 yaz_iconv_encoder_t yaz_marc8_encoder(const char *tocode,
410                                       yaz_iconv_encoder_t e)
411     
412 {
413     if (!yaz_matchstr(tocode, "MARC8"))
414         e->write_handle = write_marc8_normal;
415     else if (!yaz_matchstr(tocode, "MARC8s"))
416         e->write_handle = write_marc8_normal;
417     else if (!yaz_matchstr(tocode, "MARC8lossy"))
418         e->write_handle = write_marc8_lossy;
419     else if (!yaz_matchstr(tocode, "MARC8lossless"))
420         e->write_handle = write_marc8_lossless;
421     else
422         return 0;
423
424     {
425         struct encoder_data *data = xmalloc(sizeof(*data));
426         e->data = data;
427         e->destroy_handle = destroy_marc8;
428         e->flush_handle = flush_marc8;
429         e->init_handle = init_marc8;
430     }
431     return e;
432 }
433
434
435 /*
436  * Local variables:
437  * c-basic-offset: 4
438  * indent-tabs-mode: nil
439  * End:
440  * vim: shiftwidth=4 tabstop=8 expandtab
441  */