7db2348e2c8f18ffecca0c86935e1480d18c2b3b
[yaz-moved-to-github.git] / src / iconv_encode_marc8.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2012 Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file
7  * \brief MARC-8 encoding
8  *
9  * MARC-8 reference:
10  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
11  */
12
13 #if HAVE_CONFIG_H
14 #include <config.h>
15 #endif
16
17 #include <assert.h>
18 #include <errno.h>
19 #include <string.h>
20
21 #include <yaz/xmalloc.h>
22 #include <yaz/snprintf.h>
23 #include "iconv-p.h"
24
/*
 * Per-character-set conversion routines, defined outside this file.
 * lookup_marc8() calls them with the UTF-8 bytes of one character and
 * treats a non-zero return value as the MARC-8 code for that character.
 * The hex suffix presumably is the final byte of the MARC-8 escape
 * sequence selecting the set (e.g. 42 = 'B', 4E = 'N') -- TODO confirm.
 * NOTE(review): yaz_marc8r_67_conv is declared but not called anywhere
 * in this file.
 */
yaz_conv_func_t yaz_marc8r_42_conv;
yaz_conv_func_t yaz_marc8r_45_conv;
yaz_conv_func_t yaz_marc8r_67_conv;
yaz_conv_func_t yaz_marc8r_62_conv;
yaz_conv_func_t yaz_marc8r_70_conv;
yaz_conv_func_t yaz_marc8r_32_conv;
yaz_conv_func_t yaz_marc8r_4E_conv;
yaz_conv_func_t yaz_marc8r_51_conv;
yaz_conv_func_t yaz_marc8r_33_conv;
yaz_conv_func_t yaz_marc8r_34_conv;
yaz_conv_func_t yaz_marc8r_53_conv;
yaz_conv_func_t yaz_marc8r_31_conv;

/* Escape character (octal 033) that starts every page-switch sequence */
#define ESC "\033"
39
/**
 * State kept by one MARC-8 encoder between write calls.
 *
 * Combining characters are written to the output as soon as they are
 * seen, whereas the most recent non-combining character is held back in
 * this structure until the next non-combining character (or a flush)
 * arrives.
 */
struct encoder_data
{
    /* extra byte to emit right after the held-back character (0 = none);
       set for U+0361 / U+0360 */
    unsigned int write_marc8_second_half_char;
    /* held-back character code; 0 means nothing is pending */
    unsigned long write_marc8_last;
    /* non-zero: emit the held-back character as "&#x....;" rather than
       as raw bytes */
    int write_marc8_ncr;
    /* page escape sequence required by the held-back character */
    const char *write_marc8_lpage;
    /* escape sequence of the page currently selected as G0 */
    const char *write_marc8_g0;
    /* escape sequence of the page currently selected as G1 (0 = none) */
    const char *write_marc8_g1;
};
49
50 static void init_marc8(yaz_iconv_encoder_t w)
51 {
52     struct encoder_data *data = (struct encoder_data *) w->data;
53     data->write_marc8_second_half_char = 0;
54     data->write_marc8_last = 0;
55     data->write_marc8_ncr = 0;
56     data->write_marc8_lpage = 0;
57     data->write_marc8_g0 = ESC "(B";
58     data->write_marc8_g1 = 0;
59 }
60
/* Forward declaration: flush_combos() below calls this function before
   its definition appears in the file. */
static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
                                       struct encoder_data *w,
                                       char **outbuf, size_t *outbytesleft,
                                       const char *page_chr);
65
66 static unsigned long lookup_marc8(yaz_iconv_t cd,
67                                   unsigned long x, int *comb,
68                                   const char **page_chr)
69 {
70     char utf8_buf[7];
71     char *utf8_outbuf = utf8_buf;
72     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
73     int error_code;
74
75     r = yaz_write_UTF8_char(x, &utf8_outbuf, &utf8_outbytesleft, &error_code);
76     if (r == (size_t)(-1))
77     {
78         yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
79         return 0;
80     }
81     else
82     {
83         unsigned char *inp;
84         size_t inbytesleft, no_read_sub = 0;
85         unsigned long x;
86
87         *utf8_outbuf = '\0';        
88         inp = (unsigned char *) utf8_buf;
89         inbytesleft = strlen(utf8_buf);
90
91         x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
92         if (x)
93         {
94             *page_chr = ESC "(B";
95             return x;
96         }
97         x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
98         if (x)
99         {
100             *page_chr = ESC "(B";
101             return x;
102         }
103         x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
104         if (x)
105         {
106             *page_chr = ESC "b";
107             return x;
108         }
109         x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
110         if (x)
111         {
112             *page_chr = ESC "p";
113             return x;
114         }
115         x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
116         if (x)
117         {
118             *page_chr = ESC "(2";
119             return x;
120         }
121         x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
122         if (x)
123         {
124             *page_chr = ESC "(N";
125             return x;
126         }
127         x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
128         if (x)
129         {
130             *page_chr = ESC "(Q";
131             return x;
132         }
133         x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
134         if (x)
135         {
136             *page_chr = ESC "(3";
137             return x;
138         }
139         x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
140         if (x)
141         {
142             *page_chr = ESC "(4";
143             return x;
144         }
145         x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
146         if (x)
147         {
148             *page_chr = ESC "(S";
149             return x;
150         }
151         x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
152         if (x)
153         {
154             *page_chr = ESC "$1";
155             return x;
156         }
157         yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
158         return x;
159     }
160 }
161
162 static size_t flush_combos(yaz_iconv_t cd,
163                            struct encoder_data *w,
164                            char **outbuf, size_t *outbytesleft)
165 {
166     unsigned long y = w->write_marc8_last;
167
168     if (!y)
169         return 0;
170
171     assert(w->write_marc8_lpage);
172     if (w->write_marc8_lpage)
173     {
174         size_t r = yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft,
175                                             w->write_marc8_lpage);
176         if (r)
177             return r;
178     }
179
180     if (9 >= *outbytesleft)
181     {
182         yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
183         return (size_t) (-1);
184     }
185     if (w->write_marc8_ncr)
186     {
187         yaz_snprintf(*outbuf, 9, "&#x%04x;", y);
188         (*outbytesleft) -= 8;
189         (*outbuf) += 8;
190     }
191     else
192     {
193         size_t out_no = 0;
194         unsigned char byte;
195
196         byte = (unsigned char )((y>>16) & 0xff);
197         if (byte)
198             (*outbuf)[out_no++] = byte;
199         byte = (unsigned char)((y>>8) & 0xff);
200         if (byte)
201             (*outbuf)[out_no++] = byte;
202         byte = (unsigned char )(y & 0xff);
203         if (byte)
204             (*outbuf)[out_no++] = byte;
205         *outbuf += out_no;
206         (*outbytesleft) -= out_no;
207     }
208
209     if (w->write_marc8_second_half_char)
210     {
211         *(*outbuf)++ = w->write_marc8_second_half_char;
212         (*outbytesleft)--;
213     }        
214
215     w->write_marc8_last = 0;
216     w->write_marc8_ncr = 0;
217     w->write_marc8_lpage = 0;
218     w->write_marc8_second_half_char = 0;
219     return 0;
220 }
221
222 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
223                                        struct encoder_data *w,
224                                        char **outbuf, size_t *outbytesleft,
225                                        const char *page_chr)
226 {
227     const char **old_page_chr = &w->write_marc8_g0;
228
229     /* are we going to a G1-set (such as such as ESC ")!E") */
230     if (page_chr && page_chr[1] == ')')
231         old_page_chr = &w->write_marc8_g1;
232
233     if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
234     {
235         size_t plen = 0;
236         const char *page_out = page_chr;
237         
238         if (*outbytesleft < 8)
239         {
240             yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
241             
242             return (size_t) (-1);
243         }
244
245         if (*old_page_chr)
246         {
247             if (!strcmp(*old_page_chr, ESC "p") 
248                 || !strcmp(*old_page_chr, ESC "g")
249                 || !strcmp(*old_page_chr, ESC "b"))
250             {
251                 page_out = ESC "s";
252                 /* Technique 1 leave */
253                 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
254                 {
255                     /* Must leave script + enter new page */
256                     plen = strlen(page_out);
257                     memcpy(*outbuf, page_out, plen);
258                     (*outbuf) += plen;
259                     (*outbytesleft) -= plen;
260                     page_out = ESC "(B";
261                 }
262             }
263         }
264         *old_page_chr = page_chr;
265         plen = strlen(page_out);
266         memcpy(*outbuf, page_out, plen);
267         (*outbuf) += plen;
268         (*outbytesleft) -= plen;
269     }
270     return 0;
271 }
272
273
274 static size_t yaz_write_marc8_2(yaz_iconv_t cd, struct encoder_data *w,
275                                 unsigned long x,
276                                 char **outbuf, size_t *outbytesleft,
277                                 int loss_mode)
278 {
279     int comb = 0;
280     int enable_ncr = 0;
281     const char *page_chr = 0;
282     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
283
284     if (!y)
285     {
286         if (loss_mode == 0)
287             return (size_t) (-1);
288         page_chr = ESC "(B";
289         if (loss_mode == 1)
290             y = '|';
291         else
292         {
293             y = x; 
294             enable_ncr = 1;
295         }
296     }
297
298     if (comb)
299     {
300         if (page_chr)
301         {
302             size_t r = yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft,
303                                                 page_chr);
304             if (r)
305                 return r;
306         }
307         if (x == 0x0361)
308             w->write_marc8_second_half_char = 0xEC;
309         else if (x == 0x0360)
310             w->write_marc8_second_half_char = 0xFB;
311
312         if (*outbytesleft <= 1)
313         {
314             yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
315             return (size_t) (-1);
316         }
317         *(*outbuf)++ = y;
318         (*outbytesleft)--;
319     }
320     else
321     {
322         size_t r = flush_combos(cd, w, outbuf, outbytesleft);
323         if (r)
324             return r;
325
326         w->write_marc8_last = y;
327         w->write_marc8_lpage = page_chr;
328         w->write_marc8_ncr = enable_ncr;
329     }
330     return 0;
331 }
332
333 static size_t flush_marc8(yaz_iconv_t cd, yaz_iconv_encoder_t en,
334                            char **outbuf, size_t *outbytesleft)
335 {
336     struct encoder_data *w = (struct encoder_data *) en->data;
337     size_t r = flush_combos(cd, w, outbuf, outbytesleft);
338     if (r)
339         return r;
340     w->write_marc8_g1 = 0;
341     return yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft, ESC "(B");
342 }
343
344 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, struct encoder_data *w,
345                                       unsigned long x,
346                                       char **outbuf, size_t *outbytesleft,
347                                       int loss_mode)
348 {
349     unsigned long x1, x2;
350     if (yaz_iso_8859_1_lookup_y(x, &x1, &x2))
351     {
352         /* save the output pointers .. */
353         char *outbuf0 = *outbuf;
354         size_t outbytesleft0 = *outbytesleft;
355         int last_ch = w->write_marc8_last;
356         int ncr = w->write_marc8_ncr;
357         const char *lpage = w->write_marc8_lpage;
358         size_t r;
359         
360         r = yaz_write_marc8_2(cd, w, x1,
361                               outbuf, outbytesleft, loss_mode);
362         if (r)
363             return r;
364         r = yaz_write_marc8_2(cd, w, x2,
365                               outbuf, outbytesleft, loss_mode);
366         if (r && yaz_iconv_error(cd) == YAZ_ICONV_E2BIG)
367         {
368             /* not enough room. reset output to original values */
369             *outbuf = outbuf0;
370             *outbytesleft = outbytesleft0;
371             w->write_marc8_last = last_ch;
372             w->write_marc8_ncr = ncr;
373             w->write_marc8_lpage = lpage;
374         }
375         return r;
376     }
377     return yaz_write_marc8_2(cd, w, x, outbuf, outbytesleft, loss_mode);
378 }
379
380 static size_t write_marc8_normal(yaz_iconv_t cd, yaz_iconv_encoder_t e,
381                                  unsigned long x,
382                                  char **outbuf, size_t *outbytesleft)
383 {
384     return yaz_write_marc8_generic(cd, (struct encoder_data *) e->data,
385                                    x, outbuf, outbytesleft, 0);
386 }
387
388 static size_t write_marc8_lossy(yaz_iconv_t cd, yaz_iconv_encoder_t e,
389                                 unsigned long x,
390                                 char **outbuf, size_t *outbytesleft)
391 {
392     return yaz_write_marc8_generic(cd, (struct encoder_data *) e->data,
393                                    x, outbuf, outbytesleft, 1);
394 }
395
396 static size_t write_marc8_lossless(yaz_iconv_t cd, yaz_iconv_encoder_t e,
397                                    unsigned long x,
398                                    char **outbuf, size_t *outbytesleft)
399 {
400     return yaz_write_marc8_generic(cd, (struct encoder_data *) e->data,
401                                    x, outbuf, outbytesleft, 2);
402 }
403
404 static void destroy_marc8(yaz_iconv_encoder_t e)
405 {
406     xfree(e->data);
407 }
408
409 yaz_iconv_encoder_t yaz_marc8_encoder(const char *tocode,
410                                       yaz_iconv_encoder_t e)
411     
412 {
413     if (!yaz_matchstr(tocode, "MARC8"))
414         e->write_handle = write_marc8_normal;
415     else if (!yaz_matchstr(tocode, "MARC8s"))
416         e->write_handle = write_marc8_normal;
417     else if (!yaz_matchstr(tocode, "MARC8lossy"))
418         e->write_handle = write_marc8_lossy;
419     else if (!yaz_matchstr(tocode, "MARC8lossless"))
420         e->write_handle = write_marc8_lossless;
421     else
422         return 0;
423
424     {
425         struct encoder_data *data = (struct encoder_data *)
426             xmalloc(sizeof(*data));
427         e->data = data;
428         e->destroy_handle = destroy_marc8;
429         e->flush_handle = flush_marc8;
430         e->init_handle = init_marc8;
431     }
432     return e;
433 }
434
435
436 /*
437  * Local variables:
438  * c-basic-offset: 4
439  * c-file-style: "Stroustrup"
440  * indent-tabs-mode: nil
441  * End:
442  * vim: shiftwidth=4 tabstop=8 expandtab
443  */
444