Update source headers for 2008. Omit CVS ID keyword subst.
[yaz-moved-to-github.git] / src / iconv_encode_marc8.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2008 Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file
7  * \brief MARC-8 encoding
8  *
9  * MARC-8 reference:
10  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
11  */
12
13 #if HAVE_CONFIG_H
14 #include <config.h>
15 #endif
16
17 #include <assert.h>
18 #include <errno.h>
19 #include <string.h>
20 #include <ctype.h>
21
22 #include <yaz/xmalloc.h>
23 #include <yaz/nmem.h>
24 #include <yaz/snprintf.h>
25 #include "iconv-p.h"
26
/* Conversion functions defined outside this file, all declared with the
 * function type yaz_conv_func_t.  lookup_marc8() below feeds each of them
 * (except yaz_marc8r_67_conv, which it does not call) one UTF-8 encoded
 * character and treats a non-zero return value as the MARC-8 code.
 * NOTE(review): the hex suffix presumably identifies the MARC-8 character
 * set the table belongs to - confirm against where these are defined. */
yaz_conv_func_t yaz_marc8r_42_conv;
yaz_conv_func_t yaz_marc8r_45_conv;
yaz_conv_func_t yaz_marc8r_67_conv;
yaz_conv_func_t yaz_marc8r_62_conv;
yaz_conv_func_t yaz_marc8r_70_conv;
yaz_conv_func_t yaz_marc8r_32_conv;
yaz_conv_func_t yaz_marc8r_4E_conv;
yaz_conv_func_t yaz_marc8r_51_conv;
yaz_conv_func_t yaz_marc8r_33_conv;
yaz_conv_func_t yaz_marc8r_34_conv;
yaz_conv_func_t yaz_marc8r_53_conv;
yaz_conv_func_t yaz_marc8r_31_conv;
39
/* Escape character (octal 033); prefix of every page-switch sequence
 * used in this file, e.g. ESC "(B". */
#define ESC "\033"
41
/* Per-encoder state for the MARC-8 writer. */
struct encoder_data
{
    /* byte emitted right after the pending character when it is flushed;
       set to 0xEC / 0xFB for input 0x0361 / 0x0360, 0 when unused */
    unsigned write_marc8_second_half_char;
    /* pending non-combining character not yet written (0 = none) */
    unsigned long write_marc8_last;
    /* non-zero: write_marc8_last is emitted as "&#x....;" rather than as
       raw bytes */
    int write_marc8_ncr;
    /* page escape sequence that goes with write_marc8_last */
    const char *write_marc8_lpage;
    /* escape sequence last recorded for the G0 set (starts as ESC "(B") */
    const char *write_marc8_g0;
    /* escape sequence last recorded for the G1 set (0 = none) */
    const char *write_marc8_g1;
};
51
52 static void init_marc8(yaz_iconv_encoder_t w)
53 {
54     struct encoder_data *data = w->data;
55     data->write_marc8_second_half_char = 0;
56     data->write_marc8_last = 0;
57     data->write_marc8_ncr = 0;
58     data->write_marc8_lpage = 0;
59     data->write_marc8_g0 = ESC "(B";
60     data->write_marc8_g1 = 0;
61 }
62
/* Forward declaration: flush_combos() calls this before its definition
 * further down in the file. */
static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
                                       struct encoder_data *w,
                                       char **outbuf, size_t *outbytesleft,
                                       const char *page_chr);
67
68 static unsigned long lookup_marc8(yaz_iconv_t cd,
69                                   unsigned long x, int *comb,
70                                   const char **page_chr)
71 {
72     char utf8_buf[7];
73     char *utf8_outbuf = utf8_buf;
74     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
75     int error_code;
76
77     r = yaz_write_UTF8_char(x, &utf8_outbuf, &utf8_outbytesleft, &error_code);
78     if (r == (size_t)(-1))
79     {
80         yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
81         return 0;
82     }
83     else
84     {
85         unsigned char *inp;
86         size_t inbytesleft, no_read_sub = 0;
87         unsigned long x;
88
89         *utf8_outbuf = '\0';        
90         inp = (unsigned char *) utf8_buf;
91         inbytesleft = strlen(utf8_buf);
92
93         x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
94         if (x)
95         {
96             *page_chr = ESC "(B";
97             return x;
98         }
99         x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
100         if (x)
101         {
102             *page_chr = ESC "(B";
103             return x;
104         }
105         x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
106         if (x)
107         {
108             *page_chr = ESC "b";
109             return x;
110         }
111         x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
112         if (x)
113         {
114             *page_chr = ESC "p";
115             return x;
116         }
117         x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
118         if (x)
119         {
120             *page_chr = ESC "(2";
121             return x;
122         }
123         x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
124         if (x)
125         {
126             *page_chr = ESC "(N";
127             return x;
128         }
129         x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
130         if (x)
131         {
132             *page_chr = ESC "(Q";
133             return x;
134         }
135         x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
136         if (x)
137         {
138             *page_chr = ESC "(3";
139             return x;
140         }
141         x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
142         if (x)
143         {
144             *page_chr = ESC "(4";
145             return x;
146         }
147         x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
148         if (x)
149         {
150             *page_chr = ESC "(S";
151             return x;
152         }
153         x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
154         if (x)
155         {
156             *page_chr = ESC "$1";
157             return x;
158         }
159         yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
160         return x;
161     }
162 }
163
164 static size_t flush_combos(yaz_iconv_t cd,
165                            struct encoder_data *w,
166                            char **outbuf, size_t *outbytesleft)
167 {
168     unsigned long y = w->write_marc8_last;
169
170     if (!y)
171         return 0;
172
173     assert(w->write_marc8_lpage);
174     if (w->write_marc8_lpage)
175     {
176         size_t r = yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft,
177                                             w->write_marc8_lpage);
178         if (r)
179             return r;
180     }
181
182     if (9 >= *outbytesleft)
183     {
184         yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
185         return (size_t) (-1);
186     }
187     if (w->write_marc8_ncr)
188     {
189         yaz_snprintf(*outbuf, 9, "&#x%04x;", y);
190         (*outbytesleft) -= 8;
191         (*outbuf) += 8;
192     }
193     else
194     {
195         size_t out_no = 0;
196         unsigned char byte;
197
198         byte = (unsigned char )((y>>16) & 0xff);
199         if (byte)
200             (*outbuf)[out_no++] = byte;
201         byte = (unsigned char)((y>>8) & 0xff);
202         if (byte)
203             (*outbuf)[out_no++] = byte;
204         byte = (unsigned char )(y & 0xff);
205         if (byte)
206             (*outbuf)[out_no++] = byte;
207         *outbuf += out_no;
208         (*outbytesleft) -= out_no;
209     }
210
211     if (w->write_marc8_second_half_char)
212     {
213         *(*outbuf)++ = w->write_marc8_second_half_char;
214         (*outbytesleft)--;
215     }        
216
217     w->write_marc8_last = 0;
218     w->write_marc8_ncr = 0;
219     w->write_marc8_lpage = 0;
220     w->write_marc8_second_half_char = 0;
221     return 0;
222 }
223
224 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
225                                        struct encoder_data *w,
226                                        char **outbuf, size_t *outbytesleft,
227                                        const char *page_chr)
228 {
229     const char **old_page_chr = &w->write_marc8_g0;
230
231     /* are we going to a G1-set (such as such as ESC ")!E") */
232     if (page_chr && page_chr[1] == ')')
233         old_page_chr = &w->write_marc8_g1;
234
235     if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
236     {
237         size_t plen = 0;
238         const char *page_out = page_chr;
239         
240         if (*outbytesleft < 8)
241         {
242             yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
243             
244             return (size_t) (-1);
245         }
246
247         if (*old_page_chr)
248         {
249             if (!strcmp(*old_page_chr, ESC "p") 
250                 || !strcmp(*old_page_chr, ESC "g")
251                 || !strcmp(*old_page_chr, ESC "b"))
252             {
253                 page_out = ESC "s";
254                 /* Technique 1 leave */
255                 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
256                 {
257                     /* Must leave script + enter new page */
258                     plen = strlen(page_out);
259                     memcpy(*outbuf, page_out, plen);
260                     (*outbuf) += plen;
261                     (*outbytesleft) -= plen;
262                     page_out = ESC "(B";
263                 }
264             }
265         }
266         *old_page_chr = page_chr;
267         plen = strlen(page_out);
268         memcpy(*outbuf, page_out, plen);
269         (*outbuf) += plen;
270         (*outbytesleft) -= plen;
271     }
272     return 0;
273 }
274
275
276 static size_t yaz_write_marc8_2(yaz_iconv_t cd, struct encoder_data *w,
277                                 unsigned long x,
278                                 char **outbuf, size_t *outbytesleft,
279                                 int loss_mode)
280 {
281     int comb = 0;
282     int enable_ncr = 0;
283     const char *page_chr = 0;
284     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
285
286     if (!y)
287     {
288         if (loss_mode == 0)
289             return (size_t) (-1);
290         page_chr = ESC "(B";
291         if (loss_mode == 1)
292             y = '|';
293         else
294         {
295             y = x; 
296             enable_ncr = 1;
297         }
298     }
299
300     if (comb)
301     {
302         if (page_chr)
303         {
304             size_t r = yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft,
305                                                 page_chr);
306             if (r)
307                 return r;
308         }
309         if (x == 0x0361)
310             w->write_marc8_second_half_char = 0xEC;
311         else if (x == 0x0360)
312             w->write_marc8_second_half_char = 0xFB;
313
314         if (*outbytesleft <= 1)
315         {
316             yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
317             return (size_t) (-1);
318         }
319         *(*outbuf)++ = y;
320         (*outbytesleft)--;
321     }
322     else
323     {
324         size_t r = flush_combos(cd, w, outbuf, outbytesleft);
325         if (r)
326             return r;
327
328         w->write_marc8_last = y;
329         w->write_marc8_lpage = page_chr;
330         w->write_marc8_ncr = enable_ncr;
331     }
332     return 0;
333 }
334
335 static size_t flush_marc8(yaz_iconv_t cd, yaz_iconv_encoder_t en,
336                            char **outbuf, size_t *outbytesleft)
337 {
338     struct encoder_data *w = en->data;
339     size_t r = flush_combos(cd, w, outbuf, outbytesleft);
340     if (r)
341         return r;
342     w->write_marc8_g1 = 0;
343     return yaz_write_marc8_page_chr(cd, w, outbuf, outbytesleft, ESC "(B");
344 }
345
346 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, struct encoder_data *w,
347                                       unsigned long x,
348                                       char **outbuf, size_t *outbytesleft,
349                                       int loss_mode)
350 {
351     unsigned long x1, x2;
352     if (yaz_iso_8859_1_lookup_y(x, &x1, &x2))
353     {
354         /* save the output pointers .. */
355         char *outbuf0 = *outbuf;
356         size_t outbytesleft0 = *outbytesleft;
357         int last_ch = w->write_marc8_last;
358         int ncr = w->write_marc8_ncr;
359         const char *lpage = w->write_marc8_lpage;
360         size_t r;
361         
362         r = yaz_write_marc8_2(cd, w, x1,
363                               outbuf, outbytesleft, loss_mode);
364         if (r)
365             return r;
366         r = yaz_write_marc8_2(cd, w, x2,
367                               outbuf, outbytesleft, loss_mode);
368         if (r && yaz_iconv_error(cd) == YAZ_ICONV_E2BIG)
369         {
370             /* not enough room. reset output to original values */
371             *outbuf = outbuf0;
372             *outbytesleft = outbytesleft0;
373             w->write_marc8_last = last_ch;
374             w->write_marc8_ncr = ncr;
375             w->write_marc8_lpage = lpage;
376         }
377         return r;
378     }
379     return yaz_write_marc8_2(cd, w, x, outbuf, outbytesleft, loss_mode);
380 }
381
382 static size_t write_marc8_normal(yaz_iconv_t cd, yaz_iconv_encoder_t e,
383                                  unsigned long x,
384                                  char **outbuf, size_t *outbytesleft)
385 {
386     return yaz_write_marc8_generic(cd, e->data, x, outbuf, outbytesleft, 0);
387 }
388
389 static size_t write_marc8_lossy(yaz_iconv_t cd, yaz_iconv_encoder_t e,
390                                 unsigned long x,
391                                 char **outbuf, size_t *outbytesleft)
392 {
393     return yaz_write_marc8_generic(cd, e->data, x, outbuf, outbytesleft, 1);
394 }
395
396 static size_t write_marc8_lossless(yaz_iconv_t cd, yaz_iconv_encoder_t e,
397                                    unsigned long x,
398                                    char **outbuf, size_t *outbytesleft)
399 {
400     return yaz_write_marc8_generic(cd, e->data, x, outbuf, outbytesleft, 2);
401 }
402
/* Destroy handler: releases the struct encoder_data that
 * yaz_marc8_encoder() allocated with xmalloc() and stored in e->data.
 * e->data itself is not reset. */
static void destroy_marc8(yaz_iconv_encoder_t e)
{
    xfree(e->data);
}
407
408 yaz_iconv_encoder_t yaz_marc8_encoder(const char *tocode,
409                                       yaz_iconv_encoder_t e)
410     
411 {
412     if (!yaz_matchstr(tocode, "MARC8"))
413         e->write_handle = write_marc8_normal;
414     else if (!yaz_matchstr(tocode, "MARC8s"))
415         e->write_handle = write_marc8_normal;
416     else if (!yaz_matchstr(tocode, "MARC8lossy"))
417         e->write_handle = write_marc8_lossy;
418     else if (!yaz_matchstr(tocode, "MARC8lossless"))
419         e->write_handle = write_marc8_lossless;
420     else
421         return 0;
422
423     {
424         struct encoder_data *data = xmalloc(sizeof(*data));
425         e->data = data;
426         e->destroy_handle = destroy_marc8;
427         e->flush_handle = flush_marc8;
428         e->init_handle = init_marc8;
429     }
430     return e;
431 }
432
433
434 /*
435  * Local variables:
436  * c-basic-offset: 4
437  * indent-tabs-mode: nil
438  * End:
439  * vim: shiftwidth=4 tabstop=8 expandtab
440  */