Implemented lossy and lossless MARC-8 encoding.
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2008, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.50 2008-03-12 08:53:28 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36 #include <yaz/xmalloc.h>
37 #include <yaz/nmem.h>
38 #include <yaz/snprintf.h>
39 #include "iconv-p.h"
40
41 typedef unsigned long yaz_conv_func_t(unsigned char *inp, size_t inbytesleft,
42                                       size_t *no_read, int *combining,
43                                       unsigned mask, int boffset);
44
45
46 yaz_conv_func_t yaz_marc8_42_conv;
47 yaz_conv_func_t yaz_marc8_45_conv;
48 yaz_conv_func_t yaz_marc8_67_conv;
49 yaz_conv_func_t yaz_marc8_62_conv;
50 yaz_conv_func_t yaz_marc8_70_conv;
51 yaz_conv_func_t yaz_marc8_32_conv;
52 yaz_conv_func_t yaz_marc8_4E_conv;
53 yaz_conv_func_t yaz_marc8_51_conv;
54 yaz_conv_func_t yaz_marc8_33_conv;
55 yaz_conv_func_t yaz_marc8_34_conv;
56 yaz_conv_func_t yaz_marc8_53_conv;
57 yaz_conv_func_t yaz_marc8_31_conv;
58
59 yaz_conv_func_t yaz_marc8r_42_conv;
60 yaz_conv_func_t yaz_marc8r_45_conv;
61 yaz_conv_func_t yaz_marc8r_67_conv;
62 yaz_conv_func_t yaz_marc8r_62_conv;
63 yaz_conv_func_t yaz_marc8r_70_conv;
64 yaz_conv_func_t yaz_marc8r_32_conv;
65 yaz_conv_func_t yaz_marc8r_4E_conv;
66 yaz_conv_func_t yaz_marc8r_51_conv;
67 yaz_conv_func_t yaz_marc8r_33_conv;
68 yaz_conv_func_t yaz_marc8r_34_conv;
69 yaz_conv_func_t yaz_marc8r_53_conv;
70 yaz_conv_func_t yaz_marc8r_31_conv;
71
72 struct yaz_iconv_struct {
73     int my_errno;
74     int init_flag;
75     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
76                           size_t inbytesleft, size_t *no_read);
77     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
78                                  size_t inbytesleft, size_t *no_read);
79     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
80                            char **outbuf, size_t *outbytesleft);
81     size_t (*flush_handle)(yaz_iconv_t cd,
82                            char **outbuf, size_t *outbytesleft);
83     int g0_mode;
84     int g1_mode;
85
86     int comb_offset;
87     int comb_size;
88     unsigned long comb_x[8];
89     size_t comb_no_read[8];
90     size_t no_read_x;
91     unsigned long unget_x;
92 #if HAVE_ICONV_H
93     iconv_t iconv_cd;
94 #endif
95     unsigned long compose_char;
96
97     unsigned write_marc8_second_half_char;
98     unsigned long write_marc8_last;
99     int write_marc8_ncr;
100     const char *write_marc8_lpage;
101     const char *write_marc8_g0;
102     const char *write_marc8_g1;
103 };
104
105
/**
 * Pairs of (base character x1, combining character x2) and the single
 * ISO-8859-1 code point y they compose to.  The table is only ever read,
 * so it is const.  It is terminated by an entry whose x1 is 0.
 */
static const struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* no need for 0xc6      LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted:    0xd7      MULTIPLICATION SIGN */
    /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
    /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted:    0xe6      LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted:    0xf0      LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted:    0xf7      DIVISION SIGN */
    /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted:    0xfe      LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

    { 0, 0, 0}
};
176
177 #define ESC "\033"
178
179 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
180                                        char **outbuf, size_t *outbytesleft,
181                                        const char *page_chr);
182
183 static unsigned long yaz_read_ISO8859_1(yaz_iconv_t cd, unsigned char *inp,
184                                         size_t inbytesleft, size_t *no_read)
185 {
186     unsigned long x = inp[0];
187     *no_read = 1;
188     return x;
189 }
190
191
192
193 #if HAVE_WCHAR_H
194 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
195                                        size_t inbytesleft, size_t *no_read)
196 {
197     unsigned long x = 0;
198     
199     if (inbytesleft < sizeof(wchar_t))
200     {
201         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
202         *no_read = 0;
203     }
204     else
205     {
206         wchar_t wch;
207         memcpy (&wch, inp, sizeof(wch));
208         x = wch;
209         *no_read = sizeof(wch);
210     }
211     return x;
212 }
213 #endif
214
215
216 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
217                                           size_t inbytesleft, size_t *no_read,
218                                           int *comb);
219
220 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
221                                      size_t inbytesleft, size_t *no_read)
222 {
223     unsigned long x;
224     if (cd->comb_offset < cd->comb_size)
225     {
226         *no_read = cd->comb_no_read[cd->comb_offset];
227         x = cd->comb_x[cd->comb_offset];
228
229         /* special case for double-diacritic combining characters, 
230            INVERTED BREVE and DOUBLE TILDE.
231            We'll increment the no_read counter by 1, since we want to skip over
232            the processing of the closing ligature character
233         */
234         /* this code is no longer necessary.. our handlers code in
235            yaz_marc8_?_conv (generated by charconv.tcl) now returns
236            0 and no_read=1 when a sequence does not match the input.
237            The SECOND HALFs in codetables.xml produces a non-existant
238            entry in the conversion trie.. Hence when met, the input byte is
239            skipped as it should (in yaz_iconv)
240         */
241 #if 0
242         if (x == 0x0361 || x == 0x0360)
243             *no_read += 1;
244 #endif
245         cd->comb_offset++;
246         return x;
247     }
248
249     cd->comb_offset = 0;
250     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
251     {
252         int comb = 0;
253
254         if (inbytesleft == 0 && cd->comb_size)
255         {
256             cd->my_errno = YAZ_ICONV_EINVAL;
257             x = 0;
258             *no_read = 0;
259             break;
260         }
261         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
262         if (!comb || !x)
263             break;
264         cd->comb_x[cd->comb_size] = x;
265         cd->comb_no_read[cd->comb_size] = *no_read;
266         inp += *no_read;
267         inbytesleft = inbytesleft - *no_read;
268     }
269     return x;
270 }
271
272 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
273                                      size_t inbytesleft, size_t *no_read)
274 {
275     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
276     if (x && cd->comb_size == 1)
277     {
278         /* For MARC8s we try to get a Latin-1 page code out of it */
279         int i;
280         for (i = 0; latin1_comb[i].x1; i++)
281             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
282             {
283                 *no_read += cd->comb_no_read[0];
284                 cd->comb_size = 0;
285                 x = latin1_comb[i].y;
286                 break;
287             }
288     }
289     return x;
290 }
291
292 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
293                                          size_t inbytesleft, size_t *no_read,
294                                          int *comb)
295 {
296     *no_read = 0;
297     while (inbytesleft > 0 && *inp == 27)
298     {
299         int *modep = &cd->g0_mode;
300         size_t inbytesleft0 = inbytesleft;
301
302         inbytesleft--;
303         inp++;
304         if (inbytesleft == 0)
305             goto incomplete;
306         if (*inp == '$') /* set with multiple bytes */
307         {
308             inbytesleft--;
309             inp++;
310         }
311         if (inbytesleft == 0)
312             goto incomplete;
313         if (*inp == '(' || *inp == ',')  /* G0 */
314         {
315             inbytesleft--;
316             inp++;
317         }
318         else if (*inp == ')' || *inp == '-') /* G1 */
319         {
320             inbytesleft--;
321             inp++;
322             modep = &cd->g1_mode;
323         }
324         if (inbytesleft == 0)
325             goto incomplete;
326         if (*inp == '!') /* ANSEL is a special case */
327         {
328             inbytesleft--;
329             inp++;
330         }
331         if (inbytesleft == 0)
332             goto incomplete;
333         *modep = *inp++; /* Final character */
334         inbytesleft--;
335
336         (*no_read) += inbytesleft0 - inbytesleft;
337     }
338     if (inbytesleft == 0)
339         return 0;
340     else if (*inp == ' ')
341     {
342         *no_read += 1;
343         return ' ';
344     }
345     else
346     {
347         unsigned long x;
348         size_t no_read_sub = 0;
349         int mode = *inp < 128 ? cd->g0_mode : cd->g1_mode;
350         *comb = 0;
351
352         switch(mode)
353         {
354         case 'B':  /* Basic ASCII */
355         case 's':  /* ASCII */
356             x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
357             break;
358         case 'E':  /* ANSEL */
359             x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
360             break;
361         case 'g':  /* Greek */
362             x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
363             break;
364         case 'b':  /* Subscripts */
365             x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
366             break;
367         case 'p':  /* Superscripts */
368             x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
369             break;
370         case '2':  /* Basic Hebrew */
371             x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
372             break;
373         case 'N':  /* Basic Cyrillic */
374             x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
375             break;
376         case 'Q':  /* Extended Cyrillic */
377             x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
378             break;
379         case '3':  /* Basic Arabic */
380             x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
381             break;
382         case '4':  /* Extended Arabic */
383             x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
384             break;
385         case 'S':  /* Greek */
386             x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
387             break;
388         case '1':  /* Chinese, Japanese, Korean (EACC) */
389             x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
390             break;
391         default:
392             *no_read = 0;
393             cd->my_errno = YAZ_ICONV_EILSEQ;
394             return 0;
395         }
396         *no_read += no_read_sub;
397         return x;
398     }
399 incomplete:
400     *no_read = 0;
401     cd->my_errno = YAZ_ICONV_EINVAL;
402     return 0;
403 }
404
405 static size_t yaz_write_ISO8859_1(yaz_iconv_t cd, unsigned long x,
406                                   char **outbuf, size_t *outbytesleft)
407 {
408     /* list of two char unicode sequence that, when combined, are
409        equivalent to single unicode chars that can be represented in
410        ISO-8859-1/Latin-1.
411        Regular iconv on Linux at least does not seem to convert these,
412        but since MARC-8 to UTF-8 generates these composed sequence
413        we get a better chance of a successful MARC-8 -> ISO-8859-1
414        conversion */
415     unsigned char *outp = (unsigned char *) *outbuf;
416
417     if (cd->compose_char)
418     {
419         int i;
420         for (i = 0; latin1_comb[i].x1; i++)
421             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
422             {
423                 x = latin1_comb[i].y;
424                 break;
425             }
426         if (*outbytesleft < 1)
427         {  /* no room. Retain compose_char and bail out */
428             cd->my_errno = YAZ_ICONV_E2BIG;
429             return (size_t)(-1);
430         }
431         if (!latin1_comb[i].x1) 
432         {   /* not found. Just write compose_char */
433             *outp++ = (unsigned char) cd->compose_char;
434             (*outbytesleft)--;
435             *outbuf = (char *) outp;
436         }
437         /* compose_char used so reset it. x now holds current char */
438         cd->compose_char = 0;
439     }
440
441     if (x > 32 && x < 127 && cd->compose_char == 0)
442     {
443         cd->compose_char = x;
444         return 0;
445     }
446     else if (x > 255 || x < 1)
447     {
448         cd->my_errno = YAZ_ICONV_EILSEQ;
449         return (size_t) -1;
450     }
451     else if (*outbytesleft < 1)
452     {
453         cd->my_errno = YAZ_ICONV_E2BIG;
454         return (size_t)(-1);
455     }
456     *outp++ = (unsigned char) x;
457     (*outbytesleft)--;
458     *outbuf = (char *) outp;
459     return 0;
460 }
461
462 static size_t yaz_flush_ISO8859_1(yaz_iconv_t cd,
463                                   char **outbuf, size_t *outbytesleft)
464 {
465     if (cd->compose_char)
466     {
467         unsigned char *outp = (unsigned char *) *outbuf;
468         if (*outbytesleft < 1)
469         {
470             cd->my_errno = YAZ_ICONV_E2BIG;
471             return (size_t)(-1);
472         }
473         *outp++ = (unsigned char) cd->compose_char;
474         (*outbytesleft)--;
475         *outbuf = (char *) outp;
476         cd->compose_char = 0;
477     }
478     return 0;
479 }
480
481 static unsigned long lookup_marc8(yaz_iconv_t cd,
482                                   unsigned long x, int *comb,
483                                   const char **page_chr)
484 {
485     char utf8_buf[7];
486     char *utf8_outbuf = utf8_buf;
487     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
488
489     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft);
490     if (r == (size_t)(-1))
491     {
492         cd->my_errno = YAZ_ICONV_EILSEQ;
493         return 0;
494     }
495     else
496     {
497         unsigned char *inp;
498         size_t inbytesleft, no_read_sub = 0;
499         unsigned long x;
500
501         *utf8_outbuf = '\0';        
502         inp = (unsigned char *) utf8_buf;
503         inbytesleft = strlen(utf8_buf);
504
505         x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
506         if (x)
507         {
508             *page_chr = ESC "(B";
509             return x;
510         }
511         x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
512         if (x)
513         {
514             *page_chr = ESC "(B";
515             return x;
516         }
517         x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
518         if (x)
519         {
520             *page_chr = ESC "b";
521             return x;
522         }
523         x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
524         if (x)
525         {
526             *page_chr = ESC "p";
527             return x;
528         }
529         x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
530         if (x)
531         {
532             *page_chr = ESC "(2";
533             return x;
534         }
535         x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
536         if (x)
537         {
538             *page_chr = ESC "(N";
539             return x;
540         }
541         x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
542         if (x)
543         {
544             *page_chr = ESC "(Q";
545             return x;
546         }
547         x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
548         if (x)
549         {
550             *page_chr = ESC "(3";
551             return x;
552         }
553         x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
554         if (x)
555         {
556             *page_chr = ESC "(4";
557             return x;
558         }
559         x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
560         if (x)
561         {
562             *page_chr = ESC "(S";
563             return x;
564         }
565         x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
566         if (x)
567         {
568             *page_chr = ESC "$1";
569             return x;
570         }
571         cd->my_errno = YAZ_ICONV_EILSEQ;
572         return x;
573     }
574 }
575
576 static size_t flush_combos(yaz_iconv_t cd,
577                            char **outbuf, size_t *outbytesleft)
578 {
579     unsigned long y = cd->write_marc8_last;
580
581     if (!y)
582         return 0;
583
584     assert(cd->write_marc8_lpage);
585     if (cd->write_marc8_lpage)
586     {
587         size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
588                                             cd->write_marc8_lpage);
589         if (r)
590             return r;
591     }
592
593     if (9 >= *outbytesleft)
594     {
595         cd->my_errno = YAZ_ICONV_E2BIG;
596         return (size_t) (-1);
597     }
598     if (cd->write_marc8_ncr)
599     {
600         yaz_snprintf(*outbuf, 9, "&#x%04x;", y);
601         (*outbytesleft) -= 8;
602         (*outbuf) += 8;
603     }
604     else
605     {
606         char out_buf[4];
607         size_t out_no = 0;
608         unsigned char byte;
609
610
611         byte = (unsigned char )((y>>16) & 0xff);
612         if (byte)
613             out_buf[out_no++] = byte;
614         byte = (unsigned char)((y>>8) & 0xff);
615         if (byte)
616             out_buf[out_no++] = byte;
617         byte = (unsigned char )(y & 0xff);
618         if (byte)
619             out_buf[out_no++] = byte;
620         memcpy(*outbuf, out_buf, out_no);
621         *outbuf += out_no;
622         (*outbytesleft) -= out_no;
623     }
624
625     if (cd->write_marc8_second_half_char)
626     {
627         *(*outbuf)++ = cd->write_marc8_second_half_char;
628         (*outbytesleft)--;
629     }        
630
631     cd->write_marc8_last = 0;
632     cd->write_marc8_ncr = 0;
633     cd->write_marc8_lpage = 0;
634     cd->write_marc8_second_half_char = 0;
635     return 0;
636 }
637
638 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
639                                        char **outbuf, size_t *outbytesleft,
640                                        const char *page_chr)
641 {
642     const char **old_page_chr = &cd->write_marc8_g0;
643
644     /* are we going to a G1-set (such as such as ESC ")!E") */
645     if (page_chr && page_chr[1] == ')')
646         old_page_chr = &cd->write_marc8_g1;
647
648     if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
649     {
650         size_t plen = 0;
651         const char *page_out = page_chr;
652         
653         if (*outbytesleft < 8)
654         {
655             cd->my_errno = YAZ_ICONV_E2BIG;
656             
657             return (size_t) (-1);
658         }
659
660         if (*old_page_chr)
661         {
662             if (!strcmp(*old_page_chr, ESC "p") 
663                 || !strcmp(*old_page_chr, ESC "g")
664                 || !strcmp(*old_page_chr, ESC "b"))
665             {
666                 page_out = ESC "s";
667                 /* Technique 1 leave */
668                 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
669                 {
670                     /* Must leave script + enter new page */
671                     plen = strlen(page_out);
672                     memcpy(*outbuf, page_out, plen);
673                     (*outbuf) += plen;
674                     (*outbytesleft) -= plen;
675                     page_out = ESC "(B";
676                 }
677             }
678         }
679         *old_page_chr = page_chr;
680         plen = strlen(page_out);
681         memcpy(*outbuf, page_out, plen);
682         (*outbuf) += plen;
683         (*outbytesleft) -= plen;
684     }
685     return 0;
686 }
687
688
689 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
690                                 char **outbuf, size_t *outbytesleft,
691                                 int loss_mode)
692 {
693     int comb = 0;
694     int enable_ncr = 0;
695     const char *page_chr = 0;
696     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
697
698     if (!y)
699     {
700         if (loss_mode == 0 || cd->my_errno != YAZ_ICONV_EILSEQ)
701             return (size_t) (-1);
702         page_chr = ESC "(B";
703         if (loss_mode == 1)
704             y = '|';
705         else
706         {
707             y = x; 
708             enable_ncr = 1;
709         }
710     }
711
712     if (comb)
713     {
714         if (page_chr)
715         {
716             size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
717                                                 page_chr);
718             if (r)
719                 return r;
720         }
721         if (x == 0x0361)
722             cd->write_marc8_second_half_char = 0xEC;
723         else if (x == 0x0360)
724             cd->write_marc8_second_half_char = 0xFB;
725
726         if (*outbytesleft <= 1)
727         {
728             cd->my_errno = YAZ_ICONV_E2BIG;
729             return (size_t) (-1);
730         }
731         *(*outbuf)++ = y;
732         (*outbytesleft)--;
733     }
734     else
735     {
736         size_t r = flush_combos(cd, outbuf, outbytesleft);
737         if (r)
738             return r;
739
740         cd->write_marc8_last = y;
741         cd->write_marc8_lpage = page_chr;
742         cd->write_marc8_ncr = enable_ncr;
743     }
744     return 0;
745 }
746
747 static size_t yaz_flush_marc8(yaz_iconv_t cd,
748                               char **outbuf, size_t *outbytesleft)
749 {
750     size_t r = flush_combos(cd, outbuf, outbytesleft);
751     if (r)
752         return r;
753     cd->write_marc8_g1 = 0;
754     return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, ESC "(B");
755 }
756
757 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x,
758                                       char **outbuf, size_t *outbytesleft,
759                                       int loss_mode);
760
761 static size_t yaz_write_marc8_normal(yaz_iconv_t cd, unsigned long x,
762                                      char **outbuf, size_t *outbytesleft)
763 {
764     return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 0);
765 }
766
767 static size_t yaz_write_marc8_lossy(yaz_iconv_t cd, unsigned long x,
768                                     char **outbuf, size_t *outbytesleft)
769 {
770     return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 1);
771 }
772
773 static size_t yaz_write_marc8_lossless(yaz_iconv_t cd, unsigned long x,
774                                     char **outbuf, size_t *outbytesleft)
775 {
776     return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 2);
777 }
778
779 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x,
780                                       char **outbuf, size_t *outbytesleft,
781                                       int loss_mode)
782 {
783     int i;
784     for (i = 0; latin1_comb[i].x1; i++)
785     {
786         if (x == latin1_comb[i].y)
787         {
788             size_t r ;
789             /* save the output pointers .. */
790             char *outbuf0 = *outbuf;
791             size_t outbytesleft0 = *outbytesleft;
792             int last_ch = cd->write_marc8_last;
793             const char *lpage = cd->write_marc8_lpage;
794
795             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
796                                   outbuf, outbytesleft, loss_mode);
797             if (r)
798                 return r;
799             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
800                                   outbuf, outbytesleft, loss_mode);
801             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
802             {
803                 /* not enough room. reset output to original values */
804                 *outbuf = outbuf0;
805                 *outbytesleft = outbytesleft0;
806                 cd->write_marc8_last = last_ch;
807                 cd->write_marc8_lpage = lpage;
808             }
809             return r;
810         }
811     }
812     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, loss_mode);
813 }
814
815
816 #if HAVE_WCHAR_H
817 static size_t yaz_write_wchar_t(yaz_iconv_t cd, unsigned long x,
818                                 char **outbuf, size_t *outbytesleft)
819 {
820     unsigned char *outp = (unsigned char *) *outbuf;
821
822     if (*outbytesleft >= sizeof(wchar_t))
823     {
824         wchar_t wch = x;
825         memcpy(outp, &wch, sizeof(wch));
826         outp += sizeof(wch);
827         (*outbytesleft) -= sizeof(wch);
828     }
829     else
830     {
831         cd->my_errno = YAZ_ICONV_E2BIG;
832         return (size_t)(-1);
833     }
834     *outbuf = (char *) outp;
835     return 0;
836 }
837 #endif
838
839 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
840 {
841     return cd->read_handle && cd->write_handle;
842 }
843
844 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
845 {
846     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
847
848     cd->write_handle = 0;
849     cd->read_handle = 0;
850     cd->init_handle = 0;
851     cd->flush_handle = 0;
852     cd->my_errno = YAZ_ICONV_UNKNOWN;
853
854     /* a useful hack: if fromcode has leading @,
855        the library not use YAZ's own conversions .. */
856     if (fromcode[0] == '@')
857         fromcode++;
858     else
859     {
860         if (!yaz_matchstr(fromcode, "UTF8"))
861         {
862             cd->read_handle = yaz_read_UTF8;
863             cd->init_handle = yaz_init_UTF8;
864         }
865         else if (!yaz_matchstr(fromcode, "ISO88591"))
866             cd->read_handle = yaz_read_ISO8859_1;
867         else if (!yaz_matchstr(fromcode, "UCS4"))
868             cd->read_handle = yaz_read_UCS4;
869         else if (!yaz_matchstr(fromcode, "UCS4LE"))
870             cd->read_handle = yaz_read_UCS4LE;
871         else if (!yaz_matchstr(fromcode, "MARC8"))
872             cd->read_handle = yaz_read_marc8;
873         else if (!yaz_matchstr(fromcode, "MARC8s"))
874             cd->read_handle = yaz_read_marc8s;
875         else if (!yaz_matchstr(fromcode, "advancegreek"))
876             cd->read_handle = yaz_read_advancegreek;
877         else if (!yaz_matchstr(fromcode, "iso54281984"))
878             cd->read_handle = yaz_read_iso5428_1984;
879         else if (!yaz_matchstr(fromcode, "iso5428:1984"))
880             cd->read_handle = yaz_read_iso5428_1984;
881 #if HAVE_WCHAR_H
882         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
883             cd->read_handle = yaz_read_wchar_t;
884 #endif
885         
886         if (!yaz_matchstr(tocode, "UTF8"))
887             cd->write_handle = yaz_write_UTF8;
888         else if (!yaz_matchstr(tocode, "ISO88591"))
889         {
890             cd->write_handle = yaz_write_ISO8859_1;
891             cd->flush_handle = yaz_flush_ISO8859_1;
892         }
893         else if (!yaz_matchstr (tocode, "UCS4"))
894             cd->write_handle = yaz_write_UCS4;
895         else if (!yaz_matchstr(tocode, "UCS4LE"))
896             cd->write_handle = yaz_write_UCS4LE;
897         else if (!yaz_matchstr(tocode, "MARC8"))
898         {
899             cd->write_handle = yaz_write_marc8_normal;
900             cd->flush_handle = yaz_flush_marc8;
901         }
902         else if (!yaz_matchstr(tocode, "MARC8s"))
903         {
904             cd->write_handle = yaz_write_marc8_normal;
905             cd->flush_handle = yaz_flush_marc8;
906         }
907         else if (!yaz_matchstr(tocode, "MARC8lossy"))
908         {
909             cd->write_handle = yaz_write_marc8_lossy;
910             cd->flush_handle = yaz_flush_marc8;
911         }
912         else if (!yaz_matchstr(tocode, "MARC8lossless"))
913         {
914             cd->write_handle = yaz_write_marc8_lossless;
915             cd->flush_handle = yaz_flush_marc8;
916         }
917         else if (!yaz_matchstr(tocode, "advancegreek"))
918         {
919             cd->write_handle = yaz_write_advancegreek;
920         }
921         else if (!yaz_matchstr(tocode, "iso54281984"))
922         {
923             cd->write_handle = yaz_write_iso5428_1984;
924         }
925         else if (!yaz_matchstr(tocode, "iso5428:1984"))
926         {
927             cd->write_handle = yaz_write_iso5428_1984;
928         }
929 #if HAVE_WCHAR_H
930         else if (!yaz_matchstr(tocode, "WCHAR_T"))
931             cd->write_handle = yaz_write_wchar_t;
932 #endif
933     }
934 #if HAVE_ICONV_H
935     cd->iconv_cd = 0;
936     if (!cd->read_handle || !cd->write_handle)
937     {
938         cd->iconv_cd = iconv_open (tocode, fromcode);
939         if (cd->iconv_cd == (iconv_t) (-1))
940         {
941             xfree (cd);
942             return 0;
943         }
944     }
945 #else
946     if (!cd->read_handle || !cd->write_handle)
947     {
948         xfree (cd);
949         return 0;
950     }
951 #endif
952     cd->init_flag = 1;
953     return cd;
954 }
955
956 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
957                  char **outbuf, size_t *outbytesleft)
958 {
959     char *inbuf0 = 0;
960     size_t r = 0;
961
962 #if HAVE_ICONV_H
963     if (cd->iconv_cd)
964     {
965         size_t r =
966             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
967         if (r == (size_t)(-1))
968         {
969             switch (yaz_errno())
970             {
971             case E2BIG:
972                 cd->my_errno = YAZ_ICONV_E2BIG;
973                 break;
974             case EINVAL:
975                 cd->my_errno = YAZ_ICONV_EINVAL;
976                 break;
977             case EILSEQ:
978                 cd->my_errno = YAZ_ICONV_EILSEQ;
979                 break;
980             default:
981                 cd->my_errno = YAZ_ICONV_UNKNOWN;
982             }
983         }
984         return r;
985     }
986 #endif
987
988     if (inbuf)
989         inbuf0 = *inbuf;
990
991     if (cd->init_flag)
992     {
993         cd->my_errno = YAZ_ICONV_UNKNOWN;
994         cd->g0_mode = 'B';
995         cd->g1_mode = 'E';
996         
997         cd->comb_offset = cd->comb_size = 0;
998         cd->compose_char = 0;
999         
1000         cd->write_marc8_second_half_char = 0;
1001         cd->write_marc8_last = 0;
1002         cd->write_marc8_ncr = 0;
1003         cd->write_marc8_lpage = 0;
1004         cd->write_marc8_g0 = ESC "(B";
1005         cd->write_marc8_g1 = 0;
1006         
1007         cd->unget_x = 0;
1008         cd->no_read_x = 0;
1009     }
1010
1011     if (cd->init_flag)
1012     {
1013         if (cd->init_handle && inbuf && *inbuf)
1014         {
1015             size_t no_read = 0;
1016             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1017                                          *inbytesleft, &no_read);
1018             if (r)
1019             {
1020                 if (cd->my_errno == YAZ_ICONV_EINVAL)
1021                     return r;
1022                 cd->init_flag = 0;
1023                 return r;
1024             }
1025             *inbytesleft -= no_read;
1026             *inbuf += no_read;
1027         }
1028     }
1029     cd->init_flag = 0;
1030
1031     if (!inbuf || !*inbuf)
1032     {
1033         if (outbuf && *outbuf)
1034         {
1035             if (cd->unget_x)
1036                 r = (*cd->write_handle)(cd, cd->unget_x, outbuf, outbytesleft);
1037             if (cd->flush_handle)
1038                 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1039         }
1040         if (r == 0)
1041             cd->init_flag = 1;
1042         cd->unget_x = 0;
1043         return r;
1044     }
1045     while (1)
1046     {
1047         unsigned long x;
1048         size_t no_read;
1049
1050         if (cd->unget_x)
1051         {
1052             x = cd->unget_x;
1053             no_read = cd->no_read_x;
1054         }
1055         else
1056         {
1057             if (*inbytesleft == 0)
1058             {
1059                 r = *inbuf - inbuf0;
1060                 break;
1061             }
1062             x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1063                                    &no_read);
1064             if (no_read == 0)
1065             {
1066                 r = (size_t)(-1);
1067                 break;
1068             }
1069         }
1070         if (x)
1071         {
1072             r = (*cd->write_handle)(cd, x, outbuf, outbytesleft);
1073             if (r)
1074             {
1075                 /* unable to write it. save it because read_handle cannot
1076                    rewind .. */
1077                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1078                 {
1079                     cd->unget_x = x;
1080                     cd->no_read_x = no_read;
1081                     break;
1082                 }
1083             }
1084             cd->unget_x = 0;
1085         }
1086         *inbytesleft -= no_read;
1087         (*inbuf) += no_read;
1088     }
1089     return r;
1090 }
1091
1092 int yaz_iconv_error (yaz_iconv_t cd)
1093 {
1094     return cd->my_errno;
1095 }
1096
1097 int yaz_iconv_close (yaz_iconv_t cd)
1098 {
1099 #if HAVE_ICONV_H
1100     if (cd->iconv_cd)
1101         iconv_close (cd->iconv_cd);
1102 #endif
1103     xfree (cd);
1104     return 0;
1105 }
1106
1107 void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
1108 {
1109     cd->my_errno = no;
1110 }
1111
1112 /*
1113  * Local variables:
1114  * c-basic-offset: 4
1115  * indent-tabs-mode: nil
1116  * End:
1117  * vim: shiftwidth=4 tabstop=8 expandtab
1118  */