de3c54e10326bb307619951ac36f027243a1b19d
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2008, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.50 2008-03-12 08:53:28 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36 #include <yaz/xmalloc.h>
37 #include <yaz/nmem.h>
38 #include <yaz/snprintf.h>
39 #include "iconv-p.h"
40
/**
 * Signature shared by the MARC-8 table converters declared below
 * (generated code; the definitions are not part of this file).
 *
 * inp/inbytesleft describe the input; the number of bytes consumed is
 * stored in *no_read and the converted character is returned (0 when the
 * input does not match the table).  *combining is set by the converter and
 * is used by the callers in this file to tell combining characters from
 * base characters.  mask and boffset are passed straight through to the
 * generated code: the readers in this file use mask 127 (boffset 128 for
 * ANSEL, 0 otherwise) and the writers use mask 255 with boffset 0.
 */
typedef unsigned long yaz_conv_func_t(unsigned char *inp, size_t inbytesleft,
                                      size_t *no_read, int *combining,
                                      unsigned mask, int boffset);


/* MARC-8 -> Unicode converters.  The two hex digits in each name are the
   final character of the escape sequence that selects the set, e.g.
   42 = 'B' (ASCII), 45 = 'E' (ANSEL); see yaz_read_marc8_comb. */
yaz_conv_func_t yaz_marc8_42_conv;
yaz_conv_func_t yaz_marc8_45_conv;
yaz_conv_func_t yaz_marc8_67_conv;
yaz_conv_func_t yaz_marc8_62_conv;
yaz_conv_func_t yaz_marc8_70_conv;
yaz_conv_func_t yaz_marc8_32_conv;
yaz_conv_func_t yaz_marc8_4E_conv;
yaz_conv_func_t yaz_marc8_51_conv;
yaz_conv_func_t yaz_marc8_33_conv;
yaz_conv_func_t yaz_marc8_34_conv;
yaz_conv_func_t yaz_marc8_53_conv;
yaz_conv_func_t yaz_marc8_31_conv;

/* Reverse converters (UTF-8 encoded input -> MARC-8), used by lookup_marc8.
   Same naming scheme as above. */
yaz_conv_func_t yaz_marc8r_42_conv;
yaz_conv_func_t yaz_marc8r_45_conv;
yaz_conv_func_t yaz_marc8r_67_conv;
yaz_conv_func_t yaz_marc8r_62_conv;
yaz_conv_func_t yaz_marc8r_70_conv;
yaz_conv_func_t yaz_marc8r_32_conv;
yaz_conv_func_t yaz_marc8r_4E_conv;
yaz_conv_func_t yaz_marc8r_51_conv;
yaz_conv_func_t yaz_marc8r_33_conv;
yaz_conv_func_t yaz_marc8r_34_conv;
yaz_conv_func_t yaz_marc8r_53_conv;
yaz_conv_func_t yaz_marc8r_31_conv;
71
/**
 * Conversion descriptor: the selected read/write handlers plus the state
 * the built-in decoders and encoders keep between calls.
 */
struct yaz_iconv_struct {
    /* last YAZ_ICONV_* error code recorded by a handler */
    int my_errno;
    /* NOTE(review): not referenced in the visible part of this file */
    int init_flag;
    /* optional handler installed together with the decoder
       (yaz_iconv_open sets it for UTF-8 input only) */
    size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
                          size_t inbytesleft, size_t *no_read);
    /* decoder: returns one character and stores the number of input bytes
       it consumed in *no_read */
    unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
                                 size_t inbytesleft, size_t *no_read);
    /* encoder: writes character x to *outbuf; returns 0 on success or
       (size_t)(-1) with my_errno set */
    size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
                           char **outbuf, size_t *outbytesleft);
    /* optional: writes whatever the encoder is still holding back */
    size_t (*flush_handle)(yaz_iconv_t cd,
                           char **outbuf, size_t *outbytesleft);
    /* MARC-8 decoder: final character of the last escape sequence seen for
       the G0 and G1 sets; selects the conversion table for bytes < 128
       and >= 128 respectively */
    int g0_mode;
    int g1_mode;

    /* MARC-8 decoder: combining characters read ahead of their base
       character.  Entries comb_offset .. comb_size-1 of comb_x are still
       to be returned; comb_no_read holds the input bytes each one used. */
    int comb_offset;
    int comb_size;
    unsigned long comb_x[8];
    size_t comb_no_read[8];
    /* NOTE(review): no_read_x and unget_x are not referenced in the visible
       part of this file */
    size_t no_read_x;
    unsigned long unget_x;
#if HAVE_ICONV_H
    /* system iconv descriptor; opened by yaz_iconv_open when no built-in
       reader/writer pair was selected */
    iconv_t iconv_cd;
#endif
    /* ISO-8859-1 encoder: printable ASCII character held back because the
       next character may combine with it (0 = nothing pending) */
    unsigned long compose_char;

    /* MARC-8 encoder: byte to emit after the next base character to close
       a double diacritic (0 = none) */
    unsigned write_marc8_second_half_char;
    /* MARC-8 encoder: base character held back until its combining
       characters have been written (0 = nothing pending) */
    unsigned long write_marc8_last;
    /* MARC-8 encoder: non-zero if write_marc8_last is to be written as a
       numeric character reference (&#x....;) */
    int write_marc8_ncr;
    /* MARC-8 encoder: escape sequence of the page write_marc8_last needs */
    const char *write_marc8_lpage;
    /* MARC-8 encoder: escape sequences last emitted for G0 and G1
       (0 = none emitted yet) */
    const char *write_marc8_g0;
    const char *write_marc8_g1;
};
104
105
/**
 * Latin-1 composition table.
 *
 * Each entry maps a base character (x1) followed by a Unicode combining
 * character (x2) to the equivalent precomposed ISO-8859-1 character (y).
 * The table is terminated by an all-zero entry; all y values lie in the
 * range 0xc0 .. 0xff.  The table is only ever read, hence const.
 */
static const struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* omitted:    0xc6      LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted:    0xd7      MULTIPLICATION SIGN */
    /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
    /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted:    0xe6      LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted:    0xf0      LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted:    0xf7      DIVISION SIGN */
    /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted:    0xfe      LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

    { 0, 0, 0}
};
176
177 #define ESC "\033"
178
179 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
180                                        char **outbuf, size_t *outbytesleft,
181                                        const char *page_chr);
182
183 static unsigned long yaz_read_ISO8859_1(yaz_iconv_t cd, unsigned char *inp,
184                                         size_t inbytesleft, size_t *no_read)
185 {
186     unsigned long x = inp[0];
187     *no_read = 1;
188     return x;
189 }
190
191
192
#if HAVE_WCHAR_H
/**
 * wchar_t decoder: reads one wchar_t (in host representation) from inp.
 *
 * Returns the character and sets *no_read to sizeof(wchar_t).  If fewer
 * bytes than that are available, records YAZ_ICONV_EINVAL, sets *no_read
 * to 0 and returns 0.
 */
static unsigned long yaz_read_wchar_t(yaz_iconv_t cd, unsigned char *inp,
                                      size_t inbytesleft, size_t *no_read)
{
    wchar_t wide;

    if (inbytesleft < sizeof(wide))
    {
        /* incomplete input */
        cd->my_errno = YAZ_ICONV_EINVAL;
        *no_read = 0;
        return 0;
    }
    /* memcpy rather than a pointer cast: inp need not be aligned */
    memcpy(&wide, inp, sizeof(wide));
    *no_read = sizeof(wide);
    return (unsigned long) wide;
}
#endif
214
215
216 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
217                                          size_t inbytesleft, size_t *no_read,
218                                          int *comb);
219
220 static unsigned long yaz_read_marc8(yaz_iconv_t cd, unsigned char *inp,
221                                     size_t inbytesleft, size_t *no_read)
222 {
223     unsigned long x;
224     if (cd->comb_offset < cd->comb_size)
225     {
226         *no_read = cd->comb_no_read[cd->comb_offset];
227         x = cd->comb_x[cd->comb_offset];
228
229         /* special case for double-diacritic combining characters, 
230            INVERTED BREVE and DOUBLE TILDE.
231            We'll increment the no_read counter by 1, since we want to skip over
232            the processing of the closing ligature character
233         */
234         /* this code is no longer necessary.. our handlers code in
235            yaz_marc8_?_conv (generated by charconv.tcl) now returns
236            0 and no_read=1 when a sequence does not match the input.
237            The SECOND HALFs in codetables.xml produces a non-existant
238            entry in the conversion trie.. Hence when met, the input byte is
239            skipped as it should (in yaz_iconv)
240         */
241 #if 0
242         if (x == 0x0361 || x == 0x0360)
243             *no_read += 1;
244 #endif
245         cd->comb_offset++;
246         return x;
247     }
248
249     cd->comb_offset = 0;
250     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
251     {
252         int comb = 0;
253
254         if (inbytesleft == 0 && cd->comb_size)
255         {
256             cd->my_errno = YAZ_ICONV_EINVAL;
257             x = 0;
258             *no_read = 0;
259             break;
260         }
261         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
262         if (!comb || !x)
263             break;
264         cd->comb_x[cd->comb_size] = x;
265         cd->comb_no_read[cd->comb_size] = *no_read;
266         inp += *no_read;
267         inbytesleft = inbytesleft - *no_read;
268     }
269     return x;
270 }
271
272 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
273                                      size_t inbytesleft, size_t *no_read)
274 {
275     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
276     if (x && cd->comb_size == 1)
277     {
278         /* For MARC8s we try to get a Latin-1 page code out of it */
279         int i;
280         for (i = 0; latin1_comb[i].x1; i++)
281             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
282             {
283                 *no_read += cd->comb_no_read[0];
284                 cd->comb_size = 0;
285                 x = latin1_comb[i].y;
286                 break;
287             }
288     }
289     return x;
290 }
291
292 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
293                                          size_t inbytesleft, size_t *no_read,
294                                          int *comb)
295 {
296     *no_read = 0;
297     while (inbytesleft > 0 && *inp == 27)
298     {
299         int *modep = &cd->g0_mode;
300         size_t inbytesleft0 = inbytesleft;
301
302         inbytesleft--;
303         inp++;
304         if (inbytesleft == 0)
305             goto incomplete;
306         if (*inp == '$') /* set with multiple bytes */
307         {
308             inbytesleft--;
309             inp++;
310         }
311         if (inbytesleft == 0)
312             goto incomplete;
313         if (*inp == '(' || *inp == ',')  /* G0 */
314         {
315             inbytesleft--;
316             inp++;
317         }
318         else if (*inp == ')' || *inp == '-') /* G1 */
319         {
320             inbytesleft--;
321             inp++;
322             modep = &cd->g1_mode;
323         }
324         if (inbytesleft == 0)
325             goto incomplete;
326         if (*inp == '!') /* ANSEL is a special case */
327         {
328             inbytesleft--;
329             inp++;
330         }
331         if (inbytesleft == 0)
332             goto incomplete;
333         *modep = *inp++; /* Final character */
334         inbytesleft--;
335
336         (*no_read) += inbytesleft0 - inbytesleft;
337     }
338     if (inbytesleft == 0)
339         return 0;
340     else if (*inp == ' ')
341     {
342         *no_read += 1;
343         return ' ';
344     }
345     else
346     {
347         unsigned long x;
348         size_t no_read_sub = 0;
349         int mode = *inp < 128 ? cd->g0_mode : cd->g1_mode;
350         *comb = 0;
351
352         switch(mode)
353         {
354         case 'B':  /* Basic ASCII */
355         case 's':  /* ASCII */
356             x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
357             break;
358         case 'E':  /* ANSEL */
359             x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
360             break;
361         case 'g':  /* Greek */
362             x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
363             break;
364         case 'b':  /* Subscripts */
365             x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
366             break;
367         case 'p':  /* Superscripts */
368             x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
369             break;
370         case '2':  /* Basic Hebrew */
371             x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
372             break;
373         case 'N':  /* Basic Cyrillic */
374             x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
375             break;
376         case 'Q':  /* Extended Cyrillic */
377             x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
378             break;
379         case '3':  /* Basic Arabic */
380             x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
381             break;
382         case '4':  /* Extended Arabic */
383             x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
384             break;
385         case 'S':  /* Greek */
386             x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
387             break;
388         case '1':  /* Chinese, Japanese, Korean (EACC) */
389             x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
390             break;
391         default:
392             *no_read = 0;
393             cd->my_errno = YAZ_ICONV_EILSEQ;
394             return 0;
395         }
396         *no_read += no_read_sub;
397         return x;
398     }
399 incomplete:
400     *no_read = 0;
401     cd->my_errno = YAZ_ICONV_EINVAL;
402     return 0;
403 }
404
405 static size_t yaz_write_ISO8859_1(yaz_iconv_t cd, unsigned long x,
406                                   char **outbuf, size_t *outbytesleft)
407 {
408     /* list of two char unicode sequence that, when combined, are
409        equivalent to single unicode chars that can be represented in
410        ISO-8859-1/Latin-1.
411        Regular iconv on Linux at least does not seem to convert these,
412        but since MARC-8 to UTF-8 generates these composed sequence
413        we get a better chance of a successful MARC-8 -> ISO-8859-1
414        conversion */
415     unsigned char *outp = (unsigned char *) *outbuf;
416
417     if (cd->compose_char)
418     {
419         int i;
420         for (i = 0; latin1_comb[i].x1; i++)
421             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
422             {
423                 x = latin1_comb[i].y;
424                 break;
425             }
426         if (*outbytesleft < 1)
427         {  /* no room. Retain compose_char and bail out */
428             cd->my_errno = YAZ_ICONV_E2BIG;
429             return (size_t)(-1);
430         }
431         if (!latin1_comb[i].x1) 
432         {   /* not found. Just write compose_char */
433             *outp++ = (unsigned char) cd->compose_char;
434             (*outbytesleft)--;
435             *outbuf = (char *) outp;
436         }
437         /* compose_char used so reset it. x now holds current char */
438         cd->compose_char = 0;
439     }
440
441     if (x > 32 && x < 127 && cd->compose_char == 0)
442     {
443         cd->compose_char = x;
444         return 0;
445     }
446     else if (x > 255 || x < 1)
447     {
448         cd->my_errno = YAZ_ICONV_EILSEQ;
449         return (size_t) -1;
450     }
451     else if (*outbytesleft < 1)
452     {
453         cd->my_errno = YAZ_ICONV_E2BIG;
454         return (size_t)(-1);
455     }
456     *outp++ = (unsigned char) x;
457     (*outbytesleft)--;
458     *outbuf = (char *) outp;
459     return 0;
460 }
461
462 static size_t yaz_flush_ISO8859_1(yaz_iconv_t cd,
463                                   char **outbuf, size_t *outbytesleft)
464 {
465     if (cd->compose_char)
466     {
467         unsigned char *outp = (unsigned char *) *outbuf;
468         if (*outbytesleft < 1)
469         {
470             cd->my_errno = YAZ_ICONV_E2BIG;
471             return (size_t)(-1);
472         }
473         *outp++ = (unsigned char) cd->compose_char;
474         (*outbytesleft)--;
475         *outbuf = (char *) outp;
476         cd->compose_char = 0;
477     }
478     return 0;
479 }
480
481 static unsigned long lookup_marc8(yaz_iconv_t cd,
482                                   unsigned long x, int *comb,
483                                   const char **page_chr)
484 {
485     char utf8_buf[7];
486     char *utf8_outbuf = utf8_buf;
487     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
488
489     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft);
490     if (r == (size_t)(-1))
491     {
492         cd->my_errno = YAZ_ICONV_EILSEQ;
493         return 0;
494     }
495     else
496     {
497         unsigned char *inp;
498         size_t inbytesleft, no_read_sub = 0;
499         unsigned long x;
500
501         *utf8_outbuf = '\0';        
502         inp = (unsigned char *) utf8_buf;
503         inbytesleft = strlen(utf8_buf);
504
505         x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
506         if (x)
507         {
508             *page_chr = ESC "(B";
509             return x;
510         }
511         x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
512         if (x)
513         {
514             *page_chr = ESC "(B";
515             return x;
516         }
517         x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
518         if (x)
519         {
520             *page_chr = ESC "b";
521             return x;
522         }
523         x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
524         if (x)
525         {
526             *page_chr = ESC "p";
527             return x;
528         }
529         x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
530         if (x)
531         {
532             *page_chr = ESC "(2";
533             return x;
534         }
535         x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
536         if (x)
537         {
538             *page_chr = ESC "(N";
539             return x;
540         }
541         x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
542         if (x)
543         {
544             *page_chr = ESC "(Q";
545             return x;
546         }
547         x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
548         if (x)
549         {
550             *page_chr = ESC "(3";
551             return x;
552         }
553         x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
554         if (x)
555         {
556             *page_chr = ESC "(4";
557             return x;
558         }
559         x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
560         if (x)
561         {
562             *page_chr = ESC "(S";
563             return x;
564         }
565         x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
566         if (x)
567         {
568             *page_chr = ESC "$1";
569             return x;
570         }
571         cd->my_errno = YAZ_ICONV_EILSEQ;
572         return x;
573     }
574 }
575
576 static size_t flush_combos(yaz_iconv_t cd,
577                            char **outbuf, size_t *outbytesleft)
578 {
579     unsigned long y = cd->write_marc8_last;
580
581     if (!y)
582         return 0;
583
584     assert(cd->write_marc8_lpage);
585     if (cd->write_marc8_lpage)
586     {
587         size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
588                                             cd->write_marc8_lpage);
589         if (r)
590             return r;
591     }
592
593     if (9 >= *outbytesleft)
594     {
595         cd->my_errno = YAZ_ICONV_E2BIG;
596         return (size_t) (-1);
597     }
598     if (cd->write_marc8_ncr)
599     {
600         yaz_snprintf(*outbuf, 9, "&#x%04x;", y);
601         (*outbytesleft) -= 8;
602         (*outbuf) += 8;
603     }
604     else
605     {
606         size_t out_no = 0;
607         unsigned char byte;
608
609         byte = (unsigned char )((y>>16) & 0xff);
610         if (byte)
611             (*outbuf)[out_no++] = byte;
612         byte = (unsigned char)((y>>8) & 0xff);
613         if (byte)
614             (*outbuf)[out_no++] = byte;
615         byte = (unsigned char )(y & 0xff);
616         if (byte)
617             (*outbuf)[out_no++] = byte;
618         *outbuf += out_no;
619         (*outbytesleft) -= out_no;
620     }
621
622     if (cd->write_marc8_second_half_char)
623     {
624         *(*outbuf)++ = cd->write_marc8_second_half_char;
625         (*outbytesleft)--;
626     }        
627
628     cd->write_marc8_last = 0;
629     cd->write_marc8_ncr = 0;
630     cd->write_marc8_lpage = 0;
631     cd->write_marc8_second_half_char = 0;
632     return 0;
633 }
634
635 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
636                                        char **outbuf, size_t *outbytesleft,
637                                        const char *page_chr)
638 {
639     const char **old_page_chr = &cd->write_marc8_g0;
640
641     /* are we going to a G1-set (such as such as ESC ")!E") */
642     if (page_chr && page_chr[1] == ')')
643         old_page_chr = &cd->write_marc8_g1;
644
645     if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
646     {
647         size_t plen = 0;
648         const char *page_out = page_chr;
649         
650         if (*outbytesleft < 8)
651         {
652             cd->my_errno = YAZ_ICONV_E2BIG;
653             
654             return (size_t) (-1);
655         }
656
657         if (*old_page_chr)
658         {
659             if (!strcmp(*old_page_chr, ESC "p") 
660                 || !strcmp(*old_page_chr, ESC "g")
661                 || !strcmp(*old_page_chr, ESC "b"))
662             {
663                 page_out = ESC "s";
664                 /* Technique 1 leave */
665                 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
666                 {
667                     /* Must leave script + enter new page */
668                     plen = strlen(page_out);
669                     memcpy(*outbuf, page_out, plen);
670                     (*outbuf) += plen;
671                     (*outbytesleft) -= plen;
672                     page_out = ESC "(B";
673                 }
674             }
675         }
676         *old_page_chr = page_chr;
677         plen = strlen(page_out);
678         memcpy(*outbuf, page_out, plen);
679         (*outbuf) += plen;
680         (*outbytesleft) -= plen;
681     }
682     return 0;
683 }
684
685
686 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
687                                 char **outbuf, size_t *outbytesleft,
688                                 int loss_mode)
689 {
690     int comb = 0;
691     int enable_ncr = 0;
692     const char *page_chr = 0;
693     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
694
695     if (!y)
696     {
697         if (loss_mode == 0 || cd->my_errno != YAZ_ICONV_EILSEQ)
698             return (size_t) (-1);
699         page_chr = ESC "(B";
700         if (loss_mode == 1)
701             y = '|';
702         else
703         {
704             y = x; 
705             enable_ncr = 1;
706         }
707     }
708
709     if (comb)
710     {
711         if (page_chr)
712         {
713             size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
714                                                 page_chr);
715             if (r)
716                 return r;
717         }
718         if (x == 0x0361)
719             cd->write_marc8_second_half_char = 0xEC;
720         else if (x == 0x0360)
721             cd->write_marc8_second_half_char = 0xFB;
722
723         if (*outbytesleft <= 1)
724         {
725             cd->my_errno = YAZ_ICONV_E2BIG;
726             return (size_t) (-1);
727         }
728         *(*outbuf)++ = y;
729         (*outbytesleft)--;
730     }
731     else
732     {
733         size_t r = flush_combos(cd, outbuf, outbytesleft);
734         if (r)
735             return r;
736
737         cd->write_marc8_last = y;
738         cd->write_marc8_lpage = page_chr;
739         cd->write_marc8_ncr = enable_ncr;
740     }
741     return 0;
742 }
743
744 static size_t yaz_flush_marc8(yaz_iconv_t cd,
745                               char **outbuf, size_t *outbytesleft)
746 {
747     size_t r = flush_combos(cd, outbuf, outbytesleft);
748     if (r)
749         return r;
750     cd->write_marc8_g1 = 0;
751     return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, ESC "(B");
752 }
753
754 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x,
755                                       char **outbuf, size_t *outbytesleft,
756                                       int loss_mode);
757
758 static size_t yaz_write_marc8_normal(yaz_iconv_t cd, unsigned long x,
759                                      char **outbuf, size_t *outbytesleft)
760 {
761     return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 0);
762 }
763
764 static size_t yaz_write_marc8_lossy(yaz_iconv_t cd, unsigned long x,
765                                     char **outbuf, size_t *outbytesleft)
766 {
767     return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 1);
768 }
769
770 static size_t yaz_write_marc8_lossless(yaz_iconv_t cd, unsigned long x,
771                                     char **outbuf, size_t *outbytesleft)
772 {
773     return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 2);
774 }
775
776 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x,
777                                       char **outbuf, size_t *outbytesleft,
778                                       int loss_mode)
779 {
780     if (x >= 0xc0 && x <= 0xff) /* optimization. min and max .y values */
781     {
782         int i;
783         for (i = 0; latin1_comb[i].x1; i++)
784         {
785             if (x == latin1_comb[i].y)
786             {
787                 size_t r ;
788                 /* save the output pointers .. */
789                 char *outbuf0 = *outbuf;
790                 size_t outbytesleft0 = *outbytesleft;
791                 int last_ch = cd->write_marc8_last;
792                 int ncr = cd->write_marc8_ncr;
793                 const char *lpage = cd->write_marc8_lpage;
794                 
795                 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
796                                       outbuf, outbytesleft, loss_mode);
797                 if (r)
798                     return r;
799                 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
800                                       outbuf, outbytesleft, loss_mode);
801                 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
802                 {
803                     /* not enough room. reset output to original values */
804                     *outbuf = outbuf0;
805                     *outbytesleft = outbytesleft0;
806                     cd->write_marc8_last = last_ch;
807                     cd->write_marc8_ncr = ncr;
808                     cd->write_marc8_lpage = lpage;
809                 }
810                 return r;
811             }
812         }
813     }
814     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, loss_mode);
815 }
816
817
#if HAVE_WCHAR_H
/**
 * wchar_t encoder: stores x as one wchar_t (host representation) in the
 * output.
 *
 * Returns 0 on success and (size_t)(-1) with YAZ_ICONV_E2BIG if fewer
 * than sizeof(wchar_t) output bytes are available.
 */
static size_t yaz_write_wchar_t(yaz_iconv_t cd, unsigned long x,
                                char **outbuf, size_t *outbytesleft)
{
    wchar_t wide;

    if (*outbytesleft < sizeof(wide))
    {
        cd->my_errno = YAZ_ICONV_E2BIG;
        return (size_t)(-1);
    }
    wide = x;
    /* memcpy rather than a pointer cast: *outbuf need not be aligned */
    memcpy(*outbuf, &wide, sizeof(wide));
    *outbuf += sizeof(wide);
    (*outbytesleft) -= sizeof(wide);
    return 0;
}
#endif
840
841 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
842 {
843     return cd->read_handle && cd->write_handle;
844 }
845
846 yaz_iconv_t yaz_iconv_open(const char *tocode, const char *fromcode)
847 {
848     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
849
850     cd->write_handle = 0;
851     cd->read_handle = 0;
852     cd->init_handle = 0;
853     cd->flush_handle = 0;
854     cd->my_errno = YAZ_ICONV_UNKNOWN;
855
856     /* a useful hack: if fromcode has leading @,
857        the library not use YAZ's own conversions .. */
858     if (fromcode[0] == '@')
859         fromcode++;
860     else
861     {
862         if (!yaz_matchstr(fromcode, "UTF8"))
863         {
864             cd->read_handle = yaz_read_UTF8;
865             cd->init_handle = yaz_init_UTF8;
866         }
867         else if (!yaz_matchstr(fromcode, "ISO88591"))
868             cd->read_handle = yaz_read_ISO8859_1;
869         else if (!yaz_matchstr(fromcode, "UCS4"))
870             cd->read_handle = yaz_read_UCS4;
871         else if (!yaz_matchstr(fromcode, "UCS4LE"))
872             cd->read_handle = yaz_read_UCS4LE;
873         else if (!yaz_matchstr(fromcode, "MARC8"))
874             cd->read_handle = yaz_read_marc8;
875         else if (!yaz_matchstr(fromcode, "MARC8s"))
876             cd->read_handle = yaz_read_marc8s;
877         else if (!yaz_matchstr(fromcode, "advancegreek"))
878             cd->read_handle = yaz_read_advancegreek;
879         else if (!yaz_matchstr(fromcode, "iso54281984"))
880             cd->read_handle = yaz_read_iso5428_1984;
881         else if (!yaz_matchstr(fromcode, "iso5428:1984"))
882             cd->read_handle = yaz_read_iso5428_1984;
883 #if HAVE_WCHAR_H
884         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
885             cd->read_handle = yaz_read_wchar_t;
886 #endif
887         
888         if (!yaz_matchstr(tocode, "UTF8"))
889             cd->write_handle = yaz_write_UTF8;
890         else if (!yaz_matchstr(tocode, "ISO88591"))
891         {
892             cd->write_handle = yaz_write_ISO8859_1;
893             cd->flush_handle = yaz_flush_ISO8859_1;
894         }
895         else if (!yaz_matchstr(tocode, "UCS4"))
896             cd->write_handle = yaz_write_UCS4;
897         else if (!yaz_matchstr(tocode, "UCS4LE"))
898             cd->write_handle = yaz_write_UCS4LE;
899         else if (!yaz_matchstr(tocode, "MARC8"))
900         {
901             cd->write_handle = yaz_write_marc8_normal;
902             cd->flush_handle = yaz_flush_marc8;
903         }
904         else if (!yaz_matchstr(tocode, "MARC8s"))
905         {
906             cd->write_handle = yaz_write_marc8_normal;
907             cd->flush_handle = yaz_flush_marc8;
908         }
909         else if (!yaz_matchstr(tocode, "MARC8lossy"))
910         {
911             cd->write_handle = yaz_write_marc8_lossy;
912             cd->flush_handle = yaz_flush_marc8;
913         }
914         else if (!yaz_matchstr(tocode, "MARC8lossless"))
915         {
916             cd->write_handle = yaz_write_marc8_lossless;
917             cd->flush_handle = yaz_flush_marc8;
918         }
919         else if (!yaz_matchstr(tocode, "advancegreek"))
920         {
921             cd->write_handle = yaz_write_advancegreek;
922         }
923         else if (!yaz_matchstr(tocode, "iso54281984"))
924         {
925             cd->write_handle = yaz_write_iso5428_1984;
926         }
927         else if (!yaz_matchstr(tocode, "iso5428:1984"))
928         {
929             cd->write_handle = yaz_write_iso5428_1984;
930         }
931 #if HAVE_WCHAR_H
932         else if (!yaz_matchstr(tocode, "WCHAR_T"))
933             cd->write_handle = yaz_write_wchar_t;
934 #endif
935     }
936 #if HAVE_ICONV_H
937     cd->iconv_cd = 0;
938     if (!cd->read_handle || !cd->write_handle)
939     {
940         cd->iconv_cd = iconv_open(tocode, fromcode);
941         if (cd->iconv_cd == (iconv_t) (-1))
942         {
943             xfree(cd);
944             return 0;
945         }
946     }
947 #else
948     if (!cd->read_handle || !cd->write_handle)
949     {
950         xfree(cd);
951         return 0;
952     }
953 #endif
954     cd->init_flag = 1;
955     return cd;
956 }
957
958 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
959                  char **outbuf, size_t *outbytesleft)
960 {
961     char *inbuf0 = 0;
962     size_t r = 0;
963
964 #if HAVE_ICONV_H
965     if (cd->iconv_cd)
966     {
967         size_t r =
968             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
969         if (r == (size_t)(-1))
970         {
971             switch (yaz_errno())
972             {
973             case E2BIG:
974                 cd->my_errno = YAZ_ICONV_E2BIG;
975                 break;
976             case EINVAL:
977                 cd->my_errno = YAZ_ICONV_EINVAL;
978                 break;
979             case EILSEQ:
980                 cd->my_errno = YAZ_ICONV_EILSEQ;
981                 break;
982             default:
983                 cd->my_errno = YAZ_ICONV_UNKNOWN;
984             }
985         }
986         return r;
987     }
988 #endif
989
990     if (inbuf)
991         inbuf0 = *inbuf;
992
993     if (cd->init_flag)
994     {
995         cd->my_errno = YAZ_ICONV_UNKNOWN;
996         cd->g0_mode = 'B';
997         cd->g1_mode = 'E';
998         
999         cd->comb_offset = cd->comb_size = 0;
1000         cd->compose_char = 0;
1001         
1002         cd->write_marc8_second_half_char = 0;
1003         cd->write_marc8_last = 0;
1004         cd->write_marc8_ncr = 0;
1005         cd->write_marc8_lpage = 0;
1006         cd->write_marc8_g0 = ESC "(B";
1007         cd->write_marc8_g1 = 0;
1008         
1009         cd->unget_x = 0;
1010         cd->no_read_x = 0;
1011     }
1012
1013     if (cd->init_flag)
1014     {
1015         if (cd->init_handle && inbuf && *inbuf)
1016         {
1017             size_t no_read = 0;
1018             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1019                                          *inbytesleft, &no_read);
1020             if (r)
1021             {
1022                 if (cd->my_errno == YAZ_ICONV_EINVAL)
1023                     return r;
1024                 cd->init_flag = 0;
1025                 return r;
1026             }
1027             *inbytesleft -= no_read;
1028             *inbuf += no_read;
1029         }
1030     }
1031     cd->init_flag = 0;
1032
1033     if (!inbuf || !*inbuf)
1034     {
1035         if (outbuf && *outbuf)
1036         {
1037             if (cd->unget_x)
1038                 r = (*cd->write_handle)(cd, cd->unget_x, outbuf, outbytesleft);
1039             if (cd->flush_handle)
1040                 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1041         }
1042         if (r == 0)
1043             cd->init_flag = 1;
1044         cd->unget_x = 0;
1045         return r;
1046     }
1047     while (1)
1048     {
1049         unsigned long x;
1050         size_t no_read;
1051
1052         if (cd->unget_x)
1053         {
1054             x = cd->unget_x;
1055             no_read = cd->no_read_x;
1056         }
1057         else
1058         {
1059             if (*inbytesleft == 0)
1060             {
1061                 r = *inbuf - inbuf0;
1062                 break;
1063             }
1064             x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1065                                    &no_read);
1066             if (no_read == 0)
1067             {
1068                 r = (size_t)(-1);
1069                 break;
1070             }
1071         }
1072         if (x)
1073         {
1074             r = (*cd->write_handle)(cd, x, outbuf, outbytesleft);
1075             if (r)
1076             {
1077                 /* unable to write it. save it because read_handle cannot
1078                    rewind .. */
1079                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1080                 {
1081                     cd->unget_x = x;
1082                     cd->no_read_x = no_read;
1083                     break;
1084                 }
1085             }
1086             cd->unget_x = 0;
1087         }
1088         *inbytesleft -= no_read;
1089         (*inbuf) += no_read;
1090     }
1091     return r;
1092 }
1093
1094 int yaz_iconv_error(yaz_iconv_t cd)
1095 {
1096     return cd->my_errno;
1097 }
1098
1099 int yaz_iconv_close(yaz_iconv_t cd)
1100 {
1101 #if HAVE_ICONV_H
1102     if (cd->iconv_cd)
1103         iconv_close(cd->iconv_cd);
1104 #endif
1105     xfree(cd);
1106     return 0;
1107 }
1108
1109 void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
1110 {
1111     cd->my_errno = no;
1112 }
1113
1114 /*
1115  * Local variables:
1116  * c-basic-offset: 4
1117  * indent-tabs-mode: nil
1118  * End:
1119  * vim: shiftwidth=4 tabstop=8 expandtab
1120  */