Factor iconv conversions to separate C files.
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2008, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.50 2008-03-12 08:53:28 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36 #include <yaz/xmalloc.h>
37 #include <yaz/nmem.h>
38 #include "iconv-p.h"
39
40
/* MARC-8 decoding routines, one per MARC-8 character set; they are only
   declared here and are defined outside this file (a comment in
   yaz_read_marc8 says they are generated by charconv.tcl).
   The two hex digits in each name are the ASCII code of the escape
   sequence final character that yaz_read_marc8_comb uses to select the
   routine (42 = 'B', 45 = 'E', 67 = 'g', 62 = 'b', 70 = 'p', 32 = '2',
   4E = 'N', 51 = 'Q', 33 = '3', 34 = '4', 53 = 'S', 31 = '1').
   As used in this file: the return value is the decoded character (0 when
   nothing matched), *no_read receives the number of input bytes consumed
   and *combining is set non-zero for a combining character. */
41 unsigned long yaz_marc8_42_conv(unsigned char *inp, size_t inbytesleft,
42                                size_t *no_read, int *combining);
43 unsigned long yaz_marc8_45_conv(unsigned char *inp, size_t inbytesleft,
44                                size_t *no_read, int *combining);
45 unsigned long yaz_marc8_67_conv(unsigned char *inp, size_t inbytesleft,
46                                size_t *no_read, int *combining);
47 unsigned long yaz_marc8_62_conv(unsigned char *inp, size_t inbytesleft,
48                                size_t *no_read, int *combining);
49 unsigned long yaz_marc8_70_conv(unsigned char *inp, size_t inbytesleft,
50                                size_t *no_read, int *combining);
51 unsigned long yaz_marc8_32_conv(unsigned char *inp, size_t inbytesleft,
52                                size_t *no_read, int *combining);
53 unsigned long yaz_marc8_4E_conv(unsigned char *inp, size_t inbytesleft,
54                                size_t *no_read, int *combining);
55 unsigned long yaz_marc8_51_conv(unsigned char *inp, size_t inbytesleft,
56                                size_t *no_read, int *combining);
57 unsigned long yaz_marc8_33_conv(unsigned char *inp, size_t inbytesleft,
58                                size_t *no_read, int *combining);
59 unsigned long yaz_marc8_34_conv(unsigned char *inp, size_t inbytesleft,
60                                size_t *no_read, int *combining);
61 unsigned long yaz_marc8_53_conv(unsigned char *inp, size_t inbytesleft,
62                                size_t *no_read, int *combining);
63 unsigned long yaz_marc8_31_conv(unsigned char *inp, size_t inbytesleft,
64                                size_t *no_read, int *combining);
65
66
/* Reverse (to MARC-8) lookup routines, one per MARC-8 character set; only
   declared here, defined outside this file. lookup_marc8 feeds them the
   UTF-8 encoding of a single character and treats a non-zero return value
   as the MARC-8 code of that character in the corresponding set;
   *combining is set non-zero for combining characters. The hex suffix
   follows the same naming as the yaz_marc8_XX_conv routines. */
67 unsigned long yaz_marc8r_42_conv(unsigned char *inp, size_t inbytesleft,
68                                  size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_45_conv(unsigned char *inp, size_t inbytesleft,
70                                  size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_67_conv(unsigned char *inp, size_t inbytesleft,
72                                  size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_62_conv(unsigned char *inp, size_t inbytesleft,
74                                  size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_70_conv(unsigned char *inp, size_t inbytesleft,
76                                  size_t *no_read, int *combining);
77 unsigned long yaz_marc8r_32_conv(unsigned char *inp, size_t inbytesleft,
78                                  size_t *no_read, int *combining);
79 unsigned long yaz_marc8r_4E_conv(unsigned char *inp, size_t inbytesleft,
80                                  size_t *no_read, int *combining);
81 unsigned long yaz_marc8r_51_conv(unsigned char *inp, size_t inbytesleft,
82                                  size_t *no_read, int *combining);
83 unsigned long yaz_marc8r_33_conv(unsigned char *inp, size_t inbytesleft,
84                                  size_t *no_read, int *combining);
85 unsigned long yaz_marc8r_34_conv(unsigned char *inp, size_t inbytesleft,
86                                  size_t *no_read, int *combining);
87 unsigned long yaz_marc8r_53_conv(unsigned char *inp, size_t inbytesleft,
88                                  size_t *no_read, int *combining);
89 unsigned long yaz_marc8r_31_conv(unsigned char *inp, size_t inbytesleft,
90                                  size_t *no_read, int *combining);
91
/* Conversion descriptor: holds the selected reader/writer handlers plus
   the per-conversion state that the built-in readers and writers in this
   file keep between calls. */
92 struct yaz_iconv_struct {
93     int my_errno; /* last error, one of the YAZ_ICONV_* codes */
94     int init_flag; /* not referenced in the visible part of this file -- TODO confirm use */
95     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
96                           size_t inbytesleft, size_t *no_read); /* optional; set for UTF-8 input in yaz_iconv_open */
97     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
98                                  size_t inbytesleft, size_t *no_read); /* decodes one character, reports bytes consumed in *no_read */
99     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
100                            char **outbuf, size_t *outbytesleft); /* encodes character x, advancing *outbuf */
101     size_t (*flush_handle)(yaz_iconv_t cd,
102                            char **outbuf, size_t *outbytesleft); /* optional; writes out state still held by the writer */
103     int g0_mode; /* MARC-8 reader: final character of the set used for bytes < 128 */
104     int g1_mode; /* MARC-8 reader: final character of the set used for bytes >= 128 */
105
106     int comb_offset; /* MARC-8 reader: next queued combining character to hand out */
107     int comb_size; /* MARC-8 reader: number of queued combining characters (at most 8) */
108     unsigned long comb_x[8]; /* MARC-8 reader: queued combining characters */
109     size_t comb_no_read[8]; /* MARC-8 reader: input bytes consumed by each queued character */
110     size_t no_read_x; /* not referenced in the visible part of this file -- TODO confirm use */
111     unsigned long unget_x; /* not referenced in the visible part of this file -- TODO confirm use */
112 #if HAVE_ICONV_H
113     iconv_t iconv_cd; /* system iconv descriptor; opened when a built-in reader or writer is missing */
114 #endif
115     unsigned long compose_char; /* ISO-8859-1 writer: held-back character that may combine with the next one */
116
117     unsigned write_marc8_second_half_char; /* MARC-8 writer: second-half byte (0xEC/0xFB) to emit after the pending character, 0 if none */
118     unsigned long write_marc8_last; /* MARC-8 writer: pending non-combining character (MARC-8 code), 0 if none */
119     const char *write_marc8_lpage; /* MARC-8 writer: page escape sequence belonging to write_marc8_last */
120     const char *write_marc8_g0; /* MARC-8 writer: page escape last recorded for G0 */
121     const char *write_marc8_g1; /* MARC-8 writer: page escape last recorded for G1 */
122 };
123
124
/* Table of Latin-1 characters that have a two-character Unicode
   decomposition: x1 is the base letter, x2 the combining diacritic and
   y the precomposed ISO-8859-1 code. The table is terminated by an entry
   with x1 == 0. It is only ever read, hence const. */
static const struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* omitted:    0xc6      LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted:    0xd7      MULTIPLICATION SIGN */
    /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
    /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted:    0xe6      LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted:    0xf0      LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted:    0xf7      DIVISION SIGN */
    /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted:    0xfe      LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

    { 0, 0, 0} /* end-of-table marker */
};
195
/* The escape character (octal 033) as a string literal, so that it can be
   concatenated with the rest of a MARC-8 escape sequence: ESC "(B" */
196 #define ESC "\033"
197
/* Forward declaration; defined further down. Declared here because
   flush_combos calls it before its definition. */
198 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
199                                        char **outbuf, size_t *outbytesleft,
200                                        const char *page_chr);
201
202 static unsigned long yaz_read_ISO8859_1(yaz_iconv_t cd, unsigned char *inp,
203                                         size_t inbytesleft, size_t *no_read)
204 {
205     unsigned long x = inp[0];
206     *no_read = 1;
207     return x;
208 }
209
210
211
#if HAVE_WCHAR_H
/* Reader for native wchar_t input. Consumes sizeof(wchar_t) bytes and
   returns the character. When fewer bytes are available the input is
   incomplete: my_errno is set to YAZ_ICONV_EINVAL, *no_read to 0 and
   0 is returned. */
static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
                                       size_t inbytesleft, size_t *no_read)
{
    wchar_t wide;

    if (inbytesleft < sizeof(wide))
    {
        cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
        *no_read = 0;
        return 0;
    }
    /* memcpy: inp need not be aligned for wchar_t */
    memcpy (&wide, inp, sizeof(wide));
    *no_read = sizeof(wide);
    return wide;
}
#endif
233
234
235 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
236                                           size_t inbytesleft, size_t *no_read,
237                                           int *comb);
238
239 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
240                                      size_t inbytesleft, size_t *no_read)
241 {
242     unsigned long x;
243     if (cd->comb_offset < cd->comb_size)
244     {
245         *no_read = cd->comb_no_read[cd->comb_offset];
246         x = cd->comb_x[cd->comb_offset];
247
248         /* special case for double-diacritic combining characters, 
249            INVERTED BREVE and DOUBLE TILDE.
250            We'll increment the no_read counter by 1, since we want to skip over
251            the processing of the closing ligature character
252         */
253         /* this code is no longer necessary.. our handlers code in
254            yaz_marc8_?_conv (generated by charconv.tcl) now returns
255            0 and no_read=1 when a sequence does not match the input.
256            The SECOND HALFs in codetables.xml produces a non-existant
257            entry in the conversion trie.. Hence when met, the input byte is
258            skipped as it should (in yaz_iconv)
259         */
260 #if 0
261         if (x == 0x0361 || x == 0x0360)
262             *no_read += 1;
263 #endif
264         cd->comb_offset++;
265         return x;
266     }
267
268     cd->comb_offset = 0;
269     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
270     {
271         int comb = 0;
272
273         if (inbytesleft == 0 && cd->comb_size)
274         {
275             cd->my_errno = YAZ_ICONV_EINVAL;
276             x = 0;
277             *no_read = 0;
278             break;
279         }
280         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
281         if (!comb || !x)
282             break;
283         cd->comb_x[cd->comb_size] = x;
284         cd->comb_no_read[cd->comb_size] = *no_read;
285         inp += *no_read;
286         inbytesleft = inbytesleft - *no_read;
287     }
288     return x;
289 }
290
291 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
292                                      size_t inbytesleft, size_t *no_read)
293 {
294     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
295     if (x && cd->comb_size == 1)
296     {
297         /* For MARC8s we try to get a Latin-1 page code out of it */
298         int i;
299         for (i = 0; latin1_comb[i].x1; i++)
300             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
301             {
302                 *no_read += cd->comb_no_read[0];
303                 cd->comb_size = 0;
304                 x = latin1_comb[i].y;
305                 break;
306             }
307     }
308     return x;
309 }
310
311 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
312                                          size_t inbytesleft, size_t *no_read,
313                                          int *comb)
314 {
315     *no_read = 0;
316     while (inbytesleft > 0 && *inp == 27)
317     {
318         int *modep = &cd->g0_mode;
319         size_t inbytesleft0 = inbytesleft;
320
321         inbytesleft--;
322         inp++;
323         if (inbytesleft == 0)
324             goto incomplete;
325         if (*inp == '$') /* set with multiple bytes */
326         {
327             inbytesleft--;
328             inp++;
329         }
330         if (inbytesleft == 0)
331             goto incomplete;
332         if (*inp == '(' || *inp == ',')  /* G0 */
333         {
334             inbytesleft--;
335             inp++;
336         }
337         else if (*inp == ')' || *inp == '-') /* G1 */
338         {
339             inbytesleft--;
340             inp++;
341             modep = &cd->g1_mode;
342         }
343         if (inbytesleft == 0)
344             goto incomplete;
345         if (*inp == '!') /* ANSEL is a special case */
346         {
347             inbytesleft--;
348             inp++;
349         }
350         if (inbytesleft == 0)
351             goto incomplete;
352         *modep = *inp++; /* Final character */
353         inbytesleft--;
354
355         (*no_read) += inbytesleft0 - inbytesleft;
356     }
357     if (inbytesleft == 0)
358         return 0;
359     else if (*inp == ' ')
360     {
361         *no_read += 1;
362         return ' ';
363     }
364     else
365     {
366         unsigned long x;
367         size_t no_read_sub = 0;
368         int mode = *inp < 128 ? cd->g0_mode : cd->g1_mode;
369         *comb = 0;
370
371         switch(mode)
372         {
373         case 'B':  /* Basic ASCII */
374         case 's':  /* ASCII */
375         case 'E':  /* ANSEL */
376             x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb);
377             if (!x)
378             {
379                 no_read_sub = 0;
380                 x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb);
381             }
382             break;
383         case 'g':  /* Greek */
384             x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb);
385             break;
386         case 'b':  /* Subscripts */
387             x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb);
388             break;
389         case 'p':  /* Superscripts */
390             x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb);
391             break;
392         case '2':  /* Basic Hebrew */
393             x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb);
394             break;
395         case 'N':  /* Basic Cyrillic */
396             x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb);
397             break;
398         case 'Q':  /* Extended Cyrillic */
399             x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb);
400             break;
401         case '3':  /* Basic Arabic */
402             x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb);
403             break;
404         case '4':  /* Extended Arabic */
405             x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb);
406             break;
407         case 'S':  /* Greek */
408             x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb);
409             break;
410         case '1':  /* Chinese, Japanese, Korean (EACC) */
411             x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb);
412             break;
413         default:
414             *no_read = 0;
415             cd->my_errno = YAZ_ICONV_EILSEQ;
416             return 0;
417         }
418         *no_read += no_read_sub;
419         return x;
420     }
421 incomplete:
422     *no_read = 0;
423     cd->my_errno = YAZ_ICONV_EINVAL;
424     return 0;
425 }
426
427 static size_t yaz_write_ISO8859_1(yaz_iconv_t cd, unsigned long x,
428                                   char **outbuf, size_t *outbytesleft)
429 {
430     /* list of two char unicode sequence that, when combined, are
431        equivalent to single unicode chars that can be represented in
432        ISO-8859-1/Latin-1.
433        Regular iconv on Linux at least does not seem to convert these,
434        but since MARC-8 to UTF-8 generates these composed sequence
435        we get a better chance of a successful MARC-8 -> ISO-8859-1
436        conversion */
437     unsigned char *outp = (unsigned char *) *outbuf;
438
439     if (cd->compose_char)
440     {
441         int i;
442         for (i = 0; latin1_comb[i].x1; i++)
443             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
444             {
445                 x = latin1_comb[i].y;
446                 break;
447             }
448         if (*outbytesleft < 1)
449         {  /* no room. Retain compose_char and bail out */
450             cd->my_errno = YAZ_ICONV_E2BIG;
451             return (size_t)(-1);
452         }
453         if (!latin1_comb[i].x1) 
454         {   /* not found. Just write compose_char */
455             *outp++ = (unsigned char) cd->compose_char;
456             (*outbytesleft)--;
457             *outbuf = (char *) outp;
458         }
459         /* compose_char used so reset it. x now holds current char */
460         cd->compose_char = 0;
461     }
462
463     if (x > 32 && x < 127 && cd->compose_char == 0)
464     {
465         cd->compose_char = x;
466         return 0;
467     }
468     else if (x > 255 || x < 1)
469     {
470         cd->my_errno = YAZ_ICONV_EILSEQ;
471         return (size_t) -1;
472     }
473     else if (*outbytesleft < 1)
474     {
475         cd->my_errno = YAZ_ICONV_E2BIG;
476         return (size_t)(-1);
477     }
478     *outp++ = (unsigned char) x;
479     (*outbytesleft)--;
480     *outbuf = (char *) outp;
481     return 0;
482 }
483
484 static size_t yaz_flush_ISO8859_1(yaz_iconv_t cd,
485                                   char **outbuf, size_t *outbytesleft)
486 {
487     if (cd->compose_char)
488     {
489         unsigned char *outp = (unsigned char *) *outbuf;
490         if (*outbytesleft < 1)
491         {
492             cd->my_errno = YAZ_ICONV_E2BIG;
493             return (size_t)(-1);
494         }
495         *outp++ = (unsigned char) cd->compose_char;
496         (*outbytesleft)--;
497         *outbuf = (char *) outp;
498         cd->compose_char = 0;
499     }
500     return 0;
501 }
502
503 static unsigned long lookup_marc8(yaz_iconv_t cd,
504                                   unsigned long x, int *comb,
505                                   const char **page_chr)
506 {
507     char utf8_buf[7];
508     char *utf8_outbuf = utf8_buf;
509     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
510
511     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft);
512     if (r == (size_t)(-1))
513     {
514         cd->my_errno = YAZ_ICONV_EILSEQ;
515         return 0;
516     }
517     else
518     {
519         unsigned char *inp;
520         size_t inbytesleft, no_read_sub = 0;
521         unsigned long x;
522
523         *utf8_outbuf = '\0';        
524         inp = (unsigned char *) utf8_buf;
525         inbytesleft = strlen(utf8_buf);
526
527         x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb);
528         if (x)
529         {
530             *page_chr = ESC "(B";
531             return x;
532         }
533         x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb);
534         if (x)
535         {
536             *page_chr = ESC "(B";
537             return x;
538         }
539         x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb);
540         if (x)
541         {
542             *page_chr = ESC "b";
543             return x;
544         }
545         x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb);
546         if (x)
547         {
548             *page_chr = ESC "p";
549             return x;
550         }
551         x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb);
552         if (x)
553         {
554             *page_chr = ESC "(2";
555             return x;
556         }
557         x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb);
558         if (x)
559         {
560             *page_chr = ESC "(N";
561             return x;
562         }
563         x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb);
564         if (x)
565         {
566             *page_chr = ESC "(Q";
567             return x;
568         }
569         x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb);
570         if (x)
571         {
572             *page_chr = ESC "(3";
573             return x;
574         }
575         x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb);
576         if (x)
577         {
578             *page_chr = ESC "(4";
579             return x;
580         }
581         x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb);
582         if (x)
583         {
584             *page_chr = ESC "(S";
585             return x;
586         }
587         x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb);
588         if (x)
589         {
590             *page_chr = ESC "$1";
591             return x;
592         }
593         cd->my_errno = YAZ_ICONV_EILSEQ;
594         return x;
595     }
596 }
597
598 static size_t flush_combos(yaz_iconv_t cd,
599                            char **outbuf, size_t *outbytesleft)
600 {
601     unsigned long y = cd->write_marc8_last;
602     unsigned char byte;
603     char out_buf[4];
604     size_t out_no = 0;
605
606     if (!y)
607         return 0;
608
609     assert(cd->write_marc8_lpage);
610     if (cd->write_marc8_lpage)
611     {
612         size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
613                                             cd->write_marc8_lpage);
614         if (r)
615             return r;
616     }
617
618     byte = (unsigned char )((y>>16) & 0xff);
619     if (byte)
620         out_buf[out_no++] = byte;
621     byte = (unsigned char)((y>>8) & 0xff);
622     if (byte)
623         out_buf[out_no++] = byte;
624     byte = (unsigned char )(y & 0xff);
625     if (byte)
626         out_buf[out_no++] = byte;
627
628     if (out_no + 2 >= *outbytesleft)
629     {
630         cd->my_errno = YAZ_ICONV_E2BIG;
631         return (size_t) (-1);
632     }
633
634     memcpy(*outbuf, out_buf, out_no);
635     *outbuf += out_no;
636     (*outbytesleft) -= out_no;
637     if (cd->write_marc8_second_half_char)
638     {
639         *(*outbuf)++ = cd->write_marc8_second_half_char;
640         (*outbytesleft)--;
641     }        
642
643     cd->write_marc8_last = 0;
644     cd->write_marc8_lpage = 0;
645     cd->write_marc8_second_half_char = 0;
646     return 0;
647 }
648
649 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
650                                        char **outbuf, size_t *outbytesleft,
651                                        const char *page_chr)
652 {
653     const char **old_page_chr = &cd->write_marc8_g0;
654
655     /* are we going to a G1-set (such as such as ESC ")!E") */
656     if (page_chr && page_chr[1] == ')')
657         old_page_chr = &cd->write_marc8_g1;
658
659     if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
660     {
661         size_t plen = 0;
662         const char *page_out = page_chr;
663         
664         if (*outbytesleft < 8)
665         {
666             cd->my_errno = YAZ_ICONV_E2BIG;
667             
668             return (size_t) (-1);
669         }
670
671         if (*old_page_chr)
672         {
673             if (!strcmp(*old_page_chr, ESC "p") 
674                 || !strcmp(*old_page_chr, ESC "g")
675                 || !strcmp(*old_page_chr, ESC "b"))
676             {
677                 page_out = ESC "s";
678                 /* Technique 1 leave */
679                 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
680                 {
681                     /* Must leave script + enter new page */
682                     plen = strlen(page_out);
683                     memcpy(*outbuf, page_out, plen);
684                     (*outbuf) += plen;
685                     (*outbytesleft) -= plen;
686                     page_out = ESC "(B";
687                 }
688             }
689         }
690         *old_page_chr = page_chr;
691         plen = strlen(page_out);
692         memcpy(*outbuf, page_out, plen);
693         (*outbuf) += plen;
694         (*outbytesleft) -= plen;
695     }
696     return 0;
697 }
698
699
700 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
701                                 char **outbuf, size_t *outbytesleft)
702 {
703     int comb = 0;
704     const char *page_chr = 0;
705     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
706
707     if (!y)
708         return (size_t) (-1);
709
710     if (comb)
711     {
712         if (page_chr)
713         {
714             size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
715                                                 page_chr);
716             if (r)
717                 return r;
718         }
719         if (x == 0x0361)
720             cd->write_marc8_second_half_char = 0xEC;
721         else if (x == 0x0360)
722             cd->write_marc8_second_half_char = 0xFB;
723
724         if (*outbytesleft <= 1)
725         {
726             cd->my_errno = YAZ_ICONV_E2BIG;
727             return (size_t) (-1);
728         }
729         *(*outbuf)++ = y;
730         (*outbytesleft)--;
731     }
732     else
733     {
734         size_t r = flush_combos(cd, outbuf, outbytesleft);
735         if (r)
736             return r;
737
738         cd->write_marc8_last = y;
739         cd->write_marc8_lpage = page_chr;
740     }
741     return 0;
742 }
743
744 static size_t yaz_flush_marc8(yaz_iconv_t cd,
745                               char **outbuf, size_t *outbytesleft)
746 {
747     size_t r = flush_combos(cd, outbuf, outbytesleft);
748     if (r)
749         return r;
750     cd->write_marc8_g1 = 0;
751     return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, ESC "(B");
752 }
753
754 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
755                               char **outbuf, size_t *outbytesleft)
756 {
757     int i;
758     for (i = 0; latin1_comb[i].x1; i++)
759     {
760         if (x == latin1_comb[i].y)
761         {
762             size_t r ;
763             /* save the output pointers .. */
764             char *outbuf0 = *outbuf;
765             size_t outbytesleft0 = *outbytesleft;
766             int last_ch = cd->write_marc8_last;
767             const char *lpage = cd->write_marc8_lpage;
768
769             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
770                                   outbuf, outbytesleft);
771             if (r)
772                 return r;
773             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
774                                   outbuf, outbytesleft);
775             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
776             {
777                 /* not enough room. reset output to original values */
778                 *outbuf = outbuf0;
779                 *outbytesleft = outbytesleft0;
780                 cd->write_marc8_last = last_ch;
781                 cd->write_marc8_lpage = lpage;
782             }
783             return r;
784         }
785     }
786     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft);
787 }
788
789
#if HAVE_WCHAR_H
/* Writer for native wchar_t output: stores x as one wchar_t.
   Returns 0 on success. When fewer than sizeof(wchar_t) bytes are
   available nothing is written, my_errno is set to YAZ_ICONV_E2BIG and
   (size_t)(-1) is returned. */
static size_t yaz_write_wchar_t(yaz_iconv_t cd, unsigned long x,
                                char **outbuf, size_t *outbytesleft)
{
    wchar_t wide;

    if (*outbytesleft < sizeof(wide))
    {
        cd->my_errno = YAZ_ICONV_E2BIG;
        return (size_t)(-1);
    }
    wide = x;
    /* memcpy: the output position need not be aligned for wchar_t */
    memcpy(*outbuf, &wide, sizeof(wide));
    *outbuf += sizeof(wide);
    (*outbytesleft) -= sizeof(wide);
    return 0;
}
#endif
812
813 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
814 {
815     return cd->read_handle && cd->write_handle;
816 }
817
818 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
819 {
820     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
821
822     cd->write_handle = 0;
823     cd->read_handle = 0;
824     cd->init_handle = 0;
825     cd->flush_handle = 0;
826     cd->my_errno = YAZ_ICONV_UNKNOWN;
827
828     /* a useful hack: if fromcode has leading @,
829        the library not use YAZ's own conversions .. */
830     if (fromcode[0] == '@')
831         fromcode++;
832     else
833     {
834         if (!yaz_matchstr(fromcode, "UTF8"))
835         {
836             cd->read_handle = yaz_read_UTF8;
837             cd->init_handle = yaz_init_UTF8;
838         }
839         else if (!yaz_matchstr(fromcode, "ISO88591"))
840             cd->read_handle = yaz_read_ISO8859_1;
841         else if (!yaz_matchstr(fromcode, "UCS4"))
842             cd->read_handle = yaz_read_UCS4;
843         else if (!yaz_matchstr(fromcode, "UCS4LE"))
844             cd->read_handle = yaz_read_UCS4LE;
845         else if (!yaz_matchstr(fromcode, "MARC8"))
846             cd->read_handle = yaz_read_marc8;
847         else if (!yaz_matchstr(fromcode, "MARC8s"))
848             cd->read_handle = yaz_read_marc8s;
849         else if (!yaz_matchstr(fromcode, "advancegreek"))
850             cd->read_handle = yaz_read_advancegreek;
851         else if (!yaz_matchstr(fromcode, "iso54281984"))
852             cd->read_handle = yaz_read_iso5428_1984;
853         else if (!yaz_matchstr(fromcode, "iso5428:1984"))
854             cd->read_handle = yaz_read_iso5428_1984;
855 #if HAVE_WCHAR_H
856         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
857             cd->read_handle = yaz_read_wchar_t;
858 #endif
859         
860         if (!yaz_matchstr(tocode, "UTF8"))
861             cd->write_handle = yaz_write_UTF8;
862         else if (!yaz_matchstr(tocode, "ISO88591"))
863         {
864             cd->write_handle = yaz_write_ISO8859_1;
865             cd->flush_handle = yaz_flush_ISO8859_1;
866         }
867         else if (!yaz_matchstr (tocode, "UCS4"))
868             cd->write_handle = yaz_write_UCS4;
869         else if (!yaz_matchstr(tocode, "UCS4LE"))
870             cd->write_handle = yaz_write_UCS4LE;
871         else if (!yaz_matchstr(tocode, "MARC8"))
872         {
873             cd->write_handle = yaz_write_marc8;
874             cd->flush_handle = yaz_flush_marc8;
875         }
876         else if (!yaz_matchstr(tocode, "MARC8s"))
877         {
878             cd->write_handle = yaz_write_marc8;
879             cd->flush_handle = yaz_flush_marc8;
880         }
881         else if (!yaz_matchstr(tocode, "advancegreek"))
882         {
883             cd->write_handle = yaz_write_advancegreek;
884         }
885         else if (!yaz_matchstr(tocode, "iso54281984"))
886         {
887             cd->write_handle = yaz_write_iso5428_1984;
888         }
889         else if (!yaz_matchstr(tocode, "iso5428:1984"))
890         {
891             cd->write_handle = yaz_write_iso5428_1984;
892         }
893 #if HAVE_WCHAR_H
894         else if (!yaz_matchstr(tocode, "WCHAR_T"))
895             cd->write_handle = yaz_write_wchar_t;
896 #endif
897     }
898 #if HAVE_ICONV_H
899     cd->iconv_cd = 0;
900     if (!cd->read_handle || !cd->write_handle)
901     {
902         cd->iconv_cd = iconv_open (tocode, fromcode);
903         if (cd->iconv_cd == (iconv_t) (-1))
904         {
905             xfree (cd);
906             return 0;
907         }
908     }
909 #else
910     if (!cd->read_handle || !cd->write_handle)
911     {
912         xfree (cd);
913         return 0;
914     }
915 #endif
916     cd->init_flag = 1;
917     return cd;
918 }
919
920 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
921                  char **outbuf, size_t *outbytesleft)
922 {
923     char *inbuf0 = 0;
924     size_t r = 0;
925
926 #if HAVE_ICONV_H
927     if (cd->iconv_cd)
928     {
929         size_t r =
930             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
931         if (r == (size_t)(-1))
932         {
933             switch (yaz_errno())
934             {
935             case E2BIG:
936                 cd->my_errno = YAZ_ICONV_E2BIG;
937                 break;
938             case EINVAL:
939                 cd->my_errno = YAZ_ICONV_EINVAL;
940                 break;
941             case EILSEQ:
942                 cd->my_errno = YAZ_ICONV_EILSEQ;
943                 break;
944             default:
945                 cd->my_errno = YAZ_ICONV_UNKNOWN;
946             }
947         }
948         return r;
949     }
950 #endif
951
952     if (inbuf)
953         inbuf0 = *inbuf;
954
955     if (cd->init_flag)
956     {
957         cd->my_errno = YAZ_ICONV_UNKNOWN;
958         cd->g0_mode = 'B';
959         cd->g1_mode = 'B';
960         
961         cd->comb_offset = cd->comb_size = 0;
962         cd->compose_char = 0;
963         
964         cd->write_marc8_second_half_char = 0;
965         cd->write_marc8_last = 0;
966         cd->write_marc8_lpage = 0;
967         cd->write_marc8_g0 = ESC "(B";
968         cd->write_marc8_g1 = 0;
969         
970         cd->unget_x = 0;
971         cd->no_read_x = 0;
972     }
973
974     if (cd->init_flag)
975     {
976         if (cd->init_handle && inbuf && *inbuf)
977         {
978             size_t no_read = 0;
979             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
980                                          *inbytesleft, &no_read);
981             if (r)
982             {
983                 if (cd->my_errno == YAZ_ICONV_EINVAL)
984                     return r;
985                 cd->init_flag = 0;
986                 return r;
987             }
988             *inbytesleft -= no_read;
989             *inbuf += no_read;
990         }
991     }
992     cd->init_flag = 0;
993
994     if (!inbuf || !*inbuf)
995     {
996         if (outbuf && *outbuf)
997         {
998             if (cd->unget_x)
999                 r = (*cd->write_handle)(cd, cd->unget_x, outbuf, outbytesleft);
1000             if (cd->flush_handle)
1001                 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1002         }
1003         if (r == 0)
1004             cd->init_flag = 1;
1005         cd->unget_x = 0;
1006         return r;
1007     }
1008     while (1)
1009     {
1010         unsigned long x;
1011         size_t no_read;
1012
1013         if (cd->unget_x)
1014         {
1015             x = cd->unget_x;
1016             no_read = cd->no_read_x;
1017         }
1018         else
1019         {
1020             if (*inbytesleft == 0)
1021             {
1022                 r = *inbuf - inbuf0;
1023                 break;
1024             }
1025             x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1026                                    &no_read);
1027             if (no_read == 0)
1028             {
1029                 r = (size_t)(-1);
1030                 break;
1031             }
1032         }
1033         if (x)
1034         {
1035             r = (*cd->write_handle)(cd, x, outbuf, outbytesleft);
1036             if (r)
1037             {
1038                 /* unable to write it. save it because read_handle cannot
1039                    rewind .. */
1040                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1041                 {
1042                     cd->unget_x = x;
1043                     cd->no_read_x = no_read;
1044                     break;
1045                 }
1046             }
1047             cd->unget_x = 0;
1048         }
1049         *inbytesleft -= no_read;
1050         (*inbuf) += no_read;
1051     }
1052     return r;
1053 }
1054
1055 int yaz_iconv_error (yaz_iconv_t cd)
1056 {
1057     return cd->my_errno;
1058 }
1059
1060 int yaz_iconv_close (yaz_iconv_t cd)
1061 {
1062 #if HAVE_ICONV_H
1063     if (cd->iconv_cd)
1064         iconv_close (cd->iconv_cd);
1065 #endif
1066     xfree (cd);
1067     return 0;
1068 }
1069
1070 void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
1071 {
1072     cd->my_errno = no;
1073 }
1074
1075 /*
1076  * Local variables:
1077  * c-basic-offset: 4
1078  * indent-tabs-mode: nil
1079  * End:
1080  * vim: shiftwidth=4 tabstop=8 expandtab
1081  */