changed output to be non-cascading when using -n switch
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.32 2007-01-03 08:42:15 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36
37 #include <yaz/yaz-util.h>
38
/* MARC-8 -> Unicode lookup routines, one per code table. They are not
   defined in this file (a comment further down says they are generated
   by charconv.tcl). Each one inspects up to inbytesleft bytes at inp,
   stores the number of bytes consumed in *no_read, sets *combining when
   the character is a combining one, and returns the converted value
   (0 when nothing matched). */
unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);


/* Reverse lookup routines (same numbering of code tables). In this file
   they are fed the UTF-8 encoding of a single character and the value
   they return is written out as MARC-8 bytes; 0 means "not in this
   table". */
unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
77
/* State of one conversion descriptor: the selected reader/writer
   handlers plus the bookkeeping those handlers keep between calls. */
struct yaz_iconv_struct {
    int my_errno;       /* last error, one of the YAZ_ICONV_E* codes */
    int init_flag;      /* NOTE(review): not used in this part of the file;
                           presumably "init_handle still has to run" */
    /* optional start-of-input hook (yaz_init_UTF8 has this signature) */
    size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
                          size_t inbytesleft, size_t *no_read);
    /* decodes one character from the input, reports bytes consumed */
    unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
                                 size_t inbytesleft, size_t *no_read);
    /* encodes one character to the output; last flags the final call */
    size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
                           char **outbuf, size_t *outbytesleft,
                           int last);
    /* final byte of the most recent MARC-8 escape sequence; selects the
       code table used by yaz_read_marc8_comb */
    int marc8_esc_mode;

    /* MARC-8 reader: combining characters read ahead of their base
       character. comb_x/comb_no_read hold comb_size entries of which
       the first comb_offset have already been handed out. */
    int comb_offset;
    int comb_size;
    unsigned long comb_x[8];
    size_t comb_no_read[8];
    size_t no_read_x;       /* NOTE(review): not used in this part of the file */
    unsigned long unget_x;  /* NOTE(review): not used in this part of the file */
#if HAVE_ICONV_H
    iconv_t iconv_cd;   /* presumably the system iconv descriptor used as
                           fallback - confirm in yaz_iconv_open */
#endif
    /* ISO-8859-1 writer: base character held back in case the next
       character is a combining mark that composes with it (0 = none) */
    unsigned long compose_char;

    /* MARC-8 writer: combining characters collected for the pending
       base character, and how many of them are stored */
    unsigned long write_marc8_comb_ch[8];
    size_t write_marc8_comb_no;
    /* byte emitted after the base character for the double diacritics
       (0xEC / 0xFB), 0 when none is pending */
    unsigned write_marc8_second_half_char;
    /* pending base character; up to three MARC-8 bytes packed into the
       low 24 bits, 0 when nothing is pending */
    unsigned long write_marc8_last;
    /* escape sequence of the code page currently selected in the output */
    const char *write_marc8_page_chr;
};
107
/* Table of base character + combining character pairs that have a
   precomposed equivalent in ISO-8859-1.
     x1  base character (ASCII letter)
     x2  Unicode combining character
     y   resulting Latin-1 character
   The table is terminated by an entry with x1 == 0; all users scan it
   linearly until they hit that entry. */
static struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* no need for 0xc6      LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted:    0xd7      MULTIPLICATION SIGN */
    /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
    /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted:    0xe6      LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted:    0xf0      LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted:    0xf7      DIVISION SIGN */
    /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted:    0xfe      LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

    { 0, 0, 0}
};
178
179 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
180                                          size_t inbytesleft, size_t *no_read)
181 {
182     unsigned long x = inp[0];
183     *no_read = 1;
184     return x;
185 }
186
187
188 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
189                              size_t inbytesleft, size_t *no_read)
190 {
191     if (inp[0] != 0xef)
192     {
193         *no_read = 0;
194         return 0;
195     }
196     if (inbytesleft < 3)
197     {
198         cd->my_errno = YAZ_ICONV_EINVAL;
199         return (size_t) -1;
200     }
201     if (inp[1] != 0xbb && inp[2] == 0xbf)
202         *no_read = 3;
203     else
204         *no_read = 0;
205     return 0;
206 }
207
208 unsigned long yaz_read_UTF8_char(unsigned char *inp,
209                                  size_t inbytesleft, size_t *no_read,
210                                  int *error)
211 {
212     unsigned long x = 0;
213
214     if (inp[0] <= 0x7f)
215     {
216         x = inp[0];
217         *no_read = 1;
218     }
219     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
220     {
221         *no_read = 0;
222         *error = YAZ_ICONV_EILSEQ;
223     }
224     else if (inp[0] <= 0xdf && inbytesleft >= 2)
225     {
226         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
227         if (x >= 0x80)
228             *no_read = 2;
229         else
230         {
231             *no_read = 0;
232             *error = YAZ_ICONV_EILSEQ;
233         }
234     }
235     else if (inp[0] <= 0xef && inbytesleft >= 3)
236     {
237         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
238             (inp[2] & 0x3f);
239         if (x >= 0x800)
240             *no_read = 3;
241         else
242         {
243             *no_read = 0;
244             *error = YAZ_ICONV_EILSEQ;
245         }
246     }
247     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
248     {
249         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
250             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
251         if (x >= 0x10000)
252             *no_read = 4;
253         else
254         {
255             *no_read = 0;
256             *error = YAZ_ICONV_EILSEQ;
257         }
258     }
259     else if (inp[0] <= 0xfb && inbytesleft >= 5)
260     {
261         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
262             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
263             (inp[4] & 0x3f);
264         if (x >= 0x200000)
265             *no_read = 5;
266         else
267         {
268             *no_read = 0;
269             *error = YAZ_ICONV_EILSEQ;
270         }
271     }
272     else if (inp[0] <= 0xfd && inbytesleft >= 6)
273     {
274         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
275             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
276             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
277         if (x >= 0x4000000)
278             *no_read = 6;
279         else
280         {
281             *no_read = 0;
282             *error = YAZ_ICONV_EILSEQ;
283         }
284     }
285     else
286     {
287         *no_read = 0;
288         *error = YAZ_ICONV_EINVAL;
289     }
290     return x;
291 }
292
293 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
294                                     size_t inbytesleft, size_t *no_read)
295 {
296     return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
297 }
298
299 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
300                                     size_t inbytesleft, size_t *no_read)
301 {
302     unsigned long x = 0;
303     
304     if (inbytesleft < 4)
305     {
306         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
307         *no_read = 0;
308     }
309     else
310     {
311         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
312         *no_read = 4;
313     }
314     return x;
315 }
316
317 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
318                                       size_t inbytesleft, size_t *no_read)
319 {
320     unsigned long x = 0;
321     
322     if (inbytesleft < 4)
323     {
324         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
325         *no_read = 0;
326     }
327     else
328     {
329         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
330         *no_read = 4;
331     }
332     return x;
333 }
334
335 #if HAVE_WCHAR_H
336 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
337                                        size_t inbytesleft, size_t *no_read)
338 {
339     unsigned long x = 0;
340     
341     if (inbytesleft < sizeof(wchar_t))
342     {
343         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
344         *no_read = 0;
345     }
346     else
347     {
348         wchar_t wch;
349         memcpy (&wch, inp, sizeof(wch));
350         x = wch;
351         *no_read = sizeof(wch);
352     }
353     return x;
354 }
355 #endif
356
357
358 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
359                                           size_t inbytesleft, size_t *no_read,
360                                           int *comb);
361
362 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
363                                      size_t inbytesleft, size_t *no_read)
364 {
365     unsigned long x;
366     if (cd->comb_offset < cd->comb_size)
367     {
368         *no_read = cd->comb_no_read[cd->comb_offset];
369         x = cd->comb_x[cd->comb_offset];
370
371         /* special case for double-diacritic combining characters, 
372            INVERTED BREVE and DOUBLE TILDE.
373            We'll increment the no_read counter by 1, since we want to skip over
374            the processing of the closing ligature character
375         */
376         /* this code is no longer necessary.. our handlers code in
377            yaz_marc8_?_conv (generated by charconv.tcl) now returns
378            0 and no_read=1 when a sequence does not match the input.
379            The SECOND HALFs in codetables.xml produces a non-existant
380            entry in the conversion trie.. Hence when met, the input byte is
381            skipped as it should (in yaz_iconv)
382         */
383 #if 0
384         if (x == 0x0361 || x == 0x0360)
385             *no_read += 1;
386 #endif
387         cd->comb_offset++;
388         return x;
389     }
390
391     cd->comb_offset = 0;
392     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
393     {
394         int comb = 0;
395         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
396         if (!comb || !x)
397             break;
398         cd->comb_x[cd->comb_size] = x;
399         cd->comb_no_read[cd->comb_size] = *no_read;
400         inp += *no_read;
401         inbytesleft = inbytesleft - *no_read;
402     }
403     return x;
404 }
405
406 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
407                                      size_t inbytesleft, size_t *no_read)
408 {
409     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
410     if (x && cd->comb_size == 1)
411     {
412         /* For MARC8s we try to get a Latin-1 page code out of it */
413         int i;
414         for (i = 0; latin1_comb[i].x1; i++)
415             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
416             {
417                 *no_read += cd->comb_no_read[0];
418                 cd->comb_size = 0;
419                 x = latin1_comb[i].y;
420                 break;
421             }
422     }
423     return x;
424 }
425
426 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
427                                          size_t inbytesleft, size_t *no_read,
428                                          int *comb)
429 {
430     *no_read = 0;
431     while(inbytesleft >= 1 && inp[0] == 27)
432     {
433         size_t inbytesleft0 = inbytesleft;
434         inp++;
435         inbytesleft--;
436         while(inbytesleft > 0 && strchr("(,$!)-", *inp))
437         {
438             inbytesleft--;
439             inp++;
440         }
441         if (inbytesleft <= 0)
442         {
443             *no_read = 0;
444             cd->my_errno = YAZ_ICONV_EINVAL;
445             return 0;
446         }
447         cd->marc8_esc_mode = *inp++;
448         inbytesleft--;
449         (*no_read) += inbytesleft0 - inbytesleft;
450     }
451     if (inbytesleft <= 0)
452         return 0;
453     else
454     {
455         unsigned long x;
456         size_t no_read_sub = 0;
457         *comb = 0;
458
459         switch(cd->marc8_esc_mode)
460         {
461         case 'B':  /* Basic ASCII */
462         case 'E':  /* ANSEL */
463         case 's':  /* ASCII */
464             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
465             break;
466         case 'g':  /* Greek */
467             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
468             break;
469         case 'b':  /* Subscripts */
470             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
471             break;
472         case 'p':  /* Superscripts */
473             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
474             break;
475         case '2':  /* Basic Hebrew */
476             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
477             break;
478         case 'N':  /* Basic Cyrillic */
479         case 'Q':  /* Extended Cyrillic */
480             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
481             break;
482         case '3':  /* Basic Arabic */
483         case '4':  /* Extended Arabic */
484             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
485             break;
486         case 'S':  /* Greek */
487             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
488             break;
489         case '1':  /* Chinese, Japanese, Korean (EACC) */
490             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
491             break;
492         default:
493             *no_read = 0;
494             cd->my_errno = YAZ_ICONV_EILSEQ;
495             return 0;
496         }
497         *no_read += no_read_sub;
498         return x;
499     }
500 }
501
502 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
503                              char **outbuf, size_t *outbytesleft,
504                              int last)
505 {
506     return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
507 }
508
509 size_t yaz_write_UTF8_char(unsigned long x,
510                            char **outbuf, size_t *outbytesleft,
511                            int *error)
512 {
513     unsigned char *outp = (unsigned char *) *outbuf;
514
515     if (x <= 0x7f && *outbytesleft >= 1)
516     {
517         *outp++ = (unsigned char) x;
518         (*outbytesleft)--;
519     } 
520     else if (x <= 0x7ff && *outbytesleft >= 2)
521     {
522         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
523         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
524         (*outbytesleft) -= 2;
525     }
526     else if (x <= 0xffff && *outbytesleft >= 3)
527     {
528         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
529         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
530         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
531         (*outbytesleft) -= 3;
532     }
533     else if (x <= 0x1fffff && *outbytesleft >= 4)
534     {
535         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
536         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
537         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
538         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
539         (*outbytesleft) -= 4;
540     }
541     else if (x <= 0x3ffffff && *outbytesleft >= 5)
542     {
543         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
544         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
545         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
546         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
547         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
548         (*outbytesleft) -= 5;
549     }
550     else if (*outbytesleft >= 6)
551     {
552         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
553         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
554         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
555         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
556         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
557         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
558         (*outbytesleft) -= 6;
559     }
560     else 
561     {
562         *error = YAZ_ICONV_E2BIG;  /* not room for output */
563         return (size_t)(-1);
564     }
565     *outbuf = (char *) outp;
566     return 0;
567 }
568
569
570 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
571                                    char **outbuf, size_t *outbytesleft,
572                                    int last)
573 {
574     /* list of two char unicode sequence that, when combined, are
575        equivalent to single unicode chars that can be represented in
576        ISO-8859-1/Latin-1.
577        Regular iconv on Linux at least does not seem to convert these,
578        but since MARC-8 to UTF-8 generates these composed sequence
579        we get a better chance of a successful MARC-8 -> ISO-8859-1
580        conversion */
581     unsigned char *outp = (unsigned char *) *outbuf;
582
583     if (cd->compose_char)
584     {
585         int i;
586         for (i = 0; latin1_comb[i].x1; i++)
587             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
588             {
589                 x = latin1_comb[i].y;
590                 break;
591             }
592         if (*outbytesleft < 1)
593         {  /* no room. Retain compose_char and bail out */
594             cd->my_errno = YAZ_ICONV_E2BIG;
595             return (size_t)(-1);
596         }
597         if (!latin1_comb[i].x1) 
598         {   /* not found. Just write compose_char */
599             *outp++ = (unsigned char) cd->compose_char;
600             (*outbytesleft)--;
601             *outbuf = (char *) outp;
602         }
603         /* compose_char used so reset it. x now holds current char */
604         cd->compose_char = 0;
605     }
606
607     if (!last && x > 32 && x < 127 && cd->compose_char == 0)
608     {
609         cd->compose_char = x;
610         return 0;
611     }
612     else if (x > 255 || x < 1)
613     {
614         cd->my_errno = YAZ_ICONV_EILSEQ;
615         return (size_t) -1;
616     }
617     else if (*outbytesleft < 1)
618     {
619         cd->my_errno = YAZ_ICONV_E2BIG;
620         return (size_t)(-1);
621     }
622     *outp++ = (unsigned char) x;
623     (*outbytesleft)--;
624     *outbuf = (char *) outp;
625     return 0;
626 }
627
628
629 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
630                               char **outbuf, size_t *outbytesleft,
631                               int last)
632 {
633     unsigned char *outp = (unsigned char *) *outbuf;
634     if (*outbytesleft >= 4)
635     {
636         *outp++ = (unsigned char) (x>>24);
637         *outp++ = (unsigned char) (x>>16);
638         *outp++ = (unsigned char) (x>>8);
639         *outp++ = (unsigned char) x;
640         (*outbytesleft) -= 4;
641     }
642     else
643     {
644         cd->my_errno = YAZ_ICONV_E2BIG;
645         return (size_t)(-1);
646     }
647     *outbuf = (char *) outp;
648     return 0;
649 }
650
651 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
652                                 char **outbuf, size_t *outbytesleft,
653                                 int last)
654 {
655     unsigned char *outp = (unsigned char *) *outbuf;
656     if (*outbytesleft >= 4)
657     {
658         *outp++ = (unsigned char) x;
659         *outp++ = (unsigned char) (x>>8);
660         *outp++ = (unsigned char) (x>>16);
661         *outp++ = (unsigned char) (x>>24);
662         (*outbytesleft) -= 4;
663     }
664     else
665     {
666         cd->my_errno = YAZ_ICONV_E2BIG;
667         return (size_t)(-1);
668     }
669     *outbuf = (char *) outp;
670     return 0;
671 }
672
673 static unsigned long lookup_marc8(yaz_iconv_t cd,
674                                   unsigned long x, int *comb,
675                                   const char **page_chr)
676 {
677     char utf8_buf[7];
678     char *utf8_outbuf = utf8_buf;
679     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
680
681     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
682     if (r == (size_t)(-1))
683     {
684         cd->my_errno = YAZ_ICONV_EILSEQ;
685         return 0;
686     }
687     else
688     {
689         unsigned char *inp;
690         size_t inbytesleft, no_read_sub = 0;
691         unsigned long x;
692
693         *utf8_outbuf = '\0';        
694         inp = (unsigned char *) utf8_buf;
695         inbytesleft = strlen(utf8_buf);
696         
697         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
698         if (x)
699         {
700             *page_chr = "\033(B";
701             return x;
702         }
703         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
704         if (x)
705         {
706             *page_chr = "\033g";
707             return x;
708         }
709         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
710         if (x)
711         {
712             *page_chr = "\033b";
713             return x;
714         }
715         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
716         if (x)
717         {
718             *page_chr = "\033p";
719             return x;
720         }
721         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
722         if (x)
723         {
724             *page_chr = "\033(2";
725             return x;
726         }
727         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
728         if (x)
729         {
730             *page_chr = "\033(N";
731             return x;
732         }
733         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
734         if (x)
735         {
736             *page_chr = "\033(3";
737             return x;
738         }
739         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
740         if (x)
741         {
742             *page_chr = "\033(S";
743             return x;
744         }
745         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
746         if (x)
747         {
748             *page_chr = "\033$1";
749             return x;
750         }
751         cd->my_errno = YAZ_ICONV_EILSEQ;
752         return x;
753     }
754 }
755
756 static size_t flush_combos(yaz_iconv_t cd,
757                            char **outbuf, size_t *outbytesleft)
758 {
759     unsigned long y = cd->write_marc8_last;
760     unsigned char byte;
761     char out_buf[10];
762     size_t i, out_no = 0;
763
764     if (!y)
765         return 0;
766
767     byte = (unsigned char )((y>>16) & 0xff);
768     if (byte)
769         out_buf[out_no++] = byte;
770     byte = (unsigned char)((y>>8) & 0xff);
771     if (byte)
772         out_buf[out_no++] = byte;
773     byte = (unsigned char )(y & 0xff);
774     if (byte)
775         out_buf[out_no++] = byte;
776
777     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
778     {
779         cd->my_errno = YAZ_ICONV_E2BIG;
780         return (size_t) (-1);
781     }
782
783     for (i = 0; i < cd->write_marc8_comb_no; i++)
784     {
785         /* all MARC-8 combined characters are simple bytes */
786         byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
787         *(*outbuf)++ = byte;
788         (*outbytesleft)--;
789     }
790     memcpy(*outbuf, out_buf, out_no);
791     *outbuf += out_no;
792     (*outbytesleft) -= out_no;
793     if (cd->write_marc8_second_half_char)
794     {
795         *(*outbuf)++ = cd->write_marc8_second_half_char;
796         (*outbytesleft)--;
797     }        
798
799     cd->write_marc8_last = 0;
800     cd->write_marc8_comb_no = 0;
801     cd->write_marc8_second_half_char = 0;
802     return 0;
803 }
804
/* Core of the MARC-8 writer: handles one character x.
 *
 * MARC-8 wants combining characters in front of their base character
 * while the incoming stream has them behind it. Hence nothing is
 * written for x itself right away:
 *  - a combining character is added to cd->write_marc8_comb_ch;
 *  - any other character first flushes the previous base character
 *    (flush_combos), emits an escape sequence if its code page differs
 *    from the current one, and then becomes the pending base character
 *    (cd->write_marc8_last).
 * When last is set the pending state is flushed at the end as well.
 *
 * Returns 0 on success and (size_t)(-1) on failure; my_errno is
 * YAZ_ICONV_EILSEQ when x has no MARC-8 equivalent (set by
 * lookup_marc8) or YAZ_ICONV_E2BIG when the output is full.
 */
static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
                                char **outbuf, size_t *outbytesleft,
                                int last)
{
    int comb = 0;
    const char *page_chr = 0;
    unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);

    if (!y)
        return (size_t) (-1);

    if (comb)
    {
        /* the two double diacritics need a closing "second half" byte
           behind the base character; flush_combos emits it */
        if (x == 0x0361)
            cd->write_marc8_second_half_char = 0xEC;
        else if (x == 0x0360)
            cd->write_marc8_second_half_char = 0xFB;

        /* NOTE(review): a 7th and further combining character is
           dropped silently although the array has 8 slots - confirm
           that this is intended */
        if (cd->write_marc8_comb_no < 6)
            cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
    }
    else
    {
        /* new base character: get the previous one out first */
        size_t r = flush_combos(cd, outbuf, outbytesleft);
        const char *old_page_chr = cd->write_marc8_page_chr;
        if (r)
            return r;
        if (strcmp(page_chr, old_page_chr))
        {
            size_t plen = 0;
            const char *page_out = page_chr;

            /* conservative bound covering the escape sequences below */
            if (*outbytesleft < 8)
            {
                cd->my_errno = YAZ_ICONV_E2BIG;

                return (size_t) (-1);
            }
            cd->write_marc8_page_chr = page_chr;

            if (!strcmp(old_page_chr, "\033p") 
                || !strcmp(old_page_chr, "\033g")
                || !strcmp(old_page_chr, "\033b"))
            {
                /* Technique 1 leave */
                page_out = "\033s";
                if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
                {
                    /* Must leave script + enter new page */
                    plen = strlen(page_out);
                    memcpy(*outbuf, page_out, plen);
                    (*outbuf) += plen;
                    (*outbytesleft) -= plen;
                    page_out = page_chr;
                }
            }
            plen = strlen(page_out);
            memcpy(*outbuf, page_out, plen);
            (*outbuf) += plen;
            (*outbytesleft) -= plen;
        }
        cd->write_marc8_last = y;
    }
    if (last)
    {
        size_t r = flush_combos(cd, outbuf, outbytesleft);
        if (r)
        {
            /* flush failed: take x out of the pending state again so
               that the caller can offer it once more.
               NOTE(review): if the combining character was dropped
               above (no free slot), this decrement removes an older
               entry instead - verify */
            if (comb)
                cd->write_marc8_comb_no--;
            else
                cd->write_marc8_last = 0;
            return r;
        }
    }
    return 0;
}
882
883 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
884                               char **outbuf, size_t *outbytesleft,
885                               int last)
886 {
887     int i;
888     for (i = 0; latin1_comb[i].x1; i++)
889     {
890         if (x == latin1_comb[i].y)
891         {
892             size_t r ;
893             /* save the output pointers .. */
894             char *outbuf0 = *outbuf;
895             size_t outbytesleft0 = *outbytesleft;
896             int last_ch = cd->write_marc8_last;
897
898             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
899                                   outbuf, outbytesleft, 0);
900             if (r)
901                 return r;
902             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
903                                   outbuf, outbytesleft, last);
904             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
905             {
906                 /* not enough room. reset output to original values */
907                 *outbuf = outbuf0;
908                 *outbytesleft = outbytesleft0;
909                 cd->write_marc8_last = last_ch;
910             }
911             return r;
912         }
913     }
914     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
915 }
916
917
918 #if HAVE_WCHAR_H
919 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
920                                  char **outbuf, size_t *outbytesleft,
921                                  int last)
922 {
923     unsigned char *outp = (unsigned char *) *outbuf;
924
925     if (*outbytesleft >= sizeof(wchar_t))
926     {
927         wchar_t wch = x;
928         memcpy(outp, &wch, sizeof(wch));
929         outp += sizeof(wch);
930         (*outbytesleft) -= sizeof(wch);
931     }
932     else
933     {
934         cd->my_errno = YAZ_ICONV_E2BIG;
935         return (size_t)(-1);
936     }
937     *outbuf = (char *) outp;
938     return 0;
939 }
940 #endif
941
942 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
943 {
944     return cd->read_handle && cd->write_handle;
945 }
946
947 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
948 {
949     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
950
951     cd->write_handle = 0;
952     cd->read_handle = 0;
953     cd->init_handle = 0;
954     cd->my_errno = YAZ_ICONV_UNKNOWN;
955
956     /* a useful hack: if fromcode has leading @,
957        the library not use YAZ's own conversions .. */
958     if (fromcode[0] == '@')
959         fromcode++;
960     else
961     {
962         if (!yaz_matchstr(fromcode, "UTF8"))
963         {
964             cd->read_handle = yaz_read_UTF8;
965             cd->init_handle = yaz_init_UTF8;
966         }
967         else if (!yaz_matchstr(fromcode, "ISO88591"))
968             cd->read_handle = yaz_read_ISO8859_1;
969         else if (!yaz_matchstr(fromcode, "UCS4"))
970             cd->read_handle = yaz_read_UCS4;
971         else if (!yaz_matchstr(fromcode, "UCS4LE"))
972             cd->read_handle = yaz_read_UCS4LE;
973         else if (!yaz_matchstr(fromcode, "MARC8"))
974             cd->read_handle = yaz_read_marc8;
975         else if (!yaz_matchstr(fromcode, "MARC8s"))
976             cd->read_handle = yaz_read_marc8s;
977 #if HAVE_WCHAR_H
978         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
979             cd->read_handle = yaz_read_wchar_t;
980 #endif
981         
982         if (!yaz_matchstr(tocode, "UTF8"))
983             cd->write_handle = yaz_write_UTF8;
984         else if (!yaz_matchstr(tocode, "ISO88591"))
985             cd->write_handle = yaz_write_ISO8859_1;
986         else if (!yaz_matchstr (tocode, "UCS4"))
987             cd->write_handle = yaz_write_UCS4;
988         else if (!yaz_matchstr(tocode, "UCS4LE"))
989             cd->write_handle = yaz_write_UCS4LE;
990         else if (!yaz_matchstr(tocode, "MARC8"))
991             cd->write_handle = yaz_write_marc8;
992         else if (!yaz_matchstr(tocode, "MARC8s"))
993             cd->write_handle = yaz_write_marc8;
994 #if HAVE_WCHAR_H
995         else if (!yaz_matchstr(tocode, "WCHAR_T"))
996             cd->write_handle = yaz_write_wchar_t;
997 #endif
998     }
999 #if HAVE_ICONV_H
1000     cd->iconv_cd = 0;
1001     if (!cd->read_handle || !cd->write_handle)
1002     {
1003         cd->iconv_cd = iconv_open (tocode, fromcode);
1004         if (cd->iconv_cd == (iconv_t) (-1))
1005         {
1006             xfree (cd);
1007             return 0;
1008         }
1009     }
1010 #else
1011     if (!cd->read_handle || !cd->write_handle)
1012     {
1013         xfree (cd);
1014         return 0;
1015     }
1016 #endif
1017     cd->init_flag = 1;
1018     return cd;
1019 }
1020
1021 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1022                  char **outbuf, size_t *outbytesleft)
1023 {
1024     char *inbuf0;
1025     size_t r = 0;
1026
1027 #if HAVE_ICONV_H
1028     if (cd->iconv_cd)
1029     {
1030         size_t r =
1031             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1032         if (r == (size_t)(-1))
1033         {
1034             switch (yaz_errno())
1035             {
1036             case E2BIG:
1037                 cd->my_errno = YAZ_ICONV_E2BIG;
1038                 break;
1039             case EINVAL:
1040                 cd->my_errno = YAZ_ICONV_EINVAL;
1041                 break;
1042             case EILSEQ:
1043                 cd->my_errno = YAZ_ICONV_EILSEQ;
1044                 break;
1045             default:
1046                 cd->my_errno = YAZ_ICONV_UNKNOWN;
1047             }
1048         }
1049         return r;
1050     }
1051 #endif
1052     if (inbuf == 0 || *inbuf == 0)
1053     {
1054         cd->init_flag = 1;
1055         cd->my_errno = YAZ_ICONV_UNKNOWN;
1056         return 0;
1057     }
1058     inbuf0 = *inbuf;
1059
1060     if (cd->init_flag)
1061     {
1062         if (cd->init_handle)
1063         {
1064             size_t no_read = 0;
1065             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1066                                          *inbytesleft, &no_read);
1067             if (r)
1068             {
1069                 if (cd->my_errno == YAZ_ICONV_EINVAL)
1070                     return r;
1071                 cd->init_flag = 0;
1072                 return r;
1073             }
1074             *inbytesleft -= no_read;
1075             *inbuf += no_read;
1076         }
1077         cd->marc8_esc_mode = 'B';
1078         
1079         cd->comb_offset = cd->comb_size = 0;
1080         cd->compose_char = 0;
1081         
1082         cd->write_marc8_comb_no = 0;
1083         cd->write_marc8_second_half_char = 0;
1084         cd->write_marc8_last = 0;
1085         cd->write_marc8_page_chr = "\033(B";
1086         
1087         cd->init_flag = 0;
1088         cd->unget_x = 0;
1089         cd->no_read_x = 0;
1090     }
1091     while (1)
1092     {
1093         unsigned long x;
1094         size_t no_read;
1095
1096         if (*inbytesleft == 0)
1097         {
1098             r = *inbuf - inbuf0;
1099             break;
1100         }
1101         if (!cd->unget_x)
1102         {
1103             x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1104                                   &no_read);
1105             if (no_read == 0)
1106             {
1107                 r = (size_t)(-1);
1108                 break;
1109             }
1110         }
1111         else
1112         {
1113             x = cd->unget_x;
1114             no_read = cd->no_read_x;
1115         }
1116         if (x)
1117         {
1118             r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1119                                    (*inbytesleft - no_read) == 0 ? 1 : 0);
1120             if (r)
1121             {
1122                 /* unable to write it. save it because read_handle cannot
1123                    rewind .. */
1124                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1125                 {
1126                     cd->unget_x = x;
1127                     cd->no_read_x = no_read;
1128                     break;
1129                 }
1130             }
1131             cd->unget_x = 0;
1132         }
1133         *inbytesleft -= no_read;
1134         (*inbuf) += no_read;
1135     }
1136     return r;
1137 }
1138
1139 int yaz_iconv_error (yaz_iconv_t cd)
1140 {
1141     return cd->my_errno;
1142 }
1143
1144 int yaz_iconv_close (yaz_iconv_t cd)
1145 {
1146 #if HAVE_ICONV_H
1147     if (cd->iconv_cd)
1148         iconv_close (cd->iconv_cd);
1149 #endif
1150     xfree (cd);
1151     return 0;
1152 }
1153
1154 /*
1155  * Local variables:
1156  * c-basic-offset: 4
1157  * indent-tabs-mode: nil
1158  * End:
1159  * vim: shiftwidth=4 tabstop=8 expandtab
1160  */
1161