8deb714f9a7f884b7e3dfd2b2c444fbe93b6ffb7
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2006, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.28 2006-08-30 20:14:51 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36
37 #include <yaz/yaz-util.h>
38
/*
 * MARC-8 code page converters.  Only the prototypes live in this file;
 * a note further down says the yaz_marc8_?_conv handlers are generated
 * by charconv.tcl.
 *
 * Each converter looks at the bytes at inp (inbytesleft of them) and
 * returns the mapped character, storing the number of input bytes used
 * in *no_read and a non-zero value in *combining when the character is
 * a combining one.  A return value of 0 is treated by the callers in
 * this file as "no mapping".
 *
 * yaz_marc8_N_conv: MARC-8 bytes -> Unicode, selected by the escape
 * mode in yaz_read_marc8_comb():
 *   1 = Basic ASCII / ANSEL, 2 = Greek ('g'), 3 = subscripts,
 *   4 = superscripts, 5 = Hebrew, 6 = Cyrillic, 7 = Arabic,
 *   8 = Greek ('S'), 9 = Chinese/Japanese/Korean (EACC).
 */
unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);

/*
 * yaz_marc8r_N_conv: the reverse direction.  lookup_marc8() feeds them
 * the UTF-8 encoding of a character and gets back the MARC-8 code for
 * page N (same page numbering as above), or 0 when the page has no
 * such character.
 */
unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
77
78 struct yaz_iconv_struct {
79     int my_errno;
80     int init_flag;
81     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82                           size_t inbytesleft, size_t *no_read);
83     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84                                  size_t inbytesleft, size_t *no_read);
85     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86                            char **outbuf, size_t *outbytesleft,
87                            int last);
88     int marc8_esc_mode;
89
90     int comb_offset;
91     int comb_size;
92     unsigned long comb_x[8];
93     size_t comb_no_read[8];
94     size_t no_read_x;
95     unsigned long unget_x;
96 #if HAVE_ICONV_H
97     iconv_t iconv_cd;
98 #endif
99     unsigned long compose_char;
100
101     unsigned long write_marc8_comb_ch[8];
102     size_t write_marc8_comb_no;
103     unsigned long write_marc8_last;
104     const char *write_marc8_page_chr;
105 };
106
/*
 * Base letter + combining mark pairs that have a precomposed
 * ISO-8859-1 (Latin-1) equivalent:
 *   x1 = base letter, x2 = Unicode combining mark, y = Latin-1 code.
 * The list ends with an all-zero entry; loops stop when x1 is 0.
 *
 * Combining marks used:
 *   0x0300 grave   0x0301 acute   0x0302 circumflex   0x0303 tilde
 *   0x0308 diaeresis   0x030a ring above   0x0327 cedilla
 *
 * Latin-1 codes in 0xc0-0xff that are absent (0xc6 AE, 0xd0, 0xd7
 * multiplication sign, 0xd8 O with stroke, 0xde thorn, 0xdf sharp s,
 * 0xe6 ae, 0xf0 eth, 0xf7 division sign, 0xf8 o with stroke,
 * 0xfe thorn) have no entry here.
 */
static struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    /* capital letters */
    { 'A', 0x0300, 0xc0}, /* A + grave      */
    { 'A', 0x0301, 0xc1}, /* A + acute      */
    { 'A', 0x0302, 0xc2}, /* A + circumflex */
    { 'A', 0x0303, 0xc3}, /* A + tilde      */
    { 'A', 0x0308, 0xc4}, /* A + diaeresis  */
    { 'A', 0x030a, 0xc5}, /* A + ring above */
    { 'C', 0x0327, 0xc7}, /* C + cedilla    */
    { 'E', 0x0300, 0xc8}, /* E + grave      */
    { 'E', 0x0301, 0xc9}, /* E + acute      */
    { 'E', 0x0302, 0xca}, /* E + circumflex */
    { 'E', 0x0308, 0xcb}, /* E + diaeresis  */
    { 'I', 0x0300, 0xcc}, /* I + grave      */
    { 'I', 0x0301, 0xcd}, /* I + acute      */
    { 'I', 0x0302, 0xce}, /* I + circumflex */
    { 'I', 0x0308, 0xcf}, /* I + diaeresis  */
    { 'N', 0x0303, 0xd1}, /* N + tilde      */
    { 'O', 0x0300, 0xd2}, /* O + grave      */
    { 'O', 0x0301, 0xd3}, /* O + acute      */
    { 'O', 0x0302, 0xd4}, /* O + circumflex */
    { 'O', 0x0303, 0xd5}, /* O + tilde      */
    { 'O', 0x0308, 0xd6}, /* O + diaeresis  */
    { 'U', 0x0300, 0xd9}, /* U + grave      */
    { 'U', 0x0301, 0xda}, /* U + acute      */
    { 'U', 0x0302, 0xdb}, /* U + circumflex */
    { 'U', 0x0308, 0xdc}, /* U + diaeresis  */
    { 'Y', 0x0301, 0xdd}, /* Y + acute      */

    /* small letters */
    { 'a', 0x0300, 0xe0}, /* a + grave      */
    { 'a', 0x0301, 0xe1}, /* a + acute      */
    { 'a', 0x0302, 0xe2}, /* a + circumflex */
    { 'a', 0x0303, 0xe3}, /* a + tilde      */
    { 'a', 0x0308, 0xe4}, /* a + diaeresis  */
    { 'a', 0x030a, 0xe5}, /* a + ring above */
    { 'c', 0x0327, 0xe7}, /* c + cedilla    */
    { 'e', 0x0300, 0xe8}, /* e + grave      */
    { 'e', 0x0301, 0xe9}, /* e + acute      */
    { 'e', 0x0302, 0xea}, /* e + circumflex */
    { 'e', 0x0308, 0xeb}, /* e + diaeresis  */
    { 'i', 0x0300, 0xec}, /* i + grave      */
    { 'i', 0x0301, 0xed}, /* i + acute      */
    { 'i', 0x0302, 0xee}, /* i + circumflex */
    { 'i', 0x0308, 0xef}, /* i + diaeresis  */
    { 'n', 0x0303, 0xf1}, /* n + tilde      */
    { 'o', 0x0300, 0xf2}, /* o + grave      */
    { 'o', 0x0301, 0xf3}, /* o + acute      */
    { 'o', 0x0302, 0xf4}, /* o + circumflex */
    { 'o', 0x0303, 0xf5}, /* o + tilde      */
    { 'o', 0x0308, 0xf6}, /* o + diaeresis  */
    { 'u', 0x0300, 0xf9}, /* u + grave      */
    { 'u', 0x0301, 0xfa}, /* u + acute      */
    { 'u', 0x0302, 0xfb}, /* u + circumflex */
    { 'u', 0x0308, 0xfc}, /* u + diaeresis  */
    { 'y', 0x0301, 0xfd}, /* y + acute      */
    { 'y', 0x0308, 0xff}, /* y + diaeresis  */

    { 0, 0, 0}            /* end marker */
};
177
178 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
179                                          size_t inbytesleft, size_t *no_read)
180 {
181     unsigned long x = inp[0];
182     *no_read = 1;
183     return x;
184 }
185
186 static size_t yaz_init_marc8(yaz_iconv_t cd, unsigned char *inp,
187                              size_t inbytesleft, size_t *no_read)
188 {
189     cd->marc8_esc_mode = 'B';
190
191     cd->comb_offset = cd->comb_size = 0;
192     cd->compose_char = 0;
193
194     cd->write_marc8_comb_no = 0;
195     cd->write_marc8_last = 0;
196     cd->write_marc8_page_chr = "\033(B";
197
198     return 0;
199 }
200
201 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
202                              size_t inbytesleft, size_t *no_read)
203 {
204     if (inp[0] != 0xef)
205     {
206         *no_read = 0;
207         return 0;
208     }
209     if (inbytesleft < 3)
210     {
211         cd->my_errno = YAZ_ICONV_EINVAL;
212         return (size_t) -1;
213     }
214     if (inp[1] != 0xbb && inp[2] == 0xbf)
215         *no_read = 3;
216     else
217         *no_read = 0;
218     return 0;
219 }
220
221 unsigned long yaz_read_UTF8_char(unsigned char *inp,
222                                  size_t inbytesleft, size_t *no_read,
223                                  int *error)
224 {
225     unsigned long x = 0;
226
227     if (inp[0] <= 0x7f)
228     {
229         x = inp[0];
230         *no_read = 1;
231     }
232     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
233     {
234         *no_read = 0;
235         *error = YAZ_ICONV_EILSEQ;
236     }
237     else if (inp[0] <= 0xdf && inbytesleft >= 2)
238     {
239         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
240         if (x >= 0x80)
241             *no_read = 2;
242         else
243         {
244             *no_read = 0;
245             *error = YAZ_ICONV_EILSEQ;
246         }
247     }
248     else if (inp[0] <= 0xef && inbytesleft >= 3)
249     {
250         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
251             (inp[2] & 0x3f);
252         if (x >= 0x800)
253             *no_read = 3;
254         else
255         {
256             *no_read = 0;
257             *error = YAZ_ICONV_EILSEQ;
258         }
259     }
260     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
261     {
262         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
263             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
264         if (x >= 0x10000)
265             *no_read = 4;
266         else
267         {
268             *no_read = 0;
269             *error = YAZ_ICONV_EILSEQ;
270         }
271     }
272     else if (inp[0] <= 0xfb && inbytesleft >= 5)
273     {
274         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
275             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
276             (inp[4] & 0x3f);
277         if (x >= 0x200000)
278             *no_read = 5;
279         else
280         {
281             *no_read = 0;
282             *error = YAZ_ICONV_EILSEQ;
283         }
284     }
285     else if (inp[0] <= 0xfd && inbytesleft >= 6)
286     {
287         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
288             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
289             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
290         if (x >= 0x4000000)
291             *no_read = 6;
292         else
293         {
294             *no_read = 0;
295             *error = YAZ_ICONV_EILSEQ;
296         }
297     }
298     else
299     {
300         *no_read = 0;
301         *error = YAZ_ICONV_EINVAL;
302     }
303     return x;
304 }
305
306 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
307                                     size_t inbytesleft, size_t *no_read)
308 {
309     return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
310 }
311
312 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
313                                     size_t inbytesleft, size_t *no_read)
314 {
315     unsigned long x = 0;
316     
317     if (inbytesleft < 4)
318     {
319         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
320         *no_read = 0;
321     }
322     else
323     {
324         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
325         *no_read = 4;
326     }
327     return x;
328 }
329
330 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
331                                       size_t inbytesleft, size_t *no_read)
332 {
333     unsigned long x = 0;
334     
335     if (inbytesleft < 4)
336     {
337         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
338         *no_read = 0;
339     }
340     else
341     {
342         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
343         *no_read = 4;
344     }
345     return x;
346 }
347
#if HAVE_WCHAR_H
/*
 * Read handler for input that is an array of wchar_t in host
 * representation.
 *
 * Copies sizeof(wchar_t) bytes into a wchar_t (memcpy, so inp needs no
 * particular alignment) and returns its value.  With fewer bytes
 * available nothing is consumed, my_errno is set to YAZ_ICONV_EINVAL
 * and 0 is returned.
 */
static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
                                       size_t inbytesleft, size_t *no_read)
{
    wchar_t wch;

    if (inbytesleft < sizeof(wchar_t))
    {
        cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
        *no_read = 0;
        return 0;
    }
    memcpy(&wch, inp, sizeof(wch));
    *no_read = sizeof(wch);
    return (unsigned long) wch;
}
#endif
369
370
371 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
372                                           size_t inbytesleft, size_t *no_read,
373                                           int *comb);
374
375 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
376                                      size_t inbytesleft, size_t *no_read)
377 {
378     unsigned long x;
379     if (cd->comb_offset < cd->comb_size)
380     {
381         *no_read = cd->comb_no_read[cd->comb_offset];
382         x = cd->comb_x[cd->comb_offset];
383
384         /* special case for double-diacritic combining characters, 
385            INVERTED BREVE and DOUBLE TILDE.
386            We'll increment the no_read counter by 1, since we want to skip over
387            the processing of the closing ligature character
388         */
389         /* this code is no longer necessary.. our handlers code in
390            yaz_marc8_?_conv (generated by charconv.tcl) now returns
391            0 and no_read=1 when a sequence does not match the input.
392            The SECOND HALFs in codetables.xml produces a non-existant
393            entry in the conversion trie.. Hence when met, the input byte is
394            skipped as it should (in yaz_iconv)
395         */
396 #if 0
397         if (x == 0x0361 || x == 0x0360)
398             *no_read += 1;
399 #endif
400         cd->comb_offset++;
401         return x;
402     }
403
404     cd->comb_offset = 0;
405     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
406     {
407         int comb = 0;
408         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
409         if (!comb || !x)
410             break;
411         cd->comb_x[cd->comb_size] = x;
412         cd->comb_no_read[cd->comb_size] = *no_read;
413         inp += *no_read;
414         inbytesleft = inbytesleft - *no_read;
415     }
416     return x;
417 }
418
419 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
420                                      size_t inbytesleft, size_t *no_read)
421 {
422     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
423     if (x && cd->comb_size == 1)
424     {
425         /* For MARC8s we try to get a Latin-1 page code out of it */
426         int i;
427         for (i = 0; latin1_comb[i].x1; i++)
428             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
429             {
430                 *no_read += cd->comb_no_read[0];
431                 cd->comb_size = 0;
432                 x = latin1_comb[i].y;
433                 break;
434             }
435     }
436     return x;
437 }
438
439 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
440                                          size_t inbytesleft, size_t *no_read,
441                                          int *comb)
442 {
443     *no_read = 0;
444     while(inbytesleft >= 1 && inp[0] == 27)
445     {
446         size_t inbytesleft0 = inbytesleft;
447         inp++;
448         inbytesleft--;
449         while(inbytesleft > 0 && strchr("(,$!", *inp))
450         {
451             inbytesleft--;
452             inp++;
453         }
454         if (inbytesleft <= 0)
455         {
456             *no_read = 0;
457             cd->my_errno = YAZ_ICONV_EINVAL;
458             return 0;
459         }
460         cd->marc8_esc_mode = *inp++;
461         inbytesleft--;
462         (*no_read) += inbytesleft0 - inbytesleft;
463     }
464     if (inbytesleft <= 0)
465         return 0;
466     else
467     {
468         unsigned long x;
469         size_t no_read_sub = 0;
470         *comb = 0;
471
472         switch(cd->marc8_esc_mode)
473         {
474         case 'B':  /* Basic ASCII */
475         case 'E':  /* ANSEL */
476         case 's':  /* ASCII */
477             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
478             break;
479         case 'g':  /* Greek */
480             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
481             break;
482         case 'b':  /* Subscripts */
483             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
484             break;
485         case 'p':  /* Superscripts */
486             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
487             break;
488         case '2':  /* Basic Hebrew */
489             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
490             break;
491         case 'N':  /* Basic Cyrillic */
492         case 'Q':  /* Extended Cyrillic */
493             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
494             break;
495         case '3':  /* Basic Arabic */
496         case '4':  /* Extended Arabic */
497             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
498             break;
499         case 'S':  /* Greek */
500             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
501             break;
502         case '1':  /* Chinese, Japanese, Korean (EACC) */
503             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
504             break;
505         default:
506             *no_read = 0;
507             cd->my_errno = YAZ_ICONV_EILSEQ;
508             return 0;
509         }
510         *no_read += no_read_sub;
511         return x;
512     }
513 }
514
515 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
516                              char **outbuf, size_t *outbytesleft,
517                              int last)
518 {
519     return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
520 }
521
522 size_t yaz_write_UTF8_char(unsigned long x,
523                            char **outbuf, size_t *outbytesleft,
524                            int *error)
525 {
526     unsigned char *outp = (unsigned char *) *outbuf;
527
528     if (x <= 0x7f && *outbytesleft >= 1)
529     {
530         *outp++ = (unsigned char) x;
531         (*outbytesleft)--;
532     } 
533     else if (x <= 0x7ff && *outbytesleft >= 2)
534     {
535         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
536         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
537         (*outbytesleft) -= 2;
538     }
539     else if (x <= 0xffff && *outbytesleft >= 3)
540     {
541         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
542         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
543         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
544         (*outbytesleft) -= 3;
545     }
546     else if (x <= 0x1fffff && *outbytesleft >= 4)
547     {
548         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
549         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
550         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
551         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
552         (*outbytesleft) -= 4;
553     }
554     else if (x <= 0x3ffffff && *outbytesleft >= 5)
555     {
556         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
557         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
558         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
559         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
560         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
561         (*outbytesleft) -= 5;
562     }
563     else if (*outbytesleft >= 6)
564     {
565         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
566         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
567         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
568         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
569         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
570         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
571         (*outbytesleft) -= 6;
572     }
573     else 
574     {
575         *error = YAZ_ICONV_E2BIG;  /* not room for output */
576         return (size_t)(-1);
577     }
578     *outbuf = (char *) outp;
579     return 0;
580 }
581
582
583 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
584                                    char **outbuf, size_t *outbytesleft,
585                                    int last)
586 {
587     /* list of two char unicode sequence that, when combined, are
588        equivalent to single unicode chars that can be represented in
589        ISO-8859-1/Latin-1.
590        Regular iconv on Linux at least does not seem to convert these,
591        but since MARC-8 to UTF-8 generates these composed sequence
592        we get a better chance of a successful MARC-8 -> ISO-8859-1
593        conversion */
594     unsigned char *outp = (unsigned char *) *outbuf;
595
596     if (cd->compose_char)
597     {
598         int i;
599         for (i = 0; latin1_comb[i].x1; i++)
600             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
601             {
602                 x = latin1_comb[i].y;
603                 break;
604             }
605         if (*outbytesleft < 1)
606         {  /* no room. Retain compose_char and bail out */
607             cd->my_errno = YAZ_ICONV_E2BIG;
608             return (size_t)(-1);
609         }
610         if (!latin1_comb[i].x1) 
611         {   /* not found. Just write compose_char */
612             *outp++ = (unsigned char) cd->compose_char;
613             (*outbytesleft)--;
614             *outbuf = (char *) outp;
615         }
616         /* compose_char used so reset it. x now holds current char */
617         cd->compose_char = 0;
618     }
619
620     if (!last && x > 32 && x < 127 && cd->compose_char == 0)
621     {
622         cd->compose_char = x;
623         return 0;
624     }
625     else if (x > 255 || x < 1)
626     {
627         cd->my_errno = YAZ_ICONV_EILSEQ;
628         return (size_t) -1;
629     }
630     else if (*outbytesleft < 1)
631     {
632         cd->my_errno = YAZ_ICONV_E2BIG;
633         return (size_t)(-1);
634     }
635     *outp++ = (unsigned char) x;
636     (*outbytesleft)--;
637     *outbuf = (char *) outp;
638     return 0;
639 }
640
641
642 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
643                               char **outbuf, size_t *outbytesleft,
644                               int last)
645 {
646     unsigned char *outp = (unsigned char *) *outbuf;
647     if (*outbytesleft >= 4)
648     {
649         *outp++ = (unsigned char) (x>>24);
650         *outp++ = (unsigned char) (x>>16);
651         *outp++ = (unsigned char) (x>>8);
652         *outp++ = (unsigned char) x;
653         (*outbytesleft) -= 4;
654     }
655     else
656     {
657         cd->my_errno = YAZ_ICONV_E2BIG;
658         return (size_t)(-1);
659     }
660     *outbuf = (char *) outp;
661     return 0;
662 }
663
664 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
665                                 char **outbuf, size_t *outbytesleft,
666                                 int last)
667 {
668     unsigned char *outp = (unsigned char *) *outbuf;
669     if (*outbytesleft >= 4)
670     {
671         *outp++ = (unsigned char) x;
672         *outp++ = (unsigned char) (x>>8);
673         *outp++ = (unsigned char) (x>>16);
674         *outp++ = (unsigned char) (x>>24);
675         (*outbytesleft) -= 4;
676     }
677     else
678     {
679         cd->my_errno = YAZ_ICONV_E2BIG;
680         return (size_t)(-1);
681     }
682     *outbuf = (char *) outp;
683     return 0;
684 }
685
686 static unsigned long lookup_marc8(yaz_iconv_t cd,
687                                   unsigned long x, int *comb,
688                                   const char **page_chr)
689 {
690     char utf8_buf[7];
691     char *utf8_outbuf = utf8_buf;
692     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
693
694     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
695     if (r == (size_t)(-1))
696     {
697         cd->my_errno = YAZ_ICONV_EILSEQ;
698         return 0;
699     }
700     else
701     {
702         unsigned char *inp;
703         size_t inbytesleft, no_read_sub = 0;
704         unsigned long x;
705
706         *utf8_outbuf = '\0';        
707         inp = (unsigned char *) utf8_buf;
708         inbytesleft = strlen(utf8_buf);
709         
710         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
711         if (x)
712         {
713             *page_chr = "\033(B";
714             return x;
715         }
716         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
717         if (x)
718         {
719             *page_chr = "\033g";
720             return x;
721         }
722         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
723         if (x)
724         {
725             *page_chr = "\033b";
726             return x;
727         }
728         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
729         if (x)
730         {
731             *page_chr = "\033p";
732             return x;
733         }
734         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
735         if (x)
736         {
737             *page_chr = "\033(2";
738             return x;
739         }
740         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
741         if (x)
742         {
743             *page_chr = "\033(N";
744             return x;
745         }
746         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
747         if (x)
748         {
749             *page_chr = "\033(3";
750             return x;
751         }
752         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
753         if (x)
754         {
755             *page_chr = "\033(S";
756             return x;
757         }
758         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
759         if (x)
760         {
761             *page_chr = "\033$1";
762             return x;
763         }
764         cd->my_errno = YAZ_ICONV_EILSEQ;
765         return x;
766     }
767 }
768
769 static size_t flush_combos(yaz_iconv_t cd,
770                            char **outbuf, size_t *outbytesleft)
771 {
772     unsigned long y = cd->write_marc8_last;
773     unsigned char byte, second_half = 0;
774     char out_buf[10];
775     size_t i, out_no = 0;
776
777     if (!y)
778         return 0;
779
780     byte = (unsigned char )((y>>16) & 0xff);
781     if (byte)
782         out_buf[out_no++] = byte;
783     byte = (unsigned char)((y>>8) & 0xff);
784     if (byte)
785         out_buf[out_no++] = byte;
786     byte = (unsigned char )(y & 0xff);
787     if (byte)
788         out_buf[out_no++] = byte;
789
790     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
791     {
792         cd->my_errno = YAZ_ICONV_E2BIG;
793         return (size_t) (-1);
794     }
795
796     for (i = 0; i < cd->write_marc8_comb_no; i++)
797     {
798         /* all MARC-8 combined characters are simple bytes */
799         byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
800         if (byte == 0xEB)
801             second_half = 0xEC;
802         else if (byte == 0xFA)
803             second_half = 0xFB;
804
805         *(*outbuf)++ = byte;
806         (*outbytesleft)--;
807     }
808     memcpy(*outbuf, out_buf, out_no);
809     *outbuf += out_no;
810     (*outbytesleft) -= out_no;
811     if (second_half)
812     {
813         *(*outbuf)++ = second_half;
814         (*outbytesleft)--;
815     }        
816
817     cd->write_marc8_last = 0;
818     cd->write_marc8_comb_no = 0;
819     return 0;
820 }
821
822 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
823                                 char **outbuf, size_t *outbytesleft,
824                                 int last)
825 {
826     int comb = 0;
827     const char *page_chr = 0;
828     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
829
830     if (!y)
831         return (size_t) (-1);
832
833     if (comb)
834     {
835         if (cd->write_marc8_comb_no < 6)
836             cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
837     }
838     else
839     {
840         size_t r = flush_combos(cd, outbuf, outbytesleft);
841         const char *old_page_chr = cd->write_marc8_page_chr;
842         if (r)
843             return r;
844         if (strcmp(page_chr, old_page_chr))
845         {
846             size_t plen = 0;
847             const char *page_out = page_chr;
848
849             if (*outbytesleft < 8)
850             {
851                 cd->my_errno = YAZ_ICONV_E2BIG;
852                 
853                 return (size_t) (-1);
854             }
855             cd->write_marc8_page_chr = page_chr;
856
857             if (!strcmp(old_page_chr, "\033p") 
858                 || !strcmp(old_page_chr, "\033g")
859                 || !strcmp(old_page_chr, "\033b"))
860             {
861                 /* Technique 1 leave */
862                 page_out = "\033s";
863                 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
864                 {
865                     /* Must leave script + enter new page */
866                     plen = strlen(page_out);
867                     memcpy(*outbuf, page_out, plen);
868                     (*outbuf) += plen;
869                     (*outbytesleft) -= plen;
870                     page_out = page_chr;
871                 }
872             }
873             plen = strlen(page_out);
874             memcpy(*outbuf, page_out, plen);
875             (*outbuf) += plen;
876             (*outbytesleft) -= plen;
877         }
878         cd->write_marc8_last = y;
879     }
880     if (last)
881     {
882         size_t r = flush_combos(cd, outbuf, outbytesleft);
883         if (r)
884         {
885             if (comb)
886                 cd->write_marc8_comb_no--;
887             else
888                 cd->write_marc8_last = 0;
889             return r;
890         }
891     }
892     return 0;
893 }
894
895 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
896                               char **outbuf, size_t *outbytesleft,
897                               int last)
898 {
899     int i;
900     for (i = 0; latin1_comb[i].x1; i++)
901     {
902         if (x == latin1_comb[i].y)
903         {
904             size_t r ;
905             /* save the output pointers .. */
906             char *outbuf0 = *outbuf;
907             size_t outbytesleft0 = *outbytesleft;
908             int last_ch = cd->write_marc8_last;
909
910             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
911                                   outbuf, outbytesleft, 0);
912             if (r)
913                 return r;
914             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
915                                   outbuf, outbytesleft, last);
916             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
917             {
918                 /* not enough room. reset output to original values */
919                 *outbuf = outbuf0;
920                 *outbytesleft = outbytesleft0;
921                 cd->write_marc8_last = last_ch;
922             }
923             return r;
924         }
925     }
926     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
927 }
928
929
930 #if HAVE_WCHAR_H
931 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
932                                  char **outbuf, size_t *outbytesleft,
933                                  int last)
934 {
935     unsigned char *outp = (unsigned char *) *outbuf;
936
937     if (*outbytesleft >= sizeof(wchar_t))
938     {
939         wchar_t wch = x;
940         memcpy(outp, &wch, sizeof(wch));
941         outp += sizeof(wch);
942         (*outbytesleft) -= sizeof(wch);
943     }
944     else
945     {
946         cd->my_errno = YAZ_ICONV_E2BIG;
947         return (size_t)(-1);
948     }
949     *outbuf = (char *) outp;
950     return 0;
951 }
952 #endif
953
954 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
955 {
956     return cd->read_handle && cd->write_handle;
957 }
958
959 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
960 {
961     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
962
963     cd->write_handle = 0;
964     cd->read_handle = 0;
965     cd->init_handle = 0;
966     cd->my_errno = YAZ_ICONV_UNKNOWN;
967
968     /* a useful hack: if fromcode has leading @,
969        the library not use YAZ's own conversions .. */
970     if (fromcode[0] == '@')
971         fromcode++;
972     else
973     {
974         if (!yaz_matchstr(fromcode, "UTF8"))
975         {
976             cd->read_handle = yaz_read_UTF8;
977             cd->init_handle = yaz_init_UTF8;
978         }
979         else if (!yaz_matchstr(fromcode, "ISO88591"))
980             cd->read_handle = yaz_read_ISO8859_1;
981         else if (!yaz_matchstr(fromcode, "UCS4"))
982             cd->read_handle = yaz_read_UCS4;
983         else if (!yaz_matchstr(fromcode, "UCS4LE"))
984             cd->read_handle = yaz_read_UCS4LE;
985         else if (!yaz_matchstr(fromcode, "MARC8"))
986         {
987             cd->read_handle = yaz_read_marc8;
988             cd->init_handle = yaz_init_marc8;
989         }
990         else if (!yaz_matchstr(fromcode, "MARC8s"))
991         {
992             cd->read_handle = yaz_read_marc8s;
993             cd->init_handle = yaz_init_marc8;
994         }
995 #if HAVE_WCHAR_H
996         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
997             cd->read_handle = yaz_read_wchar_t;
998 #endif
999         
1000         if (!yaz_matchstr(tocode, "UTF8"))
1001             cd->write_handle = yaz_write_UTF8;
1002         else if (!yaz_matchstr(tocode, "ISO88591"))
1003             cd->write_handle = yaz_write_ISO8859_1;
1004         else if (!yaz_matchstr (tocode, "UCS4"))
1005             cd->write_handle = yaz_write_UCS4;
1006         else if (!yaz_matchstr(tocode, "UCS4LE"))
1007             cd->write_handle = yaz_write_UCS4LE;
1008         else if (!yaz_matchstr(tocode, "MARC8"))
1009         {
1010             cd->write_handle = yaz_write_marc8;
1011             cd->init_handle = yaz_init_marc8;
1012         }
1013         else if (!yaz_matchstr(tocode, "MARC8s"))
1014         {
1015             cd->write_handle = yaz_write_marc8;
1016             cd->init_handle = yaz_init_marc8;
1017         }
1018 #if HAVE_WCHAR_H
1019         else if (!yaz_matchstr(tocode, "WCHAR_T"))
1020             cd->write_handle = yaz_write_wchar_t;
1021 #endif
1022     }
1023 #if HAVE_ICONV_H
1024     cd->iconv_cd = 0;
1025     if (!cd->read_handle || !cd->write_handle)
1026     {
1027         cd->iconv_cd = iconv_open (tocode, fromcode);
1028         if (cd->iconv_cd == (iconv_t) (-1))
1029         {
1030             xfree (cd);
1031             return 0;
1032         }
1033     }
1034 #else
1035     if (!cd->read_handle || !cd->write_handle)
1036     {
1037         xfree (cd);
1038         return 0;
1039     }
1040 #endif
1041     cd->init_flag = 1;
1042     return cd;
1043 }
1044
1045 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1046                  char **outbuf, size_t *outbytesleft)
1047 {
1048     char *inbuf0;
1049     size_t r = 0;
1050
1051 #if HAVE_ICONV_H
1052     if (cd->iconv_cd)
1053     {
1054         size_t r =
1055             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1056         if (r == (size_t)(-1))
1057         {
1058             switch (yaz_errno())
1059             {
1060             case E2BIG:
1061                 cd->my_errno = YAZ_ICONV_E2BIG;
1062                 break;
1063             case EINVAL:
1064                 cd->my_errno = YAZ_ICONV_EINVAL;
1065                 break;
1066             case EILSEQ:
1067                 cd->my_errno = YAZ_ICONV_EILSEQ;
1068                 break;
1069             default:
1070                 cd->my_errno = YAZ_ICONV_UNKNOWN;
1071             }
1072         }
1073         return r;
1074     }
1075 #endif
1076     if (inbuf == 0 || *inbuf == 0)
1077     {
1078         cd->init_flag = 1;
1079         cd->my_errno = YAZ_ICONV_UNKNOWN;
1080         return 0;
1081     }
1082     inbuf0 = *inbuf;
1083
1084     if (cd->init_flag)
1085     {
1086         if (cd->init_handle)
1087         {
1088             size_t no_read = 0;
1089             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1090                                          *inbytesleft, &no_read);
1091             if (r)
1092             {
1093                 if (cd->my_errno == YAZ_ICONV_EINVAL)
1094                     return r;
1095                 cd->init_flag = 0;
1096                 return r;
1097             }
1098             *inbytesleft -= no_read;
1099             *inbuf += no_read;
1100         }
1101         cd->init_flag = 0;
1102         cd->unget_x = 0;
1103         cd->no_read_x = 0;
1104     }
1105     while (1)
1106     {
1107         unsigned long x;
1108         size_t no_read;
1109
1110         if (*inbytesleft == 0)
1111         {
1112             r = *inbuf - inbuf0;
1113             break;
1114         }
1115         if (!cd->unget_x)
1116         {
1117             x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1118                                   &no_read);
1119             if (no_read == 0)
1120             {
1121                 r = (size_t)(-1);
1122                 break;
1123             }
1124         }
1125         else
1126         {
1127             x = cd->unget_x;
1128             no_read = cd->no_read_x;
1129         }
1130         if (x)
1131         {
1132             r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1133                                    (*inbytesleft - no_read) == 0 ? 1 : 0);
1134             if (r)
1135             {
1136                 /* unable to write it. save it because read_handle cannot
1137                    rewind .. */
1138                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1139                 {
1140                     cd->unget_x = x;
1141                     cd->no_read_x = no_read;
1142                     break;
1143                 }
1144             }
1145             cd->unget_x = 0;
1146         }
1147         *inbytesleft -= no_read;
1148         (*inbuf) += no_read;
1149     }
1150     return r;
1151 }
1152
1153 int yaz_iconv_error (yaz_iconv_t cd)
1154 {
1155     return cd->my_errno;
1156 }
1157
1158 int yaz_iconv_close (yaz_iconv_t cd)
1159 {
1160 #if HAVE_ICONV_H
1161     if (cd->iconv_cd)
1162         iconv_close (cd->iconv_cd);
1163 #endif
1164     xfree (cd);
1165     return 0;
1166 }
1167
1168 /*
1169  * Local variables:
1170  * c-basic-offset: 4
1171  * indent-tabs-mode: nil
1172  * End:
1173  * vim: shiftwidth=4 tabstop=8 expandtab
1174  */
1175