Added skeleton for query charset conversion. Bug #977.
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.37 2007-03-20 21:37:32 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36
37 #include <yaz/yaz-util.h>
38
/*
 * MARC-8 to Unicode page handlers, one per character set table.
 * These yaz_marc8_?_conv functions are generated by charconv.tcl and are
 * defined outside this file.  Each one is given the input buffer and the
 * number of bytes left, returns a character value, stores the number of
 * bytes consumed in *no_read and reports through *combining whether the
 * character is a combining one.  A return value of 0 with *no_read == 1
 * means the input sequence did not match the table.
 */
unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);


/*
 * NOTE(review): the yaz_marc8r_?_conv functions are not called in the part
 * of the file visible here; presumably they are the reverse
 * (Unicode to MARC-8) tables used by the MARC-8 writer -- confirm.
 */
unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
77
/**
 * State of one conversion descriptor (the object a yaz_iconv_t refers to).
 * It holds the handlers selected for the source and target character sets
 * and the state those handlers keep between calls.
 */
struct yaz_iconv_struct {
    int my_errno;   /* YAZ_ICONV_* error code stored by the handlers */
    int init_flag;  /* NOTE(review): not used in the visible part of the
                       file; presumably tells whether init_handle must
                       still be run -- confirm */
    /* inspects the start of the input (e.g. for a byte order mark) and
       stores the number of bytes to skip in *no_read */
    size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
                          size_t inbytesleft, size_t *no_read);
    /* decodes one character from inbuf; *no_read is set to the number of
       bytes consumed, 0 on failure */
    unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
                                 size_t inbytesleft, size_t *no_read);
    /* encodes character x at *outbuf and advances *outbuf / *outbytesleft */
    size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
                           char **outbuf, size_t *outbytesleft);
    /* NOTE(review): presumably emits output still pending in the writer
       state; no flush handler is visible here -- confirm */
    size_t (*flush_handle)(yaz_iconv_t cd,
                           char **outbuf, size_t *outbytesleft);
    /* final byte of the most recent MARC-8 escape sequence; selects the
       conversion table in yaz_read_marc8_comb */
    int marc8_esc_mode;

    /* combining characters buffered by yaz_read_marc8: comb_size entries
       are stored, comb_offset is the next one to hand out */
    int comb_offset;
    int comb_size;
    unsigned long comb_x[8];      /* buffered combining characters */
    size_t comb_no_read[8];       /* input bytes consumed by each of them */
    size_t no_read_x;             /* NOTE(review): use not visible here */
    unsigned long unget_x;        /* NOTE(review): use not visible here */
#if HAVE_ICONV_H
    iconv_t iconv_cd;             /* descriptor of the system iconv */
#endif
    /* character held back by yaz_write_ISO8859_1 so that it can be
       composed with a following combining character */
    unsigned long compose_char;

    /* NOTE(review): the fields below are not used in the visible part of
       the file; presumably state of the MARC-8 writer -- confirm */
    unsigned long write_marc8_comb_ch[8];
    size_t write_marc8_comb_no;
    unsigned write_marc8_second_half_char;
    unsigned long write_marc8_last;
    const char *write_marc8_page_chr;
};
108
/*
 * Composition table for ISO-8859-1.
 * Each entry gives a base character (x1), a Unicode combining mark (x2)
 * and the single Latin-1 code (y) the pair is equivalent to.
 * The table ends with an all-zero entry; lookups stop when x1 is 0.
 * Latin-1 codes that are not a base + combining mark pair are omitted.
 */
static struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* omitted:    0xc6      LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    /* omitted:    0xd0      LATIN CAPITAL LETTER ETH */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted:    0xd7      MULTIPLICATION SIGN */
    /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
    /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted:    0xe6      LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted:    0xf0      LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted:    0xf7      DIVISION SIGN */
    /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted:    0xfe      LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
    
    { 0, 0, 0}            /* terminator */
};
179
180 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
181                                          size_t inbytesleft, size_t *no_read)
182 {
183     unsigned long x = inp[0];
184     *no_read = 1;
185     return x;
186 }
187
188
189 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
190                              size_t inbytesleft, size_t *no_read)
191 {
192     if (inp[0] != 0xef)
193     {
194         *no_read = 0;
195         return 0;
196     }
197     if (inbytesleft < 3)
198     {
199         cd->my_errno = YAZ_ICONV_EINVAL;
200         return (size_t) -1;
201     }
202     if (inp[1] != 0xbb && inp[2] == 0xbf)
203         *no_read = 3;
204     else
205         *no_read = 0;
206     return 0;
207 }
208
209 unsigned long yaz_read_UTF8_char(unsigned char *inp,
210                                  size_t inbytesleft, size_t *no_read,
211                                  int *error)
212 {
213     unsigned long x = 0;
214
215     if (inp[0] <= 0x7f)
216     {
217         x = inp[0];
218         *no_read = 1;
219     }
220     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
221     {
222         *no_read = 0;
223         *error = YAZ_ICONV_EILSEQ;
224     }
225     else if (inp[0] <= 0xdf && inbytesleft >= 2)
226     {
227         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
228         if (x >= 0x80)
229             *no_read = 2;
230         else
231         {
232             *no_read = 0;
233             *error = YAZ_ICONV_EILSEQ;
234         }
235     }
236     else if (inp[0] <= 0xef && inbytesleft >= 3)
237     {
238         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
239             (inp[2] & 0x3f);
240         if (x >= 0x800)
241             *no_read = 3;
242         else
243         {
244             *no_read = 0;
245             *error = YAZ_ICONV_EILSEQ;
246         }
247     }
248     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
249     {
250         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
251             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
252         if (x >= 0x10000)
253             *no_read = 4;
254         else
255         {
256             *no_read = 0;
257             *error = YAZ_ICONV_EILSEQ;
258         }
259     }
260     else if (inp[0] <= 0xfb && inbytesleft >= 5)
261     {
262         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
263             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
264             (inp[4] & 0x3f);
265         if (x >= 0x200000)
266             *no_read = 5;
267         else
268         {
269             *no_read = 0;
270             *error = YAZ_ICONV_EILSEQ;
271         }
272     }
273     else if (inp[0] <= 0xfd && inbytesleft >= 6)
274     {
275         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
276             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
277             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
278         if (x >= 0x4000000)
279             *no_read = 6;
280         else
281         {
282             *no_read = 0;
283             *error = YAZ_ICONV_EILSEQ;
284         }
285     }
286     else
287     {
288         *no_read = 0;
289         *error = YAZ_ICONV_EINVAL;
290     }
291     return x;
292 }
293
294 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
295                                     size_t inbytesleft, size_t *no_read)
296 {
297     return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
298 }
299
300 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
301                                     size_t inbytesleft, size_t *no_read)
302 {
303     unsigned long x = 0;
304     
305     if (inbytesleft < 4)
306     {
307         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
308         *no_read = 0;
309     }
310     else
311     {
312         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
313         *no_read = 4;
314     }
315     return x;
316 }
317
318 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
319                                       size_t inbytesleft, size_t *no_read)
320 {
321     unsigned long x = 0;
322     
323     if (inbytesleft < 4)
324     {
325         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
326         *no_read = 0;
327     }
328     else
329     {
330         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
331         *no_read = 4;
332     }
333     return x;
334 }
335
#if HAVE_WCHAR_H
/**
 * Read handler for input consisting of native wchar_t values.
 *
 * \param cd          conversion descriptor; my_errno is set on failure
 * \param inp         input buffer (need not be aligned; it is copied)
 * \param inbytesleft bytes available at inp
 * \param no_read     receives sizeof(wchar_t) on success, 0 on failure
 * \returns the wchar_t value converted to unsigned long, or 0 with
 *          my_errno = YAZ_ICONV_EINVAL when the input is too short
 */
static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
                                       size_t inbytesleft, size_t *no_read)
{
    wchar_t wide;

    if (inbytesleft < sizeof(wide))
    {
        /* incomplete input */
        *no_read = 0;
        cd->my_errno = YAZ_ICONV_EINVAL;
        return 0;
    }
    memcpy (&wide, inp, sizeof(wide));
    *no_read = sizeof(wide);
    return wide;
}
#endif
357
358 static unsigned long yaz_read_advancegreek(yaz_iconv_t cd, unsigned char *inp,
359                                            size_t inbytesleft, size_t *no_read)
360 {
361     unsigned long x = 0;
362     int shift = 0;
363     int tonos = 0;
364     int dialitika = 0;
365
366     *no_read = 0;
367     while (inbytesleft > 0)
368     {
369         if (*inp == 0x9d)
370         {
371             tonos = 1;
372         }
373         else if (*inp == 0x9e)
374         {
375             dialitika = 1;
376         }
377         else if (*inp == 0x9f)
378         {
379             shift = 1;
380         }
381         else
382             break;
383         inp++;
384         --inbytesleft;
385         (*no_read)++;
386     }    
387     if (inbytesleft == 0)
388     {
389         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
390         *no_read = 0;
391         return 0;
392     }
393     switch (*inp) {
394     case 0x81:
395         if (shift) 
396             if (tonos) 
397                 x = 0x0386;
398             else 
399                 x = 0x0391;
400         else 
401             if (tonos) 
402                 x = 0x03ac;
403             else 
404                 x = 0x03b1;
405         break;
406     case 0x82:
407         if (shift) 
408             x = 0x0392;
409         else 
410             x = 0x03b2;
411         
412         break;
413     case 0x83:
414         if (shift) 
415             x = 0x0393;
416         else 
417             x = 0x03b3;
418         break;
419     case 0x84:
420         if (shift) 
421             x = 0x0394;
422         else 
423             x = 0x03b4;
424         break;
425     case 0x85:
426         if (shift) 
427             if (tonos) 
428                 x = 0x0388;
429             else 
430                 x = 0x0395;
431         else 
432             if (tonos) 
433                 x = 0x03ad;
434             else 
435                 x = 0x03b5;
436         break;
437     case 0x86:
438         if (shift) 
439             x = 0x0396;
440         else 
441             x = 0x03b6;
442         break;
443     case 0x87:
444         if (shift) 
445             if (tonos) 
446                 x = 0x0389;
447             else 
448                 x = 0x0397;
449         else 
450             if (tonos) 
451                 x = 0x03ae;
452             else 
453                 x = 0x03b7;
454         break;
455     case 0x88:
456         if (shift) 
457             x = 0x0398;
458         else 
459             x = 0x03b8;
460         break;
461     case 0x89:
462         if (shift) 
463             if (tonos) 
464                 x = 0x038a;
465             else 
466                 if (dialitika) 
467                     x = 0x9e;
468                 else 
469                     x = 0x0399;
470         else 
471             if (tonos) 
472                 if (dialitika) 
473                     x = 0x0390;
474                 else 
475                     x = 0x03af;
476         
477             else 
478                 if (dialitika) 
479                     x = 0x03ca;
480                 else 
481                     x = 0x03b9;
482         break;
483     case 0x8a:
484         if (shift) 
485             x = 0x039a;
486         else 
487             x = 0x03ba;
488         
489         break;
490     case 0x8b:
491         if (shift) 
492             x = 0x039b;
493         else 
494             x = 0x03bb;
495         break;
496     case 0x8c:
497         if (shift) 
498             x = 0x039c;
499         else 
500             x = 0x03bc;
501         
502         break;
503     case 0x8d:
504         if (shift) 
505             x = 0x039d;
506         else 
507             x = 0x03bd;
508         break;
509     case 0x8e:
510         if (shift) 
511             x = 0x039e;
512         else 
513             x = 0x03be;
514         break;
515     case 0x8f:
516         if (shift) 
517             if (tonos) 
518                 x = 0x038c;
519             else 
520                 x = 0x039f;
521         else 
522             if (tonos) 
523                 x = 0x03cc;
524             else 
525                 x = 0x03bf;
526         break;
527     case 0x90:
528         if (shift) 
529             x = 0x03a0;
530         else 
531             x = 0x03c0;
532         break;
533     case 0x91:
534         if (shift) 
535             x = 0x03a1;
536         else 
537             x = 0x03c1;
538         break;
539     case 0x92:
540         x = 0x03c2;
541         break;
542     case 0x93:
543         if (shift) 
544             x = 0x03a3;
545         else 
546             x = 0x03c3;
547         break;
548     case 0x94:
549         if (shift) 
550             x = 0x03a4;
551         else 
552             x = 0x03c4;
553         break;
554     case 0x95:
555         if (shift) 
556             if (tonos) 
557                 x = 0x038e;
558             else 
559                 if (dialitika) 
560                     x = 0x03ab;
561                 else 
562                     x = 0x03a5;
563         else 
564             if (tonos) 
565                 if (dialitika) 
566                     x = 0x03b0;
567                 else 
568                     x = 0x03cd;
569         
570             else 
571                 if (dialitika) 
572                     x = 0x03cb;
573                 else 
574                     x = 0x03c5;
575         break;
576     case 0x96:
577         if (shift) 
578             x = 0x03a6;
579         else 
580             x = 0x03c6;
581         break;
582     case 0x97:
583         if (shift) 
584             x = 0x03a7;
585         else 
586             x = 0x03c7;
587         break;
588     case 0x98:
589         if (shift) 
590             x = 0x03a8;
591         else 
592             x = 0x03c8;
593         
594         break;
595         
596     case 0x99:
597         if (shift) 
598             if (tonos) 
599                 x = 0x038f;
600             else 
601                 x = 0x03a9;
602         else 
603             if (tonos) 
604                 x = 0x03ce;
605             else 
606                 x = 0x03c9;
607         break;
608     default:
609         x = *inp;
610         break;
611     }
612     (*no_read)++;
613     
614     return x;
615 }
616
617 static size_t yaz_write_advancegreek(yaz_iconv_t cd, unsigned long x,
618                                      char **outbuf, size_t *outbytesleft)
619 {
620     size_t k = 0;
621     unsigned char *out = (unsigned char*) *outbuf;
622     if (*outbytesleft < 3)
623     {
624         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
625         return (size_t)(-1);
626     }
627     switch (x)
628     {
629     case 0x03ac : out[k++]=0x9d; out[k++]=0x81; break;
630     case 0x03ad : out[k++]=0x9d; out[k++]=0x85; break;
631     case 0x03ae : out[k++]=0x9d; out[k++]=0x87; break;
632     case 0x03af : out[k++]=0x9d; out[k++]=0x89; break;
633     case 0x03cc : out[k++]=0x9d; out[k++]=0x8f; break;
634     case 0x03cd : out[k++]=0x9d; out[k++]=0x95; break;
635     case 0x03ce : out[k++]=0x9d; out[k++]=0x99; break;
636     case 0x0390 : out[k++]=0x9d; out[k++]=0x9e; out[k++]=0x89; break;
637     case 0x03b0 : out[k++]=0x9d; out[k++]=0x9e; out[k++]=0x95; break;
638     case 0x0386 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x81; break;
639     case 0x0388 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x85; break;
640     case 0x0389 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x87; break;
641     case 0x038a : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x89; break;
642     case 0x038c : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x8f; break;
643     case 0x038e : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x95; break;
644     case 0x038f : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x99; break;
645     case 0x03ca : out[k++]=0x9e; out[k++]=0x89; break;
646     case 0x03cb : out[k++]=0x9e; out[k++]=0x95; break;
647     case 0x03aa : out[k++]=0x9e; out[k++]=0x9f; out[k++]=0x89; break;
648     case 0x03ab : out[k++]=0x9e; out[k++]=0x9f; out[k++]=0x95; break;
649     case 0x0391 : out[k++]=0x9f; out[k++]=0x81; break;
650     case 0x0392 : out[k++]=0x9f; out[k++]=0x82; break;
651     case 0x0393 : out[k++]=0x9f; out[k++]=0x83; break;
652     case 0x0394 : out[k++]=0x9f; out[k++]=0x84; break;
653     case 0x0395 : out[k++]=0x9f; out[k++]=0x85; break;
654     case 0x0396 : out[k++]=0x9f; out[k++]=0x86; break;
655     case 0x0397 : out[k++]=0x9f; out[k++]=0x87; break;
656     case 0x0398 : out[k++]=0x9f; out[k++]=0x88; break;
657     case 0x0399 : out[k++]=0x9f; out[k++]=0x89; break;
658     case 0x039a : out[k++]=0x9f; out[k++]=0x8a; break;
659     case 0x039b : out[k++]=0x9f; out[k++]=0x8b; break;
660     case 0x039c : out[k++]=0x9f; out[k++]=0x8c; break;
661     case 0x039d : out[k++]=0x9f; out[k++]=0x8d; break;
662     case 0x039e : out[k++]=0x9f; out[k++]=0x8e; break;
663     case 0x039f : out[k++]=0x9f; out[k++]=0x8f; break;
664     case 0x03a0 : out[k++]=0x9f; out[k++]=0x90; break;
665     case 0x03a1 : out[k++]=0x9f; out[k++]=0x91; break;
666     case 0x03a3 : out[k++]=0x9f; out[k++]=0x93; break;
667     case 0x03a4 : out[k++]=0x9f; out[k++]=0x94; break;
668     case 0x03a5 : out[k++]=0x9f; out[k++]=0x95; break;
669     case 0x03a6 : out[k++]=0x9f; out[k++]=0x96; break;
670     case 0x03a7 : out[k++]=0x9f; out[k++]=0x97; break;
671     case 0x03a8 : out[k++]=0x9f; out[k++]=0x98; break;
672     case 0x03a9 : out[k++]=0x9f; out[k++]=0x99; break;
673     case 0x03b1 : out[k++]=0x81; break;
674     case 0x03b2 : out[k++]=0x82; break;
675     case 0x03b3 : out[k++]=0x83; break;
676     case 0x03b4 : out[k++]=0x84; break;
677     case 0x03b5 : out[k++]=0x85; break;
678     case 0x03b6 : out[k++]=0x86; break;
679     case 0x03b7 : out[k++]=0x87; break;
680     case 0x03b8 : out[k++]=0x88; break;
681     case 0x03b9 : out[k++]=0x89; break;
682     case 0x03ba : out[k++]=0x8a; break;
683     case 0x03bb : out[k++]=0x8b; break;
684     case 0x03bc : out[k++]=0x8c; break;
685     case 0x03bd : out[k++]=0x8d; break;
686     case 0x03be : out[k++]=0x8e; break;
687     case 0x03bf : out[k++]=0x8f; break;
688     case 0x03c0 : out[k++]=0x90; break;
689     case 0x03c1 : out[k++]=0x91; break;
690     case 0x03c2 : out[k++]=0x92; break;
691     case 0x03c3 : out[k++]=0x93; break;
692     case 0x03c4 : out[k++]=0x94; break;
693     case 0x03c5 : out[k++]=0x95; break;
694     case 0x03c6 : out[k++]=0x96; break;
695     case 0x03c7 : out[k++]=0x96; break;
696     case 0x03c8 : out[k++]=0x98; break;
697     case 0x03c9 : out[k++]=0x99; break;
698     default:
699         if (x > 255)
700         {
701             cd->my_errno = YAZ_ICONV_EILSEQ;
702             return (size_t) -1;
703         }
704         out[k++] = x;
705         break;
706     }
707     *outbytesleft -= k;
708     (*outbuf) += k;
709     return 0;
710 }
711
712
713 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
714                                           size_t inbytesleft, size_t *no_read,
715                                           int *comb);
716
717 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
718                                      size_t inbytesleft, size_t *no_read)
719 {
720     unsigned long x;
721     if (cd->comb_offset < cd->comb_size)
722     {
723         *no_read = cd->comb_no_read[cd->comb_offset];
724         x = cd->comb_x[cd->comb_offset];
725
726         /* special case for double-diacritic combining characters, 
727            INVERTED BREVE and DOUBLE TILDE.
728            We'll increment the no_read counter by 1, since we want to skip over
729            the processing of the closing ligature character
730         */
731         /* this code is no longer necessary.. our handlers code in
732            yaz_marc8_?_conv (generated by charconv.tcl) now returns
733            0 and no_read=1 when a sequence does not match the input.
734            The SECOND HALFs in codetables.xml produces a non-existant
735            entry in the conversion trie.. Hence when met, the input byte is
736            skipped as it should (in yaz_iconv)
737         */
738 #if 0
739         if (x == 0x0361 || x == 0x0360)
740             *no_read += 1;
741 #endif
742         cd->comb_offset++;
743         return x;
744     }
745
746     cd->comb_offset = 0;
747     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
748     {
749         int comb = 0;
750
751         if (inbytesleft == 0 && cd->comb_size)
752         {
753             cd->my_errno = YAZ_ICONV_EINVAL;
754             x = 0;
755             *no_read = 0;
756             break;
757         }
758         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
759         if (!comb || !x)
760             break;
761         cd->comb_x[cd->comb_size] = x;
762         cd->comb_no_read[cd->comb_size] = *no_read;
763         inp += *no_read;
764         inbytesleft = inbytesleft - *no_read;
765     }
766     return x;
767 }
768
769 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
770                                      size_t inbytesleft, size_t *no_read)
771 {
772     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
773     if (x && cd->comb_size == 1)
774     {
775         /* For MARC8s we try to get a Latin-1 page code out of it */
776         int i;
777         for (i = 0; latin1_comb[i].x1; i++)
778             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
779             {
780                 *no_read += cd->comb_no_read[0];
781                 cd->comb_size = 0;
782                 x = latin1_comb[i].y;
783                 break;
784             }
785     }
786     return x;
787 }
788
789 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
790                                          size_t inbytesleft, size_t *no_read,
791                                          int *comb)
792 {
793     *no_read = 0;
794     while(inbytesleft >= 1 && inp[0] == 27)
795     {
796         size_t inbytesleft0 = inbytesleft;
797         inp++;
798         inbytesleft--;
799         while(inbytesleft > 0 && strchr("(,$!)-", *inp))
800         {
801             inbytesleft--;
802             inp++;
803         }
804         if (inbytesleft <= 0)
805         {
806             *no_read = 0;
807             cd->my_errno = YAZ_ICONV_EINVAL;
808             return 0;
809         }
810         cd->marc8_esc_mode = *inp++;
811         inbytesleft--;
812         (*no_read) += inbytesleft0 - inbytesleft;
813     }
814     if (inbytesleft <= 0)
815         return 0;
816     else
817     {
818         unsigned long x;
819         size_t no_read_sub = 0;
820         *comb = 0;
821
822         switch(cd->marc8_esc_mode)
823         {
824         case 'B':  /* Basic ASCII */
825         case 'E':  /* ANSEL */
826         case 's':  /* ASCII */
827             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
828             break;
829         case 'g':  /* Greek */
830             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
831             break;
832         case 'b':  /* Subscripts */
833             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
834             break;
835         case 'p':  /* Superscripts */
836             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
837             break;
838         case '2':  /* Basic Hebrew */
839             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
840             break;
841         case 'N':  /* Basic Cyrillic */
842         case 'Q':  /* Extended Cyrillic */
843             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
844             break;
845         case '3':  /* Basic Arabic */
846         case '4':  /* Extended Arabic */
847             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
848             break;
849         case 'S':  /* Greek */
850             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
851             break;
852         case '1':  /* Chinese, Japanese, Korean (EACC) */
853             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
854             break;
855         default:
856             *no_read = 0;
857             cd->my_errno = YAZ_ICONV_EILSEQ;
858             return 0;
859         }
860         *no_read += no_read_sub;
861         return x;
862     }
863 }
864
865 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
866                              char **outbuf, size_t *outbytesleft)
867 {
868     return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
869 }
870
871 size_t yaz_write_UTF8_char(unsigned long x,
872                            char **outbuf, size_t *outbytesleft,
873                            int *error)
874 {
875     unsigned char *outp = (unsigned char *) *outbuf;
876
877     if (x <= 0x7f && *outbytesleft >= 1)
878     {
879         *outp++ = (unsigned char) x;
880         (*outbytesleft)--;
881     } 
882     else if (x <= 0x7ff && *outbytesleft >= 2)
883     {
884         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
885         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
886         (*outbytesleft) -= 2;
887     }
888     else if (x <= 0xffff && *outbytesleft >= 3)
889     {
890         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
891         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
892         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
893         (*outbytesleft) -= 3;
894     }
895     else if (x <= 0x1fffff && *outbytesleft >= 4)
896     {
897         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
898         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
899         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
900         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
901         (*outbytesleft) -= 4;
902     }
903     else if (x <= 0x3ffffff && *outbytesleft >= 5)
904     {
905         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
906         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
907         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
908         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
909         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
910         (*outbytesleft) -= 5;
911     }
912     else if (*outbytesleft >= 6)
913     {
914         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
915         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
916         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
917         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
918         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
919         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
920         (*outbytesleft) -= 6;
921     }
922     else 
923     {
924         *error = YAZ_ICONV_E2BIG;  /* not room for output */
925         return (size_t)(-1);
926     }
927     *outbuf = (char *) outp;
928     return 0;
929 }
930
931 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
932                                    char **outbuf, size_t *outbytesleft)
933 {
934     /* list of two char unicode sequence that, when combined, are
935        equivalent to single unicode chars that can be represented in
936        ISO-8859-1/Latin-1.
937        Regular iconv on Linux at least does not seem to convert these,
938        but since MARC-8 to UTF-8 generates these composed sequence
939        we get a better chance of a successful MARC-8 -> ISO-8859-1
940        conversion */
941     unsigned char *outp = (unsigned char *) *outbuf;
942
943     if (cd->compose_char)
944     {
945         int i;
946         for (i = 0; latin1_comb[i].x1; i++)
947             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
948             {
949                 x = latin1_comb[i].y;
950                 break;
951             }
952         if (*outbytesleft < 1)
953         {  /* no room. Retain compose_char and bail out */
954             cd->my_errno = YAZ_ICONV_E2BIG;
955             return (size_t)(-1);
956         }
957         if (!latin1_comb[i].x1) 
958         {   /* not found. Just write compose_char */
959             *outp++ = (unsigned char) cd->compose_char;
960             (*outbytesleft)--;
961             *outbuf = (char *) outp;
962         }
963         /* compose_char used so reset it. x now holds current char */
964         cd->compose_char = 0;
965     }
966
967     if (x > 32 && x < 127 && cd->compose_char == 0)
968     {
969         cd->compose_char = x;
970         return 0;
971     }
972     else if (x > 255 || x < 1)
973     {
974         cd->my_errno = YAZ_ICONV_EILSEQ;
975         return (size_t) -1;
976     }
977     else if (*outbytesleft < 1)
978     {
979         cd->my_errno = YAZ_ICONV_E2BIG;
980         return (size_t)(-1);
981     }
982     *outp++ = (unsigned char) x;
983     (*outbytesleft)--;
984     *outbuf = (char *) outp;
985     return 0;
986 }
987
988 static size_t yaz_flush_ISO8859_1(yaz_iconv_t cd,
989                                   char **outbuf, size_t *outbytesleft)
990 {
991     if (cd->compose_char)
992     {
993         unsigned char *outp = (unsigned char *) *outbuf;
994         if (*outbytesleft < 1)
995         {
996             cd->my_errno = YAZ_ICONV_E2BIG;
997             return (size_t)(-1);
998         }
999         *outp++ = (unsigned char) cd->compose_char;
1000         (*outbytesleft)--;
1001         *outbuf = (char *) outp;
1002         cd->compose_char = 0;
1003     }
1004     return 0;
1005 }
1006
1007 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
1008                               char **outbuf, size_t *outbytesleft)
1009 {
1010     unsigned char *outp = (unsigned char *) *outbuf;
1011     if (*outbytesleft >= 4)
1012     {
1013         *outp++ = (unsigned char) (x>>24);
1014         *outp++ = (unsigned char) (x>>16);
1015         *outp++ = (unsigned char) (x>>8);
1016         *outp++ = (unsigned char) x;
1017         (*outbytesleft) -= 4;
1018     }
1019     else
1020     {
1021         cd->my_errno = YAZ_ICONV_E2BIG;
1022         return (size_t)(-1);
1023     }
1024     *outbuf = (char *) outp;
1025     return 0;
1026 }
1027
1028 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
1029                                 char **outbuf, size_t *outbytesleft)
1030 {
1031     unsigned char *outp = (unsigned char *) *outbuf;
1032     if (*outbytesleft >= 4)
1033     {
1034         *outp++ = (unsigned char) x;
1035         *outp++ = (unsigned char) (x>>8);
1036         *outp++ = (unsigned char) (x>>16);
1037         *outp++ = (unsigned char) (x>>24);
1038         (*outbytesleft) -= 4;
1039     }
1040     else
1041     {
1042         cd->my_errno = YAZ_ICONV_E2BIG;
1043         return (size_t)(-1);
1044     }
1045     *outbuf = (char *) outp;
1046     return 0;
1047 }
1048
1049 static unsigned long lookup_marc8(yaz_iconv_t cd,
1050                                   unsigned long x, int *comb,
1051                                   const char **page_chr)
1052 {
1053     char utf8_buf[7];
1054     char *utf8_outbuf = utf8_buf;
1055     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
1056
1057     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft);
1058     if (r == (size_t)(-1))
1059     {
1060         cd->my_errno = YAZ_ICONV_EILSEQ;
1061         return 0;
1062     }
1063     else
1064     {
1065         unsigned char *inp;
1066         size_t inbytesleft, no_read_sub = 0;
1067         unsigned long x;
1068
1069         *utf8_outbuf = '\0';        
1070         inp = (unsigned char *) utf8_buf;
1071         inbytesleft = strlen(utf8_buf);
1072         
1073         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
1074         if (x)
1075         {
1076             *page_chr = "\033(B";
1077             return x;
1078         }
1079         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
1080         if (x)
1081         {
1082             *page_chr = "\033g";
1083             return x;
1084         }
1085         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
1086         if (x)
1087         {
1088             *page_chr = "\033b";
1089             return x;
1090         }
1091         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
1092         if (x)
1093         {
1094             *page_chr = "\033p";
1095             return x;
1096         }
1097         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
1098         if (x)
1099         {
1100             *page_chr = "\033(2";
1101             return x;
1102         }
1103         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
1104         if (x)
1105         {
1106             *page_chr = "\033(N";
1107             return x;
1108         }
1109         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
1110         if (x)
1111         {
1112             *page_chr = "\033(3";
1113             return x;
1114         }
1115         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
1116         if (x)
1117         {
1118             *page_chr = "\033(S";
1119             return x;
1120         }
1121         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
1122         if (x)
1123         {
1124             *page_chr = "\033$1";
1125             return x;
1126         }
1127         cd->my_errno = YAZ_ICONV_EILSEQ;
1128         return x;
1129     }
1130 }
1131
1132 static size_t flush_combos(yaz_iconv_t cd,
1133                            char **outbuf, size_t *outbytesleft)
1134 {
1135     unsigned long y = cd->write_marc8_last;
1136     unsigned char byte;
1137     char out_buf[10];
1138     size_t i, out_no = 0;
1139
1140     if (!y)
1141         return 0;
1142
1143     byte = (unsigned char )((y>>16) & 0xff);
1144     if (byte)
1145         out_buf[out_no++] = byte;
1146     byte = (unsigned char)((y>>8) & 0xff);
1147     if (byte)
1148         out_buf[out_no++] = byte;
1149     byte = (unsigned char )(y & 0xff);
1150     if (byte)
1151         out_buf[out_no++] = byte;
1152
1153     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
1154     {
1155         cd->my_errno = YAZ_ICONV_E2BIG;
1156         return (size_t) (-1);
1157     }
1158
1159     for (i = 0; i < cd->write_marc8_comb_no; i++)
1160     {
1161         /* all MARC-8 combined characters are simple bytes */
1162         byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
1163         *(*outbuf)++ = byte;
1164         (*outbytesleft)--;
1165     }
1166     memcpy(*outbuf, out_buf, out_no);
1167     *outbuf += out_no;
1168     (*outbytesleft) -= out_no;
1169     if (cd->write_marc8_second_half_char)
1170     {
1171         *(*outbuf)++ = cd->write_marc8_second_half_char;
1172         (*outbytesleft)--;
1173     }        
1174
1175     cd->write_marc8_last = 0;
1176     cd->write_marc8_comb_no = 0;
1177     cd->write_marc8_second_half_char = 0;
1178     return 0;
1179 }
1180
1181 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
1182                                        char **outbuf, size_t *outbytesleft,
1183                                        const char *page_chr)
1184 {
1185     const char *old_page_chr = cd->write_marc8_page_chr;
1186     if (strcmp(page_chr, old_page_chr))
1187     {
1188         size_t plen = 0;
1189         const char *page_out = page_chr;
1190         
1191         if (*outbytesleft < 8)
1192         {
1193             cd->my_errno = YAZ_ICONV_E2BIG;
1194             
1195             return (size_t) (-1);
1196         }
1197         cd->write_marc8_page_chr = page_chr;
1198         
1199         if (!strcmp(old_page_chr, "\033p") 
1200             || !strcmp(old_page_chr, "\033g")
1201             || !strcmp(old_page_chr, "\033b"))
1202         {
1203             /* Technique 1 leave */
1204             page_out = "\033s";
1205             if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
1206             {
1207                 /* Must leave script + enter new page */
1208                 plen = strlen(page_out);
1209                 memcpy(*outbuf, page_out, plen);
1210                 (*outbuf) += plen;
1211                 (*outbytesleft) -= plen;
1212                 page_out = page_chr;
1213             }
1214         }
1215         plen = strlen(page_out);
1216         memcpy(*outbuf, page_out, plen);
1217         (*outbuf) += plen;
1218         (*outbytesleft) -= plen;
1219     }
1220     return 0;
1221 }
1222
1223
1224 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
1225                                 char **outbuf, size_t *outbytesleft)
1226 {
1227     int comb = 0;
1228     const char *page_chr = 0;
1229     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
1230
1231     if (!y)
1232         return (size_t) (-1);
1233
1234     if (comb)
1235     {
1236         if (x == 0x0361)
1237             cd->write_marc8_second_half_char = 0xEC;
1238         else if (x == 0x0360)
1239             cd->write_marc8_second_half_char = 0xFB;
1240
1241         if (cd->write_marc8_comb_no < 6)
1242             cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
1243     }
1244     else
1245     {
1246         size_t r = flush_combos(cd, outbuf, outbytesleft);
1247         if (r)
1248             return r;
1249
1250         r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, page_chr);
1251         if (r)
1252             return r;
1253         cd->write_marc8_last = y;
1254     }
1255     return 0;
1256 }
1257
1258 static size_t yaz_flush_marc8(yaz_iconv_t cd,
1259                               char **outbuf, size_t *outbytesleft)
1260 {
1261     size_t r = flush_combos(cd, outbuf, outbytesleft);
1262     if (r)
1263         return r;
1264     return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, "\033(B");
1265 }
1266
1267 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
1268                               char **outbuf, size_t *outbytesleft)
1269 {
1270     int i;
1271     for (i = 0; latin1_comb[i].x1; i++)
1272     {
1273         if (x == latin1_comb[i].y)
1274         {
1275             size_t r ;
1276             /* save the output pointers .. */
1277             char *outbuf0 = *outbuf;
1278             size_t outbytesleft0 = *outbytesleft;
1279             int last_ch = cd->write_marc8_last;
1280
1281             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
1282                                   outbuf, outbytesleft);
1283             if (r)
1284                 return r;
1285             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
1286                                   outbuf, outbytesleft);
1287             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
1288             {
1289                 /* not enough room. reset output to original values */
1290                 *outbuf = outbuf0;
1291                 *outbytesleft = outbytesleft0;
1292                 cd->write_marc8_last = last_ch;
1293             }
1294             return r;
1295         }
1296     }
1297     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft);
1298 }
1299
1300
#if HAVE_WCHAR_H
/* Encoder callback for wchar_t output.
 *
 * Stores x, converted to wchar_t, in the platform's native
 * representation.  Returns 0 on success, or (size_t)(-1) with
 * cd->my_errno = YAZ_ICONV_E2BIG when fewer than sizeof(wchar_t) output
 * bytes remain.
 */
static size_t yaz_write_wchar_t(yaz_iconv_t cd, unsigned long x,
                                char **outbuf, size_t *outbytesleft)
{
    wchar_t wch;

    if (*outbytesleft < sizeof(wchar_t))
    {
        cd->my_errno = YAZ_ICONV_E2BIG;
        return (size_t)(-1);
    }
    wch = x;
    /* memcpy: *outbuf need not be aligned for wchar_t */
    memcpy(*outbuf, &wch, sizeof(wch));
    *outbuf += sizeof(wch);
    (*outbytesleft) -= sizeof(wch);
    return 0;
}
#endif
1323
1324 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
1325 {
1326     return cd->read_handle && cd->write_handle;
1327 }
1328
1329 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
1330 {
1331     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
1332
1333     cd->write_handle = 0;
1334     cd->read_handle = 0;
1335     cd->init_handle = 0;
1336     cd->flush_handle = 0;
1337     cd->my_errno = YAZ_ICONV_UNKNOWN;
1338
1339     /* a useful hack: if fromcode has leading @,
1340        the library not use YAZ's own conversions .. */
1341     if (fromcode[0] == '@')
1342         fromcode++;
1343     else
1344     {
1345         if (!yaz_matchstr(fromcode, "UTF8"))
1346         {
1347             cd->read_handle = yaz_read_UTF8;
1348             cd->init_handle = yaz_init_UTF8;
1349         }
1350         else if (!yaz_matchstr(fromcode, "ISO88591"))
1351             cd->read_handle = yaz_read_ISO8859_1;
1352         else if (!yaz_matchstr(fromcode, "UCS4"))
1353             cd->read_handle = yaz_read_UCS4;
1354         else if (!yaz_matchstr(fromcode, "UCS4LE"))
1355             cd->read_handle = yaz_read_UCS4LE;
1356         else if (!yaz_matchstr(fromcode, "MARC8"))
1357             cd->read_handle = yaz_read_marc8;
1358         else if (!yaz_matchstr(fromcode, "MARC8s"))
1359             cd->read_handle = yaz_read_marc8s;
1360         else if (!yaz_matchstr(fromcode, "advancegreek"))
1361             cd->read_handle = yaz_read_advancegreek;
1362 #if HAVE_WCHAR_H
1363         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
1364             cd->read_handle = yaz_read_wchar_t;
1365 #endif
1366         
1367         if (!yaz_matchstr(tocode, "UTF8"))
1368             cd->write_handle = yaz_write_UTF8;
1369         else if (!yaz_matchstr(tocode, "ISO88591"))
1370         {
1371             cd->write_handle = yaz_write_ISO8859_1;
1372             cd->flush_handle = yaz_flush_ISO8859_1;
1373         }
1374         else if (!yaz_matchstr (tocode, "UCS4"))
1375             cd->write_handle = yaz_write_UCS4;
1376         else if (!yaz_matchstr(tocode, "UCS4LE"))
1377             cd->write_handle = yaz_write_UCS4LE;
1378         else if (!yaz_matchstr(tocode, "MARC8"))
1379         {
1380             cd->write_handle = yaz_write_marc8;
1381             cd->flush_handle = yaz_flush_marc8;
1382         }
1383         else if (!yaz_matchstr(tocode, "MARC8s"))
1384         {
1385             cd->write_handle = yaz_write_marc8;
1386             cd->flush_handle = yaz_flush_marc8;
1387         }
1388         else if (!yaz_matchstr(tocode, "advancegreek"))
1389         {
1390             cd->write_handle = yaz_write_advancegreek;
1391         }
1392 #if HAVE_WCHAR_H
1393         else if (!yaz_matchstr(tocode, "WCHAR_T"))
1394             cd->write_handle = yaz_write_wchar_t;
1395 #endif
1396     }
1397 #if HAVE_ICONV_H
1398     cd->iconv_cd = 0;
1399     if (!cd->read_handle || !cd->write_handle)
1400     {
1401         cd->iconv_cd = iconv_open (tocode, fromcode);
1402         if (cd->iconv_cd == (iconv_t) (-1))
1403         {
1404             xfree (cd);
1405             return 0;
1406         }
1407     }
1408 #else
1409     if (!cd->read_handle || !cd->write_handle)
1410     {
1411         xfree (cd);
1412         return 0;
1413     }
1414 #endif
1415     cd->init_flag = 1;
1416     return cd;
1417 }
1418
1419 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1420                  char **outbuf, size_t *outbytesleft)
1421 {
1422     char *inbuf0 = 0;
1423     size_t r = 0;
1424
1425 #if HAVE_ICONV_H
1426     if (cd->iconv_cd)
1427     {
1428         size_t r =
1429             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1430         if (r == (size_t)(-1))
1431         {
1432             switch (yaz_errno())
1433             {
1434             case E2BIG:
1435                 cd->my_errno = YAZ_ICONV_E2BIG;
1436                 break;
1437             case EINVAL:
1438                 cd->my_errno = YAZ_ICONV_EINVAL;
1439                 break;
1440             case EILSEQ:
1441                 cd->my_errno = YAZ_ICONV_EILSEQ;
1442                 break;
1443             default:
1444                 cd->my_errno = YAZ_ICONV_UNKNOWN;
1445             }
1446         }
1447         return r;
1448     }
1449 #endif
1450
1451     if (inbuf)
1452         inbuf0 = *inbuf;
1453
1454     if (cd->init_flag)
1455     {
1456         cd->my_errno = YAZ_ICONV_UNKNOWN;
1457         cd->marc8_esc_mode = 'B';
1458         
1459         cd->comb_offset = cd->comb_size = 0;
1460         cd->compose_char = 0;
1461         
1462         cd->write_marc8_comb_no = 0;
1463         cd->write_marc8_second_half_char = 0;
1464         cd->write_marc8_last = 0;
1465         cd->write_marc8_page_chr = "\033(B";
1466         
1467         cd->unget_x = 0;
1468         cd->no_read_x = 0;
1469     }
1470
1471     if (cd->init_flag)
1472     {
1473         if (cd->init_handle && inbuf && *inbuf)
1474         {
1475             size_t no_read = 0;
1476             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1477                                          *inbytesleft, &no_read);
1478             if (r)
1479             {
1480                 if (cd->my_errno == YAZ_ICONV_EINVAL)
1481                     return r;
1482                 cd->init_flag = 0;
1483                 return r;
1484             }
1485             *inbytesleft -= no_read;
1486             *inbuf += no_read;
1487         }
1488     }
1489     cd->init_flag = 0;
1490
1491     if (!inbuf || !*inbuf)
1492     {
1493         if (outbuf && *outbuf)
1494         {
1495             if (cd->unget_x)
1496                 r = (*cd->write_handle)(cd, cd->unget_x, outbuf, outbytesleft);
1497             if (cd->flush_handle)
1498                 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1499         }
1500         if (r == 0)
1501             cd->init_flag = 1;
1502         cd->unget_x = 0;
1503         return r;
1504     }
1505     while (1)
1506     {
1507         unsigned long x;
1508         size_t no_read;
1509
1510         if (cd->unget_x)
1511         {
1512             x = cd->unget_x;
1513             no_read = cd->no_read_x;
1514         }
1515         else
1516         {
1517             if (*inbytesleft == 0)
1518             {
1519                 r = *inbuf - inbuf0;
1520                 break;
1521             }
1522             x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1523                                    &no_read);
1524             if (no_read == 0)
1525             {
1526                 r = (size_t)(-1);
1527                 break;
1528             }
1529         }
1530         if (x)
1531         {
1532             r = (*cd->write_handle)(cd, x, outbuf, outbytesleft);
1533             if (r)
1534             {
1535                 /* unable to write it. save it because read_handle cannot
1536                    rewind .. */
1537                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1538                 {
1539                     cd->unget_x = x;
1540                     cd->no_read_x = no_read;
1541                     break;
1542                 }
1543             }
1544             cd->unget_x = 0;
1545         }
1546         *inbytesleft -= no_read;
1547         (*inbuf) += no_read;
1548     }
1549     return r;
1550 }
1551
1552 int yaz_iconv_error (yaz_iconv_t cd)
1553 {
1554     return cd->my_errno;
1555 }
1556
1557 int yaz_iconv_close (yaz_iconv_t cd)
1558 {
1559 #if HAVE_ICONV_H
1560     if (cd->iconv_cd)
1561         iconv_close (cd->iconv_cd);
1562 #endif
1563     xfree (cd);
1564     return 0;
1565 }
1566
1567 /*
1568  * Local variables:
1569  * c-basic-offset: 4
1570  * indent-tabs-mode: nil
1571  * End:
1572  * vim: shiftwidth=4 tabstop=8 expandtab
1573  */
1574