Using old C style comments.
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.40 2007-05-03 22:20:45 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36
37 #include <yaz/yaz-util.h>
38
/*
 * Forward declarations of the MARC-8 decoding routines.  Their bodies are
 * not part of this section of the file.  Every routine takes an input
 * buffer and the number of bytes left in it, returns an unsigned long, and
 * has two output parameters: no_read and combining.
 * NOTE(review): presumably there is one routine per MARC-8 code page, the
 * return value is the decoded character, *no_read the bytes consumed and
 * *combining a flag for combining characters - confirm against the
 * definitions.
 */
39 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
40                                size_t *no_read, int *combining);
41 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
42                                size_t *no_read, int *combining);
43 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
44                                size_t *no_read, int *combining);
45 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
46                                size_t *no_read, int *combining);
47 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
48                                size_t *no_read, int *combining);
49 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
50                                size_t *no_read, int *combining);
51 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
52                                size_t *no_read, int *combining);
53 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
54                                size_t *no_read, int *combining);
55 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
56                                size_t *no_read, int *combining);
57
58
/*
 * Second family of routines with the same signature.
 * NOTE(review): the "marc8r" prefix suggests the reverse direction
 * (towards MARC-8) - confirm against the definitions.
 */
59 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
60                                 size_t *no_read, int *combining);
61 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
62                                 size_t *no_read, int *combining);
63 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
64                                 size_t *no_read, int *combining);
65 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
66                                 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
68                                 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
70                                 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
72                                 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
74                                 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
76                                 size_t *no_read, int *combining);
77
/*
 * Conversion state.  The reader and writer routines in this file receive
 * it as "yaz_iconv_t cd" and store a YAZ_ICONV_* code in cd->my_errno when
 * they fail.  The init/read/write handler pointers have the same
 * signatures as the yaz_init_*, yaz_read_* and yaz_write_* routines of
 * this file.
 * NOTE(review): the other fields are not referenced in this section of
 * the file; the remarks on them are inferred from their names and need to
 * be confirmed against the code that uses them.
 */
78 struct yaz_iconv_struct {
79     int my_errno; /* YAZ_ICONV_* code stored by a failing handler */
80     int init_flag; /* NOTE(review): usage not visible here */
81     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82                           size_t inbytesleft, size_t *no_read); /* same shape as yaz_init_UTF8 */
83     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84                                  size_t inbytesleft, size_t *no_read); /* same shape as the yaz_read_* routines */
85     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86                            char **outbuf, size_t *outbytesleft); /* same shape as the yaz_write_* routines */
87     size_t (*flush_handle)(yaz_iconv_t cd,
88                            char **outbuf, size_t *outbytesleft); /* NOTE(review): presumably emits pending output - confirm */
89     int marc8_esc_mode; /* NOTE(review): presumably MARC-8 escape state - confirm */
90
    /* NOTE(review): comb_* look like a queue of up to 8 pending combining
       characters and the bytes each one consumed - confirm */
91     int comb_offset;
92     int comb_size;
93     unsigned long comb_x[8];
94     size_t comb_no_read[8];
95     size_t no_read_x;
96     unsigned long unget_x;
97 #if HAVE_ICONV_H
98     iconv_t iconv_cd; /* only present when <iconv.h> is available */
99 #endif
100     unsigned long compose_char;
101
    /* NOTE(review): write_marc8_* look like state of the MARC-8 writer -
       confirm */
102     unsigned long write_marc8_comb_ch[8];
103     size_t write_marc8_comb_no;
104     unsigned write_marc8_second_half_char;
105     unsigned long write_marc8_last;
106     const char *write_marc8_page_chr;
107 };
108
/*
 * Table of Latin-1 compositions.  Each entry pairs a base letter (x1) and
 * a combining code point from the U+03xx block (x2) with the single
 * ISO-8859-1 byte (y) of the precomposed letter named in the trailing
 * comment.  Latin-1 positions without such a decomposition are listed as
 * "omitted".  The table ends with an all-zero entry.
 * NOTE(review): the code that consults this table is not part of this
 * section; presumably the Latin-1 writer uses it to merge a base letter
 * with a following combining mark - confirm.
 */
109 static struct {
110     unsigned long x1, x2;
111     unsigned y;
112 } latin1_comb[] = {
113     { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
114     { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
115     { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
116     { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
117     { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
118     { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
119     /* omitted:    0xc6      LATIN CAPITAL LETTER AE */
120     { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
121     { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
122     { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
123     { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
124     { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
125     { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
126     { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
127     { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
128     { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
129     { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
130     { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
131     { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
132     { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
133     { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
134     { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
135     /* omitted:    0xd7      MULTIPLICATION SIGN */
136     /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
137     { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
138     { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
139     { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
140     { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
141     { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
142     /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
143     /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
144     { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
145     { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
146     { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
147     { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
148     { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
149     { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
150     /* omitted:    0xe6      LATIN SMALL LETTER AE */
151     { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
152     { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
153     { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
154     { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
155     { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
156     { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
157     { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
158     { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
159     { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
160     /* omitted:    0xf0      LATIN SMALL LETTER ETH */
161     { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
162     { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
163     { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
164     { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
165     { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
166     { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
167     /* omitted:    0xf7      DIVISION SIGN */
168     /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
169     { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
170     { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
171     { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
172     { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
173     { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
174     /* omitted:    0xfe      LATIN SMALL LETTER THORN */
175     { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
176     
177     { 0, 0, 0}
178 };
179
180 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
181                                          size_t inbytesleft, size_t *no_read)
182 {
183     unsigned long x = inp[0];
184     *no_read = 1;
185     return x;
186 }
187
188
189 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
190                              size_t inbytesleft, size_t *no_read)
191 {
192     if (inp[0] != 0xef)
193     {
194         *no_read = 0;
195         return 0;
196     }
197     if (inbytesleft < 3)
198     {
199         cd->my_errno = YAZ_ICONV_EINVAL;
200         return (size_t) -1;
201     }
202     if (inp[1] != 0xbb && inp[2] == 0xbf)
203         *no_read = 3;
204     else
205         *no_read = 0;
206     return 0;
207 }
208
209 unsigned long yaz_read_UTF8_char(unsigned char *inp,
210                                  size_t inbytesleft, size_t *no_read,
211                                  int *error)
212 {
213     unsigned long x = 0;
214
215     if (inp[0] <= 0x7f)
216     {
217         x = inp[0];
218         *no_read = 1;
219     }
220     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
221     {
222         *no_read = 0;
223         *error = YAZ_ICONV_EILSEQ;
224     }
225     else if (inp[0] <= 0xdf && inbytesleft >= 2)
226     {
227         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
228         if (x >= 0x80)
229             *no_read = 2;
230         else
231         {
232             *no_read = 0;
233             *error = YAZ_ICONV_EILSEQ;
234         }
235     }
236     else if (inp[0] <= 0xef && inbytesleft >= 3)
237     {
238         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
239             (inp[2] & 0x3f);
240         if (x >= 0x800)
241             *no_read = 3;
242         else
243         {
244             *no_read = 0;
245             *error = YAZ_ICONV_EILSEQ;
246         }
247     }
248     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
249     {
250         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
251             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
252         if (x >= 0x10000)
253             *no_read = 4;
254         else
255         {
256             *no_read = 0;
257             *error = YAZ_ICONV_EILSEQ;
258         }
259     }
260     else if (inp[0] <= 0xfb && inbytesleft >= 5)
261     {
262         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
263             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
264             (inp[4] & 0x3f);
265         if (x >= 0x200000)
266             *no_read = 5;
267         else
268         {
269             *no_read = 0;
270             *error = YAZ_ICONV_EILSEQ;
271         }
272     }
273     else if (inp[0] <= 0xfd && inbytesleft >= 6)
274     {
275         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
276             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
277             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
278         if (x >= 0x4000000)
279             *no_read = 6;
280         else
281         {
282             *no_read = 0;
283             *error = YAZ_ICONV_EILSEQ;
284         }
285     }
286     else
287     {
288         *no_read = 0;
289         *error = YAZ_ICONV_EINVAL;
290     }
291     return x;
292 }
293
294 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
295                                     size_t inbytesleft, size_t *no_read)
296 {
297     return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
298 }
299
300 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
301                                     size_t inbytesleft, size_t *no_read)
302 {
303     unsigned long x = 0;
304     
305     if (inbytesleft < 4)
306     {
307         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
308         *no_read = 0;
309     }
310     else
311     {
312         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
313         *no_read = 4;
314     }
315     return x;
316 }
317
318 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
319                                       size_t inbytesleft, size_t *no_read)
320 {
321     unsigned long x = 0;
322     
323     if (inbytesleft < 4)
324     {
325         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
326         *no_read = 0;
327     }
328     else
329     {
330         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
331         *no_read = 4;
332     }
333     return x;
334 }
335
336 #if HAVE_WCHAR_H
337 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
338                                        size_t inbytesleft, size_t *no_read)
339 {
340     unsigned long x = 0;
341     
342     if (inbytesleft < sizeof(wchar_t))
343     {
344         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
345         *no_read = 0;
346     }
347     else
348     {
349         wchar_t wch;
350         memcpy (&wch, inp, sizeof(wch));
351         x = wch;
352         *no_read = sizeof(wch);
353     }
354     return x;
355 }
356 #endif
357
358 static unsigned long yaz_read_iso5428_1984(yaz_iconv_t cd, unsigned char *inp,
359                                            size_t inbytesleft, size_t *no_read)
360 {
361     unsigned long x = 0;
362     int tonos = 0;
363     int dialitika = 0;
364
365     *no_read = 0;
366     while (inbytesleft > 0)
367     {
368         if (*inp == 0xa2)
369         {
370             tonos = 1;
371         }
372         else if (*inp == 0xa3)
373         {
374             dialitika = 1;
375         }
376         else
377             break;
378         inp++;
379         --inbytesleft;
380         (*no_read)++;
381     }    
382     if (inbytesleft == 0)
383     {
384         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
385         *no_read = 0;
386         return 0;
387     }
388     switch (*inp) {
389     case 0xe1: /*  alpha small */
390             if (tonos) 
391                 x = 0x03ac;
392             else 
393                 x = 0x03b1;
394             break;
395     case 0xc1: /*  alpha capital */
396             if (tonos) 
397                 x = 0x0386;
398             else 
399                 x = 0x0391;
400             break;
401
402     case 0xe2: /*  Beta small */
403             x = 0x03b2;
404             break;
405     case 0xc2: /*  Beta capital */
406             x = 0x0392;
407             break;
408
409     case 0xe4: /*  Gamma small */
410             x = 0x03b3;
411             break;
412     case 0xc4: /*  Gamma capital */
413             x = 0x0393;
414             break;
415
416     case 0xe5: /*  Delta small */
417             x = 0x03b4;
418             break;
419     case 0xc5: /*  Delta capital */
420             x = 0x0394;
421             break;
422     case 0xe6: /*  epsilon small */
423             if (tonos) 
424                 x = 0x03ad;
425             else 
426                 x = 0x03b5;
427             break;
428     case 0xc6: /*  epsilon capital */
429             if (tonos) 
430                 x = 0x0388;
431             else 
432                 x = 0x0395;
433             break;
434     case 0xe9: /*  Zeta small */
435             x = 0x03b6;
436             break;
437     case 0xc9: /*  Zeta capital */
438             x = 0x0396;
439             break;
440     case 0xea: /*  Eta small */
441             if (tonos) 
442                 x = 0x03ae;
443             else 
444                 x = 0x03b7;
445             break;
446     case 0xca: /*  Eta capital */
447             if (tonos) 
448                 x = 0x0389;
449             else 
450                 x = 0x0397;
451             break;
452     case 0xeb: /*  Theta small */
453             x = 0x03b8;
454             break;
455     case 0xcb: /*  Theta capital */
456             x = 0x0398;
457             break;
458     case 0xec: /*  Iota small */
459             if (tonos) 
460                 if (dialitika) 
461                     x = 0x0390;
462                 else 
463                     x = 0x03af;
464             else 
465                 if (dialitika) 
466                     x = 0x03ca;
467                 else 
468                     x = 0x03b9;
469             break;
470     case 0xcc: /*  Iota capital */
471             if (tonos) 
472                 x = 0x038a;
473             else 
474                 if (dialitika) 
475                     x = 0x03aa;
476                 else 
477                     x = 0x0399;
478             break;
479     case 0xed: /*  Kappa small */
480             x = 0x03ba;
481             break;
482     case 0xcd: /*  Kappa capital */
483             x = 0x039a;
484             break;
485     case 0xee: /*  Lambda small */
486             x = 0x03bb;
487             break;
488     case 0xce: /*  Lambda capital */
489             x = 0x039b;
490             break;
491     case 0xef: /*  Mu small */
492             x = 0x03bc;
493             break;
494     case 0xcf: /*  Mu capital */
495             x = 0x039c;
496             break;
497     case 0xf0: /*  Nu small */
498             x = 0x03bd;
499             break;
500     case 0xd0: /*  Nu capital */
501             x = 0x039d;
502             break;
503     case 0xf1: /*  Xi small */
504             x = 0x03be;
505             break;
506     case 0xd1: /*  Xi capital */
507             x = 0x039e;
508             break;
509     case 0xf2: /*  Omicron small */
510             if (tonos) 
511                 x = 0x03cc;
512             else 
513                 x = 0x03bf;
514             break;
515     case 0xd2: /*  Omicron capital */
516             if (tonos) 
517                 x = 0x038c;
518             else 
519                 x = 0x039f;
520             break;
521     case 0xf3: /*  Pi small */
522             x = 0x03c0;
523             break;
524     case 0xd3: /*  Pi capital */
525             x = 0x03a0;
526             break;
527     case 0xf5: /*  Rho small */
528             x = 0x03c1;
529             break;
530     case 0xd5: /*  Rho capital */
531             x = 0x03a1;
532             break;
533     case 0xf7: /*  Sigma small (end of words) */
534             x = 0x03c2;
535             break;
536     case 0xf6: /*  Sigma small */
537             x = 0x03c3;
538             break;
539     case 0xd6: /*  Sigma capital */
540             x = 0x03a3;
541             break;
542     case 0xf8: /*  Tau small */
543             x = 0x03c4;
544             break;
545     case 0xd8: /*  Tau capital */
546             x = 0x03a4;
547             break;
548     case 0xf9: /*  Upsilon small */
549             if (tonos) 
550                 if (dialitika) 
551                     x = 0x03b0;
552                 else 
553                     x = 0x03cd;
554             else 
555                 if (dialitika) 
556                     x = 0x03cb;
557                 else 
558                     x = 0x03c5;
559             break;
560     case 0xd9: /*  Upsilon capital */
561             if (tonos) 
562                 x = 0x038e;
563             else 
564                 if (dialitika) 
565                     x = 0x03ab;
566                 else 
567                     x = 0x03a5;
568             break;
569     case 0xfa: /*  Phi small */
570             x = 0x03c6;
571             break;
572     case 0xda: /*  Phi capital */
573             x = 0x03a6;
574             break;
575     case 0xfb: /*  Chi small */
576             x = 0x03c7;
577             break;
578     case 0xdb: /*  Chi capital */
579             x = 0x03a7;
580             break;
581     case 0xfc: /*  Psi small */
582             x = 0x03c8;
583             break;
584     case 0xdc: /*  Psi capital */
585             x = 0x03a8;
586             break;
587     case 0xfd: /*  Omega small */
588             if (tonos) 
589                 x = 0x03ce;
590             else 
591                 x = 0x03c9;
592             break;
593     case 0xdd: /*  Omega capital */
594             if (tonos) 
595                 x = 0x038f;
596             else 
597                 x = 0x03a9;
598             break;
599     default:
600         x = *inp;
601         break;
602     }
603     (*no_read)++;
604     
605     return x;
606 }
607
608 static size_t yaz_write_iso5428_1984(yaz_iconv_t cd, unsigned long x,
609                                      char **outbuf, size_t *outbytesleft)
610 {
611     size_t k = 0;
612     unsigned char *out = (unsigned char*) *outbuf;
613     if (*outbytesleft < 3)
614     {
615         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
616         return (size_t)(-1);
617     }
618     switch (x)
619     {
620     case 0x03ac : out[k++]=0xa2; out[k++]=0xe1; break;
621     case 0x03b1 : out[k++]=0xe1; break;
622     case 0x0386 : out[k++]=0xa2; out[k++]=0xc1; break;
623     case 0x0391 : out[k++]=0xc1; break;
624     case 0x03b2 : out[k++]=0xe2; break;
625     case 0x0392 : out[k++]=0xc2; break;
626     case 0x03b3 : out[k++]=0xe4; break;
627     case 0x0393 : out[k++]=0xc4; break;
628     case 0x03b4 : out[k++]=0xe5; break;
629     case 0x0394 : out[k++]=0xc5; break;
630     case 0x03ad : out[k++]=0xa2; out[k++]=0xe6; break;
631     case 0x03b5 : out[k++]=0xe6; break;
632     case 0x0388 : out[k++]=0xa2; out[k++]=0xc6; break;
633     case 0x0395 : out[k++]=0xc6; break;
634     case 0x03b6 : out[k++]=0xe9; break;
635     case 0x0396 : out[k++]=0xc9; break;
636     case 0x03ae : out[k++]=0xa2; out[k++]=0xea; break;
637     case 0x03b7 : out[k++]=0xea; break;
638     case 0x0389 : out[k++]=0xa2; out[k++]=0xca; break;
639     case 0x0397 : out[k++]=0xca; break;
640     case 0x03b8 : out[k++]=0xeb; break;
641     case 0x0398 : out[k++]=0xcb; break;
642     case 0x0390 : out[k++]=0xa2; out[k++]=0xa3; out[k++]=0xec; break;
643     case 0x03af : out[k++]=0xa2; out[k++]=0xec; break;
644     case 0x03ca : out[k++]=0xa3; out[k++]=0xec; break;
645     case 0x03b9 : out[k++]=0xec; break;
646     case 0x038a : out[k++]=0xa2; out[k++]=0xcc; break;
647     case 0x03aa : out[k++]=0xa3; out[k++]=0xcc; break;
648     case 0x0399 : out[k++]=0xcc; break;
649     case 0x03ba : out[k++]=0xed; break;
650     case 0x039a : out[k++]=0xcd; break;
651     case 0x03bb : out[k++]=0xee; break;
652     case 0x039b : out[k++]=0xce; break;
653     case 0x03bc : out[k++]=0xef; break;
654     case 0x039c : out[k++]=0xcf; break;
655     case 0x03bd : out[k++]=0xf0; break;
656     case 0x039d : out[k++]=0xd0; break;
657     case 0x03be : out[k++]=0xf1; break;
658     case 0x039e : out[k++]=0xd1; break;
659     case 0x03cc : out[k++]=0xa2; out[k++]=0xf2; break;
660     case 0x03bf : out[k++]=0xf2; break;
661     case 0x038c : out[k++]=0xa2; out[k++]=0xd2; break;
662     case 0x039f : out[k++]=0xd2; break;
663     case 0x03c0 : out[k++]=0xf3; break;
664     case 0x03a0 : out[k++]=0xd3; break;
665     case 0x03c1 : out[k++]=0xf5; break;
666     case 0x03a1 : out[k++]=0xd5; break;
667     case 0x03c2 : out[k++]=0xf7; break;
668     case 0x03c3 : out[k++]=0xf6; break;
669     case 0x03a3 : out[k++]=0xd6; break;
670     case 0x03c4 : out[k++]=0xf8; break;
671     case 0x03a4 : out[k++]=0xd8; break;
672     case 0x03b0 : out[k++]=0xa2; out[k++]=0xa3; out[k++]=0xf9; break;
673     case 0x03cd : out[k++]=0xa2; out[k++]=0xf9; break;
674     case 0x03cb : out[k++]=0xa3; out[k++]=0xf9; break;
675     case 0x03c5 : out[k++]=0xf9; break;
676     case 0x038e : out[k++]=0xa2; out[k++]=0xd9; break;
677     case 0x03ab : out[k++]=0xa3; out[k++]=0xd9; break;
678     case 0x03a5 : out[k++]=0xd9; break;
679     case 0x03c6 : out[k++]=0xfa; break;
680     case 0x03a6 : out[k++]=0xda; break;
681     case 0x03c7 : out[k++]=0xfb; break;
682     case 0x03a7 : out[k++]=0xdb; break;
683     case 0x03c8 : out[k++]=0xfc; break;
684     case 0x03a8 : out[k++]=0xdc; break;
685     case 0x03ce : out[k++]=0xa2; out[k++]=0xfd; break;
686     case 0x03c9 : out[k++]=0xfd; break;
687     case 0x038f : out[k++]=0xa2; out[k++]=0xdd; break;
688     case 0x03a9 : out[k++]=0xdd; break;
689     default:
690         if (x > 255)
691         {
692             cd->my_errno = YAZ_ICONV_EILSEQ;
693             return (size_t) -1;
694         }
695         out[k++] = x;
696         break;
697     }
698     *outbytesleft -= k;
699     (*outbuf) += k;
700     return 0;
701 }
702
703 static unsigned long yaz_read_advancegreek(yaz_iconv_t cd, unsigned char *inp,
704                                            size_t inbytesleft, size_t *no_read)
705 {
706     unsigned long x = 0;
707     int shift = 0;
708     int tonos = 0;
709     int dialitika = 0;
710
711     *no_read = 0;
712     while (inbytesleft > 0)
713     {
714         if (*inp == 0x9d)
715         {
716             tonos = 1;
717         }
718         else if (*inp == 0x9e)
719         {
720             dialitika = 1;
721         }
722         else if (*inp == 0x9f)
723         {
724             shift = 1;
725         }
726         else
727             break;
728         inp++;
729         --inbytesleft;
730         (*no_read)++;
731     }    
732     if (inbytesleft == 0)
733     {
734         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
735         *no_read = 0;
736         return 0;
737     }
738     switch (*inp) {
739     case 0x81:
740         if (shift) 
741             if (tonos) 
742                 x = 0x0386;
743             else 
744                 x = 0x0391;
745         else 
746             if (tonos) 
747                 x = 0x03ac;
748             else 
749                 x = 0x03b1;
750         break;
751     case 0x82:
752         if (shift) 
753             x = 0x0392;
754         else 
755             x = 0x03b2;
756         
757         break;
758     case 0x83:
759         if (shift) 
760             x = 0x0393;
761         else 
762             x = 0x03b3;
763         break;
764     case 0x84:
765         if (shift) 
766             x = 0x0394;
767         else 
768             x = 0x03b4;
769         break;
770     case 0x85:
771         if (shift) 
772             if (tonos) 
773                 x = 0x0388;
774             else 
775                 x = 0x0395;
776         else 
777             if (tonos) 
778                 x = 0x03ad;
779             else 
780                 x = 0x03b5;
781         break;
782     case 0x86:
783         if (shift) 
784             x = 0x0396;
785         else 
786             x = 0x03b6;
787         break;
788     case 0x87:
789         if (shift) 
790             if (tonos) 
791                 x = 0x0389;
792             else 
793                 x = 0x0397;
794         else 
795             if (tonos) 
796                 x = 0x03ae;
797             else 
798                 x = 0x03b7;
799         break;
800     case 0x88:
801         if (shift) 
802             x = 0x0398;
803         else 
804             x = 0x03b8;
805         break;
806     case 0x89:
807         if (shift) 
808             if (tonos) 
809                 x = 0x038a;
810             else 
811                 if (dialitika) 
812                     x = 0x03aa;
813                 else 
814                     x = 0x0399;
815         else 
816             if (tonos) 
817                 if (dialitika) 
818                     x = 0x0390;
819                 else 
820                     x = 0x03af;
821         
822             else 
823                 if (dialitika) 
824                     x = 0x03ca;
825                 else 
826                     x = 0x03b9;
827         break;
828     case 0x8a:
829         if (shift) 
830             x = 0x039a;
831         else 
832             x = 0x03ba;
833         
834         break;
835     case 0x8b:
836         if (shift) 
837             x = 0x039b;
838         else 
839             x = 0x03bb;
840         break;
841     case 0x8c:
842         if (shift) 
843             x = 0x039c;
844         else 
845             x = 0x03bc;
846         
847         break;
848     case 0x8d:
849         if (shift) 
850             x = 0x039d;
851         else 
852             x = 0x03bd;
853         break;
854     case 0x8e:
855         if (shift) 
856             x = 0x039e;
857         else 
858             x = 0x03be;
859         break;
860     case 0x8f:
861         if (shift) 
862             if (tonos) 
863                 x = 0x038c;
864             else 
865                 x = 0x039f;
866         else 
867             if (tonos) 
868                 x = 0x03cc;
869             else 
870                 x = 0x03bf;
871         break;
872     case 0x90:
873         if (shift) 
874             x = 0x03a0;
875         else 
876             x = 0x03c0;
877         break;
878     case 0x91:
879         if (shift) 
880             x = 0x03a1;
881         else 
882             x = 0x03c1;
883         break;
884     case 0x92:
885         x = 0x03c2;
886         break;
887     case 0x93:
888         if (shift) 
889             x = 0x03a3;
890         else 
891             x = 0x03c3;
892         break;
893     case 0x94:
894         if (shift) 
895             x = 0x03a4;
896         else 
897             x = 0x03c4;
898         break;
899     case 0x95:
900         if (shift) 
901             if (tonos) 
902                 x = 0x038e;
903             else 
904                 if (dialitika) 
905                     x = 0x03ab;
906                 else 
907                     x = 0x03a5;
908         else 
909             if (tonos) 
910                 if (dialitika) 
911                     x = 0x03b0;
912                 else 
913                     x = 0x03cd;
914         
915             else 
916                 if (dialitika) 
917                     x = 0x03cb;
918                 else 
919                     x = 0x03c5;
920         break;
921     case 0x96:
922         if (shift) 
923             x = 0x03a6;
924         else 
925             x = 0x03c6;
926         break;
927     case 0x97:
928         if (shift) 
929             x = 0x03a7;
930         else 
931             x = 0x03c7;
932         break;
933     case 0x98:
934         if (shift) 
935             x = 0x03a8;
936         else 
937             x = 0x03c8;
938         
939         break;
940         
941     case 0x99:
942         if (shift) 
943             if (tonos) 
944                 x = 0x038f;
945             else 
946                 x = 0x03a9;
947         else 
948             if (tonos) 
949                 x = 0x03ce;
950             else 
951                 x = 0x03c9;
952         break;
953     default:
954         x = *inp;
955         break;
956     }
957     (*no_read)++;
958     
959     return x;
960 }
961
962 static size_t yaz_write_advancegreek(yaz_iconv_t cd, unsigned long x,
963                                      char **outbuf, size_t *outbytesleft)
964 {
965     size_t k = 0;
966     unsigned char *out = (unsigned char*) *outbuf;
967     if (*outbytesleft < 3)
968     {
969         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
970         return (size_t)(-1);
971     }
972     switch (x)
973     {
974     case 0x03ac : out[k++]=0x9d; out[k++]=0x81; break;
975     case 0x03ad : out[k++]=0x9d; out[k++]=0x85; break;
976     case 0x03ae : out[k++]=0x9d; out[k++]=0x87; break;
977     case 0x03af : out[k++]=0x9d; out[k++]=0x89; break;
978     case 0x03cc : out[k++]=0x9d; out[k++]=0x8f; break;
979     case 0x03cd : out[k++]=0x9d; out[k++]=0x95; break;
980     case 0x03ce : out[k++]=0x9d; out[k++]=0x99; break;
981     case 0x0390 : out[k++]=0x9d; out[k++]=0x9e; out[k++]=0x89; break;
982     case 0x03b0 : out[k++]=0x9d; out[k++]=0x9e; out[k++]=0x95; break;
983     case 0x0386 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x81; break;
984     case 0x0388 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x85; break;
985     case 0x0389 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x87; break;
986     case 0x038a : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x89; break;
987     case 0x038c : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x8f; break;
988     case 0x038e : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x95; break;
989     case 0x038f : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x99; break;
990     case 0x03ca : out[k++]=0x9e; out[k++]=0x89; break;
991     case 0x03cb : out[k++]=0x9e; out[k++]=0x95; break;
992     case 0x03aa : out[k++]=0x9e; out[k++]=0x9f; out[k++]=0x89; break;
993     case 0x03ab : out[k++]=0x9e; out[k++]=0x9f; out[k++]=0x95; break;
994     case 0x0391 : out[k++]=0x9f; out[k++]=0x81; break;
995     case 0x0392 : out[k++]=0x9f; out[k++]=0x82; break;
996     case 0x0393 : out[k++]=0x9f; out[k++]=0x83; break;
997     case 0x0394 : out[k++]=0x9f; out[k++]=0x84; break;
998     case 0x0395 : out[k++]=0x9f; out[k++]=0x85; break;
999     case 0x0396 : out[k++]=0x9f; out[k++]=0x86; break;
1000     case 0x0397 : out[k++]=0x9f; out[k++]=0x87; break;
1001     case 0x0398 : out[k++]=0x9f; out[k++]=0x88; break;
1002     case 0x0399 : out[k++]=0x9f; out[k++]=0x89; break;
1003     case 0x039a : out[k++]=0x9f; out[k++]=0x8a; break;
1004     case 0x039b : out[k++]=0x9f; out[k++]=0x8b; break;
1005     case 0x039c : out[k++]=0x9f; out[k++]=0x8c; break;
1006     case 0x039d : out[k++]=0x9f; out[k++]=0x8d; break;
1007     case 0x039e : out[k++]=0x9f; out[k++]=0x8e; break;
1008     case 0x039f : out[k++]=0x9f; out[k++]=0x8f; break;
1009     case 0x03a0 : out[k++]=0x9f; out[k++]=0x90; break;
1010     case 0x03a1 : out[k++]=0x9f; out[k++]=0x91; break;
1011     case 0x03a3 : out[k++]=0x9f; out[k++]=0x93; break;
1012     case 0x03a4 : out[k++]=0x9f; out[k++]=0x94; break;
1013     case 0x03a5 : out[k++]=0x9f; out[k++]=0x95; break;
1014     case 0x03a6 : out[k++]=0x9f; out[k++]=0x96; break;
1015     case 0x03a7 : out[k++]=0x9f; out[k++]=0x97; break;
1016     case 0x03a8 : out[k++]=0x9f; out[k++]=0x98; break;
1017     case 0x03a9 : out[k++]=0x9f; out[k++]=0x99; break;
1018     case 0x03b1 : out[k++]=0x81; break;
1019     case 0x03b2 : out[k++]=0x82; break;
1020     case 0x03b3 : out[k++]=0x83; break;
1021     case 0x03b4 : out[k++]=0x84; break;
1022     case 0x03b5 : out[k++]=0x85; break;
1023     case 0x03b6 : out[k++]=0x86; break;
1024     case 0x03b7 : out[k++]=0x87; break;
1025     case 0x03b8 : out[k++]=0x88; break;
1026     case 0x03b9 : out[k++]=0x89; break;
1027     case 0x03ba : out[k++]=0x8a; break;
1028     case 0x03bb : out[k++]=0x8b; break;
1029     case 0x03bc : out[k++]=0x8c; break;
1030     case 0x03bd : out[k++]=0x8d; break;
1031     case 0x03be : out[k++]=0x8e; break;
1032     case 0x03bf : out[k++]=0x8f; break;
1033     case 0x03c0 : out[k++]=0x90; break;
1034     case 0x03c1 : out[k++]=0x91; break;
1035     case 0x03c2 : out[k++]=0x92; break;
1036     case 0x03c3 : out[k++]=0x93; break;
1037     case 0x03c4 : out[k++]=0x94; break;
1038     case 0x03c5 : out[k++]=0x95; break;
1039     case 0x03c6 : out[k++]=0x96; break;
1040     case 0x03c7 : out[k++]=0x96; break;
1041     case 0x03c8 : out[k++]=0x98; break;
1042     case 0x03c9 : out[k++]=0x99; break;
1043     default:
1044         if (x > 255)
1045         {
1046             cd->my_errno = YAZ_ICONV_EILSEQ;
1047             return (size_t) -1;
1048         }
1049         out[k++] = x;
1050         break;
1051     }
1052     *outbytesleft -= k;
1053     (*outbuf) += k;
1054     return 0;
1055 }
1056
1057
1058 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
1059                                           size_t inbytesleft, size_t *no_read,
1060                                           int *comb);
1061
1062 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
1063                                      size_t inbytesleft, size_t *no_read)
1064 {
1065     unsigned long x;
1066     if (cd->comb_offset < cd->comb_size)
1067     {
1068         *no_read = cd->comb_no_read[cd->comb_offset];
1069         x = cd->comb_x[cd->comb_offset];
1070
1071         /* special case for double-diacritic combining characters, 
1072            INVERTED BREVE and DOUBLE TILDE.
1073            We'll increment the no_read counter by 1, since we want to skip over
1074            the processing of the closing ligature character
1075         */
1076         /* this code is no longer necessary.. our handlers code in
1077            yaz_marc8_?_conv (generated by charconv.tcl) now returns
1078            0 and no_read=1 when a sequence does not match the input.
1079            The SECOND HALFs in codetables.xml produces a non-existant
1080            entry in the conversion trie.. Hence when met, the input byte is
1081            skipped as it should (in yaz_iconv)
1082         */
1083 #if 0
1084         if (x == 0x0361 || x == 0x0360)
1085             *no_read += 1;
1086 #endif
1087         cd->comb_offset++;
1088         return x;
1089     }
1090
1091     cd->comb_offset = 0;
1092     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
1093     {
1094         int comb = 0;
1095
1096         if (inbytesleft == 0 && cd->comb_size)
1097         {
1098             cd->my_errno = YAZ_ICONV_EINVAL;
1099             x = 0;
1100             *no_read = 0;
1101             break;
1102         }
1103         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
1104         if (!comb || !x)
1105             break;
1106         cd->comb_x[cd->comb_size] = x;
1107         cd->comb_no_read[cd->comb_size] = *no_read;
1108         inp += *no_read;
1109         inbytesleft = inbytesleft - *no_read;
1110     }
1111     return x;
1112 }
1113
1114 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
1115                                      size_t inbytesleft, size_t *no_read)
1116 {
1117     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
1118     if (x && cd->comb_size == 1)
1119     {
1120         /* For MARC8s we try to get a Latin-1 page code out of it */
1121         int i;
1122         for (i = 0; latin1_comb[i].x1; i++)
1123             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
1124             {
1125                 *no_read += cd->comb_no_read[0];
1126                 cd->comb_size = 0;
1127                 x = latin1_comb[i].y;
1128                 break;
1129             }
1130     }
1131     return x;
1132 }
1133
1134 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
1135                                          size_t inbytesleft, size_t *no_read,
1136                                          int *comb)
1137 {
1138     *no_read = 0;
1139     while(inbytesleft >= 1 && inp[0] == 27)
1140     {
1141         size_t inbytesleft0 = inbytesleft;
1142         inp++;
1143         inbytesleft--;
1144         while(inbytesleft > 0 && strchr("(,$!)-", *inp))
1145         {
1146             inbytesleft--;
1147             inp++;
1148         }
1149         if (inbytesleft <= 0)
1150         {
1151             *no_read = 0;
1152             cd->my_errno = YAZ_ICONV_EINVAL;
1153             return 0;
1154         }
1155         cd->marc8_esc_mode = *inp++;
1156         inbytesleft--;
1157         (*no_read) += inbytesleft0 - inbytesleft;
1158     }
1159     if (inbytesleft <= 0)
1160         return 0;
1161     else
1162     {
1163         unsigned long x;
1164         size_t no_read_sub = 0;
1165         *comb = 0;
1166
1167         switch(cd->marc8_esc_mode)
1168         {
1169         case 'B':  /* Basic ASCII */
1170         case 'E':  /* ANSEL */
1171         case 's':  /* ASCII */
1172             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
1173             break;
1174         case 'g':  /* Greek */
1175             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
1176             break;
1177         case 'b':  /* Subscripts */
1178             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
1179             break;
1180         case 'p':  /* Superscripts */
1181             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
1182             break;
1183         case '2':  /* Basic Hebrew */
1184             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
1185             break;
1186         case 'N':  /* Basic Cyrillic */
1187         case 'Q':  /* Extended Cyrillic */
1188             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
1189             break;
1190         case '3':  /* Basic Arabic */
1191         case '4':  /* Extended Arabic */
1192             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
1193             break;
1194         case 'S':  /* Greek */
1195             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
1196             break;
1197         case '1':  /* Chinese, Japanese, Korean (EACC) */
1198             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
1199             break;
1200         default:
1201             *no_read = 0;
1202             cd->my_errno = YAZ_ICONV_EILSEQ;
1203             return 0;
1204         }
1205         *no_read += no_read_sub;
1206         return x;
1207     }
1208 }
1209
1210 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
1211                              char **outbuf, size_t *outbytesleft)
1212 {
1213     return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
1214 }
1215
/**
 * \brief Encodes one character as UTF-8.
 *
 * Sequences of 1 to 6 bytes are produced (the original UTF-8 scheme,
 * not limited to U+10FFFF). Values that need more than 31 bits are
 * not rejected; they are written as a 6-byte sequence with the lead
 * byte truncated to 8 bits.
 *
 * \param x character to encode
 * \param outbuf output position; advanced past the bytes written
 * \param outbytesleft remaining output room; reduced by bytes written
 * \param error set to YAZ_ICONV_E2BIG when the sequence does not fit;
 *  not touched otherwise
 * \return 0 on success, (size_t)(-1) if there is not enough room (in
 *  which case *outbuf and *outbytesleft are left unchanged)
 */
size_t yaz_write_UTF8_char(unsigned long x,
                           char **outbuf, size_t *outbytesleft,
                           int *error)
{
    /* marker bits of the lead byte, indexed by sequence length - 1 */
    static const unsigned char lead_mark[6] = {
        0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
    };
    unsigned char *outp = (unsigned char *) *outbuf;
    size_t len, i;

    if (x <= 0x7f)
        len = 1;
    else if (x <= 0x7ff)
        len = 2;
    else if (x <= 0xffff)
        len = 3;
    else if (x <= 0x1fffff)
        len = 4;
    else if (x <= 0x3ffffff)
        len = 5;
    else
        len = 6;

    if (*outbytesleft < len)
    {
        *error = YAZ_ICONV_E2BIG;  /* not room for output */
        return (size_t)(-1);
    }

    /* lead byte carries the top bits; each following byte 6 bits */
    *outp++ = (unsigned char) ((x >> (6 * (len - 1))) | lead_mark[len - 1]);
    for (i = len - 1; i > 0; i--)
        *outp++ = (unsigned char) (((x >> (6 * (i - 1))) & 0x3f) | 0x80);

    (*outbytesleft) -= len;
    *outbuf = (char *) outp;
    return 0;
}
1275
1276 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
1277                                    char **outbuf, size_t *outbytesleft)
1278 {
1279     /* list of two char unicode sequence that, when combined, are
1280        equivalent to single unicode chars that can be represented in
1281        ISO-8859-1/Latin-1.
1282        Regular iconv on Linux at least does not seem to convert these,
1283        but since MARC-8 to UTF-8 generates these composed sequence
1284        we get a better chance of a successful MARC-8 -> ISO-8859-1
1285        conversion */
1286     unsigned char *outp = (unsigned char *) *outbuf;
1287
1288     if (cd->compose_char)
1289     {
1290         int i;
1291         for (i = 0; latin1_comb[i].x1; i++)
1292             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
1293             {
1294                 x = latin1_comb[i].y;
1295                 break;
1296             }
1297         if (*outbytesleft < 1)
1298         {  /* no room. Retain compose_char and bail out */
1299             cd->my_errno = YAZ_ICONV_E2BIG;
1300             return (size_t)(-1);
1301         }
1302         if (!latin1_comb[i].x1) 
1303         {   /* not found. Just write compose_char */
1304             *outp++ = (unsigned char) cd->compose_char;
1305             (*outbytesleft)--;
1306             *outbuf = (char *) outp;
1307         }
1308         /* compose_char used so reset it. x now holds current char */
1309         cd->compose_char = 0;
1310     }
1311
1312     if (x > 32 && x < 127 && cd->compose_char == 0)
1313     {
1314         cd->compose_char = x;
1315         return 0;
1316     }
1317     else if (x > 255 || x < 1)
1318     {
1319         cd->my_errno = YAZ_ICONV_EILSEQ;
1320         return (size_t) -1;
1321     }
1322     else if (*outbytesleft < 1)
1323     {
1324         cd->my_errno = YAZ_ICONV_E2BIG;
1325         return (size_t)(-1);
1326     }
1327     *outp++ = (unsigned char) x;
1328     (*outbytesleft)--;
1329     *outbuf = (char *) outp;
1330     return 0;
1331 }
1332
1333 static size_t yaz_flush_ISO8859_1(yaz_iconv_t cd,
1334                                   char **outbuf, size_t *outbytesleft)
1335 {
1336     if (cd->compose_char)
1337     {
1338         unsigned char *outp = (unsigned char *) *outbuf;
1339         if (*outbytesleft < 1)
1340         {
1341             cd->my_errno = YAZ_ICONV_E2BIG;
1342             return (size_t)(-1);
1343         }
1344         *outp++ = (unsigned char) cd->compose_char;
1345         (*outbytesleft)--;
1346         *outbuf = (char *) outp;
1347         cd->compose_char = 0;
1348     }
1349     return 0;
1350 }
1351
1352 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
1353                               char **outbuf, size_t *outbytesleft)
1354 {
1355     unsigned char *outp = (unsigned char *) *outbuf;
1356     if (*outbytesleft >= 4)
1357     {
1358         *outp++ = (unsigned char) (x>>24);
1359         *outp++ = (unsigned char) (x>>16);
1360         *outp++ = (unsigned char) (x>>8);
1361         *outp++ = (unsigned char) x;
1362         (*outbytesleft) -= 4;
1363     }
1364     else
1365     {
1366         cd->my_errno = YAZ_ICONV_E2BIG;
1367         return (size_t)(-1);
1368     }
1369     *outbuf = (char *) outp;
1370     return 0;
1371 }
1372
1373 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
1374                                 char **outbuf, size_t *outbytesleft)
1375 {
1376     unsigned char *outp = (unsigned char *) *outbuf;
1377     if (*outbytesleft >= 4)
1378     {
1379         *outp++ = (unsigned char) x;
1380         *outp++ = (unsigned char) (x>>8);
1381         *outp++ = (unsigned char) (x>>16);
1382         *outp++ = (unsigned char) (x>>24);
1383         (*outbytesleft) -= 4;
1384     }
1385     else
1386     {
1387         cd->my_errno = YAZ_ICONV_E2BIG;
1388         return (size_t)(-1);
1389     }
1390     *outbuf = (char *) outp;
1391     return 0;
1392 }
1393
1394 static unsigned long lookup_marc8(yaz_iconv_t cd,
1395                                   unsigned long x, int *comb,
1396                                   const char **page_chr)
1397 {
1398     char utf8_buf[7];
1399     char *utf8_outbuf = utf8_buf;
1400     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
1401
1402     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft);
1403     if (r == (size_t)(-1))
1404     {
1405         cd->my_errno = YAZ_ICONV_EILSEQ;
1406         return 0;
1407     }
1408     else
1409     {
1410         unsigned char *inp;
1411         size_t inbytesleft, no_read_sub = 0;
1412         unsigned long x;
1413
1414         *utf8_outbuf = '\0';        
1415         inp = (unsigned char *) utf8_buf;
1416         inbytesleft = strlen(utf8_buf);
1417         
1418         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
1419         if (x)
1420         {
1421             *page_chr = "\033(B";
1422             return x;
1423         }
1424         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
1425         if (x)
1426         {
1427             *page_chr = "\033g";
1428             return x;
1429         }
1430         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
1431         if (x)
1432         {
1433             *page_chr = "\033b";
1434             return x;
1435         }
1436         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
1437         if (x)
1438         {
1439             *page_chr = "\033p";
1440             return x;
1441         }
1442         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
1443         if (x)
1444         {
1445             *page_chr = "\033(2";
1446             return x;
1447         }
1448         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
1449         if (x)
1450         {
1451             *page_chr = "\033(N";
1452             return x;
1453         }
1454         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
1455         if (x)
1456         {
1457             *page_chr = "\033(3";
1458             return x;
1459         }
1460         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
1461         if (x)
1462         {
1463             *page_chr = "\033(S";
1464             return x;
1465         }
1466         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
1467         if (x)
1468         {
1469             *page_chr = "\033$1";
1470             return x;
1471         }
1472         cd->my_errno = YAZ_ICONV_EILSEQ;
1473         return x;
1474     }
1475 }
1476
1477 static size_t flush_combos(yaz_iconv_t cd,
1478                            char **outbuf, size_t *outbytesleft)
1479 {
1480     unsigned long y = cd->write_marc8_last;
1481     unsigned char byte;
1482     char out_buf[10];
1483     size_t i, out_no = 0;
1484
1485     if (!y)
1486         return 0;
1487
1488     byte = (unsigned char )((y>>16) & 0xff);
1489     if (byte)
1490         out_buf[out_no++] = byte;
1491     byte = (unsigned char)((y>>8) & 0xff);
1492     if (byte)
1493         out_buf[out_no++] = byte;
1494     byte = (unsigned char )(y & 0xff);
1495     if (byte)
1496         out_buf[out_no++] = byte;
1497
1498     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
1499     {
1500         cd->my_errno = YAZ_ICONV_E2BIG;
1501         return (size_t) (-1);
1502     }
1503
1504     for (i = 0; i < cd->write_marc8_comb_no; i++)
1505     {
1506         /* all MARC-8 combined characters are simple bytes */
1507         byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
1508         *(*outbuf)++ = byte;
1509         (*outbytesleft)--;
1510     }
1511     memcpy(*outbuf, out_buf, out_no);
1512     *outbuf += out_no;
1513     (*outbytesleft) -= out_no;
1514     if (cd->write_marc8_second_half_char)
1515     {
1516         *(*outbuf)++ = cd->write_marc8_second_half_char;
1517         (*outbytesleft)--;
1518     }        
1519
1520     cd->write_marc8_last = 0;
1521     cd->write_marc8_comb_no = 0;
1522     cd->write_marc8_second_half_char = 0;
1523     return 0;
1524 }
1525
1526 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
1527                                        char **outbuf, size_t *outbytesleft,
1528                                        const char *page_chr)
1529 {
1530     const char *old_page_chr = cd->write_marc8_page_chr;
1531     if (strcmp(page_chr, old_page_chr))
1532     {
1533         size_t plen = 0;
1534         const char *page_out = page_chr;
1535         
1536         if (*outbytesleft < 8)
1537         {
1538             cd->my_errno = YAZ_ICONV_E2BIG;
1539             
1540             return (size_t) (-1);
1541         }
1542         cd->write_marc8_page_chr = page_chr;
1543         
1544         if (!strcmp(old_page_chr, "\033p") 
1545             || !strcmp(old_page_chr, "\033g")
1546             || !strcmp(old_page_chr, "\033b"))
1547         {
1548             /* Technique 1 leave */
1549             page_out = "\033s";
1550             if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
1551             {
1552                 /* Must leave script + enter new page */
1553                 plen = strlen(page_out);
1554                 memcpy(*outbuf, page_out, plen);
1555                 (*outbuf) += plen;
1556                 (*outbytesleft) -= plen;
1557                 page_out = page_chr;
1558             }
1559         }
1560         plen = strlen(page_out);
1561         memcpy(*outbuf, page_out, plen);
1562         (*outbuf) += plen;
1563         (*outbytesleft) -= plen;
1564     }
1565     return 0;
1566 }
1567
1568
1569 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
1570                                 char **outbuf, size_t *outbytesleft)
1571 {
1572     int comb = 0;
1573     const char *page_chr = 0;
1574     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
1575
1576     if (!y)
1577         return (size_t) (-1);
1578
1579     if (comb)
1580     {
1581         if (x == 0x0361)
1582             cd->write_marc8_second_half_char = 0xEC;
1583         else if (x == 0x0360)
1584             cd->write_marc8_second_half_char = 0xFB;
1585
1586         if (cd->write_marc8_comb_no < 6)
1587             cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
1588     }
1589     else
1590     {
1591         size_t r = flush_combos(cd, outbuf, outbytesleft);
1592         if (r)
1593             return r;
1594
1595         r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, page_chr);
1596         if (r)
1597             return r;
1598         cd->write_marc8_last = y;
1599     }
1600     return 0;
1601 }
1602
1603 static size_t yaz_flush_marc8(yaz_iconv_t cd,
1604                               char **outbuf, size_t *outbytesleft)
1605 {
1606     size_t r = flush_combos(cd, outbuf, outbytesleft);
1607     if (r)
1608         return r;
1609     return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, "\033(B");
1610 }
1611
1612 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
1613                               char **outbuf, size_t *outbytesleft)
1614 {
1615     int i;
1616     for (i = 0; latin1_comb[i].x1; i++)
1617     {
1618         if (x == latin1_comb[i].y)
1619         {
1620             size_t r ;
1621             /* save the output pointers .. */
1622             char *outbuf0 = *outbuf;
1623             size_t outbytesleft0 = *outbytesleft;
1624             int last_ch = cd->write_marc8_last;
1625
1626             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
1627                                   outbuf, outbytesleft);
1628             if (r)
1629                 return r;
1630             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
1631                                   outbuf, outbytesleft);
1632             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
1633             {
1634                 /* not enough room. reset output to original values */
1635                 *outbuf = outbuf0;
1636                 *outbytesleft = outbytesleft0;
1637                 cd->write_marc8_last = last_ch;
1638             }
1639             return r;
1640         }
1641     }
1642     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft);
1643 }
1644
1645
1646 #if HAVE_WCHAR_H
1647 static size_t yaz_write_wchar_t(yaz_iconv_t cd, unsigned long x,
1648                                 char **outbuf, size_t *outbytesleft)
1649 {
1650     unsigned char *outp = (unsigned char *) *outbuf;
1651
1652     if (*outbytesleft >= sizeof(wchar_t))
1653     {
1654         wchar_t wch = x;
1655         memcpy(outp, &wch, sizeof(wch));
1656         outp += sizeof(wch);
1657         (*outbytesleft) -= sizeof(wch);
1658     }
1659     else
1660     {
1661         cd->my_errno = YAZ_ICONV_E2BIG;
1662         return (size_t)(-1);
1663     }
1664     *outbuf = (char *) outp;
1665     return 0;
1666 }
1667 #endif
1668
1669 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
1670 {
1671     return cd->read_handle && cd->write_handle;
1672 }
1673
1674 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
1675 {
1676     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
1677
1678     cd->write_handle = 0;
1679     cd->read_handle = 0;
1680     cd->init_handle = 0;
1681     cd->flush_handle = 0;
1682     cd->my_errno = YAZ_ICONV_UNKNOWN;
1683
1684     /* a useful hack: if fromcode has leading @,
1685        the library not use YAZ's own conversions .. */
1686     if (fromcode[0] == '@')
1687         fromcode++;
1688     else
1689     {
1690         if (!yaz_matchstr(fromcode, "UTF8"))
1691         {
1692             cd->read_handle = yaz_read_UTF8;
1693             cd->init_handle = yaz_init_UTF8;
1694         }
1695         else if (!yaz_matchstr(fromcode, "ISO88591"))
1696             cd->read_handle = yaz_read_ISO8859_1;
1697         else if (!yaz_matchstr(fromcode, "UCS4"))
1698             cd->read_handle = yaz_read_UCS4;
1699         else if (!yaz_matchstr(fromcode, "UCS4LE"))
1700             cd->read_handle = yaz_read_UCS4LE;
1701         else if (!yaz_matchstr(fromcode, "MARC8"))
1702             cd->read_handle = yaz_read_marc8;
1703         else if (!yaz_matchstr(fromcode, "MARC8s"))
1704             cd->read_handle = yaz_read_marc8s;
1705         else if (!yaz_matchstr(fromcode, "advancegreek"))
1706             cd->read_handle = yaz_read_advancegreek;
1707         else if (!yaz_matchstr(fromcode, "iso54281984"))
1708             cd->read_handle = yaz_read_iso5428_1984;
1709 #if HAVE_WCHAR_H
1710         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
1711             cd->read_handle = yaz_read_wchar_t;
1712 #endif
1713         
1714         if (!yaz_matchstr(tocode, "UTF8"))
1715             cd->write_handle = yaz_write_UTF8;
1716         else if (!yaz_matchstr(tocode, "ISO88591"))
1717         {
1718             cd->write_handle = yaz_write_ISO8859_1;
1719             cd->flush_handle = yaz_flush_ISO8859_1;
1720         }
1721         else if (!yaz_matchstr (tocode, "UCS4"))
1722             cd->write_handle = yaz_write_UCS4;
1723         else if (!yaz_matchstr(tocode, "UCS4LE"))
1724             cd->write_handle = yaz_write_UCS4LE;
1725         else if (!yaz_matchstr(tocode, "MARC8"))
1726         {
1727             cd->write_handle = yaz_write_marc8;
1728             cd->flush_handle = yaz_flush_marc8;
1729         }
1730         else if (!yaz_matchstr(tocode, "MARC8s"))
1731         {
1732             cd->write_handle = yaz_write_marc8;
1733             cd->flush_handle = yaz_flush_marc8;
1734         }
1735         else if (!yaz_matchstr(tocode, "advancegreek"))
1736         {
1737             cd->write_handle = yaz_write_advancegreek;
1738         }
1739         else if (!yaz_matchstr(tocode, "iso54281984"))
1740         {
1741             cd->write_handle = yaz_write_iso5428_1984;
1742         }
1743 #if HAVE_WCHAR_H
1744         else if (!yaz_matchstr(tocode, "WCHAR_T"))
1745             cd->write_handle = yaz_write_wchar_t;
1746 #endif
1747     }
1748 #if HAVE_ICONV_H
1749     cd->iconv_cd = 0;
1750     if (!cd->read_handle || !cd->write_handle)
1751     {
1752         cd->iconv_cd = iconv_open (tocode, fromcode);
1753         if (cd->iconv_cd == (iconv_t) (-1))
1754         {
1755             xfree (cd);
1756             return 0;
1757         }
1758     }
1759 #else
1760     if (!cd->read_handle || !cd->write_handle)
1761     {
1762         xfree (cd);
1763         return 0;
1764     }
1765 #endif
1766     cd->init_flag = 1;
1767     return cd;
1768 }
1769
1770 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1771                  char **outbuf, size_t *outbytesleft)
1772 {
1773     char *inbuf0 = 0;
1774     size_t r = 0;
1775
1776 #if HAVE_ICONV_H
1777     if (cd->iconv_cd)
1778     {
1779         size_t r =
1780             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1781         if (r == (size_t)(-1))
1782         {
1783             switch (yaz_errno())
1784             {
1785             case E2BIG:
1786                 cd->my_errno = YAZ_ICONV_E2BIG;
1787                 break;
1788             case EINVAL:
1789                 cd->my_errno = YAZ_ICONV_EINVAL;
1790                 break;
1791             case EILSEQ:
1792                 cd->my_errno = YAZ_ICONV_EILSEQ;
1793                 break;
1794             default:
1795                 cd->my_errno = YAZ_ICONV_UNKNOWN;
1796             }
1797         }
1798         return r;
1799     }
1800 #endif
1801
1802     if (inbuf)
1803         inbuf0 = *inbuf;
1804
1805     if (cd->init_flag)
1806     {
1807         cd->my_errno = YAZ_ICONV_UNKNOWN;
1808         cd->marc8_esc_mode = 'B';
1809         
1810         cd->comb_offset = cd->comb_size = 0;
1811         cd->compose_char = 0;
1812         
1813         cd->write_marc8_comb_no = 0;
1814         cd->write_marc8_second_half_char = 0;
1815         cd->write_marc8_last = 0;
1816         cd->write_marc8_page_chr = "\033(B";
1817         
1818         cd->unget_x = 0;
1819         cd->no_read_x = 0;
1820     }
1821
1822     if (cd->init_flag)
1823     {
1824         if (cd->init_handle && inbuf && *inbuf)
1825         {
1826             size_t no_read = 0;
1827             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1828                                          *inbytesleft, &no_read);
1829             if (r)
1830             {
1831                 if (cd->my_errno == YAZ_ICONV_EINVAL)
1832                     return r;
1833                 cd->init_flag = 0;
1834                 return r;
1835             }
1836             *inbytesleft -= no_read;
1837             *inbuf += no_read;
1838         }
1839     }
1840     cd->init_flag = 0;
1841
1842     if (!inbuf || !*inbuf)
1843     {
1844         if (outbuf && *outbuf)
1845         {
1846             if (cd->unget_x)
1847                 r = (*cd->write_handle)(cd, cd->unget_x, outbuf, outbytesleft);
1848             if (cd->flush_handle)
1849                 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1850         }
1851         if (r == 0)
1852             cd->init_flag = 1;
1853         cd->unget_x = 0;
1854         return r;
1855     }
1856     while (1)
1857     {
1858         unsigned long x;
1859         size_t no_read;
1860
1861         if (cd->unget_x)
1862         {
1863             x = cd->unget_x;
1864             no_read = cd->no_read_x;
1865         }
1866         else
1867         {
1868             if (*inbytesleft == 0)
1869             {
1870                 r = *inbuf - inbuf0;
1871                 break;
1872             }
1873             x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1874                                    &no_read);
1875             if (no_read == 0)
1876             {
1877                 r = (size_t)(-1);
1878                 break;
1879             }
1880         }
1881         if (x)
1882         {
1883             r = (*cd->write_handle)(cd, x, outbuf, outbytesleft);
1884             if (r)
1885             {
1886                 /* unable to write it. save it because read_handle cannot
1887                    rewind .. */
1888                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1889                 {
1890                     cd->unget_x = x;
1891                     cd->no_read_x = no_read;
1892                     break;
1893                 }
1894             }
1895             cd->unget_x = 0;
1896         }
1897         *inbytesleft -= no_read;
1898         (*inbuf) += no_read;
1899     }
1900     return r;
1901 }
1902
1903 int yaz_iconv_error (yaz_iconv_t cd)
1904 {
1905     return cd->my_errno;
1906 }
1907
1908 int yaz_iconv_close (yaz_iconv_t cd)
1909 {
1910 #if HAVE_ICONV_H
1911     if (cd->iconv_cd)
1912         iconv_close (cd->iconv_cd);
1913 #endif
1914     xfree (cd);
1915     return 0;
1916 }
1917
1918 /*
1919  * Local variables:
1920  * c-basic-offset: 4
1921  * indent-tabs-mode: nil
1922  * End:
1923  * vim: shiftwidth=4 tabstop=8 expandtab
1924  */