2 * Copyright (C) 1995-2006, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.25 2006-08-24 10:01:03 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
32 #include <yaz/yaz-util.h>
/* MARC-8 to Unicode converters, one per MARC-8 character page (1..9).
   yaz_read_marc8_comb picks one according to the active escape mode; a
   comment in yaz_read_marc8 describes them as generated by charconv.tcl.
   Each receives the input bytes and the number available and stores the
   number of bytes it consumed in *no_read.  *combining is presumably set
   when the result is a combining character -- the definitions live outside
   this file, so confirm the exact flag values there. */
34 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
35 size_t *no_read, int *combining);
36 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
37 size_t *no_read, int *combining);
38 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
39 size_t *no_read, int *combining);
40 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
41 size_t *no_read, int *combining);
42 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
43 size_t *no_read, int *combining);
44 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
45 size_t *no_read, int *combining);
46 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
47 size_t *no_read, int *combining);
48 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
49 size_t *no_read, int *combining);
50 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
51 size_t *no_read, int *combining);
/* Reverse converters with the same calling convention.  lookup_marc8 hands
   them a UTF-8 encoded character and tries them in order 1..9. */
54 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
55 size_t *no_read, int *combining);
56 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
57 size_t *no_read, int *combining);
58 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
59 size_t *no_read, int *combining);
60 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
61 size_t *no_read, int *combining);
62 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
63 size_t *no_read, int *combining);
64 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
65 size_t *no_read, int *combining);
66 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
67 size_t *no_read, int *combining);
68 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
69 size_t *no_read, int *combining);
70 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
71 size_t *no_read, int *combining);
/* State behind a yaz_iconv_t descriptor: the handlers chosen by
   yaz_iconv_open plus the scratch state that the built-in MARC-8 and
   ISO-8859-1 converters keep between calls.  Only part of the definition
   is visible in this excerpt. */
73 struct yaz_iconv_struct {
/* Optional hook that yaz_iconv runs on the input buffer when it is set;
   yaz_iconv_open installs yaz_init_UTF8 here for UTF-8 input. */
76 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
77 size_t inbytesleft, size_t *no_read);
/* Decodes one character from inbuf; returns it and stores the number of
   input bytes consumed in *no_read. */
78 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
79 size_t inbytesleft, size_t *no_read);
/* Encodes character x at *outbuf, updating *outbuf and *outbytesleft. */
80 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
81 char **outbuf, size_t *outbytesleft,
/* Characters gathered by yaz_read_marc8 (at most 8) together with the
   number of input bytes each one consumed. */
87 unsigned long comb_x[8];
88 size_t comb_no_read[8];
/* NOTE(review): not referenced in the visible lines; presumably the
   character kept when a write fails with E2BIG -- confirm. */
90 unsigned long unget_x;
/* Character held back by yaz_write_ISO8859_1 until it is known whether a
   combining character follows; 0 when nothing is pending. */
94 unsigned long compose_char;
/* MARC-8 writer state: buffered combining characters and their count, the
   last character not yet flushed, and the page escape sequence currently
   in effect (yaz_iconv_open starts with "\033(B"). */
96 unsigned long write_marc8_comb_ch[8];
97 size_t write_marc8_comb_no;
98 unsigned long write_marc8_last;
99 const char *write_marc8_page_chr;
/* latin1_comb: composition table for ISO-8859-1.  In each entry x1 is the
   base character, x2 the Unicode combining mark and the third value (read
   as .y by the users of the table) the precomposed Latin-1 code.  The
   lookups in this file iterate while latin1_comb[i].x1 is non-zero, so the
   table is expected to end with an entry whose x1 is 0 (not visible in
   this excerpt). */
103 unsigned long x1, x2;
106 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
107 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
108 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
109 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
110 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
111 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
112 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
113 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
114 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
115 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
116 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
117 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
118 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
119 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
120 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
121 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
122 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
123 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
124 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
125 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
126 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
127 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
128 /* omitted: 0xd7 MULTIPLICATION SIGN */
129 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
130 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
131 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
132 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
133 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
134 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
135 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
136 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
137 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
138 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
139 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
140 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
141 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
142 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
143 /* omitted: 0xe6 LATIN SMALL LETTER AE */
144 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
145 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
146 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
147 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
148 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
149 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
150 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
151 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
152 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
153 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
154 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
155 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
156 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
157 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
158 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
159 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
160 /* omitted: 0xf7 DIVISION SIGN */
161 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
162 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
163 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
164 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
165 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
166 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
167 /* omitted: 0xfe LATIN SMALL LETTER THORN */
168 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* Read handler for ISO-8859-1 input: the character is simply the value of
   the first input byte. */
173 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
174 size_t inbytesleft, size_t *no_read)
176 unsigned long x = inp[0];
/* Init handler that yaz_iconv_open installs for UTF-8 input; yaz_iconv
   invokes it through cd->init_handle and afterwards reduces *inbytesleft
   by the count returned in *no_read.  It checks for a leading UTF-8 byte
   order mark.  The visible error path stores YAZ_ICONV_EINVAL in
   cd->my_errno. */
181 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
182 size_t inbytesleft, size_t *no_read)
191 cd->my_errno = YAZ_ICONV_EINVAL;
/* The UTF-8 BOM is the byte sequence EF BB BF, so the second byte must
   EQUAL 0xbb.  The former test used != here, which never matched a real
   BOM and instead matched EF xx BF for every other xx. */
194 if (inp[1] == 0xbb && inp[2] == 0xbf)
/* Decodes one UTF-8 sequence starting at inp and returns the character.
   Sequences of up to six bytes are handled, selected by the value of the
   lead byte; each branch also requires that enough input bytes are left.
   Failures are reported through *error: YAZ_ICONV_EILSEQ in the branches
   below and YAZ_ICONV_EINVAL at the end of the chain.
   NOTE(review): the visible expressions mask continuation bytes with 0x3f;
   no check that they have the form 10xxxxxx is visible in this excerpt. */
201 unsigned long yaz_read_UTF8_char(unsigned char *inp,
202 size_t inbytesleft, size_t *no_read,
/* A lead byte of at most 0xbf that reaches this branch, or 0xfe/0xff,
   cannot start a sequence. */
212 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
215 *error = YAZ_ICONV_EILSEQ;
/* Two bytes: 5 payload bits from the lead byte, 6 from the next byte. */
217 else if (inp[0] <= 0xdf && inbytesleft >= 2)
219 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
225 *error = YAZ_ICONV_EILSEQ;
/* Three bytes: 4 + 6 + 6 payload bits. */
228 else if (inp[0] <= 0xef && inbytesleft >= 3)
230 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
237 *error = YAZ_ICONV_EILSEQ;
/* Four bytes: 3 + 6 + 6 + 6 payload bits. */
240 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
242 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
243 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
249 *error = YAZ_ICONV_EILSEQ;
/* Five bytes: 2 payload bits in the lead byte. */
252 else if (inp[0] <= 0xfb && inbytesleft >= 5)
254 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
255 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
262 *error = YAZ_ICONV_EILSEQ;
/* Six bytes: 1 payload bit in the lead byte. */
265 else if (inp[0] <= 0xfd && inbytesleft >= 6)
267 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
268 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
269 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
275 *error = YAZ_ICONV_EILSEQ;
/* Presumably the final else of the chain, i.e. a valid lead byte without
   enough input bytes behind it -- confirm against the full source. */
281 *error = YAZ_ICONV_EINVAL;
/* Read handler for UTF-8 input: forwards to yaz_read_UTF8_char and lets it
   record errors directly in cd->my_errno. */
286 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
287 size_t inbytesleft, size_t *no_read)
289 return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
/* Read handler for big-endian UCS-4 input: assembles one character from
   four bytes, most significant byte first.  Incomplete input is reported
   as YAZ_ICONV_EINVAL in cd->my_errno. */
292 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
293 size_t inbytesleft, size_t *no_read)
299 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* inp[0] is promoted to int, so shifting a value >= 0x80 left by 24
   overflows a 32-bit int (undefined behaviour; in practice the result is
   sign-extended when unsigned long is 64 bits wide).  Widen before
   shifting; the remaining terms cannot overflow a 32-bit int. */
304 x = ((unsigned long) inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/* Read handler for little-endian UCS-4 input: assembles one character from
   four bytes, least significant byte first.  Incomplete input is reported
   as YAZ_ICONV_EINVAL in cd->my_errno. */
310 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
311 size_t inbytesleft, size_t *no_read)
317 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* inp[3] is promoted to int, so shifting a value >= 0x80 left by 24
   overflows a 32-bit int (undefined behaviour; in practice the result is
   sign-extended when unsigned long is 64 bits wide).  Widen before
   shifting; the remaining terms cannot overflow a 32-bit int. */
322 x = ((unsigned long) inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* Read handler for wchar_t input: consumes sizeof(wchar_t) bytes per
   character.  Fewer bytes than that is reported as YAZ_ICONV_EINVAL. */
329 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
330 size_t inbytesleft, size_t *no_read)
334 if (inbytesleft < sizeof(wchar_t))
336 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* memcpy is used instead of a pointer cast, so inp need not be aligned
   for wchar_t. */
342 memcpy (&wch, inp, sizeof(wch));
344 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8 calls this helper, which is defined
   further down in the file. */
351 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
352 size_t inbytesleft, size_t *no_read,
/* Read handler for MARC-8 input.
   While cd->comb_offset is below cd->comb_size an entry queued by an
   earlier call is handed out: the character comes from cd->comb_x and the
   byte count from cd->comb_no_read.  Otherwise the loop further down
   gathers up to 8 results of yaz_read_marc8_comb into those two arrays,
   reducing inbytesleft by the bytes consumed for each one.  The conditions
   that end that loop early are not visible in this excerpt. */
355 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
356 size_t inbytesleft, size_t *no_read)
359 if (cd->comb_offset < cd->comb_size)
361 *no_read = cd->comb_no_read[cd->comb_offset];
362 x = cd->comb_x[cd->comb_offset];
364 /* special case for double-diacritic combining characters,
365 INVERTED BREVE and DOUBLE TILDE.
366 We'll increment the no_read counter by 1, since we want to skip over
367 the processing of the closing ligature character
369 /* this code is no longer necessary.. our handlers code in
370 yaz_marc8_?_conv (generated by charconv.tcl) now returns
371 0 and no_read=1 when a sequence does not match the input.
372 The SECOND HALFs in codetables.xml produces a nonexistent
373 entry in the conversion trie.. Hence when met, the input byte is
374 skipped as it should (in yaz_iconv)
377 if (x == 0x0361 || x == 0x0360)
385 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
388 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
391 cd->comb_x[cd->comb_size] = x;
392 cd->comb_no_read[cd->comb_size] = *no_read;
394 inbytesleft = inbytesleft - *no_read;
/* Read handler for "MARC8s" input: reads via yaz_read_marc8 and then tries
   to fold a base character plus a single queued combining character into
   the equivalent precomposed Latin-1 character from latin1_comb. */
399 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
400 size_t inbytesleft, size_t *no_read)
402 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
403 if (x && cd->comb_size == 1)
405 /* For MARC8s we try to get a Latin-1 page code out of it */
407 for (i = 0; latin1_comb[i].x1; i++)
/* Match when the queued entry is the table's combining mark (x2) and the
   character just read is its base character (x1). */
408 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
/* The bytes of the queued entry are consumed as well. */
410 *no_read += cd->comb_no_read[0];
412 x = latin1_comb[i].y;
/* Reads one MARC-8 character.
   Leading escape sequences (ESC = 27, optional intermediate characters
   from "(,$!", then one final character) are consumed first; the final
   character is stored in cd->marc8_esc_mode and the bytes used are added
   to *no_read.  The character itself is then decoded by the page converter
   selected by cd->marc8_esc_mode.  An escape sequence cut short by the end
   of input gives YAZ_ICONV_EINVAL; YAZ_ICONV_EILSEQ is set after the case
   labels, presumably in the switch's default branch. */
419 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
420 size_t inbytesleft, size_t *no_read,
424 while(inbytesleft >= 1 && inp[0] == 27)
426 size_t inbytesleft0 = inbytesleft;
/* strchr() also finds the string's terminating NUL, so without the
   explicit *inp test a 0x00 byte would be skipped as if it were an
   intermediate character. */
429 while(inbytesleft > 0 && *inp && strchr("(,$!", *inp))
434 if (inbytesleft <= 0)
437 cd->my_errno = YAZ_ICONV_EINVAL;
440 cd->marc8_esc_mode = *inp++;
442 (*no_read) += inbytesleft0 - inbytesleft;
444 if (inbytesleft <= 0)
449 size_t no_read_sub = 0;
452 switch(cd->marc8_esc_mode)
454 case 'B': /* Basic ASCII */
455 case 'E': /* ANSEL */
456 case 's': /* ASCII */
457 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
459 case 'g': /* Greek */
460 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
462 case 'b': /* Subscripts */
463 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
465 case 'p': /* Superscripts */
466 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
468 case '2': /* Basic Hebrew */
469 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
471 case 'N': /* Basic Cyrillic */
472 case 'Q': /* Extended Cyrillic */
473 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
475 case '3': /* Basic Arabic */
476 case '4': /* Extended Arabic */
477 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
479 case 'S': /* Greek */
480 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
482 case '1': /* Chinese, Japanese, Korean (EACC) */
483 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
487 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Bytes used by the page converter come on top of the escape bytes. */
490 *no_read += no_read_sub;
/* Write handler for UTF-8 output: forwards to yaz_write_UTF8_char and lets
   it record errors directly in cd->my_errno. */
495 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
496 char **outbuf, size_t *outbytesleft,
499 return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
/* Encodes character x as UTF-8 at *outbuf, using the shortest of the one-
   to six-byte forms whose range contains x, then advances *outbuf and
   reduces *outbytesleft.  When no branch has room, *error is set to
   YAZ_ICONV_E2BIG.
   NOTE(review): the six-byte branch does not bound x, so a value above
   0x7fffffff (possible where unsigned long is 64 bits) yields a lead byte
   outside 0xfc..0xfd. */
502 size_t yaz_write_UTF8_char(unsigned long x,
503 char **outbuf, size_t *outbytesleft,
506 unsigned char *outp = (unsigned char *) *outbuf;
/* 7 bits: plain ASCII byte. */
508 if (x <= 0x7f && *outbytesleft >= 1)
510 *outp++ = (unsigned char) x;
/* 11 bits: 110xxxxx 10xxxxxx. */
513 else if (x <= 0x7ff && *outbytesleft >= 2)
515 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
516 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
517 (*outbytesleft) -= 2;
/* 16 bits: 1110xxxx followed by two continuation bytes. */
519 else if (x <= 0xffff && *outbytesleft >= 3)
521 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
522 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
523 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
524 (*outbytesleft) -= 3;
/* 21 bits: 11110xxx followed by three continuation bytes. */
526 else if (x <= 0x1fffff && *outbytesleft >= 4)
528 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
529 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
530 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
531 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
532 (*outbytesleft) -= 4;
/* 26 bits: 111110xx followed by four continuation bytes. */
534 else if (x <= 0x3ffffff && *outbytesleft >= 5)
536 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
537 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
538 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
539 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
540 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
541 (*outbytesleft) -= 5;
/* Everything larger: 1111110x followed by five continuation bytes. */
543 else if (*outbytesleft >= 6)
545 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
546 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
547 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
548 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
549 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
550 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
551 (*outbytesleft) -= 6;
555 *error = YAZ_ICONV_E2BIG; /* not room for output */
/* Hand the advanced write position back to the caller. */
558 *outbuf = (char *) outp;
/* Write handler for ISO-8859-1 output.
   A character that might be a base letter is held back in
   cd->compose_char; when the next character arrives the pair is looked up
   in latin1_comb and, if found, replaced by the precomposed Latin-1
   character.  Characters outside 1..255 give YAZ_ICONV_EILSEQ and a full
   output buffer gives YAZ_ICONV_E2BIG. */
563 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
564 char **outbuf, size_t *outbytesleft,
567 /* list of two-character Unicode sequences that, when combined, are
568 equivalent to single Unicode characters that can be represented in
570 Regular iconv on Linux at least does not seem to convert these,
571 but since MARC-8 to UTF-8 generates these composed sequences
572 we get a better chance of a successful MARC-8 -> ISO-8859-1
574 unsigned char *outp = (unsigned char *) *outbuf;
576 if (cd->compose_char)
579 for (i = 0; latin1_comb[i].x1; i++)
580 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
582 x = latin1_comb[i].y;
585 if (*outbytesleft < 1)
586 { /* no room. Retain compose_char and bail out */
587 cd->my_errno = YAZ_ICONV_E2BIG;
/* The search loop stops either on a match or on the terminating entry
   whose x1 is 0; the latter means the pair cannot be composed. */
590 if (!latin1_comb[i].x1)
591 { /* not found. Just write compose_char */
592 *outp++ = (unsigned char) cd->compose_char;
594 *outbuf = (char *) outp;
596 /* compose_char used so reset it. x now holds current char */
597 cd->compose_char = 0;
/* Printable ASCII (33..126) that is not the last character of the input
   is held back in case a combining character follows. */
600 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
602 cd->compose_char = x;
/* Only 1..255 can be stored in a single ISO-8859-1 byte. */
605 else if (x > 255 || x < 1)
607 cd->my_errno = YAZ_ICONV_EILSEQ;
610 else if (*outbytesleft < 1)
612 cd->my_errno = YAZ_ICONV_E2BIG;
615 *outp++ = (unsigned char) x;
617 *outbuf = (char *) outp;
/* Write handler for big-endian UCS-4 output: stores x as four bytes, most
   significant first.  Fewer than four free output bytes gives
   YAZ_ICONV_E2BIG. */
622 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
623 char **outbuf, size_t *outbytesleft,
626 unsigned char *outp = (unsigned char *) *outbuf;
627 if (*outbytesleft >= 4)
629 *outp++ = (unsigned char) (x>>24);
630 *outp++ = (unsigned char) (x>>16);
631 *outp++ = (unsigned char) (x>>8);
632 *outp++ = (unsigned char) x;
633 (*outbytesleft) -= 4;
637 cd->my_errno = YAZ_ICONV_E2BIG;
/* Hand the advanced write position back to the caller. */
640 *outbuf = (char *) outp;
/* Write handler for little-endian UCS-4 output: stores x as four bytes,
   least significant first.  Fewer than four free output bytes gives
   YAZ_ICONV_E2BIG. */
644 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
645 char **outbuf, size_t *outbytesleft,
648 unsigned char *outp = (unsigned char *) *outbuf;
649 if (*outbytesleft >= 4)
651 *outp++ = (unsigned char) x;
652 *outp++ = (unsigned char) (x>>8);
653 *outp++ = (unsigned char) (x>>16);
654 *outp++ = (unsigned char) (x>>24);
655 (*outbytesleft) -= 4;
659 cd->my_errno = YAZ_ICONV_E2BIG;
/* Hand the advanced write position back to the caller. */
662 *outbuf = (char *) outp;
/* Maps Unicode character x to MARC-8.
   x is first encoded as UTF-8 into a local buffer, because the reverse
   converters yaz_marc8r_N_conv take UTF-8 bytes as input.  They are tried
   in order 1..9 and *page_chr receives the escape sequence of a MARC-8
   page.  *comb is filled in by the converter.  YAZ_ICONV_EILSEQ is stored
   in cd->my_errno when the UTF-8 encoding fails and again after the last
   converter, presumably when none of them matched. */
666 static unsigned long lookup_marc8(yaz_iconv_t cd,
667 unsigned long x, int *comb,
668 const char **page_chr)
671 char *utf8_outbuf = utf8_buf;
/* One byte is kept in reserve; strlen() is applied to the buffer below, so
   it has to be NUL terminated (the terminating store is not visible in
   this excerpt). */
672 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
674 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
675 if (r == (size_t)(-1))
677 cd->my_errno = YAZ_ICONV_EILSEQ;
683 size_t inbytesleft, no_read_sub = 0;
687 inp = (unsigned char *) utf8_buf;
688 inbytesleft = strlen(utf8_buf);
690 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
693 *page_chr = "\033(B";
696 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
702 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
708 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
714 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
717 *page_chr = "\033(2";
720 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
723 *page_chr = "\033(N";
726 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
729 *page_chr = "\033(3";
732 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
735 *page_chr = "\033(S";
738 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
741 *page_chr = "\033(1";
744 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Writes the pending MARC-8 output: the buffered combining characters in
   cd->write_marc8_comb_ch and the bytes of the last character,
   cd->write_marc8_last.  If the output buffer is too small nothing is
   written and YAZ_ICONV_E2BIG is returned through cd->my_errno with a
   result of (size_t) -1.  On success both pieces of pending state are
   cleared. */
749 static size_t flush_combos(yaz_iconv_t cd,
750 char **outbuf, size_t *outbytesleft)
752 unsigned long y = cd->write_marc8_last;
753 unsigned char byte, second_half = 0;
755 size_t i, out_no = 0;
/* The last character can occupy up to three bytes of y; they are taken
   from the most significant byte down. */
760 byte = (unsigned char )((y>>16) & 0xff);
762 out_buf[out_no++] = byte;
763 byte = (unsigned char)((y>>8) & 0xff);
765 out_buf[out_no++] = byte;
766 byte = (unsigned char )(y & 0xff);
768 out_buf[out_no++] = byte;
/* Room is needed for the character bytes, one byte per combining
   character and one extra byte (presumably for second_half). */
770 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
772 cd->my_errno = YAZ_ICONV_E2BIG;
773 return (size_t) (-1);
776 for (i = 0; i < cd->write_marc8_comb_no; i++)
778 /* all MARC-8 combined characters are simple bytes */
779 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
782 else if (byte == 0xFA)
788 memcpy(*outbuf, out_buf, out_no);
790 (*outbytesleft) -= out_no;
793 *(*outbuf)++ = second_half;
/* Everything pending has been emitted. */
797 cd->write_marc8_last = 0;
798 cd->write_marc8_comb_no = 0;
/* Writes one Unicode character x as MARC-8.
   lookup_marc8 supplies the MARC-8 value, the combining flag and the page
   escape sequence.  Values can be buffered in cd->write_marc8_comb_ch;
   otherwise pending output is flushed with flush_combos, a page switch is
   emitted when the page differs from cd->write_marc8_page_chr, and the
   value is remembered in cd->write_marc8_last.  The conditions that select
   between these paths are not visible in this excerpt. */
802 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
803 char **outbuf, size_t *outbytesleft,
807 const char *page_chr = 0;
808 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
811 return (size_t) (-1);
/* NOTE(review): the limit here is 6 although the array has 8 slots. */
815 if (cd->write_marc8_comb_no < 6)
816 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
820 size_t r = flush_combos(cd, outbuf, outbytesleft);
/* Emit the escape sequence only when the page actually changes. */
823 if (strcmp(page_chr, cd->write_marc8_page_chr))
825 size_t plen = strlen(page_chr);
827 if (*outbytesleft < plen)
829 cd->my_errno = YAZ_ICONV_E2BIG;
830 return (size_t) (-1);
832 memcpy(*outbuf, page_chr, plen);
834 (*outbytesleft) -= plen;
835 cd->write_marc8_page_chr = page_chr;
837 cd->write_marc8_last = y;
841 size_t r = flush_combos(cd, outbuf, outbytesleft);
845 cd->write_marc8_comb_no--;
847 cd->write_marc8_last = 0;
/* Write handler for MARC-8 output.
   A character that equals a precomposed Latin-1 value in latin1_comb is
   written as its two parts: the base character x1 first, then the
   combining character x2.  If that fails with YAZ_ICONV_E2BIG the saved
   output position and last character are restored.  Every other character
   goes straight to yaz_write_marc8_2. */
854 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
855 char **outbuf, size_t *outbytesleft,
859 for (i = 0; latin1_comb[i].x1; i++)
861 if (x == latin1_comb[i].y)
864 /* save the output pointers .. */
865 char *outbuf0 = *outbuf;
866 size_t outbytesleft0 = *outbytesleft;
/* Same type as cd->write_marc8_last so the value saved here is restored
   unchanged (it used to be narrowed to int). */
867 unsigned long last_ch = cd->write_marc8_last;
869 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
870 outbuf, outbytesleft, 0);
873 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
874 outbuf, outbytesleft, last);
875 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
877 /* not enough room. reset output to original values */
879 *outbytesleft = outbytesleft0;
880 cd->write_marc8_last = last_ch;
885 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
/* Write handler for wchar_t output: stores one wchar_t per character.
   Fewer than sizeof(wchar_t) free output bytes gives YAZ_ICONV_E2BIG. */
890 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
891 char **outbuf, size_t *outbytesleft,
894 unsigned char *outp = (unsigned char *) *outbuf;
896 if (*outbytesleft >= sizeof(wchar_t))
/* memcpy is used instead of a pointer cast, so the output position need
   not be aligned for wchar_t. */
899 memcpy(outp, &wch, sizeof(wch));
901 (*outbytesleft) -= sizeof(wch);
905 cd->my_errno = YAZ_ICONV_E2BIG;
908 *outbuf = (char *) outp;
/* Returns non-zero when both a built-in read handler and a built-in write
   handler are set on cd. */
913 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
915 return cd->read_handle && cd->write_handle;
/* Creates a conversion descriptor for converting fromcode to tocode.
   The descriptor is allocated with xmalloc and its state is reset.
   Built-in read and write handlers are then selected by name.  If either
   one is missing, the system iconv_open() is tried with the same names.
   What happens when that fails is not visible in this excerpt. */
918 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
920 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
922 cd->write_handle = 0;
925 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* MARC-8 reading starts in escape mode 'B' (Basic ASCII) and MARC-8
   writing on the page selected by "\033(B". */
926 cd->marc8_esc_mode = 'B';
927 cd->comb_offset = cd->comb_size = 0;
928 cd->compose_char = 0;
930 cd->write_marc8_comb_no = 0;
931 cd->write_marc8_last = 0;
932 cd->write_marc8_page_chr = "\033(B";
934 /* a useful hack: if fromcode has leading @,
935 the library will not use YAZ's own conversions .. */
936 if (fromcode[0] == '@')
/* NOTE(review): "MARC8" is tested before "MARC8s"; whether the second
   test can ever be reached depends on how yaz_matchstr compares --
   confirm. */
940 if (!yaz_matchstr(fromcode, "UTF8"))
942 cd->read_handle = yaz_read_UTF8;
943 cd->init_handle = yaz_init_UTF8;
945 else if (!yaz_matchstr(fromcode, "ISO88591"))
946 cd->read_handle = yaz_read_ISO8859_1;
947 else if (!yaz_matchstr(fromcode, "UCS4"))
948 cd->read_handle = yaz_read_UCS4;
949 else if (!yaz_matchstr(fromcode, "UCS4LE"))
950 cd->read_handle = yaz_read_UCS4LE;
951 else if (!yaz_matchstr(fromcode, "MARC8"))
952 cd->read_handle = yaz_read_marc8;
953 else if (!yaz_matchstr(fromcode, "MARC8s"))
954 cd->read_handle = yaz_read_marc8s;
956 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
957 cd->read_handle = yaz_read_wchar_t;
/* On the output side "MARC8" and "MARC8s" select the same writer. */
960 if (!yaz_matchstr(tocode, "UTF8"))
961 cd->write_handle = yaz_write_UTF8;
962 else if (!yaz_matchstr(tocode, "ISO88591"))
963 cd->write_handle = yaz_write_ISO8859_1;
964 else if (!yaz_matchstr (tocode, "UCS4"))
965 cd->write_handle = yaz_write_UCS4;
966 else if (!yaz_matchstr(tocode, "UCS4LE"))
967 cd->write_handle = yaz_write_UCS4LE;
968 else if (!yaz_matchstr(tocode, "MARC8"))
969 cd->write_handle = yaz_write_marc8;
970 else if (!yaz_matchstr(tocode, "MARC8s"))
971 cd->write_handle = yaz_write_marc8;
973 else if (!yaz_matchstr(tocode, "WCHAR_T"))
974 cd->write_handle = yaz_write_wchar_t;
/* Without a complete built-in pair, fall back to the system iconv. */
979 if (!cd->read_handle || !cd->write_handle)
981 cd->iconv_cd = iconv_open (tocode, fromcode);
982 if (cd->iconv_cd == (iconv_t) (-1))
989 if (!cd->read_handle || !cd->write_handle)
/* Converts input at *inbuf into output at *outbuf, with an interface
   modelled on iconv().  Two paths are visible.  One hands the call to the
   system iconv() and translates its error code into a YAZ_ICONV_* value in
   cd->my_errno.  The other is the built-in loop: cd->read_handle decodes a
   character and cd->write_handle encodes it, with a last argument of 1
   when the character is the final one of the input.  After a successful
   write the input position advances by the bytes read.  When the write
   fails with YAZ_ICONV_E2BIG the byte count is remembered in
   cd->no_read_x. */
999 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1000 char **outbuf, size_t *outbytesleft)
1009 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1010 if (r == (size_t)(-1))
1012 switch (yaz_errno())
1015 cd->my_errno = YAZ_ICONV_E2BIG;
1018 cd->my_errno = YAZ_ICONV_EINVAL;
1021 cd->my_errno = YAZ_ICONV_EILSEQ;
1024 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* A call without an input buffer puts the error state back to
   YAZ_ICONV_UNKNOWN, the value yaz_iconv_open starts with. */
1030 if (inbuf == 0 || *inbuf == 0)
1033 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* Give the optional init handler a look at the start of the input; the
   bytes it reports in no_read are skipped. */
1040 if (cd->init_handle)
1043 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1044 *inbytesleft, &no_read);
1047 if (cd->my_errno == YAZ_ICONV_EINVAL)
1052 *inbytesleft -= no_read;
/* All input consumed: the result is the distance the input pointer
   moved. */
1064 if (*inbytesleft == 0)
1066 r = *inbuf - inbuf0;
1071 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1082 no_read = cd->no_read_x;
1086 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1087 (*inbytesleft - no_read) == 0 ? 1 : 0);
1090 /* unable to write it. save it because read_handle cannot
1092 if (cd->my_errno == YAZ_ICONV_E2BIG)
1095 cd->no_read_x = no_read;
1101 *inbytesleft -= no_read;
1102 (*inbuf) += no_read;
/* Returns the YAZ_ICONV_* error code last stored in cd->my_errno. */
1107 int yaz_iconv_error (yaz_iconv_t cd)
1109 return cd->my_errno;
/* Releases a descriptor created by yaz_iconv_open.  The visible line
   closes the system iconv descriptor; the condition guarding it and the
   release of cd itself are not visible in this excerpt. */
1112 int yaz_iconv_close (yaz_iconv_t cd)
1116 iconv_close (cd->iconv_cd);
1125 * indent-tabs-mode: nil
1127 * vim: shiftwidth=4 tabstop=8 expandtab