2 * Copyright (C) 1995-2006, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.26 2006-08-27 19:04:03 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
37 #include <yaz/yaz-util.h>
/* Forward declarations of the nine MARC-8 page decoders (named as
 * yaz_marc8_?_conv elsewhere in this file, where they are described as
 * generated by charconv.tcl).  yaz_read_marc8_comb calls exactly one of
 * them, chosen by the current escape mode.  Each receives the input bytes
 * and the number of bytes available, reports the bytes consumed through
 * no_read and fills in *combining.
 * NOTE(review): *combining presumably flags a combining character -
 * confirm against the generated tables. */
39 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
40 size_t *no_read, int *combining);
41 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining);
43 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
44 size_t *no_read, int *combining);
45 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
46 size_t *no_read, int *combining);
47 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
48 size_t *no_read, int *combining);
49 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
50 size_t *no_read, int *combining);
51 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
52 size_t *no_read, int *combining);
53 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
54 size_t *no_read, int *combining);
55 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
56 size_t *no_read, int *combining);
/* Forward declarations of the nine reverse lookups used when writing
 * MARC-8.  lookup_marc8 feeds each of them the UTF-8 encoding of a single
 * character, trying them in numeric order, and records an escape sequence
 * for the table that produced the result. */
59 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
60 size_t *no_read, int *combining);
61 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
62 size_t *no_read, int *combining);
63 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
64 size_t *no_read, int *combining);
65 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
66 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
68 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
70 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
72 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
74 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
76 size_t *no_read, int *combining);
/* Conversion descriptor: the decoder/encoder callbacks chosen by
 * yaz_iconv_open plus the state they keep between calls.  Only some of
 * the members are visible in this listing. */
78 struct yaz_iconv_struct {
/* optional hook run by yaz_iconv before decoding (set for UTF-8 input) */
81 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82 size_t inbytesleft, size_t *no_read);
/* decodes one character from inbuf and reports the bytes it consumed */
83 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84 size_t inbytesleft, size_t *no_read);
/* encodes one character x into *outbuf */
85 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86 char **outbuf, size_t *outbytesleft,
/* characters queued by yaz_read_marc8 and the input bytes each one used */
92 unsigned long comb_x[8];
93 size_t comb_no_read[8];
/* NOTE(review): presumably the character kept back when a write fails
 * with E2BIG - the assignment is not visible here, confirm */
95 unsigned long unget_x;
/* base character held back by yaz_write_ISO8859_1 while it waits to see
 * whether the next character combines with it; 0 when nothing is held */
99 unsigned long compose_char;
/* MARC-8 writer state: pending combining characters and their count,
 * the pending base character, and the escape sequence of the page
 * currently selected in the output */
101 unsigned long write_marc8_comb_ch[8];
102 size_t write_marc8_comb_no;
103 unsigned long write_marc8_last;
104 const char *write_marc8_page_chr;
/* Composition table for Latin-1.  Each entry is { x1, x2, y }: the base
 * character x1 combined with the Unicode combining mark x2 corresponds to
 * the single ISO-8859-1 code y.  The loops that scan this table stop at
 * the first entry whose x1 is zero; that terminating entry and the
 * declaration of y are not visible in this listing. */
108 unsigned long x1, x2;
111 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
112 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
113 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
114 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
115 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
116 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
117 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
118 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
119 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
120 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
121 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
122 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
123 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
124 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
125 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
126 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
127 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
128 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
129 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
130 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
131 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
132 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
133 /* omitted: 0xd7 MULTIPLICATION SIGN */
134 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
135 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
136 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
137 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
138 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
139 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
140 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
141 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
142 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
143 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
144 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
145 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
146 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
147 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
148 /* omitted: 0xe6 LATIN SMALL LETTER AE */
149 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
150 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
151 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
152 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
153 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
154 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
155 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
156 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
157 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
158 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
159 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
160 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
161 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
162 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
163 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
164 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
165 /* omitted: 0xf7 DIVISION SIGN */
166 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
167 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
168 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
169 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
170 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
171 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
172 /* omitted: 0xfe LATIN SMALL LETTER THORN */
173 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* Read handler for ISO-8859-1 input: the character value is taken
 * directly from the first input byte.
 * NOTE(review): the lines that set *no_read and return are not visible
 * in this listing. */
178 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
179 size_t inbytesleft, size_t *no_read)
181 unsigned long x = inp[0];
/* Init handler for UTF-8 input; installed by yaz_iconv_open together with
 * yaz_read_UTF8 and run by yaz_iconv through cd->init_handle.  It examines
 * the start of the input for a byte order mark and reports, through
 * *no_read, how many leading bytes yaz_iconv should skip.  EINVAL is
 * stored in cd->my_errno on one of the paths.
 * NOTE(review): the check of inp[0], the length test guarding the EINVAL
 * assignment and both branch bodies are not visible in this listing. */
186 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
187 size_t inbytesleft, size_t *no_read)
196 cd->my_errno = YAZ_ICONV_EINVAL;
/* The UTF-8 byte order mark is EF BB BF, so the second and third bytes
 * must BOTH match.  The previous test (inp[1] != 0xbb && inp[2] == 0xbf)
 * was false for every genuine BOM and true for some non-BOM input. */
199 if (inp[1] == 0xbb && inp[2] == 0xbf)
/* Decode one UTF-8 sequence starting at inp.
 *
 * The branches are selected by the lead byte and by how many bytes are
 * available: two-byte forms for lead bytes up to 0xdf, then three, four,
 * five and six-byte forms for lead bytes up to 0xef, 0xf7, 0xfb and 0xfd.
 * A lead byte in the continuation range (up to 0xbf) or of 0xfe/0xff
 * stores YAZ_ICONV_EILSEQ in *error.  Each multi-byte branch can also
 * store EILSEQ (the tests guarding those assignments are not visible in
 * this listing).  When no branch applies - a valid lead byte without
 * enough following bytes - the final assignment stores YAZ_ICONV_EINVAL.
 *
 * inp         input bytes
 * inbytesleft number of bytes available at inp
 * no_read     receives the number of bytes consumed
 * error       receives a YAZ_ICONV_* code on failure
 *
 * NOTE(review): continuation bytes are masked with 0x3f but nothing in
 * the visible lines checks that they have the 10xxxxxx form. */
206 unsigned long yaz_read_UTF8_char(unsigned char *inp,
207 size_t inbytesleft, size_t *no_read,
217 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
220 *error = YAZ_ICONV_EILSEQ;
222 else if (inp[0] <= 0xdf && inbytesleft >= 2)
224 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
230 *error = YAZ_ICONV_EILSEQ;
233 else if (inp[0] <= 0xef && inbytesleft >= 3)
235 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
242 *error = YAZ_ICONV_EILSEQ;
245 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
247 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
248 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
254 *error = YAZ_ICONV_EILSEQ;
257 else if (inp[0] <= 0xfb && inbytesleft >= 5)
259 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
260 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
267 *error = YAZ_ICONV_EILSEQ;
270 else if (inp[0] <= 0xfd && inbytesleft >= 6)
272 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
273 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
274 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
280 *error = YAZ_ICONV_EILSEQ;
/* fall-through case: no branch above accepted the input */
286 *error = YAZ_ICONV_EINVAL;
/* Read handler for UTF-8 input: delegates to yaz_read_UTF8_char and lets
 * it report errors directly into cd->my_errno. */
291 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
292 size_t inbytesleft, size_t *no_read)
294 return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
/* Read handler for big-endian UCS-4: assembles one character from four
 * input bytes, most significant byte first.  YAZ_ICONV_EINVAL is stored
 * in cd->my_errno for incomplete input (the length test guarding that
 * assignment is not visible in this listing). */
297 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
298 size_t inbytesleft, size_t *no_read)
304 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Widen the top byte before shifting: inp[0] would otherwise be promoted
 * to int, and shifting a byte >= 0x80 left by 24 overflows a 32-bit int
 * (undefined behavior; in practice the value sign-extends when stored in
 * a 64-bit unsigned long). */
309 x = ((unsigned long) inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/* Read handler for little-endian UCS-4: assembles one character from
 * four input bytes, least significant byte first.  YAZ_ICONV_EINVAL is
 * stored in cd->my_errno for incomplete input (the length test guarding
 * that assignment is not visible in this listing). */
315 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
316 size_t inbytesleft, size_t *no_read)
322 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Widen the top byte before shifting: inp[3] would otherwise be promoted
 * to int, and shifting a byte >= 0x80 left by 24 overflows a 32-bit int
 * (undefined behavior; in practice the value sign-extends when stored in
 * a 64-bit unsigned long). */
327 x = ((unsigned long) inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* Read handler for input made of native wchar_t values.  Fewer than
 * sizeof(wchar_t) bytes is reported as YAZ_ICONV_EINVAL; otherwise one
 * wchar_t is copied out of the input with memcpy (so inp needs no
 * particular alignment) and *no_read is set to its size. */
334 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
335 size_t inbytesleft, size_t *no_read)
339 if (inbytesleft < sizeof(wchar_t))
341 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
347 memcpy (&wch, inp, sizeof(wch));
349 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8 below calls this helper, which is
 * defined after yaz_read_marc8s. */
356 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
357 size_t inbytesleft, size_t *no_read,
/* Read handler for MARC-8 input.
 *
 * Characters already queued in cd->comb_x are handed out first, together
 * with their recorded byte counts from cd->comb_no_read.  Otherwise the
 * function reads ahead with yaz_read_marc8_comb, queueing up to eight
 * characters and the number of input bytes each one consumed.
 * NOTE(review): the loop exit, the handling of the comb flag and the
 * return statements are not visible in this listing. */
360 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
361 size_t inbytesleft, size_t *no_read)
364 if (cd->comb_offset < cd->comb_size)
366 *no_read = cd->comb_no_read[cd->comb_offset];
367 x = cd->comb_x[cd->comb_offset];
369 /* special case for double-diacritic combining characters,
370 INVERTED BREVE and DOUBLE TILDE.
371 We'll increment the no_read counter by 1, since we want to skip over
372 the processing of the closing ligature character
374 /* this code is no longer necessary.. our handlers code in
375 yaz_marc8_?_conv (generated by charconv.tcl) now returns
376 0 and no_read=1 when a sequence does not match the input.
377 The SECOND HALFs in codetables.xml produces a non-existant
378 entry in the conversion trie.. Hence when met, the input byte is
379 skipped as it should (in yaz_iconv)
382 if (x == 0x0361 || x == 0x0360)
390 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
393 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
396 cd->comb_x[cd->comb_size] = x;
397 cd->comb_no_read[cd->comb_size] = *no_read;
399 inbytesleft = inbytesleft - *no_read;
/* Read handler for the "MARC8s" variant.  It reads like yaz_read_marc8,
 * but when exactly one character is queued (comb_size == 1) and the pair
 * (returned character, queued character) matches a latin1_comb entry as
 * (x1, x2), it returns the precomposed Latin-1 code y instead and adds
 * the queued character's byte count to *no_read.
 * NOTE(review): the lines that mark the queued character as consumed are
 * not visible in this listing. */
404 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
405 size_t inbytesleft, size_t *no_read)
407 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
408 if (x && cd->comb_size == 1)
410 /* For MARC8s we try to get a Latin-1 page code out of it */
412 for (i = 0; latin1_comb[i].x1; i++)
413 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
415 *no_read += cd->comb_no_read[0];
417 x = latin1_comb[i].y;
/* Decode one MARC-8 character, processing any escape sequences that
 * precede it.
 *
 * While the input starts with ESC (27), the bytes '(' ',' '$' '!' that
 * follow are skipped and the next byte is stored as the new
 * cd->marc8_esc_mode; the consumed bytes are added to *no_read.  Running
 * out of input inside an escape sequence stores YAZ_ICONV_EINVAL.  The
 * character itself is decoded by the yaz_marc8_N_conv table selected by
 * the current mode, and an unknown mode stores YAZ_ICONV_EILSEQ. */
424 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
425 size_t inbytesleft, size_t *no_read,
429 while(inbytesleft >= 1 && inp[0] == 27)
431 size_t inbytesleft0 = inbytesleft;
/* NOTE(review): strchr also matches the terminating NUL of its first
 * argument, so a 0 byte in the input is treated like one of "(,$!" */
434 while(inbytesleft > 0 && strchr("(,$!", *inp))
439 if (inbytesleft <= 0)
442 cd->my_errno = YAZ_ICONV_EINVAL;
445 cd->marc8_esc_mode = *inp++;
447 (*no_read) += inbytesleft0 - inbytesleft;
449 if (inbytesleft <= 0)
454 size_t no_read_sub = 0;
/* pick the decoder table for the character set currently selected */
457 switch(cd->marc8_esc_mode)
459 case 'B': /* Basic ASCII */
460 case 'E': /* ANSEL */
461 case 's': /* ASCII */
462 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
464 case 'g': /* Greek */
465 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
467 case 'b': /* Subscripts */
468 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
470 case 'p': /* Superscripts */
471 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
473 case '2': /* Basic Hebrew */
474 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
476 case 'N': /* Basic Cyrillic */
477 case 'Q': /* Extended Cyrillic */
478 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
480 case '3': /* Basic Arabic */
481 case '4': /* Extended Arabic */
482 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
484 case 'S': /* Greek */
485 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
487 case '1': /* Chinese, Japanese, Korean (EACC) */
488 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
492 cd->my_errno = YAZ_ICONV_EILSEQ;
/* bytes used by the character are added to those used by the escapes */
495 *no_read += no_read_sub;
/* Write handler for UTF-8 output: delegates to yaz_write_UTF8_char and
 * lets it report errors directly into cd->my_errno. */
500 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
501 char **outbuf, size_t *outbytesleft,
504 return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
/* Encode the character x as UTF-8 at *outbuf.
 *
 * The shortest form that can hold x is used: one byte up to 0x7f, two up
 * to 0x7ff, three up to 0xffff, four up to 0x1fffff, five up to 0x3ffffff
 * and six bytes for anything larger.  The multi-byte branches decrease
 * *outbytesleft by the number of bytes written, and *outbuf is moved past
 * the written bytes.  If the output space is too small for x,
 * YAZ_ICONV_E2BIG is stored in *error.
 *
 * NOTE(review): each branch also requires enough output space, so a
 * character whose natural length does not fit falls through to the
 * longer forms; with enough room left for one of those it would be
 * written in an overlong encoding.  Confirm whether callers can reach
 * this case. */
507 size_t yaz_write_UTF8_char(unsigned long x,
508 char **outbuf, size_t *outbytesleft,
511 unsigned char *outp = (unsigned char *) *outbuf;
513 if (x <= 0x7f && *outbytesleft >= 1)
515 *outp++ = (unsigned char) x;
518 else if (x <= 0x7ff && *outbytesleft >= 2)
520 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
521 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
522 (*outbytesleft) -= 2;
524 else if (x <= 0xffff && *outbytesleft >= 3)
526 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
527 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
528 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
529 (*outbytesleft) -= 3;
531 else if (x <= 0x1fffff && *outbytesleft >= 4)
533 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
534 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
535 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
536 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
537 (*outbytesleft) -= 4;
539 else if (x <= 0x3ffffff && *outbytesleft >= 5)
541 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
542 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
543 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
544 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
545 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
546 (*outbytesleft) -= 5;
548 else if (*outbytesleft >= 6)
550 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
551 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
552 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
553 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
554 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
555 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
556 (*outbytesleft) -= 6;
560 *error = YAZ_ICONV_E2BIG; /* no room for output */
563 *outbuf = (char *) outp;
/* Write handler for ISO-8859-1 output, with composition of a base letter
 * and a following combining mark.
 *
 * A printable ASCII character (33..126) that is not the last character of
 * the input is not written at once; it is kept in cd->compose_char.  On
 * the next call the pair (compose_char, x) is looked up in latin1_comb;
 * on a match x becomes the precomposed Latin-1 code, otherwise the held
 * character is written out by itself.  Characters above 255 or equal to 0
 * store YAZ_ICONV_EILSEQ, and a full output buffer stores
 * YAZ_ICONV_E2BIG. */
568 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
569 char **outbuf, size_t *outbytesleft,
572 /* list of two char unicode sequence that, when combined, are
573 equivalent to single unicode chars that can be represented in
575 Regular iconv on Linux at least does not seem to convert these,
576 but since MARC-8 to UTF-8 generates these composed sequence
577 we get a better chance of a successful MARC-8 -> ISO-8859-1
579 unsigned char *outp = (unsigned char *) *outbuf;
581 if (cd->compose_char)
584 for (i = 0; latin1_comb[i].x1; i++)
585 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
587 x = latin1_comb[i].y;
590 if (*outbytesleft < 1)
591 { /* no room. Retain compose_char and bail out */
592 cd->my_errno = YAZ_ICONV_E2BIG;
595 if (!latin1_comb[i].x1)
596 { /* not found. Just write compose_char */
597 *outp++ = (unsigned char) cd->compose_char;
599 *outbuf = (char *) outp;
601 /* compose_char used so reset it. x now holds current char */
602 cd->compose_char = 0;
/* hold a possible base letter back until the next character is known;
 * never done for the last character, which has no successor */
605 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
607 cd->compose_char = x;
610 else if (x > 255 || x < 1)
612 cd->my_errno = YAZ_ICONV_EILSEQ;
615 else if (*outbytesleft < 1)
617 cd->my_errno = YAZ_ICONV_E2BIG;
620 *outp++ = (unsigned char) x;
622 *outbuf = (char *) outp;
/* Write handler for big-endian UCS-4: stores x as four bytes, most
 * significant first, and decreases *outbytesleft by 4.  With fewer than
 * four bytes of output space YAZ_ICONV_E2BIG is stored in cd->my_errno. */
627 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
628 char **outbuf, size_t *outbytesleft,
631 unsigned char *outp = (unsigned char *) *outbuf;
632 if (*outbytesleft >= 4)
634 *outp++ = (unsigned char) (x>>24);
635 *outp++ = (unsigned char) (x>>16);
636 *outp++ = (unsigned char) (x>>8);
637 *outp++ = (unsigned char) x;
638 (*outbytesleft) -= 4;
642 cd->my_errno = YAZ_ICONV_E2BIG;
645 *outbuf = (char *) outp;
/* Write handler for little-endian UCS-4: stores x as four bytes, least
 * significant first, and decreases *outbytesleft by 4.  With fewer than
 * four bytes of output space YAZ_ICONV_E2BIG is stored in cd->my_errno. */
649 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
650 char **outbuf, size_t *outbytesleft,
653 unsigned char *outp = (unsigned char *) *outbuf;
654 if (*outbytesleft >= 4)
656 *outp++ = (unsigned char) x;
657 *outp++ = (unsigned char) (x>>8);
658 *outp++ = (unsigned char) (x>>16);
659 *outp++ = (unsigned char) (x>>24);
660 (*outbytesleft) -= 4;
664 cd->my_errno = YAZ_ICONV_E2BIG;
667 *outbuf = (char *) outp;
/* Map the character x to its MARC-8 code.
 *
 * x is first encoded as UTF-8 into a local buffer; failure of that step
 * stores YAZ_ICONV_EILSEQ.  The UTF-8 bytes are then offered to the
 * reverse tables yaz_marc8r_1_conv .. yaz_marc8r_9_conv in order, and
 * *page_chr is set to an escape sequence for the table concerned.  When
 * the end of the chain is reached YAZ_ICONV_EILSEQ is stored.
 *
 * comb     passed on to the reverse tables
 * page_chr receives the escape sequence
 *
 * NOTE(review): the tests between the table calls, the termination of
 * the buffer before strlen and several of the page_chr assignments are
 * not visible in this listing. */
671 static unsigned long lookup_marc8(yaz_iconv_t cd,
672 unsigned long x, int *comb,
673 const char **page_chr)
676 char *utf8_outbuf = utf8_buf;
/* one byte of the buffer is kept out of the space offered to the encoder */
677 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
679 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
680 if (r == (size_t)(-1))
682 cd->my_errno = YAZ_ICONV_EILSEQ;
688 size_t inbytesleft, no_read_sub = 0;
692 inp = (unsigned char *) utf8_buf;
693 inbytesleft = strlen(utf8_buf);
695 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
698 *page_chr = "\033(B";
701 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
707 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
713 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
719 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
722 *page_chr = "\033(2";
725 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
728 *page_chr = "\033(N";
731 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
734 *page_chr = "\033(3";
737 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
740 *page_chr = "\033(S";
743 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
746 *page_chr = "\033(1";
749 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Emit the pending MARC-8 base character together with the combining
 * characters collected for it, then clear the pending state.
 *
 * The base character cd->write_marc8_last is split into up to three bytes
 * (bits 16-23, 8-15 and 0-7) collected in out_buf.  Before anything is
 * written the function checks that these bytes, all pending combining
 * characters and one extra byte fit in *outbytesleft; otherwise it stores
 * YAZ_ICONV_E2BIG and returns (size_t) -1.  The combining characters are
 * handled in the loop, the base bytes are then copied with memcpy, and a
 * "second half" byte may be appended.
 *
 * NOTE(review): the tests that skip zero bytes of the base character, the
 * write of each combining byte and the assignments to second_half are not
 * visible in this listing. */
754 static size_t flush_combos(yaz_iconv_t cd,
755 char **outbuf, size_t *outbytesleft)
757 unsigned long y = cd->write_marc8_last;
758 unsigned char byte, second_half = 0;
760 size_t i, out_no = 0;
765 byte = (unsigned char )((y>>16) & 0xff);
767 out_buf[out_no++] = byte;
768 byte = (unsigned char)((y>>8) & 0xff);
770 out_buf[out_no++] = byte;
771 byte = (unsigned char )(y & 0xff);
773 out_buf[out_no++] = byte;
/* the "+ 1" reserves room for a possible second-half byte */
775 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
777 cd->my_errno = YAZ_ICONV_E2BIG;
778 return (size_t) (-1);
781 for (i = 0; i < cd->write_marc8_comb_no; i++)
783 /* all MARC-8 combined characters are simple bytes */
784 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
787 else if (byte == 0xFA)
793 memcpy(*outbuf, out_buf, out_no);
795 (*outbytesleft) -= out_no;
798 *(*outbuf)++ = second_half;
802 cd->write_marc8_last = 0;
803 cd->write_marc8_comb_no = 0;
/* Core MARC-8 writer for a single character x.
 *
 * x is translated with lookup_marc8.  A combining character is appended
 * to cd->write_marc8_comb_ch (only while fewer than six are pending).
 * For any other character the pending output is flushed with
 * flush_combos, an escape sequence is emitted if the page differs from
 * the one currently selected (at least eight bytes of output space are
 * required, else YAZ_ICONV_E2BIG), and the character becomes the new
 * pending base character cd->write_marc8_last.
 *
 * NOTE(review): the conditions selecting these paths, the values
 * assigned to page_out and the handling of the "last" flag are not fully
 * visible in this listing. */
807 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
808 char **outbuf, size_t *outbytesleft,
812 const char *page_chr = 0;
813 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
816 return (size_t) (-1);
820 if (cd->write_marc8_comb_no < 6)
821 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
825 size_t r = flush_combos(cd, outbuf, outbytesleft);
826 const char *old_page_chr = cd->write_marc8_page_chr;
/* a page change is needed only when the escape sequences differ */
829 if (strcmp(page_chr, old_page_chr))
832 const char *page_out = page_chr;
834 if (*outbytesleft < 8)
836 cd->my_errno = YAZ_ICONV_E2BIG;
838 return (size_t) (-1);
840 cd->write_marc8_page_chr = page_chr;
842 if (!strcmp(old_page_chr, "\033p")
843 || !strcmp(old_page_chr, "\033g")
844 || !strcmp(old_page_chr, "\033b"))
846 /* Technique 1 leave */
848 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
850 /* Must leave script + enter new page */
851 plen = strlen(page_out);
852 memcpy(*outbuf, page_out, plen);
854 (*outbytesleft) -= plen;
858 plen = strlen(page_out);
859 memcpy(*outbuf, page_out, plen);
861 (*outbytesleft) -= plen;
863 cd->write_marc8_last = y;
867 size_t r = flush_combos(cd, outbuf, outbytesleft);
871 cd->write_marc8_comb_no--;
873 cd->write_marc8_last = 0;
/* Write handler for MARC-8 output.
 *
 * A character that equals the precomposed code y of a latin1_comb entry
 * is written as two characters, the base letter x1 followed by the
 * combining mark x2.  If the second write fails with YAZ_ICONV_E2BIG the
 * saved output position and pending character are restored so that the
 * pair is not left half written.  Every other character goes straight to
 * yaz_write_marc8_2. */
880 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
881 char **outbuf, size_t *outbytesleft,
885 for (i = 0; latin1_comb[i].x1; i++)
887 if (x == latin1_comb[i].y)
890 /* save the output pointers .. */
891 char *outbuf0 = *outbuf;
892 size_t outbytesleft0 = *outbytesleft;
/* NOTE(review): write_marc8_last is an unsigned long but is saved in an
 * int here */
893 int last_ch = cd->write_marc8_last;
895 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
896 outbuf, outbytesleft, 0);
899 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
900 outbuf, outbytesleft, last);
901 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
903 /* not enough room. reset output to original values */
905 *outbytesleft = outbytesleft0;
906 cd->write_marc8_last = last_ch;
911 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
/* Write handler for output made of native wchar_t values.  When at least
 * sizeof(wchar_t) bytes of output space remain, one wchar_t is copied to
 * the output with memcpy and *outbytesleft is decreased by its size;
 * otherwise YAZ_ICONV_E2BIG is stored in cd->my_errno.
 * NOTE(review): the assignment of x to wch is not visible in this
 * listing. */
916 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
917 char **outbuf, size_t *outbytesleft,
920 unsigned char *outp = (unsigned char *) *outbuf;
922 if (*outbytesleft >= sizeof(wchar_t))
925 memcpy(outp, &wch, sizeof(wch));
927 (*outbytesleft) -= sizeof(wch);
931 cd->my_errno = YAZ_ICONV_E2BIG;
934 *outbuf = (char *) outp;
/* Returns non-zero when both the read and the write handler of cd are
 * set, i.e. the conversion is carried out by the handlers in this file. */
939 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
941 return cd->read_handle && cd->write_handle;
/* Create a conversion descriptor converting from fromcode to tocode.
 *
 * The descriptor is allocated with xmalloc and its state reset: error
 * code UNKNOWN, MARC-8 escape mode 'B', empty read-ahead queue, nothing
 * held back for composition, and the MARC-8 writer on the "\033(B" page.
 * The charset names are compared with yaz_matchstr to pick a read and a
 * write handler.  If either handler is still unset, iconv_open is asked
 * for the conversion instead.
 *
 * NOTE(review): the initialisation of read_handle and init_handle, the
 * handling of a leading '@' in fromcode, the conditional compilation
 * around the iconv and wchar_t parts and the failure paths are not
 * visible in this listing. */
944 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
946 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
948 cd->write_handle = 0;
951 cd->my_errno = YAZ_ICONV_UNKNOWN;
952 cd->marc8_esc_mode = 'B';
953 cd->comb_offset = cd->comb_size = 0;
954 cd->compose_char = 0;
956 cd->write_marc8_comb_no = 0;
957 cd->write_marc8_last = 0;
958 cd->write_marc8_page_chr = "\033(B";
960 /* a useful hack: if fromcode has leading @,
961 the library will not use YAZ's own conversions .. */
962 if (fromcode[0] == '@')
/* decoder selection; UTF-8 is the only source that also gets an
 * init handler */
966 if (!yaz_matchstr(fromcode, "UTF8"))
968 cd->read_handle = yaz_read_UTF8;
969 cd->init_handle = yaz_init_UTF8;
971 else if (!yaz_matchstr(fromcode, "ISO88591"))
972 cd->read_handle = yaz_read_ISO8859_1;
973 else if (!yaz_matchstr(fromcode, "UCS4"))
974 cd->read_handle = yaz_read_UCS4;
975 else if (!yaz_matchstr(fromcode, "UCS4LE"))
976 cd->read_handle = yaz_read_UCS4LE;
977 else if (!yaz_matchstr(fromcode, "MARC8"))
978 cd->read_handle = yaz_read_marc8;
979 else if (!yaz_matchstr(fromcode, "MARC8s"))
980 cd->read_handle = yaz_read_marc8s;
982 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
983 cd->read_handle = yaz_read_wchar_t;
/* encoder selection; "MARC8" and "MARC8s" share the same writer */
986 if (!yaz_matchstr(tocode, "UTF8"))
987 cd->write_handle = yaz_write_UTF8;
988 else if (!yaz_matchstr(tocode, "ISO88591"))
989 cd->write_handle = yaz_write_ISO8859_1;
990 else if (!yaz_matchstr (tocode, "UCS4"))
991 cd->write_handle = yaz_write_UCS4;
992 else if (!yaz_matchstr(tocode, "UCS4LE"))
993 cd->write_handle = yaz_write_UCS4LE;
994 else if (!yaz_matchstr(tocode, "MARC8"))
995 cd->write_handle = yaz_write_marc8;
996 else if (!yaz_matchstr(tocode, "MARC8s"))
997 cd->write_handle = yaz_write_marc8;
999 else if (!yaz_matchstr(tocode, "WCHAR_T"))
1000 cd->write_handle = yaz_write_wchar_t;
/* no complete built-in pair: hand the conversion to the system iconv */
1005 if (!cd->read_handle || !cd->write_handle)
1007 cd->iconv_cd = iconv_open (tocode, fromcode);
1008 if (cd->iconv_cd == (iconv_t) (-1))
1015 if (!cd->read_handle || !cd->write_handle)
/* Convert characters from *inbuf to *outbuf, with an interface modelled
 * on iconv (see the file header).
 *
 * When the descriptor uses the system iconv, the call is forwarded and a
 * failure is translated from yaz_errno() into a YAZ_ICONV_* code in
 * cd->my_errno.  A null inbuf or *inbuf sets the error code to UNKNOWN.
 * If an init handler is installed it is run on the input and the bytes it
 * reports are removed from *inbytesleft.  After that, characters are read
 * with cd->read_handle and written with cd->write_handle; the write
 * handler's last argument is 1 when the character being written is the
 * final one of the input.  After a successful write the input position
 * advances by the number of bytes read.  A write that fails with
 * YAZ_ICONV_E2BIG records the byte count in cd->no_read_x, which a later
 * pass reads back.
 *
 * NOTE(review): the loop structure, the conditions around these steps
 * and the return statements are not visible in this listing. */
1025 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1026 char **outbuf, size_t *outbytesleft)
1035 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1036 if (r == (size_t)(-1))
1038 switch (yaz_errno())
1041 cd->my_errno = YAZ_ICONV_E2BIG;
1044 cd->my_errno = YAZ_ICONV_EINVAL;
1047 cd->my_errno = YAZ_ICONV_EILSEQ;
1050 cd->my_errno = YAZ_ICONV_UNKNOWN;
1056 if (inbuf == 0 || *inbuf == 0)
1059 cd->my_errno = YAZ_ICONV_UNKNOWN;
1066 if (cd->init_handle)
1069 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1070 *inbytesleft, &no_read);
1073 if (cd->my_errno == YAZ_ICONV_EINVAL)
1078 *inbytesleft -= no_read;
1090 if (*inbytesleft == 0)
1092 r = *inbuf - inbuf0;
1097 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1108 no_read = cd->no_read_x;
1112 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1113 (*inbytesleft - no_read) == 0 ? 1 : 0);
1116 /* unable to write it. save it because read_handle cannot
1118 if (cd->my_errno == YAZ_ICONV_E2BIG)
1121 cd->no_read_x = no_read;
1127 *inbytesleft -= no_read;
1128 (*inbuf) += no_read;
/* Returns the error code currently stored in the descriptor (one of the
 * YAZ_ICONV_* values assigned throughout this file). */
1133 int yaz_iconv_error (yaz_iconv_t cd)
1135 return cd->my_errno;
/* Release a conversion descriptor; the system iconv handle is closed
 * with iconv_close.
 * NOTE(review): the condition guarding that call and the release of cd
 * itself are not visible in this listing. */
1138 int yaz_iconv_close (yaz_iconv_t cd)
1142 iconv_close (cd->iconv_cd);
1151 * indent-tabs-mode: nil
1153 * vim: shiftwidth=4 tabstop=8 expandtab