2 * Copyright (C) 1995-2008, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.50 2008-03-12 08:53:28 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
36 #include <yaz/xmalloc.h>
38 #include <yaz/snprintf.h>
/** \brief Common signature of the table-driven converters declared below.
 *
 * Callers in this file hand in the input pointer and the number of bytes
 * left, and read back through no_read how many bytes were consumed. The
 * return value is the converted character. According to the note inside
 * yaz_read_marc8, the generated converters return 0 with no_read=1 when no
 * sequence matches the input.
 *
 * NOTE(review): combining presumably flags a combining character (callers
 * pass their "comb" pointer) -- confirm against the generated code.
 * NOTE(review): callers here pass mask/boffset as 127/0, 127/128 or 255/0;
 * the exact meaning of the two is defined by the generated converters.
 */
41 typedef unsigned long yaz_conv_func_t(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining,
43 unsigned mask, int boffset);
/* Forward converters, one per MARC-8 character set. The hex suffix is the
   ASCII code of the mode character that yaz_read_marc8_comb switches on:
   42='B', 45='E', 67='g', 62='b', 70='p', 32='2', 4E='N', 51='Q', 33='3',
   34='4', 53='S', 31='1'. */
46 yaz_conv_func_t yaz_marc8_42_conv;
47 yaz_conv_func_t yaz_marc8_45_conv;
48 yaz_conv_func_t yaz_marc8_67_conv;
49 yaz_conv_func_t yaz_marc8_62_conv;
50 yaz_conv_func_t yaz_marc8_70_conv;
51 yaz_conv_func_t yaz_marc8_32_conv;
52 yaz_conv_func_t yaz_marc8_4E_conv;
53 yaz_conv_func_t yaz_marc8_51_conv;
54 yaz_conv_func_t yaz_marc8_33_conv;
55 yaz_conv_func_t yaz_marc8_34_conv;
56 yaz_conv_func_t yaz_marc8_53_conv;
57 yaz_conv_func_t yaz_marc8_31_conv;
/* Reverse converters with the same suffix scheme. lookup_marc8 feeds them
   the UTF-8 encoding of a character and tries them one after another. */
59 yaz_conv_func_t yaz_marc8r_42_conv;
60 yaz_conv_func_t yaz_marc8r_45_conv;
61 yaz_conv_func_t yaz_marc8r_67_conv;
62 yaz_conv_func_t yaz_marc8r_62_conv;
63 yaz_conv_func_t yaz_marc8r_70_conv;
64 yaz_conv_func_t yaz_marc8r_32_conv;
65 yaz_conv_func_t yaz_marc8r_4E_conv;
66 yaz_conv_func_t yaz_marc8r_51_conv;
67 yaz_conv_func_t yaz_marc8r_33_conv;
68 yaz_conv_func_t yaz_marc8r_34_conv;
69 yaz_conv_func_t yaz_marc8r_53_conv;
70 yaz_conv_func_t yaz_marc8r_31_conv;
/** \brief State kept for one conversion descriptor (yaz_iconv_t).
 *
 * NOTE(review): other members used throughout this file (for example
 * my_errno, comb_offset, comb_size, g0_mode, g1_mode, no_read_x,
 * write_marc8_ncr and iconv_cd) are declared on lines that are not part of
 * this listing.
 */
72 struct yaz_iconv_struct {
/* Optional hook that yaz_iconv calls when it is set and input is supplied;
   the bytes it reports through no_read are deducted from the input. */
75 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
76 size_t inbytesleft, size_t *no_read);
/* Decoder: turns the bytes at inbuf into one character code and reports the
   number of bytes used through no_read. */
77 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
78 size_t inbytesleft, size_t *no_read);
/* Encoder: writes character x to *outbuf within *outbytesleft bytes. */
79 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
80 char **outbuf, size_t *outbytesleft);
/* Optional: emits whatever the encoder still holds back. yaz_iconv calls it
   on the flush path (inbuf NULL). */
81 size_t (*flush_handle)(yaz_iconv_t cd,
82 char **outbuf, size_t *outbytesleft);
/* Batch of characters collected by yaz_read_marc8 (at most 8) ... */
88 unsigned long comb_x[8];
/* ... and the number of input bytes each of them consumed. */
89 size_t comb_no_read[8];
/* Character that yaz_iconv passes to write_handle on the flush path;
   presumably one that could not be written earlier -- TODO confirm. */
91 unsigned long unget_x;
/* Character held back by yaz_write_ISO8859_1 until it is known whether the
   next character combines with it; 0 means nothing is pending. */
95 unsigned long compose_char;
/* Byte that flush_combos appends after the pending character; set to 0xEC
   or 0xFB by yaz_write_marc8_2, 0 when unused. */
97 unsigned write_marc8_second_half_char;
/* MARC-8 code not yet written out (0 = none); emitted by flush_combos. */
98 unsigned long write_marc8_last;
/* Page escape sequence that belongs to write_marc8_last. */
100 const char *write_marc8_lpage;
/* Escape sequences last selected for G0 and G1 by
   yaz_write_marc8_page_chr. */
101 const char *write_marc8_g0;
102 const char *write_marc8_g1;
/* latin1_comb: table of Latin-1 compositions. Each entry is { x1, x2, y }:
   x1 is the base letter, x2 the Unicode combining mark and y the precomposed
   ISO-8859-1 code (range 0xc0..0xff). Code that scans the table stops at the
   first entry whose x1 is 0.
   NOTE(review): the struct opener, the y member and the terminating entry
   sit on lines that are not part of this listing. */
107 unsigned long x1, x2;
110 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
111 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
112 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
113 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
114 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
115 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
116 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
117 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
118 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
119 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
120 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
121 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
122 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
123 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
124 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
125 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
126 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
127 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
128 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
129 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
130 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
131 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
132 /* omitted: 0xd7 MULTIPLICATION SIGN */
133 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
134 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
135 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
136 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
137 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
138 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
139 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
140 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
141 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
142 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
143 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
144 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
145 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
146 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
147 /* omitted: 0xe6 LATIN SMALL LETTER AE */
148 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
149 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
150 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
151 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
152 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
153 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
154 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
155 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
156 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
157 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
158 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
159 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
160 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
161 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
162 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
163 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
164 /* omitted: 0xf7 DIVISION SIGN */
165 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
166 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
167 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
168 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
169 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
170 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
171 /* omitted: 0xfe LATIN SMALL LETTER THORN */
172 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* Forward declaration: flush_combos calls this function ahead of its
   definition further down in the file. */
179 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
180 char **outbuf, size_t *outbytesleft,
181 const char *page_chr);
/** \brief read_handle for ISO-8859-1 input.
 *
 * The character code is the value of the first input byte.
 * NOTE(review): how no_read is filled in is not visible here; presumably it
 * is set to 1 -- confirm.
 */
183 static unsigned long yaz_read_ISO8859_1(yaz_iconv_t cd, unsigned char *inp,
184 size_t inbytesleft, size_t *no_read)
186 unsigned long x = inp[0];
/** \brief read_handle for WCHAR_T input.
 *
 * With fewer than sizeof(wchar_t) bytes left the input is incomplete and
 * YAZ_ICONV_EINVAL is recorded in cd. Otherwise the bytes are copied into a
 * wchar_t object and sizeof(wchar_t) is reported through no_read.
 */
194 static unsigned long yaz_read_wchar_t(yaz_iconv_t cd, unsigned char *inp,
195 size_t inbytesleft, size_t *no_read)
199 if (inbytesleft < sizeof(wchar_t))
201 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* memcpy rather than a pointer cast, so inp need not be aligned for
   wchar_t. */
207 memcpy(&wch, inp, sizeof(wch));
209 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8 calls this function ahead of its
   definition. NOTE(review): the final parameter line is not part of this
   listing. */
216 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
217 size_t inbytesleft, size_t *no_read,
/** \brief read_handle for MARC-8 input.
 *
 * Characters decoded by yaz_read_marc8_comb are collected in cd->comb_x
 * together with their byte counts in cd->comb_no_read, at most 8 per batch.
 * While comb_offset is below comb_size, a queued entry is used instead of
 * decoding new input. Running out of input while entries have already been
 * collected is recorded as YAZ_ICONV_EINVAL.
 *
 * NOTE(review): when the collecting loop stops, and the order in which the
 * batch is handed out, depend on lines that are not part of this listing.
 */
220 static unsigned long yaz_read_marc8(yaz_iconv_t cd, unsigned char *inp,
221 size_t inbytesleft, size_t *no_read)
/* Entries left over from an earlier call are served first. */
224 if (cd->comb_offset < cd->comb_size)
226 *no_read = cd->comb_no_read[cd->comb_offset];
227 x = cd->comb_x[cd->comb_offset];
229 /* special case for double-diacritic combining characters,
230 INVERTED BREVE and DOUBLE TILDE.
231 We'll increment the no_read counter by 1, since we want to skip over
232 the processing of the closing ligature character
234 /* this code is no longer necessary.. our handlers code in
235 yaz_marc8_?_conv (generated by charconv.tcl) now returns
236 0 and no_read=1 when a sequence does not match the input.
237 The SECOND HALFs in codetables.xml produces a non-existant
238 entry in the conversion trie.. Hence when met, the input byte is
239 skipped as it should (in yaz_iconv)
242 if (x == 0x0361 || x == 0x0360)
250 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
254 if (inbytesleft == 0 && cd->comb_size)
256 cd->my_errno = YAZ_ICONV_EINVAL;
261 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
264 cd->comb_x[cd->comb_size] = x;
265 cd->comb_no_read[cd->comb_size] = *no_read;
267 inbytesleft = inbytesleft - *no_read;
/** \brief read_handle for "MARC8s": MARC-8 input with Latin-1 composition.
 *
 * Delegates to yaz_read_marc8. When that yields a non-zero character and
 * exactly one combining character was collected (comb_size == 1), the pair
 * is looked up in latin1_comb; on a match the precomposed code is used
 * instead and the bytes of the combining character are added to no_read.
 */
272 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
273 size_t inbytesleft, size_t *no_read)
275 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
276 if (x && cd->comb_size == 1)
278 /* For MARC8s we try to get a Latin-1 page code out of it */
280 for (i = 0; latin1_comb[i].x1; i++)
/* x is the base letter, comb_x[0] the combining mark. */
281 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
/* The combining mark is consumed together with the base letter. */
283 *no_read += cd->comb_no_read[0];
285 x = latin1_comb[i].y;
/** \brief Decodes one MARC-8 character, processing leading escape sequences.
 *
 * As long as the input starts with ESC (27), an escape sequence is parsed:
 * an optional '$', then '(' or ',' to address G0, or ')' or '-' to address
 * G1, then an optional '!', then the final character, which is stored as
 * the new mode of the addressed set. The bytes consumed by the sequence are
 * added to no_read.
 *
 * The remaining input byte selects the active mode: g0_mode for bytes below
 * 128, g1_mode otherwise. The switch on that mode calls the matching
 * generated converter, and its consumed byte count is added to no_read. A
 * mode without a converter is recorded as YAZ_ICONV_EILSEQ.
 *
 * NOTE(review): the repeated "inbytesleft == 0" checks presumably branch to
 * the YAZ_ICONV_EINVAL assignment at the end (truncated escape sequence);
 * the jumps themselves are not part of this listing.
 */
292 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
293 size_t inbytesleft, size_t *no_read,
297 while (inbytesleft > 0 && *inp == 27)
/* G0 is addressed unless the sequence says otherwise. */
299 int *modep = &cd->g0_mode;
/* Remembered to compute the length of the escape sequence afterwards. */
300 size_t inbytesleft0 = inbytesleft;
304 if (inbytesleft == 0)
306 if (*inp == '$') /* set with multiple bytes */
311 if (inbytesleft == 0)
313 if (*inp == '(' || *inp == ',') /* G0 */
318 else if (*inp == ')' || *inp == '-') /* G1 */
322 modep = &cd->g1_mode;
324 if (inbytesleft == 0)
326 if (*inp == '!') /* ANSEL is a special case */
331 if (inbytesleft == 0)
333 *modep = *inp++; /* Final character */
336 (*no_read) += inbytesleft0 - inbytesleft;
338 if (inbytesleft == 0)
340 else if (*inp == ' ')
348 size_t no_read_sub = 0;
/* The high bit of the byte selects between the G0 and G1 set. */
349 int mode = *inp < 128 ? cd->g0_mode : cd->g1_mode;
354 case 'B': /* Basic ASCII */
355 case 's': /* ASCII */
356 x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
358 case 'E': /* ANSEL */
359 x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
361 case 'g': /* Greek */
362 x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
364 case 'b': /* Subscripts */
365 x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
367 case 'p': /* Superscripts */
368 x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
370 case '2': /* Basic Hebrew */
371 x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
373 case 'N': /* Basic Cyrillic */
374 x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
376 case 'Q': /* Extended Cyrillic */
377 x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
379 case '3': /* Basic Arabic */
380 x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
382 case '4': /* Extended Arabic */
383 x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
385 case 'S': /* Greek */
386 x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
388 case '1': /* Chinese, Japanese, Korean (EACC) */
389 x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
393 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Bytes used by the character itself come on top of any escape bytes. */
396 *no_read += no_read_sub;
401 cd->my_errno = YAZ_ICONV_EINVAL;
/** \brief write_handle for ISO-8859-1 output, composing base + mark pairs.
 *
 * A character held in cd->compose_char from the previous call is matched,
 * together with x, against latin1_comb (x1 == compose_char, x2 == x). On a
 * match, x becomes the precomposed code. Without room in the output,
 * YAZ_ICONV_E2BIG is recorded and compose_char stays pending. When there is
 * no match, the held character is written as it is. compose_char is then
 * cleared.
 *
 * Afterwards x itself is handled: a character in the range 33..126 is held
 * back in compose_char when nothing is pending; values above 255 or below 1
 * are recorded as YAZ_ICONV_EILSEQ; a full output buffer is recorded as
 * YAZ_ICONV_E2BIG; anything else is written as a single byte.
 */
405 static size_t yaz_write_ISO8859_1(yaz_iconv_t cd, unsigned long x,
406 char **outbuf, size_t *outbytesleft)
408 /* list of two char unicode sequence that, when combined, are
409 equivalent to single unicode chars that can be represented in
411 Regular iconv on Linux at least does not seem to convert these,
412 but since MARC-8 to UTF-8 generates these composed sequence
413 we get a better chance of a successful MARC-8 -> ISO-8859-1
415 unsigned char *outp = (unsigned char *) *outbuf;
417 if (cd->compose_char)
420 for (i = 0; latin1_comb[i].x1; i++)
421 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
423 x = latin1_comb[i].y;
426 if (*outbytesleft < 1)
427 { /* no room. Retain compose_char and bail out */
428 cd->my_errno = YAZ_ICONV_E2BIG;
/* The scan ran into the terminating entry, so there was no match. */
431 if (!latin1_comb[i].x1)
432 { /* not found. Just write compose_char */
433 *outp++ = (unsigned char) cd->compose_char;
435 *outbuf = (char *) outp;
437 /* compose_char used so reset it. x now holds current char */
438 cd->compose_char = 0;
/* A possible base letter is held back until the next character is known. */
441 if (x > 32 && x < 127 && cd->compose_char == 0)
443 cd->compose_char = x;
446 else if (x > 255 || x < 1)
448 cd->my_errno = YAZ_ICONV_EILSEQ;
451 else if (*outbytesleft < 1)
453 cd->my_errno = YAZ_ICONV_E2BIG;
456 *outp++ = (unsigned char) x;
458 *outbuf = (char *) outp;
/** \brief flush_handle for ISO-8859-1 output.
 *
 * Writes the character still held in cd->compose_char, if any, and clears
 * it. Without room for one byte, YAZ_ICONV_E2BIG is recorded.
 */
462 static size_t yaz_flush_ISO8859_1(yaz_iconv_t cd,
463 char **outbuf, size_t *outbytesleft)
465 if (cd->compose_char)
467 unsigned char *outp = (unsigned char *) *outbuf;
468 if (*outbytesleft < 1)
470 cd->my_errno = YAZ_ICONV_E2BIG;
473 *outp++ = (unsigned char) cd->compose_char;
475 *outbuf = (char *) outp;
476 cd->compose_char = 0;
/** \brief Finds the MARC-8 code and page for character x.
 *
 * x is first encoded as UTF-8 into a local buffer with yaz_write_UTF8; a
 * failure there is recorded as YAZ_ICONV_EILSEQ. The UTF-8 bytes are then
 * offered to the reverse converters one after another. The escape sequence
 * of the page that provided the code is stored through page_chr, and the
 * combining flag through comb. When no converter produces a code,
 * YAZ_ICONV_EILSEQ is recorded.
 *
 * NOTE(review): the tests on each converter result and the returns are not
 * part of this listing; presumably the first non-zero result wins.
 * NOTE(review): the page assignments following the 62 and 70 converters are
 * not visible, and yaz_marc8r_67_conv is not consulted in the lines shown.
 */
481 static unsigned long lookup_marc8(yaz_iconv_t cd,
482 unsigned long x, int *comb,
483 const char **page_chr)
486 char *utf8_outbuf = utf8_buf;
/* One byte is kept in reserve, presumably for a terminating NUL: strlen is
   applied to the buffer below. */
487 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
489 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft);
490 if (r == (size_t)(-1))
492 cd->my_errno = YAZ_ICONV_EILSEQ;
498 size_t inbytesleft, no_read_sub = 0;
502 inp = (unsigned char *) utf8_buf;
503 inbytesleft = strlen(utf8_buf);
505 x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
508 *page_chr = ESC "(B";
/* The 45 (ANSEL) lookup also reports the "(B" page here. */
511 x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
514 *page_chr = ESC "(B";
517 x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
523 x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
529 x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
532 *page_chr = ESC "(2";
535 x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
538 *page_chr = ESC "(N";
541 x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
544 *page_chr = ESC "(Q";
547 x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
550 *page_chr = ESC "(3";
553 x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
556 *page_chr = ESC "(4";
559 x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
562 *page_chr = ESC "(S";
565 x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
568 *page_chr = ESC "$1";
/* None of the converters knew the character. */
571 cd->my_errno = YAZ_ICONV_EILSEQ;
/** \brief Writes the pending MARC-8 character kept in cd->write_marc8_last.
 *
 * The page of the pending character (write_marc8_lpage) is selected first
 * through yaz_write_marc8_page_chr. Unless more than 9 output bytes are
 * free, YAZ_ICONV_E2BIG is recorded and (size_t)(-1) returned. With
 * write_marc8_ncr set, the character is written as a numeric character
 * reference ("&#x%04x;"); otherwise its bytes are written from bits 16..23
 * down to bits 0..7. A pending second-half byte is appended, and all pending
 * state is cleared at the end.
 *
 * NOTE(review): the early exit for "nothing pending" and the conditions
 * guarding each output byte are not part of this listing.
 */
576 static size_t flush_combos(yaz_iconv_t cd,
577 char **outbuf, size_t *outbytesleft)
579 unsigned long y = cd->write_marc8_last;
584 assert(cd->write_marc8_lpage);
/* Checked again at run time; the assert above may be compiled out. */
585 if (cd->write_marc8_lpage)
587 size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
588 cd->write_marc8_lpage);
/* The reference form needs 8 characters and yaz_snprintf is given a size
   of 9. */
593 if (9 >= *outbytesleft)
595 cd->my_errno = YAZ_ICONV_E2BIG;
596 return (size_t) (-1);
598 if (cd->write_marc8_ncr)
/* NOTE(review): y is unsigned long but the format is %04x; whether
   yaz_snprintf tolerates that is not visible here -- confirm. */
600 yaz_snprintf(*outbuf, 9, "&#x%04x;", y);
601 (*outbytesleft) -= 8;
/* Most significant byte first. */
609 byte = (unsigned char )((y>>16) & 0xff);
611 (*outbuf)[out_no++] = byte;
612 byte = (unsigned char)((y>>8) & 0xff);
614 (*outbuf)[out_no++] = byte;
615 byte = (unsigned char )(y & 0xff);
617 (*outbuf)[out_no++] = byte;
619 (*outbytesleft) -= out_no;
622 if (cd->write_marc8_second_half_char)
624 *(*outbuf)++ = cd->write_marc8_second_half_char;
/* Nothing is pending any longer. */
628 cd->write_marc8_last = 0;
629 cd->write_marc8_ncr = 0;
630 cd->write_marc8_lpage = 0;
631 cd->write_marc8_second_half_char = 0;
/** \brief Switches the output to the page named by escape sequence page_chr.
 *
 * The currently selected sequence is tracked per set: cd->write_marc8_g1
 * when the second character of page_chr is ')', cd->write_marc8_g0
 * otherwise. Output is only produced when no sequence is recorded for that
 * set or the recorded one differs from page_chr. In that case at least 8
 * free output bytes are required; otherwise YAZ_ICONV_E2BIG is recorded and
 * (size_t)(-1) returned. The new sequence is recorded and copied to the
 * output.
 *
 * NOTE(review): what is written when leaving one of the ESC "p", ESC "g" or
 * ESC "b" pages depends on lines that are not part of this listing.
 * NOTE(review): page_chr is tested for NULL before page_chr[1] is read but
 * passed to strcmp unconditionally; callers in this file pass non-NULL
 * values on the visible paths -- confirm for the remaining ones.
 */
635 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
636 char **outbuf, size_t *outbytesleft,
637 const char *page_chr)
639 const char **old_page_chr = &cd->write_marc8_g0;
641 /* are we going to a G1-set (such as ESC ")!E") */
642 if (page_chr && page_chr[1] == ')')
643 old_page_chr = &cd->write_marc8_g1;
/* Nothing to do when the requested page is already active. */
645 if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
648 const char *page_out = page_chr;
650 if (*outbytesleft < 8)
652 cd->my_errno = YAZ_ICONV_E2BIG;
654 return (size_t) (-1);
659 if (!strcmp(*old_page_chr, ESC "p")
660 || !strcmp(*old_page_chr, ESC "g")
661 || !strcmp(*old_page_chr, ESC "b"))
664 /* Technique 1 leave */
665 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
667 /* Must leave script + enter new page */
668 plen = strlen(page_out);
669 memcpy(*outbuf, page_out, plen);
671 (*outbytesleft) -= plen;
/* Remember the page that is active from now on. */
676 *old_page_chr = page_chr;
677 plen = strlen(page_out);
678 memcpy(*outbuf, page_out, plen);
680 (*outbytesleft) -= plen;
/** \brief Encodes one character to MARC-8, deferring it in the cd state.
 *
 * The MARC-8 code, combining flag and page of x come from lookup_marc8.
 * The visible lines then show two paths: one writes the page sequence
 * through yaz_write_marc8_page_chr, records a second-half byte (0xEC, or
 * 0xFB for x == 0x0360) and requires more than one free output byte; the
 * other flushes the previously pending character with flush_combos and
 * stores the new code, its page and the numeric-reference flag as pending.
 *
 * A failed lookup returns (size_t)(-1) when loss_mode is 0 or the recorded
 * error is not YAZ_ICONV_EILSEQ.
 *
 * NOTE(review): the loss_mode parameter line, the conditions choosing
 * between the two paths and the handling of unmapped characters when
 * loss_mode is non-zero are not part of this listing.
 */
686 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
687 char **outbuf, size_t *outbytesleft,
692 const char *page_chr = 0;
693 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
697 if (loss_mode == 0 || cd->my_errno != YAZ_ICONV_EILSEQ)
698 return (size_t) (-1);
713 size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
719 cd->write_marc8_second_half_char = 0xEC;
720 else if (x == 0x0360)
721 cd->write_marc8_second_half_char = 0xFB;
723 if (*outbytesleft <= 1)
725 cd->my_errno = YAZ_ICONV_E2BIG;
726 return (size_t) (-1);
/* The previously pending character goes out before the new one is kept. */
733 size_t r = flush_combos(cd, outbuf, outbytesleft);
737 cd->write_marc8_last = y;
738 cd->write_marc8_lpage = page_chr;
739 cd->write_marc8_ncr = enable_ncr;
/** \brief flush_handle for the MARC-8 encoders.
 *
 * Writes the pending character with flush_combos, forgets the recorded G1
 * selection and finally switches the output back to the ESC "(B" page,
 * returning the result of that switch.
 * NOTE(review): the handling of a flush_combos failure is not part of this
 * listing.
 */
744 static size_t yaz_flush_marc8(yaz_iconv_t cd,
745 char **outbuf, size_t *outbytesleft)
747 size_t r = flush_combos(cd, outbuf, outbytesleft);
750 cd->write_marc8_g1 = 0;
751 return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, ESC "(B");
/* Forward declaration: the three write_handle wrappers that follow call this
   function ahead of its definition. NOTE(review): the final parameter line
   is not part of this listing. */
754 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x,
755 char **outbuf, size_t *outbytesleft,
/** \brief write_handle for "MARC8" and "MARC8s": yaz_write_marc8_generic
 *  with loss mode 0. */
758 static size_t yaz_write_marc8_normal(yaz_iconv_t cd, unsigned long x,
759 char **outbuf, size_t *outbytesleft)
761 return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 0);
/** \brief write_handle for "MARC8lossy": yaz_write_marc8_generic with loss
 *  mode 1. */
764 static size_t yaz_write_marc8_lossy(yaz_iconv_t cd, unsigned long x,
765 char **outbuf, size_t *outbytesleft)
767 return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 1);
/** \brief write_handle for "MARC8lossless": yaz_write_marc8_generic with
 *  loss mode 2. */
770 static size_t yaz_write_marc8_lossless(yaz_iconv_t cd, unsigned long x,
771 char **outbuf, size_t *outbytesleft)
773 return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 2);
/** \brief Shared MARC-8 encoder behind the normal/lossy/lossless wrappers.
 *
 * A character in the range 0xc0..0xff that appears as a precomposed code in
 * latin1_comb is written as two characters instead: the base letter x1
 * followed by the combining mark x2, each through yaz_write_marc8_2. When
 * the second write fails with YAZ_ICONV_E2BIG, the output position and the
 * pending-character state saved beforehand are restored. Every other
 * character is passed to yaz_write_marc8_2 directly.
 *
 * NOTE(review): the loss_mode parameter line, the check of the first write
 * and the restoring of *outbuf are not part of this listing.
 */
776 static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x,
777 char **outbuf, size_t *outbytesleft,
780 if (x >= 0xc0 && x <= 0xff) /* optimization. min and max .y values */
783 for (i = 0; latin1_comb[i].x1; i++)
785 if (x == latin1_comb[i].y)
788 /* save the output pointers .. */
789 char *outbuf0 = *outbuf;
790 size_t outbytesleft0 = *outbytesleft;
791 int last_ch = cd->write_marc8_last;
792 int ncr = cd->write_marc8_ncr;
793 const char *lpage = cd->write_marc8_lpage;
/* Base letter first ... */
795 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
796 outbuf, outbytesleft, loss_mode);
/* ... then the combining mark. */
799 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
800 outbuf, outbytesleft, loss_mode);
801 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
803 /* not enough room. reset output to original values */
805 *outbytesleft = outbytesleft0;
806 cd->write_marc8_last = last_ch;
807 cd->write_marc8_ncr = ncr;
808 cd->write_marc8_lpage = lpage;
814 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, loss_mode);
/** \brief write_handle for WCHAR_T output.
 *
 * With at least sizeof(wchar_t) bytes free, the character is copied to the
 * output as a wchar_t object and the free byte count reduced accordingly;
 * otherwise YAZ_ICONV_E2BIG is recorded.
 */
819 static size_t yaz_write_wchar_t(yaz_iconv_t cd, unsigned long x,
820 char **outbuf, size_t *outbytesleft)
822 unsigned char *outp = (unsigned char *) *outbuf;
824 if (*outbytesleft >= sizeof(wchar_t))
/* memcpy rather than a pointer cast, so the output need not be aligned for
   wchar_t. */
827 memcpy(outp, &wch, sizeof(wch));
829 (*outbytesleft) -= sizeof(wch);
833 cd->my_errno = YAZ_ICONV_E2BIG;
836 *outbuf = (char *) outp;
/** \brief Tells whether cd converts with the handlers of this file.
 *  \returns non-zero when both a read_handle and a write_handle are set. */
841 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
843 return cd->read_handle && cd->write_handle;
/** \brief Creates a conversion descriptor from fromcode to tocode.
 *
 * The descriptor is allocated with xmalloc. Both encoding names are compared
 * with yaz_matchstr against the names known to this file, and the matching
 * read, write, init and flush handlers are installed. When either side has
 * no built-in handler, iconv_open is tried with the same names.
 *
 * NOTE(review): the names are spelled without punctuation here ("UTF8",
 * "ISO88591"); presumably yaz_matchstr ignores case and dashes -- confirm.
 * NOTE(review): the return statements, the failure cleanup and any
 * preprocessor guards around the wchar_t and iconv parts are not part of
 * this listing.
 */
846 yaz_iconv_t yaz_iconv_open(const char *tocode, const char *fromcode)
848 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
850 cd->write_handle = 0;
853 cd->flush_handle = 0;
854 cd->my_errno = YAZ_ICONV_UNKNOWN;
856 /* a useful hack: if fromcode has leading @,
857 the library will not use YAZ's own conversions .. */
858 if (fromcode[0] == '@')
/* Built-in decoders, chosen by the name of the source encoding. */
862 if (!yaz_matchstr(fromcode, "UTF8"))
864 cd->read_handle = yaz_read_UTF8;
865 cd->init_handle = yaz_init_UTF8;
867 else if (!yaz_matchstr(fromcode, "ISO88591"))
868 cd->read_handle = yaz_read_ISO8859_1;
869 else if (!yaz_matchstr(fromcode, "UCS4"))
870 cd->read_handle = yaz_read_UCS4;
871 else if (!yaz_matchstr(fromcode, "UCS4LE"))
872 cd->read_handle = yaz_read_UCS4LE;
873 else if (!yaz_matchstr(fromcode, "MARC8"))
874 cd->read_handle = yaz_read_marc8;
875 else if (!yaz_matchstr(fromcode, "MARC8s"))
876 cd->read_handle = yaz_read_marc8s;
877 else if (!yaz_matchstr(fromcode, "advancegreek"))
878 cd->read_handle = yaz_read_advancegreek;
879 else if (!yaz_matchstr(fromcode, "iso54281984"))
880 cd->read_handle = yaz_read_iso5428_1984;
881 else if (!yaz_matchstr(fromcode, "iso5428:1984"))
882 cd->read_handle = yaz_read_iso5428_1984;
884 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
885 cd->read_handle = yaz_read_wchar_t;
/* Built-in encoders, chosen by the name of the target encoding. Encoders
   that hold back state also get a flush handler. */
888 if (!yaz_matchstr(tocode, "UTF8"))
889 cd->write_handle = yaz_write_UTF8;
890 else if (!yaz_matchstr(tocode, "ISO88591"))
892 cd->write_handle = yaz_write_ISO8859_1;
893 cd->flush_handle = yaz_flush_ISO8859_1;
895 else if (!yaz_matchstr(tocode, "UCS4"))
896 cd->write_handle = yaz_write_UCS4;
897 else if (!yaz_matchstr(tocode, "UCS4LE"))
898 cd->write_handle = yaz_write_UCS4LE;
899 else if (!yaz_matchstr(tocode, "MARC8"))
901 cd->write_handle = yaz_write_marc8_normal;
902 cd->flush_handle = yaz_flush_marc8;
/* As a target, "MARC8s" uses the same handlers as "MARC8". */
904 else if (!yaz_matchstr(tocode, "MARC8s"))
906 cd->write_handle = yaz_write_marc8_normal;
907 cd->flush_handle = yaz_flush_marc8;
909 else if (!yaz_matchstr(tocode, "MARC8lossy"))
911 cd->write_handle = yaz_write_marc8_lossy;
912 cd->flush_handle = yaz_flush_marc8;
914 else if (!yaz_matchstr(tocode, "MARC8lossless"))
916 cd->write_handle = yaz_write_marc8_lossless;
917 cd->flush_handle = yaz_flush_marc8;
919 else if (!yaz_matchstr(tocode, "advancegreek"))
921 cd->write_handle = yaz_write_advancegreek;
923 else if (!yaz_matchstr(tocode, "iso54281984"))
925 cd->write_handle = yaz_write_iso5428_1984;
927 else if (!yaz_matchstr(tocode, "iso5428:1984"))
929 cd->write_handle = yaz_write_iso5428_1984;
932 else if (!yaz_matchstr(tocode, "WCHAR_T"))
933 cd->write_handle = yaz_write_wchar_t;
/* Without a built-in handler on both sides, fall back to the system iconv
   for the whole conversion. */
938 if (!cd->read_handle || !cd->write_handle)
940 cd->iconv_cd = iconv_open(tocode, fromcode);
941 if (cd->iconv_cd == (iconv_t) (-1))
/* NOTE(review): presumably the variant of the check used when the system
   iconv is not available -- confirm against the surrounding guards. */
948 if (!cd->read_handle || !cd->write_handle)
/** \brief Converts input to output, in the manner of iconv(3).
 *
 * The visible lines show these stages:
 * - Delegation to the system iconv, with a failure translated into one of
 *   the YAZ_ICONV_E2BIG / EINVAL / EILSEQ / UNKNOWN codes.
 * - Reset of the error code and of the combining, composing and MARC-8
 *   writer state; G0 starts out as ESC "(B" and G1 as unset.
 * - A call of the optional init_handle when input is supplied; the bytes it
 *   consumed are deducted from the input.
 * - With inbuf NULL (or pointing to NULL) and an output buffer given: the
 *   flush path, which passes unget_x to write_handle and calls the optional
 *   flush_handle.
 * - Otherwise the conversion loop: read one character with read_handle,
 *   write it with write_handle, then advance the input by the bytes read.
 *   When the write fails with YAZ_ICONV_E2BIG, the byte count of the
 *   character is saved in cd->no_read_x.
 *
 * NOTE(review): the conditions selecting between these stages, the loop
 * construct and all return statements are not part of this listing.
 */
958 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
959 char **outbuf, size_t *outbytesleft)
968 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
969 if (r == (size_t)(-1))
974 cd->my_errno = YAZ_ICONV_E2BIG;
977 cd->my_errno = YAZ_ICONV_EINVAL;
980 cd->my_errno = YAZ_ICONV_EILSEQ;
983 cd->my_errno = YAZ_ICONV_UNKNOWN;
995 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* Start from a clean reader and writer state. */
999 cd->comb_offset = cd->comb_size = 0;
1000 cd->compose_char = 0;
1002 cd->write_marc8_second_half_char = 0;
1003 cd->write_marc8_last = 0;
1004 cd->write_marc8_ncr = 0;
1005 cd->write_marc8_lpage = 0;
1006 cd->write_marc8_g0 = ESC "(B";
1007 cd->write_marc8_g1 = 0;
1015 if (cd->init_handle && inbuf && *inbuf)
1018 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1019 *inbytesleft, &no_read);
1022 if (cd->my_errno == YAZ_ICONV_EINVAL)
1027 *inbytesleft -= no_read;
/* No input: the caller asks for pending output to be flushed. */
1033 if (!inbuf || !*inbuf)
1035 if (outbuf && *outbuf)
1038 r = (*cd->write_handle)(cd, cd->unget_x, outbuf, outbytesleft);
1039 if (cd->flush_handle)
1040 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1055 no_read = cd->no_read_x;
/* All input used: the result is the number of input bytes consumed. */
1059 if (*inbytesleft == 0)
1061 r = *inbuf - inbuf0;
1064 x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1074 r = (*cd->write_handle)(cd, x, outbuf, outbytesleft);
1077 /* unable to write it. save it because read_handle cannot
1079 if (cd->my_errno == YAZ_ICONV_E2BIG)
1082 cd->no_read_x = no_read;
1088 *inbytesleft -= no_read;
1089 (*inbuf) += no_read;
/** \brief Returns the error code last recorded in cd (cd->my_errno), one of
 *  the YAZ_ICONV_ values. */
1094 int yaz_iconv_error(yaz_iconv_t cd)
1096 return cd->my_errno;
/** \brief Releases a conversion descriptor.
 *
 * Closes the system iconv descriptor held in cd->iconv_cd.
 * NOTE(review): the condition guarding iconv_close, the release of cd
 * itself and the return value are not part of this listing.
 */
1099 int yaz_iconv_close(yaz_iconv_t cd)
1103 iconv_close(cd->iconv_cd);
/** \brief Setter counterpart of yaz_iconv_error.
 *  NOTE(review): the body is not part of this listing; presumably it stores
 *  no in cd->my_errno -- confirm. */
1109 void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
1117 * indent-tabs-mode: nil
1119 * vim: shiftwidth=4 tabstop=8 expandtab