X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=src%2Fsiconv.c;h=05ad9353ab9fe0917f906948425a1ef6e0b2e42c;hb=1db6276c63f9693d66273d710b4643572ed3f503;hp=f9ba0448b17947120c8015c4d81316785468e689;hpb=65584e65c2b3b76e16c2ff984975a44813cf5353;p=yaz-moved-to-github.git diff --git a/src/siconv.c b/src/siconv.c index f9ba044..05ad935 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2005, Index Data ApS * See the file LICENSE for details. * - * $Id: siconv.c,v 1.11 2005-02-07 11:23:18 adam Exp $ + * $Id: siconv.c,v 1.12 2005-05-08 07:35:23 adam Exp $ */ /** * \file siconv.c @@ -60,7 +60,8 @@ struct yaz_iconv_struct { unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf, size_t inbytesleft, size_t *no_read); size_t (*write_handle)(yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft); + char **outbuf, size_t *outbytesleft, + int last); int marc8_esc_mode; #if NEW_COMB int comb_offset; @@ -72,10 +73,11 @@ struct yaz_iconv_struct { int marc8_comb_no_read; #endif size_t no_read_x; - unsigned unget_x; + unsigned long unget_x; #if HAVE_ICONV_H iconv_t iconv_cd; #endif + unsigned long compose_char; }; static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp, @@ -462,7 +464,8 @@ static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp, #endif static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft, + int last) { unsigned char *outp = (unsigned char *) *outbuf; if (x <= 0x7f && *outbytesleft >= 1) @@ -519,10 +522,126 @@ static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x, return 0; } + static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft, + int last) { + /* list of two char unicode sequence that, when combined, are + equivalent to single unicode chars that can be represented in + ISO-8859-1/Latin-1. + Regular iconv on Linux at least does not seem to convert these, + but since MARC-8 to UTF-8 generates these composed sequence + we get a better chance of a successful MARC-8 -> ISO-8859-1 + conversion */ + static struct { + unsigned long x1, x2; + unsigned y; + } comb[] = { + { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */ + { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */ + { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */ + { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */ + { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */ + { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */ + /* no need for 0xc6 LATIN CAPITAL LETTER AE */ + { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */ + { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */ + { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */ + { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */ + { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */ + { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */ + { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */ + { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */ + { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */ + { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */ + { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */ + { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */ + { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */ + { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */ + { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */ + /* omitted: 0xd7 MULTIPLICATION SIGN */ + /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */ + { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */ + { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */ + { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */ + { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */ + { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */ + /* omitted: 0xde LATIN CAPITAL LETTER THORN */ + /* omitted: 0xdf LATIN SMALL LETTER SHARP S */ + { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */ + { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */ + { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */ + { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */ + { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */ + { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */ + /* omitted: 0xe6 LATIN SMALL LETTER AE */ + { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */ + { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */ + { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */ + { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */ + { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */ + { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */ + { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */ + { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */ + { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */ + /* omitted: 0xf0 LATIN SMALL LETTER ETH */ + { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */ + { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */ + { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */ + { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */ + { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */ + { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */ + /* omitted: 0xf7 DIVISION SIGN */ + /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */ + { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */ + { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */ + { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */ + { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */ + { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */ + /* omitted: 0xfe LATIN SMALL LETTER THORN */ + { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */ + + { 0, 0, 0} + }; unsigned char *outp = (unsigned char *) *outbuf; + + if (!last && x > 32 && x < 127 && cd->compose_char == 0) + { + cd->compose_char = x; + return 0; + } + else if (cd->compose_char) + { + int i; + for (i = 0; comb[i].x1; i++) + if (cd->compose_char == comb[i].x1 && x == comb[i].x2) + { + x = comb[i].y; + break; + } + if (!comb[i].x1) + { /* not found */ + if (*outbytesleft >= 1) + { + *outp++ = (unsigned char) cd->compose_char; + (*outbytesleft)--; + *outbuf = (char *) outp; + if (!last && x > 32 && x < 127) + { + cd->compose_char = x; + return 0; + } + } + else + { + cd->my_errno = YAZ_ICONV_E2BIG; + return (size_t)(-1); + } + } + /* compose_char and old x combined to one new char: x */ + cd->compose_char = 0; + } if (x > 255 || x < 1) { cd->my_errno = YAZ_ICONV_EILSEQ; @@ -544,7 +663,8 @@ static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft, + int last) { unsigned char *outp = (unsigned char *) *outbuf; if (*outbytesleft >= 4) @@ -565,7 +685,8 @@ static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x, } static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft, + int last) { unsigned char *outp = (unsigned char *) *outbuf; if (*outbytesleft >= 4) @@ -587,7 +708,8 @@ static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x, #if HAVE_WCHAR_H static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft, + int last) { unsigned char *outp = (unsigned char *) *outbuf; @@ -627,6 +749,7 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) #else cd->marc8_comb_x = 0; #endif + cd->compose_char = 0; /* a useful hack: if fromcode has leading @, the library not use YAZ's own conversions .. */ @@ -773,7 +896,8 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, } if (x) { - r = (cd->write_handle)(cd, x, outbuf, outbytesleft); + r = (cd->write_handle)(cd, x, outbuf, outbytesleft, + (*inbytesleft - no_read) == 0 ? 1 : 0); if (r) { /* unable to write it. save it because read_handle cannot