From de7e9019dfe1a48e9bae5563152bed76075b850b Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Sun, 8 May 2005 07:35:23 +0000 Subject: [PATCH] Fixed bug #320: Improve UTF-8 -> Latin-1 conversion. The yaz_iconv utility now converts several composed UNICODE sequences to their single code Latin-1 equivalents. This improves the MARC-8 to ISO-8859-1 conversion in that more sequences are correctly converted. --- src/siconv.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++---- test/tsticonv.c | 20 +++++--- 2 files changed, 146 insertions(+), 16 deletions(-) diff --git a/src/siconv.c b/src/siconv.c index f9ba044..05ad935 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2005, Index Data ApS * See the file LICENSE for details. * - * $Id: siconv.c,v 1.11 2005-02-07 11:23:18 adam Exp $ + * $Id: siconv.c,v 1.12 2005-05-08 07:35:23 adam Exp $ */ /** * \file siconv.c @@ -60,7 +60,8 @@ struct yaz_iconv_struct { unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf, size_t inbytesleft, size_t *no_read); size_t (*write_handle)(yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft); + char **outbuf, size_t *outbytesleft, + int last); int marc8_esc_mode; #if NEW_COMB int comb_offset; @@ -72,10 +73,11 @@ struct yaz_iconv_struct { int marc8_comb_no_read; #endif size_t no_read_x; - unsigned unget_x; + unsigned long unget_x; #if HAVE_ICONV_H iconv_t iconv_cd; #endif + unsigned long compose_char; }; static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp, @@ -462,7 +464,8 @@ static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp, #endif static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft, + int last) { unsigned char *outp = (unsigned char *) *outbuf; if (x <= 0x7f && *outbytesleft >= 1) @@ -519,10 +522,126 @@ static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x, return 0; } + static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft, + int last) { + /* list of two char unicode sequence that, when combined, are + equivalent to single unicode chars that can be represented in + ISO-8859-1/Latin-1. + Regular iconv on Linux at least does not seem to convert these, + but since MARC-8 to UTF-8 generates these composed sequence + we get a better chance of a successful MARC-8 -> ISO-8859-1 + conversion */ + static struct { + unsigned long x1, x2; + unsigned y; + } comb[] = { + { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */ + { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */ + { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */ + { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */ + { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */ + { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */ + /* no need for 0xc6 LATIN CAPITAL LETTER AE */ + { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */ + { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */ + { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */ + { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */ + { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */ + { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */ + { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */ + { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */ + { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */ + { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */ + { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */ + { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */ + { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */ + { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */ + { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */ + /* omitted: 0xd7 MULTIPLICATION SIGN */ + /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */ + { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */ + { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */ + { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */ + { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */ + { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */ + /* omitted: 0xde LATIN CAPITAL LETTER THORN */ + /* omitted: 0xdf LATIN SMALL LETTER SHARP S */ + { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */ + { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */ + { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */ + { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */ + { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */ + { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */ + /* omitted: 0xe6 LATIN SMALL LETTER AE */ + { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */ + { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */ + { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */ + { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */ + { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */ + { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */ + { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */ + { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */ + { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */ + /* omitted: 0xf0 LATIN SMALL LETTER ETH */ + { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */ + { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */ + { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */ + { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */ + { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */ + { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */ + /* omitted: 0xf7 DIVISION SIGN */ + /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */ + { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */ + { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */ + { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */ + { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */ + { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */ + /* omitted: 0xfe LATIN SMALL LETTER THORN */ + { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */ + + { 0, 0, 0} + }; unsigned char *outp = (unsigned char *) *outbuf; + + if (!last && x > 32 && x < 127 && cd->compose_char == 0) + { + cd->compose_char = x; + return 0; + } + else if (cd->compose_char) + { + int i; + for (i = 0; comb[i].x1; i++) + if (cd->compose_char == comb[i].x1 && x == comb[i].x2) + { + x = comb[i].y; + break; + } + if (!comb[i].x1) + { /* not found */ + if (*outbytesleft >= 1) + { + *outp++ = (unsigned char) cd->compose_char; + (*outbytesleft)--; + *outbuf = (char *) outp; + if (!last && x > 32 && x < 127) + { + cd->compose_char = x; + return 0; + } + } + else + { + cd->my_errno = YAZ_ICONV_E2BIG; + return (size_t)(-1); + } + } + /* compose_char and old x combined to one new char: x */ + cd->compose_char = 0; + } if (x > 255 || x < 1) { cd->my_errno = YAZ_ICONV_EILSEQ; @@ -544,7 +663,8 @@ static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft, + int last) { unsigned char *outp = (unsigned char *) *outbuf; if (*outbytesleft >= 4) @@ -565,7 +685,8 @@ static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x, } static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft, + int last) { unsigned char *outp = (unsigned char *) *outbuf; if (*outbytesleft >= 4) @@ -587,7 +708,8 @@ static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x, #if HAVE_WCHAR_H static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft, + int last) { unsigned char *outp = (unsigned char *) *outbuf; @@ -627,6 +749,7 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) #else cd->marc8_comb_x = 0; #endif + cd->compose_char = 0; /* a useful hack: if fromcode has leading @, the library not use YAZ's own conversions .. */ @@ -773,7 +896,8 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, } if (x) { - r = (cd->write_handle)(cd, x, outbuf, outbytesleft); + r = (cd->write_handle)(cd, x, outbuf, outbytesleft, + (*inbytesleft - no_read) == 0 ? 1 : 0); if (r) { /* unable to write it. save it because read_handle cannot diff --git a/test/tsticonv.c b/test/tsticonv.c index a011c37..6f230c1 100644 --- a/test/tsticonv.c +++ b/test/tsticonv.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2005, Index Data ApS * See the file LICENSE for details. * - * $Id: tsticonv.c,v 1.8 2005-02-02 23:27:05 adam Exp $ + * $Id: tsticonv.c,v 1.9 2005-05-08 07:35:23 adam Exp $ */ #if HAVE_CONFIG_H @@ -52,8 +52,11 @@ static int compare_buffers(char *msg, int no, /* some test strings in ISO-8859-1 format */ static const char *iso_8859_1_a[] = { "ax" , - "\330", + "\xd8", "eneb\346r", + "\xe5" "\xd8", + "\xe5" "\xd8" "b", + "\xe5" "\xe5", 0 }; /* same test strings in MARC-8 format */ @@ -61,6 +64,9 @@ static const char *marc8_a[] = { "ax", "\xa2", /* latin capital letter o with stroke */ "eneb\xb5r", /* latin small letter ae */ + "\xea" "a\xa2", + "\xea" "a\xa2" "b", + "\xea" "a" "\xea" "a", 0 }; @@ -265,7 +271,7 @@ static void dconvert(int mandatory, const char *tmpcode) { if (!mandatory) return; - printf ("tsticonv code=%s 1\n", tmpcode); + printf ("tsticonv code=%s i=%d 1\n", tmpcode, i); exit(1); } r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); @@ -273,7 +279,7 @@ static void dconvert(int mandatory, const char *tmpcode) { int e = yaz_iconv_error(cd); - printf ("tsticonv code=%s 2 e=%d\n", tmpcode, e); + printf ("tsticonv code=%s i=%d 2 e=%d\n", tmpcode, i, e); exit(2); } yaz_iconv_close(cd); @@ -283,7 +289,7 @@ static void dconvert(int mandatory, const char *tmpcode) { if (!mandatory) return; - printf ("tsticonv code=%s 3\n", tmpcode); + printf ("tsticonv code=%s i=%d 3\n", tmpcode, i); exit(3); } inbuf = outbuf0; @@ -295,7 +301,7 @@ static void dconvert(int mandatory, const char *tmpcode) if (r == (size_t)(-1)) { int e = yaz_iconv_error(cd); - printf ("tsticonv code=%s 4 e=%d\n", tmpcode, e); + printf ("tsticonv code=%s i=%d 4 e=%d\n", tmpcode, i, e); exit(4); } compare_buffers("dconvert", i, @@ -315,5 +321,5 @@ int main (int argc, char **argv) tst_marc8_to_iso_8859_1(); tst_marc8_to_ucs4b(); tst_ucs4b_to_utf8(); - exit (0); + exit(0); } -- 1.7.10.4