X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=blobdiff_plain;f=test%2Ftest_iconv.c;h=76d3965afd2aeec02d14a2fc28111392e749368e;hp=857dce73728f7a9ea981ac4f86646c9f1db73c36;hb=6ee0d7c0404834a0a59547c3bd7e2686f838ce37;hpb=d0e351c12fff564d876958e860338d43716dc269 diff --git a/test/test_iconv.c b/test/test_iconv.c index 857dce7..76d3965 100644 --- a/test/test_iconv.c +++ b/test/test_iconv.c @@ -1,5 +1,5 @@ /* This file is part of the YAZ toolkit. - * Copyright (C) 1995-2010 Index Data + * Copyright (C) 1995-2013 Index Data * See the file LICENSE for details. */ #if HAVE_CONFIG_H @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -23,7 +22,7 @@ static int compare_buffers(char *msg, int no, if (expect_len == got_len && !memcmp(expect_buf, got_buf, expect_len)) return 1; - + if (0) /* use 1 see how the buffers differ (for debug purposes) */ { int i; @@ -33,21 +32,21 @@ static int compare_buffers(char *msg, int no, { char got_char[10]; char expect_char[10]; - + if (i < got_len) sprintf(got_char, "%02X", got_buf[i]); else sprintf(got_char, "? "); - + if (i < expect_len) sprintf(expect_char, "%02X", expect_buf[i]); else sprintf(expect_char, "? "); - + printf("%02d %s %s %c\n", i, got_char, expect_char, got_buf[i] == expect_buf[i] ? ' ' : '*'); - + } } return 0; @@ -125,7 +124,7 @@ static int tst_convert_x(yaz_iconv_t cd, const char *buf, const char *cmpbuf, break; } } - if (wrbuf_len(b) == strlen(cmpbuf) + if (wrbuf_len(b) == strlen(cmpbuf) && !memcmp(cmpbuf, wrbuf_buf(b), wrbuf_len(b))) ; else @@ -140,7 +139,7 @@ static int tst_convert_x(yaz_iconv_t cd, const char *buf, const char *cmpbuf, wrbuf_rewind(w); wrbuf_write_escaped(w, wrbuf_buf(b), wrbuf_len(b)); yaz_log(YLOG_LOG, "got %s", wrbuf_cstr(w)); - + wrbuf_rewind(w); wrbuf_puts_escaped(w, cmpbuf); yaz_log(YLOG_LOG, "exp %s", wrbuf_cstr(w)); @@ -157,28 +156,18 @@ static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf) return tst_convert_x(cd, buf, cmpbuf, 0); } -/* some test strings in ISO-8859-1 format */ -static const char *iso_8859_1_a[] = { - "ax" , - "\xd8", - "eneb\346r", - "\xe5" "\xd8", - "\xe5" "\xd8" "b", - "\xe5" "\xe5", - 0 }; - static void tst_marc8_to_ucs4b(void) { yaz_iconv_t cd = yaz_iconv_open("UCS4", "MARC8"); YAZ_CHECK(cd); if (!cd) return; - + YAZ_CHECK(tst_convert_l( cd, 0, "\033$1" "\x21\x2B\x3B" /* FF1F */ "\033(B" "o", - 8, + 8, "\x00\x00\xFF\x1F" "\x00\x00\x00o")); YAZ_CHECK(tst_convert_l( cd, @@ -198,7 +187,7 @@ static void tst_marc8_to_ucs4b(void) "\x21\x33\x53" /* UCS 5206 */ "\x21\x44\x2B" /* UCS 6790 */ "\033(B", - 24, + 24, "\x00\x00\x7C\xFB" "\x00\x00\x7D\x71" "\x00\x00\x5B\x89" @@ -210,13 +199,13 @@ static void tst_marc8_to_ucs4b(void) cd, 0, "\xB0\xB2", /* AYN and oSLASH */ - 8, + 8, "\x00\x00\x02\xBB" "\x00\x00\x00\xF8")); YAZ_CHECK(tst_convert_l( cd, 0, "\xF6\x61", /* a underscore */ - 8, + 8, "\x00\x00\x00\x61" "\x00\x00\x03\x32")); YAZ_CHECK(tst_convert_l( @@ -245,7 +234,7 @@ static void tst_marc8_to_ucs4b(void) cd, 0, "\xe5\xe8\x41", - 12, + 12, "\x00\x00\x00\x41" "\x00\x00\x03\x04" "\x00\x00\x03\x08")); /* bug #416 */ YAZ_CHECK(tst_convert_l( @@ -259,7 +248,7 @@ static void tst_marc8_to_ucs4b(void) cd, 0, "\xFA\x74\xFB\x73", - 12, + 12, "\x00\x00\x00\x74" "\x00\x00\x03\x60" "\x00\x00\x00\x73")); yaz_iconv_close(cd); @@ -280,7 +269,7 @@ static void tst_ucs4b_to_utf8(void) YAZ_CHECK(tst_convert_l( cd, - 8, + 8, "\x00\x00\xAE\x0E\x00\x00\xC0\xF4", 6, "\xEA\xB8\x8E\xEC\x83\xB4")); @@ -289,6 +278,15 @@ static void tst_ucs4b_to_utf8(void) static void dconvert(int mandatory, const char *tmpcode) { + /* some test strings in ISO-8859-1 format */ + static const char *iso_8859_1_a[] = { + "ax" , + "\xd8", + "eneb\346r", + "\xe5" "\xd8", + "\xe5" "\xd8" "b", + "\xe5" "\xe5", + 0 }; int i; int ret; yaz_iconv_t cd; @@ -314,7 +312,7 @@ static void dconvert(int mandatory, const char *tmpcode) yaz_iconv_close(cd); if (r == (size_t) (-1)) return; - + cd = yaz_iconv_open("ISO-8859-1", tmpcode); YAZ_CHECK(cd || !mandatory); if (!cd) @@ -334,7 +332,7 @@ static void dconvert(int mandatory, const char *tmpcode) } YAZ_CHECK(r != (size_t) (-1)); - if (r != (size_t)(-1)) + if (r != (size_t)(-1)) { ret = compare_buffers("dconvert", i, strlen(iso_8859_1_a[i]), iso_8859_1_a[i], @@ -363,7 +361,7 @@ int utf8_check(unsigned c) return 0; for (i = 0; i<4; i++) src[i] = c >> (i*8); - + r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); yaz_iconv_close(cd); @@ -390,7 +388,7 @@ int utf8_check(unsigned c) } return 1; } - + static void tst_marc8_to_utf8(void) { yaz_iconv_t cd = yaz_iconv_open("UTF-8", "MARC8"); @@ -399,10 +397,10 @@ static void tst_marc8_to_utf8(void) if (!cd) return; - YAZ_CHECK(tst_convert(cd, "Cours de math", + YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math")); /* COMBINING ACUTE ACCENT */ - YAZ_CHECK(tst_convert(cd, "Cours de mathâe", + YAZ_CHECK(tst_convert(cd, "Cours de mathâe", "Cours de mathe\xcc\x81")); YAZ_CHECK(tst_convert(cd, "\xea" "a", "a\xcc\x8a")); @@ -434,10 +432,10 @@ static void tst_marc8s_to_utf8(void) if (!cd) return; - YAZ_CHECK(tst_convert(cd, "Cours de math", + YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math")); /* E9: LATIN SMALL LETTER E WITH ACUTE */ - YAZ_CHECK(tst_convert(cd, "Cours de mathâe", + YAZ_CHECK(tst_convert(cd, "Cours de mathâe", "Cours de math\xc3\xa9")); yaz_iconv_close(cd); @@ -466,21 +464,21 @@ static void tst_marc8_to_latin1(void) YAZ_CHECK(tst_convert(cd, "\xea" "a" "\xea" "a", "\xe5" "\xe5")); - YAZ_CHECK(tst_convert(cd, "Cours de math", + YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math")); - YAZ_CHECK(tst_convert(cd, "Cours de mathâe", + YAZ_CHECK(tst_convert(cd, "Cours de mathâe", "Cours de mathé")); - YAZ_CHECK(tst_convert(cd, "12345678âe", + YAZ_CHECK(tst_convert(cd, "12345678âe", "12345678é")); - YAZ_CHECK(tst_convert(cd, "123456789âe", + YAZ_CHECK(tst_convert(cd, "123456789âe", "123456789é")); - YAZ_CHECK(tst_convert(cd, "1234567890âe", + YAZ_CHECK(tst_convert(cd, "1234567890âe", "1234567890é")); - YAZ_CHECK(tst_convert(cd, "12345678901âe", + YAZ_CHECK(tst_convert(cd, "12345678901âe", "12345678901é")); - YAZ_CHECK(tst_convert(cd, "Cours de mathâem", + YAZ_CHECK(tst_convert(cd, "Cours de mathâem", "Cours de mathém")); - YAZ_CHECK(tst_convert(cd, "Cours de mathâematiques", + YAZ_CHECK(tst_convert(cd, "Cours de mathâematiques", "Cours de mathématiques")); yaz_iconv_close(cd); @@ -508,13 +506,16 @@ static void tst_utf8_to_marc8(const char *marc8_type) /** UPPERCASE SCANDINAVIAN O */ YAZ_CHECK(tst_convert(cd, "S\xc3\x98", "S\xa2")); - /** ARING */ + /** ARING (NFD) */ YAZ_CHECK(tst_convert(cd, "A" "\xCC\x8A", "\xEA" "A")); + /** ARING (NFC) */ + YAZ_CHECK(tst_convert(cd, "\xC3\x85", "\xEA" "A")); + /** A MACRON + UMLAUT, DIAERESIS */ YAZ_CHECK(tst_convert(cd, "A" "\xCC\x84" "\xCC\x88", "\xE5\xE8\x41")); - + /* Ligature spanning two characters */ YAZ_CHECK(tst_convert(cd, "\x74" "\xCD\xA1" "\x73", /* UTF-8 */ @@ -540,21 +541,21 @@ static void tst_utf8_to_marc8(const char *marc8_type) YAZ_CHECK(tst_convert(cd, "(\xe2\x81\xb0)", /* UTF-8 */ "(\033p0\x1bs)")); - - + + /** bug #1778 */ YAZ_CHECK(tst_convert(cd, /* offset 0x530 in UTF-8 rec marccol4.u8.marc */ - "\xE3\x83\xB3" "\xE3\x82\xBF" + "\xE3\x83\xB3" "\xE3\x82\xBF" "\xCC\x84" "\xCC\x84" "\xE3\x83\xBC" /* UTF-8 */, "\x1B\x24\x31" "\x69\x25\x73" - "\x1B\x28\x42" "\xE5\xE5" "\x1B\x24\x31" + "\x1B\x28\x42" "\xE5\xE5" "\x1B\x24\x31" "\x69\x25\x3F" "\x69\x21\x3C" "\x1B\x28\x42")); - + /** bug #2120 */ - YAZ_CHECK(tst_convert(cd, + YAZ_CHECK(tst_convert(cd, "\xCE\x94\xCE\xB5\xCF\x84" "\xCE\xBF\xCF\x81\xCE\xB1" "\xCE\xBA\xCE\xB7\xCF\x82\x2C", @@ -563,7 +564,7 @@ static void tst_utf8_to_marc8(const char *marc8_type) "\x61\x6D\x6A\x77" "\x1B\x28\x42\x2C" )); - + { char *inbuf0 = "\xe2\x81\xb0"; char *inbuf = inbuf0; @@ -680,9 +681,9 @@ static void tst_utf8_codes(void) YAZ_CHECK(utf8_check(100000000)); } -static void tst_danmarc_to_latin1(void) +static void tst_danmarc_to_utf8(void) { - yaz_iconv_t cd = yaz_iconv_open("iso-8859-1", "danmarc"); + yaz_iconv_t cd = yaz_iconv_open("utf-8", "danmarc"); YAZ_CHECK(cd); if (!cd) @@ -692,14 +693,49 @@ static void tst_danmarc_to_latin1(void) YAZ_CHECK(tst_convert(cd, "a@@b", "a@b")); YAZ_CHECK(tst_convert(cd, "a@@@@b", "a@@b")); - YAZ_CHECK(tst_convert(cd, "@000ab", "\nb")); - YAZ_CHECK(tst_convert(cd, "@\xe5", "aa")); - YAZ_CHECK(tst_convert(cd, "@\xc5.", "Aa.")); - + YAZ_CHECK(tst_convert(cd, "@*", "*")); + YAZ_CHECK(tst_convert(cd, "@@", "@")); + YAZ_CHECK(tst_convert(cd, "@\xa4", "\xC2\xA4")); + YAZ_CHECK(tst_convert(cd, "\xa4", "\xC2\xA4")); + YAZ_CHECK(tst_convert(cd, "@\xe5", "\xEA\x9C\xB3")); + YAZ_CHECK(tst_convert(cd, "@\xc5.", "\xEA\x9C\xB2" ".")); + + YAZ_CHECK(tst_convert(cd, "@a733", "\xEA\x9C\xB3")); + YAZ_CHECK(tst_convert(cd, "@a732.", "\xEA\x9C\xB2" ".")); + + YAZ_CHECK(tst_convert(cd, "a@03BBb", "a\xce\xbb" "b")); /* lambda */ + yaz_iconv_close(cd); } +static void tst_utf8_to_danmarc(void) +{ + yaz_iconv_t cd = yaz_iconv_open("danmarc", "utf-8"); + + YAZ_CHECK(cd); + if (!cd) + return; + + YAZ_CHECK(tst_convert(cd, "ax", "ax")); + + YAZ_CHECK(tst_convert(cd, "a@b", "a@@b")); + YAZ_CHECK(tst_convert(cd, "a@@b", "a@@@@b")); + + YAZ_CHECK(tst_convert(cd, "*", "@*")); + YAZ_CHECK(tst_convert(cd, "@", "@@")); + YAZ_CHECK(tst_convert(cd, "\xC2\xA4", "\xa4")); + + YAZ_CHECK(tst_convert(cd, "a\xc3\xa5" "b", "a\xe5" "b")); /* aring */ + YAZ_CHECK(tst_convert(cd, "a\xce\xbb" "b", "a@03BBb")); /* lambda */ + + YAZ_CHECK(tst_convert(cd, "\xEA\x9C\xB2" ".", "@\xc5.")); + YAZ_CHECK(tst_convert(cd, "\xEA\x9C\xB3", "@\xe5")); + + yaz_iconv_close(cd); +} + + int main (int argc, char **argv) { @@ -720,7 +756,8 @@ int main (int argc, char **argv) tst_utf8_to_marc8("marc8lossy"); tst_utf8_to_marc8("marc8lossless"); - tst_danmarc_to_latin1(); + tst_danmarc_to_utf8(); + tst_utf8_to_danmarc(); tst_latin1_to_marc8();