From: Adam Dickmeiss Date: Tue, 25 Mar 2008 20:50:41 +0000 (+0100) Subject: Handle G1 in MARC-8 decoding. X-Git-Tag: v3.0.30~64 X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=commitdiff_plain;h=ffa73561a78abb70d06d51cdcbf8ef3f8536a0ab Handle G1 in MARC-8 decoding. The MARC-8 decoding now handles G1 sequences in any code page. The code until now has only handled G1 in the ANSEL page. --- diff --git a/src/charconv.tcl b/src/charconv.tcl index 5732f8d..3086d26 100755 --- a/src/charconv.tcl +++ b/src/charconv.tcl @@ -36,16 +36,16 @@ proc preamble_trie {ofilehandle ifiles ofile} { " puts $f { static unsigned long lookup(struct yaz_iconv_trie **ptrs, int ptr, unsigned char *inp, - size_t inbytesleft, size_t *no_read, int *combining) + size_t inbytesleft, size_t *no_read, int *combining, unsigned mask, int boffset) { struct yaz_iconv_trie *t = (ptr > 0) ? ptrs[ptr-1] : 0; if (!t || inbytesleft < 1) return 0; if (t->dir) { - size_t ch = inp[0] & 0xff; + size_t ch = (inp[0] & mask) + boffset; unsigned long code = - lookup(ptrs, t->dir[ch].ptr, inp+1, inbytesleft-1, no_read, combining); + lookup(ptrs, t->dir[ch].ptr, inp+1, inbytesleft-1, no_read, combining, mask, boffset); if (code) { (*no_read)++; @@ -67,7 +67,13 @@ proc preamble_trie {ofilehandle ifiles ofile} { size_t len = strlen(flat->from); if (len <= inbytesleft) { - if (memcmp(flat->from, inp, len) == 0) + size_t i; + for (i = 0; i < len; i++) + { + if (((unsigned char *) flat->from)[i] != (inp[i] & mask) + boffset) + break; + } + if (i == len) { *no_read = len; *combining = flat->combining; @@ -256,11 +262,11 @@ proc dump_trie {ofilehandle} { puts $f "" puts $f "unsigned long yaz_$trie(prefix)_conv - (unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining) + (unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining, unsigned mask, int boffset) { unsigned long code; - code = lookup($trie(prefix)ptrs, 1, inp, inbytesleft, no_read, combining); + code = lookup($trie(prefix)ptrs, 1, inp, inbytesleft, no_read, combining, mask, boffset); if (!code) { *no_read = 1; diff --git a/src/siconv.c b/src/siconv.c index 8557bbd..3a9c8e1 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -37,57 +37,36 @@ #include #include "iconv-p.h" - -unsigned long yaz_marc8_42_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_45_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_67_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_62_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_70_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_32_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_4E_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_51_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_33_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_34_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_53_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_31_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); - - -unsigned long yaz_marc8r_42_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_45_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_67_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_62_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_70_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_32_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_4E_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_51_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_33_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_34_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_53_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_31_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); +typedef unsigned long yaz_conv_func_t(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining, + unsigned mask, int boffset); + + +yaz_conv_func_t yaz_marc8_42_conv; +yaz_conv_func_t yaz_marc8_45_conv; +yaz_conv_func_t yaz_marc8_67_conv; +yaz_conv_func_t yaz_marc8_62_conv; +yaz_conv_func_t yaz_marc8_70_conv; +yaz_conv_func_t yaz_marc8_32_conv; +yaz_conv_func_t yaz_marc8_4E_conv; +yaz_conv_func_t yaz_marc8_51_conv; +yaz_conv_func_t yaz_marc8_33_conv; +yaz_conv_func_t yaz_marc8_34_conv; +yaz_conv_func_t yaz_marc8_53_conv; +yaz_conv_func_t yaz_marc8_31_conv; + +yaz_conv_func_t yaz_marc8r_42_conv; +yaz_conv_func_t yaz_marc8r_45_conv; +yaz_conv_func_t yaz_marc8r_67_conv; +yaz_conv_func_t yaz_marc8r_62_conv; +yaz_conv_func_t yaz_marc8r_70_conv; +yaz_conv_func_t yaz_marc8r_32_conv; +yaz_conv_func_t yaz_marc8r_4E_conv; +yaz_conv_func_t yaz_marc8r_51_conv; +yaz_conv_func_t yaz_marc8r_33_conv; +yaz_conv_func_t yaz_marc8r_34_conv; +yaz_conv_func_t yaz_marc8r_53_conv; +yaz_conv_func_t yaz_marc8r_31_conv; struct yaz_iconv_struct { int my_errno; @@ -372,43 +351,40 @@ static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp, { case 'B': /* Basic ASCII */ case 's': /* ASCII */ + x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; case 'E': /* ANSEL */ - x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb); - if (!x) - { - no_read_sub = 0; - x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb); - } + x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128); break; case 'g': /* Greek */ - x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'b': /* Subscripts */ - x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'p': /* Superscripts */ - x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '2': /* Basic Hebrew */ - x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'N': /* Basic Cyrillic */ - x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'Q': /* Extended Cyrillic */ - x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '3': /* Basic Arabic */ - x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '4': /* Extended Arabic */ - x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'S': /* Greek */ - x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '1': /* Chinese, Japanese, Korean (EACC) */ - x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; default: *no_read = 0; @@ -524,67 +500,67 @@ static unsigned long lookup_marc8(yaz_iconv_t cd, inp = (unsigned char *) utf8_buf; inbytesleft = strlen(utf8_buf); - x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(B"; return x; } - x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(B"; return x; } - x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "b"; return x; } - x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "p"; return x; } - x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(2"; return x; } - x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(N"; return x; } - x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(Q"; return x; } - x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(3"; return x; } - x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(4"; return x; } - x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(S"; return x; } - x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "$1"; @@ -956,7 +932,7 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, { cd->my_errno = YAZ_ICONV_UNKNOWN; cd->g0_mode = 'B'; - cd->g1_mode = 'B'; + cd->g1_mode = 'E'; cd->comb_offset = cd->comb_size = 0; cd->compose_char = 0; diff --git a/test/tsticonv.c b/test/tsticonv.c index f31b6d6..08394ed 100644 --- a/test/tsticonv.c +++ b/test/tsticonv.c @@ -423,6 +423,9 @@ static void tst_marc8_to_utf8(void) YAZ_CHECK(tst_convert_x(cd, ESC "(", "", YAZ_ICONV_EINVAL)); YAZ_CHECK(tst_convert_x(cd, ESC "(B", "", 0)); + YAZ_CHECK(tst_convert(cd, ESC "(B" "\x31", "1")); /* ASCII in G0 */ + YAZ_CHECK(tst_convert(cd, ESC ")B" "\xB1", "1")); /* ASCII in G1 */ + yaz_iconv_close(cd); }