X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=src%2Fsiconv.c;h=27b54ee4373d428ae7759fd830cc562b29eb8c2d;hb=062ae8c4cbdfd50766b13f34c11a5254825075a4;hp=8557bbd301ecc4dafb6cc6c43a8b38ebae06bbab;hpb=cccb7ecd623450d5b3ca2391327788c84aed71c8;p=yaz-moved-to-github.git diff --git a/src/siconv.c b/src/siconv.c index 8557bbd..27b54ee 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -35,59 +35,39 @@ #include #include +#include #include "iconv-p.h" - -unsigned long yaz_marc8_42_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_45_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_67_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_62_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_70_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_32_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_4E_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_51_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_33_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_34_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_53_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_31_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); - - -unsigned long yaz_marc8r_42_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_45_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_67_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_62_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_70_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_32_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_4E_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_51_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_33_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_34_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_53_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_31_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); +typedef unsigned long yaz_conv_func_t(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining, + unsigned mask, int boffset); + + +yaz_conv_func_t yaz_marc8_42_conv; +yaz_conv_func_t yaz_marc8_45_conv; +yaz_conv_func_t yaz_marc8_67_conv; +yaz_conv_func_t yaz_marc8_62_conv; +yaz_conv_func_t yaz_marc8_70_conv; +yaz_conv_func_t yaz_marc8_32_conv; +yaz_conv_func_t yaz_marc8_4E_conv; +yaz_conv_func_t yaz_marc8_51_conv; +yaz_conv_func_t yaz_marc8_33_conv; +yaz_conv_func_t yaz_marc8_34_conv; +yaz_conv_func_t yaz_marc8_53_conv; +yaz_conv_func_t yaz_marc8_31_conv; + +yaz_conv_func_t yaz_marc8r_42_conv; +yaz_conv_func_t yaz_marc8r_45_conv; +yaz_conv_func_t yaz_marc8r_67_conv; +yaz_conv_func_t yaz_marc8r_62_conv; +yaz_conv_func_t yaz_marc8r_70_conv; +yaz_conv_func_t yaz_marc8r_32_conv; +yaz_conv_func_t yaz_marc8r_4E_conv; +yaz_conv_func_t yaz_marc8r_51_conv; +yaz_conv_func_t yaz_marc8r_33_conv; +yaz_conv_func_t yaz_marc8r_34_conv; +yaz_conv_func_t yaz_marc8r_53_conv; +yaz_conv_func_t yaz_marc8r_31_conv; struct yaz_iconv_struct { int my_errno; @@ -116,6 +96,7 @@ struct yaz_iconv_struct { unsigned write_marc8_second_half_char; unsigned long write_marc8_last; + int write_marc8_ncr; const char *write_marc8_lpage; const char *write_marc8_g0; const char *write_marc8_g1; @@ -372,43 +353,40 @@ static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp, { case 'B': /* Basic ASCII */ case 's': /* ASCII */ + x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; case 'E': /* ANSEL */ - x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb); - if (!x) - { - no_read_sub = 0; - x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb); - } + x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128); break; case 'g': /* Greek */ - x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'b': /* Subscripts */ - x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'p': /* Superscripts */ - x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '2': /* Basic Hebrew */ - x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'N': /* Basic Cyrillic */ - x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'Q': /* Extended Cyrillic */ - x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '3': /* Basic Arabic */ - x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '4': /* Extended Arabic */ - x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'S': /* Greek */ - x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '1': /* Chinese, Japanese, Korean (EACC) */ - x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; default: *no_read = 0; @@ -524,67 +502,67 @@ static unsigned long lookup_marc8(yaz_iconv_t cd, inp = (unsigned char *) utf8_buf; inbytesleft = strlen(utf8_buf); - x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(B"; return x; } - x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(B"; return x; } - x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "b"; return x; } - x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "p"; return x; } - x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(2"; return x; } - x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(N"; return x; } - x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(Q"; return x; } - x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(3"; return x; } - x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(4"; return x; } - x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "(S"; return x; } - x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0); if (x) { *page_chr = ESC "$1"; @@ -599,9 +577,6 @@ static size_t flush_combos(yaz_iconv_t cd, char **outbuf, size_t *outbytesleft) { unsigned long y = cd->write_marc8_last; - unsigned char byte; - char out_buf[4]; - size_t out_no = 0; if (!y) return 0; @@ -615,25 +590,38 @@ static size_t flush_combos(yaz_iconv_t cd, return r; } - byte = (unsigned char )((y>>16) & 0xff); - if (byte) - out_buf[out_no++] = byte; - byte = (unsigned char)((y>>8) & 0xff); - if (byte) - out_buf[out_no++] = byte; - byte = (unsigned char )(y & 0xff); - if (byte) - out_buf[out_no++] = byte; - - if (out_no + 2 >= *outbytesleft) + if (9 >= *outbytesleft) { cd->my_errno = YAZ_ICONV_E2BIG; return (size_t) (-1); } + if (cd->write_marc8_ncr) + { + yaz_snprintf(*outbuf, 9, "&#x%04x;", y); + (*outbytesleft) -= 8; + (*outbuf) += 8; + } + else + { + char out_buf[4]; + size_t out_no = 0; + unsigned char byte; + + + byte = (unsigned char )((y>>16) & 0xff); + if (byte) + out_buf[out_no++] = byte; + byte = (unsigned char)((y>>8) & 0xff); + if (byte) + out_buf[out_no++] = byte; + byte = (unsigned char )(y & 0xff); + if (byte) + out_buf[out_no++] = byte; + memcpy(*outbuf, out_buf, out_no); + *outbuf += out_no; + (*outbytesleft) -= out_no; + } - memcpy(*outbuf, out_buf, out_no); - *outbuf += out_no; - (*outbytesleft) -= out_no; if (cd->write_marc8_second_half_char) { *(*outbuf)++ = cd->write_marc8_second_half_char; @@ -641,6 +629,7 @@ static size_t flush_combos(yaz_iconv_t cd, } cd->write_marc8_last = 0; + cd->write_marc8_ncr = 0; cd->write_marc8_lpage = 0; cd->write_marc8_second_half_char = 0; return 0; @@ -698,14 +687,27 @@ static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft, + int loss_mode) { int comb = 0; + int enable_ncr = 0; const char *page_chr = 0; unsigned long y = lookup_marc8(cd, x, &comb, &page_chr); if (!y) - return (size_t) (-1); + { + if (loss_mode == 0 || cd->my_errno != YAZ_ICONV_EILSEQ) + return (size_t) (-1); + page_chr = ESC "(B"; + if (loss_mode == 1) + y = '|'; + else + { + y = x; + enable_ncr = 1; + } + } if (comb) { @@ -737,6 +739,7 @@ static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x, cd->write_marc8_last = y; cd->write_marc8_lpage = page_chr; + cd->write_marc8_ncr = enable_ncr; } return 0; } @@ -751,8 +754,31 @@ static size_t yaz_flush_marc8(yaz_iconv_t cd, return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, ESC "(B"); } -static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x, - char **outbuf, size_t *outbytesleft) +static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft, + int loss_mode); + +static size_t yaz_write_marc8_normal(yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft) +{ + return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 0); +} + +static size_t yaz_write_marc8_lossy(yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft) +{ + return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 1); +} + +static size_t yaz_write_marc8_lossless(yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft) +{ + return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 2); +} + +static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft, + int loss_mode) { int i; for (i = 0; latin1_comb[i].x1; i++) @@ -767,11 +793,11 @@ static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x, const char *lpage = cd->write_marc8_lpage; r = yaz_write_marc8_2(cd, latin1_comb[i].x1, - outbuf, outbytesleft); + outbuf, outbytesleft, loss_mode); if (r) return r; r = yaz_write_marc8_2(cd, latin1_comb[i].x2, - outbuf, outbytesleft); + outbuf, outbytesleft, loss_mode); if (r && cd->my_errno == YAZ_ICONV_E2BIG) { /* not enough room. reset output to original values */ @@ -783,7 +809,7 @@ static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x, return r; } } - return yaz_write_marc8_2(cd, x, outbuf, outbytesleft); + return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, loss_mode); } @@ -870,12 +896,22 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) cd->write_handle = yaz_write_UCS4LE; else if (!yaz_matchstr(tocode, "MARC8")) { - cd->write_handle = yaz_write_marc8; + cd->write_handle = yaz_write_marc8_normal; cd->flush_handle = yaz_flush_marc8; } else if (!yaz_matchstr(tocode, "MARC8s")) { - cd->write_handle = yaz_write_marc8; + cd->write_handle = yaz_write_marc8_normal; + cd->flush_handle = yaz_flush_marc8; + } + else if (!yaz_matchstr(tocode, "MARC8lossy")) + { + cd->write_handle = yaz_write_marc8_lossy; + cd->flush_handle = yaz_flush_marc8; + } + else if (!yaz_matchstr(tocode, "MARC8lossless")) + { + cd->write_handle = yaz_write_marc8_lossless; cd->flush_handle = yaz_flush_marc8; } else if (!yaz_matchstr(tocode, "advancegreek")) @@ -956,13 +992,14 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, { cd->my_errno = YAZ_ICONV_UNKNOWN; cd->g0_mode = 'B'; - cd->g1_mode = 'B'; + cd->g1_mode = 'E'; cd->comb_offset = cd->comb_size = 0; cd->compose_char = 0; cd->write_marc8_second_half_char = 0; cd->write_marc8_last = 0; + cd->write_marc8_ncr = 0; cd->write_marc8_lpage = 0; cd->write_marc8_g0 = ESC "(B"; cd->write_marc8_g1 = 0;