X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=src%2Fsiconv.c;h=d3e078fe6bca4eabbcf59286e259292838a8a3d6;hb=98128a8684aa2474140872c25e8c2f5a231710b1;hp=9e5393b89cad8ddf4af040eabc5f90d0b8d8b553;hpb=a51e14df3987e0c5328d0379c059b772aa4976fa;p=yaz-moved-to-github.git diff --git a/src/siconv.c b/src/siconv.c index 9e5393b..d3e078f 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -1,8 +1,8 @@ /* - * Copyright (C) 1995-2005, Index Data ApS + * Copyright (C) 1995-2006, Index Data ApS * See the file LICENSE for details. * - * $Id: siconv.c,v 1.15 2005-11-06 01:28:09 adam Exp $ + * $Id: siconv.c,v 1.21 2006-04-19 23:48:06 adam Exp $ */ /** * \file siconv.c @@ -11,7 +11,7 @@ * This implements an interface similar to that of iconv and * is used by YAZ to interface with iconv (if present). * For systems where iconv is not present, this layer - * provides a few important conversion: UTF-8, MARC-8, Latin-1. + * provides a few important conversions: UTF-8, MARC-8, Latin-1. */ #if HAVE_CONFIG_H @@ -31,25 +31,45 @@ #include -unsigned long yaz_marc8_1_conv (unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8_2_conv (unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); + + +unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_3_conv (unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_4_conv (unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_5_conv (unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_6_conv (unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_7_conv (unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_8_conv (unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_9_conv (unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); - +unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); + struct yaz_iconv_struct { int my_errno; int init_flag; @@ -72,6 +92,11 @@ struct yaz_iconv_struct { iconv_t iconv_cd; #endif unsigned long compose_char; + + unsigned long write_marc8_comb_ch[8]; + size_t write_marc8_comb_no; + unsigned long write_marc8_last; + const char *write_marc8_page_chr; }; static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp, @@ -95,12 +120,10 @@ static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp, cd->my_errno = YAZ_ICONV_EINVAL; return (size_t) -1; } - if (inp[1] != 0xbb || inp[2] != 0xbf) - { - cd->my_errno = YAZ_ICONV_EILSEQ; - return (size_t) -1; - } - *no_read = 3; + if (inp[1] != 0xbb && inp[2] == 0xbf) + *no_read = 3; + else + *no_read = 0; return 0; } @@ -133,7 +156,7 @@ static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp, else if (inp[0] <= 0xef && inbytesleft >= 3) { x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) | - (inp[1] & 0x3f); + (inp[2] & 0x3f); if (x >= 0x800) *no_read = 3; else @@ -265,6 +288,13 @@ static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp, We'll increment the no_read counter by 1, since we want to skip over the processing of the closing ligature character */ + /* this code is no longer necessary.. our handlers code in + yaz_marc8_?_conv (generated by charconv.tcl) now returns + 0 and no_read=1 when a sequence does not match the input. + The SECOND HALFs in codetables.xml produces a non-existant + entry in the conversion trie.. Hence when met, the input byte is + skipped as it should (in yaz_iconv) + */ #if 0 if (x == 0x0361 || x == 0x0360) *no_read += 1; @@ -369,6 +399,7 @@ static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x, int last) { unsigned char *outp = (unsigned char *) *outbuf; + if (x <= 0x7f && *outbytesleft >= 1) { *outp++ = (unsigned char) x; @@ -438,7 +469,7 @@ static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, static struct { unsigned long x1, x2; unsigned y; - } comb[] = { + } latin1_comb[] = { { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */ { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */ { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */ @@ -507,57 +538,47 @@ static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, }; unsigned char *outp = (unsigned char *) *outbuf; - if (!last && x > 32 && x < 127 && cd->compose_char == 0) - { - cd->compose_char = x; - return 0; - } - else if (cd->compose_char) + if (cd->compose_char) { int i; - for (i = 0; comb[i].x1; i++) - if (cd->compose_char == comb[i].x1 && x == comb[i].x2) + for (i = 0; latin1_comb[i].x1; i++) + if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2) { - x = comb[i].y; + x = latin1_comb[i].y; break; } - if (!comb[i].x1) - { /* not found */ - if (*outbytesleft >= 1) - { - *outp++ = (unsigned char) cd->compose_char; - (*outbytesleft)--; - *outbuf = (char *) outp; - if (!last && x > 32 && x < 127) - { - cd->compose_char = x; - return 0; - } - } - else - { - cd->my_errno = YAZ_ICONV_E2BIG; - return (size_t)(-1); - } + if (*outbytesleft < 1) + { /* no room. Retain compose_char and bail out */ + cd->my_errno = YAZ_ICONV_E2BIG; + return (size_t)(-1); } - /* compose_char and old x combined to one new char: x */ + if (!latin1_comb[i].x1) + { /* not found. Just write compose_char */ + *outp++ = (unsigned char) cd->compose_char; + (*outbytesleft)--; + *outbuf = (char *) outp; + } + /* compose_char used so reset it. x now holds current char */ cd->compose_char = 0; } - if (x > 255 || x < 1) + + if (!last && x > 32 && x < 127 && cd->compose_char == 0) { - cd->my_errno = YAZ_ICONV_EILSEQ; - return (size_t) -1; + cd->compose_char = x; + return 0; } - else if (*outbytesleft >= 1) + else if (x > 255 || x < 1) { - *outp++ = (unsigned char) x; - (*outbytesleft)--; + cd->my_errno = YAZ_ICONV_EILSEQ; + return (size_t) -1; } - else + else if (*outbytesleft < 1) { cd->my_errno = YAZ_ICONV_E2BIG; return (size_t)(-1); } + *outp++ = (unsigned char) x; + (*outbytesleft)--; *outbuf = (char *) outp; return 0; } @@ -607,6 +628,194 @@ static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x, return 0; } +static unsigned long lookup_marc8(yaz_iconv_t cd, + unsigned long x, int *comb, + const char **page_chr) +{ + char utf8_buf[7]; + char *utf8_outbuf = utf8_buf; + size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r; + + r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0); + if (r == (size_t)(-1)) + { + cd->my_errno = YAZ_ICONV_EILSEQ; + return 0; + } + else + { + unsigned char *inp; + size_t inbytesleft, no_read_sub = 0; + unsigned long x; + + *utf8_outbuf = '\0'; + inp = (unsigned char *) utf8_buf; + inbytesleft = strlen(utf8_buf); + + x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033(B"; + return x; + } + x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033g"; + return x; + } + x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033b"; + return x; + } + x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033p"; + return x; + } + x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033(2"; + return x; + } + x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033(N"; + return x; + } + x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033(3"; + return x; + } + x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033(S"; + return x; + } + x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033(1"; + return x; + } + cd->my_errno = YAZ_ICONV_EILSEQ; + return x; + } +} + +static size_t flush_combos(yaz_iconv_t cd, + char **outbuf, size_t *outbytesleft) +{ + unsigned long y = cd->write_marc8_last; + unsigned char byte, second_half = 0; + char out_buf[10]; + size_t i, out_no = 0; + + if (!y) + return 0; + + byte = (unsigned char )((y>>16) & 0xff); + if (byte) + out_buf[out_no++] = byte; + byte = (unsigned char)((y>>8) & 0xff); + if (byte) + out_buf[out_no++] = byte; + byte = (unsigned char )(y & 0xff); + if (byte) + out_buf[out_no++] = byte; + + if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft) + { + cd->my_errno = YAZ_ICONV_E2BIG; + return (size_t) (-1); + } + + for (i = 0; i < cd->write_marc8_comb_no; i++) + { + /* all MARC-8 combined characters are simple bytes */ + byte = (unsigned char )(cd->write_marc8_comb_ch[i]); + if (byte == 0xEB) + second_half = 0xEC; + else if (byte == 0xFA) + second_half = 0xFB; + + *(*outbuf)++ = byte; + (*outbytesleft)--; + } + memcpy(*outbuf, out_buf, out_no); + *outbuf += out_no; + (*outbytesleft) -= out_no; + if (second_half) + { + *(*outbuf)++ = second_half; + (*outbytesleft)--; + } + + cd->write_marc8_last = 0; + cd->write_marc8_comb_no = 0; + return 0; +} + +static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft, + int last) +{ + int comb = 0; + const char *page_chr = 0; + unsigned long y = lookup_marc8(cd, x, &comb, &page_chr); + + if (!y) + return (size_t) (-1); + + if (comb) + { + if (cd->write_marc8_comb_no < 6) + cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y; + } + else + { + size_t r = flush_combos(cd, outbuf, outbytesleft); + if (r) + return r; + if (strcmp(page_chr, cd->write_marc8_page_chr)) + { + size_t plen = strlen(page_chr); + + if (*outbytesleft < plen) + { + cd->my_errno = YAZ_ICONV_E2BIG; + return (size_t) (-1); + } + memcpy(*outbuf, page_chr, plen); + (*outbuf) += plen; + (*outbytesleft) -= plen; + cd->write_marc8_page_chr = page_chr; + } + cd->write_marc8_last = y; + } + if (last) + { + size_t r = flush_combos(cd, outbuf, outbytesleft); + if (r) + { + if (comb) + cd->write_marc8_comb_no--; + else + cd->write_marc8_last = 0; + return r; + } + } + return 0; +} + #if HAVE_WCHAR_H static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x, char **outbuf, size_t *outbytesleft, @@ -648,6 +857,10 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) cd->comb_offset = cd->comb_size = 0; cd->compose_char = 0; + cd->write_marc8_comb_no = 0; + cd->write_marc8_last = 0; + cd->write_marc8_page_chr = "\033(B"; + /* a useful hack: if fromcode has leading @, the library not use YAZ's own conversions .. */ if (fromcode[0] == '@') @@ -680,6 +893,8 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) cd->write_handle = yaz_write_UCS4; else if (!yaz_matchstr(tocode, "UCS4LE")) cd->write_handle = yaz_write_UCS4LE; + else if (!yaz_matchstr(tocode, "MARC8")) + cd->write_handle = yaz_write_marc8; #if HAVE_WCHAR_H else if (!yaz_matchstr(tocode, "WCHAR_T")) cd->write_handle = yaz_write_wchar_t; @@ -712,6 +927,7 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, { char *inbuf0; size_t r = 0; + #if HAVE_ICONV_H if (cd->iconv_cd) { @@ -799,9 +1015,12 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, { /* unable to write it. save it because read_handle cannot rewind .. */ - cd->unget_x = x; - cd->no_read_x = no_read; - break; + if (cd->my_errno == YAZ_ICONV_E2BIG) + { + cd->unget_x = x; + cd->no_read_x = no_read; + break; + } } cd->unget_x = 0; } @@ -826,7 +1045,6 @@ int yaz_iconv_close (yaz_iconv_t cd) return 0; } - /* * Local variables: * c-basic-offset: 4