X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=util%2Fsiconv.c;h=73d7148c06942fe3629c78419fd5f1875cfb5128;hb=f9d9c90585dcce8b81c69104f1634da29a039cb3;hp=a01b10393b661244fab84ba692c074ea656b7b51;hpb=7dec30565506b5ecdd449866ebabe67bd816fc59;p=yaz-moved-to-github.git diff --git a/util/siconv.c b/util/siconv.c index a01b103..73d7148 100644 --- a/util/siconv.c +++ b/util/siconv.c @@ -1,10 +1,12 @@ /* - * Copyright (c) 1997-2002, Index Data + * Copyright (c) 1997-2003, Index Data * See the file LICENSE for details. * - * $Id: siconv.c,v 1.1 2002-08-27 14:02:13 adam Exp $ + * $Id: siconv.c,v 1.9 2003-01-06 08:20:28 adam Exp $ */ +/* mini iconv and wrapper for system iconv library (if present) */ + #if HAVE_CONFIG_H #include #endif @@ -12,6 +14,9 @@ #include #include #include +#if HAVE_WCHAR_H +#include +#endif #if HAVE_ICONV_H #include @@ -19,10 +24,16 @@ #include +unsigned long yaz_marc8_conv (unsigned char *inp, size_t inbytesleft, + size_t *no_read); + struct yaz_iconv_struct { int my_errno; - unsigned long (*read_handle)(yaz_iconv_t cd, char **inbuf, - size_t *inbytesleft); + int init_flag; + size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf, + size_t inbytesleft, size_t *no_read); + unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf, + size_t inbytesleft, size_t *no_read); size_t (*write_handle)(yaz_iconv_t cd, unsigned long x, char **outbuf, size_t *outbytesleft); #if HAVE_ICONV_H @@ -30,129 +41,246 @@ struct yaz_iconv_struct { #endif }; - -static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, - char **inbuf, size_t *inbytesleft) +static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp, + size_t inbytesleft, size_t *no_read) { - unsigned char *inp = *inbuf; - unsigned long x = 0; - x = inp[0]; - (*inbytesleft)--; - inp++; - *inbuf = inp; + unsigned long x = inp[0]; + *no_read = 1; return x; } -static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, - char **inbuf, size_t *inbytesleft) +static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + if (inp[0] != 0xef) + { + *no_read = 0; + return 0; + } + if (inbytesleft < 3) + { + cd->my_errno = YAZ_ICONV_EINVAL; + return (size_t) -1; + } + if (inp[1] != 0xbb || inp[2] != 0xbf) + { + cd->my_errno = YAZ_ICONV_EILSEQ; + return (size_t) -1; + } + *no_read = 3; + return 0; +} + +static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp, + size_t inbytesleft, size_t *no_read) { - unsigned char *inp = *inbuf; unsigned long x = 0; + if (inp[0] <= 0x7f) { x = inp[0]; - - (*inbytesleft)--; - inp++; + *no_read = 1; } - else if (inp[0] <= 0xdf && *inbytesleft >= 2) + else if (inp[0] <= 0xbf || inp[0] >= 0xfe) { - x = ((inp[0] & 0x1f) << 6) + (inp[1] & 0x3f); - - (*inbytesleft) -= 2; - inp += 2; + *no_read = 0; + cd->my_errno = YAZ_ICONV_EILSEQ; } - else if (inp[0] <= 0xef && *inbytesleft >= 3) + else if (inp[0] <= 0xdf && inbytesleft >= 2) { - x = ((inp[0] & 0x0f) << 12) + - ((inp[1] & 0x3f) << 6) + (inp[1] & 0x3f); - - (*inbytesleft) -= 3; - inp += 3; + x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f); + if (x >= 0x80) + *no_read = 2; + else + { + *no_read = 0; + cd->my_errno = YAZ_ICONV_EILSEQ; + } } - else if (inp[0] <= 0xef && *inbytesleft >= 4) + else if (inp[0] <= 0xef && inbytesleft >= 3) { - x = ((inp[0] & 0x07) << 18) + - ((inp[1] & 0x3f) << 12) + ((inp[2] & 0x3f) << 6) + - (inp[3] & 0x3f); - - (*inbytesleft) -= 4; - inp += 4; + x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) | + (inp[1] & 0x3f); + if (x >= 0x800) + *no_read = 3; + else + { + *no_read = 0; + cd->my_errno = YAZ_ICONV_EILSEQ; + } + } + else if (inp[0] <= 0xf7 && inbytesleft >= 4) + { + x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) | + ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f); + if (x >= 0x10000) + *no_read = 4; + else + { + *no_read = 0; + cd->my_errno = YAZ_ICONV_EILSEQ; + } + } + else if (inp[0] <= 0xfb && inbytesleft >= 5) + { + x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) | + ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) | + (inp[4] & 0x3f); + if (x >= 0x200000) + *no_read = 5; + else + { + *no_read = 0; + cd->my_errno = YAZ_ICONV_EILSEQ; + } + } + else if (inp[0] <= 0xfd && inbytesleft >= 6) + { + x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) | + ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) | + ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f); + if (x >= 0x4000000) + *no_read = 6; + else + { + *no_read = 0; + cd->my_errno = YAZ_ICONV_EILSEQ; + } } else { + *no_read = 0; cd->my_errno = YAZ_ICONV_EINVAL; } - *inbuf = inp; return x; } -static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, - char **inbuf, size_t *inbytesleft) +static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp, + size_t inbytesleft, size_t *no_read) { - unsigned char *inp = *inbuf; unsigned long x = 0; - if (*inbytesleft < 4) + if (inbytesleft < 4) { cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */ - return 0; + *no_read = 0; + } + else + { + x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3]; + *no_read = 4; + } + return x; +} + +static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + unsigned long x = 0; + + if (inbytesleft < 4) + { + cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */ + *no_read = 0; + } + else + { + x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0]; + *no_read = 4; + } + return x; +} + +#if HAVE_WCHAR_H +static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + unsigned long x = 0; + + if (inbytesleft < sizeof(wchar_t)) + { + cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */ + *no_read = 0; + } + else + { + wchar_t wch; + memcpy (&wch, inp, sizeof(wch)); + x = wch; + *no_read = sizeof(wch); } - memcpy (&x, inp, sizeof(x)); - (*inbytesleft) -= 4; - inp += 4; - *inbuf = inp; return x; } +#endif + +static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + return yaz_marc8_conv(inp, inbytesleft, no_read); +} static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x, char **outbuf, size_t *outbytesleft) { - unsigned char *outp = *outbuf; + unsigned char *outp = (unsigned char *) *outbuf; if (x <= 0x7f && *outbytesleft >= 1) { - *outp++ = x; + *outp++ = (unsigned char) x; (*outbytesleft)--; } else if (x <= 0x7ff && *outbytesleft >= 2) { - *outp++ = (x >> 6) | 0xc0; - *outp++ = (x & 0x3f) | 0x80; + *outp++ = (unsigned char) ((x >> 6) | 0xc0); + *outp++ = (unsigned char) ((x & 0x3f) | 0x80); (*outbytesleft) -= 2; } else if (x <= 0xffff && *outbytesleft >= 3) { - *outp++ = (x >> 12) | 0xe0; - *outp++ = ((x >> 6) & 0x3f) | 0x80; - *outp++ = (x & 0x3f) | 0x80; + *outp++ = (unsigned char) ((x >> 12) | 0xe0); + *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80); + *outp++ = (unsigned char) ((x & 0x3f) | 0x80); (*outbytesleft) -= 3; } else if (x <= 0x1fffff && *outbytesleft >= 4) { - *outp++ = (x >> 18) | 0xf0; - *outp++ = ((x >> 12) & 0x3f) | 0x80; - *outp++ = ((x >> 6) & 0x3f) | 0x80; - *outp++ = (x & 0x3f) | 0x80; + *outp++ = (unsigned char) ((x >> 18) | 0xf0); + *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80); + *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80); + *outp++ = (unsigned char) ((x & 0x3f) | 0x80); (*outbytesleft) -= 4; } - else if (x > 0x1fffff) + else if (x <= 0x3ffffff && *outbytesleft >= 5) { - cd->my_errno = YAZ_ICONV_EILSEQ; /* invalid sequence */ - return (size_t)(-1); + *outp++ = (unsigned char) ((x >> 24) | 0xf8); + *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80); + *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80); + *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80); + *outp++ = (unsigned char) ((x & 0x3f) | 0x80); + (*outbytesleft) -= 5; + } + else if (*outbytesleft >= 6) + { + *outp++ = (unsigned char) ((x >> 30) | 0xfc); + *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80); + *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80); + *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80); + *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80); + *outp++ = (unsigned char) ((x & 0x3f) | 0x80); + (*outbytesleft) -= 6; } else { cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */ return (size_t)(-1); } - *outbuf = outp; + *outbuf = (char *) outp; return 0; } static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, char **outbuf, size_t *outbytesleft) { - unsigned char *outp = *outbuf; + unsigned char *outp = (unsigned char *) *outbuf; if (x > 255 || x < 1) { cd->my_errno = YAZ_ICONV_EILSEQ; @@ -160,7 +288,7 @@ static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, } else if (*outbytesleft >= 1) { - *outp++ = x; + *outp++ = (unsigned char) x; (*outbytesleft)--; } else @@ -168,7 +296,7 @@ static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, cd->my_errno = YAZ_ICONV_E2BIG; return (size_t)(-1); } - *outbuf = outp; + *outbuf = (char *) outp; return 0; } @@ -176,16 +304,34 @@ static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x, char **outbuf, size_t *outbytesleft) { - unsigned char *outp = *outbuf; - if (x < 1 || x > 0x1fffff) + unsigned char *outp = (unsigned char *) *outbuf; + if (*outbytesleft >= 4) { - cd->my_errno = YAZ_ICONV_EILSEQ; + *outp++ = (unsigned char) (x<<24); + *outp++ = (unsigned char) (x<<16); + *outp++ = (unsigned char) (x<<8); + *outp++ = (unsigned char) x; + (*outbytesleft) -= 4; + } + else + { + cd->my_errno = YAZ_ICONV_E2BIG; return (size_t)(-1); } - else if (*outbytesleft >= 4) + *outbuf = (char *) outp; + return 0; +} + +static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft) +{ + unsigned char *outp = (unsigned char *) *outbuf; + if (*outbytesleft >= 4) { - memcpy (outp, &x, sizeof(x)); - outp += 4; + *outp++ = (unsigned char) x; + *outp++ = (unsigned char) (x<<8); + *outp++ = (unsigned char) (x<<16); + *outp++ = (unsigned char) (x<<24); (*outbytesleft) -= 4; } else @@ -193,33 +339,84 @@ static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x, cd->my_errno = YAZ_ICONV_E2BIG; return (size_t)(-1); } - *outbuf = outp; + *outbuf = (char *) outp; + return 0; +} + +#if HAVE_WCHAR_H +static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft) +{ + unsigned char *outp = (unsigned char *) *outbuf; + + if (*outbytesleft >= sizeof(wchar_t)) + { + wchar_t wch = x; + memcpy(outp, &wch, sizeof(wch)); + outp += sizeof(wch); + (*outbytesleft) -= sizeof(wch); + } + else + { + cd->my_errno = YAZ_ICONV_E2BIG; + return (size_t)(-1); + } + *outbuf = (char *) outp; return 0; } +#endif + +int yaz_iconv_isbuiltin(yaz_iconv_t cd) +{ + return cd->read_handle && cd->write_handle; +} yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) { - yaz_iconv_t cd = xmalloc (sizeof(*cd)); + yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd)); cd->write_handle = 0; cd->read_handle = 0; + cd->init_handle = 0; cd->my_errno = YAZ_ICONV_UNKNOWN; - if (!strcmp(fromcode, "UTF-8")) - cd->read_handle = yaz_read_UTF8; - else if (!strcmp(fromcode, "ISO-8859-1")) - cd->read_handle = yaz_read_ISO8859_1; - else if (!strcmp(fromcode, "UCS-4")) - cd->read_handle = yaz_read_UCS4; - - - if (!strcmp(tocode, "UTF-8")) - cd->write_handle = yaz_write_UTF8; - else if (!strcmp (tocode, "ISO-8859-1")) - cd->write_handle = yaz_write_ISO8859_1; - else if (!strcmp (tocode, "UCS-4")) - cd->write_handle = yaz_write_UCS4; - + /* a useful hack: if fromcode has leading @, + the library not use YAZ's own conversions .. */ + if (fromcode[0] == '@') + fromcode++; + else + { + if (!yaz_matchstr(fromcode, "UTF8")) + { + cd->read_handle = yaz_read_UTF8; + cd->init_handle = yaz_init_UTF8; + } + else if (!yaz_matchstr(fromcode, "ISO88591")) + cd->read_handle = yaz_read_ISO8859_1; + else if (!yaz_matchstr(fromcode, "UCS4")) + cd->read_handle = yaz_read_UCS4; + else if (!yaz_matchstr(fromcode, "UCS4LE")) + cd->read_handle = yaz_read_UCS4LE; + else if (!yaz_matchstr(fromcode, "MARC8")) + cd->read_handle = yaz_read_marc8; +#if HAVE_WCHAR_H + else if (!yaz_matchstr(fromcode, "WCHAR_T")) + cd->read_handle = yaz_read_wchar_t; +#endif + + if (!yaz_matchstr(tocode, "UTF8")) + cd->write_handle = yaz_write_UTF8; + else if (!yaz_matchstr(tocode, "ISO88591")) + cd->write_handle = yaz_write_ISO8859_1; + else if (!yaz_matchstr (tocode, "UCS4")) + cd->write_handle = yaz_write_UCS4; + else if (!yaz_matchstr(tocode, "UCS4LE")) + cd->write_handle = yaz_write_UCS4LE; +#if HAVE_WCHAR_H + else if (!yaz_matchstr(tocode, "WCHAR_T")) + cd->write_handle = yaz_write_wchar_t; +#endif + } #if HAVE_ICONV_H cd->iconv_cd = 0; if (!cd->read_handle || !cd->write_handle) @@ -232,12 +429,13 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) } } #else - if (!cd->to_UCS4 || !cd->from_UCS4) + if (!cd->read_handle || !cd->write_handle) { xfree (cd); return 0; } #endif + cd->init_flag = 1; return cd; } @@ -253,7 +451,7 @@ size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft); if (r == (size_t)(-1)) { - switch (errno) + switch (yaz_errno()) { case E2BIG: cd->my_errno = YAZ_ICONV_E2BIG; @@ -272,11 +470,36 @@ size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, } #endif if (inbuf == 0 || *inbuf == 0) + { + cd->init_flag = 1; + cd->my_errno = YAZ_ICONV_UNKNOWN; return 0; + } inbuf0 = *inbuf; + + if (cd->init_flag) + { + if (cd->init_handle) + { + size_t no_read; + size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf, + *inbytesleft, &no_read); + if (r) + { + if (cd->my_errno == YAZ_ICONV_EINVAL) + return r; + cd->init_flag = 0; + return r; + } + *inbytesleft -= no_read; + *inbuf += no_read; + } + cd->init_flag = 0; + } while (1) { unsigned long x; + size_t no_read; if (*inbytesleft == 0) { @@ -284,8 +507,9 @@ size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, break; } - x = (cd->read_handle)(cd, inbuf, inbytesleft); - if (x == 0) + x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft, + &no_read); + if (no_read == 0) { r = (size_t)(-1); break; @@ -293,6 +517,8 @@ size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, r = (cd->write_handle)(cd, x, outbuf, outbytesleft); if (r) break; + *inbytesleft -= no_read; + (*inbuf) += no_read; } return r; }