/*
- * Copyright (c) 1997-2004, Index Data
+ * Copyright (C) 1995-2005, Index Data ApS
* See the file LICENSE for details.
*
- * $Id: siconv.c,v 1.4 2004-03-15 22:51:10 adam Exp $
+ * $Id: siconv.c,v 1.11 2005-02-07 11:23:18 adam Exp $
+ */
+/**
+ * \file siconv.c
+ * \brief Implements simple ICONV
+ *
+ * This implements an interface similar to that of iconv and
+ * is used by YAZ to interface with iconv (if present).
+ * For systems where iconv is not present, this layer
+ * provides a few important conversions: UTF-8, MARC-8, Latin-1.
*/
-
-/* mini iconv and wrapper for system iconv library (if present) */
#if HAVE_CONFIG_H
#include <config.h>
#include <yaz/yaz-util.h>
-unsigned long yaz_marc8_conv (unsigned char *inp, size_t inbytesleft,
- size_t *no_read);
-
-unsigned long yaz_marc8_cjk_conv (unsigned char *inp, size_t inbytesleft,
- size_t *no_read);
+unsigned long yaz_marc8_1_conv (unsigned char *inp, size_t inbytesleft,
+ size_t *no_read, int *combining);
+unsigned long yaz_marc8_2_conv (unsigned char *inp, size_t inbytesleft,
+ size_t *no_read, int *combining);
+unsigned long yaz_marc8_3_conv (unsigned char *inp, size_t inbytesleft,
+ size_t *no_read, int *combining);
+unsigned long yaz_marc8_4_conv (unsigned char *inp, size_t inbytesleft,
+ size_t *no_read, int *combining);
+unsigned long yaz_marc8_5_conv (unsigned char *inp, size_t inbytesleft,
+ size_t *no_read, int *combining);
+unsigned long yaz_marc8_6_conv (unsigned char *inp, size_t inbytesleft,
+ size_t *no_read, int *combining);
+unsigned long yaz_marc8_7_conv (unsigned char *inp, size_t inbytesleft,
+ size_t *no_read, int *combining);
+unsigned long yaz_marc8_8_conv (unsigned char *inp, size_t inbytesleft,
+ size_t *no_read, int *combining);
+unsigned long yaz_marc8_9_conv (unsigned char *inp, size_t inbytesleft,
+ size_t *no_read, int *combining);
+#define NEW_COMB 1
+
struct yaz_iconv_struct {
int my_errno;
int init_flag;
size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
char **outbuf, size_t *outbytesleft);
int marc8_esc_mode;
+#if NEW_COMB
+ int comb_offset;
+ int comb_size;
+ unsigned long comb_x[8];
+ size_t comb_no_read[8];
+#else
+ int marc8_comb_x;
+ int marc8_comb_no_read;
+#endif
+ size_t no_read_x;
+ unsigned unget_x;
#if HAVE_ICONV_H
iconv_t iconv_cd;
#endif
}
#endif
+
+#if NEW_COMB
+static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
+ size_t inbytesleft, size_t *no_read,
+ int *comb);
+
static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
- size_t inbytesleft, size_t *no_read)
+ size_t inbytesleft, size_t *no_read)
+{ /* Read one value via yaz_read_marc8_comb; values flagged as combining are queued in cd->comb_x and handed out by later calls. *no_read receives the byte count recorded for the returned value. */
+ unsigned long x;
+ if (cd->comb_offset < cd->comb_size) /* queued values remain from an earlier call */
+ {
+ *no_read = cd->comb_no_read[cd->comb_offset]; /* byte count saved with this queued value */
+ x = cd->comb_x[cd->comb_offset];
+ cd->comb_offset++; /* consume the queue slot */
+ return x;
+ }
+
+ cd->comb_offset = 0; /* queue is drained: restart it and refill below */
+ for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++) /* 8 == capacity of comb_x[] and comb_no_read[] */
+ {
+ int comb = 0;
+ x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
+ if (!comb || !x) /* stop at the first non-combining value, or when the reader returned 0 */
+ break;
+ cd->comb_x[cd->comb_size] = x; /* queue the combining value for a later call */
+ cd->comb_no_read[cd->comb_size] = *no_read;
+ inp += *no_read; /* step past the bytes just consumed */
+ inbytesleft = inbytesleft - *no_read;
+ }
+ return x; /* value that ended the loop; NOTE(review): if all 8 slots were filled, this same value is also queued in comb_x[7] and will be returned again - confirm intended */
+}
+
+static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
+ size_t inbytesleft, size_t *no_read,
+ int *comb)
{
*no_read = 0;
while(inbytesleft >= 1 && inp[0] == 27)
size_t inbytesleft0 = inbytesleft;
inp++;
inbytesleft--;
- while(inbytesleft > 0 && strchr("(,$", *inp))
+ while(inbytesleft > 0 && strchr("(,$!", *inp))
{
inbytesleft--;
inp++;
cd->my_errno = YAZ_ICONV_EINVAL;
return 0;
}
- if (*inp == '!')
+ cd->marc8_esc_mode = *inp++;
+ inbytesleft--;
+ (*no_read) += inbytesleft0 - inbytesleft;
+ }
+ if (inbytesleft <= 0)
+ return 0;
+ else
+ {
+ unsigned long x;
+ size_t no_read_sub = 0;
+ *comb = 0;
+
+ switch(cd->marc8_esc_mode)
+ {
+ case 'B': /* Basic ASCII */
+ case 'E': /* ANSEL */
+ case 's': /* ASCII */
+ x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
+ break;
+ case 'g': /* Greek */
+ x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
+ break;
+ case 'b': /* Subscripts */
+ x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
+ break;
+ case 'p': /* Superscripts */
+ x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
+ break;
+ case '2': /* Basic Hebrew */
+ x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
+ break;
+ case 'N': /* Basic Cyrillic */
+ case 'Q': /* Extended Cyrillic */
+ x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
+ break;
+ case '3': /* Basic Arabic */
+ case '4': /* Extended Arabic */
+ x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
+ break;
+ case 'S': /* Greek */
+ x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
+ break;
+ case '1': /* Chinese, Japanese, Korean (EACC) */
+ x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
+ break;
+ default:
+ *no_read = 0;
+ cd->my_errno = YAZ_ICONV_EILSEQ;
+ return 0;
+ }
+ *no_read += no_read_sub;
+ return x;
+ }
+}
+#else
+static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
+ size_t inbytesleft, size_t *no_read)
+{
+ if (cd->marc8_comb_x)
+ {
+ unsigned long x = cd->marc8_comb_x;
+ *no_read = cd->marc8_comb_no_read;
+ cd->marc8_comb_x = 0;
+ return x;
+ }
+ *no_read = 0;
+ while(inbytesleft >= 1 && inp[0] == 27)
+ {
+ size_t inbytesleft0 = inbytesleft;
+ inp++;
+ inbytesleft--;
+ while(inbytesleft > 0 && strchr("(,$!", *inp))
{
- if (inbytesleft <= 1)
- {
- *no_read = 0;
- cd->my_errno = YAZ_ICONV_EINVAL;
- return 0;
- }
inbytesleft--;
inp++;
}
+ if (inbytesleft <= 0)
+ {
+ *no_read = 0;
+ cd->my_errno = YAZ_ICONV_EINVAL;
+ return 0;
+ }
cd->marc8_esc_mode = *inp++;
inbytesleft--;
(*no_read) += inbytesleft0 - inbytesleft;
else
{
unsigned long x;
+ int comb = 0;
size_t no_read_sub = 0;
switch(cd->marc8_esc_mode)
{
- case 'B':
- case 'E':
- x = yaz_marc8_conv(inp, inbytesleft, &no_read_sub);
- *no_read += no_read_sub;
- return x;
- case '1':
- x = yaz_marc8_cjk_conv(inp, inbytesleft, &no_read_sub);
- *no_read += no_read_sub;
- return x;
+ case 'B': /* Basic ASCII */
+ case 'E': /* ANSEL */
+ case 's': /* ASCII */
+ x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, &comb);
+ break;
+ case 'g': /* Greek */
+ x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, &comb);
+ break;
+ case 'b': /* Subscripts */
+ x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, &comb);
+ break;
+ case 'p': /* Superscripts */
+ x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, &comb);
+ break;
+ case '2': /* Basic Hebrew */
+ x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, &comb);
+ break;
+ case 'N': /* Basic Cyrillic */
+ case 'Q': /* Extended Cyrillic */
+ x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, &comb);
+ break;
+ case '3': /* Basic Arabic */
+ case '4': /* Extended Arabic */
+ x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, &comb);
+ break;
+ case 'S': /* Greek */
+ x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, &comb);
+ break;
+ case '1': /* Chinese, Japanese, Korean (EACC) */
+ x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, &comb);
+ break;
default:
*no_read = 0;
cd->my_errno = YAZ_ICONV_EILSEQ;
return 0;
}
+#if 0
+ printf ("esc mode=%c x=%04lX comb=%d\n", cd->marc8_esc_mode, x, comb);
+#endif
+ *no_read += no_read_sub;
+
+ if (comb && cd->marc8_comb_x == 0)
+ {
+ size_t tmp_read = 0;
+ unsigned long next_x;
+
+ /* read next char .. */
+ next_x = yaz_read_marc8(cd, inp + *no_read,
+ inbytesleft - *no_read, &tmp_read);
+ /* save this x for later .. */
+ cd->marc8_comb_x = x;
+ /* save next read for later .. */
+ cd->marc8_comb_no_read = tmp_read;
+ /* return next x - thereby swap */
+ x = next_x;
+ }
+ return x;
}
}
+#endif
static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
char **outbuf, size_t *outbytesleft)
cd->init_handle = 0;
cd->my_errno = YAZ_ICONV_UNKNOWN;
cd->marc8_esc_mode = 'B';
+#if NEW_COMB
+ cd->comb_offset = cd->comb_size = 0;
+#else
+ cd->marc8_comb_x = 0;
+#endif
/* a useful hack: if fromcode has leading @,
the library not use YAZ's own conversions .. */
return cd;
}
-size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
- char **outbuf, size_t *outbytesleft)
+size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
+ char **outbuf, size_t *outbytesleft)
{
char *inbuf0;
size_t r = 0;
*inbuf += no_read;
}
cd->init_flag = 0;
+ cd->unget_x = 0;
+ cd->no_read_x = 0;
}
while (1)
{
r = *inbuf - inbuf0;
break;
}
-
- x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
- &no_read);
- if (no_read == 0)
- {
- r = (size_t)(-1);
- break;
- }
+ if (!cd->unget_x)
+ {
+ x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
+ &no_read);
+ if (no_read == 0)
+ {
+ r = (size_t)(-1);
+ break;
+ }
+ }
+ else
+ {
+ x = cd->unget_x;
+ no_read = cd->no_read_x;
+ }
if (x)
{
r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
if (r)
+ {
+ /* unable to write it. save it because read_handle cannot
+ rewind .. */
+ cd->unget_x = x;
+ cd->no_read_x = no_read;
break;
+ }
+ cd->unget_x = 0;
}
*inbytesleft -= no_read;
(*inbuf) += no_read;