Bump year
[yaz-moved-to-github.git] / src / siconv.c
index e351b13..3b5928d 100644 (file)
@@ -1,11 +1,18 @@
 /*
- * Copyright (c) 1997-2004, Index Data
+ * Copyright (C) 1995-2005, Index Data ApS
  * See the file LICENSE for details.
  *
- * $Id: siconv.c,v 1.3 2004-03-15 21:39:06 adam Exp $
+ * $Id: siconv.c,v 1.8 2005-01-15 19:47:14 adam Exp $
+ */
+/**
+ * \file siconv.c
+ * \brief Implements simple ICONV
+ *
+ * This implements an interface similar to that of iconv and
+ * is used by YAZ to interface with iconv (if present).
+ * For systems where iconv is not present, this layer
+ * provides a few important conversions: UTF-8, MARC-8, Latin-1.
  */
-
-/* mini iconv and wrapper for system iconv library (if present) */
 
 #if HAVE_CONFIG_H
 #include <config.h>
 
 #include <yaz/yaz-util.h>
 
-unsigned long yaz_marc8_conv (unsigned char *inp, size_t inbytesleft,
-                              size_t *no_read);
-    
-unsigned long yaz_marc8_cjk_conv (unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read);
+unsigned long yaz_marc8_1_conv (unsigned char *inp, size_t inbytesleft,
+                             size_t *no_read, int *combining);
+unsigned long yaz_marc8_2_conv (unsigned char *inp, size_t inbytesleft,
+                               size_t *no_read, int *combining);
+unsigned long yaz_marc8_3_conv (unsigned char *inp, size_t inbytesleft,
+                               size_t *no_read, int *combining);
+unsigned long yaz_marc8_4_conv (unsigned char *inp, size_t inbytesleft,
+                               size_t *no_read, int *combining);
+unsigned long yaz_marc8_5_conv (unsigned char *inp, size_t inbytesleft,
+                               size_t *no_read, int *combining);
+unsigned long yaz_marc8_6_conv (unsigned char *inp, size_t inbytesleft,
+                               size_t *no_read, int *combining);
+unsigned long yaz_marc8_7_conv (unsigned char *inp, size_t inbytesleft,
+                               size_t *no_read, int *combining);
+unsigned long yaz_marc8_8_conv (unsigned char *inp, size_t inbytesleft,
+                               size_t *no_read, int *combining);
+unsigned long yaz_marc8_9_conv (unsigned char *inp, size_t inbytesleft,
+                               size_t *no_read, int *combining);
     
 struct yaz_iconv_struct {
     int my_errno;
@@ -40,6 +60,8 @@ struct yaz_iconv_struct {
     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
                            char **outbuf, size_t *outbytesleft);
     int marc8_esc_mode;
+    int marc8_comb_x;
+    int marc8_comb_no_read;
 #if HAVE_ICONV_H
     iconv_t iconv_cd;
 #endif
@@ -220,50 +242,30 @@ static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
                                      size_t inbytesleft, size_t *no_read)
 {
+    if (cd->marc8_comb_x)
+    {
+       unsigned long x = cd->marc8_comb_x;
+       *no_read = cd->marc8_comb_no_read;
+       cd->marc8_comb_x = 0;
+       return x;
+    }
     *no_read = 0;
     while(inbytesleft >= 1 && inp[0] == 27)
     {
        size_t inbytesleft0 = inbytesleft;
        inp++;
        inbytesleft--;
-       if (inbytesleft <= 1)
-       {
-           *no_read = 0;
-           cd->my_errno = YAZ_ICONV_EINVAL;
-           return 0;
-       }
-       if (*inp == '(' || *inp == ',') /* GO, one bytes */
+       while(inbytesleft > 0 && strchr("(,$!", *inp))
        {
            inbytesleft--;
            inp++;
        }
-       else if (*inp == '$') /* G0, multi byte */
-       {
-           inbytesleft--;
-           inp++;
-           if (inp[0] == ',')
-           {
-               inbytesleft--;
-               inp++;
-           }
-       }
        if (inbytesleft <= 0)
        {
            *no_read = 0;
            cd->my_errno = YAZ_ICONV_EINVAL;
            return 0;
        }
-       if (*inp == '!')
-       {
-           if (inbytesleft <= 1)
-           {
-               *no_read = 0;
-               cd->my_errno = YAZ_ICONV_EINVAL;
-               return 0;
-           }
-           inbytesleft--;
-           inp++;
-       }
        cd->marc8_esc_mode = *inp++;
        inbytesleft--;
        (*no_read) += inbytesleft0 - inbytesleft;
@@ -273,24 +275,68 @@ static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
     else
     {
        unsigned long x;
+       int comb = 0;
        size_t no_read_sub = 0;
 
        switch(cd->marc8_esc_mode)
        {
-       case 'B':
-       case 'E':
-           x = yaz_marc8_conv(inp, inbytesleft, &no_read_sub);
-           *no_read += no_read_sub;
-           return x;
-       case '1':
-           x = yaz_marc8_cjk_conv(inp, inbytesleft, &no_read_sub);
-           *no_read += no_read_sub;
-           return x;
+       case 'B':  /* Basic ASCII */
+       case 'E':  /* ANSEL */
+       case 's':  /* ASCII */
+           x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, &comb);
+           break;
+       case 'g':  /* Greek Symbols */
+           x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, &comb);
+           break;
+       case 'b':  /* Subscripts */
+           x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, &comb);
+           break;
+       case 'p':  /* Superscripts */
+           x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, &comb);
+           break;
+       case '2':  /* Basic Hebrew */
+           x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, &comb);
+           break;
+       case 'N':  /* Basic Cyrillic */
+       case 'Q':  /* Extended Cyrillic */
+           x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, &comb);
+           break;
+       case '3':  /* Basic Arabic */
+       case '4':  /* Extended Arabic */
+           x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, &comb);
+           break;
+       case 'S':  /* Basic Greek */
+           x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, &comb);
+           break;
+       case '1':  /* Chinese, Japanese, Korean (EACC) */
+           x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, &comb);
+           break;
        default:
            *no_read = 0;
            cd->my_errno = YAZ_ICONV_EILSEQ;
            return 0;
        }
+#if 0
+       printf ("esc mode=%c x=%04lX comb=%d\n", cd->marc8_esc_mode, x, comb);
+#endif
+       *no_read += no_read_sub;
+
+       if (comb && cd->marc8_comb_x == 0)
+       {
+           size_t tmp_read = 0;
+           unsigned long next_x;
+
+           /* read next char .. */
+           next_x = yaz_read_marc8(cd, inp + *no_read,
+                                   inbytesleft - *no_read, &tmp_read);
+           /* save this x for later .. */
+           cd->marc8_comb_x = x;
+           /* save next read for later .. */
+           cd->marc8_comb_no_read = tmp_read;
+           /* return next x - thereby swap */
+           x = next_x;
+       }
+       return x;
     }
 }
 
@@ -455,6 +501,7 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
     cd->init_handle = 0;
     cd->my_errno = YAZ_ICONV_UNKNOWN;
     cd->marc8_esc_mode = 'B';
+    cd->marc8_comb_x = 0;
 
     /* a useful hack: if fromcode has leading @,
        the library not use YAZ's own conversions .. */