Fix sample PQF
[yaz-moved-to-github.git] / util / siconv.c
index a01b103..73d7148 100644 (file)
@@ -1,10 +1,12 @@
 /*
- * Copyright (c) 1997-2002, Index Data
+ * Copyright (c) 1997-2003, Index Data
  * See the file LICENSE for details.
  *
- * $Id: siconv.c,v 1.1 2002-08-27 14:02:13 adam Exp $
+ * $Id: siconv.c,v 1.9 2003-01-06 08:20:28 adam Exp $
  */
 
+/* mini iconv and wrapper for system iconv library (if present) */
+
 #if HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -12,6 +14,9 @@
 #include <errno.h>
 #include <string.h>
 #include <ctype.h>
+#if HAVE_WCHAR_H
+#include <wchar.h>
+#endif
 
 #if HAVE_ICONV_H
 #include <iconv.h>
 
 #include <yaz/yaz-util.h>
 
+unsigned long yaz_marc8_conv (unsigned char *inp, size_t inbytesleft,
+                              size_t *no_read);
+    
 struct yaz_iconv_struct {
     int my_errno;
-    unsigned long (*read_handle)(yaz_iconv_t cd, char **inbuf,
-                                 size_t *inbytesleft);
+    int init_flag;
+    size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
+                          size_t inbytesleft, size_t *no_read);
+    unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
+                                 size_t inbytesleft, size_t *no_read);
     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
                            char **outbuf, size_t *outbytesleft);
 #if HAVE_ICONV_H
@@ -30,129 +41,246 @@ struct yaz_iconv_struct {
 #endif
 };
 
-
-static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd,
-                                         char **inbuf, size_t *inbytesleft)
+static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
+                                         size_t inbytesleft, size_t *no_read)
 {
-    unsigned char *inp = *inbuf;
-    unsigned long x = 0;
-    x = inp[0];
-    (*inbytesleft)--;
-    inp++;
-    *inbuf = inp;
+    unsigned long x = inp[0]; /* the byte value itself is the character */
+    *no_read = 1; /* always one byte; cd and inbytesleft are not used */
     return x;
 }
 
-static unsigned long yaz_read_UTF8 (yaz_iconv_t cd,
-                                    char **inbuf, size_t *inbytesleft)
+/*
+ * Initial-state handler for UTF-8 input: skips a leading byte order mark
+ * (the three bytes 0xef 0xbb 0xbf) if one is present.
+ *
+ * inp / inbytesleft: start and length of the pending input.
+ * no_read: set to 3 when a byte order mark was skipped, otherwise to 0.
+ *
+ * Returns 0 on success.  Returns (size_t) -1 with cd->my_errno set to
+ * YAZ_ICONV_EINVAL when the input starts with 0xef but holds fewer than
+ * three bytes, so it cannot yet be told apart from a byte order mark.
+ */
+static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
+                             size_t inbytesleft, size_t *no_read)
+{
+    *no_read = 0;
+    /* the caller invokes this handler before testing for empty input,
+       so inp[0] must not be read unless at least one byte is there */
+    if (inbytesleft == 0 || inp[0] != 0xef)
+        return 0;
+    if (inbytesleft < 3)
+    {
+        cd->my_errno = YAZ_ICONV_EINVAL;
+        return (size_t) -1;
+    }
+    /* 0xef is also the lead byte of ordinary three-byte characters
+       (U+F000..U+FFFF); those are not an error, merely not a byte
+       order mark, and are left for the read handler to decode */
+    if (inp[1] == 0xbb && inp[2] == 0xbf)
+        *no_read = 3;
+    return 0;
+}
+
+/*
+ * Decodes one UTF-8 sequence (1 to 6 bytes) starting at inp.
+ *
+ * On success the decoded value is returned and *no_read is the number of
+ * bytes the sequence occupied.  On failure *no_read is 0 and cd->my_errno
+ * is set: YAZ_ICONV_EILSEQ for a lead byte in 0x80..0xbf, 0xfe or 0xff and
+ * for an overlong encoding; YAZ_ICONV_EINVAL when fewer bytes are left than
+ * the lead byte calls for.
+ *
+ * inp[0] is read unconditionally, so inbytesleft must be at least 1.
+ * Trailing bytes are not checked for the 10xxxxxx continuation pattern.
+ */
+static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
+                                    size_t inbytesleft, size_t *no_read)
 {
-    unsigned char *inp = *inbuf;
     unsigned long x = 0;
+
     if (inp[0] <= 0x7f)
     {
         x = inp[0];
-        
-        (*inbytesleft)--;
-        inp++;
+        *no_read = 1;
     }
-    else if (inp[0] <= 0xdf && *inbytesleft >= 2)
+    else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
     {
-        x = ((inp[0] & 0x1f) << 6) + (inp[1] & 0x3f);
-        
-        (*inbytesleft) -= 2;
-        inp += 2;
+        *no_read = 0;
+        cd->my_errno = YAZ_ICONV_EILSEQ;
     }
-    else if (inp[0] <= 0xef && *inbytesleft >= 3)
+    else if (inp[0] <= 0xdf && inbytesleft >= 2)
     {
-        x =  ((inp[0] & 0x0f) << 12) +
-            ((inp[1] & 0x3f) << 6) +  (inp[1] & 0x3f);
-        
-        (*inbytesleft) -= 3;
-        inp += 3;
+        x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
+        if (x >= 0x80)
+            *no_read = 2;
+        else
+        {
+            *no_read = 0;
+            cd->my_errno = YAZ_ICONV_EILSEQ;
+        }
     }
-    else if (inp[0] <= 0xef && *inbytesleft >= 4)
+    else if (inp[0] <= 0xef && inbytesleft >= 3)
     {
-        x =  ((inp[0] & 0x07) << 18) +
-            ((inp[1] & 0x3f) << 12) + ((inp[2] & 0x3f) << 6) +
-            (inp[3] & 0x3f);
-        
-        (*inbytesleft) -= 4;
-        inp += 4;
+        /* the low six bits come from the third byte, inp[2] */
+        x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
+            (inp[2] & 0x3f);
+        if (x >= 0x800)
+            *no_read = 3;
+        else
+        {
+            *no_read = 0;
+            cd->my_errno = YAZ_ICONV_EILSEQ;
+        }
+    }
+    else if (inp[0] <= 0xf7 && inbytesleft >= 4)
+    {
+        x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
+            ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
+        if (x >= 0x10000)
+            *no_read = 4;
+        else
+        {
+            *no_read = 0;
+            cd->my_errno = YAZ_ICONV_EILSEQ;
+        }
+    }
+    else if (inp[0] <= 0xfb && inbytesleft >= 5)
+    {
+        x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
+            ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
+            (inp[4] & 0x3f);
+        if (x >= 0x200000)
+            *no_read = 5;
+        else
+        {
+            *no_read = 0;
+            cd->my_errno = YAZ_ICONV_EILSEQ;
+        }
+    }
+    else if (inp[0] <= 0xfd && inbytesleft >= 6)
+    {
+        x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
+            ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
+            ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
+        if (x >= 0x4000000)
+            *no_read = 6;
+        else
+        {
+            *no_read = 0;
+            cd->my_errno = YAZ_ICONV_EILSEQ;
+        }
     }
     else
     {
+        *no_read = 0;
         cd->my_errno = YAZ_ICONV_EINVAL;
     }
-    *inbuf = inp;
     return x;
 }
 
-static unsigned long yaz_read_UCS4 (yaz_iconv_t cd,
-                                    char **inbuf, size_t *inbytesleft)
+/*
+ * Reads one UCS-4 character stored most significant byte first.
+ *
+ * Returns the character and sets *no_read to 4.  When fewer than four
+ * bytes are left, returns 0, sets *no_read to 0 and sets cd->my_errno
+ * to YAZ_ICONV_EINVAL.
+ */
+static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
+                                    size_t inbytesleft, size_t *no_read)
 {
-    unsigned char *inp = *inbuf;
     unsigned long x = 0;
     
-    if (*inbytesleft < 4)
+    if (inbytesleft < 4)
     {
         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
-        return 0;
+        *no_read = 0;
+    }
+    else
+    {
+        /* widen each byte before shifting: shifting the promoted int
+           by 24 overflows for bytes >= 0x80 and would sign-extend into
+           a 64-bit unsigned long */
+        x = ((unsigned long) inp[0]<<24) | ((unsigned long) inp[1]<<16) |
+            ((unsigned long) inp[2]<<8) | inp[3];
+        *no_read = 4;
+    }
+    return x;
+}
+
+/*
+ * Reads one UCS-4 character stored least significant byte first.
+ *
+ * Returns the character and sets *no_read to 4.  When fewer than four
+ * bytes are left, returns 0, sets *no_read to 0 and sets cd->my_errno
+ * to YAZ_ICONV_EINVAL.
+ */
+static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
+                                      size_t inbytesleft, size_t *no_read)
+{
+    unsigned long x = 0;
+    
+    if (inbytesleft < 4)
+    {
+        cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
+        *no_read = 0;
+    }
+    else
+    {
+        /* widen each byte before shifting: shifting the promoted int
+           by 24 overflows for bytes >= 0x80 and would sign-extend into
+           a 64-bit unsigned long */
+        x = ((unsigned long) inp[3]<<24) | ((unsigned long) inp[2]<<16) |
+            ((unsigned long) inp[1]<<8) | inp[0];
+        *no_read = 4;
+    }
+    return x;
+}
+
+#if HAVE_WCHAR_H
+/*
+ * Reads one character stored as a native wchar_t object.
+ * Sets *no_read to sizeof(wchar_t) on success; when fewer bytes than that
+ * are left, returns 0, sets *no_read to 0 and flags YAZ_ICONV_EINVAL.
+ */
+static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
+                                       size_t inbytesleft, size_t *no_read)
+{
+    unsigned long x = 0;
+
+    if (inbytesleft >= sizeof(wchar_t))
+    {
+        wchar_t wide;
+
+        /* copy instead of casting the pointer: inp may be unaligned */
+        memcpy (&wide, inp, sizeof(wide));
+        *no_read = sizeof(wide);
+        x = wide;
+    }
+    else
+    {
+        *no_read = 0;
+        cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
     }
-    memcpy (&x, inp, sizeof(x));
-    (*inbytesleft) -= 4;
-    inp += 4;
-    *inbuf = inp;
     return x;
 }
+#endif
+
+/*
+ * Read handler for MARC8 input: hands the buffer to yaz_marc8_conv(),
+ * which reports the number of consumed bytes through no_read.
+ * The cd argument is not used.
+ */
+static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
+                                     size_t inbytesleft, size_t *no_read)
+{
+    unsigned long ch = yaz_marc8_conv(inp, inbytesleft, no_read);
+
+    return ch;
+}
 
+/*
+ * Encodes x as UTF-8 (1 to 6 bytes) at *outbuf.
+ *
+ * On success returns 0, advances *outbuf and reduces *outbytesleft by the
+ * number of bytes written.  On failure returns (size_t)(-1), leaves both
+ * untouched and sets cd->my_errno: YAZ_ICONV_EILSEQ when x needs more than
+ * 31 bits (no UTF-8 form exists), YAZ_ICONV_E2BIG when the output buffer
+ * is too small for the encoding.
+ */
 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
                               char **outbuf, size_t *outbytesleft)
 {
-    unsigned char *outp = *outbuf;
+    unsigned char *outp = (unsigned char *) *outbuf;
+    /* the six-byte form carries 31 bits; anything larger would turn
+       the lead byte into 0xfe/0xff, which is never valid UTF-8 */
+    if (x > 0x7fffffffUL)
+    {
+        cd->my_errno = YAZ_ICONV_EILSEQ;  /* invalid sequence */
+        return (size_t)(-1);
+    }
     if (x <= 0x7f && *outbytesleft >= 1)
     {
-        *outp++ = x;
+        *outp++ = (unsigned char) x;
         (*outbytesleft)--;
     } 
     else if (x <= 0x7ff && *outbytesleft >= 2)
     {
-        *outp++ = (x >> 6) | 0xc0;
-        *outp++ = (x & 0x3f) | 0x80;
+        *outp++ = (unsigned char) ((x >> 6) | 0xc0);
+        *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
         (*outbytesleft) -= 2;
     }
     else if (x <= 0xffff && *outbytesleft >= 3)
     {
-        *outp++ = (x >> 12) | 0xe0;
-        *outp++ = ((x >> 6) & 0x3f) | 0x80;
-        *outp++ = (x & 0x3f) | 0x80;
+        *outp++ = (unsigned char) ((x >> 12) | 0xe0);
+        *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
+        *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
         (*outbytesleft) -= 3;
     }
     else if (x <= 0x1fffff && *outbytesleft >= 4)
     {
-        *outp++ = (x >> 18) | 0xf0;
-        *outp++ = ((x >> 12) & 0x3f) | 0x80;
-        *outp++ = ((x >> 6) & 0x3f) | 0x80;
-        *outp++ = (x & 0x3f) | 0x80;
+        *outp++ = (unsigned char) ((x >> 18) | 0xf0);
+        *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
+        *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
+        *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
         (*outbytesleft) -= 4;
     }
-    else if (x > 0x1fffff)
+    else if (x <= 0x3ffffff && *outbytesleft >= 5)
     {
-        cd->my_errno = YAZ_ICONV_EILSEQ;  /* invalid sequence */
-        return (size_t)(-1);
+        *outp++ = (unsigned char) ((x >> 24) | 0xf8);
+        *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
+        *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
+        *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
+        *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
+        (*outbytesleft) -= 5;
+    }
+    else if (*outbytesleft >= 6)
+    {
+        *outp++ = (unsigned char) ((x >> 30) | 0xfc);
+        *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
+        *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
+        *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
+        *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
+        *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
+        (*outbytesleft) -= 6;
     }
     else 
     {
         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
         return (size_t)(-1);
     }
-    *outbuf = outp;
+    *outbuf = (char *) outp;
     return 0;
 }
 
 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
                                    char **outbuf, size_t *outbytesleft)
 {
-    unsigned char *outp = *outbuf;
+    unsigned char *outp = (unsigned char *) *outbuf;
     if (x > 255 || x < 1)
     {
         cd->my_errno = YAZ_ICONV_EILSEQ;
@@ -160,7 +288,7 @@ static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
     }
     else if (*outbytesleft >= 1)
     {
-        *outp++ = x;
+        *outp++ = (unsigned char) x;
         (*outbytesleft)--;
     }
     else 
@@ -168,7 +296,7 @@ static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
         cd->my_errno = YAZ_ICONV_E2BIG;
         return (size_t)(-1);
     }
-    *outbuf = outp;
+    *outbuf = (char *) outp;
     return 0;
 }
 
@@ -176,16 +304,34 @@ static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
+/*
+ * Writes x as four bytes at *outbuf, most significant byte first.
+ *
+ * On success returns 0, advances *outbuf by 4 and reduces *outbytesleft
+ * by 4.  When fewer than four bytes of room are left, returns
+ * (size_t)(-1) and sets cd->my_errno to YAZ_ICONV_E2BIG.
+ */
 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
                               char **outbuf, size_t *outbytesleft)
 {
-    unsigned char *outp = *outbuf;
-    if (x < 1 || x > 0x1fffff)
+    unsigned char *outp = (unsigned char *) *outbuf;
+    if (*outbytesleft >= 4)
     {
-        cd->my_errno = YAZ_ICONV_EILSEQ;
+        /* shift right to bring each byte into the low eight bits;
+           a left shift would always leave them zero */
+        *outp++ = (unsigned char) (x>>24);
+        *outp++ = (unsigned char) (x>>16);
+        *outp++ = (unsigned char) (x>>8);
+        *outp++ = (unsigned char) x;
+        (*outbytesleft) -= 4;
+    }
+    else
+    {
+        cd->my_errno = YAZ_ICONV_E2BIG;
         return (size_t)(-1);
     }
-    else if (*outbytesleft >= 4)
+    *outbuf = (char *) outp;
+    return 0;
+}
+
+/*
+ * Writes x as four bytes at *outbuf, least significant byte first.
+ *
+ * On success returns 0, advances *outbuf by 4 and reduces *outbytesleft
+ * by 4.  When fewer than four bytes of room are left, returns
+ * (size_t)(-1) and sets cd->my_errno to YAZ_ICONV_E2BIG.
+ */
+static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
+                                char **outbuf, size_t *outbytesleft)
+{
+    unsigned char *outp = (unsigned char *) *outbuf;
+    if (*outbytesleft >= 4)
     {
-        memcpy (outp, &x, sizeof(x));
-        outp += 4;
+        /* shift right to bring each byte into the low eight bits;
+           a left shift would always leave them zero */
+        *outp++ = (unsigned char) x;
+        *outp++ = (unsigned char) (x>>8);
+        *outp++ = (unsigned char) (x>>16);
+        *outp++ = (unsigned char) (x>>24);
         (*outbytesleft) -= 4;
     }
     else
@@ -193,33 +339,84 @@ static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
         cd->my_errno = YAZ_ICONV_E2BIG;
         return (size_t)(-1);
     }
-    *outbuf = outp;
+    *outbuf = (char *) outp;
+    return 0;
+}
+
+#if HAVE_WCHAR_H
+/*
+ * Writes x to *outbuf as one native wchar_t object.
+ * On success returns 0 and moves *outbuf / *outbytesleft past the
+ * sizeof(wchar_t) bytes written; when there is less room than that,
+ * returns (size_t)(-1) and flags YAZ_ICONV_E2BIG.
+ */
+static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
+                                 char **outbuf, size_t *outbytesleft)
+{
+    unsigned char *dst = (unsigned char *) *outbuf;
+    wchar_t wide;
+
+    if (*outbytesleft < sizeof(wide))
+    {
+        cd->my_errno = YAZ_ICONV_E2BIG;
+        return (size_t)(-1);
+    }
+    wide = x;
+    /* copy instead of casting the pointer: dst may be unaligned */
+    memcpy(dst, &wide, sizeof(wide));
+    *outbytesleft -= sizeof(wide);
+    *outbuf = (char *) (dst + sizeof(wide));
     return 0;
 }
+#endif
+
+/*
+ * Returns 1 when cd has both a read handler and a write handler set,
+ * and 0 when either one is missing.
+ */
+int yaz_iconv_isbuiltin(yaz_iconv_t cd)
+{
+    if (!cd->read_handle || !cd->write_handle)
+        return 0;
+    return 1;
+}
 
 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
 {
-    yaz_iconv_t cd = xmalloc (sizeof(*cd));
+    yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
 
     cd->write_handle = 0;
     cd->read_handle = 0;
+    cd->init_handle = 0;
     cd->my_errno = YAZ_ICONV_UNKNOWN;
 
-    if (!strcmp(fromcode, "UTF-8"))
-        cd->read_handle = yaz_read_UTF8;
-    else if (!strcmp(fromcode, "ISO-8859-1"))
-        cd->read_handle = yaz_read_ISO8859_1;
-    else if (!strcmp(fromcode, "UCS-4"))
-        cd->read_handle = yaz_read_UCS4;
-
-
-    if (!strcmp(tocode, "UTF-8"))
-        cd->write_handle = yaz_write_UTF8;
-    else if (!strcmp (tocode, "ISO-8859-1"))
-        cd->write_handle = yaz_write_ISO8859_1;
-    else if (!strcmp (tocode, "UCS-4"))
-        cd->write_handle = yaz_write_UCS4;
-
+    /* a useful hack: if fromcode has a leading @,
+       the library will not use YAZ's own conversions .. */
+    if (fromcode[0] == '@')
+        fromcode++;
+    else
+    {
+        if (!yaz_matchstr(fromcode, "UTF8"))
+        {
+            cd->read_handle = yaz_read_UTF8;
+            cd->init_handle = yaz_init_UTF8;
+        }
+        else if (!yaz_matchstr(fromcode, "ISO88591"))
+            cd->read_handle = yaz_read_ISO8859_1;
+        else if (!yaz_matchstr(fromcode, "UCS4"))
+            cd->read_handle = yaz_read_UCS4;
+        else if (!yaz_matchstr(fromcode, "UCS4LE"))
+            cd->read_handle = yaz_read_UCS4LE;
+        else if (!yaz_matchstr(fromcode, "MARC8"))
+            cd->read_handle = yaz_read_marc8;
+#if HAVE_WCHAR_H
+        else if (!yaz_matchstr(fromcode, "WCHAR_T"))
+            cd->read_handle = yaz_read_wchar_t;
+#endif
+        
+        if (!yaz_matchstr(tocode, "UTF8"))
+            cd->write_handle = yaz_write_UTF8;
+        else if (!yaz_matchstr(tocode, "ISO88591"))
+            cd->write_handle = yaz_write_ISO8859_1;
+        else if (!yaz_matchstr (tocode, "UCS4"))
+            cd->write_handle = yaz_write_UCS4;
+        else if (!yaz_matchstr(tocode, "UCS4LE"))
+            cd->write_handle = yaz_write_UCS4LE;
+#if HAVE_WCHAR_H
+        else if (!yaz_matchstr(tocode, "WCHAR_T"))
+            cd->write_handle = yaz_write_wchar_t;
+#endif
+    }
 #if HAVE_ICONV_H
     cd->iconv_cd = 0;
     if (!cd->read_handle || !cd->write_handle)
@@ -232,12 +429,13 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
         }
     }
 #else
-    if (!cd->to_UCS4 || !cd->from_UCS4)
+    if (!cd->read_handle || !cd->write_handle)
     {
         xfree (cd);
         return 0;
     }
 #endif
+    cd->init_flag = 1;
     return cd;
 }
 
@@ -253,7 +451,7 @@ size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
         if (r == (size_t)(-1))
         {
-            switch (errno)
+            switch (yaz_errno())
             {
             case E2BIG:
                 cd->my_errno = YAZ_ICONV_E2BIG;
@@ -272,11 +470,36 @@ size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
     }
 #endif
     if (inbuf == 0 || *inbuf == 0)
+    {
+        cd->init_flag = 1;
+        cd->my_errno = YAZ_ICONV_UNKNOWN;
         return 0;
+    }
     inbuf0 = *inbuf;
+
+    if (cd->init_flag)
+    {
+        if (cd->init_handle)
+        {
+            size_t no_read;
+            size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
+                                         *inbytesleft, &no_read);
+            if (r)
+            {
+                if (cd->my_errno == YAZ_ICONV_EINVAL)
+                    return r;
+                cd->init_flag = 0;
+                return r;
+            }
+            *inbytesleft -= no_read;
+            *inbuf += no_read;
+        }
+        cd->init_flag = 0;
+    }
     while (1)
     {
         unsigned long x;
+        size_t no_read;
 
         if (*inbytesleft == 0)
         {
@@ -284,8 +507,9 @@ size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
             break;
         }
         
-        x = (cd->read_handle)(cd, inbuf, inbytesleft);
-        if (x == 0)
+        x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
+                              &no_read);
+        if (no_read == 0)
         {
             r = (size_t)(-1);
             break;
@@ -293,6 +517,8 @@ size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
         r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
         if (r)
             break;
+        *inbytesleft -= no_read;
+        (*inbuf) += no_read;
     }
     return r;
 }