Implemented lossy and lossless MARC-8 encoding.
[yaz-moved-to-github.git] / src / siconv.c
index 8557bbd..27b54ee 100644 (file)
 
 #include <yaz/xmalloc.h>
 #include <yaz/nmem.h>
+#include <yaz/snprintf.h>
 #include "iconv-p.h"
 
-
-unsigned long yaz_marc8_42_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_45_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_67_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_62_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_70_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_32_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_4E_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_51_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_33_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_34_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_53_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_31_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-
-
-unsigned long yaz_marc8r_42_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_45_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_67_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_62_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_70_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_32_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_4E_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_51_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_33_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_34_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_53_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_31_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
+typedef unsigned long yaz_conv_func_t(unsigned char *inp, size_t inbytesleft,
+                                      size_t *no_read, int *combining,
+                                      unsigned mask, int boffset);
+/* Signature of all yaz_marc8*_conv converters; mask/boffset: see callers. */
+/* Readers' converters: hex suffix = character-set code, e.g. 42 is 'B'. */
+yaz_conv_func_t yaz_marc8_42_conv;
+yaz_conv_func_t yaz_marc8_45_conv;
+yaz_conv_func_t yaz_marc8_67_conv;
+yaz_conv_func_t yaz_marc8_62_conv;
+yaz_conv_func_t yaz_marc8_70_conv;
+yaz_conv_func_t yaz_marc8_32_conv;
+yaz_conv_func_t yaz_marc8_4E_conv;
+yaz_conv_func_t yaz_marc8_51_conv;
+yaz_conv_func_t yaz_marc8_33_conv;
+yaz_conv_func_t yaz_marc8_34_conv;
+yaz_conv_func_t yaz_marc8_53_conv;
+yaz_conv_func_t yaz_marc8_31_conv;
+/* Reverse converters (marc8r): tried in turn on UTF-8 input when writing. */
+yaz_conv_func_t yaz_marc8r_42_conv;
+yaz_conv_func_t yaz_marc8r_45_conv;
+yaz_conv_func_t yaz_marc8r_67_conv;
+yaz_conv_func_t yaz_marc8r_62_conv;
+yaz_conv_func_t yaz_marc8r_70_conv;
+yaz_conv_func_t yaz_marc8r_32_conv;
+yaz_conv_func_t yaz_marc8r_4E_conv;
+yaz_conv_func_t yaz_marc8r_51_conv;
+yaz_conv_func_t yaz_marc8r_33_conv;
+yaz_conv_func_t yaz_marc8r_34_conv;
+yaz_conv_func_t yaz_marc8r_53_conv;
+yaz_conv_func_t yaz_marc8r_31_conv;
 
 struct yaz_iconv_struct {
     int my_errno;
@@ -116,6 +96,7 @@ struct yaz_iconv_struct {
 
     unsigned write_marc8_second_half_char;
     unsigned long write_marc8_last;
+    int write_marc8_ncr;
     const char *write_marc8_lpage;
     const char *write_marc8_g0;
     const char *write_marc8_g1;
@@ -372,43 +353,40 @@ static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
         {
         case 'B':  /* Basic ASCII */
         case 's':  /* ASCII */
+            x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
         case 'E':  /* ANSEL */
-            x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb);
-            if (!x)
-            {
-                no_read_sub = 0;
-                x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb);
-            }
+            x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
             break;
         case 'g':  /* Greek */
-            x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case 'b':  /* Subscripts */
-            x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case 'p':  /* Superscripts */
-            x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case '2':  /* Basic Hebrew */
-            x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case 'N':  /* Basic Cyrillic */
-            x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case 'Q':  /* Extended Cyrillic */
-            x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case '3':  /* Basic Arabic */
-            x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case '4':  /* Extended Arabic */
-            x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case 'S':  /* Greek */
-            x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case '1':  /* Chinese, Japanese, Korean (EACC) */
-            x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         default:
             *no_read = 0;
@@ -524,67 +502,67 @@ static unsigned long lookup_marc8(yaz_iconv_t cd,
         inp = (unsigned char *) utf8_buf;
         inbytesleft = strlen(utf8_buf);
 
-        x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(B";
             return x;
         }
-        x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(B";
             return x;
         }
-        x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "b";
             return x;
         }
-        x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "p";
             return x;
         }
-        x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(2";
             return x;
         }
-        x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(N";
             return x;
         }
-        x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(Q";
             return x;
         }
-        x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(3";
             return x;
         }
-        x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(4";
             return x;
         }
-        x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(S";
             return x;
         }
-        x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "$1";
@@ -599,9 +577,6 @@ static size_t flush_combos(yaz_iconv_t cd,
                            char **outbuf, size_t *outbytesleft)
 {
     unsigned long y = cd->write_marc8_last;
-    unsigned char byte;
-    char out_buf[4];
-    size_t out_no = 0;
 
     if (!y)
         return 0;
@@ -615,25 +590,38 @@ static size_t flush_combos(yaz_iconv_t cd,
             return r;
     }
 
-    byte = (unsigned char )((y>>16) & 0xff);
-    if (byte)
-        out_buf[out_no++] = byte;
-    byte = (unsigned char)((y>>8) & 0xff);
-    if (byte)
-        out_buf[out_no++] = byte;
-    byte = (unsigned char )(y & 0xff);
-    if (byte)
-        out_buf[out_no++] = byte;
-
-    if (out_no + 2 >= *outbytesleft)
+    if (9 >= *outbytesleft)
     {
         cd->my_errno = YAZ_ICONV_E2BIG;
         return (size_t) (-1);
     }
+    if (cd->write_marc8_ncr)
+    {
+        yaz_snprintf(*outbuf, 9, "&#x%04x;", y);
+        (*outbytesleft) -= 8;
+        (*outbuf) += 8;
+    }
+    else
+    {
+        char out_buf[4];
+        size_t out_no = 0;
+        unsigned char byte;
+
+
+        byte = (unsigned char )((y>>16) & 0xff);
+        if (byte)
+            out_buf[out_no++] = byte;
+        byte = (unsigned char)((y>>8) & 0xff);
+        if (byte)
+            out_buf[out_no++] = byte;
+        byte = (unsigned char )(y & 0xff);
+        if (byte)
+            out_buf[out_no++] = byte;
+        memcpy(*outbuf, out_buf, out_no);
+        *outbuf += out_no;
+        (*outbytesleft) -= out_no;
+    }
 
-    memcpy(*outbuf, out_buf, out_no);
-    *outbuf += out_no;
-    (*outbytesleft) -= out_no;
     if (cd->write_marc8_second_half_char)
     {
         *(*outbuf)++ = cd->write_marc8_second_half_char;
@@ -641,6 +629,7 @@ static size_t flush_combos(yaz_iconv_t cd,
     }        
 
     cd->write_marc8_last = 0;
+    cd->write_marc8_ncr = 0;
     cd->write_marc8_lpage = 0;
     cd->write_marc8_second_half_char = 0;
     return 0;
@@ -698,14 +687,27 @@ static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd,
 
 
 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
-                                char **outbuf, size_t *outbytesleft)
+                                char **outbuf, size_t *outbytesleft,
+                                int loss_mode)
 {
     int comb = 0;
+    int enable_ncr = 0;
     const char *page_chr = 0;
     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
 
     if (!y)
-        return (size_t) (-1);
+    {
+        if (loss_mode == 0 || cd->my_errno != YAZ_ICONV_EILSEQ)
+            return (size_t) (-1);
+        page_chr = ESC "(B";
+        if (loss_mode == 1)
+            y = '|';
+        else
+        {
+            y = x; 
+            enable_ncr = 1;
+        }
+    }
 
     if (comb)
     {
@@ -737,6 +739,7 @@ static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
 
         cd->write_marc8_last = y;
         cd->write_marc8_lpage = page_chr;
+        cd->write_marc8_ncr = enable_ncr;
     }
     return 0;
 }
@@ -751,8 +754,31 @@ static size_t yaz_flush_marc8(yaz_iconv_t cd,
     return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, ESC "(B");
 }
 
-static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
-                              char **outbuf, size_t *outbytesleft)
+static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x,
+                                      char **outbuf, size_t *outbytesleft,
+                                      int loss_mode); /* 0 normal, 1 lossy, 2 lossless */
+/* Strict MARC-8 writer: loss_mode 0, an unmappable character is an error. */
+static size_t yaz_write_marc8_normal(yaz_iconv_t cd, unsigned long x,
+                                     char **outbuf, size_t *outbytesleft)
+{
+    return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 0);
+}
+/* Lossy MARC-8 writer: loss_mode 1, an unmappable character becomes '|'. */
+static size_t yaz_write_marc8_lossy(yaz_iconv_t cd, unsigned long x,
+                                    char **outbuf, size_t *outbytesleft)
+{
+    return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 1);
+}
+/* Lossless MARC-8 writer: loss_mode 2, unmappable chars become "&#x...;". */
+static size_t yaz_write_marc8_lossless(yaz_iconv_t cd, unsigned long x,
+                                    char **outbuf, size_t *outbytesleft)
+{
+    return yaz_write_marc8_generic(cd, x, outbuf, outbytesleft, 2);
+}
+
+static size_t yaz_write_marc8_generic(yaz_iconv_t cd, unsigned long x,
+                                      char **outbuf, size_t *outbytesleft,
+                                      int loss_mode)
 {
     int i;
     for (i = 0; latin1_comb[i].x1; i++)
@@ -767,11 +793,11 @@ static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
             const char *lpage = cd->write_marc8_lpage;
 
             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
-                                  outbuf, outbytesleft);
+                                  outbuf, outbytesleft, loss_mode);
             if (r)
                 return r;
             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
-                                  outbuf, outbytesleft);
+                                  outbuf, outbytesleft, loss_mode);
             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
             {
                 /* not enough room. reset output to original values */
@@ -783,7 +809,7 @@ static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
             return r;
         }
     }
-    return yaz_write_marc8_2(cd, x, outbuf, outbytesleft);
+    return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, loss_mode);
 }
 
 
@@ -870,12 +896,22 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
             cd->write_handle = yaz_write_UCS4LE;
         else if (!yaz_matchstr(tocode, "MARC8"))
         {
-            cd->write_handle = yaz_write_marc8;
+            cd->write_handle = yaz_write_marc8_normal;
             cd->flush_handle = yaz_flush_marc8;
         }
         else if (!yaz_matchstr(tocode, "MARC8s"))
         {
-            cd->write_handle = yaz_write_marc8;
+            cd->write_handle = yaz_write_marc8_normal;
+            cd->flush_handle = yaz_flush_marc8;
+        }
+        else if (!yaz_matchstr(tocode, "MARC8lossy"))
+        {
+            cd->write_handle = yaz_write_marc8_lossy;
+            cd->flush_handle = yaz_flush_marc8;
+        }
+        else if (!yaz_matchstr(tocode, "MARC8lossless"))
+        {
+            cd->write_handle = yaz_write_marc8_lossless;
             cd->flush_handle = yaz_flush_marc8;
         }
         else if (!yaz_matchstr(tocode, "advancegreek"))
@@ -956,13 +992,14 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
     {
         cd->my_errno = YAZ_ICONV_UNKNOWN;
         cd->g0_mode = 'B';
-        cd->g1_mode = 'B';
+        cd->g1_mode = 'E';
         
         cd->comb_offset = cd->comb_size = 0;
         cd->compose_char = 0;
         
         cd->write_marc8_second_half_char = 0;
         cd->write_marc8_last = 0;
+        cd->write_marc8_ncr = 0;
         cd->write_marc8_lpage = 0;
         cd->write_marc8_g0 = ESC "(B";
         cd->write_marc8_g1 = 0;