Handle G1 in MARC-8 decoding.
author Adam Dickmeiss <adam@indexdata.dk>
Tue, 25 Mar 2008 20:50:41 +0000 (21:50 +0100)
committer Adam Dickmeiss <adam@indexdata.dk>
Tue, 25 Mar 2008 20:50:41 +0000 (21:50 +0100)
MARC-8 decoding now handles G1 sequences in any code page. Until
now, the code handled G1 only in the ANSEL page.

src/charconv.tcl
src/siconv.c
test/tsticonv.c

index 5732f8d..3086d26 100755 (executable)
@@ -36,16 +36,16 @@ proc preamble_trie {ofilehandle ifiles ofile} {
     "
     puts $f {
         static unsigned long lookup(struct yaz_iconv_trie **ptrs, int ptr, unsigned char *inp,
-                                    size_t inbytesleft, size_t *no_read, int *combining)
+                                    size_t inbytesleft, size_t *no_read, int *combining, unsigned mask, int boffset)
         {
             struct yaz_iconv_trie *t = (ptr > 0) ? ptrs[ptr-1] : 0;
             if (!t || inbytesleft < 1)
                 return 0;
             if (t->dir)
             {
-                size_t ch = inp[0] & 0xff;
+                size_t ch = (inp[0] & mask) + boffset;
                 unsigned long code =
-                lookup(ptrs, t->dir[ch].ptr, inp+1, inbytesleft-1, no_read, combining);
+                lookup(ptrs, t->dir[ch].ptr, inp+1, inbytesleft-1, no_read, combining, mask, boffset);
                 if (code)
                 {
                     (*no_read)++;
@@ -67,7 +67,13 @@ proc preamble_trie {ofilehandle ifiles ofile} {
                     size_t len = strlen(flat->from);
                     if (len <= inbytesleft)
                     {
-                        if (memcmp(flat->from, inp, len) == 0)
+                        size_t i;
+                       for (i = 0; i < len; i++)
+                       {
+                           if (((unsigned char *) flat->from)[i] != (inp[i] & mask) + boffset)
+                               break;
+                       }
+                        if (i == len)
                         {
                             *no_read = len;
                            *combining = flat->combining;
@@ -256,11 +262,11 @@ proc dump_trie {ofilehandle} {
     puts $f ""
 
     puts $f "unsigned long yaz_$trie(prefix)_conv
-            (unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining)
+            (unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining, unsigned mask, int boffset)
         {
             unsigned long code;
             
-            code = lookup($trie(prefix)ptrs, 1, inp, inbytesleft, no_read, combining);
+            code = lookup($trie(prefix)ptrs, 1, inp, inbytesleft, no_read, combining, mask, boffset);
             if (!code)
             {
                 *no_read = 1;
index 8557bbd..3a9c8e1 100644 (file)
 #include <yaz/nmem.h>
 #include "iconv-p.h"
 
-
-unsigned long yaz_marc8_42_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_45_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_67_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_62_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_70_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_32_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_4E_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_51_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_33_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_34_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_53_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-unsigned long yaz_marc8_31_conv(unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read, int *combining);
-
-
-unsigned long yaz_marc8r_42_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_45_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_67_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_62_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_70_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_32_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_4E_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_51_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_33_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_34_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_53_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
-unsigned long yaz_marc8r_31_conv(unsigned char *inp, size_t inbytesleft,
-                                 size_t *no_read, int *combining);
+typedef unsigned long yaz_conv_func_t(unsigned char *inp, size_t inbytesleft,
+                                      size_t *no_read, int *combining,
+                                      unsigned mask, int boffset);
+
+
+yaz_conv_func_t yaz_marc8_42_conv;
+yaz_conv_func_t yaz_marc8_45_conv;
+yaz_conv_func_t yaz_marc8_67_conv;
+yaz_conv_func_t yaz_marc8_62_conv;
+yaz_conv_func_t yaz_marc8_70_conv;
+yaz_conv_func_t yaz_marc8_32_conv;
+yaz_conv_func_t yaz_marc8_4E_conv;
+yaz_conv_func_t yaz_marc8_51_conv;
+yaz_conv_func_t yaz_marc8_33_conv;
+yaz_conv_func_t yaz_marc8_34_conv;
+yaz_conv_func_t yaz_marc8_53_conv;
+yaz_conv_func_t yaz_marc8_31_conv;
+
+yaz_conv_func_t yaz_marc8r_42_conv;
+yaz_conv_func_t yaz_marc8r_45_conv;
+yaz_conv_func_t yaz_marc8r_67_conv;
+yaz_conv_func_t yaz_marc8r_62_conv;
+yaz_conv_func_t yaz_marc8r_70_conv;
+yaz_conv_func_t yaz_marc8r_32_conv;
+yaz_conv_func_t yaz_marc8r_4E_conv;
+yaz_conv_func_t yaz_marc8r_51_conv;
+yaz_conv_func_t yaz_marc8r_33_conv;
+yaz_conv_func_t yaz_marc8r_34_conv;
+yaz_conv_func_t yaz_marc8r_53_conv;
+yaz_conv_func_t yaz_marc8r_31_conv;
 
 struct yaz_iconv_struct {
     int my_errno;
@@ -372,43 +351,40 @@ static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
         {
         case 'B':  /* Basic ASCII */
         case 's':  /* ASCII */
+            x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
         case 'E':  /* ANSEL */
-            x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb);
-            if (!x)
-            {
-                no_read_sub = 0;
-                x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb);
-            }
+            x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
             break;
         case 'g':  /* Greek */
-            x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case 'b':  /* Subscripts */
-            x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case 'p':  /* Superscripts */
-            x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case '2':  /* Basic Hebrew */
-            x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case 'N':  /* Basic Cyrillic */
-            x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case 'Q':  /* Extended Cyrillic */
-            x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case '3':  /* Basic Arabic */
-            x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case '4':  /* Extended Arabic */
-            x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case 'S':  /* Greek */
-            x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         case '1':  /* Chinese, Japanese, Korean (EACC) */
-            x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb);
+            x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
             break;
         default:
             *no_read = 0;
@@ -524,67 +500,67 @@ static unsigned long lookup_marc8(yaz_iconv_t cd,
         inp = (unsigned char *) utf8_buf;
         inbytesleft = strlen(utf8_buf);
 
-        x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(B";
             return x;
         }
-        x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(B";
             return x;
         }
-        x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "b";
             return x;
         }
-        x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "p";
             return x;
         }
-        x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(2";
             return x;
         }
-        x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(N";
             return x;
         }
-        x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(Q";
             return x;
         }
-        x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(3";
             return x;
         }
-        x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(4";
             return x;
         }
-        x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "(S";
             return x;
         }
-        x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb);
+        x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
         if (x)
         {
             *page_chr = ESC "$1";
@@ -956,7 +932,7 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
     {
         cd->my_errno = YAZ_ICONV_UNKNOWN;
         cd->g0_mode = 'B';
-        cd->g1_mode = 'B';
+        cd->g1_mode = 'E';
         
         cd->comb_offset = cd->comb_size = 0;
         cd->compose_char = 0;
index f31b6d6..08394ed 100644 (file)
@@ -423,6 +423,9 @@ static void tst_marc8_to_utf8(void)
     YAZ_CHECK(tst_convert_x(cd, ESC "(", "", YAZ_ICONV_EINVAL));
     YAZ_CHECK(tst_convert_x(cd, ESC "(B", "", 0));
 
+    YAZ_CHECK(tst_convert(cd, ESC "(B" "\x31", "1"));  /* ASCII in G0 */
+    YAZ_CHECK(tst_convert(cd, ESC ")B" "\xB1", "1"));  /* ASCII in G1 */
+
     yaz_iconv_close(cd);
 }