add copy of iconv_decode_marc8.c

author Wolfram Schneider <wosch@indexdata.dk>

Mon, 17 Nov 2008 14:17:16 +0000 (15:17 +0100)

committer Wolfram Schneider <wosch@indexdata.dk>

Mon, 17 Nov 2008 14:17:16 +0000 (15:17 +0100)
author Wolfram Schneider <wosch@indexdata.dk>
Mon, 17 Nov 2008 14:17:16 +0000 (15:17 +0100)
committer Wolfram Schneider <wosch@indexdata.dk>
Mon, 17 Nov 2008 14:17:16 +0000 (15:17 +0100)
diff --git a/src/iconv_decode_iso5426.c b/src/iconv_decode_iso5426.c

new file mode 100644 (file)

index 0000000..eecee04
--- /dev/null
+++ b/src/iconv_decode_iso5426.c
@@ -0,0 +1,287 @@
+/* This file is part of the YAZ toolkit.
+ * Copyright (C) 1995-2008 Index Data
+ * See the file LICENSE for details.
+ */
+/**
+ * \file
+ * \brief MARC-8 decoding
+ *
+ * MARC-8 reference:
+ *  http://www.loc.gov/marc/specifications/speccharmarc8.html
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <ctype.h>
+
+#include <yaz/xmalloc.h>
+#include "iconv-p.h"
+/**
+ * Decoder state used by the read handlers in this file.
+ */
+struct decoder_data {
+    int g0_mode; /* final byte of the set selected for G0 (input bytes < 128) */
+    int g1_mode; /* final byte of the set selected for G1 (input bytes >= 128) */
+
+    int comb_offset; /* index of the next buffered value to hand out */
+    int comb_size; /* number of values currently buffered */
+    unsigned long comb_x[8]; /* buffered values that were flagged as combining */
+    size_t comb_no_read[8]; /* byte count recorded together with each buffered value */
+};
+/*
+ * Per-character-set conversion routines called from yaz_read_marc8_comb().
+ * The hexadecimal part of each name is the code of the final byte that
+ * selects the routine there (e.g. 0x42 == 'B').
+ * A comment in read_marc8() states that they are generated by charconv.tcl.
+ */
+yaz_conv_func_t yaz_marc8_42_conv; /* 'B' and 's' */
+yaz_conv_func_t yaz_marc8_45_conv; /* 'E' */
+yaz_conv_func_t yaz_marc8_67_conv; /* 'g' */
+yaz_conv_func_t yaz_marc8_62_conv; /* 'b' */
+yaz_conv_func_t yaz_marc8_70_conv; /* 'p' */
+yaz_conv_func_t yaz_marc8_32_conv; /* '2' */
+yaz_conv_func_t yaz_marc8_4E_conv; /* 'N' */
+yaz_conv_func_t yaz_marc8_51_conv; /* 'Q' */
+yaz_conv_func_t yaz_marc8_33_conv; /* '3' */
+yaz_conv_func_t yaz_marc8_34_conv; /* '4' */
+yaz_conv_func_t yaz_marc8_53_conv; /* 'S' */
+yaz_conv_func_t yaz_marc8_31_conv; /* '1' */
+
+/* Forward declaration: read_marc8() calls this before it is defined below. */
+static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd,
+                                         struct decoder_data *data,
+                                         unsigned char *inp,
+                                         size_t inbytesleft, size_t *no_read,
+                                         int *comb);
+/**
+ * Read handler for MARC-8: returns the next value to emit.
+ *
+ * If buffered values are pending, the next one is returned together with
+ * the byte count recorded for it. Otherwise yaz_read_marc8_comb() is
+ * called repeatedly: every value it flags as combining is buffered (at
+ * most 8) and the first value that is not flagged is returned right away.
+ * The buffered values are then handed out, one per call, by the calls
+ * that follow.
+ *
+ * \param cd          conversion handle, used for error reporting
+ * \param d           decoder; d->data holds the struct decoder_data state
+ * \param inp         input bytes
+ * \param inbytesleft number of bytes available at inp
+ * \param no_read     receives the byte count that belongs to the returned
+ *                    value (0 when input ends after buffered values)
+ * \return the value read, or 0. 0 is returned with YAZ_ICONV_EINVAL set
+ *         when the input ends after one or more combining values, and
+ *         whenever yaz_read_marc8_comb() returns 0.
+ *
+ * NOTE(review): when 8 combining values are read in a row the loop ends
+ * with comb_size == 8 and the 8th value is returned although it is also
+ * stored in comb_x[7]; it looks like it is emitted (and its bytes counted)
+ * twice -- confirm.
+ * NOTE(review): on the EINVAL path and when yaz_read_marc8_comb() returns 0
+ * comb_size may stay non-zero, so the next call starts by handing out the
+ * values buffered so far -- verify that the caller expects this.
+ */
+static unsigned long read_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
+                               unsigned char *inp,
+                               size_t inbytesleft, size_t *no_read)
+{
+    struct decoder_data *data = (struct decoder_data *) d->data;
+    unsigned long x;
+    if (data->comb_offset < data->comb_size) /* buffered values pending */
+    {
+        *no_read = data->comb_no_read[data->comb_offset];
+        x = data->comb_x[data->comb_offset];
+
+        /* Historical note: the double-diacritic combining characters
+           INVERTED BREVE and DOUBLE TILDE used to be a special case here.
+           The no_read counter was incremented by 1 in order to skip over
+           the processing of the closing ligature character.
+        */
+        /* That code is no longer necessary. The handlers
+           yaz_marc8_?_conv (generated by charconv.tcl) now return
+           0 and no_read=1 when a sequence does not match the input.
+           The SECOND HALFs in codetables.xml produce a non-existent
+           entry in the conversion trie. Hence, when met, the input byte is
+           skipped as it should be (in yaz_iconv).
+        */
+#if 0
+        if (x == 0x0361 || x == 0x0360)
+            *no_read += 1;
+#endif
+        data->comb_offset++;
+        return x;
+    }
+
+    data->comb_offset = 0;
+    for (data->comb_size = 0; data->comb_size < 8; data->comb_size++)
+    {
+        int comb = 0;
+
+        if (inbytesleft == 0 && data->comb_size) /* input ended after combining value(s) */
+        {
+            yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL);
+            x = 0;
+            *no_read = 0;
+            break;
+        }
+        x = yaz_read_marc8_comb(cd, data, inp, inbytesleft, no_read, &comb);
+        if (!comb || !x) /* not combining, or nothing decoded: return it now */
+            break;
+        data->comb_x[data->comb_size] = x;
+        data->comb_no_read[data->comb_size] = *no_read;
+        inp += *no_read;
+        inbytesleft = inbytesleft - *no_read;
+    }
+    return x;
+}
+/**
+ * Read handler installed for "MARC8s".
+ *
+ * Calls read_marc8() and, when the result is non-zero and exactly one
+ * value is buffered, passes the result and that buffered value to
+ * yaz_iso_8859_1_lookup_x12(). If the lookup succeeds, x is replaced by
+ * the lookup result, the byte count of the buffered value is added to
+ * *no_read and the buffer is emptied, so one value is returned for both.
+ * (Presumably the lookup yields a single precomposed ISO-8859-1
+ * character -- confirm against yaz_iso_8859_1_lookup_x12.)
+ *
+ * \param cd          conversion handle, passed on to read_marc8()
+ * \param d           decoder; d->data holds the struct decoder_data state
+ * \param inp         input bytes
+ * \param inbytesleft number of bytes available at inp
+ * \param no_read     receives the byte count for the returned value
+ * \return the value to emit, or 0 when read_marc8() returns 0
+ */
+static unsigned long read_marc8s(yaz_iconv_t cd, yaz_iconv_decoder_t d,
+                                 unsigned char *inp,
+                                 size_t inbytesleft, size_t *no_read)
+{
+    struct decoder_data *data = (struct decoder_data *) d->data;
+    unsigned long x = read_marc8(cd, d, inp, inbytesleft, no_read);
+    if (x && data->comb_size == 1)
+    {
+        if (yaz_iso_8859_1_lookup_x12(x, data->comb_x[0], &x))
+        {
+            *no_read += data->comb_no_read[0]; /* account for the buffered value's bytes too */
+            data->comb_size = 0; /* buffered value is consumed; nothing left to hand out */
+        }
+    }
+    return x;
+}
+/**
+ * Decode a single value from the input.
+ *
+ * Leading escape sequences (introduced by byte 27) are consumed first;
+ * each one stores its final byte in data->g0_mode or data->g1_mode.
+ * A space is returned as is. Any other byte is passed to the conversion
+ * routine selected by g0_mode (byte < 128) or g1_mode (byte >= 128).
+ *
+ * \param cd          conversion handle, used for error reporting
+ * \param data        decoder state; g0_mode / g1_mode are read and updated
+ * \param inp         input bytes
+ * \param inbytesleft number of bytes available at inp
+ * \param no_read     receives the number of bytes consumed, escape
+ *                    sequences included; 0 on error
+ * \param comb        cleared and then passed to the conversion routine;
+ *                    read_marc8() buffers the value when it is non-zero.
+ *                    Not written on the space, end-of-input and
+ *                    incomplete-escape paths.
+ * \return the decoded value; 0 when no input remains, when an escape
+ *         sequence is cut short (YAZ_ICONV_EINVAL) or when the selected
+ *         set is not known (YAZ_ICONV_EILSEQ)
+ */
+static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd,
+                                         struct decoder_data *data,
+                                         unsigned char *inp,
+                                         size_t inbytesleft, size_t *no_read,
+                                         int *comb)
+{
+    *no_read = 0;
+    while (inbytesleft > 0 && *inp == 27) /* 27 is ESC */
+    {
+        int *modep = &data->g0_mode; /* G0 unless a G1 intermediate follows */
+        size_t inbytesleft0 = inbytesleft;
+
+        inbytesleft--;
+        inp++;
+        if (inbytesleft == 0)
+            goto incomplete;
+        if (*inp == '$') /* set with multiple bytes */
+        {
+            inbytesleft--;
+            inp++;
+        }
+        if (inbytesleft == 0)
+            goto incomplete;
+        if (*inp == '(' || *inp == ',')  /* G0 */
+        {
+            inbytesleft--;
+            inp++;
+        }
+        else if (*inp == ')' || *inp == '-') /* G1 */
+        {
+            inbytesleft--;
+            inp++;
+            modep = &data->g1_mode;
+        }
+        if (inbytesleft == 0)
+            goto incomplete;
+        if (*inp == '!') /* ANSEL is a special case */
+        {
+            inbytesleft--;
+            inp++;
+        }
+        if (inbytesleft == 0)
+            goto incomplete;
+        *modep = *inp++; /* Final character */
+        inbytesleft--;
+
+        (*no_read) += inbytesleft0 - inbytesleft; /* length of this escape sequence */
+    }
+    if (inbytesleft == 0)
+        return 0;
+    else if (*inp == ' ') /* space is returned directly, whatever set is selected */
+    {
+        *no_read += 1;
+        return ' ';
+    }
+    else
+    {
+        unsigned long x;
+        size_t no_read_sub = 0;
+        int mode = *inp < 128 ? data->g0_mode : data->g1_mode;
+        *comb = 0;
+
+        /* NOTE(review): the meaning of the two trailing arguments is defined
+           by the conversion routines, which are not part of this file; ANSEL
+           is the only set that is passed 128 as the last one -- confirm. */
+        switch(mode)
+        {
+        case 'B':  /* Basic ASCII */
+        case 's':  /* ASCII */
+            x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
+        case 'E':  /* ANSEL */
+            x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
+            break;
+        case 'g':  /* Greek symbols (per the MARC-8 specification) */
+            x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
+        case 'b':  /* Subscripts */
+            x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
+        case 'p':  /* Superscripts */
+            x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
+        case '2':  /* Basic Hebrew */
+            x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
+        case 'N':  /* Basic Cyrillic */
+            x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
+        case 'Q':  /* Extended Cyrillic */
+            x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
+        case '3':  /* Basic Arabic */
+            x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
+        case '4':  /* Extended Arabic */
+            x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
+        case 'S':  /* Basic Greek (per the MARC-8 specification) */
+            x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
+        case '1':  /* Chinese, Japanese, Korean (EACC) */
+            x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
+            break;
+        default: /* unknown final byte: report an illegal sequence */
+            *no_read = 0;
+            yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
+            return 0;
+        }
+        *no_read += no_read_sub; /* escape sequence bytes plus the bytes of the value */
+        return x;
+    }
+incomplete: /* input ended in the middle of an escape sequence */
+    *no_read = 0;
+    yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL);
+    return 0;
+}
+
+/**
+ * Init handler: resets the decoder state.
+ *
+ * G0 is set to 'B' and G1 to 'E' (the ASCII and ANSEL cases of
+ * yaz_read_marc8_comb()), and the buffer of combining values is emptied.
+ * The parameters cd, inp, inbytesleft and no_read are not used.
+ *
+ * \return always 0
+ */
+static size_t init_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
+                         unsigned char *inp,
+                         size_t inbytesleft, size_t *no_read)
+{
+    struct decoder_data *data = (struct decoder_data *) d->data;
+    data->g0_mode = 'B';
+    data->g1_mode = 'E';
+    data->comb_offset = data->comb_size = 0;
+    return 0;
+}
+/**
+ * Destroy handler: releases the state that yaz_marc8_decoder() allocated
+ * and stored in d->data.
+ *
+ * NOTE(review): unlike the other handlers this function is not static.
+ * The commit describes this file as a copy of iconv_decode_marc8.c, so an
+ * external symbol of the same name may exist there too -- confirm that
+ * the two do not clash at link time.
+ */
+void destroy_marc8(yaz_iconv_decoder_t d)
+{
+    struct decoder_data *data = (struct decoder_data *) d->data;
+    xfree(data);
+}
+/**
+ * Configures decoder d for the source encoding named by fromcode.
+ *
+ * The name is compared with yaz_matchstr(); a zero result selects the
+ * branch. "MARC8", "ANSEL" and "ISO5426" install read_marc8(), "MARC8s"
+ * installs read_marc8s(). For a recognised name a struct decoder_data is
+ * allocated with xmalloc() and stored in d->data, and the init and destroy
+ * handlers are installed. The state is left uninitialised here; it is
+ * filled in by init_marc8().
+ *
+ * \param fromcode name of the source encoding
+ * \param d        decoder to configure
+ * \return d when fromcode is recognised, 0 otherwise (d is then untouched)
+ *
+ * NOTE(review): the commit describes this file as a copy of
+ * iconv_decode_marc8.c; presumably that file defines yaz_marc8_decoder as
+ * well -- confirm that only one of the two definitions gets linked.
+ */
+yaz_iconv_decoder_t yaz_marc8_decoder(const char *fromcode,
+                                      yaz_iconv_decoder_t d)
+{
+    if (!yaz_matchstr(fromcode, "MARC8") || !yaz_matchstr(fromcode, "ANSEL"))
+        d->read_handle = read_marc8;
+    else if (!yaz_matchstr(fromcode, "ISO5426"))
+        d->read_handle = read_marc8;
+    else if (!yaz_matchstr(fromcode, "MARC8s"))
+        d->read_handle = read_marc8s;
+    else
+        return 0;
+    { /* reached only when one of the names above matched */
+        struct decoder_data *data = (struct decoder_data *)
+            xmalloc(sizeof(*data));
+        d->data = data;
+        d->init_handle = init_marc8;
+        d->destroy_handle = destroy_marc8;
+    }
+    return d;
+}
+
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
author	Wolfram Schneider <wosch@indexdata.dk>
	Mon, 17 Nov 2008 14:17:16 +0000 (15:17 +0100)
committer	Wolfram Schneider <wosch@indexdata.dk>
	Mon, 17 Nov 2008 14:17:16 +0000 (15:17 +0100)