MARC line reader: allow control fields of size 1
[yaz-moved-to-github.git] / src / marc_read_line.c
index c05af1b..eefeb84 100644 (file)
@@ -1,13 +1,11 @@
-/*
- * Copyright (C) 1995-2006, Index Data ApS
+/* This file is part of the YAZ toolkit.
+ * Copyright (C) 1995-2010 Index Data
  * See the file LICENSE for details.
- *
- * $Id: marc_read_line.c,v 1.1 2006-12-15 12:37:18 adam Exp $
  */
 
 /**
- * \file marc_read_iso2709.c
- * \brief Implements reading of MARC as ISO2709
+ * \file marc_read_line.c
+ * \brief Implements reading of MARC in line format
  */
 
 #if HAVE_CONFIG_H
@@ -18,6 +16,7 @@
 #include <windows.h>
 #endif
 
+#include <assert.h>
 #include <stdio.h>
 #include <string.h>
 #include <ctype.h>
 #include <yaz/wrbuf.h>
 #include <yaz/yaz-util.h>
 
+/**
+ * Reads one line from a byte source and appends its content to w.
+ *
+ * Bytes are fetched with getbyte() and appended to w until a zero byte,
+ * a '\r' or a '\n' is seen; that terminating byte is not stored.  After
+ * a '\r' or '\n' one more byte is fetched: it is dropped when it is the
+ * other half of a two-byte line ending ("\r\n" or "\n\r") or a zero
+ * byte, and handed back through ungetbyte() otherwise.
+ *
+ * \param getbyte     callback returning the next input byte
+ * \param ungetbyte   callback pushing one byte back to the input
+ * \param client_data opaque handle passed to both callbacks
+ * \param w           buffer receiving the line (appended, not rewound)
+ * \retval 1 at least one byte was appended to w
+ * \retval 0 nothing was appended (an empty line and a zero byte as
+ *           first byte are not distinguished)
+ */
+int yaz_gets(int (*getbyte)(void *client_data),
+             void (*ungetbyte)(int b, void *client_data),
+             void *client_data,
+             WRBUF w)
+{
+    int got_data = 0;
+    int ch;
+
+    for (ch = getbyte(client_data);
+         ch != '\0' && ch != '\r' && ch != '\n';
+         ch = getbyte(client_data))
+    {
+        wrbuf_putc(w, ch);
+        got_data = 1;
+    }
+    if (ch == '\r' || ch == '\n')
+    {
+        /* look one byte ahead for the second half of the line ending */
+        const int partner = (ch == '\r') ? '\n' : '\r';
+        const int peek = getbyte(client_data);
+
+        if (peek != partner && peek != '\0')
+            ungetbyte(peek, client_data);
+    }
+    return got_data;
+}
+
+/**
+ * Reads one logical MARC line into w, joining continuation lines.
+ *
+ * w is rewound and a first physical line is read with yaz_gets().  As
+ * long as the following physical line starts with four blanks it is
+ * treated as a continuation and appended to w.  Before appending, a
+ * trailing '=' is cut off when w holds more than 60 bytes; otherwise a
+ * single blank is inserted as separator.
+ *
+ * While probing for the four blanks, only the first non-blank byte is
+ * pushed back with ungetbyte(); blanks consumed before it are not
+ * restored.
+ *
+ * \retval 0 the first yaz_gets() call delivered nothing
+ * \retval 1 a line is available in w
+ */
+static int yaz_marc_line_gets(int (*getbyte)(void *client_data),
+                              void (*ungetbyte)(int b, void *client_data),
+                              void *client_data,
+                              WRBUF w)
+{
+    wrbuf_rewind(w);
+    if (!yaz_gets(getbyte, ungetbyte, client_data, w))
+        return 0;
+
+    do
+    {
+        int blanks;
+
+        /* a continuation line is announced by four leading blanks */
+        for (blanks = 0; blanks < 4; blanks++)
+        {
+            int ch = getbyte(client_data);
+            if (ch == ' ')
+                continue;
+            if (ch)
+                ungetbyte(ch, client_data);
+            return 1;
+        }
+        if (wrbuf_len(w) > 60 && wrbuf_buf(w)[wrbuf_len(w)-1] == '=')
+            wrbuf_cut_right(w, 1);
+        else
+            wrbuf_puts(w, " ");
+    } while (yaz_gets(getbyte, ungetbyte, client_data, w));
+    return 1;
+}
+
+    
 int yaz_marc_read_line(yaz_marc_t mt,
                        int (*getbyte)(void *client_data),
                        void (*ungetbyte)(int b, void *client_data),
                        void *client_data)
 {
+    int indicator_length;
+    int identifier_length;
+    int base_address;
+    int length_data_entry;
+    int length_starting;
+    int length_implementation;
+    int marker_ch = 0;
+    int marker_skip = 0;
+    int header_created = 0;
+    WRBUF wrbuf_line = wrbuf_alloc();
+
     yaz_marc_reset(mt);
 
-    return -1;
+    while (yaz_marc_line_gets(getbyte, ungetbyte, client_data, wrbuf_line))
+    {
+        const char *line = wrbuf_cstr(wrbuf_line);
+        int val;
+        size_t line_len = strlen(line);
+        if (line_len == 0)       /* empty line indicates end of record */
+        {
+            if (header_created)
+                break;
+        }
+        else if (line[0] == '$') /* indicates beginning/end of record */
+        {
+            if (header_created)
+                break;
+        }
+        else if (line[0] == '(') /* annotation, skip it */
+            ;
+        else if (line_len == 24 && atoi_n_check(line, 5, &val) && val >= 24)
+        {
+            /* deal with header lines:  00366nam  22001698a 4500
+            */
+
+            if (header_created)
+                break;
+            yaz_marc_set_leader(mt, line,
+                                &indicator_length,
+                                &identifier_length,
+                                &base_address,
+                                &length_data_entry,
+                                &length_starting,
+                                &length_implementation);
+            header_created = 1;
+        }
+        else if (line_len > 4 && line[0] != ' ' && line[1] != ' '
+                 && line[2] != ' ' && line[3] == ' ' )
+        {
+            /* deal with data/control lines: 245 12 ........ */
+            char tag[4];
+            const char *datafield_start = line+6;
+            marker_ch = 0;
+            marker_skip = 0;
+
+            memcpy(tag, line, 3);
+            tag[3] = '\0';
+            if (line_len >= 8) /* control - or datafield ? */
+            {
+                if (*datafield_start == ' ')
+                    datafield_start++;  /* skip blank after indicator */
+
+                if (strchr("$_*", *datafield_start))
+                {
+                    marker_ch = *datafield_start;
+                    if (datafield_start[2] == ' ')
+                        marker_skip = 1; /* subfields has blank before data */
+                }
+            }
+            if (!header_created)
+            {
+                const char *leader = "01000cam  2200265 i 4500";
+
+                yaz_marc_set_leader(mt, leader,
+                                    &indicator_length,
+                                    &identifier_length,
+                                    &base_address,
+                                    &length_data_entry,
+                                    &length_starting,
+                                    &length_implementation);
+                header_created = 1;
+            }
+
+            if (marker_ch == 0)
+            {   /* control field */
+                yaz_marc_add_controlfield(mt, tag, line+4, strlen(line+4));
+            }
+            else
+            {   /* data field */
+                const char *indicator = line+4;
+                int indicator_len = 2;
+                const char *cp = datafield_start;
+
+                yaz_marc_add_datafield(mt, tag, indicator, indicator_len);
+                for (;;)
+                {
+                    const char *next;
+                    size_t len;
+                    
+                    assert(cp[0] == marker_ch);
+                    cp++;
+                    next = cp;
+                    while ((next = strchr(next, marker_ch)))
+                    {
+                        if ((next[1] >= 'A' && next[1] <= 'Z')
+                            ||(next[1] >= 'a' && next[1] <= 'z'))
+                        {
+                            if (!marker_skip)
+                                break;
+                            else if (next[2] == ' ')
+                                break;
+                        }
+                        next++;
+                    }
+                    len = strlen(cp);
+                    if (next)
+                        len = next - cp - marker_skip;
+
+                    if (marker_skip)
+                    {
+                        /* remove ' ' after subfield marker */
+                        char *cp_blank = strchr(cp, ' ');
+                        if (cp_blank)
+                        {
+                            len--;
+                            while (cp_blank != cp)
+                            {
+                                cp_blank[0] = cp_blank[-1];
+                                cp_blank--;
+                            }
+                            cp++;
+                        }
+                    }
+                    yaz_marc_add_subfield(mt, cp, len);
+                    if (!next)
+                        break;
+                    cp = next;
+                }
+            }
+        }
+        else
+        {
+            yaz_marc_cprintf(mt, "Ignoring line: %s", line);
+        }
+    }
+    wrbuf_destroy(wrbuf_line);
+    if (!header_created)
+        return -1;
+    return 0;
 }
 
 /*
  * Local variables:
  * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
  * indent-tabs-mode: nil
  * End:
  * vim: shiftwidth=4 tabstop=8 expandtab