X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fmod_grs_marc.c;fp=index%2Fmod_grs_marc.c;h=52b59e8c628a65dd9490c1435d92e826da4a2efb;hb=b3679b1ecd730266f70a62347f66e5964a2fa030;hp=0000000000000000000000000000000000000000;hpb=78b13a3ac6a79768fb609c14db2a8e0c94a9c4da;p=idzebra-moved-to-github.git diff --git a/index/mod_grs_marc.c b/index/mod_grs_marc.c new file mode 100644 index 0000000..52b59e8 --- /dev/null +++ b/index/mod_grs_marc.c @@ -0,0 +1,914 @@ +/* This file is part of the Zebra server. + Copyright (C) 1995-2008 Index Data + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ + +#include +#include +#include + +#include +#include +#include +#include "marcomp.h" +#include "inline.h" + +#define MARC_DEBUG 0 +#define MARCOMP_DEBUG 0 + +struct marc_info { + char type[256]; +}; + +static data1_node *grs_read_iso2709 (struct grs_read_info *p, int marc_xml) +{ + struct marc_info *mi = (struct marc_info*) p->clientData; + char buf[100000]; + int entry_p; + int record_length; + int indicator_length; + int identifier_length; + int base_address; + int end_of_directory; + int length_data_entry; + int length_starting; + int length_implementation; + int read_bytes; +#if MARC_DEBUG + FILE *outf = stdout; +#endif + data1_node *res_root, *res_top; + char *absynName; + data1_marctab *marctab; + + if (p->stream->readf(p->stream, buf, 5) != 5) + return NULL; + while (*buf < '0' || *buf > '9') + { + int i; + + yaz_log(YLOG_WARN, "MARC: Skipping bad byte %d (0x%02X)", + *buf & 0xff, *buf & 0xff); + for (i = 0; i<4; i++) + buf[i] = buf[i+1]; + + if (p->stream->readf(p->stream, buf+4, 1) != 1) + return NULL; + } + record_length = atoi_n (buf, 5); + if (record_length < 25) + { + yaz_log (YLOG_WARN, "MARC record length < 25, is %d", record_length); + return NULL; + } + + read_bytes = p->stream->readf(p->stream, buf+5, record_length-5); + if (read_bytes < record_length-5) + { + yaz_log (YLOG_WARN, "Couldn't read whole MARC record"); + return NULL; + } + /* skip until we meet a record separator */ + while (buf[record_length-1] != ISO2709_RS) + { + if (record_length > sizeof(buf)-2) + break; + read_bytes = p->stream->readf(p->stream, buf+record_length, 1); + if (read_bytes != 1) + break; + record_length++; + } + /* read one byte ahead to see if there is more ... */ + read_bytes = p->stream->readf(p->stream, buf+record_length, 1); + if (read_bytes == 1) + { + off_t cur_offset = p->stream->tellf(p->stream); + if (p->stream->endf) + { + off_t end_offset = cur_offset - 1; + p->stream->endf(p->stream, &end_offset); + } + } + + absynName = mi->type; + res_root = data1_mk_root (p->dh, p->mem, absynName); + if (!res_root) + { + yaz_log (YLOG_WARN, "cannot read MARC without an abstract syntax"); + return 0; + } + if (marc_xml) + { + data1_node *lead; + const char *attr[] = { "xmlns", "http://www.loc.gov/MARC21/slim", 0}; + + res_top = data1_mk_tag (p->dh, p->mem, "record", attr, res_root); + + lead = data1_mk_tag(p->dh, p->mem, "leader", 0, res_top); + data1_mk_text_n(p->dh, p->mem, buf, 24, lead); + } + else + res_top = data1_mk_tag (p->dh, p->mem, absynName, 0, res_root); + + if ((marctab = data1_absyn_getmarctab(p->dh, res_root))) + { + memcpy(marctab->leader, buf, 24); + memcpy(marctab->implementation_codes, buf+6, 4); + marctab->implementation_codes[4] = '\0'; + memcpy(marctab->user_systems, buf+17, 3); + marctab->user_systems[3] = '\0'; + } + + if (marctab && marctab->force_indicator_length >= 0) + indicator_length = marctab->force_indicator_length; + else + indicator_length = atoi_n (buf+10, 1); + if (marctab && marctab->force_identifier_length >= 0) + identifier_length = marctab->force_identifier_length; + else + identifier_length = atoi_n (buf+11, 1); + base_address = atoi_n (buf+12, 5); + + length_data_entry = atoi_n (buf+20, 1); + length_starting = atoi_n (buf+21, 1); + length_implementation = atoi_n (buf+22, 1); + + for (entry_p = 24; buf[entry_p] != ISO2709_FS; ) + { + int l = 3 + length_data_entry + length_starting; + if (entry_p + l >= record_length) + { + yaz_log(YLOG_WARN, "MARC: Directory offset %d: end of record.", + entry_p); + return 0; + } + /* check for digits in length info */ + while (--l >= 3) + if (!isdigit(*(const unsigned char *) (buf + entry_p+l))) + break; + if (l >= 3) + { + /* not all digits, so stop directory scan */ + yaz_log(YLOG_LOG, "MARC: Bad directory"); + break; + } + entry_p += 3 + length_data_entry + length_starting; + } + end_of_directory = entry_p; + if (base_address != entry_p+1) + { + yaz_log(YLOG_WARN, "MARC: Base address does not follow directory"); + } + for (entry_p = 24; entry_p != end_of_directory; ) + { + int data_length; + int data_offset; + int end_offset; + int i, i0; + char tag[4]; + data1_node *res; + data1_node *parent = res_top; + + memcpy (tag, buf+entry_p, 3); + entry_p += 3; + tag[3] = '\0'; + + if (marc_xml) + res = parent; + else + res = data1_mk_tag_n (p->dh, p->mem, tag, 3, 0 /* attr */, parent); + +#if MARC_DEBUG + fprintf (outf, "%s ", tag); +#endif + data_length = atoi_n (buf+entry_p, length_data_entry); + entry_p += length_data_entry; + data_offset = atoi_n (buf+entry_p, length_starting); + entry_p += length_starting; + i = data_offset + base_address; + end_offset = i+data_length-1; + + if (data_length <= 0 || data_offset < 0 || end_offset >= record_length) + { + yaz_log(YLOG_WARN, "MARC: Bad offsets in data. Skipping rest"); + break; + } + + if (memcmp (tag, "00", 2) && indicator_length) + { + /* generate indicator node */ + if (marc_xml) + { + const char *attr[10]; + int j; + + attr[0] = "tag"; + attr[1] = tag; + attr[2] = 0; + + res = data1_mk_tag(p->dh, p->mem, "datafield", attr, res); + + for (j = 0; jdh, p->mem, res, attr); + } + } + else + { +#if MARC_DEBUG + int j; +#endif + res = data1_mk_tag_n (p->dh, p->mem, + buf+i, indicator_length, 0 /* attr */, res); +#if MARC_DEBUG + for (j = 0; jdh, p->mem, "controlfield", attr, res); + } + } + parent = res; + /* traverse sub fields */ + i0 = i; + while (buf[i] != ISO2709_RS && buf[i] != ISO2709_FS && i < end_offset) + { + if (memcmp (tag, "00", 2) && identifier_length) + { + data1_node *res; + if (marc_xml) + { + int j; + const char *attr[3]; + char code[10]; + + for (j = 1; jdh, p->mem, "subfield", + attr, parent); + } + else + { + res = data1_mk_tag_n (p->dh, p->mem, + buf+i+1, identifier_length-1, + 0 /* attr */, parent); + } +#if MARC_DEBUG + fprintf (outf, " $"); + for (j = 1; jdh, p->mem, buf + i0, i - i0, res); + i0 = i; + } + else + { +#if MARC_DEBUG + fprintf (outf, "%c", buf[i]); +#endif + i++; + } + } + if (i > i0) + { + data1_mk_text_n (p->dh, p->mem, buf + i0, i - i0, parent); + } +#if MARC_DEBUG + fprintf (outf, "\n"); + if (i < end_offset) + fprintf (outf, "-- separator but not at end of field\n"); + if (buf[i] != ISO2709_RS && buf[i] != ISO2709_FS) + fprintf (outf, "-- no separator at end of field\n"); +#endif + } + return res_root; +} + +/* + * Locate some data under this node. This routine should handle variants + * prettily. + */ +static char *get_data(data1_node *n, int *len) +{ + char *r; + + while (n) + { + if (n->which == DATA1N_data) + { + int i; + *len = n->u.data.len; + + for (i = 0; i<*len; i++) + if (!d1_isspace(n->u.data.data[i])) + break; + while (*len && d1_isspace(n->u.data.data[*len - 1])) + (*len)--; + *len = *len - i; + if (*len > 0) + return n->u.data.data + i; + } + if (n->which == DATA1N_tag) + n = n->child; + else if (n->which == DATA1N_data) + n = n->next; + else + break; + } + r = ""; + *len = strlen(r); + return r; +} + +static data1_node *lookup_subfield(data1_node *node, const char *name) +{ + data1_node *p; + + for (p=node; p; p=p->next) + { + if (!yaz_matchstr(p->u.tag.tag, name)) + return p; + } + return 0; +} + +static inline_subfield *lookup_inline_subfield(inline_subfield *pisf, + const char *name) +{ + inline_subfield *p; + + for (p=pisf; p; p=p->next) + { + if (!yaz_matchstr(p->name, name)) + return p; + } + return 0; +} + +static inline_subfield *cat_inline_subfield(mc_subfield *psf, WRBUF buf, + inline_subfield *pisf) +{ + mc_subfield *p; + + for (p = psf; p && pisf; p = p->next) + { + if (p->which == MC_SF) + { + inline_subfield *found = lookup_inline_subfield(pisf, p->name); + + if (found) + { + if (strcmp(p->prefix, "_")) + { + wrbuf_puts(buf, " "); + wrbuf_puts(buf, p->prefix); + } + if (p->interval.start == -1) + { + wrbuf_puts(buf, found->data); + } + else + { + wrbuf_write(buf, found->data+p->interval.start, + p->interval.end-p->interval.start); + wrbuf_puts(buf, ""); + } + if (strcmp(p->suffix, "_")) + { + wrbuf_puts(buf, p->suffix); + wrbuf_puts(buf, " "); + } +#if MARCOMP_DEBUG + yaz_log(YLOG_LOG, "cat_inline_subfield(): add subfield $%s", found->name); +#endif + pisf = found->next; + } + } + else if (p->which == MC_SFVARIANT) + { + inline_subfield *next; + + do { + next = cat_inline_subfield(p->u.child, buf, pisf); + if (next == pisf) + break; + pisf = next; + } while (pisf); + } + else if (p->which == MC_SFGROUP) + { + mc_subfield *pp; + int found; + + for (pp = p->u.child, found = 0; pp; pp = pp->next) + { + if (!yaz_matchstr(pisf->name, p->name)) + { + found = 1; + break; + } + } + if (found) + { + wrbuf_puts(buf, " ("); + pisf = cat_inline_subfield(p->u.child, buf, pisf); + wrbuf_puts(buf, ") "); + } + } + } + return pisf; +} + +static void cat_inline_field(mc_field *pf, WRBUF buf, data1_node *subfield) +{ + if (!pf || !subfield) + return; + + for (;subfield;) + { + int len; + inline_field *pif=NULL; + data1_node *psubf; + + if (yaz_matchstr(subfield->u.tag.tag, "1")) + { + subfield = subfield->next; + continue; + } + + psubf = subfield; + pif = inline_mk_field(); + do + { + int i; + if ((i=inline_parse(pif, psubf->u.tag.tag, get_data(psubf, &len)))<0) + { + yaz_log(YLOG_WARN, "inline subfield ($%s): parse error", + psubf->u.tag.tag); + inline_destroy_field(pif); + return; + } + psubf = psubf->next; + } while (psubf && yaz_matchstr(psubf->u.tag.tag, "1")); + + subfield = psubf; + + if (pif && !yaz_matchstr(pif->name, pf->name)) + { + if (!pf->list && pif->list) + { + wrbuf_puts(buf, pif->list->data); + } + else + { + int ind1, ind2; + + /* + check indicators + */ + + ind1 = (pif->ind1[0] == ' ') ? '_':pif->ind1[0]; + ind2 = (pif->ind2[0] == ' ') ? '_':pif->ind2[0]; + + if (((pf->ind1[0] == '.') || (ind1 == pf->ind1[0])) && + ((pf->ind2[0] == '.') || (ind2 == pf->ind2[0]))) + { + cat_inline_subfield(pf->list, buf, pif->list); + + /* + add separator for inline fields + */ + if (wrbuf_len(buf)) + { + wrbuf_puts(buf, "\n"); + } + } + else + { + yaz_log(YLOG_WARN, "In-line field %s missed -- indicators do not match", pif->name); + } + } + } + inline_destroy_field(pif); + } +#if MARCOMP_DEBUG + yaz_log(YLOG_LOG, "cat_inline_field(): got buffer {%s}", buf); +#endif +} + +static data1_node *cat_subfield(mc_subfield *psf, WRBUF buf, + data1_node *subfield) +{ + mc_subfield *p; + + for (p = psf; p && subfield; p = p->next) + { + if (p->which == MC_SF) + { + data1_node *found = lookup_subfield(subfield, p->name); + + if (found) + { + int len; + + if (strcmp(p->prefix, "_")) + { + wrbuf_puts(buf, " "); + wrbuf_puts(buf, p->prefix); + } + + if (p->u.in_line) + { + cat_inline_field(p->u.in_line, buf, found); + } + else if (p->interval.start == -1) + { + wrbuf_puts(buf, get_data(found, &len)); + } + else + { + wrbuf_write(buf, get_data(found, &len)+p->interval.start, + p->interval.end-p->interval.start); + wrbuf_puts(buf, ""); + } + if (strcmp(p->suffix, "_")) + { + wrbuf_puts(buf, p->suffix); + wrbuf_puts(buf, " "); + } +#if MARCOMP_DEBUG + yaz_log(YLOG_LOG, "cat_subfield(): add subfield $%s", found->u.tag.tag); +#endif + subfield = found->next; + } + } + else if (p->which == MC_SFVARIANT) + { + data1_node *next; + do { + next = cat_subfield(p->u.child, buf, subfield); + if (next == subfield) + break; + subfield = next; + } while (subfield); + } + else if (p->which == MC_SFGROUP) + { + mc_subfield *pp; + int found; + + for (pp = p->u.child, found = 0; pp; pp = pp->next) + { + if (!yaz_matchstr(subfield->u.tag.tag, pp->name)) + { + found = 1; + break; + } + } + if (found) + { + wrbuf_puts(buf, " ("); + subfield = cat_subfield(p->u.child, buf, subfield); + wrbuf_puts(buf, ") "); + } + } + } + return subfield; +} + +static data1_node *cat_field(struct grs_read_info *p, mc_field *pf, + WRBUF buf, data1_node *field) +{ + data1_node *subfield; + int ind1, ind2; + + if (!pf || !field) + return 0; + + + if (yaz_matchstr(field->u.tag.tag, pf->name)) + return field->next; + + subfield = field->child; + + if (!subfield) + return field->next; + + /* + check subfield without indicators + */ + + if (!pf->list && subfield->which == DATA1N_data) + { + int len; + + if (pf->interval.start == -1) + { + wrbuf_puts(buf, get_data(field, &len)); + } + else + { + wrbuf_write(buf, get_data(field, &len)+pf->interval.start, + pf->interval.end-pf->interval.start); + wrbuf_puts(buf, ""); + } +#if MARCOMP_DEBUG + yaz_log(YLOG_LOG, "cat_field(): got buffer {%s}", buf); +#endif + return field->next; + } + + /* + check indicators + */ + + ind1 = (subfield->u.tag.tag[0] == ' ') ? '_':subfield->u.tag.tag[0]; + ind2 = (subfield->u.tag.tag[1] == ' ') ? '_':subfield->u.tag.tag[1]; + + if (!( + ((pf->ind1[0] == '.') || (ind1 == pf->ind1[0])) && + ((pf->ind2[0] == '.') || (ind2 == pf->ind2[0])) + )) + { +#if MARCOMP_DEBUG + yaz_log(YLOG_WARN, "Field %s missed -- does not match indicators", field->u.tag.tag); +#endif + return field->next; + } + + subfield = subfield->child; + + if (!subfield) + return field->next; + + cat_subfield(pf->list, buf, subfield); + +#if MARCOMP_DEBUG + yaz_log(YLOG_LOG, "cat_field(): got buffer {%s}", buf); +#endif + + return field->next; +} + +static int is_empty(char *s) +{ + char *p = s; + + for (p = s; *p; p++) + { + if (!isspace(*(unsigned char *)p)) + return 0; + } + return 1; +} + +static void parse_data1_tree(struct grs_read_info *p, const char *mc_stmnt, + data1_node *root) +{ + data1_marctab *marctab = data1_absyn_getmarctab(p->dh, root); + data1_node *top = root->child; + data1_node *field; + mc_context *c; + mc_field *pf; + WRBUF buf; + + c = mc_mk_context(mc_stmnt+3); + + if (!c) + return; + + pf = mc_getfield(c); + + if (!pf) + { + mc_destroy_context(c); + return; + } + buf = wrbuf_alloc(); +#if MARCOMP_DEBUG + yaz_log(YLOG_LOG, "parse_data1_tree(): statement -{%s}", mc_stmnt); +#endif + if (!yaz_matchstr(pf->name, "ldr")) + { + data1_node *new; +#if MARCOMP_DEBUG + yaz_log(YLOG_LOG,"parse_data1_tree(): try LEADER from {%d} to {%d} positions", + pf->interval.start, pf->interval.end); +#endif + if (marctab) + { + new = data1_mk_tag_n(p->dh, p->mem, mc_stmnt, strlen(mc_stmnt), 0, top); + data1_mk_text_n(p->dh, p->mem, marctab->leader+pf->interval.start, + pf->interval.end-pf->interval.start+1, new); + } + } + else + { + field=top->child; + + while(field) + { + if (!yaz_matchstr(field->u.tag.tag, pf->name)) + { + data1_node *new; + char *pb; +#if MARCOMP_DEBUG + yaz_log(YLOG_LOG, "parse_data1_tree(): try field {%s}", field->u.tag.tag); +#endif + wrbuf_rewind(buf); + wrbuf_puts(buf, ""); + + field = cat_field(p, pf, buf, field); + + wrbuf_cstr(buf); + pb = wrbuf_buf(buf); + for (pb = strtok(pb, "\n"); pb; pb = strtok(NULL, "\n")) + { + if (!is_empty(pb)) + { + new = data1_mk_tag_n(p->dh, p->mem, mc_stmnt, strlen(mc_stmnt), 0, top); + data1_mk_text_n(p->dh, p->mem, pb, strlen(pb), new); + } + } + } + else + { + field = field->next; + } + } + } + mc_destroy_field(pf); + mc_destroy_context(c); + wrbuf_destroy(buf); +} + +data1_node *grs_read_marcxml(struct grs_read_info *p) +{ + data1_node *root = grs_read_iso2709(p, 1); + data1_element *e; + + if (!root) + return 0; + + for (e = data1_absyn_getelements(p->dh, root); e; e=e->next) + { + data1_tag *tag = e->tag; + + if (tag && tag->which == DATA1T_string && + !yaz_matchstr(tag->value.string, "mc?")) + parse_data1_tree(p, tag->value.string, root); + } + return root; +} + +data1_node *grs_read_marc(struct grs_read_info *p) +{ + data1_node *root = grs_read_iso2709(p, 0); + data1_element *e; + + if (!root) + return 0; + + for (e = data1_absyn_getelements(p->dh, root); e; e=e->next) + { + data1_tag *tag = e->tag; + + if (tag && tag->which == DATA1T_string && + !yaz_matchstr(tag->value.string, "mc?")) + parse_data1_tree(p, tag->value.string, root); + } + return root; +} + +static void *init_marc(Res res, RecType rt) +{ + struct marc_info *p = xmalloc(sizeof(*p)); + strcpy(p->type, ""); + return p; +} + +static ZEBRA_RES config_marc(void *clientData, Res res, const char *args) +{ + struct marc_info *p = (struct marc_info*) clientData; + if (strlen(args) < sizeof(p->type)) + strcpy(p->type, args); + return ZEBRA_OK; +} + +static void destroy_marc(void *clientData) +{ + struct marc_info *p = (struct marc_info*) clientData; + xfree (p); +} + + +static int extract_marc(void *clientData, struct recExtractCtrl *ctrl) +{ + return zebra_grs_extract(clientData, ctrl, grs_read_marc); +} + +static int retrieve_marc(void *clientData, struct recRetrieveCtrl *ctrl) +{ + return zebra_grs_retrieve(clientData, ctrl, grs_read_marc); +} + +static struct recType marc_type = { + 0, + "grs.marc", + init_marc, + config_marc, + destroy_marc, + extract_marc, + retrieve_marc, +}; + +static int extract_marcxml(void *clientData, struct recExtractCtrl *ctrl) +{ + return zebra_grs_extract(clientData, ctrl, grs_read_marcxml); +} + +static int retrieve_marcxml(void *clientData, struct recRetrieveCtrl *ctrl) +{ + return zebra_grs_retrieve(clientData, ctrl, grs_read_marcxml); +} + +static struct recType marcxml_type = { + 0, + "grs.marcxml", + init_marc, + config_marc, + destroy_marc, + extract_marcxml, + retrieve_marcxml, +}; + +RecType +#ifdef IDZEBRA_STATIC_GRS_MARC +idzebra_filter_grs_marc +#else +idzebra_filter +#endif + +[] = { + &marc_type, + &marcxml_type, + 0, +}; + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ +