X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=blobdiff_plain;f=retrieval%2Fd1_read.c;h=281f7caf5494f9c845e58a50d81548eafc444b64;hp=223927947a1505131142cba03a71db97331969c8;hb=6866fce7b026f5ce8b75945f72a844d7b44abd85;hpb=f4be2b662141a53c7b9a8a9b410b549dfd2bc1fb diff --git a/retrieval/d1_read.c b/retrieval/d1_read.c index 2239279..281f7ca 100644 --- a/retrieval/d1_read.c +++ b/retrieval/d1_read.c @@ -1,155 +1,157 @@ /* - * Copyright (c) 1995-1998, Index Data. + * Copyright (c) 1995-2002, Index Data. * See the file LICENSE for details. * Sebastian Hammer, Adam Dickmeiss * - * $Log: d1_read.c,v $ - * Revision 1.21 1998-02-27 14:08:05 adam - * Added const to some char pointer arguments. - * Reworked data1_read_node so that it doesn't create a tree with - * pointers to original "SGML"-buffer. - * - * Revision 1.20 1998/02/11 11:53:35 adam - * Changed code so that it compiles as C++. - * - * Revision 1.19 1997/12/09 16:17:09 adam - * Fix bug regarding variants. Tags with prefix "var" was incorrectly - * interpreted as "start of variants". Now, only "var" indicates such - * start. - * Cleaned up data1_read_node so tag names and variant names are - * copied and not pointed to by the generated data1 tree. Data nodes - * still point to old buffer. - * - * Revision 1.18 1997/11/18 09:51:09 adam - * Removed element num_children from data1_node. Minor changes in - * data1 to Explain. - * - * Revision 1.17 1997/11/05 09:20:51 adam - * Minor change. - * - * Revision 1.16 1997/09/17 12:10:37 adam - * YAZ version 1.4. - * - * Revision 1.15 1997/09/05 09:50:57 adam - * Removed global data1_tabpath - uses data1_get_tabpath() instead. - * - * Revision 1.14 1997/05/14 06:54:04 adam - * C++ support. - * - * Revision 1.13 1996/10/29 13:35:38 adam - * Implemented data1_set_tabpath and data1_get_tabpath. - * - * Revision 1.12 1996/10/11 10:35:38 adam - * Fixed a bug that caused data1_read_node to core dump when no abstract - * syntax was defined in a "sgml"-record. - * - * Revision 1.11 1996/07/06 19:58:35 quinn - * System headerfiles gathered in yconfig - * - * Revision 1.10 1996/01/19 15:41:47 quinn - * Fixed uninitialized boolean. - * - * Revision 1.9 1996/01/17 14:52:47 adam - * Changed prototype for reader function parsed to data1_read_record. - * - * Revision 1.8 1995/12/15 16:20:41 quinn - * Added formatted text. - * - * Revision 1.7 1995/12/13 13:44:32 quinn - * Modified Data1-system to use nmem - * - * Revision 1.6 1995/12/12 16:37:08 quinn - * Added destroy element to data1_node. - * - * Revision 1.5 1995/12/11 15:22:37 quinn - * Added last_child field to the node. - * Rewrote schema-mapping. - * - * Revision 1.4 1995/11/13 09:27:36 quinn - * Fiddling with the variant stuff. - * - * Revision 1.3 1995/11/01 16:34:57 quinn - * Making data1 look for tables in data1_tabpath - * - * Revision 1.2 1995/11/01 13:54:48 quinn - * Minor adjustments - * - * Revision 1.1 1995/11/01 11:56:09 quinn - * Added Retrieval (data management) functions en masse. - * - * Revision 1.14 1995/10/30 12:40:55 quinn - * Fixed a couple of bugs. - * - * Revision 1.13 1995/10/25 16:00:47 quinn - * USMARC support is now almost operational - * - * Revision 1.12 1995/10/16 14:02:55 quinn - * Changes to support element set names and espec1 - * - * Revision 1.11 1995/10/13 16:05:08 quinn - * Adding Espec1-processing - * - * Revision 1.10 1995/10/11 14:53:44 quinn - * Work on variants. - * - * Revision 1.9 1995/10/06 16:56:50 quinn - * Fixed ranked result. - * - * Revision 1.8 1995/10/06 16:44:13 quinn - * Work on attribute set mapping, etc. - * - * Revision 1.7 1995/10/06 12:58:35 quinn - * SUTRS support - * - * Revision 1.6 1995/10/04 09:29:49 quinn - * Adjustments to support USGS test data - * - * Revision 1.5 1995/10/03 17:56:43 quinn - * Fixing GRS code. - * - * Revision 1.4 1995/10/02 15:53:19 quinn - * Work - * - * Revision 1.3 1995/10/02 14:55:21 quinn - * *** empty log message *** - * - * Revision 1.2 1995/09/14 15:18:13 quinn - * Work - * - * Revision 1.1 1995/09/12 11:24:30 quinn - * Beginning to add code for structured records. - * - * + * $Id: d1_read.c,v 1.54 2002-10-08 23:00:09 adam Exp $ */ #include -#include #include #include -#include -#include -#include +#include +#include +#include +#include +data1_node *data1_get_root_tag (data1_handle dh, data1_node *n) +{ + if (!n) + return 0; + if (data1_is_xmlmode(dh)) + { + n = n->child; + while (n && n->which != DATA1N_tag) + n = n->next; + } + return n; +} + /* * get the tag which is the immediate parent of this node (this may mean * traversing intermediate things like variants and stuff. */ data1_node *get_parent_tag (data1_handle dh, data1_node *n) { - for (; n && n->which != DATA1N_root; n = n->parent) - if (n->which == DATA1N_tag) - return n; + if (data1_is_xmlmode(dh)) + { + for (; n && n->which != DATA1N_root; n = n->parent) + if (n->which == DATA1N_tag && n->parent && + n->parent->which != DATA1N_root) + return n; + } + else + { + for (; n && n->which != DATA1N_root; n = n->parent) + if (n->which == DATA1N_tag) + return n; + } return 0; } data1_node *data1_mk_node (data1_handle dh, NMEM m) { - data1_node *r; + return data1_mk_node2 (dh, m, DATA1N_root, 0); +} + +data1_node *data1_mk_node_type (data1_handle dh, NMEM m, int type) +{ + return data1_mk_node2 (dh, m, type, 0); +} + +static void data1_init_node (data1_handle dh, data1_node *r, int type) +{ + r->which = type; + switch(type) + { + case DATA1N_tag: + r->u.tag.tag = 0; + r->u.tag.element = 0; + r->u.tag.no_data_requested = 0; + r->u.tag.node_selected = 0; + r->u.tag.make_variantlist = 0; + r->u.tag.get_bytes = -1; + r->u.tag.attributes = 0; + break; + case DATA1N_root: + r->u.root.type = 0; + r->u.root.absyn = 0; + break; + case DATA1N_data: + r->u.data.data = 0; + r->u.data.len = 0; + r->u.data.what = 0; + r->u.data.formatted_text = 0; + break; + case DATA1N_comment: + r->u.data.data = 0; + r->u.data.len = 0; + r->u.data.what = 0; + r->u.data.formatted_text = 1; + break; + case DATA1N_variant: + r->u.variant.type = 0; + r->u.variant.value = 0; + break; + case DATA1N_preprocess: + r->u.preprocess.target = 0; + r->u.preprocess.attributes = 0; + break; + default: + logf (LOG_WARN, "data_mk_node_type. bad type = %d\n", type); + } +} + +data1_node *data1_append_node (data1_handle dh, NMEM m, int type, + data1_node *parent) +{ + data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r)); + r->next = r->child = r->last_child = 0; + r->destroy = 0; + + if (!parent) + r->root = r; + else + { + r->root = parent->root; + r->parent = parent; + if (!parent->child) + parent->child = parent->last_child = r; + else + parent->last_child->next = r; + parent->last_child = r; + } + data1_init_node(dh, r, type); + return r; +} + +data1_node *data1_mk_node2 (data1_handle dh, NMEM m, int type, + data1_node *parent) +{ + return data1_append_node (dh, m, type, parent); +} - r = (data1_node *)nmem_malloc(m, sizeof(*r)); - r->next = r->child = r->last_child = r->parent = 0; +data1_node *data1_insert_node (data1_handle dh, NMEM m, int type, + data1_node *parent) +{ + data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r)); + r->next = r->child = r->last_child = 0; r->destroy = 0; + + if (!parent) + r->root = r; + else + { + r->root = parent->root; + r->parent = parent; + if (!parent->child) + parent->last_child = r; + else + r->next = parent->child; + parent->child = r; + } + data1_init_node(dh, r, type); return r; } @@ -167,76 +169,257 @@ void data1_free_tree (data1_handle dh, data1_node *t) (*t->destroy)(t); } -char *data1_insert_string (data1_handle dh, data1_node *res, - NMEM m, const char *str) +data1_node *data1_mk_root (data1_handle dh, NMEM nmem, const char *name) +{ + data1_absyn *absyn = data1_get_absyn (dh, name); + data1_node *res; + if (!absyn) + { + yaz_log(LOG_WARN, "Unable to acquire abstract syntax " "for '%s'", + name); + /* It's now OK for a record not to have an absyn */ + } + res = data1_mk_node2 (dh, nmem, DATA1N_root, 0); + res->u.root.type = data1_insert_string (dh, res, nmem, name); + res->u.root.absyn = absyn; + return res; +} + +void data1_set_root(data1_handle dh, data1_node *res, + NMEM nmem, const char *name) { - int len = strlen(str); + data1_absyn *absyn = data1_get_absyn (dh, name); - if (len >= DATA1_LOCALDATA) - return nmem_strdup (m, str); - else + res->u.root.type = data1_insert_string (dh, res, nmem, name); + res->u.root.absyn = absyn; +} + +data1_node *data1_mk_preprocess (data1_handle dh, NMEM nmem, + const char *target, + const char **attr, data1_node *at) +{ + return data1_mk_preprocess_n (dh, nmem, target, strlen(target), + attr, at); +} + +data1_node *data1_mk_preprocess_n (data1_handle dh, NMEM nmem, + const char *target, size_t len, + const char **attr, data1_node *at) +{ + data1_xattr **p; + data1_node *res = data1_mk_node2 (dh, nmem, DATA1N_preprocess, at); + res->u.preprocess.target = data1_insert_string_n (dh, res, nmem, + target, len); + + p = &res->u.preprocess.attributes; + while (attr && *attr) { - strcpy (res->lbuf, str); - return res->lbuf; + *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p)); + (*p)->name = nmem_strdup (nmem, *attr++); + (*p)->value = nmem_strdup (nmem, *attr++); + (*p)->what = DATA1I_text; + + p = &(*p)->next; } + *p = 0; + return res; } +data1_node *data1_mk_tag_n (data1_handle dh, NMEM nmem, + const char *tag, size_t len, const char **attr, + data1_node *at) +{ + data1_node *partag = get_parent_tag(dh, at); + data1_node *res = data1_mk_node2 (dh, nmem, DATA1N_tag, at); + data1_xattr **p; + data1_element *e = 0; + + res->u.tag.tag = data1_insert_string_n (dh, res, nmem, tag, len); + + if (!partag) /* top tag? */ + e = data1_getelementbytagname (dh, at->root->u.root.absyn, + 0 /* index as local */, + res->u.tag.tag); + else + { + /* only set element for known tags */ + e = partag->u.tag.element; + if (e) + e = data1_getelementbytagname (dh, at->root->u.root.absyn, + e, res->u.tag.tag); + } + res->u.tag.element = e; + p = &res->u.tag.attributes; + while (attr && *attr) + { + *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p)); + (*p)->name = nmem_strdup (nmem, *attr++); + (*p)->value = nmem_strdup (nmem, *attr++); + (*p)->what = DATA1I_text; + p = &(*p)->next; + } + *p = 0; + return res; +} -data1_node *data1_add_insert_taggeddata(data1_handle dh, data1_node *root, - data1_node *at, - const char *tagname, NMEM m, - int first_flag) +void data1_tag_add_attr (data1_handle dh, NMEM nmem, + data1_node *res, const char **attr) { - data1_node *partag = get_parent_tag (dh, at); - data1_node *tagn = data1_mk_node (dh, m); - data1_element *e = NULL; - data1_node *datn; + data1_xattr **p; - tagn->which = DATA1N_tag; - tagn->line = -1; - tagn->u.tag.tag = data1_insert_string (dh, tagn, m, tagname); - tagn->u.tag.node_selected = 0; - tagn->u.tag.make_variantlist = 0; - tagn->u.tag.no_data_requested = 0; - tagn->u.tag.get_bytes = -1; + if (res->which != DATA1N_tag) + return; - if (partag) - e = partag->u.tag.element; - tagn->u.tag.element = - data1_getelementbytagname (dh, root->u.root.absyn, e, tagname); - tagn->last_child = tagn->child = datn = data1_mk_node (dh, m); - datn->parent = tagn; - datn->root = root; - datn->which = DATA1N_data; - datn->u.data.formatted_text = 0; - tagn->parent = at; - - if (first_flag) + p = &res->u.tag.attributes; + while (*p) + p = &(*p)->next; + + while (attr && *attr) { - tagn->next = at->child; - if (!tagn->next) - at->last_child = tagn; - at->child = tagn; + *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p)); + (*p)->name = nmem_strdup (nmem, *attr++); + (*p)->value = nmem_strdup (nmem, *attr++); + (*p)->what = DATA1I_text; + p = &(*p)->next; } - else + *p = 0; +} + +data1_node *data1_mk_tag (data1_handle dh, NMEM nmem, + const char *tag, const char **attr, data1_node *at) +{ + return data1_mk_tag_n (dh, nmem, tag, strlen(tag), attr, at); +} + +data1_node *data1_search_tag (data1_handle dh, data1_node *n, + const char *tag) +{ + if (*tag == '/') { - if (!at->child) - at->child = tagn; - else + n = data1_get_root_tag (dh, n); + if (n) + n = n->child; + tag++; + } + for (; n; n = n->next) + if (n->which == DATA1N_tag && n->u.tag.tag && + !yaz_matchstr (tag, n->u.tag.tag)) { - assert (at->last_child); - at->last_child->next = tagn; + return n; } - at->last_child = tagn; + return 0; +} + +data1_node *data1_mk_tag_uni (data1_handle dh, NMEM nmem, + const char *tag, data1_node *at) +{ + data1_node *node = data1_search_tag (dh, at->child, tag); + if (!node) + node = data1_mk_tag (dh, nmem, tag, 0 /* attr */, at); + else + node->child = node->last_child = 0; + return node; +} + +data1_node *data1_mk_text_n (data1_handle dh, NMEM mem, + const char *buf, size_t len, data1_node *parent) +{ + data1_node *res = data1_mk_node2 (dh, mem, DATA1N_data, parent); + res->u.data.what = DATA1I_text; + res->u.data.len = len; + + res->u.data.data = data1_insert_string_n (dh, res, mem, buf, len); + return res; +} + +data1_node *data1_mk_text_nf (data1_handle dh, NMEM mem, + const char *buf, size_t len, data1_node *parent) +{ + data1_node *res = data1_mk_text_n (dh, mem, buf, len, parent); + res->u.data.formatted_text = 1; + return res; +} + +data1_node *data1_mk_text (data1_handle dh, NMEM mem, + const char *buf, data1_node *parent) +{ + return data1_mk_text_n (dh, mem, buf, strlen(buf), parent); +} + +data1_node *data1_mk_comment_n (data1_handle dh, NMEM mem, + const char *buf, size_t len, + data1_node *parent) +{ + data1_node *res = data1_mk_node2 (dh, mem, DATA1N_comment, parent); + res->u.data.what = DATA1I_text; + res->u.data.len = len; + + res->u.data.data = data1_insert_string_n (dh, res, mem, buf, len); + return res; +} + +data1_node *data1_mk_comment (data1_handle dh, NMEM mem, + const char *buf, data1_node *parent) +{ + return data1_mk_comment_n (dh, mem, buf, strlen(buf), parent); +} + +char *data1_insert_string_n (data1_handle dh, data1_node *res, + NMEM m, const char *str, size_t len) +{ + char *b; + if (len >= DATA1_LOCALDATA) + b = (char *) nmem_malloc (m, len+1); + else + b = res->lbuf; + memcpy (b, str, len); + b[len] = 0; + return b; +} + +char *data1_insert_string (data1_handle dh, data1_node *res, + NMEM m, const char *str) +{ + return data1_insert_string_n (dh, res, m, str, strlen(str)); +} + +static data1_node *data1_add_insert_taggeddata(data1_handle dh, + data1_node *at, + const char *tagname, NMEM m, + int local_allowed, + int insert_mode) +{ + data1_node *root = at->root; + data1_node *partag = get_parent_tag (dh, at); + data1_element *e = NULL; + data1_node *datn = 0; + data1_node *tagn = 0; + + if (!partag) + e = data1_getelementbytagname (dh, root->u.root.absyn, 0, tagname); + else + { + e = partag->u.tag.element; + if (e) + e = data1_getelementbytagname (dh, root->u.root.absyn, e, tagname); + } + if (local_allowed || e) + { + if (insert_mode) + tagn = data1_insert_node (dh, m, DATA1N_tag, at); + else + tagn = data1_append_node (dh, m, DATA1N_tag, at); + tagn->u.tag.tag = data1_insert_string (dh, tagn, m, tagname); + tagn->u.tag.element = e; + datn = data1_mk_node2 (dh, m, DATA1N_data, tagn); } return datn; } -data1_node *data1_add_taggeddata(data1_handle dh, data1_node *root, - data1_node *at, - const char *tagname, NMEM m) +data1_node *data1_mk_tag_data(data1_handle dh, data1_node *at, + const char *tagname, NMEM m) { - return data1_add_insert_taggeddata (dh, root, at, tagname, m, 0); + return data1_add_insert_taggeddata (dh, at, tagname, m, 1, 0); } @@ -245,222 +428,505 @@ data1_node *data1_add_taggeddata(data1_handle dh, data1_node *root, * which should be root or tag itself). Returns pointer to the data node, * which can then be modified. */ -data1_node *data1_insert_taggeddata(data1_handle dh, data1_node *root, - data1_node *at, - const char *tagname, NMEM m) +data1_node *data1_mk_tag_data_wd(data1_handle dh, data1_node *at, + const char *tagname, NMEM m) { - return data1_add_insert_taggeddata (dh, root, at, tagname, m, 1); + return data1_add_insert_taggeddata (dh, at, tagname, m, 0, 1); } -/* - * Ugh. Sometimes functions just grow and grow on you. This one reads a - * 'node' and its children. - */ -data1_node *data1_read_node (data1_handle dh, const char **buf, - data1_node *parent, int *line, - data1_absyn *absyn, NMEM m) +data1_node *data1_insert_taggeddata (data1_handle dh, data1_node *root, + data1_node *at, const char *tagname, + NMEM m) { - data1_node *res; + return data1_add_insert_taggeddata (dh, at, tagname, m, 0, 1); +} + +data1_node *data1_add_taggeddata (data1_handle dh, data1_node *root, + data1_node *at, const char *tagname, + NMEM m) +{ + return data1_add_insert_taggeddata (dh, at, tagname, m, 1, 0); +} + +data1_node *data1_mk_tag_data_int (data1_handle dh, data1_node *at, + const char *tag, int num, + NMEM nmem) +{ + data1_node *node_data; + + node_data = data1_mk_tag_data (dh, at, tag, nmem); + if (!node_data) + return 0; + node_data->u.data.what = DATA1I_num; + node_data->u.data.data = node_data->lbuf; + sprintf (node_data->u.data.data, "%d", num); + node_data->u.data.len = strlen (node_data->u.data.data); + return node_data; +} - while (**buf && isspace(**buf)) +data1_node *data1_mk_tag_data_oid (data1_handle dh, data1_node *at, + const char *tag, Odr_oid *oid, + NMEM nmem) +{ + data1_node *node_data; + char str[128], *p = str; + Odr_oid *ii; + + node_data = data1_mk_tag_data (dh, at, tag, nmem); + if (!node_data) + return 0; + + for (ii = oid; *ii >= 0; ii++) { - if (**buf == '\n') - (*line)++; - (*buf)++; + if (ii != oid) + *p++ = '.'; + sprintf (p, "%d", *ii); + p += strlen (p); } - if (!**buf) + node_data->u.data.what = DATA1I_oid; + node_data->u.data.len = strlen (str); + node_data->u.data.data = data1_insert_string (dh, node_data, nmem, str); + return node_data; +} + + +data1_node *data1_mk_tag_data_text (data1_handle dh, data1_node *at, + const char *tag, const char *str, + NMEM nmem) +{ + data1_node *node_data; + + node_data = data1_mk_tag_data (dh, at, tag, nmem); + if (!node_data) return 0; + node_data->u.data.what = DATA1I_text; + node_data->u.data.len = strlen (str); + node_data->u.data.data = data1_insert_string (dh, node_data, nmem, str); + return node_data; +} + + +data1_node *data1_mk_tag_data_text_uni (data1_handle dh, data1_node *at, + const char *tag, const char *str, + NMEM nmem) +{ + data1_node *node = data1_search_tag (dh, at->child, tag); + if (!node) + return data1_mk_tag_data_text (dh, at, tag, str, nmem); + else + { + data1_node *node_data = node->child; + node_data->u.data.what = DATA1I_text; + node_data->u.data.len = strlen (str); + node_data->u.data.data = data1_insert_string (dh, node_data, + nmem, str); + node_data->child = node_data->last_child = 0; + return node_data; + } +} + +static int ampr (int (*get_byte)(void *fh), void *fh, int *amp) +{ +#if 1 + int c = (*get_byte)(fh); + *amp = 0; + return c; +#else + int c = (*get_byte)(fh); + *amp = 0; + if (c == '&') + { + char ent[20]; + int i = 0; + + while (1) + { + c = (*get_byte)(fh); + if (c == ';') + { + ent[i] = 0; + + c = ' '; + if (!strcmp (ent, "quot")) + c = '"'; + if (!strcmp (ent, "apos")) + c = '\''; + if (!strcmp (ent, "gt")) + c = '>'; + if (!strcmp (ent, "lt")) + c = '<'; + if (!strcmp (ent, "amp")) + c = '&'; + *amp = 1; + break; + } + else if (c == 0 || d1_isspace(c)) + break; + if (i < 19) + ent[i++] = c; + } + } + return c; +#endif +} - if (**buf == '<') /* beginning of tag */ +data1_xattr *data1_read_xattr (data1_handle dh, NMEM m, + int (*get_byte)(void *fh), void *fh, + WRBUF wrbuf, int *ch, int *amp) +{ + data1_xattr *p_first = 0; + data1_xattr **pp = &p_first; + int c = *ch; + for (;;) { - char tag[64]; - char args[256]; - int i; - const char *t = (*buf) + 1; - data1_node **pp; - data1_element *elem = 0; + data1_xattr *p; + int len; + while (*amp || (c && d1_isspace(c))) + c = ampr (get_byte, fh, amp); + if (*amp == 0 && (c == 0 || c == '>' || c == '/')) + break; + *pp = p = (data1_xattr *) nmem_malloc (m, sizeof(*p)); + p->next = 0; + pp = &p->next; + p->value = 0; + p->what = DATA1I_xmltext; - for (i = 0; *t && *t != '>' && !isspace(*t); t++) - if (i < (sizeof(tag)-1)) - tag[i++] = *t; - tag[i] = '\0'; - while (isspace(*t)) - t++; - for (i = 0; *t && *t != '>'; t++) - if (i < (sizeof(args)-1)) - args[i++] = *t; - args[i] = '\0'; - if (*t != '>' && !isspace(*t)) + wrbuf_rewind(wrbuf); + while (c && c != '=' && c != '>' && c != '/' && !d1_isspace(c)) { - logf(LOG_WARN, "d1: %d: Malformed tag", *line); - return 0; + wrbuf_putc (wrbuf, c); + c = ampr (get_byte, fh, amp); } - /* - * if end-tag, see if we terminate parent. If so, consume and return. - * Else, return. - */ - if (*tag == '/') - { - if (parent && (!*(tag +1) || - (parent->which == DATA1N_root && - !strcmp(tag + 1,parent->u.root.type)) || - (parent->which == DATA1N_tag && - !strcmp(tag + 1, parent->u.tag.tag)))) - *buf = t + 1; - return 0; - } - if (!absyn) /* parent node - what are we? */ + wrbuf_putc (wrbuf, '\0'); + len = wrbuf_len(wrbuf); + p->name = (char*) nmem_malloc (m, len); + strcpy (p->name, wrbuf_buf(wrbuf)); + if (c == '=') { - if (!(absyn = data1_get_absyn (dh, tag))) + c = ampr (get_byte, fh, amp); + if (*amp == 0 && c == '"') { - logf(LOG_WARN, "Unable to acquire abstract syntax for '%s'", - tag); - return 0; + c = ampr (get_byte, fh, amp); + wrbuf_rewind(wrbuf); + while (*amp || (c && c != '"')) + { + wrbuf_putc (wrbuf, c); + c = ampr (get_byte, fh, amp); + } + if (c) + c = ampr (get_byte, fh, amp); } - res = data1_mk_node (dh, m); - res->which = DATA1N_root; - res->u.root.type = data1_insert_string (dh, res, m, tag); - res->u.root.absyn = absyn; - res->root = res; - *buf = t + 1; + else if (*amp == 0 && c == '\'') + { + c = ampr (get_byte, fh, amp); + wrbuf_rewind(wrbuf); + while (*amp || (c && c != '\'')) + { + wrbuf_putc (wrbuf, c); + c = ampr (get_byte, fh, amp); + } + if (c) + c = ampr (get_byte, fh, amp); + } + else + { + wrbuf_rewind(wrbuf); + while (*amp || (c && c != '>' && c != '/')) + { + wrbuf_putc (wrbuf, c); + c = ampr (get_byte, fh, amp); + } + } + wrbuf_putc (wrbuf, '\0'); + len = wrbuf_len(wrbuf); + p->value = (char*) nmem_malloc (m, len); + strcpy (p->value, wrbuf_buf(wrbuf)); } - else if (!strcmp(tag, "var")) + } + *ch = c; + return p_first; +} + +/* + * Ugh. Sometimes functions just grow and grow on you. This one reads a + * 'node' and its children. + */ +data1_node *data1_read_nodex (data1_handle dh, NMEM m, + int (*get_byte)(void *fh), void *fh, WRBUF wrbuf) +{ + data1_node *d1_stack[256]; + data1_node *res; + int c, amp; + int level = 0; + int line = 1; + + d1_stack[level] = 0; + c = ampr (get_byte, fh, &); + while (c != '\0') + { + data1_node *parent = level ? d1_stack[level-1] : 0; + + if (amp == 0 && c == '<') /* beginning of tag */ { - char tclass[DATA1_MAX_SYMBOL], type[DATA1_MAX_SYMBOL]; - data1_vartype *tp; - int val_offset; - data1_node *p; + data1_xattr *xattr; + + char tag[64]; + char args[256]; + int null_tag = 0; + int end_tag = 0; + size_t i = 0; - if (sscanf(args, "%s %s %n", tclass, type, &val_offset) != 2) + c = ampr (get_byte, fh, &); + if (amp == 0 && c == '/') { - logf(LOG_WARN, "Malformed variant triple at '%s'", tag); - return 0; + end_tag = 1; + c = ampr (get_byte, fh, &); } - if (!(tp = - data1_getvartypebyct(dh, parent->root->u.root.absyn->varset, - tclass, type))) - return 0; - - /* - * If we're the first variant in this group, create a parent var, - * and insert it before the current variant. - */ - if (parent->which != DATA1N_variant) + else if (amp == 0 && c == '!') + { + int c0, amp0; + + wrbuf_rewind(wrbuf); + + c0 = ampr (get_byte, fh, &0); + if (amp0 == 0 && c0 == '\0') + break; + c = ampr (get_byte, fh, &); + + if (amp0 == 0 && c0 == '-' && amp == 0 && c == '-') + { + /* COMMENT: */ + int no_dash = 0; + + c = ampr (get_byte, fh, &); + while (amp || c) + { + if (amp == 0 && c == '-') + no_dash++; + else if (amp == 0 && c == '>' && no_dash >= 2) + { + if (level > 0) + d1_stack[level] = + data1_mk_comment_n ( + dh, m, + wrbuf_buf(wrbuf), wrbuf_len(wrbuf)-2, + d1_stack[level-1]); + c = ampr (get_byte, fh, &); /* skip > */ + break; + } + else + no_dash = 0; + wrbuf_putc (wrbuf, c); + c = ampr (get_byte, fh, &); + } + continue; + } + else + { /* DIRECTIVE: */ + + int blevel = 0; + while (amp || c) + { + if (amp == 0 && c == '>' && blevel == 0) + { + c = ampr (get_byte, fh, &); + break; + } + if (amp == 0 && c == '[') + blevel++; + if (amp == 0 && c == ']' && blevel > 0) + blevel--; + c = ampr (get_byte, fh, &); + } + continue; + } + } + while (amp || (c && c != '>' && c != '/' && !d1_isspace(c))) + { + if (i < (sizeof(tag)-1)) + tag[i++] = c; + c = ampr (get_byte, fh, &); + } + tag[i] = '\0'; + xattr = data1_read_xattr (dh, m, get_byte, fh, wrbuf, &c, &); + args[0] = '\0'; + if (amp == 0 && c == '/') + { /* or */ + null_tag = 1; + c = ampr (get_byte, fh, &); + } + if (amp || c != '>') { - res = data1_mk_node (dh, m); - res->which = DATA1N_variant; - res->u.variant.type = 0; - res->u.variant.value = 0; - res->root = parent->root; + yaz_log(LOG_WARN, "d1: %d: Malformed tag", line); + return 0; } else + c = ampr (get_byte, fh, &); + + /* End tag? */ + if (end_tag) { + if (*tag == '\0') + --level; /* */ + else + { /* */ + int i = level; + while (i > 0) + { + parent = d1_stack[--i]; + if ((parent->which == DATA1N_root && + !strcmp(tag, parent->u.root.type)) || + (parent->which == DATA1N_tag && + !strcmp(tag, parent->u.tag.tag))) + { + level = i; + break; + } + } + if (i != level) + { + yaz_log (LOG_WARN, "%d: no begin tag for %s", + line, tag); + break; + } + } + if (data1_is_xmlmode(dh)) + { + if (level <= 1) + return d1_stack[0]; + } + else + { + if (level <= 0) + return d1_stack[0]; + } + continue; + } + else if (!strcmp(tag, "var")) + { + char tclass[DATA1_MAX_SYMBOL], type[DATA1_MAX_SYMBOL]; + data1_vartype *tp; + int val_offset; + + if (sscanf(args, "%s %s %n", tclass, type, &val_offset) != 2) + { + yaz_log(LOG_WARN, "Malformed variant triple at '%s'", tag); + continue; + } + if (!(tp = + data1_getvartypebyct(dh, + parent->root->u.root.absyn->varset, + tclass, type))) + continue; /* - * now determine if one of our ancestor triples is of same type. - * If so, we break here. This will make the parser unwind until - * we become a sibling (alternate variant) to the aforementioned - * triple. It stinks that we re-parse these tags on every - * iteration of this. This is a function in need of a rewrite. + * If we're the first variant in this group, create a parent + * variant, and insert it before the current variant. */ - for (p = parent; p->which == DATA1N_variant; p = p->parent) - if (p->u.variant.type == tp) - return 0; - res = data1_mk_node (dh, m); - res->which = DATA1N_variant; - res->root = parent->root; - res->u.variant.type = tp; - res->u.variant.value = - data1_insert_string (dh, res, m, args + val_offset); - *buf = t + 1; + if (parent->which != DATA1N_variant) + { + res = data1_mk_node2 (dh, m, DATA1N_variant, parent); + } + else + { + /* + * now determine if one of our ancestor triples is of + * same type. If so, we break here. + */ + int i; + for (i = level-1; d1_stack[i]->which==DATA1N_variant; --i) + if (d1_stack[i]->u.variant.type == tp) + { + level = i; + break; + } + res = data1_mk_node2 (dh, m, DATA1N_variant, parent); + res->u.variant.type = tp; + res->u.variant.value = + data1_insert_string (dh, res, m, args + val_offset); + } } - } - else /* tag.. acquire our element in the abstract syntax */ - { - data1_node *partag = get_parent_tag (dh, parent); - data1_element *e = 0; - int localtag = 0; + else + { + + /* tag .. acquire our element in the abstract syntax */ + if (level == 0) + { + parent = data1_mk_root (dh, m, tag); + res = d1_stack[level] = parent; - if (parent->which == DATA1N_variant) - return 0; - if (partag) - if (!(e = partag->u.tag.element)) - localtag = 1; /* our parent is a local tag */ - - elem = data1_getelementbytagname(dh, absyn, e, tag); - res = data1_mk_node (dh, m); - res->which = DATA1N_tag; - res->u.tag.tag = data1_insert_string (dh, res, m, tag); - res->u.tag.element = elem; - res->u.tag.node_selected = 0; - res->u.tag.make_variantlist = 0; - res->u.tag.no_data_requested = 0; - res->u.tag.get_bytes = -1; - res->root = parent->root; - *buf = t + 1; + if (data1_is_xmlmode(dh)) + { + level++; + res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent); + res->u.tag.attributes = xattr; + } + } + else + { + res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent); + res->u.tag.attributes = xattr; + } + } + d1_stack[level] = res; + d1_stack[level+1] = 0; + if (level < 250 && !null_tag) + ++level; } - - res->parent = parent; - pp = &res->child; - /* - * Read child nodes. - */ - while ((*pp = data1_read_node(dh, buf, res, line, absyn, m))) - { - res->last_child = *pp; - pp = &(*pp)->next; - } - } - else /* != '<'... this is a body of text */ - { - const char *src; - char *dst; - int len, prev_char = 0; - - if (!parent) - return 0; - - res = data1_mk_node(dh, m); - res->parent = parent; - res->which = DATA1N_data; - res->u.data.what = DATA1I_text; - res->u.data.formatted_text = 0; - res->root = parent->root; - - /* determine length of "data" */ - src = strchr (*buf, '<'); - if (src) - len = src - *buf; - else - len = strlen (*buf); - - /* use local buffer of nmem if too large */ - if (len >= DATA1_LOCALDATA) - res->u.data.data = nmem_malloc (m, len); - else - res->u.data.data = res->lbuf; - - /* read "data" and transfer while removing white space */ - dst = res->u.data.data; - for (src = *buf; --len >= 0; src++) + else /* != '<'... this is a body of text */ { - if (*src == '\n') - (*line)++; - if (isspace (*src)) - prev_char = ' '; - else + int len; + + if (level == 0) { - if (prev_char) - { - *dst++ = prev_char; - prev_char = 0; - } - *dst++ = *src; + c = ampr (get_byte, fh, &); + continue; } + res = data1_mk_node2 (dh, m, DATA1N_data, parent); + res->u.data.what = DATA1I_xmltext; + res->u.data.formatted_text = 0; + d1_stack[level] = res; + + wrbuf_rewind(wrbuf); + + while (amp || (c && c != '<')) + { + wrbuf_putc (wrbuf, c); + c = ampr (get_byte, fh, &); + } + len = wrbuf_len(wrbuf); + + /* use local buffer of nmem if too large */ + if (len >= DATA1_LOCALDATA) + res->u.data.data = (char*) nmem_malloc (m, len); + else + res->u.data.data = res->lbuf; + + if (len) + memcpy (res->u.data.data, wrbuf_buf(wrbuf), len); + else + res->u.data.data = 0; + res->u.data.len = len; } - *buf = src; - res->u.data.len = dst - res->u.data.data; } - return res; + return 0; +} + +int getc_mem (void *fh) +{ + const char **p = (const char **) fh; + if (**p) + return *(*p)++; + return 0; +} + +data1_node *data1_read_node (data1_handle dh, const char **buf, NMEM m) +{ + WRBUF wrbuf = wrbuf_alloc(); + data1_node *node; + + node = data1_read_nodex(dh, m, getc_mem, (void *) (buf), wrbuf); + wrbuf_free (wrbuf, 1); + return node; } /* @@ -474,21 +940,21 @@ data1_node *data1_read_record(data1_handle dh, char **buf = data1_get_read_buf (dh, &size); const char *bp; int rd = 0, res; - int line = 0; if (!*buf) *buf = (char *)xmalloc(*size = 4096); for (;;) { - if (rd + 4096 > *size && !(*buf =(char *)xrealloc(*buf, *size *= 2))) + if (rd + 2048 >= *size && !(*buf =(char *)xrealloc(*buf, *size *= 2))) abort(); - if ((res = (*rf)(fh, *buf + rd, 4096)) <= 0) + if ((res = (*rf)(fh, *buf + rd, 2048)) <= 0) { if (!res) { bp = *buf; - return data1_read_node(dh, &bp, 0, &line, 0, m); + (*buf)[rd] = '\0'; + return data1_read_node(dh, &bp, m); } else return 0; @@ -500,6 +966,127 @@ data1_node *data1_read_record(data1_handle dh, data1_node *data1_read_sgml (data1_handle dh, NMEM m, const char *buf) { const char *bp = buf; - int line = 0; - return data1_read_node (dh, &bp, 0, &line, 0, m); + return data1_read_node (dh, &bp, m); +} + + +static int conv_item (NMEM m, yaz_iconv_t t, + WRBUF wrbuf, char *inbuf, size_t inlen) +{ + wrbuf_rewind (wrbuf); + if (wrbuf->size < 10) + wrbuf_grow (wrbuf, 10); + for (;;) + { + char *outbuf = wrbuf->buf + wrbuf->pos; + size_t outlen = wrbuf->size - wrbuf->pos; + if (yaz_iconv (t, &inbuf, &inlen, &outbuf, &outlen) == + (size_t)(-1) && yaz_iconv_error(t) != YAZ_ICONV_E2BIG) + { + /* bad data. stop and skip conversion entirely */ + return -1; + } + else if (inlen == 0) + { /* finished converting */ + wrbuf->pos = wrbuf->size - outlen; + break; + } + else + { + /* buffer too small: make sure we expand buffer */ + wrbuf->pos = wrbuf->size - outlen; + wrbuf_grow(wrbuf, 20); + } + } + return 0; +} + +static void data1_iconv_s (data1_handle dh, NMEM m, data1_node *n, + yaz_iconv_t t, WRBUF wrbuf, const char *tocode) +{ + for (; n; n = n->next) + { + switch (n->which) + { + case DATA1N_data: + case DATA1N_comment: + if (conv_item (m, t, wrbuf, n->u.data.data, n->u.data.len) == 0) + { + n->u.data.data = + data1_insert_string_n (dh, n, m, wrbuf->buf, + wrbuf->pos); + n->u.data.len = wrbuf->pos; + } + break; + case DATA1N_tag: + if (conv_item (m, t, wrbuf, n->u.tag.tag, strlen(n->u.tag.tag)) + == 0) + { + n->u.tag.tag = + data1_insert_string_n (dh, n, m, + wrbuf->buf, wrbuf->pos); + } + if (n->u.tag.attributes) + { + data1_xattr *p; + for (p = n->u.tag.attributes; p; p = p->next) + { + if (p->value && + conv_item(m, t, wrbuf, p->value, strlen(p->value)) + == 0) + { + wrbuf_puts (wrbuf, ""); + p->value = nmem_strdup (m, wrbuf->buf); + } + } + } + break; + case DATA1N_preprocess: + if (strcmp(n->u.preprocess.target, "xml") == 0) + { + data1_xattr *p = n->u.preprocess.attributes; + for (; p; p = p->next) + if (strcmp (p->name, "encoding") == 0) + p->value = nmem_strdup (m, tocode); + } + break; + } + data1_iconv_s (dh, m, n->child, t, wrbuf, tocode); + } +} + +const char *data1_get_encoding (data1_handle dh, data1_node *n) +{ + /* see if we have an xml header that specifies encoding */ + if (n && n->child && n->child->which == DATA1N_preprocess && + strcmp (n->child->u.preprocess.target, "xml") == 0) + { + data1_xattr *xp = n->child->u.preprocess.attributes; + for (; xp; xp = xp->next) + if (!strcmp (xp->name, "encoding") == 0) + return xp->value; + } + /* no encoding in header, so see if "encoding" was specified for abs */ + if (n && n->which == DATA1N_root && + n->u.root.absyn && n->u.root.absyn->encoding) + return n->u.root.absyn->encoding; + /* none of above, return a hard coded default */ + return "ISO-8859-1"; +} + +int data1_iconv (data1_handle dh, NMEM m, data1_node *n, + const char *tocode, + const char *fromcode) +{ + if (strcmp (tocode, fromcode)) + { + WRBUF wrbuf = wrbuf_alloc(); + yaz_iconv_t t = yaz_iconv_open (tocode, fromcode); + if (!t) + return -1; + data1_iconv_s (dh, m, n, t, wrbuf, tocode); + yaz_iconv_close (t); + wrbuf_free (wrbuf, 1); + } + return 0; }