X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=blobdiff_plain;f=retrieval%2Fd1_read.c;h=0af749251df77c3602e5943a91ce0708c260e2ee;hp=34759f4d633f97a6ab8a37ef5ec5f0c980aa4064;hb=09405de098947fc5f359e9bd1225b2747596b513;hpb=b440dce0831a72bebe4f4821ab7771cc05e8facb diff --git a/retrieval/d1_read.c b/retrieval/d1_read.c index 34759f4..0af7492 100644 --- a/retrieval/d1_read.c +++ b/retrieval/d1_read.c @@ -1,10 +1,47 @@ /* - * Copyright (c) 1995-1999, Index Data. + * Copyright (c) 1995-2001, Index Data. * See the file LICENSE for details. * Sebastian Hammer, Adam Dickmeiss * * $Log: d1_read.c,v $ - * Revision 1.27 1999-08-27 09:40:32 adam + * Revision 1.38 2001-03-27 23:06:21 adam + * Quotes and slashes may occur within attributes. + * + * Revision 1.37 2001/02/28 09:00:06 adam + * Fixed problem with stack overflow for very nested records. + * + * Revision 1.36 2001/02/21 13:46:53 adam + * C++ fixes. + * + * Revision 1.35 2000/12/05 14:44:25 adam + * Readers skips sections. + * + * Revision 1.34 2000/12/05 10:06:23 adam + * Added support for null-data rules like . + * + * Revision 1.33 2000/11/29 14:22:47 adam + * Implemented XML/SGML attributes for data1 so that d1_read reads them + * and d1_write generates proper attributes for XML/SGML records. Added + * register locking for threaded version. + * + * Revision 1.32 2000/01/06 11:25:59 adam + * Added case to prevent warning. + * + * Revision 1.31 1999/12/21 14:16:20 ian + * Changed retrieval module to allow data1 trees with no associated absyn. + * Also added a simple interface for extracting values from data1 trees using + * a string based tagpath. + * + * Revision 1.30 1999/11/30 13:47:12 adam + * Improved installation. Moved header files to include/yaz. + * + * Revision 1.29 1999/10/21 12:06:29 adam + * Retrieval module no longer uses ctype.h - functions. + * + * Revision 1.28 1999/10/21 09:50:33 adam + * SGML reader uses own isspace - it doesn't do 8-bit on WIN32! + * + * Revision 1.27 1999/08/27 09:40:32 adam * Renamed logf function to yaz_log. Removed VC++ project files. * * Revision 1.26 1999/07/13 13:23:48 adam @@ -146,13 +183,12 @@ */ #include -#include #include #include -#include -#include -#include +#include +#include +#include /* * get the tag which is the immediate parent of this node (this may mean @@ -176,6 +212,41 @@ data1_node *data1_mk_node (data1_handle dh, NMEM m) return r; } +data1_node *data1_mk_node_type (data1_handle dh, NMEM m, int type) +{ + data1_node *r; + + r = data1_mk_node(dh, m); + r->which = type; + switch(type) + { + case DATA1N_tag: + r->u.tag.tag = 0; + r->u.tag.element = 0; + r->u.tag.no_data_requested = 0; + r->u.tag.node_selected = 0; + r->u.tag.make_variantlist = 0; + r->u.tag.get_bytes = -1; +#if DATA1_USING_XATTR + r->u.tag.attributes = 0; +#endif + break; + case DATA1N_root: + r->u.root.type = 0; + r->u.root.absyn = 0; + break; + case DATA1N_data: + r->u.data.data = 0; + r->u.data.len = 0; + r->u.data.what = 0; + r->u.data.formatted_text = 0; + break; + default: + logf (LOG_WARN, "data_mk_node_type. bad type = %d\n", type); + } + return r; +} + void data1_free_tree (data1_handle dh, data1_node *t) { data1_node *p = t->child, *pn; @@ -210,16 +281,11 @@ data1_node *data1_add_insert_taggeddata(data1_handle dh, data1_node *root, int first_flag, int local_allowed) { data1_node *partag = get_parent_tag (dh, at); - data1_node *tagn = data1_mk_node (dh, m); + data1_node *tagn = data1_mk_node_type (dh, m, DATA1N_tag); data1_element *e = NULL; data1_node *datn; - tagn->which = DATA1N_tag; tagn->u.tag.tag = data1_insert_string (dh, tagn, m, tagname); - tagn->u.tag.node_selected = 0; - tagn->u.tag.make_variantlist = 0; - tagn->u.tag.no_data_requested = 0; - tagn->u.tag.get_bytes = -1; if (partag) e = partag->u.tag.element; @@ -227,11 +293,10 @@ data1_node *data1_add_insert_taggeddata(data1_handle dh, data1_node *root, data1_getelementbytagname (dh, root->u.root.absyn, e, tagname); if (!local_allowed && !tagn->u.tag.element) return NULL; - tagn->last_child = tagn->child = datn = data1_mk_node (dh, m); + tagn->last_child = tagn->child = datn = data1_mk_node_type (dh, m, DATA1N_data); tagn->root = root; datn->parent = tagn; datn->root = root; - datn->which = DATA1N_data; datn->u.data.formatted_text = 0; tagn->parent = at; @@ -276,6 +341,72 @@ data1_node *data1_insert_taggeddata(data1_handle dh, data1_node *root, return data1_add_insert_taggeddata (dh, root, at, tagname, m, 1, 0); } +#if DATA1_USING_XATTR +data1_xattr *data1_read_xattr (data1_handle dh, NMEM m, + int (*get_byte)(void *fh), void *fh, + WRBUF wrbuf, int *ch) +{ + data1_xattr *p_first = 0; + data1_xattr **pp = &p_first; + int c = *ch; + for (;;) + { + data1_xattr *p; + int len; + while (c && d1_isspace(c)) + c = (*get_byte)(fh); + if (!c || c == '>' || c == '/') + break; + *pp = p = (data1_xattr *) nmem_malloc (m, sizeof(*p)); + p->next = 0; + pp = &p->next; + p->value = 0; + + wrbuf_rewind(wrbuf); + while (c && c != '=' && c != '>' && c != '/' && !d1_isspace(c)) + { + wrbuf_putc (wrbuf, c); + c = (*get_byte)(fh); + } + wrbuf_putc (wrbuf, '\0'); + len = wrbuf_len(wrbuf); + p->name = (char*) nmem_malloc (m, len); + strcpy (p->name, wrbuf_buf(wrbuf)); + if (c == '=') + { + c = (*get_byte)(fh); + if (c == '"') + { + c = (*get_byte)(fh); + wrbuf_rewind(wrbuf); + while (c && c != '"') + { + wrbuf_putc (wrbuf, c); + c = (*get_byte)(fh); + } + if (c) + c = (*get_byte)(fh); + } + else + { + wrbuf_rewind(wrbuf); + while (c && c != '>' && c != '/') + { + wrbuf_putc (wrbuf, c); + c = (*get_byte)(fh); + } + } + wrbuf_putc (wrbuf, '\0'); + len = wrbuf_len(wrbuf); + p->value = (char*) nmem_malloc (m, len); + strcpy (p->value, wrbuf_buf(wrbuf)); + } + } + *ch = c; + return p_first; +} +#endif + /* * Ugh. Sometimes functions just grow and grow on you. This one reads a * 'node' and its children. @@ -295,7 +426,7 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, while (1) { data1_node *parent = level ? d1_stack[level-1] : 0; - while (c != '\0' && isspace(c)) + while (c != '\0' && d1_isspace(c)) { if (c == '\n') line++; @@ -306,20 +437,52 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, if (c == '<') /* beginning of tag */ { +#if DATA1_USING_XATTR + data1_xattr *xattr; +#endif char tag[64]; char args[256]; - size_t i; - - for (i = 0; (c=(*get_byte)(fh)) && c != '>' && !isspace(c);) + int null_tag = 0; + int end_tag = 0; + size_t i = 0; + + c = (*get_byte)(fh); + if (c == '/') + { + end_tag = 1; + c = (*get_byte)(fh); + } + else if (c == '!') /* tags/comments that we don't deal with yet */ + { + while (c && c != '>') + c = (*get_byte)(fh); + if (c) + c = (*get_byte)(fh); + continue; + } + while (c && c != '>' && c != '/' && !d1_isspace(c)) + { if (i < (sizeof(tag)-1)) tag[i++] = c; + c = (*get_byte)(fh); + } tag[i] = '\0'; - while (isspace(c)) +#if DATA1_USING_XATTR + xattr = data1_read_xattr (dh, m, get_byte, fh, wrbuf, &c); + args[0] = '\0'; +#else + while (d1_isspace(c)) c = (*get_byte)(fh); - for (i = 0; c && c != '>'; c = (*get_byte)(fh)) + for (i = 0; c && c != '>' && c != '/'; c = (*get_byte)(fh)) if (i < (sizeof(args)-1)) args[i++] = c; args[i] = '\0'; +#endif + if (c == '/') + { /* or */ + null_tag = 1; + c = (*get_byte)(fh); + } if (c != '>') { yaz_log(LOG_WARN, "d1: %d: Malformed tag", line); @@ -329,9 +492,9 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, c = (*get_byte)(fh); /* End tag? */ - if (*tag == '/') + if (end_tag) { - if (tag[1] == '\0') + if (*tag == '\0') --level; /* */ else { /* */ @@ -340,9 +503,9 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, { parent = d1_stack[--i]; if ((parent->which == DATA1N_root && - !strcmp(tag+1, parent->u.root.type)) || + !strcmp(tag, parent->u.root.type)) || (parent->which == DATA1N_tag && - !strcmp(tag+1, parent->u.tag.tag))) + !strcmp(tag, parent->u.tag.tag))) { level = i; break; @@ -363,12 +526,10 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, { if (!(absyn = data1_get_absyn (dh, tag))) { - yaz_log(LOG_WARN, "Unable to acquire abstract syntax " - "for '%s'", tag); - return 0; + yaz_log(LOG_WARN, "Unable to acquire abstract syntax " "for '%s'", tag); + /* It's now OK for a record not to have an absyn */ } - res = data1_mk_node (dh, m); - res->which = DATA1N_root; + res = data1_mk_node_type (dh, m, DATA1N_root); res->u.root.type = data1_insert_string (dh, res, m, tag); res->u.root.absyn = absyn; res->root = res; @@ -433,14 +594,12 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, localtag = 1; /* our parent is a local tag */ elem = data1_getelementbytagname(dh, absyn, e, tag); - res = data1_mk_node (dh, m); - res->which = DATA1N_tag; + res = data1_mk_node_type (dh, m, DATA1N_tag); res->u.tag.tag = data1_insert_string (dh, res, m, tag); res->u.tag.element = elem; - res->u.tag.node_selected = 0; - res->u.tag.make_variantlist = 0; - res->u.tag.no_data_requested = 0; - res->u.tag.get_bytes = -1; +#if DATA1_USING_XATTR + res->u.tag.attributes = xattr; +#endif } if (parent) { @@ -453,7 +612,9 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, else if (parent) parent->child = res; d1_stack[level] = res; - d1_stack[++level] = 0; + d1_stack[level+1] = 0; + if (level < 250 && !null_tag) + ++level; } else /* != '<'... this is a body of text */ { @@ -466,9 +627,8 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, c = (*get_byte)(fh); continue; } - res = data1_mk_node(dh, m); + res = data1_mk_node_type (dh, m, DATA1N_data); res->parent = parent; - res->which = DATA1N_data; res->u.data.what = DATA1I_text; res->u.data.formatted_text = 0; res->root = parent->root; @@ -481,7 +641,7 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, wrbuf_rewind(wrbuf); - while (c != '<') + while (c && c != '<') { wrbuf_putc (wrbuf, c); c = (*get_byte)(fh); @@ -500,7 +660,7 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, { if (*src == '\n') line++; - if (isspace (*src)) + if (d1_isspace (*src)) prev_char = ' '; else { @@ -531,7 +691,7 @@ data1_node *data1_read_node (data1_handle dh, const char **buf, NMEM m) WRBUF wrbuf = wrbuf_alloc(); data1_node *node; - node = data1_read_nodex(dh, m, getc_mem, buf, wrbuf); + node = data1_read_nodex(dh, m, getc_mem, (void *) (buf), wrbuf); wrbuf_free (wrbuf, 1); return node; }