X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=retrieval%2Fd1_read.c;h=539c9154feffea1085a9ae8194aca7ec75ae9d97;hb=0e8b1116b97d769ea233858fe3c2ced5d1da5d7c;hp=2bbf6ae1eec0f2e35a4e6e3e040797c8d84090fd;hpb=a9950f112b740bd1b6a2ccf20a584995608d26d6;p=yaz-moved-to-github.git diff --git a/retrieval/d1_read.c b/retrieval/d1_read.c index 2bbf6ae..539c915 100644 --- a/retrieval/d1_read.c +++ b/retrieval/d1_read.c @@ -3,17 +3,36 @@ * See the file LICENSE for details. * Sebastian Hammer, Adam Dickmeiss * - * $Id: d1_read.c,v 1.39 2002-04-15 09:06:30 adam Exp $ + * $Id: d1_read.c,v 1.47 2002-07-29 20:04:08 adam Exp $ */ #include #include #include +#include + +#if HAVE_ICONV_H +#include +#endif + #include #include #include +data1_node *data1_get_root_tag (data1_handle dh, data1_node *n) +{ + if (!n) + return 0; + if (data1_is_xmlmode(dh)) + { + n = n->child; + while (n && n->which != DATA1N_tag) + n = n->next; + } + return n; +} + /* * get the tag which is the immediate parent of this node (this may mean * traversing intermediate things like variants and stuff. @@ -28,19 +47,37 @@ data1_node *get_parent_tag (data1_handle dh, data1_node *n) data1_node *data1_mk_node (data1_handle dh, NMEM m) { - data1_node *r; - - r = (data1_node *)nmem_malloc(m, sizeof(*r)); - r->next = r->child = r->last_child = r->parent = 0; - r->destroy = 0; - return r; + return data1_mk_node2 (dh, m, DATA1N_root, 0); } data1_node *data1_mk_node_type (data1_handle dh, NMEM m, int type) { + return data1_mk_node2 (dh, m, type, 0); +} + +data1_node *data1_mk_node2 (data1_handle dh, NMEM m, int type, + data1_node *parent) +{ data1_node *r; + + r = (data1_node *)nmem_malloc(m, sizeof(*r)); + r->next = r->child = r->last_child = 0; + r->destroy = 0; - r = data1_mk_node(dh, m); + if (!parent) + { + r->root = r; + } + else + { + r->root = parent->root; + r->parent = parent; + if (!parent->child) + parent->child = parent->last_child = r; + else + parent->last_child->next = r; + parent->last_child = r; + } r->which = type; switch(type) { @@ -51,9 +88,7 @@ data1_node *data1_mk_node_type (data1_handle dh, NMEM m, int type) r->u.tag.node_selected = 0; r->u.tag.make_variantlist = 0; r->u.tag.get_bytes = -1; -#if DATA1_USING_XATTR r->u.tag.attributes = 0; -#endif break; case DATA1N_root: r->u.root.type = 0; @@ -64,7 +99,21 @@ data1_node *data1_mk_node_type (data1_handle dh, NMEM m, int type) r->u.data.len = 0; r->u.data.what = 0; r->u.data.formatted_text = 0; + break; + case DATA1N_comment: + r->u.data.data = 0; + r->u.data.len = 0; + r->u.data.what = 0; + r->u.data.formatted_text = 1; + break; + case DATA1N_variant: + r->u.variant.type = 0; + r->u.variant.value = 0; break; + case DATA1N_preprocess: + r->u.preprocess.target = 0; + r->u.preprocess.attributes = 0; + break; default: logf (LOG_WARN, "data_mk_node_type. bad type = %d\n", type); } @@ -85,71 +134,227 @@ void data1_free_tree (data1_handle dh, data1_node *t) (*t->destroy)(t); } -char *data1_insert_string (data1_handle dh, data1_node *res, - NMEM m, const char *str) +data1_node *data1_mk_root (data1_handle dh, NMEM nmem, const char *name) { - int len = strlen(str); - - if (len >= DATA1_LOCALDATA) - return nmem_strdup (m, str); - else + data1_absyn *absyn = data1_get_absyn (dh, name); + data1_node *res; + if (!absyn) { - strcpy (res->lbuf, str); - return res->lbuf; + yaz_log(LOG_WARN, "Unable to acquire abstract syntax " "for '%s'", + name); + /* It's now OK for a record not to have an absyn */ } + res = data1_mk_node2 (dh, nmem, DATA1N_root, 0); + res->u.root.type = data1_insert_string (dh, res, nmem, name); + res->u.root.absyn = absyn; + return res; } -data1_node *data1_add_insert_taggeddata(data1_handle dh, data1_node *root, - data1_node *at, - const char *tagname, NMEM m, - int first_flag, int local_allowed) +void data1_set_root(data1_handle dh, data1_node *res, + NMEM nmem, const char *name) { - data1_node *partag = get_parent_tag (dh, at); - data1_node *tagn = data1_mk_node_type (dh, m, DATA1N_tag); - data1_element *e = NULL; - data1_node *datn; + data1_absyn *absyn = data1_get_absyn (dh, name); - tagn->u.tag.tag = data1_insert_string (dh, tagn, m, tagname); + res->u.root.type = data1_insert_string (dh, res, nmem, name); + res->u.root.absyn = absyn; +} +data1_node *data1_mk_preprocess (data1_handle dh, NMEM nmem, + const char *target, const char **attr, + data1_node *at) +{ + data1_xattr **p; + data1_node *res = data1_mk_node2 (dh, nmem, DATA1N_preprocess, at); + res->u.preprocess.target = data1_insert_string (dh, res, nmem, target); + + p = &res->u.preprocess.attributes; + while (attr && *attr) + { + *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p)); + (*p)->name = nmem_strdup (nmem, *attr++); + (*p)->value = nmem_strdup (nmem, *attr++); + p = &(*p)->next; + } + *p = 0; + return res; +} + +data1_node *data1_mk_tag_n (data1_handle dh, NMEM nmem, + const char *tag, size_t len, const char **attr, + data1_node *at) +{ + data1_node *partag = get_parent_tag(dh, at); + data1_node *res = data1_mk_node2 (dh, nmem, DATA1N_tag, at); + data1_element *e = NULL; + data1_xattr **p; + + res->u.tag.tag = data1_insert_string_n (dh, res, nmem, tag, len); + if (partag) e = partag->u.tag.element; - tagn->u.tag.element = - data1_getelementbytagname (dh, root->u.root.absyn, e, tagname); - if (!local_allowed && !tagn->u.tag.element) - return NULL; - tagn->last_child = tagn->child = datn = data1_mk_node_type (dh, m, DATA1N_data); - tagn->root = root; - datn->parent = tagn; - datn->root = root; - datn->u.data.formatted_text = 0; - tagn->parent = at; - - if (first_flag) + res->u.tag.element = + data1_getelementbytagname (dh, at->root->u.root.absyn, + e, res->u.tag.tag); + p = &res->u.tag.attributes; + while (attr && *attr) { - tagn->next = at->child; - if (!tagn->next) - at->last_child = tagn; - at->child = tagn; + *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p)); + (*p)->name = nmem_strdup (nmem, *attr++); + (*p)->value = nmem_strdup (nmem, *attr++); + p = &(*p)->next; } - else + *p = 0; + return res; +} + +void data1_tag_add_attr (data1_handle dh, NMEM nmem, + data1_node *res, const char **attr) +{ + data1_xattr **p; + + if (res->which != DATA1N_tag) + return; + + p = &res->u.tag.attributes; + while (*p) + p = &(*p)->next; + + while (attr && *attr) { - if (!at->child) - at->child = tagn; - else + *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p)); + (*p)->name = nmem_strdup (nmem, *attr++); + (*p)->value = nmem_strdup (nmem, *attr++); + p = &(*p)->next; + } + *p = 0; +} + +data1_node *data1_mk_tag (data1_handle dh, NMEM nmem, + const char *tag, const char **attr, data1_node *at) +{ + return data1_mk_tag_n (dh, nmem, tag, strlen(tag), attr, at); +} + +data1_node *data1_search_tag (data1_handle dh, data1_node *n, + const char *tag) +{ + if (*tag == '/') + { + n = data1_get_root_tag (dh, n); + if (n) + n = n->child; + tag++; + } + for (; n; n = n->next) + if (n->which == DATA1N_tag && n->u.tag.tag && + !yaz_matchstr (tag, n->u.tag.tag)) { - assert (at->last_child); - at->last_child->next = tagn; + return n; } - at->last_child = tagn; + return 0; +} + +data1_node *data1_mk_tag_uni (data1_handle dh, NMEM nmem, + const char *tag, data1_node *at) +{ + data1_node *node = data1_search_tag (dh, at->child, tag); + if (!node) + node = data1_mk_tag (dh, nmem, tag, 0 /* attr */, at); + else + node->child = node->last_child = 0; + return node; +} + +data1_node *data1_mk_text_n (data1_handle dh, NMEM mem, + const char *buf, size_t len, data1_node *parent) +{ + data1_node *res = data1_mk_node2 (dh, mem, DATA1N_data, parent); + res->u.data.what = DATA1I_text; + res->u.data.len = len; + + res->u.data.data = data1_insert_string_n (dh, res, mem, buf, len); + return res; +} + +data1_node *data1_mk_text_nf (data1_handle dh, NMEM mem, + const char *buf, size_t len, data1_node *parent) +{ + data1_node *res = data1_mk_text_n (dh, mem, buf, len, parent); + res->u.data.formatted_text = 1; + return res; +} + +data1_node *data1_mk_text (data1_handle dh, NMEM mem, + const char *buf, data1_node *parent) +{ + return data1_mk_text_n (dh, mem, buf, strlen(buf), parent); +} + +data1_node *data1_mk_comment_n (data1_handle dh, NMEM mem, + const char *buf, size_t len, + data1_node *parent) +{ + data1_node *res = data1_mk_node2 (dh, mem, DATA1N_comment, parent); + res->u.data.what = DATA1I_text; + res->u.data.len = len; + + res->u.data.data = data1_insert_string_n (dh, res, mem, buf, len); + return res; +} + +data1_node *data1_mk_comment (data1_handle dh, NMEM mem, + const char *buf, data1_node *parent) +{ + return data1_mk_comment_n (dh, mem, buf, strlen(buf), parent); +} + +char *data1_insert_string_n (data1_handle dh, data1_node *res, + NMEM m, const char *str, size_t len) +{ + char *b; + if (len >= DATA1_LOCALDATA) + b = nmem_malloc (m, len+1); + else + b = res->lbuf; + memcpy (b, str, len); + b[len] = 0; + return b; +} + +char *data1_insert_string (data1_handle dh, data1_node *res, + NMEM m, const char *str) +{ + return data1_insert_string_n (dh, res, m, str, strlen(str)); +} + +static data1_node *data1_add_insert_taggeddata(data1_handle dh, + data1_node *at, + const char *tagname, NMEM m, + int local_allowed) +{ + data1_node *root = at->root; + data1_node *partag = get_parent_tag (dh, at); + data1_element *e = NULL; + data1_node *datn = 0; + data1_node *tagn = 0; + + if (partag) + e = partag->u.tag.element; + e = data1_getelementbytagname (dh, root->u.root.absyn, e, tagname); + if (local_allowed || e) + { + tagn = data1_mk_node2 (dh, m, DATA1N_tag, at); + tagn->u.tag.tag = data1_insert_string (dh, tagn, m, tagname); + tagn->u.tag.element = e; + datn = data1_mk_node2 (dh, m, DATA1N_data, tagn); } return datn; } -data1_node *data1_add_taggeddata(data1_handle dh, data1_node *root, - data1_node *at, - const char *tagname, NMEM m) +data1_node *data1_mk_tag_data(data1_handle dh, data1_node *at, + const char *tagname, NMEM m) { - return data1_add_insert_taggeddata (dh, root, at, tagname, m, 0, 1); + return data1_add_insert_taggeddata (dh, at, tagname, m, 1); } @@ -158,14 +363,104 @@ data1_node *data1_add_taggeddata(data1_handle dh, data1_node *root, * which should be root or tag itself). Returns pointer to the data node, * which can then be modified. */ -data1_node *data1_insert_taggeddata(data1_handle dh, data1_node *root, - data1_node *at, - const char *tagname, NMEM m) +data1_node *data1_mk_tag_data_wd(data1_handle dh, data1_node *at, + const char *tagname, NMEM m) { - return data1_add_insert_taggeddata (dh, root, at, tagname, m, 1, 0); + return data1_add_insert_taggeddata (dh, at, tagname, m, 0); } -#if DATA1_USING_XATTR +data1_node *data1_insert_taggeddata (data1_handle dh, data1_node *root, + data1_node *at, const char *tagname, + NMEM m) +{ + return data1_add_insert_taggeddata (dh, at, tagname, m, 0); +} + +data1_node *data1_add_taggeddata (data1_handle dh, data1_node *root, + data1_node *at, const char *tagname, + NMEM m) +{ + return data1_add_insert_taggeddata (dh, at, tagname, m, 1); +} + +data1_node *data1_mk_tag_data_int (data1_handle dh, data1_node *at, + const char *tag, int num, + NMEM nmem) +{ + data1_node *node_data; + + node_data = data1_mk_tag_data (dh, at, tag, nmem); + if (!node_data) + return 0; + node_data->u.data.what = DATA1I_num; + node_data->u.data.data = node_data->lbuf; + sprintf (node_data->u.data.data, "%d", num); + node_data->u.data.len = strlen (node_data->u.data.data); + return node_data; +} + +data1_node *data1_mk_tag_data_oid (data1_handle dh, data1_node *at, + const char *tag, Odr_oid *oid, + NMEM nmem) +{ + data1_node *node_data; + char str[128], *p = str; + Odr_oid *ii; + + node_data = data1_mk_tag_data (dh, at, tag, nmem); + if (!node_data) + return 0; + + for (ii = oid; *ii >= 0; ii++) + { + if (ii != oid) + *p++ = '.'; + sprintf (p, "%d", *ii); + p += strlen (p); + } + node_data->u.data.what = DATA1I_oid; + node_data->u.data.len = strlen (str); + node_data->u.data.data = data1_insert_string (dh, node_data, nmem, str); + return node_data; +} + + +data1_node *data1_mk_tag_data_text (data1_handle dh, data1_node *at, + const char *tag, const char *str, + NMEM nmem) +{ + data1_node *node_data; + + node_data = data1_mk_tag_data (dh, at, tag, nmem); + if (!node_data) + return 0; + node_data->u.data.what = DATA1I_text; + node_data->u.data.len = strlen (str); + node_data->u.data.data = data1_insert_string (dh, node_data, nmem, str); + return node_data; +} + + +data1_node *data1_mk_tag_data_text_uni (data1_handle dh, data1_node *at, + const char *tag, const char *str, + NMEM nmem) +{ + data1_node *node = data1_search_tag (dh, at->child, tag); + if (!node) + return data1_mk_tag_data_text (dh, at, tag, str, nmem); + else + { + data1_node *node_data = node->child; + node_data->u.data.what = DATA1I_text; + node_data->u.data.len = strlen (str); + node_data->u.data.data = data1_insert_string (dh, node_data, + nmem, str); + node_data->child = node_data->last_child = 0; + return node_data; + } +} + + data1_xattr *data1_read_xattr (data1_handle dh, NMEM m, int (*get_byte)(void *fh), void *fh, WRBUF wrbuf, int *ch) @@ -229,7 +524,6 @@ data1_xattr *data1_read_xattr (data1_handle dh, NMEM m, *ch = c; return p_first; } -#endif /* * Ugh. Sometimes functions just grow and grow on you. This one reads a @@ -238,7 +532,6 @@ data1_xattr *data1_read_xattr (data1_handle dh, NMEM m, data1_node *data1_read_nodex (data1_handle dh, NMEM m, int (*get_byte)(void *fh), void *fh, WRBUF wrbuf) { - data1_absyn *absyn = 0; data1_node *d1_stack[256]; data1_node *res; int c; @@ -261,9 +554,8 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, if (c == '<') /* beginning of tag */ { -#if DATA1_USING_XATTR data1_xattr *xattr; -#endif + char tag[64]; char args[256]; int null_tag = 0; @@ -291,17 +583,8 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, c = (*get_byte)(fh); } tag[i] = '\0'; -#if DATA1_USING_XATTR xattr = data1_read_xattr (dh, m, get_byte, fh, wrbuf, &c); args[0] = '\0'; -#else - while (d1_isspace(c)) - c = (*get_byte)(fh); - for (i = 0; c && c != '>' && c != '/'; c = (*get_byte)(fh)) - if (i < (sizeof(args)-1)) - args[i++] = c; - args[i] = '\0'; -#endif if (c == '/') { /* or */ null_tag = 1; @@ -342,22 +625,18 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, break; } } - if (level == 0) - return d1_stack[0]; + if (data1_is_xmlmode(dh)) + { + if (level <= 1) + return d1_stack[0]; + } + else + { + if (level <= 0) + return d1_stack[0]; + } continue; } - if (level == 0) /* root ? */ - { - if (!(absyn = data1_get_absyn (dh, tag))) - { - yaz_log(LOG_WARN, "Unable to acquire abstract syntax " "for '%s'", tag); - /* It's now OK for a record not to have an absyn */ - } - res = data1_mk_node_type (dh, m, DATA1N_root); - res->u.root.type = data1_insert_string (dh, res, m, tag); - res->u.root.absyn = absyn; - res->root = res; - } else if (!strcmp(tag, "var")) { char tclass[DATA1_MAX_SYMBOL], type[DATA1_MAX_SYMBOL]; @@ -380,10 +659,7 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, */ if (parent->which != DATA1N_variant) { - res = data1_mk_node (dh, m); - res->which = DATA1N_variant; - res->u.variant.type = 0; - res->u.variant.value = 0; + res = data1_mk_node2 (dh, m, DATA1N_variant, parent); } else { @@ -398,43 +674,34 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, level = i; break; } - res = data1_mk_node (dh, m); - res->which = DATA1N_variant; + res = data1_mk_node2 (dh, m, DATA1N_variant, parent); res->u.variant.type = tp; res->u.variant.value = data1_insert_string (dh, res, m, args + val_offset); } } - else /* tag.. acquire our element in the abstract syntax */ - { - data1_node *partag = get_parent_tag (dh, parent); - data1_element *elem, *e = 0; - int localtag = 0; - - if (parent->which == DATA1N_variant) - return 0; - if (partag) - if (!(e = partag->u.tag.element)) - localtag = 1; /* our parent is a local tag */ - - elem = data1_getelementbytagname(dh, absyn, e, tag); - res = data1_mk_node_type (dh, m, DATA1N_tag); - res->u.tag.tag = data1_insert_string (dh, res, m, tag); - res->u.tag.element = elem; -#if DATA1_USING_XATTR - res->u.tag.attributes = xattr; -#endif - } - if (parent) - { - parent->last_child = res; - res->root = parent->root; - } - res->parent = parent; - if (d1_stack[level]) - d1_stack[level]->next = res; - else if (parent) - parent->child = res; + else + { + + /* tag .. acquire our element in the abstract syntax */ + if (level == 0) + { + parent = data1_mk_root (dh, m, tag); + res = d1_stack[level] = parent; + + if (data1_is_xmlmode(dh)) + { + level++; + res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent); + res->u.tag.attributes = xattr; + } + } + else + { + res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent); + res->u.tag.attributes = xattr; + } + } d1_stack[level] = res; d1_stack[level+1] = 0; if (level < 250 && !null_tag) @@ -451,16 +718,9 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, c = (*get_byte)(fh); continue; } - res = data1_mk_node_type (dh, m, DATA1N_data); - res->parent = parent; + res = data1_mk_node2 (dh, m, DATA1N_data, parent); res->u.data.what = DATA1I_text; res->u.data.formatted_text = 0; - res->root = parent->root; - parent->last_child = res; - if (d1_stack[level]) - d1_stack[level]->next = res; - else - parent->child = res; d1_stack[level] = res; wrbuf_rewind(wrbuf); @@ -560,3 +820,130 @@ data1_node *data1_read_sgml (data1_handle dh, NMEM m, const char *buf) return data1_read_node (dh, &bp, m); } + +#if HAVE_ICONV_H + +static int conv_item (NMEM m, iconv_t t, + WRBUF wrbuf, char *inbuf, size_t inlen) +{ + wrbuf_rewind (wrbuf); + if (wrbuf->size < 10) + wrbuf_grow (wrbuf, 10); + for (;;) + { + char *outbuf = wrbuf->buf + wrbuf->pos; + size_t outlen = wrbuf->size - wrbuf->pos; + if (iconv (t, &inbuf, &inlen, &outbuf, &outlen) == + (size_t)(-1) && errno != E2BIG) + { + /* bad data. stop and skip conversion entirely */ + return -1; + } + else if (inlen == 0) + { /* finished converting */ + wrbuf->pos = wrbuf->size - outlen; + break; + } + else + { + /* buffer too small: make sure we expand buffer */ + wrbuf->pos = wrbuf->size - outlen; + wrbuf_grow(wrbuf, 20); + } + } + return 0; +} + +static void data1_iconv_s (data1_handle dh, NMEM m, data1_node *n, + iconv_t t, WRBUF wrbuf, const char *tocode) +{ + for (; n; n = n->next) + { + switch (n->which) + { + case DATA1N_data: + case DATA1N_comment: + if (conv_item (m, t, wrbuf, n->u.data.data, n->u.data.len) == 0) + { + n->u.data.data = + data1_insert_string_n (dh, n, m, wrbuf->buf, + wrbuf->pos); + n->u.data.len = wrbuf->pos; + } + break; + case DATA1N_tag: + if (conv_item (m, t, wrbuf, n->u.tag.tag, strlen(n->u.tag.tag)) + == 0) + { + n->u.tag.tag = + data1_insert_string_n (dh, n, m, + wrbuf->buf, wrbuf->pos); + } + if (n->u.tag.attributes) + { + data1_xattr *p; + for (p = n->u.tag.attributes; p; p = p->next) + { + if (conv_item(m, t, wrbuf, p->value, strlen(p->value)) + == 0) + { + wrbuf_puts (wrbuf, ""); + p->value = nmem_strdup (m, wrbuf->buf); + } + } + } + break; + case DATA1N_preprocess: + if (strcmp(n->u.preprocess.target, "xml") == 0) + { + data1_xattr *p = n->u.preprocess.attributes; + for (; p; p = p->next) + if (strcmp (p->name, "encoding") == 0) + p->value = nmem_strdup (m, tocode); + } + break; + } + data1_iconv_s (dh, m, n->child, t, wrbuf, tocode); + } +} +#endif + +const char *data1_get_encoding (data1_handle dh, data1_node *n) +{ + /* see if we have an xml header that specifies encoding */ + if (n && n->child && n->child->which == DATA1N_preprocess && + strcmp (n->child->u.preprocess.target, "xml") == 0) + { + data1_xattr *xp = n->child->u.preprocess.attributes; + for (; xp; xp = xp->next) + if (!strcmp (xp->name, "encoding") == 0) + return xp->value; + } + /* no encoding in header, so see if "encoding" was specified for abs */ + if (n && n->which == DATA1N_root && + n->u.root.absyn && n->u.root.absyn->encoding) + return n->u.root.absyn->encoding; + /* none of above, return a hard coded default */ + return "ISO-8859-1"; +} + +int data1_iconv (data1_handle dh, NMEM m, data1_node *n, + const char *tocode, + const char *fromcode) +{ +#if HAVE_ICONV_H + if (strcmp (tocode, fromcode)) + { + WRBUF wrbuf = wrbuf_alloc(); + iconv_t t = iconv_open (tocode, fromcode); + if (t == (iconv_t) (-1)) + return -1; + data1_iconv_s (dh, m, n, t, wrbuf, tocode); + iconv_close (t); + wrbuf_free (wrbuf, 1); + } + return 0; +#else + return -2; +#endif +}