X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=retrieval%2Fd1_read.c;h=281f7caf5494f9c845e58a50d81548eafc444b64;hb=5b690aebb8dc2d05cad8f668de8fd821a1c231fa;hp=66baf41cc3a20db3a2decf2dea20c57fcb0bd1b1;hpb=a774858896a9b56c966edbfb70d8eae1c91c4ad1;p=yaz-moved-to-github.git diff --git a/retrieval/d1_read.c b/retrieval/d1_read.c index 66baf41..281f7ca 100644 --- a/retrieval/d1_read.c +++ b/retrieval/d1_read.c @@ -3,7 +3,7 @@ * See the file LICENSE for details. * Sebastian Hammer, Adam Dickmeiss * - * $Id: d1_read.c,v 1.44 2002-07-03 10:04:04 adam Exp $ + * $Id: d1_read.c,v 1.54 2002-10-08 23:00:09 adam Exp $ */ #include @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -18,9 +19,12 @@ data1_node *data1_get_root_tag (data1_handle dh, data1_node *n) { if (!n) return 0; - n = n->child; - while (n && n->which != DATA1N_tag) - n = n->next; + if (data1_is_xmlmode(dh)) + { + n = n->child; + while (n && n->which != DATA1N_tag) + n = n->next; + } return n; } @@ -30,9 +34,19 @@ data1_node *data1_get_root_tag (data1_handle dh, data1_node *n) */ data1_node *get_parent_tag (data1_handle dh, data1_node *n) { - for (; n && n->which != DATA1N_root; n = n->parent) - if (n->which == DATA1N_tag) - return n; + if (data1_is_xmlmode(dh)) + { + for (; n && n->which != DATA1N_root; n = n->parent) + if (n->which == DATA1N_tag && n->parent && + n->parent->which != DATA1N_root) + return n; + } + else + { + for (; n && n->which != DATA1N_root; n = n->parent) + if (n->which == DATA1N_tag) + return n; + } return 0; } @@ -46,29 +60,8 @@ data1_node *data1_mk_node_type (data1_handle dh, NMEM m, int type) return data1_mk_node2 (dh, m, type, 0); } -data1_node *data1_mk_node2 (data1_handle dh, NMEM m, int type, - data1_node *parent) +static void data1_init_node (data1_handle dh, data1_node *r, int type) { - data1_node *r; - - r = (data1_node *)nmem_malloc(m, sizeof(*r)); - r->next = r->child = r->last_child = 0; - r->destroy = 0; - - if (!parent) - { - r->root = r; - } - else - { 
- r->root = parent->root; - r->parent = parent; - if (!parent->child) - parent->child = parent->last_child = r; - else - parent->last_child->next = r; - parent->last_child = r; - } r->which = type; switch(type) { @@ -108,6 +101,57 @@ data1_node *data1_mk_node2 (data1_handle dh, NMEM m, int type, default: logf (LOG_WARN, "data_mk_node_type. bad type = %d\n", type); } +} + +data1_node *data1_append_node (data1_handle dh, NMEM m, int type, + data1_node *parent) +{ + data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r)); + r->next = r->child = r->last_child = 0; + r->destroy = 0; + + if (!parent) + r->root = r; + else + { + r->root = parent->root; + r->parent = parent; + if (!parent->child) + parent->child = parent->last_child = r; + else + parent->last_child->next = r; + parent->last_child = r; + } + data1_init_node(dh, r, type); + return r; +} + +data1_node *data1_mk_node2 (data1_handle dh, NMEM m, int type, + data1_node *parent) +{ + return data1_append_node (dh, m, type, parent); +} + +data1_node *data1_insert_node (data1_handle dh, NMEM m, int type, + data1_node *parent) +{ + data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r)); + r->next = r->child = r->last_child = 0; + r->destroy = 0; + + if (!parent) + r->root = r; + else + { + r->root = parent->root; + r->parent = parent; + if (!parent->child) + parent->last_child = r; + else + r->next = parent->child; + parent->child = r; + } + data1_init_node(dh, r, type); return r; } @@ -151,19 +195,30 @@ void data1_set_root(data1_handle dh, data1_node *res, } data1_node *data1_mk_preprocess (data1_handle dh, NMEM nmem, - const char *target, const char **attr, - data1_node *at) + const char *target, + const char **attr, data1_node *at) +{ + return data1_mk_preprocess_n (dh, nmem, target, strlen(target), + attr, at); +} + +data1_node *data1_mk_preprocess_n (data1_handle dh, NMEM nmem, + const char *target, size_t len, + const char **attr, data1_node *at) { data1_xattr **p; data1_node *res = data1_mk_node2 (dh, nmem, 
DATA1N_preprocess, at); - res->u.preprocess.target = data1_insert_string (dh, res, nmem, target); - + res->u.preprocess.target = data1_insert_string_n (dh, res, nmem, + target, len); + p = &res->u.preprocess.attributes; while (attr && *attr) { *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p)); (*p)->name = nmem_strdup (nmem, *attr++); (*p)->value = nmem_strdup (nmem, *attr++); + (*p)->what = DATA1I_text; + p = &(*p)->next; } *p = 0; @@ -176,22 +231,31 @@ data1_node *data1_mk_tag_n (data1_handle dh, NMEM nmem, { data1_node *partag = get_parent_tag(dh, at); data1_node *res = data1_mk_node2 (dh, nmem, DATA1N_tag, at); - data1_element *e = NULL; data1_xattr **p; + data1_element *e = 0; res->u.tag.tag = data1_insert_string_n (dh, res, nmem, tag, len); - if (partag) - e = partag->u.tag.element; - res->u.tag.element = - data1_getelementbytagname (dh, at->root->u.root.absyn, - e, res->u.tag.tag); + if (!partag) /* top tag? */ + e = data1_getelementbytagname (dh, at->root->u.root.absyn, + 0 /* index as local */, + res->u.tag.tag); + else + { + /* only set element for known tags */ + e = partag->u.tag.element; + if (e) + e = data1_getelementbytagname (dh, at->root->u.root.absyn, + e, res->u.tag.tag); + } + res->u.tag.element = e; p = &res->u.tag.attributes; while (attr && *attr) { *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p)); (*p)->name = nmem_strdup (nmem, *attr++); (*p)->value = nmem_strdup (nmem, *attr++); + (*p)->what = DATA1I_text; p = &(*p)->next; } *p = 0; @@ -215,6 +279,7 @@ void data1_tag_add_attr (data1_handle dh, NMEM nmem, *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p)); (*p)->name = nmem_strdup (nmem, *attr++); (*p)->value = nmem_strdup (nmem, *attr++); + (*p)->what = DATA1I_text; p = &(*p)->next; } *p = 0; @@ -304,7 +369,7 @@ char *data1_insert_string_n (data1_handle dh, data1_node *res, { char *b; if (len >= DATA1_LOCALDATA) - b = nmem_malloc (m, len+1); + b = (char *) nmem_malloc (m, len+1); else b = res->lbuf; memcpy (b, str, len); @@ -321,7 
+386,8 @@ char *data1_insert_string (data1_handle dh, data1_node *res, static data1_node *data1_add_insert_taggeddata(data1_handle dh, data1_node *at, const char *tagname, NMEM m, - int local_allowed) + int local_allowed, + int insert_mode) { data1_node *root = at->root; data1_node *partag = get_parent_tag (dh, at); @@ -329,12 +395,20 @@ static data1_node *data1_add_insert_taggeddata(data1_handle dh, data1_node *datn = 0; data1_node *tagn = 0; - if (partag) + if (!partag) + e = data1_getelementbytagname (dh, root->u.root.absyn, 0, tagname); + else + { e = partag->u.tag.element; - e = data1_getelementbytagname (dh, root->u.root.absyn, e, tagname); + if (e) + e = data1_getelementbytagname (dh, root->u.root.absyn, e, tagname); + } if (local_allowed || e) { - tagn = data1_mk_node2 (dh, m, DATA1N_tag, at); + if (insert_mode) + tagn = data1_insert_node (dh, m, DATA1N_tag, at); + else + tagn = data1_append_node (dh, m, DATA1N_tag, at); tagn->u.tag.tag = data1_insert_string (dh, tagn, m, tagname); tagn->u.tag.element = e; datn = data1_mk_node2 (dh, m, DATA1N_data, tagn); @@ -345,7 +419,7 @@ static data1_node *data1_add_insert_taggeddata(data1_handle dh, data1_node *data1_mk_tag_data(data1_handle dh, data1_node *at, const char *tagname, NMEM m) { - return data1_add_insert_taggeddata (dh, at, tagname, m, 1); + return data1_add_insert_taggeddata (dh, at, tagname, m, 1, 0); } @@ -357,21 +431,21 @@ data1_node *data1_mk_tag_data(data1_handle dh, data1_node *at, data1_node *data1_mk_tag_data_wd(data1_handle dh, data1_node *at, const char *tagname, NMEM m) { - return data1_add_insert_taggeddata (dh, at, tagname, m, 0); + return data1_add_insert_taggeddata (dh, at, tagname, m, 0, 1); } data1_node *data1_insert_taggeddata (data1_handle dh, data1_node *root, data1_node *at, const char *tagname, NMEM m) { - return data1_add_insert_taggeddata (dh, at, tagname, m, 0); + return data1_add_insert_taggeddata (dh, at, tagname, m, 0, 1); } data1_node *data1_add_taggeddata (data1_handle dh, 
data1_node *root, data1_node *at, const char *tagname, NMEM m) { - return data1_add_insert_taggeddata (dh, at, tagname, m, 1); + return data1_add_insert_taggeddata (dh, at, tagname, m, 1, 0); } data1_node *data1_mk_tag_data_int (data1_handle dh, data1_node *at, @@ -451,10 +525,54 @@ data1_node *data1_mk_tag_data_text_uni (data1_handle dh, data1_node *at, } } +static int ampr (int (*get_byte)(void *fh), void *fh, int *amp) +{ +#if 1 + int c = (*get_byte)(fh); + *amp = 0; + return c; +#else + int c = (*get_byte)(fh); + *amp = 0; + if (c == '&') + { + char ent[20]; + int i = 0; + + while (1) + { + c = (*get_byte)(fh); + if (c == ';') + { + ent[i] = 0; + + c = ' '; + if (!strcmp (ent, "quot")) + c = '"'; + if (!strcmp (ent, "apos")) + c = '\''; + if (!strcmp (ent, "gt")) + c = '>'; + if (!strcmp (ent, "lt")) + c = '<'; + if (!strcmp (ent, "amp")) + c = '&'; + *amp = 1; + break; + } + else if (c == 0 || d1_isspace(c)) + break; + if (i < 19) + ent[i++] = c; + } + } + return c; +#endif +} data1_xattr *data1_read_xattr (data1_handle dh, NMEM m, int (*get_byte)(void *fh), void *fh, - WRBUF wrbuf, int *ch) + WRBUF wrbuf, int *ch, int *amp) { data1_xattr *p_first = 0; data1_xattr **pp = &p_first; @@ -463,20 +581,21 @@ data1_xattr *data1_read_xattr (data1_handle dh, NMEM m, { data1_xattr *p; int len; - while (c && d1_isspace(c)) - c = (*get_byte)(fh); - if (!c || c == '>' || c == '/') + while (*amp || (c && d1_isspace(c))) + c = ampr (get_byte, fh, amp); + if (*amp == 0 && (c == 0 || c == '>' || c == '/')) break; *pp = p = (data1_xattr *) nmem_malloc (m, sizeof(*p)); p->next = 0; pp = &p->next; p->value = 0; + p->what = DATA1I_xmltext; wrbuf_rewind(wrbuf); while (c && c != '=' && c != '>' && c != '/' && !d1_isspace(c)) { wrbuf_putc (wrbuf, c); - c = (*get_byte)(fh); + c = ampr (get_byte, fh, amp); } wrbuf_putc (wrbuf, '\0'); len = wrbuf_len(wrbuf); @@ -484,26 +603,38 @@ data1_xattr *data1_read_xattr (data1_handle dh, NMEM m, strcpy (p->name, wrbuf_buf(wrbuf)); if (c == '=') { 
- c = (*get_byte)(fh); - if (c == '"') + c = ampr (get_byte, fh, amp); + if (*amp == 0 && c == '"') + { + c = ampr (get_byte, fh, amp); + wrbuf_rewind(wrbuf); + while (*amp || (c && c != '"')) + { + wrbuf_putc (wrbuf, c); + c = ampr (get_byte, fh, amp); + } + if (c) + c = ampr (get_byte, fh, amp); + } + else if (*amp == 0 && c == '\'') { - c = (*get_byte)(fh); + c = ampr (get_byte, fh, amp); wrbuf_rewind(wrbuf); - while (c && c != '"') + while (*amp || (c && c != '\'')) { wrbuf_putc (wrbuf, c); - c = (*get_byte)(fh); + c = ampr (get_byte, fh, amp); } if (c) - c = (*get_byte)(fh); + c = ampr (get_byte, fh, amp); } else { wrbuf_rewind(wrbuf); - while (c && c != '>' && c != '/') + while (*amp || (c && c != '>' && c != '/')) { wrbuf_putc (wrbuf, c); - c = (*get_byte)(fh); + c = ampr (get_byte, fh, amp); } } wrbuf_putc (wrbuf, '\0'); @@ -525,25 +656,17 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, { data1_node *d1_stack[256]; data1_node *res; - int c; + int c, amp; int level = 0; int line = 1; d1_stack[level] = 0; - c = (*get_byte)(fh); - while (1) + c = ampr (get_byte, fh, &amp); + while (c != '\0') { data1_node *parent = level ?
d1_stack[level-1] : 0; - while (c != '\0' && d1_isspace(c)) - { - if (c == '\n') - line++; - c = (*get_byte)(fh); - } - if (c == '\0') - break; - - if (c == '<') /* beginning of tag */ + + if (amp == 0 && c == '<') /* beginning of tag */ { data1_xattr *xattr; @@ -553,41 +676,92 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, int end_tag = 0; size_t i = 0; - c = (*get_byte)(fh); - if (c == '/') + c = ampr (get_byte, fh, &amp); + if (amp == 0 && c == '/') { end_tag = 1; - c = (*get_byte)(fh); + c = ampr (get_byte, fh, &amp); } - else if (c == '!') /* tags/comments that we don't deal with yet */ + else if (amp == 0 && c == '!') { - while (c && c != '>') - c = (*get_byte)(fh); - if (c) - c = (*get_byte)(fh); - continue; + int c0, amp0; + + wrbuf_rewind(wrbuf); + + c0 = ampr (get_byte, fh, &amp0); + if (amp0 == 0 && c0 == '\0') + break; + c = ampr (get_byte, fh, &amp); + + if (amp0 == 0 && c0 == '-' && amp == 0 && c == '-') + { + /* COMMENT: */ + int no_dash = 0; + + c = ampr (get_byte, fh, &amp); + while (amp || c) + { + if (amp == 0 && c == '-') + no_dash++; + else if (amp == 0 && c == '>' && no_dash >= 2) + { + if (level > 0) + d1_stack[level] = + data1_mk_comment_n ( + dh, m, + wrbuf_buf(wrbuf), wrbuf_len(wrbuf)-2, + d1_stack[level-1]); + c = ampr (get_byte, fh, &amp); /* skip > */ + break; + } + else + no_dash = 0; + wrbuf_putc (wrbuf, c); + c = ampr (get_byte, fh, &amp); + } + continue; + } + else + { /* DIRECTIVE: */ + + int blevel = 0; + while (amp || c) + { + if (amp == 0 && c == '>' && blevel == 0) + { + c = ampr (get_byte, fh, &amp); + break; + } + if (amp == 0 && c == '[') + blevel++; + if (amp == 0 && c == ']' && blevel > 0) + blevel--; + c = ampr (get_byte, fh, &amp); + } + continue; + } } - while (c && c != '>' && c != '/' && !d1_isspace(c)) + while (amp || (c && c != '>' && c != '/' && !d1_isspace(c))) { if (i < (sizeof(tag)-1)) tag[i++] = c; - c = (*get_byte)(fh); + c = ampr (get_byte, fh, &amp); } tag[i] = '\0'; - xattr = data1_read_xattr (dh, m, get_byte, fh, wrbuf, &c);
+ xattr = data1_read_xattr (dh, m, get_byte, fh, wrbuf, &c, &amp); args[0] = '\0'; - if (c == '/') + if (amp == 0 && c == '/') { /* or */ null_tag = 1; - c = (*get_byte)(fh); + c = ampr (get_byte, fh, &amp); } - if (c != '>') + if (amp || c != '>') { yaz_log(LOG_WARN, "d1: %d: Malformed tag", line); return 0; } else - c = (*get_byte)(fh); + c = ampr (get_byte, fh, &amp); /* End tag? */ if (end_tag) @@ -616,8 +790,16 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, break; } } - if (level <= 1) - return d1_stack[0]; + if (data1_is_xmlmode(dh)) + { + if (level <= 1) + return d1_stack[0]; + } + else + { + if (level <= 0) + return d1_stack[0]; + } continue; } else if (!strcmp(tag, "var")) @@ -670,10 +852,20 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, if (level == 0) { parent = data1_mk_root (dh, m, tag); - d1_stack[level++] = parent; + res = d1_stack[level] = parent; + + if (data1_is_xmlmode(dh)) + { + level++; + res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent); + res->u.tag.attributes = xattr; + } + } + else + { + res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent); + res->u.tag.attributes = xattr; } - res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent); - res->u.tag.attributes = xattr; } d1_stack[level] = res; d1_stack[level+1] = 0; @@ -682,26 +874,24 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, } else /* != '<'...
this is a body of text */ { - const char *src; - char *dst; - int len, prev_char = 0; + int len; if (level == 0) { - c = (*get_byte)(fh); + c = ampr (get_byte, fh, &amp); continue; } res = data1_mk_node2 (dh, m, DATA1N_data, parent); - res->u.data.what = DATA1I_text; + res->u.data.what = DATA1I_xmltext; res->u.data.formatted_text = 0; d1_stack[level] = res; wrbuf_rewind(wrbuf); - while (c && c != '<') + while (amp || (c && c != '<')) { wrbuf_putc (wrbuf, c); - c = (*get_byte)(fh); + c = ampr (get_byte, fh, &amp); } len = wrbuf_len(wrbuf); @@ -710,26 +900,12 @@ data1_node *data1_read_nodex (data1_handle dh, NMEM m, res->u.data.data = (char*) nmem_malloc (m, len); else res->u.data.data = res->lbuf; - - /* read "data" and transfer while removing white space */ - dst = res->u.data.data; - for (src = wrbuf_buf(wrbuf); --len >= 0; src++) - { - if (*src == '\n') - line++; - if (d1_isspace (*src)) - prev_char = ' '; - else - { - if (prev_char) - { - *dst++ = prev_char; - prev_char = 0; - } - *dst++ = *src; - } - } - res->u.data.len = dst - res->u.data.data; + + if (len) + memcpy (res->u.data.data, wrbuf_buf(wrbuf), len); + else + res->u.data.data = 0; + res->u.data.len = len; } } return 0; @@ -793,3 +969,124 @@ data1_node *data1_read_sgml (data1_handle dh, NMEM m, const char *buf) return data1_read_node (dh, &bp, m); } + +static int conv_item (NMEM m, yaz_iconv_t t, + WRBUF wrbuf, char *inbuf, size_t inlen) +{ + wrbuf_rewind (wrbuf); + if (wrbuf->size < 10) + wrbuf_grow (wrbuf, 10); + for (;;) + { + char *outbuf = wrbuf->buf + wrbuf->pos; + size_t outlen = wrbuf->size - wrbuf->pos; + if (yaz_iconv (t, &inbuf, &inlen, &outbuf, &outlen) == + (size_t)(-1) && yaz_iconv_error(t) != YAZ_ICONV_E2BIG) + { + /* bad data.
stop and skip conversion entirely */ + return -1; + } + else if (inlen == 0) + { /* finished converting */ + wrbuf->pos = wrbuf->size - outlen; + break; + } + else + { + /* buffer too small: make sure we expand buffer */ + wrbuf->pos = wrbuf->size - outlen; + wrbuf_grow(wrbuf, 20); + } + } + return 0; +} + +static void data1_iconv_s (data1_handle dh, NMEM m, data1_node *n, + yaz_iconv_t t, WRBUF wrbuf, const char *tocode) +{ + for (; n; n = n->next) + { + switch (n->which) + { + case DATA1N_data: + case DATA1N_comment: + if (conv_item (m, t, wrbuf, n->u.data.data, n->u.data.len) == 0) + { + n->u.data.data = + data1_insert_string_n (dh, n, m, wrbuf->buf, + wrbuf->pos); + n->u.data.len = wrbuf->pos; + } + break; + case DATA1N_tag: + if (conv_item (m, t, wrbuf, n->u.tag.tag, strlen(n->u.tag.tag)) + == 0) + { + n->u.tag.tag = + data1_insert_string_n (dh, n, m, + wrbuf->buf, wrbuf->pos); + } + if (n->u.tag.attributes) + { + data1_xattr *p; + for (p = n->u.tag.attributes; p; p = p->next) + { + if (p->value && + conv_item(m, t, wrbuf, p->value, strlen(p->value)) + == 0) + { + wrbuf_puts (wrbuf, ""); + p->value = nmem_strdup (m, wrbuf->buf); + } + } + } + break; + case DATA1N_preprocess: + if (strcmp(n->u.preprocess.target, "xml") == 0) + { + data1_xattr *p = n->u.preprocess.attributes; + for (; p; p = p->next) + if (strcmp (p->name, "encoding") == 0) + p->value = nmem_strdup (m, tocode); + } + break; + } + data1_iconv_s (dh, m, n->child, t, wrbuf, tocode); + } +} + +const char *data1_get_encoding (data1_handle dh, data1_node *n) +{ /* pick the encoding name for the tree at n: xml header attribute, then absyn setting, then a fixed default */ + /* see if we have an xml header that specifies encoding */ + if (n && n->child && n->child->which == DATA1N_preprocess && + strcmp (n->child->u.preprocess.target, "xml") == 0) + { + data1_xattr *xp = n->child->u.preprocess.attributes; + for (; xp; xp = xp->next) + if (strcmp (xp->name, "encoding") == 0) /* was "!strcmp(..) == 0", which matched every attribute EXCEPT encoding */ + return xp->value; + } + /* no encoding in header, so see if "encoding" was specified for abs */ + if (n && n->which == DATA1N_root && +
n->u.root.absyn && n->u.root.absyn->encoding) + return n->u.root.absyn->encoding; + /* none of above, return a hard coded default */ + return "ISO-8859-1"; +} + +int data1_iconv (data1_handle dh, NMEM m, data1_node *n, + const char *tocode, + const char *fromcode) +{ /* convert the strings of tree n from fromcode to tocode; returns 0 on success or when both names are equal, -1 when no converter can be opened */ + if (strcmp (tocode, fromcode)) + { + yaz_iconv_t t = yaz_iconv_open (tocode, fromcode); + WRBUF wrbuf = t ? wrbuf_alloc() : 0; /* allocate only once the converter exists, so the failure return below leaks nothing */ + if (!t) + return -1; + data1_iconv_s (dh, m, n, t, wrbuf, tocode); + yaz_iconv_close (t); + wrbuf_free (wrbuf, 1); + } + return 0; +}