X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fmod_dom.c;h=b1555d7add25712f74d73edfc626957f16e4faba;hb=cf66499bac7c49c5bdd363a2c927295fa92f547a;hp=02c7eb00e731884769ba69ae47e67572915560e2;hpb=62c8013389b7d982b7423a89880154320b42e2d6;p=idzebra-moved-to-github.git diff --git a/index/mod_dom.c b/index/mod_dom.c index 02c7eb0..b1555d7 100644 --- a/index/mod_dom.c +++ b/index/mod_dom.c @@ -1,4 +1,5 @@ -/* $Id: mod_dom.c,v 1.21 2007-02-26 16:12:24 adam Exp $ + +/* $Id: mod_dom.c,v 1.30 2007-03-07 14:18:35 marc Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -108,6 +109,7 @@ struct filter_info { struct filter_retrieve *retrieve_list; struct filter_input *input_list; struct filter_store *store; + int record_info_invoked; }; @@ -183,6 +185,7 @@ static void *filter_init(Res res, RecType recType) tinfo->input_list = 0; tinfo->store = 0; tinfo->doc_config = 0; + tinfo->record_info_invoked = 0; #if YAZ_HAVE_EXSLT exsltRegisterAll(); @@ -331,6 +334,7 @@ static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr, } static ZEBRA_RES perform_convert(struct filter_info *tinfo, + struct recExtractCtrl *extctr, struct convert_s *convert, const char **params, xmlDocPtr *doc, @@ -338,12 +342,34 @@ static ZEBRA_RES perform_convert(struct filter_info *tinfo, { for (; convert; convert = convert->next) { + xmlChar *buf_out = 0; + int len_out = 0; xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp, *doc, params); if (last_xsp) *last_xsp = convert->stylesheet_xsp; + + if (!res_doc) + break; + + /* now saving into buffer and re-reading into DOM to avoid annoing + XSLT problem with thrown-out indentation text nodes */ + xsltSaveResultToString(&buf_out, &len_out, res_doc, + convert->stylesheet_xsp); + xmlFreeDoc(res_doc); + xmlFreeDoc(*doc); - *doc = res_doc; + + *doc = xmlParseMemory((const char *) buf_out, len_out); + + /* writing debug info out */ + if (extctr && extctr->flagShowRecords) + yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s", + tinfo->fname ? tinfo->fname : "(none)", + convert->stylesheet, + len_out, buf_out); + + xmlFree(buf_out); } return ZEBRA_OK; } @@ -690,96 +716,114 @@ static void index_value_of(struct filter_info *tinfo, xmlNodePtr node, xmlChar * index_p) { - xmlChar *text = xmlNodeGetContent(node); - size_t text_len = strlen((const char *)text); - - yaz_log(YLOG_LOG, "Indexing :%.*s:", text_len, text); - - /* if there is no text, we do not need to proceed */ - if (text_len) - { - xmlChar *look = index_p; - xmlChar *bval; - xmlChar *eval; - - xmlChar index[256]; - xmlChar type[256]; + if (tinfo->record_info_invoked == 1) + { + xmlChar *text = xmlNodeGetContent(node); + size_t text_len = strlen((const char *)text); + + /* if there is no text, we do not need to proceed */ + if (text_len) + { + xmlChar *look = index_p; + xmlChar *bval; + xmlChar *eval; + + xmlChar index[256]; + xmlChar type[256]; - /* assingning text to be indexed */ - recword->term_buf = (const char *)text; - recword->term_len = text_len; + /* assingning text to be indexed */ + recword->term_buf = (const char *)text; + recword->term_len = text_len; - /* parsing all index name/type pairs */ - /* may not start with ' ' or ':' */ - while (*look && ' ' != *look && ':' != *look) - { - /* setting name and type to zero */ - *index = '\0'; - *type = '\0'; - - /* parsing one index name */ - bval = look; - while (*look && ':' != *look && ' ' != *look) + /* parsing all index name/type pairs */ + /* may not start with ' ' or ':' */ + while (*look && ' ' != *look && ':' != *look) { - look++; - } - eval = look; - strncpy((char *)index, (const char *)bval, eval - bval); - index[eval - bval] = '\0'; - + /* setting name and type to zero */ + *index = '\0'; + *type = '\0'; - /* parsing one index type, if existing */ - if (':' == *look) - { - look++; - + /* parsing one index name */ bval = look; - while (*look && ' ' != *look) + while (*look && ':' != *look && ' ' != *look) { look++; } eval = look; - strncpy((char *)type, (const char *)bval, eval - bval); - type[eval - bval] = '\0'; - } - - /* actually indexing the text given */ - dom_log(YLOG_DEBUG, tinfo, 0, - "INDEX '%s:%s' '%s'", - index ? (const char *) index : "null", - type ? (const char *) type : "null", - text ? (const char *) text : "null"); - - recword->index_name = (const char *)index; - if (type && *type) - recword->index_type = *type; - (extctr->tokenAdd)(recword); + strncpy((char *)index, (const char *)bval, eval - bval); + index[eval - bval] = '\0'; + + + /* parsing one index type, if existing */ + if (':' == *look) + { + look++; + + bval = look; + while (*look && ' ' != *look) + { + look++; + } + eval = look; + strncpy((char *)type, (const char *)bval, eval - bval); + type[eval - bval] = '\0'; + } - /* eat whitespaces */ - if (*look && ' ' == *look && *(look+1)) - { - look++; - } + /* actually indexing the text given */ + dom_log(YLOG_DEBUG, tinfo, 0, + "INDEX '%s:%s' '%s'", + index ? (const char *) index : "null", + type ? (const char *) type : "null", + text ? (const char *) text : "null"); + + recword->index_name = (const char *)index; + if (type && *type) + recword->index_type = *type; + + /* writing debug out */ + if (extctr->flagShowRecords) + dom_log(YLOG_LOG, tinfo, 0, + "INDEX '%s:%s' '%s'", + index ? (const char *) index : "null", + type ? (const char *) type : "null", + text ? (const char *) text : "null"); + + /* actually indexing the text given */ + recword->index_name = (const char *)index; + if (type && *type) + recword->index_type = *type; + (extctr->tokenAdd)(recword); + + /* eat whitespaces */ + if (*look && ' ' == *look && *(look+1)) + { + look++; + } + } } + xmlFree(text); } - - xmlFree(text); } /* DOM filter style indexing */ static void set_record_info(struct filter_info *tinfo, struct recExtractCtrl *extctr, + xmlNodePtr node, xmlChar * id_p, xmlChar * rank_p, xmlChar * type_p) { - dom_log(YLOG_DEBUG, tinfo, 0, - "RECORD id=%s rank=%s type=%s", - id_p ? (const char *) id_p : "null", - rank_p ? (const char *) rank_p : "null", - type_p ? (const char *) type_p : "null"); + + /* writing debug info out */ + if (extctr->flagShowRecords) + dom_log(YLOG_LOG, tinfo, 0, + "RECORD id=%s rank=%s type=%s", + id_p ? (const char *) id_p : "(null)", + rank_p ? (const char *) rank_p : "(null)", + type_p ? (const char *) type_p : "(null)"); + if (id_p) sscanf((const char *)id_p, "%255s", extctr->match_criteria); @@ -793,6 +837,12 @@ static void set_record_info(struct filter_info *tinfo, /* else */ /* dom_log(YLOG_WARN, tinfo, ptr, "dom filter: unknown record type '%s'", */ /* type_str); */ + if (tinfo->record_info_invoked == 1) + { + /* warn about multiple only once */ + dom_log(YLOG_WARN, tinfo, node, "multiple record elements"); + } + tinfo->record_info_invoked++; } @@ -854,7 +904,7 @@ static void process_xml_element_zebra_node(struct filter_info *tinfo, attr->name); } } - set_record_info(tinfo, extctr, id_p, rank_p, type_p); + set_record_info(tinfo, extctr, node, id_p, rank_p, type_p); } else { @@ -938,7 +988,7 @@ static void process_xml_pi_node(struct filter_info *tinfo, pi_p, look); } else - set_record_info(tinfo, extctr, id, rank, 0); + set_record_info(tinfo, extctr, node, id, rank, 0); } /* parsing index instruction */ @@ -1004,20 +1054,10 @@ static void extract_dom_doc_node(struct filter_info *tinfo, struct recExtractCtrl *extctr, xmlDocPtr doc) { - xmlChar *buf_out; - int len_out; - /* only need to do the initialization once, reuse recword for all terms */ RecWord recword; (*extctr->init)(extctr, &recword); - if (extctr->flagShowRecords) - { - xmlDocDumpMemory(doc, &buf_out, &len_out); - fwrite(buf_out, len_out, 1, stdout); - xmlFree(buf_out); - } - process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc); } @@ -1036,27 +1076,35 @@ static int convert_extract_doc(struct filter_info *tinfo, xsltStylesheetPtr last_xsp = 0; xmlDocPtr store_doc = 0; + /* per default do not ingest record */ + tinfo->record_info_invoked = 0; + + /* exit if empty document given */ + if (!doc) + return RECCTRL_EXTRACT_SKIP; + + /* we actuallu have a document which needs to be processed further */ params[0] = 0; set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record); /* input conversion */ - perform_convert(tinfo, input->convert, params, &doc, 0); + perform_convert(tinfo, p, input->convert, params, &doc, 0); if (tinfo->store) { /* store conversion */ store_doc = xmlCopyDoc(doc, 1); - perform_convert(tinfo, tinfo->store->convert, + perform_convert(tinfo, p, tinfo->store->convert, params, &store_doc, &last_xsp); } + /* saving either store doc or original doc in case no store doc exists */ if (last_xsp) xsltSaveResultToString(&buf_out, &len_out, store_doc ? store_doc : doc, last_xsp); else xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out); - if (p->flagShowRecords) - fwrite(buf_out, len_out, 1, stdout); + (*p->setStoreData)(p, buf_out, len_out); xmlFree(buf_out); @@ -1064,15 +1112,18 @@ static int convert_extract_doc(struct filter_info *tinfo, xmlFreeDoc(store_doc); /* extract conversion */ - perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0); + perform_convert(tinfo, p, tinfo->extract->convert, params, &doc, 0); + /* finally, do the indexing */ - if (doc) - { + if (doc){ extract_dom_doc_node(tinfo, p, doc); - /* extract_doc_alvis(tinfo, p, doc); */ xmlFreeDoc(doc); } + + /* there was nothing to index, so there is no inserted/updated record */ + if (tinfo->record_info_invoked == 0) + return RECCTRL_EXTRACT_SKIP; return RECCTRL_EXTRACT_OK; } @@ -1091,8 +1142,9 @@ static int extract_xml_split(struct filter_info *tinfo, p /* I/O handler */, 0 /* URL */, 0 /* encoding */, - XML_PARSE_XINCLUDE| - XML_PARSE_NOENT); + XML_PARSE_XINCLUDE + | XML_PARSE_NOENT + | XML_PARSE_NONET); } if (!input->u.xmlreader.reader) return RECCTRL_EXTRACT_ERROR_GENERIC; @@ -1102,18 +1154,37 @@ static int extract_xml_split(struct filter_info *tinfo, { int type = xmlTextReaderNodeType(input->u.xmlreader.reader); int depth = xmlTextReaderDepth(input->u.xmlreader.reader); + if (type == XML_READER_TYPE_ELEMENT && input->u.xmlreader.split_level == depth) { - xmlNodePtr ptr - = xmlTextReaderExpand(input->u.xmlreader.reader); + xmlNodePtr ptr; + + /* per default do not ingest record */ + tinfo->record_info_invoked = 0; + + ptr = xmlTextReaderExpand(input->u.xmlreader.reader); if (ptr) - { + { + /* we have a new document */ + xmlNodePtr ptr2 = xmlCopyNode(ptr, 1); xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0"); xmlDocSetRootElement(doc, ptr2); + /* writing debug info out */ + if (p->flagShowRecords) + { + xmlChar *buf_out = 0; + int len_out = 0; + xmlDocDumpMemory(doc, &buf_out, &len_out); + yaz_log(YLOG_LOG, "%s: XMLREADER depth: %i\n%.*s", + tinfo->fname ? tinfo->fname : "(none)", + depth, len_out, buf_out); + xmlFree(buf_out); + } + return convert_extract_doc(tinfo, input, p, doc); } else @@ -1140,7 +1211,9 @@ static int extract_xml_full(struct filter_info *tinfo, p /* I/O handler */, 0 /* URL */, 0 /* encoding */, - XML_PARSE_XINCLUDE|XML_PARSE_NOENT); + XML_PARSE_XINCLUDE + | XML_PARSE_NOENT + | XML_PARSE_NONET); if (!doc) { return RECCTRL_EXTRACT_ERROR_GENERIC; @@ -1299,7 +1372,7 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */, 0 /* URL */, 0 /* encoding */, - XML_PARSE_XINCLUDE|XML_PARSE_NOENT); + XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET); if (!doc) { p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; @@ -1307,7 +1380,7 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) } /* retrieve conversion */ - perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp); + perform_convert(tinfo, 0, retrieve->convert, params, &doc, &last_xsp); if (!doc) { p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;