X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=blobdiff_plain;f=src%2Fcclfind.c;h=12dca7923a32a0fdb69bd6f36d82fdc96640f8b2;hp=2ebd11f3073f21a24c9317e783ae0de50257ab06;hb=f60a3fd863c6a5a8ec97c4918f5cad4a117a8f7d;hpb=78b138b80fadc817d621f0a692fca857ee31399b diff --git a/src/cclfind.c b/src/cclfind.c index 2ebd11f..12dca79 100644 --- a/src/cclfind.c +++ b/src/cclfind.c @@ -1,47 +1,7 @@ -/* - * Copyright (c) 1995, the EUROPAGATE consortium (see below). - * - * The EUROPAGATE consortium members are: - * - * University College Dublin - * Danmarks Teknologiske Videnscenter - * An Chomhairle Leabharlanna - * Consejo Superior de Investigaciones Cientificas - * - * Permission to use, copy, modify, distribute, and sell this software and - * its documentation, in whole or in part, for any purpose, is hereby granted, - * provided that: - * - * 1. This copyright and permission notice appear in all copies of the - * software and its documentation. Notices of copyright or attribution - * which appear at the beginning of any file must remain unchanged. - * - * 2. The names of EUROPAGATE or the project partners may not be used to - * endorse or promote products derived from this software without specific - * prior written permission. - * - * 3. Users of this software (implementors and gateway operators) agree to - * inform the EUROPAGATE consortium of their use of the software. This - * information will be used to evaluate the EUROPAGATE project and the - * software, and to plan further developments. The consortium may use - * the information in later publications. - * - * 4. Users of this software agree to make their best efforts, when - * documenting their use of the software, to acknowledge the EUROPAGATE - * consortium, and the role played by the software in their work. 
- * - * THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND, - * EXPRESS, IMPLIED, OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY - * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. - * IN NO EVENT SHALL THE EUROPAGATE CONSORTIUM OR ITS MEMBERS BE LIABLE - * FOR ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF - * ANY KIND, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA - * OR PROFITS, WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND - * ON ANY THEORY OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE - * USE OR PERFORMANCE OF THIS SOFTWARE. - * +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2012 Index Data + * See the file LICENSE for details. */ - /** * \file cclfind.c * \brief Implements parsing of a CCL FIND query. @@ -51,67 +11,13 @@ * of lookahead in the handling of relational operations.. So * it's not really pure. */ - - -/* CCL find (to rpn conversion) - * Europagate, 1995 - * - * $Id: cclfind.c,v 1.12 2007-04-30 11:33:49 adam Exp $ - * - * Old Europagate log: - * - * Revision 1.16 1996/01/08 08:41:13 adam - * Removed unused function. - * - * Revision 1.15 1995/07/20 08:14:34 adam - * Qualifiers were observed too often. Instead tokens are treated as - * qualifiers only when separated by comma. - * - * Revision 1.14 1995/05/16 09:39:26 adam - * LICENSE. - * - * Revision 1.13 1995/04/17 09:31:42 adam - * Improved handling of qualifiers. Aliases or reserved words. - * - * Revision 1.12 1995/03/20 15:27:43 adam - * Minor changes. - * - * Revision 1.11 1995/02/23 08:31:59 adam - * Changed header. - * - * Revision 1.9 1995/02/16 13:20:06 adam - * Spell fix. - * - * Revision 1.8 1995/02/14 19:59:42 adam - * Removed a syntax error. - * - * Revision 1.7 1995/02/14 19:55:10 adam - * Header files ccl.h/cclp.h are gone! They have been merged an - * moved to ../include/ccl.h. - * Node kind(s) in ccl_rpn_node have changed names. 
- * - * Revision 1.6 1995/02/14 16:20:55 adam - * Qualifiers are read from a file now. - * - * Revision 1.5 1995/02/14 14:12:41 adam - * Ranges for ordered qualfiers implemented (e.g. pd=1980-1990). - * - * Revision 1.4 1995/02/14 13:16:29 adam - * Left and/or right truncation implemented. - * - * Revision 1.3 1995/02/14 10:25:56 adam - * The constructions 'qualifier rel term ...' implemented. - * - * Revision 1.2 1995/02/13 15:15:07 adam - * Added handling of qualifiers. Not finished yet. - * - * Revision 1.1 1995/02/13 12:35:20 adam - * First version of CCL. Qualifiers aren't handled yet. - * - */ +#if HAVE_CONFIG_H +#include +#endif #include #include +#include #include "cclp.h" @@ -199,6 +105,7 @@ struct ccl_rpn_node *ccl_rpn_node_create(enum ccl_rpn_kind kind) case CCL_RPN_TERM: p->u.t.attr_list = 0; p->u.t.term = 0; + p->u.t.qual = 0; break; default: break; @@ -225,6 +132,7 @@ void ccl_rpn_delete(struct ccl_rpn_node *rpn) break; case CCL_RPN_TERM: xfree(rpn->u.t.term); + xfree(rpn->u.t.qual); for (attr = rpn->u.t.attr_list; attr; attr = attr1) { attr1 = attr->next; @@ -304,6 +212,112 @@ void ccl_add_attr_string(struct ccl_rpn_node *p, const char *set, n->value.str = xstrdup(value); } +static size_t cmp_operator(const char **aliases, const char *input) +{ + for (; *aliases; aliases++) + { + const char *cp = *aliases; + size_t i; + for (i = 0; *cp && *cp == input[i]; i++, cp++) + ; + if (*cp == '\0') + return i; + } + return 0; +} + + +#define REGEX_CHARS "^[]{}()|.*+?!$" +#define CCL_CHARS "#?\\" +static int append_term(CCL_parser cclp, const char *src_str, size_t src_len, + char *dst_term, int *regex_trunc, int *z3958_trunc, + const char **truncation_aliases, + int is_first, int is_last, + int *left_trunc, int *right_trunc) +{ + size_t j; + int quote_mode = 0; + + for (j = 0; j < src_len; j++) + { + size_t op_size; + if (j > 0 && src_str[j-1] == '\\') + { + if (*regex_trunc && strchr(REGEX_CHARS "\\", src_str[j])) + { + *regex_trunc = 2; + strcat(dst_term, 
"\\"); + } + else if (*z3958_trunc && strchr(CCL_CHARS "\\", src_str[j])) + { + *z3958_trunc = 2; + strcat(dst_term, "\\"); + } + strxcat(dst_term, src_str + j, 1); + } + else if (src_str[j] == '"') + quote_mode = !quote_mode; + else if (!quote_mode && + (op_size = cmp_operator(truncation_aliases, + src_str + j)) + ) + { + j += (op_size - 1); /* j++ in for loop */ + if (*regex_trunc) + { + strcat(dst_term, ".*"); + *regex_trunc = 2; /* regex trunc is really needed */ + } + else if (*z3958_trunc) + { + strcat(dst_term, "?"); + *z3958_trunc = 2; + } + else if (is_first && j == 0) + *left_trunc = 1; + else if (is_last && j == src_len - 1) + *right_trunc = 1; + else + { + cclp->error_code = CCL_ERR_TRUNC_NOT_EMBED; + return -1; + } + } + else if (!quote_mode && src_str[j] == '#') + { + if (*regex_trunc) + { + strcat(dst_term, "."); + *regex_trunc = 2; /* regex trunc is really needed */ + } + else if (*z3958_trunc) + { + strcat(dst_term, "#"); + *z3958_trunc = 2; + } + else + { + cclp->error_code = CCL_ERR_TRUNC_NOT_SINGLE; + return -1; + } + } + else if (src_str[j] != '\\') + { + if (*regex_trunc && strchr(REGEX_CHARS, src_str[j])) + { + *regex_trunc = 2; + strcat(dst_term, "\\"); + } + else if (*z3958_trunc && strchr(CCL_CHARS, src_str[j])) + { + *z3958_trunc = 2; + strcat(dst_term, "\\"); + } + strxcat(dst_term, src_str + j, 1); + } + } + return 0; +} /** * search_term: Parse CCL search term. 
@@ -320,33 +334,42 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, struct ccl_rpn_node *p_top = 0; struct ccl_token *lookahead = cclp->look_token; int and_list = 0; + int auto_group = 0; int or_list = 0; char *attset; - const char *truncation_aliases; + const char **truncation_aliases; + const char *t_default[2]; truncation_aliases = ccl_qual_search_special(cclp->bibset, "truncation"); if (!truncation_aliases) - truncation_aliases = "?"; + { + truncation_aliases = t_default; + t_default[0] = "?"; + t_default[1] = 0; + } if (qual_val_type(qa, CCL_BIB1_STR, CCL_BIB1_STR_AND_LIST, 0)) and_list = 1; + if (qual_val_type(qa, CCL_BIB1_STR, CCL_BIB1_STR_AUTO_GROUP, 0)) + auto_group = 1; if (qual_val_type(qa, CCL_BIB1_STR, CCL_BIB1_STR_OR_LIST, 0)) or_list = 1; while (1) { struct ccl_rpn_node *p; size_t no, i; - int no_spaces = 0; - int left_trunc = 0; - int right_trunc = 0; - int mid_trunc = 0; + int is_phrase = 0; int relation_value = -1; int position_value = -1; int structure_value = -1; int truncation_value = -1; int completeness_value = -1; int len = 0; + int left_trunc = 0; + int right_trunc = 0; + int regex_trunc = 0; + int z3958_trunc = 0; size_t max = 200; if (and_list || or_list || !multi) max = 1; @@ -358,56 +381,39 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, ADVANCE; continue; } - /* go through each TERM token. If no truncation attribute is yet - met, then look for left/right truncation markers (?) 
and - set left_trunc/right_trunc/mid_trunc accordingly */ for (no = 0; no < max && is_term_ok(lookahead->kind, term_list); no++) { + int this_is_phrase = 0; for (i = 0; i<lookahead->len; i++) if (lookahead->name[i] == ' ') - no_spaces++; - else if (strchr(truncation_aliases, lookahead->name[i])) - { - if (no == 0 && i == 0 && lookahead->len >= 1) - left_trunc = 1; - else if (!is_term_ok(lookahead->next->kind, term_list) && - i == lookahead->len-1 && i >= 1) - right_trunc = 1; - else - mid_trunc = 1; - } + this_is_phrase = 1; + + if (auto_group) + { + if (no > 0 && (is_phrase || is_phrase != this_is_phrase)) + break; + is_phrase = this_is_phrase; + } + else if (this_is_phrase || no > 0) + is_phrase = 1; len += 1+lookahead->len+lookahead->ws_prefix_len; lookahead = lookahead->next; } if (len == 0) break; /* no more terms . stop . */ - - - if (p_top) - { - if (or_list) - p = ccl_rpn_node_create(CCL_RPN_OR); - else if (and_list) - p = ccl_rpn_node_create(CCL_RPN_AND); - else - p = ccl_rpn_node_create(CCL_RPN_AND); - p->u.p[0] = p_top; - p_top = p; - } /* create the term node, but wait a moment before adding the term */ p = ccl_rpn_node_create(CCL_RPN_TERM); p->u.t.attr_list = NULL; p->u.t.term = NULL; + if (qa && qa[0]) + { + const char *n = ccl_qual_get_name(qa[0]); + if (n) + p->u.t.qual = xstrdup(n); + } - /* make the top node point to us.. 
*/ - if (p_top) - p_top->u.p[1] = p; - else - p_top = p; - - /* go through all attributes and add them to the attribute list */ for (i=0; qa && qa[i]; i++) { @@ -444,7 +450,6 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, if (truncation_value != -1) continue; truncation_value = attr->value.numeric; - left_trunc = right_trunc = mid_trunc = 0; break; case CCL_BIB1_COM: if (completeness_value != -1) @@ -457,49 +462,72 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, } } } - /* len now holds the number of characters in the RPN term */ - /* no holds the number of CCL tokens (1 or more) */ - - if (structure_value == -1 && - qual_val_type(qa, CCL_BIB1_STR, CCL_BIB1_STR_WP, &attset)) - { /* no structure attribute met. Apply either structure attribute - WORD or PHRASE depending on number of CCL tokens */ - if (no == 1 && no_spaces == 0) + attset = 0; + if (structure_value == -1 && ( + auto_group || + qual_val_type(qa, CCL_BIB1_STR, CCL_BIB1_STR_WP, &attset)) + ) + { + if (!is_phrase) ccl_add_attr_numeric(p, attset, CCL_BIB1_STR, 2); else ccl_add_attr_numeric(p, attset, CCL_BIB1_STR, 1); } + if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_REGEX, + &attset)) + { + regex_trunc = 1; /* regex trunc (102) allowed */ + } + else if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_Z3958, + &attset)) + { + z3958_trunc = 1; /* Z39.58 trunc (CCL) trunc allowed */ + } + /* make the RPN token */ - p->u.t.term = (char *)xmalloc(len); + p->u.t.term = (char *)xmalloc(len * 2 + 2); ccl_assert(p->u.t.term); p->u.t.term[0] = '\0'; for (i = 0; i<no; i++) { const char *src_str = cclp->look_token->name; - int src_len = cclp->look_token->len; - - if (i == 0 && left_trunc) + size_t src_len = cclp->look_token->len; + + if (p->u.t.term[0] && cclp->look_token->ws_prefix_len) { - src_len--; - src_str++; + strxcat(p->u.t.term, cclp->look_token->ws_prefix_buf, + cclp->look_token->ws_prefix_len); } - if (i == no-1 && right_trunc) - src_len--; -#if 0 - fprintf(stderr, "[%s %.*s]", - ccl_qual_get_name(qa[0]), 
src_len, src_str); -#endif - if (i && cclp->look_token->ws_prefix_len) + if (append_term(cclp, src_str, src_len, p->u.t.term, &regex_trunc, + &z3958_trunc, truncation_aliases, i == 0, i == no - 1, + &left_trunc, &right_trunc)) { - size_t len = strlen(p->u.t.term); - memcpy(p->u.t.term + len, cclp->look_token->ws_prefix_buf, - cclp->look_token->ws_prefix_len); - p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0'; + ccl_rpn_delete(p); + return NULL; } - strxcat(p->u.t.term, src_str, src_len); ADVANCE; } + /* make the top node point to us.. */ + if (p_top) + { + struct ccl_rpn_node *tmp; + + if (or_list) + tmp = ccl_rpn_node_create(CCL_RPN_OR); + else if (and_list) + tmp = ccl_rpn_node_create(CCL_RPN_AND); + else + tmp = ccl_rpn_node_create(CCL_RPN_AND); + tmp->u.p[0] = p_top; + tmp->u.p[1] = p; + + p_top = tmp; + } + else + p_top = p; + + if (left_trunc && right_trunc) { if (!qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_BOTH, @@ -533,6 +561,14 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, } ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 2); } + else if (regex_trunc == 2) + { + ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 102); + } + else if (z3958_trunc == 2) + { + ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 104); + } else { if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_NONE, @@ -553,6 +589,37 @@ static struct ccl_rpn_node *search_term(CCL_parser cclp, ccl_qualifier_t *qa) return search_term_x(cclp, qa, list, 0); } + +static struct ccl_rpn_node *search_terms2(CCL_parser cclp, + ccl_qualifier_t *qa) +{ + if (KIND == CCL_TOK_LP) + { + struct ccl_rpn_node *p; + ADVANCE; + if (!(p = find_spec(cclp, qa))) + return NULL; + if (KIND != CCL_TOK_RP) + { + cclp->error_code = CCL_ERR_RP_EXPECTED; + ccl_rpn_delete(p); + return NULL; + } + ADVANCE; + return p; + } + else + { + static int list[] = { + CCL_TOK_TERM, CCL_TOK_COMMA,CCL_TOK_EQ, + CCL_TOK_REL, CCL_TOK_SET, -1}; + + return search_term_x(cclp, qa, list, 1); + } +} + + + static struct 
ccl_rpn_node *qualifiers_order(CCL_parser cclp, ccl_qualifier_t *ap, char *attset) @@ -695,20 +762,6 @@ struct ccl_rpn_node *qualifiers_order(CCL_parser cclp, ccl_add_attr_numeric(p, attset, CCL_BIB1_REL, 2); return p; } - else if (KIND == CCL_TOK_LP) - { - ADVANCE; - if (!(p = find_spec(cclp, ap))) - return NULL; - if (KIND != CCL_TOK_RP) - { - cclp->error_code = CCL_ERR_RP_EXPECTED; - ccl_rpn_delete(p); - return NULL; - } - ADVANCE; - return p; - } else { if (!(p = search_terms(cclp, ap))) @@ -724,7 +777,6 @@ static struct ccl_rpn_node *qualifier_relation(CCL_parser cclp, ccl_qualifier_t *ap) { char *attset; - struct ccl_rpn_node *p; if (qual_val_type(ap, CCL_BIB1_REL, CCL_BIB1_REL_ORDER, &attset) || qual_val_type(ap, CCL_BIB1_REL, CCL_BIB1_REL_PORDER, &attset)) @@ -737,24 +789,7 @@ struct ccl_rpn_node *qualifier_relation(CCL_parser cclp, ccl_qualifier_t *ap) return NULL; } ADVANCE; - if (KIND == CCL_TOK_LP) - { - ADVANCE; - if (!(p = find_spec(cclp, ap))) - { - return NULL; - } - if (KIND != CCL_TOK_RP) - { - cclp->error_code = CCL_ERR_RP_EXPECTED; - ccl_rpn_delete(p); - return NULL; - } - ADVANCE; - } - else - p = search_terms(cclp, ap); - return p; + return search_terms(cclp, ap); } /** @@ -764,14 +799,15 @@ struct ccl_rpn_node *qualifier_relation(CCL_parser cclp, ccl_qualifier_t *ap) * qa: Qualifier attributes already applied. * return: pointer to node(s); NULL on error. 
*/ -static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, struct ccl_token *la, - ccl_qualifier_t *qa) +static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, + struct ccl_token *la, + ccl_qualifier_t *qa) { struct ccl_token *lookahead = cclp->look_token; struct ccl_token *look_start = cclp->look_token; ccl_qualifier_t *ap; struct ccl_rpn_node *node = 0; - const char *field_str; + const char **field_str; int no = 0; int seq = 0; int i; @@ -795,9 +831,9 @@ static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, struct ccl_token *la field_str = ccl_qual_search_special(cclp->bibset, "field"); if (field_str) { - if (!strcmp(field_str, "or")) + if (!strcmp(field_str[0], "or")) mode_merge = 0; - else if (!strcmp(field_str, "merge")) + else if (!strcmp(field_str[0], "merge")) mode_merge = 1; } if (!mode_merge) @@ -920,9 +956,10 @@ static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, struct ccl_token *la static struct ccl_rpn_node *search_terms(CCL_parser cclp, ccl_qualifier_t *qa) { static int list[] = { - CCL_TOK_TERM, CCL_TOK_COMMA,CCL_TOK_EQ, CCL_TOK_REL, CCL_TOK_SET, -1}; + CCL_TOK_TERM, CCL_TOK_COMMA,CCL_TOK_EQ, + CCL_TOK_REL, CCL_TOK_SET, -1}; struct ccl_rpn_node *p1, *p2, *pn; - p1 = search_term_x(cclp, qa, list, 1); + p1 = search_terms2(cclp, qa); if (!p1) return NULL; while (1) @@ -940,7 +977,7 @@ static struct ccl_rpn_node *search_terms(CCL_parser cclp, ccl_qualifier_t *qa) p_prox->u.t.attr_list = 0; ADVANCE; - p2 = search_term_x(cclp, qa, list, 1); + p2 = search_terms2(cclp, qa); if (!p2) { ccl_rpn_delete(p1); @@ -954,7 +991,7 @@ static struct ccl_rpn_node *search_terms(CCL_parser cclp, ccl_qualifier_t *qa) } else if (is_term_ok(KIND, list)) { - p2 = search_term_x(cclp, qa, list, 1); + p2 = search_terms2(cclp, qa); if (!p2) { ccl_rpn_delete(p1); @@ -983,22 +1020,7 @@ static struct ccl_rpn_node *search_elements(CCL_parser cclp, { struct ccl_rpn_node *p1; struct ccl_token *lookahead; - if (KIND == CCL_TOK_LP) - { - ADVANCE; - p1 = 
find_spec(cclp, qa); - if (!p1) - return NULL; - if (KIND != CCL_TOK_RP) - { - cclp->error_code = CCL_ERR_RP_EXPECTED; - ccl_rpn_delete(p1); - return NULL; - } - ADVANCE; - return p1; - } - else if (KIND == CCL_TOK_SET) + if (KIND == CCL_TOK_SET) { ADVANCE; if (KIND == CCL_TOK_EQ) @@ -1024,7 +1046,7 @@ static struct ccl_rpn_node *search_elements(CCL_parser cclp, break; lookahead = lookahead->next; } - if (qa) + if (qa || lookahead->kind == CCL_TOK_LP) return search_terms(cclp, qa); else { @@ -1188,9 +1210,11 @@ struct ccl_rpn_node *ccl_find_str(CCL_bibset bibset, const char *str, ccl_token_del(list); return p; } + /* * Local variables: * c-basic-offset: 4 + * c-file-style: "Stroustrup" * indent-tabs-mode: nil * End: * vim: shiftwidth=4 tabstop=8 expandtab