X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=blobdiff_plain;f=src%2Fcclfind.c;h=a2ea081e8e105c38a85d8a4b55899551acc692ee;hp=6fd31447614c383b440c68fe51ec3adea6038f52;hb=c13416878ec0afc9d91bdd20deb20c82425e67b6;hpb=26e1223d0b98f56cf288fb8fdf1acaa15047db2b diff --git a/src/cclfind.c b/src/cclfind.c index 6fd3144..a2ea081 100644 --- a/src/cclfind.c +++ b/src/cclfind.c @@ -1,8 +1,8 @@ /* This file is part of the YAZ toolkit. - * Copyright (C) 1995-2011 Index Data + * Copyright (C) 1995-2013 Index Data * See the file LICENSE for details. */ -/** +/** * \file cclfind.c * \brief Implements parsing of a CCL FIND query. * @@ -61,7 +61,7 @@ static int qual_val_type(ccl_qualifier_t *qa, int type, int value, /** * strxcat: concatenate strings. - * n: Null-terminated Destination string + * n: Null-terminated Destination string * src: Source string to be appended (not null-terminated) * len: Length of source string. */ @@ -171,7 +171,7 @@ static struct ccl_rpn_attr *add_attr_node(struct ccl_rpn_node *p, const char *set, int type) { struct ccl_rpn_attr *n; - + n = (struct ccl_rpn_attr *)xmalloc(sizeof(*n)); ccl_assert(n); if (set) @@ -181,7 +181,7 @@ static struct ccl_rpn_attr *add_attr_node(struct ccl_rpn_node *p, n->type = type; n->next = p->u.t.attr_list; p->u.t.attr_list = n; - + return n; } @@ -212,10 +212,118 @@ void ccl_add_attr_string(struct ccl_rpn_node *p, const char *set, n->value.str = xstrdup(value); } +static size_t cmp_operator(const char **aliases, const char *input) +{ + for (; *aliases; aliases++) + { + const char *cp = *aliases; + size_t i; + for (i = 0; *cp && *cp == input[i]; i++, cp++) + ; + if (*cp == '\0') + return i; + } + return 0; +} + + +#define REGEX_CHARS "^[]{}()|.*+?!$" +#define CCL_CHARS "#?\\" +static int append_term(CCL_parser cclp, const char *src_str, size_t src_len, + char *dst_term, int *regex_trunc, int *z3958_trunc, + const char **truncation_aliases, + const char **mask_aliases, + int is_first, int is_last, + int 
*left_trunc, int *right_trunc) +{ + size_t j; + int quote_mode = 0; + + for (j = 0; j < src_len; j++) + { + size_t op_size; + if (j > 0 && src_str[j-1] == '\\') + { + if (*regex_trunc && strchr(REGEX_CHARS "\\", src_str[j])) + { + *regex_trunc = 2; + strcat(dst_term, "\\"); + } + else if (*z3958_trunc && strchr(CCL_CHARS "\\", src_str[j])) + { + *z3958_trunc = 2; + strcat(dst_term, "\\"); + } + strxcat(dst_term, src_str + j, 1); + } + else if (src_str[j] == '"') + quote_mode = !quote_mode; + else if (!quote_mode && + (op_size = cmp_operator(truncation_aliases, + src_str + j)) + ) + { + j += (op_size - 1); /* j++ in for loop */ + if (*regex_trunc) + { + strcat(dst_term, ".*"); + *regex_trunc = 2; /* regex trunc is really needed */ + } + else if (*z3958_trunc) + { + strcat(dst_term, "?"); + *z3958_trunc = 2; + } + else if (is_first && j == 0) + *left_trunc = 1; + else if (is_last && j == src_len - 1) + *right_trunc = 1; + else + { + cclp->error_code = CCL_ERR_TRUNC_NOT_EMBED; + return -1; + } + } + else if (!quote_mode && + (op_size = cmp_operator(mask_aliases, src_str + j))) + { + j += (op_size - 1); /* j++ in for loop */ + if (*regex_trunc) + { + strcat(dst_term, "."); + *regex_trunc = 2; /* regex trunc is really needed */ + } + else if (*z3958_trunc) + { + strcat(dst_term, "#"); + *z3958_trunc = 2; + } + else + { + cclp->error_code = CCL_ERR_TRUNC_NOT_SINGLE; + return -1; + } + } + else if (src_str[j] != '\\') + { + if (*regex_trunc && strchr(REGEX_CHARS, src_str[j])) + { + *regex_trunc = 2; + strcat(dst_term, "\\"); + } + else if (*z3958_trunc && strchr(CCL_CHARS, src_str[j])) + { + *z3958_trunc = 2; + strcat(dst_term, "\\"); + } + strxcat(dst_term, src_str + j, 1); + } + } + return 0; +} -#define REGEX_CHARS "^[]{}()|.*+?!\"$" /** - * search_term: Parse CCL search term. + * search_term: Parse CCL search term. * cclp: CCL Parser * qa: Qualifier attributes already applied. 
* term_list: tokens we accept as terms in context @@ -229,10 +337,13 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, struct ccl_rpn_node *p_top = 0; struct ccl_token *lookahead = cclp->look_token; int and_list = 0; + int auto_group = 0; int or_list = 0; char *attset; const char **truncation_aliases; const char *t_default[2]; + const char **mask_aliases; + const char *m_default[2]; truncation_aliases = ccl_qual_search_special(cclp->bibset, "truncation"); @@ -243,15 +354,27 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, t_default[1] = 0; } + mask_aliases = + ccl_qual_search_special(cclp->bibset, "mask"); + if (!mask_aliases) + { + mask_aliases = m_default; + m_default[0] = "#"; + m_default[1] = 0; + } + + if (qual_val_type(qa, CCL_BIB1_STR, CCL_BIB1_STR_AND_LIST, 0)) and_list = 1; + if (qual_val_type(qa, CCL_BIB1_STR, CCL_BIB1_STR_AUTO_GROUP, 0)) + auto_group = 1; if (qual_val_type(qa, CCL_BIB1_STR, CCL_BIB1_STR_OR_LIST, 0)) or_list = 1; while (1) { struct ccl_rpn_node *p; size_t no, i; - int no_spaces = 0; + int is_phrase = 0; int relation_value = -1; int position_value = -1; int structure_value = -1; @@ -261,10 +384,11 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, int left_trunc = 0; int right_trunc = 0; int regex_trunc = 0; + int z3958_trunc = 0; size_t max = 200; if (and_list || or_list || !multi) max = 1; - + /* ignore commas when dealing with and-lists .. */ if (and_list && lookahead && lookahead->kind == CCL_TOK_COMMA) { @@ -272,21 +396,28 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, ADVANCE; continue; } - /* go through each TERM token. If no truncation attribute is yet - met, then look for left/right truncation markers (?) 
and - set left_trunc/right_trunc/mid_trunc accordingly */ for (no = 0; no < max && is_term_ok(lookahead->kind, term_list); no++) { + int this_is_phrase = 0; for (i = 0; i<lookahead->len; i++) if (lookahead->name[i] == ' ') - no_spaces++; + this_is_phrase = 1; + + if (auto_group) + { + if (no > 0 && (is_phrase || is_phrase != this_is_phrase)) + break; + is_phrase = this_is_phrase; + } + else if (this_is_phrase || no > 0) + is_phrase = 1; len += 1+lookahead->len+lookahead->ws_prefix_len; lookahead = lookahead->next; } if (len == 0) break; /* no more terms . stop . */ - + /* create the term node, but wait a moment before adding the term */ p = ccl_rpn_node_create(CCL_RPN_TERM); p->u.t.attr_list = NULL; @@ -302,7 +433,7 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, for (i=0; qa && qa[i]; i++) { struct ccl_rpn_attr *attr; - + for (attr = ccl_qual_get_attr(qa[i]); attr; attr = attr->next) switch(attr->kind) { @@ -346,14 +477,13 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, } } } - /* len now holds the number of characters in the RPN term */ - /* no holds the number of CCL tokens (1 or more) */ - - if (structure_value == -1 && - qual_val_type(qa, CCL_BIB1_STR, CCL_BIB1_STR_WP, &attset)) - { /* no structure attribute met. 
Apply either structure attribute - WORD or PHRASE depending on number of CCL tokens */ - if (no == 1 && no_spaces == 0) + attset = 0; + if (structure_value == -1 && ( + auto_group || + qual_val_type(qa, CCL_BIB1_STR, CCL_BIB1_STR_WP, &attset)) + ) + { + if (!is_phrase) ccl_add_attr_numeric(p, attset, CCL_BIB1_STR, 2); else ccl_add_attr_numeric(p, attset, CCL_BIB1_STR, 1); @@ -364,6 +494,11 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, { regex_trunc = 1; /* regex trunc (102) allowed */ } + else if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_Z3958, + &attset)) + { + z3958_trunc = 1; /* Z39.58 trunc (CCL) trunc allowed */ + } /* make the RPN token */ p->u.t.term = (char *)xmalloc(len * 2 + 2); @@ -373,76 +508,22 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, { const char *src_str = cclp->look_token->name; size_t src_len = cclp->look_token->len; - int j; - int quote_mode = 0; if (p->u.t.term[0] && cclp->look_token->ws_prefix_len) { - size_t len = strlen(p->u.t.term); - memcpy(p->u.t.term + len, cclp->look_token->ws_prefix_buf, - cclp->look_token->ws_prefix_len); - p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0'; + strxcat(p->u.t.term, cclp->look_token->ws_prefix_buf, + cclp->look_token->ws_prefix_len); } - for (j = 0; j < src_len; j++) + if (append_term(cclp, src_str, src_len, p->u.t.term, &regex_trunc, + &z3958_trunc, truncation_aliases, mask_aliases, + i == 0, i == no - 1, + &left_trunc, &right_trunc)) { - if (j > 0 && src_str[j-1] == '\\') - { - if (regex_trunc && strchr(REGEX_CHARS "\\", src_str[j])) - { - regex_trunc = 2; - strcat(p->u.t.term, "\\\\"); - } - if (src_str[j] == '\\') - strcat(p->u.t.term, "\\"); - strxcat(p->u.t.term, src_str + j, 1); - } - else if (src_str[j] == '"') - quote_mode = !quote_mode; - else if (!quote_mode && src_str[j] == '?') - { - if (regex_trunc) - { - strcat(p->u.t.term, ".*"); - regex_trunc = 2; /* regex trunc is really needed */ - } - else if (i == 0 && j == 0) - left_trunc = 1; - else 
if (i == no - 1 && j == src_len - 1) - right_trunc = 1; - else - { - cclp->error_code = CCL_ERR_TRUNC_NOT_BOTH; - ccl_rpn_delete(p); - return NULL; - } - } - else if (!quote_mode && src_str[j] == '#') - { - if (regex_trunc) - { - strcat(p->u.t.term, "."); - regex_trunc = 2; /* regex trunc is really needed */ - } - else - { - cclp->error_code = CCL_ERR_TRUNC_NOT_BOTH; - ccl_rpn_delete(p); - return NULL; - } - } - else if (src_str[j] != '\\') - { - if (regex_trunc && strchr(REGEX_CHARS, src_str[j])) - { - regex_trunc = 2; - strcat(p->u.t.term, "\\\\"); - } - strxcat(p->u.t.term, src_str + j, 1); - } + ccl_rpn_delete(p); + return NULL; } ADVANCE; } - /* make the top node point to us.. */ if (p_top) { @@ -500,6 +581,10 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, { ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 102); } + else if (z3958_trunc == 2) + { + ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 104); + } else { if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_NONE, @@ -520,6 +605,37 @@ static struct ccl_rpn_node *search_term(CCL_parser cclp, ccl_qualifier_t *qa) return search_term_x(cclp, qa, list, 0); } + +static struct ccl_rpn_node *search_terms2(CCL_parser cclp, + ccl_qualifier_t *qa) +{ + if (KIND == CCL_TOK_LP) + { + struct ccl_rpn_node *p; + ADVANCE; + if (!(p = find_spec(cclp, qa))) + return NULL; + if (KIND != CCL_TOK_RP) + { + cclp->error_code = CCL_ERR_RP_EXPECTED; + ccl_rpn_delete(p); + return NULL; + } + ADVANCE; + return p; + } + else + { + static int list[] = { + CCL_TOK_TERM, CCL_TOK_COMMA,CCL_TOK_EQ, + CCL_TOK_REL, CCL_TOK_SET, -1}; + + return search_term_x(cclp, qa, list, 1); + } +} + + + static struct ccl_rpn_node *qualifiers_order(CCL_parser cclp, ccl_qualifier_t *ap, char *attset) @@ -564,7 +680,7 @@ struct ccl_rpn_node *qualifiers_order(CCL_parser cclp, if (cclp->look_token->name[i] == '-') break; } - + if (cclp->look_token->len > 1 && i == 0) { /* -xx*/ struct ccl_token *ntoken = ccl_token_add(cclp->look_token); @@ -606,7 
+722,7 @@ struct ccl_rpn_node *qualifiers_order(CCL_parser cclp, cclp->look_token->next->kind == CCL_TOK_TERM && cclp->look_token->next->len > 1 && cclp->look_token->next->name[0] == '-') - + { /* xx -yy */ /* we _know_ that xx does not have - in it */ struct ccl_token *ntoken = ccl_token_add(cclp->look_token); @@ -616,11 +732,11 @@ struct ccl_rpn_node *qualifiers_order(CCL_parser cclp, ntoken->len = 1; (ntoken->next->name)++; /* adjust yy */ - (ntoken->next->len)--; + (ntoken->next->len)--; } } } - + if (rel == 3 && KIND == CCL_TOK_TERM && cclp->look_token->next && cclp->look_token->next->len == 1 && @@ -633,7 +749,7 @@ struct ccl_rpn_node *qualifiers_order(CCL_parser cclp, if (KIND == CCL_TOK_TERM) /* = term - term ? */ { struct ccl_rpn_node *p2; - + if (!(p2 = search_term(cclp, ap))) { ccl_rpn_delete(p1); @@ -662,20 +778,6 @@ struct ccl_rpn_node *qualifiers_order(CCL_parser cclp, ccl_add_attr_numeric(p, attset, CCL_BIB1_REL, 2); return p; } - else if (KIND == CCL_TOK_LP) - { - ADVANCE; - if (!(p = find_spec(cclp, ap))) - return NULL; - if (KIND != CCL_TOK_RP) - { - cclp->error_code = CCL_ERR_RP_EXPECTED; - ccl_rpn_delete(p); - return NULL; - } - ADVANCE; - return p; - } else { if (!(p = search_terms(cclp, ap))) @@ -691,8 +793,7 @@ static struct ccl_rpn_node *qualifier_relation(CCL_parser cclp, ccl_qualifier_t *ap) { char *attset; - struct ccl_rpn_node *p; - + if (qual_val_type(ap, CCL_BIB1_REL, CCL_BIB1_REL_ORDER, &attset) || qual_val_type(ap, CCL_BIB1_REL, CCL_BIB1_REL_PORDER, &attset)) return qualifiers_order(cclp, ap, attset); @@ -704,34 +805,17 @@ struct ccl_rpn_node *qualifier_relation(CCL_parser cclp, ccl_qualifier_t *ap) return NULL; } ADVANCE; - if (KIND == CCL_TOK_LP) - { - ADVANCE; - if (!(p = find_spec(cclp, ap))) - { - return NULL; - } - if (KIND != CCL_TOK_RP) - { - cclp->error_code = CCL_ERR_RP_EXPECTED; - ccl_rpn_delete(p); - return NULL; - } - ADVANCE; - } - else - p = search_terms(cclp, ap); - return p; + return search_terms(cclp, ap); } /** - * 
qualifier_list: Parse CCL qualifiers and search terms. + * qualifier_list: Parse CCL qualifiers and search terms. * cclp: CCL Parser * la: Token pointer to RELATION token. * qa: Qualifier attributes already applied. * return: pointer to node(s); NULL on error. */ -static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, +static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, struct ccl_token *la, ccl_qualifier_t *qa) { @@ -781,7 +865,7 @@ static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, { struct ccl_rpn_node *node_sub; cclp->look_token = la; - + node_sub = qualifier_relation(cclp, ap); if (!node_sub) { @@ -791,7 +875,7 @@ static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, } if (node) { - struct ccl_rpn_node *node_this = + struct ccl_rpn_node *node_this = ccl_rpn_node_create(CCL_RPN_OR); node_this->u.p[0] = node; node_this->u.p[1] = node_sub; @@ -844,17 +928,17 @@ static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, if (qa) { ccl_qualifier_t *qa0 = qa; - + while (*qa0) ap[i++] = *qa0++; } ap[i] = NULL; - + if (!found) break; - + cclp->look_token = lookahead; - + node_sub = qualifier_relation(cclp, ap); if (!node_sub) { @@ -863,7 +947,7 @@ static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, } if (node) { - struct ccl_rpn_node *node_this = + struct ccl_rpn_node *node_this = ccl_rpn_node_create(CCL_RPN_OR); node_this->u.p[0] = node; node_this->u.p[1] = node_sub; @@ -888,9 +972,10 @@ static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, static struct ccl_rpn_node *search_terms(CCL_parser cclp, ccl_qualifier_t *qa) { static int list[] = { - CCL_TOK_TERM, CCL_TOK_COMMA,CCL_TOK_EQ, CCL_TOK_REL, CCL_TOK_SET, -1}; + CCL_TOK_TERM, CCL_TOK_COMMA,CCL_TOK_EQ, + CCL_TOK_REL, CCL_TOK_SET, -1}; struct ccl_rpn_node *p1, *p2, *pn; - p1 = search_term_x(cclp, qa, list, 1); + p1 = search_terms2(cclp, qa); if (!p1) return NULL; while (1) @@ -908,7 +993,7 @@ static struct ccl_rpn_node *search_terms(CCL_parser cclp, ccl_qualifier_t *qa) 
p_prox->u.t.attr_list = 0; ADVANCE; - p2 = search_term_x(cclp, qa, list, 1); + p2 = search_terms2(cclp, qa); if (!p2) { ccl_rpn_delete(p1); @@ -922,7 +1007,7 @@ static struct ccl_rpn_node *search_terms(CCL_parser cclp, ccl_qualifier_t *qa) } else if (is_term_ok(KIND, list)) { - p2 = search_term_x(cclp, qa, list, 1); + p2 = search_terms2(cclp, qa); if (!p2) { ccl_rpn_delete(p1); @@ -951,22 +1036,7 @@ static struct ccl_rpn_node *search_elements(CCL_parser cclp, { struct ccl_rpn_node *p1; struct ccl_token *lookahead; - if (KIND == CCL_TOK_LP) - { - ADVANCE; - p1 = find_spec(cclp, qa); - if (!p1) - return NULL; - if (KIND != CCL_TOK_RP) - { - cclp->error_code = CCL_ERR_RP_EXPECTED; - ccl_rpn_delete(p1); - return NULL; - } - ADVANCE; - return p1; - } - else if (KIND == CCL_TOK_SET) + if (KIND == CCL_TOK_SET) { ADVANCE; if (KIND == CCL_TOK_EQ) @@ -992,7 +1062,7 @@ static struct ccl_rpn_node *search_elements(CCL_parser cclp, break; lookahead = lookahead->next; } - if (qa) + if (qa || lookahead->kind == CCL_TOK_LP) return search_terms(cclp, qa); else { @@ -1019,7 +1089,7 @@ static struct ccl_rpn_node *search_elements(CCL_parser cclp, } if (node) { - struct ccl_rpn_node *node_this = + struct ccl_rpn_node *node_this = ccl_rpn_node_create(CCL_RPN_OR); node_this->u.p[0] = node; node_this->u.p[1] = node_sub; @@ -1107,7 +1177,7 @@ struct ccl_rpn_node *ccl_parser_find_str(CCL_parser cclp, const char *str) return p; } -struct ccl_rpn_node *ccl_parser_find_token(CCL_parser cclp, +struct ccl_rpn_node *ccl_parser_find_token(CCL_parser cclp, struct ccl_token *list) { struct ccl_rpn_node *p;