From fdb80724c17104b31f80527cca19b433f6c2fc33 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Mon, 30 Apr 2007 19:55:39 +0000 Subject: [PATCH] Added stop word support for CCL parser. These are configured with @stop.field t1 t2 t3 .. All terms matching list of t1, t2, .. in field are removed from resulting RPN. --- client/default.bib | 19 +++++++- include/yaz/ccl.h | 24 ++-------- src/cclfind.c | 94 +++++++++++++++++++++--------------- src/cclp.h | 14 ++++-- src/cclqfile.c | 18 +++++-- src/cclqual.c | 134 ++++++++++++++++++++++++++++++++-------------------- src/ccltoken.c | 106 ++++++++++++++++++----------------------- 7 files changed, 228 insertions(+), 181 deletions(-) diff --git a/client/default.bib b/client/default.bib index 24f712e..67b8093 100644 --- a/client/default.bib +++ b/client/default.bib @@ -1,5 +1,5 @@ # CCL field mappings -# $Id: default.bib,v 1.10 2007-04-24 12:57:52 adam Exp $ +# $Id: default.bib,v 1.11 2007-04-30 19:55:39 adam Exp $ # # The rule below is used when no fields are specified term t=l,r s=al @@ -8,6 +8,7 @@ term t=l,r s=al clean t=l,r # # Rules for some BIB-1 fields + au u=1 s=pw ti u=4 s=pw isbn u=7 @@ -46,3 +47,19 @@ DateAdded exp1,1=9 DateChanged exp1,1=10 DateExpires exp1,1=11 ElementSetName exp1,1=12 + +# Define or operator +@or or +# Define and operator +@and and +# Define and not operator +@not not andnot + +# Whether CCL is case sensitive or not. +@case 1 + +# default stop words +# @stop.* the of a + +# stop words for ti +# @stop.ti art diff --git a/include/yaz/ccl.h b/include/yaz/ccl.h index 65d2ffd..043d09f 100644 --- a/include/yaz/ccl.h +++ b/include/yaz/ccl.h @@ -49,7 +49,7 @@ /* * CCL - header file * - * $Id: ccl.h,v 1.27 2007-04-30 11:33:49 adam Exp $ + * $Id: ccl.h,v 1.28 2007-04-30 19:55:39 adam Exp $ * * Old Europagate Log: * @@ -185,22 +185,6 @@ struct ccl_rpn_node *ccl_find_str(CCL_bibset bibset, YAZ_EXPORT struct ccl_rpn_node *ccl_parser_find_str(CCL_parser cclp, const char *str); -/** Set names for AND operator in parser */ -YAZ_EXPORT -void ccl_parser_set_op_and(CCL_parser p, const char *op); - -/** Set names for OR operator in parser */ -YAZ_EXPORT -void ccl_parser_set_op_or(CCL_parser p, const char *op); - -/** Set names for ANDNOT operator in parser */ -YAZ_EXPORT -void ccl_parser_set_op_not(CCL_parser p, const char *op); - -/** Set names for ResultSet in parser */ -YAZ_EXPORT -void ccl_parser_set_op_set(CCL_parser p, const char *op); - /** Set case sensitivity for parser */ YAZ_EXPORT void ccl_parser_set_case(CCL_parser p, int case_sensitivity_flag); @@ -228,11 +212,11 @@ void ccl_qual_add_set(CCL_bibset b, const char *name, int no, /** Add special qualifier */ YAZ_EXPORT -void ccl_qual_add_special(CCL_bibset bibset, const char *n, const char *v); +void ccl_qual_add_special(CCL_bibset bibset, const char *n, const char *cp); /** Add combo qualifier */ YAZ_EXPORT -void ccl_qual_add_combi(CCL_bibset b, const char *n, const char *names); +void ccl_qual_add_combi(CCL_bibset b, const char *n, const char **names); /** Read CCL qualifier list spec from file inf */ YAZ_EXPORT @@ -284,7 +268,7 @@ void ccl_parser_destroy(CCL_parser p); /** Search for special qualifier */ YAZ_EXPORT -const char *ccl_qual_search_special(CCL_bibset b, const char *name); +const char **ccl_qual_search_special(CCL_bibset b, const char *name); /** Pretty-print CCL RPN node tree to WRBUF */ YAZ_EXPORT void ccl_pquery(WRBUF w, struct ccl_rpn_node *p); diff --git a/src/cclfind.c b/src/cclfind.c index 2ebd11f..69f059c 100644 --- a/src/cclfind.c +++ b/src/cclfind.c @@ -56,7 +56,7 @@ /* CCL find (to rpn conversion) * Europagate, 1995 * - * $Id: cclfind.c,v 1.12 2007-04-30 11:33:49 adam Exp $ + * $Id: cclfind.c,v 1.13 2007-04-30 19:55:40 adam Exp $ * * Old Europagate log: * @@ -322,12 +322,17 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, int and_list = 0; int or_list = 0; char *attset; - const char *truncation_aliases; + const char **truncation_aliases; + const char *t_default[2]; truncation_aliases = ccl_qual_search_special(cclp->bibset, "truncation"); if (!truncation_aliases) - truncation_aliases = "?"; + { + truncation_aliases = t_default; + t_default[0] = "?"; + t_default[1] = 0; + } if (qual_val_type(qa, CCL_BIB1_STR, CCL_BIB1_STR_AND_LIST, 0)) and_list = 1; @@ -366,7 +371,7 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, for (i = 0; ilen; i++) if (lookahead->name[i] == ' ') no_spaces++; - else if (strchr(truncation_aliases, lookahead->name[i])) + else if (strchr(truncation_aliases[0], lookahead->name[i])) { if (no == 0 && i == 0 && lookahead->len >= 1) left_trunc = 1; @@ -382,32 +387,12 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, if (len == 0) break; /* no more terms . stop . */ - - - if (p_top) - { - if (or_list) - p = ccl_rpn_node_create(CCL_RPN_OR); - else if (and_list) - p = ccl_rpn_node_create(CCL_RPN_AND); - else - p = ccl_rpn_node_create(CCL_RPN_AND); - p->u.p[0] = p_top; - p_top = p; - } /* create the term node, but wait a moment before adding the term */ p = ccl_rpn_node_create(CCL_RPN_TERM); p->u.t.attr_list = NULL; p->u.t.term = NULL; - /* make the top node point to us.. */ - if (p_top) - p_top->u.p[1] = p; - else - p_top = p; - - /* go through all attributes and add them to the attribute list */ for (i=0; qa && qa[i]; i++) { @@ -477,7 +462,7 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, for (i = 0; ilook_token->name; - int src_len = cclp->look_token->len; + size_t src_len = cclp->look_token->len; if (i == 0 && left_trunc) { @@ -486,20 +471,50 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, } if (i == no-1 && right_trunc) src_len--; + if (!ccl_qual_match_stop(cclp->bibset, qa, src_str, src_len)) + { #if 0 - fprintf(stderr, "[%s %.*s]", - ccl_qual_get_name(qa[0]), src_len, src_str); + fprintf(stderr, "[%s %.*s]", + ccl_qual_get_name(qa[0]), src_len, src_str); #endif - if (i && cclp->look_token->ws_prefix_len) - { - size_t len = strlen(p->u.t.term); - memcpy(p->u.t.term + len, cclp->look_token->ws_prefix_buf, - cclp->look_token->ws_prefix_len); - p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0'; + if (p->u.t.term[0] && cclp->look_token->ws_prefix_len) + { + size_t len = strlen(p->u.t.term); + memcpy(p->u.t.term + len, cclp->look_token->ws_prefix_buf, + cclp->look_token->ws_prefix_len); + p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0'; + } + strxcat(p->u.t.term, src_str, src_len); } - strxcat(p->u.t.term, src_str, src_len); ADVANCE; } + + if (p->u.t.term[0] == 0) + { + ccl_rpn_delete(p); + continue; + } + + /* make the top node point to us.. */ + if (p_top) + { + struct ccl_rpn_node *tmp; + + if (or_list) + tmp = ccl_rpn_node_create(CCL_RPN_OR); + else if (and_list) + tmp = ccl_rpn_node_create(CCL_RPN_AND); + else + tmp = ccl_rpn_node_create(CCL_RPN_AND); + tmp->u.p[0] = p_top; + tmp->u.p[1] = p; + + p_top = tmp; + } + else + p_top = p; + + if (left_trunc && right_trunc) { if (!qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_BOTH, @@ -764,14 +779,15 @@ struct ccl_rpn_node *qualifier_relation(CCL_parser cclp, ccl_qualifier_t *ap) * qa: Qualifier attributes already applied. * return: pointer to node(s); NULL on error. */ -static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, struct ccl_token *la, - ccl_qualifier_t *qa) +static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, + struct ccl_token *la, + ccl_qualifier_t *qa) { struct ccl_token *lookahead = cclp->look_token; struct ccl_token *look_start = cclp->look_token; ccl_qualifier_t *ap; struct ccl_rpn_node *node = 0; - const char *field_str; + const char **field_str; int no = 0; int seq = 0; int i; @@ -795,9 +811,9 @@ static struct ccl_rpn_node *qualifier_list(CCL_parser cclp, struct ccl_token *la field_str = ccl_qual_search_special(cclp->bibset, "field"); if (field_str) { - if (!strcmp(field_str, "or")) + if (!strcmp(field_str[0], "or")) mode_merge = 0; - else if (!strcmp(field_str, "merge")) + else if (!strcmp(field_str[0], "merge")) mode_merge = 1; } if (!mode_merge) diff --git a/src/cclp.h b/src/cclp.h index 1f59728..a3b1e90 100644 --- a/src/cclp.h +++ b/src/cclp.h @@ -2,7 +2,7 @@ * Copyright (C) 1995-2005, Index Data ApS * See the file LICENSE for details. * - * $Id: cclp.h,v 1.3 2007-04-30 11:33:49 adam Exp $ + * $Id: cclp.h,v 1.4 2007-04-30 19:55:40 adam Exp $ */ /** @@ -53,13 +53,13 @@ struct ccl_parser { CCL_bibset bibset; /** names of and operator */ - char *ccl_token_and; + const char **ccl_token_and; /** names of or operator */ - char *ccl_token_or; + const char **ccl_token_or; /** names of not operator */ - char *ccl_token_not; + const char **ccl_token_not; /** names of set operator */ - char *ccl_token_set; + const char **ccl_token_set; /** 1=CCL parser is case sensitive, 0=case insensitive */ int ccl_case_sensitive; }; @@ -99,6 +99,10 @@ struct ccl_rpn_attr *ccl_qual_get_attr(ccl_qualifier_t q); YAZ_EXPORT const char *ccl_qual_get_name(ccl_qualifier_t q); +YAZ_EXPORT +int ccl_qual_match_stop(CCL_bibset bibset, ccl_qualifier_t *qa, + const char *src_str, size_t src_len); + /* * Local variables: * c-basic-offset: 4 diff --git a/src/cclqfile.c b/src/cclqfile.c index 5d56447..aa94785 100644 --- a/src/cclqfile.c +++ b/src/cclqfile.c @@ -48,7 +48,7 @@ /* CCL qualifiers * Europagate, 1995 * - * $Id: cclqfile.c,v 1.10 2007-04-27 10:09:45 adam Exp $ + * $Id: cclqfile.c,v 1.11 2007-04-30 19:55:40 adam Exp $ * * Old Europagate Log: * @@ -136,9 +136,21 @@ int ccl_qual_field2(CCL_bibset bibset, const char *cp, const char *qual_name, { /* lead is first of a list of qualifier aliaeses */ /* qualifier alias: q1 q2 ... */ - xfree(lead_str); + char *qlist[10]; + int i = 0; + + qlist[i++] = lead_str; + + while ((t=yaz_tok_move(tp)) == YAZ_TOK_STRING) + { + if (i < sizeof(qlist)/sizeof(*qlist)-1) + qlist[i++] = xstrdup(yaz_tok_parse_string(tp)); + } + qlist[i] = 0; yaz_tok_parse_destroy(tp); - ccl_qual_add_combi (bibset, qual_name, cp); + ccl_qual_add_combi (bibset, qual_name, (const char **) qlist); + for (i = 0; qlist[i]; i++) + xfree(qlist[i]); return 0; } while (1) /* comma separated attribute value list */ diff --git a/src/cclqual.c b/src/cclqual.c index 1c2dc1e..7843494 100644 --- a/src/cclqual.c +++ b/src/cclqual.c @@ -48,7 +48,7 @@ /* CCL qualifiers * Europagate, 1995 * - * $Id: cclqual.c,v 1.8 2007-04-30 11:33:49 adam Exp $ + * $Id: cclqual.c,v 1.9 2007-04-30 19:55:40 adam Exp $ * * Old Europagate Log: * @@ -85,7 +85,8 @@ #include #include #include - +#include +#include #include "cclp.h" /** CCL Qualifier */ @@ -108,7 +109,7 @@ struct ccl_qualifiers { /** CCL Qualifier special */ struct ccl_qualifier_special { char *name; - char *value; + const char **values; struct ccl_qualifier_special *next; }; @@ -123,67 +124,68 @@ static struct ccl_qualifier *ccl_qual_lookup(CCL_bibset b, return q; } -/** \brief specifies special qualifier - \param bibset Bibset - \param n name of special (without leading @) - \param v value of special -*/ -void ccl_qual_add_special(CCL_bibset bibset, const char *n, const char *v) +void ccl_qual_add_special_ar(CCL_bibset bibset, const char *n, + const char **values) { struct ccl_qualifier_special *p; - const char *pe; - for (p = bibset->special; p && strcmp(p->name, n); p = p->next) ; if (p) - xfree(p->value); + { + if (p->values) + { + int i; + for (i = 0; p->values[i]; i++) + xfree((char *) p->values[i]); + xfree(p->values); + } + } else { p = (struct ccl_qualifier_special *) xmalloc(sizeof(*p)); p->name = xstrdup(n); - p->value = 0; p->next = bibset->special; bibset->special = p; } - while (strchr(" \t", *v)) - ++v; - for (pe = v + strlen(v); pe != v; --pe) - if (!strchr(" \n\r\t", pe[-1])) - break; - p->value = (char*) xmalloc(pe - v + 1); - if (pe - v) - memcpy(p->value, v, pe - v); - p->value[pe - v] = '\0'; + p->values = values; } -static int next_token(const char **cpp, const char **dst) +void ccl_qual_add_special(CCL_bibset bibset, const char *n, const char *cp) { - int len = 0; - const char *cp = *cpp; - while (*cp && strchr(" \r\n\t\f", *cp)) - cp++; - if (dst) - *dst = cp; - len = 0; - while (*cp && !strchr(" \r\n\t\f", *cp)) + size_t no = 2; + char **vlist = xmalloc(no * sizeof(*vlist)); + yaz_tok_cfg_t yt = yaz_tok_cfg_create(); + int t; + int i = 0; + + yaz_tok_parse_t tp = yaz_tok_parse_buf(yt, cp); + + yaz_tok_cfg_destroy(yt); + + t = yaz_tok_move(tp); + while (t == YAZ_TOK_STRING) { - cp++; - len++; + if (i >= no-1) + vlist = xrealloc(vlist, (no = no * 2) * sizeof(*vlist)); + vlist[i++] = xstrdup(yaz_tok_parse_string(tp)); + t = yaz_tok_move(tp); } - *cpp = cp; - return len; + vlist[i] = 0; + ccl_qual_add_special_ar(bibset, n, (const char **) vlist); + + yaz_tok_parse_destroy(tp); } + /** \brief adds specifies qualifier aliases \param b bibset \param n qualifier name \param names list of qualifier aliases */ -void ccl_qual_add_combi(CCL_bibset b, const char *n, const char *names) +void ccl_qual_add_combi(CCL_bibset b, const char *n, const char **names) { - const char *cp, *cp1; - int i, len; + int i; struct ccl_qualifier *q; for (q = b->list; q && strcmp(q->name, n); q = q->next) ; @@ -195,17 +197,13 @@ void ccl_qual_add_combi(CCL_bibset b, const char *n, const char *names) q->next = b->list; b->list = q; - cp = names; - for (i = 0; next_token(&cp, 0); i++) + for (i = 0; names[i]; i++) ; q->no_sub = i; - q->sub = (struct ccl_qualifier **) xmalloc(sizeof(*q->sub) * - (1+q->no_sub)); - cp = names; - for (i = 0; (len = next_token(&cp, &cp1)); i++) - { - q->sub[i] = ccl_qual_lookup(b, cp1, len); - } + q->sub = (struct ccl_qualifier **) + xmalloc(sizeof(*q->sub) * (1+q->no_sub)); + for (i = 0; names[i]; i++) + q->sub[i] = ccl_qual_lookup(b, names[i], strlen(names[i])); } /** \brief adds specifies attributes for qualifier @@ -320,7 +318,13 @@ void ccl_qual_rm(CCL_bibset *b) { sp1 = sp->next; xfree(sp->name); - xfree(sp->value); + if (sp->values) + { + int i; + for (i = 0; sp->values[i]; i++) + xfree((char*) sp->values[i]); + xfree(sp->values); + } xfree(sp); } xfree(*b); @@ -331,7 +335,7 @@ ccl_qualifier_t ccl_qual_search(CCL_parser cclp, const char *name, size_t name_len, int seq) { struct ccl_qualifier *q = 0; - const char *aliases; + const char **aliases; int case_sensitive = cclp->ccl_case_sensitive; ccl_assert(cclp); @@ -340,7 +344,7 @@ ccl_qualifier_t ccl_qual_search(CCL_parser cclp, const char *name, aliases = ccl_qual_search_special(cclp->bibset, "case"); if (aliases) - case_sensitive = atoi(aliases); + case_sensitive = atoi(aliases[0]); for (q = cclp->bibset->list; q; q = q->next) if (strlen(q->name) == name_len) @@ -381,7 +385,7 @@ const char *ccl_qual_get_name(ccl_qualifier_t q) return q->name; } -const char *ccl_qual_search_special(CCL_bibset b, const char *name) +const char **ccl_qual_search_special(CCL_bibset b, const char *name) { struct ccl_qualifier_special *q; if (!b) @@ -389,9 +393,35 @@ const char *ccl_qual_search_special(CCL_bibset b, const char *name) for (q = b->special; q && strcmp(q->name, name); q = q->next) ; if (q) - return q->value; + return q->values; return 0; } + +int ccl_qual_match_stop(CCL_bibset bibset, ccl_qualifier_t *qa, + const char *src_str, size_t src_len) +{ + if (qa[0]) + { + char qname[80]; + const char **slist; + yaz_snprintf(qname, sizeof(qname)-1, "stop.%s", + ccl_qual_get_name(qa[0])); + slist = ccl_qual_search_special(bibset, qname); + if (!slist) + slist = ccl_qual_search_special(bibset, "stop.*"); + if (slist) + { + int i; + for (i = 0; slist[i]; i++) + if (src_len == strlen(slist[i]) + && ccl_memicmp(slist[i], src_str, src_len) == 0) + return 1; + } + } + return 0; +} + + /* * Local variables: * c-basic-offset: 4 diff --git a/src/ccltoken.c b/src/ccltoken.c index e5742fe..e9c1f34 100644 --- a/src/ccltoken.c +++ b/src/ccltoken.c @@ -48,7 +48,7 @@ /* CCL - lexical analysis * Europagate, 1995 * - * $Id: ccltoken.c,v 1.11 2007-04-26 09:11:56 adam Exp $ + * $Id: ccltoken.c,v 1.12 2007-04-30 19:55:40 adam Exp $ * * Old Europagate Log: * @@ -100,40 +100,33 @@ * return: 1 if token string matches one of the keywords in list; * 0 otherwise. */ -static int token_cmp(CCL_parser cclp, const char *kw, struct ccl_token *token) +static int token_cmp(CCL_parser cclp, const char **kw, struct ccl_token *token) { - const char *cp1 = kw; - const char *cp2; - const char *aliases; + const char **aliases; int case_sensitive = cclp->ccl_case_sensitive; + int i; aliases = ccl_qual_search_special(cclp->bibset, "case"); if (aliases) - case_sensitive = atoi(aliases); - if (!kw) - return 0; - while ((cp2 = strchr(cp1, ' '))) + case_sensitive = atoi(aliases[0]); + + for (i = 0; kw[i]; i++) { - if (token->len == (size_t) (cp2-cp1)) + if (token->len == strlen(kw[i])) { if (case_sensitive) { - if (!memcmp(cp1, token->name, token->len)) + if (!memcmp(kw[i], token->name, token->len)) return 1; } else { - if (!ccl_memicmp(cp1, token->name, token->len)) + if (!ccl_memicmp(kw[i], token->name, token->len)) return 1; } } - cp1 = cp2+1; } - if (case_sensitive) - return token->len == strlen(cp1) - && !memcmp(cp1, token->name, token->len); - return token->len == strlen(cp1) && - !ccl_memicmp(cp1, token->name, token->len); + return 0; } /* @@ -142,7 +135,7 @@ static int token_cmp(CCL_parser cclp, const char *kw, struct ccl_token *token) */ struct ccl_token *ccl_parser_tokenize(CCL_parser cclp, const char *command) { - const char *aliases; + const char **aliases; const unsigned char *cp = (const unsigned char *) command; struct ccl_token *first = NULL; struct ccl_token *last = NULL; @@ -293,6 +286,31 @@ void ccl_token_del(struct ccl_token *list) } } +static const char **create_ar(const char *v1, const char *v2) +{ + const char **a = xmalloc(3 * sizeof(*a)); + a[0] = xstrdup(v1); + if (v2) + { + a[1] = xstrdup(v2); + a[2] = 0; + } + else + a[1] = 0; + return a; +} + +static void destroy_ar(const char **a) +{ + if (a) + { + int i; + for (i = 0; a[i]; i++) + xfree((char *) a[i]); + xfree(a); + } +} + CCL_parser ccl_parser_create(CCL_bibset bibset) { CCL_parser p = (CCL_parser)xmalloc(sizeof(*p)); @@ -303,10 +321,10 @@ CCL_parser ccl_parser_create(CCL_bibset bibset) p->error_pos = NULL; p->bibset = bibset; - p->ccl_token_and = xstrdup("and"); - p->ccl_token_or = xstrdup("or"); - p->ccl_token_not = xstrdup("not andnot"); - p->ccl_token_set = xstrdup("set"); + p->ccl_token_and = create_ar("and", 0); + p->ccl_token_or = create_ar("or", 0); + p->ccl_token_not = create_ar("not", "andnot"); + p->ccl_token_set = create_ar("set", 0); p->ccl_case_sensitive = 1; return p; @@ -316,47 +334,13 @@ void ccl_parser_destroy(CCL_parser p) { if (!p) return; - xfree(p->ccl_token_and); - xfree(p->ccl_token_or); - xfree(p->ccl_token_not); - xfree(p->ccl_token_set); + destroy_ar(p->ccl_token_and); + destroy_ar(p->ccl_token_or); + destroy_ar(p->ccl_token_not); + destroy_ar(p->ccl_token_set); xfree(p); } -void ccl_parser_set_op_and(CCL_parser p, const char *op) -{ - if (p && op) - { - xfree(p->ccl_token_and); - p->ccl_token_and = xstrdup(op); - } -} - -void ccl_parser_set_op_or(CCL_parser p, const char *op) -{ - if (p && op) - { - xfree(p->ccl_token_or); - p->ccl_token_or = xstrdup(op); - } -} -void ccl_parser_set_op_not(CCL_parser p, const char *op) -{ - if (p && op) - { - xfree(p->ccl_token_not); - p->ccl_token_not = xstrdup(op); - } -} -void ccl_parser_set_op_set(CCL_parser p, const char *op) -{ - if (p && op) - { - xfree(p->ccl_token_set); - p->ccl_token_set = xstrdup(op); - } -} - void ccl_parser_set_case(CCL_parser p, int case_sensitivity_flag) { if (p) -- 1.7.10.4