From 73f6b7a6b12a1e0e54c9cc360f552a56726d75fd Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 1 May 2007 12:22:10 +0000 Subject: [PATCH] Moved stop word support code to separate ccl_stop_words.c and encapsulated the private info in opaque ptr ccl_stop_words_t. --- include/yaz/ccl.h | 28 +++++- src/Makefile.am | 4 +- src/ccl_stop_words.c | 241 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/cclfind.c | 36 ++++---- src/cclp.h | 6 +- src/cclqual.c | 39 ++++---- util/cclsh.c | 68 +++++++++----- 7 files changed, 350 insertions(+), 72 deletions(-) create mode 100644 src/ccl_stop_words.c diff --git a/include/yaz/ccl.h b/include/yaz/ccl.h index 043d09f..e6c5af2 100644 --- a/include/yaz/ccl.h +++ b/include/yaz/ccl.h @@ -49,7 +49,7 @@ /* * CCL - header file * - * $Id: ccl.h,v 1.28 2007-04-30 19:55:39 adam Exp $ + * $Id: ccl.h,v 1.29 2007-05-01 12:22:10 adam Exp $ * * Old Europagate Log: * @@ -150,6 +150,7 @@ struct ccl_rpn_node { /** \brief Attributes + Term */ struct { char *term; + char *qual; struct ccl_rpn_attr *attr_list; } t; /** Result set */ @@ -287,6 +288,31 @@ YAZ_EXPORT void ccl_add_attr_string(struct ccl_rpn_node *p, const char *set, int type, char *value); +YAZ_EXPORT +int ccl_search_stop(CCL_bibset bibset, const char *qname, + const char *src_str, size_t src_len); + + +/** \brief stop words handle (pimpl) */ +typedef struct ccl_stop_words *ccl_stop_words_t; + +/** \brief creates stop words handle */ +YAZ_EXPORT +ccl_stop_words_t ccl_stop_words_create(void); + +/** \brief destroys stop words handle */ +YAZ_EXPORT +void ccl_stop_words_destroy(ccl_stop_words_t csw); + +/** \brief removes stop words from RPN tree */ +YAZ_EXPORT +int ccl_stop_words_tree(ccl_stop_words_t csw, + CCL_bibset bibset, struct ccl_rpn_node **t); + +/** \brief returns information about removed "stop" words */ +YAZ_EXPORT +int ccl_stop_words_info(ccl_stop_words_t csw, int idx, + const char **qualname, const char **term); #ifndef ccl_assert #define ccl_assert(x) ; diff --git a/src/Makefile.am b/src/Makefile.am index 2857942..576559a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,6 +1,6 @@ ## This file is part of the YAZ toolkit. ## Copyright (C) 1995-2007, Index Data, All rights reserved. -## $Id: Makefile.am,v 1.66 2007-04-30 08:29:07 adam Exp $ +## $Id: Makefile.am,v 1.67 2007-05-01 12:22:11 adam Exp $ YAZ_VERSION_INFO=3:0:0 @@ -86,7 +86,7 @@ libyaz_la_SOURCES=version.c options.c log.c \ zoom-c.c zoom-socket.c zoom-opt.c zoom-p.h \ grs1disp.c zgdu.c soap.c srw.c srwutil.c \ opacdisp.c cclfind.c ccltoken.c cclerrms.c cclqual.c cclptree.c cclp.h \ - cclqfile.c cclstr.c cclxmlconfig.c \ + cclqfile.c cclstr.c cclxmlconfig.c ccl_stop_words.c \ cql.y cqlstdio.c cqltransform.c cqlutil.c xcqlutil.c cqlstring.c \ cqlstrer.c querytowrbuf.c \ tcpdchk.c \ diff --git a/src/ccl_stop_words.c b/src/ccl_stop_words.c new file mode 100644 index 0000000..6a36e92 --- /dev/null +++ b/src/ccl_stop_words.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 1995, the EUROPAGATE consortium (see below). + * + * The EUROPAGATE consortium members are: + * + * University College Dublin + * Danmarks Teknologiske Videnscenter + * An Chomhairle Leabharlanna + * Consejo Superior de Investigaciones Cientificas + * + * Permission to use, copy, modify, distribute, and sell this software and + * its documentation, in whole or in part, for any purpose, is hereby granted, + * provided that: + * + * 1. This copyright and permission notice appear in all copies of the + * software and its documentation. Notices of copyright or attribution + * which appear at the beginning of any file must remain unchanged. + * + * 2. The names of EUROPAGATE or the project partners may not be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * 3. Users of this software (implementors and gateway operators) agree to + * inform the EUROPAGATE consortium of their use of the software. This + * information will be used to evaluate the EUROPAGATE project and the + * software, and to plan further developments. The consortium may use + * the information in later publications. + * + * 4. Users of this software agree to make their best efforts, when + * documenting their use of the software, to acknowledge the EUROPAGATE + * consortium, and the role played by the software in their work. + * + * THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND, + * EXPRESS, IMPLIED, OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL THE EUROPAGATE CONSORTIUM OR ITS MEMBERS BE LIABLE + * FOR ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF + * ANY KIND, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA + * OR PROFITS, WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND + * ON ANY THEORY OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE + * USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +/** + * \file ccl_stop_words.c + * \brief Removes stop words from terms in RPN tree + */ + +#include +#include +#include +#include +#include + +struct ccl_stop_info { + char *qualname; + char *term; + struct ccl_stop_info *next; +}; + +struct ccl_stop_words { + char *blank_chars; + NMEM nmem; /* memory for removed items */ + struct ccl_stop_info *removed_items; +}; + +static void append_removed_item(ccl_stop_words_t csw, + const char *qname, + const char *t, size_t len) +{ + struct ccl_stop_info *csi = nmem_malloc(csw->nmem, sizeof(*csi)); + struct ccl_stop_info **csip = &csw->removed_items; + if (qname) + csi->qualname = nmem_strdup(csw->nmem, qname); + else + csi->qualname = 0; + + csi->term = nmem_malloc(csw->nmem, len+1); + memcpy(csi->term, t, len); + csi->term[len] = '\0'; + csi->next = 0; + + while (*csip) + csip = &(*csip)->next; + + *csip = csi; +} + +ccl_stop_words_t ccl_stop_words_create(void) +{ + NMEM nmem = nmem_create(); + ccl_stop_words_t csw = xmalloc(sizeof(*csw)); + csw->nmem = nmem; + csw->removed_items = 0; + csw->blank_chars = xstrdup(" \r\n\t"); + return csw; +} + +void ccl_stop_words_destroy(ccl_stop_words_t csw) +{ + if (csw) + { + nmem_destroy(csw->nmem); + xfree(csw->blank_chars); + xfree(csw); + } +} + +struct ccl_rpn_node *ccl_remove_stop_r(ccl_stop_words_t csw, + CCL_bibset bibset, + struct ccl_rpn_node *p) +{ + struct ccl_rpn_node *left, *right; + switch (p->kind) + { + case CCL_RPN_AND: + case CCL_RPN_OR: + case CCL_RPN_NOT: + case CCL_RPN_PROX: + left = ccl_remove_stop_r(csw, bibset, p->u.p[0]); + right = ccl_remove_stop_r(csw, bibset, p->u.p[1]); + if (!left || !right) + { + /* we must delete our binary node and return child (if any) */ + p->u.p[0] = 0; + p->u.p[1] = 0; + ccl_rpn_delete(p); + if (left) + return left; + else + return right; + } + break; + case CCL_RPN_SET: + break; + case CCL_RPN_TERM: + if (p->u.t.term) + { + int found = 1; + while (found) + { + char *cp = p->u.t.term; + found = 0; + while (1) + { + while (*cp && strchr(csw->blank_chars, *cp)) + cp++; + if (!*cp) + break; + else + { + char *cp0 = cp; + while (*cp && !strchr(csw->blank_chars, *cp)) + cp++; + if (cp != cp0) + { + size_t len = cp - cp0; + if (ccl_search_stop(bibset, p->u.t.qual, + cp0, len)) + { + append_removed_item(csw, p->u.t.qual, + cp0, len); + while (*cp && strchr(csw->blank_chars, *cp)) + cp++; + memmove(cp0, cp, strlen(cp)+1); + found = 1; + break; + } + } + } + } + } + } + /* chop right blanks .. and see if term it gets empty */ + if (p->u.t.term && csw->removed_items) + { + char *cp = p->u.t.term + strlen(p->u.t.term); + while (1) + { + if (cp == p->u.t.term) + { + /* term is empty / blank */ + ccl_rpn_delete(p); + return 0; + } + if (!strchr(csw->blank_chars, cp[-1])) + break; + /* chop right */ + cp[-1] = 0; + --cp; + } + } + break; + } + return p; +} + +int ccl_stop_words_tree(ccl_stop_words_t csw, + CCL_bibset bibset, struct ccl_rpn_node **t) +{ + struct ccl_rpn_node *r; + + /* remove list items */ + nmem_reset(csw->nmem); + csw->removed_items = 0; + + r = ccl_remove_stop_r(csw, bibset, *t); + *t = r; + if (csw->removed_items) + return 1; + return 0; +} + +int ccl_stop_words_info(ccl_stop_words_t csw, int idx, + const char **qualname, const char **term) +{ + struct ccl_stop_info *csi = csw->removed_items; + int i = 0; + while (csi && i < idx) + { + csi = csi->next; + i++; + } + if (csi) + { + *qualname = csi->qualname; + *term = csi->term; + return 1; + } + return 0; +} + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/src/cclfind.c b/src/cclfind.c index 69f059c..17d0470 100644 --- a/src/cclfind.c +++ b/src/cclfind.c @@ -56,7 +56,7 @@ /* CCL find (to rpn conversion) * Europagate, 1995 * - * $Id: cclfind.c,v 1.13 2007-04-30 19:55:40 adam Exp $ + * $Id: cclfind.c,v 1.14 2007-05-01 12:22:11 adam Exp $ * * Old Europagate log: * @@ -199,6 +199,7 @@ struct ccl_rpn_node *ccl_rpn_node_create(enum ccl_rpn_kind kind) case CCL_RPN_TERM: p->u.t.attr_list = 0; p->u.t.term = 0; + p->u.t.qual = 0; break; default: break; @@ -225,6 +226,7 @@ void ccl_rpn_delete(struct ccl_rpn_node *rpn) break; case CCL_RPN_TERM: xfree(rpn->u.t.term); + xfree(rpn->u.t.qual); for (attr = rpn->u.t.attr_list; attr; attr = attr1) { attr1 = attr->next; @@ -392,6 +394,12 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, p = ccl_rpn_node_create(CCL_RPN_TERM); p->u.t.attr_list = NULL; p->u.t.term = NULL; + if (qa && qa[0]) + { + const char *n = ccl_qual_get_name(qa[0]); + if (n) + p->u.t.qual = xstrdup(n); + } /* go through all attributes and add them to the attribute list */ for (i=0; qa && qa[i]; i++) @@ -471,30 +479,17 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, } if (i == no-1 && right_trunc) src_len--; - if (!ccl_qual_match_stop(cclp->bibset, qa, src_str, src_len)) + if (p->u.t.term[0] && cclp->look_token->ws_prefix_len) { -#if 0 - fprintf(stderr, "[%s %.*s]", - ccl_qual_get_name(qa[0]), src_len, src_str); -#endif - if (p->u.t.term[0] && cclp->look_token->ws_prefix_len) - { - size_t len = strlen(p->u.t.term); - memcpy(p->u.t.term + len, cclp->look_token->ws_prefix_buf, - cclp->look_token->ws_prefix_len); - p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0'; - } - strxcat(p->u.t.term, src_str, src_len); + size_t len = strlen(p->u.t.term); + memcpy(p->u.t.term + len, cclp->look_token->ws_prefix_buf, + cclp->look_token->ws_prefix_len); + p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0'; } + strxcat(p->u.t.term, src_str, src_len); ADVANCE; } - if (p->u.t.term[0] == 0) - { - ccl_rpn_delete(p); - continue; - } - /* make the top node point to us.. */ if (p_top) { @@ -1204,6 +1199,7 @@ struct ccl_rpn_node *ccl_find_str(CCL_bibset bibset, const char *str, ccl_token_del(list); return p; } + /* * Local variables: * c-basic-offset: 4 diff --git a/src/cclp.h b/src/cclp.h index a3b1e90..24930f3 100644 --- a/src/cclp.h +++ b/src/cclp.h @@ -2,7 +2,7 @@ * Copyright (C) 1995-2005, Index Data ApS * See the file LICENSE for details. * - * $Id: cclp.h,v 1.4 2007-04-30 19:55:40 adam Exp $ + * $Id: cclp.h,v 1.5 2007-05-01 12:22:11 adam Exp $ */ /** @@ -99,10 +99,6 @@ struct ccl_rpn_attr *ccl_qual_get_attr(ccl_qualifier_t q); YAZ_EXPORT const char *ccl_qual_get_name(ccl_qualifier_t q); -YAZ_EXPORT -int ccl_qual_match_stop(CCL_bibset bibset, ccl_qualifier_t *qa, - const char *src_str, size_t src_len); - /* * Local variables: * c-basic-offset: 4 diff --git a/src/cclqual.c b/src/cclqual.c index 7843494..67ef7fa 100644 --- a/src/cclqual.c +++ b/src/cclqual.c @@ -48,7 +48,7 @@ /* CCL qualifiers * Europagate, 1995 * - * $Id: cclqual.c,v 1.9 2007-04-30 19:55:40 adam Exp $ + * $Id: cclqual.c,v 1.10 2007-05-01 12:22:11 adam Exp $ * * Old Europagate Log: * @@ -397,31 +397,30 @@ const char **ccl_qual_search_special(CCL_bibset b, const char *name) return 0; } -int ccl_qual_match_stop(CCL_bibset bibset, ccl_qualifier_t *qa, - const char *src_str, size_t src_len) +int ccl_search_stop(CCL_bibset bibset, const char *qname, + const char *src_str, size_t src_len) { - if (qa[0]) + const char **slist = 0; + if (qname) { - char qname[80]; - const char **slist; - yaz_snprintf(qname, sizeof(qname)-1, "stop.%s", - ccl_qual_get_name(qa[0])); - slist = ccl_qual_search_special(bibset, qname); - if (!slist) - slist = ccl_qual_search_special(bibset, "stop.*"); - if (slist) - { - int i; - for (i = 0; slist[i]; i++) - if (src_len == strlen(slist[i]) - && ccl_memicmp(slist[i], src_str, src_len) == 0) - return 1; - } + char qname_buf[80]; + yaz_snprintf(qname_buf, sizeof(qname_buf)-1, "stop.%s", + qname); + slist = ccl_qual_search_special(bibset, qname_buf); + } + if (!slist) + slist = ccl_qual_search_special(bibset, "stop.*"); + if (slist) + { + int i; + for (i = 0; slist[i]; i++) + if (src_len == strlen(slist[i]) + && ccl_memicmp(slist[i], src_str, src_len) == 0) + return 1; } return 0; } - /* * Local variables: * c-basic-offset: 4 diff --git a/util/cclsh.c b/util/cclsh.c index b1e06e1..b9aafc0 100644 --- a/util/cclsh.c +++ b/util/cclsh.c @@ -44,7 +44,7 @@ /* CCL shell. * Europagate 1995 * - * $Id: cclsh.c,v 1.7 2007-04-30 19:50:22 adam Exp $ + * $Id: cclsh.c,v 1.8 2007-05-01 12:22:11 adam Exp $ * * Old Europagate Log: * @@ -104,11 +104,11 @@ static char *prog; void usage(const char *prog) { - fprintf (stderr, "%s: [-d] [-b configfile] [-x xmlconfig]\n", prog); - exit (1); + fprintf(stderr, "%s: [-d] [-b configfile] [-x xmlconfig]\n", prog); + exit(1); } -int main (int argc, char **argv) +int main(int argc, char **argv) { CCL_bibset bibset; FILE *bib_inf; @@ -122,7 +122,7 @@ int main (int argc, char **argv) WRBUF q_wrbuf = 0; prog = *argv; - bibset = ccl_qual_mk (); + bibset = ccl_qual_mk(); while ((ret = options("db:x:", argv, argc, &arg)) != -2) { @@ -133,15 +133,15 @@ int main (int argc, char **argv) break; case 'b': bib_fname = arg; - bib_inf = fopen (bib_fname, "r"); + bib_inf = fopen(bib_fname, "r"); if (!bib_inf) { - fprintf (stderr, "%s: cannot open %s\n", prog, + fprintf(stderr, "%s: cannot open %s\n", prog, bib_fname); - exit (1); + exit(1); } - ccl_qual_file (bibset, bib_inf); - fclose (bib_inf); + ccl_qual_file(bibset, bib_inf); + fclose(bib_inf); break; #if YAZ_HAVE_XML2 case 'x': @@ -176,23 +176,23 @@ int main (int argc, char **argv) int error; struct ccl_rpn_node *rpn; - rpn = ccl_parser_find_str (cclp, wrbuf_cstr(q_wrbuf)); + rpn = ccl_parser_find_str(cclp, wrbuf_cstr(q_wrbuf)); error = ccl_parser_get_error(cclp, 0); if (error) { - printf ("%s\n", ccl_err_msg (error)); + printf("%s\n", ccl_err_msg(error)); } else { if (rpn) { - ccl_pr_tree (rpn, stdout); - printf ("\n"); + ccl_pr_tree(rpn, stdout); + printf("\n"); } } - ccl_parser_destroy (cclp); + ccl_parser_destroy(cclp); if (rpn) ccl_rpn_delete(rpn); wrbuf_destroy(q_wrbuf); @@ -218,10 +218,10 @@ int main (int argc, char **argv) break; } strcpy(buf,line_in); - free (line_in); + free(line_in); #else - printf ("CCLSH>"); fflush (stdout); - if (!fgets (buf, 999, stdin)) + printf("CCLSH>"); fflush(stdout); + if (!fgets(buf, 999, stdin)) break; #endif @@ -236,23 +236,43 @@ int main (int argc, char **argv) if (error) { - printf ("%*s^ - ", 6+pos, " "); - printf ("%s\n", ccl_err_msg (error)); + printf("%*s^ - ", 6+pos, " "); + printf("%s\n", ccl_err_msg(error)); } else { if (rpn && i == 0) { - ccl_pr_tree (rpn, stdout); - printf ("\n"); + ccl_stop_words_t csw = ccl_stop_words_create(); + int idx = 0; + printf("First:\n"); + ccl_pr_tree(rpn, stdout); + if (ccl_stop_words_tree(csw, bibset, &rpn)) + { + printf("Second:\n"); + ccl_pr_tree(rpn, stdout); + printf("\n"); + + for (idx = 0; ; idx++) + { + const char *qname; + const char *term; + if (!ccl_stop_words_info(csw, idx, + &qname, &term)) + break; + printf("Removed from %s: %s\n", + qname ? qname : "none", term); + } + } + ccl_stop_words_destroy(csw); } } - ccl_parser_destroy (cclp); + ccl_parser_destroy(cclp); if (rpn) ccl_rpn_delete(rpn); } } - printf ("\n"); + printf("\n"); ccl_qual_rm(&bibset); return 0; } -- 1.7.10.4