From 65efc4dd0a947e1b4620d93c88d771b83bc32dac Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Sat, 28 Dec 2002 12:13:03 +0000 Subject: [PATCH] CCL proximity support --- CHANGELOG | 5 ++ ccl/cclfind.c | 18 ++++- ccl/cclptree.c | 60 ++++++++++----- ccl/ccltoken.c | 4 +- include/yaz/ccl.h | 4 +- zutil/logrpn.c | 94 +++++++++++++---------- zutil/yaz-ccl.c | 217 ++++++++++++----------------------------------------- 7 files changed, 169 insertions(+), 233 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 37b5e53..917ac97 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,11 @@ Possible compatibility problems with earlier versions marked with '*'. --- 1.9.3 2002/MM/DD +CCL proximity operators !n, %n convert to PQF @prox 0 n 1 2 k 2 +and @prox 0 n 0 2 k 2 respectively, meaning: exclusion=false, +distance=n, order=true/false, relation=le, prox unit=word. If +n is omitted, distance 1 is used. + For TCP/IP COMSTACK, set recv buffer to an appropriate "large" value on Solaris. Patch from Ko van der Sloot. diff --git a/ccl/cclfind.c b/ccl/cclfind.c index 388e7ba..c83d90e 100644 --- a/ccl/cclfind.c +++ b/ccl/cclfind.c @@ -44,7 +44,7 @@ /* CCL find (to rpn conversion) * Europagate, 1995 * - * $Id: cclfind.c,v 1.31 2002-06-06 12:54:24 adam Exp $ + * $Id: cclfind.c,v 1.32 2002-12-28 12:13:03 adam Exp $ * * Old Europagate log: * @@ -797,6 +797,16 @@ static struct ccl_rpn_node *search_terms (CCL_parser cclp, { if (KIND == CCL_TOK_PROX) { + struct ccl_rpn_node *p_prox = 0; + /* ! 
word order specified */ + /* % word order not specified */ + p_prox = mk_node(CCL_RPN_TERM); + p_prox->u.t.term = xmalloc(cclp->look_token->len + 1); /* one extra byte for the terminating NUL stored below */ + memcpy(p_prox->u.t.term, cclp->look_token->name, + cclp->look_token->len); + p_prox->u.t.term[cclp->look_token->len] = 0; + p_prox->u.t.attr_list = 0; + ADVANCE; p2 = search_term_x (cclp, qa, list, 1); if (!p2) @@ -807,6 +817,7 @@ static struct ccl_rpn_node *search_terms (CCL_parser cclp, pn = mk_node (CCL_RPN_PROX); pn->u.p[0] = p1; pn->u.p[1] = p2; + pn->u.p[2] = p_prox; p1 = pn; } else if (is_term_ok(KIND, list)) @@ -820,6 +831,7 @@ static struct ccl_rpn_node *search_terms (CCL_parser cclp, pn = mk_node (CCL_RPN_PROX); pn->u.p[0] = p1; pn->u.p[1] = p2; + pn->u.p[2] = 0; p1 = pn; } else @@ -910,6 +922,7 @@ static struct ccl_rpn_node *search_elements (CCL_parser cclp, struct ccl_rpn_node *node_this = mk_node(CCL_RPN_OR); node_this->u.p[0] = node; node_this->u.p[1] = node_sub; + node_this->u.p[2] = 0; node = node_this; } else @@ -948,6 +961,7 @@ static struct ccl_rpn_node *find_spec (CCL_parser cclp, pn = mk_node (CCL_RPN_AND); pn->u.p[0] = p1; pn->u.p[1] = p2; + pn->u.p[2] = 0; p1 = pn; continue; case CCL_TOK_OR: @@ -961,6 +975,7 @@ static struct ccl_rpn_node *find_spec (CCL_parser cclp, pn = mk_node (CCL_RPN_OR); pn->u.p[0] = p1; pn->u.p[1] = p2; + pn->u.p[2] = 0; p1 = pn; continue; case CCL_TOK_NOT: @@ -974,6 +989,7 @@ static struct ccl_rpn_node *find_spec (CCL_parser cclp, pn = mk_node (CCL_RPN_NOT); pn->u.p[0] = p1; pn->u.p[1] = p2; + pn->u.p[2] = 0; p1 = pn; continue; } diff --git a/ccl/cclptree.c b/ccl/cclptree.c index a1f00d5..0710fdd 100644 --- a/ccl/cclptree.c +++ b/ccl/cclptree.c @@ -44,7 +44,7 @@ /* CCL print rpn tree - infix notation * Europagate, 1995 * - * $Id: cclptree.c,v 1.10 2002-07-12 12:11:33 ja7 Exp $ + * $Id: cclptree.c,v 1.11 2002-12-28 12:13:03 adam Exp $ * * Old Europagate Log: * @@ -98,30 +98,50 @@ void ccl_pr_tree_as_qrpn(struct ccl_rpn_node *rpn, FILE *fd_out, int indent) fprintf (fd_out, 
"@attr %d=%d ", attr->type, attr->value); } fprintf (fd_out, "\"%s\"\n", rpn->u.t.term); - break; + break; case CCL_RPN_AND: - fprintf (fd_out, "@and \n"); - ccl_pr_tree_as_qrpn (rpn->u.p[0], fd_out,indent+2); - ccl_pr_tree_as_qrpn (rpn->u.p[1], fd_out,indent+2); - break; + fprintf (fd_out, "@and \n"); + ccl_pr_tree_as_qrpn (rpn->u.p[0], fd_out,indent+2); + ccl_pr_tree_as_qrpn (rpn->u.p[1], fd_out,indent+2); + break; case CCL_RPN_OR: - fprintf (fd_out, "@or \n"); - ccl_pr_tree_as_qrpn (rpn->u.p[0], fd_out,indent+2); - ccl_pr_tree_as_qrpn (rpn->u.p[1], fd_out,indent+2); - break; + fprintf (fd_out, "@or \n"); + ccl_pr_tree_as_qrpn (rpn->u.p[0], fd_out,indent+2); + ccl_pr_tree_as_qrpn (rpn->u.p[1], fd_out,indent+2); + break; case CCL_RPN_NOT: - fprintf (fd_out, "@not "); - ccl_pr_tree_as_qrpn (rpn->u.p[0], fd_out,indent+2); - ccl_pr_tree_as_qrpn (rpn->u.p[1], fd_out,indent+2); - break; + fprintf (fd_out, "@not "); + ccl_pr_tree_as_qrpn (rpn->u.p[0], fd_out,indent+2); + ccl_pr_tree_as_qrpn (rpn->u.p[1], fd_out,indent+2); + break; case CCL_RPN_SET: - fprintf (fd_out, "set=%s ", rpn->u.setname); - break; + fprintf (fd_out, "set=%s ", rpn->u.setname); + break; case CCL_RPN_PROX: - fprintf (fd_out, "@prox "); - ccl_pr_tree_as_qrpn (rpn->u.p[0], fd_out,indent+2); - ccl_pr_tree_as_qrpn (rpn->u.p[1], fd_out,indent+2); - break; + if (rpn->u.p[2] && rpn->u.p[2]->kind == CCL_RPN_TERM) + { + const char *cp = rpn->u.p[2]->u.t.term; + /* exclusion distance ordered relation which-code unit-code */ + if (*cp == '!') + { + /* word order specified */ + if (isdigit(cp[1])) + fprintf(fd_out, "@prox 0 %s 1 2 known 2 ", cp+1); + else + fprintf(fd_out, "@prox 0 1 1 2 known 2 "); + } + else if (*cp == '%') + { + /* word order not specified */ + if (isdigit(cp[1])) + fprintf(fd_out, "@prox 0 %s 0 2 known 2 ", cp+1); + else + fprintf(fd_out, "@prox 0 1 0 2 known 2 "); + } + } + ccl_pr_tree_as_qrpn (rpn->u.p[0], fd_out,indent+2); + ccl_pr_tree_as_qrpn (rpn->u.p[1], fd_out,indent+2); + break; 
default: fprintf(stderr,"Internal Error Unknown ccl_rpn node type %d\n",rpn->kind); } diff --git a/ccl/ccltoken.c b/ccl/ccltoken.c index b749a9d..e530e11 100644 --- a/ccl/ccltoken.c +++ b/ccl/ccltoken.c @@ -44,7 +44,7 @@ /* CCL - lexical analysis * Europagate, 1995 * - * $Id: ccltoken.c,v 1.20 2002-10-14 19:45:36 adam Exp $ + * $Id: ccltoken.c,v 1.21 2002-12-28 12:13:03 adam Exp $ * * Old Europagate Log: * @@ -244,7 +244,7 @@ struct ccl_token *ccl_parser_tokenize (CCL_parser cclp, const char *command) case '%': case '!': last->kind = CCL_TOK_PROX; - while (*cp == '%' || *cp == '!') + while (isdigit(*cp)) { ++ last->len; cp++; diff --git a/include/yaz/ccl.h b/include/yaz/ccl.h index 1fb5503..076b899 100644 --- a/include/yaz/ccl.h +++ b/include/yaz/ccl.h @@ -45,7 +45,7 @@ /* * CCL - header file * - * $Id: ccl.h,v 1.12 2002-06-06 13:02:01 adam Exp $ + * $Id: ccl.h,v 1.13 2002-12-28 12:13:03 adam Exp $ * * Old Europagate Log: * @@ -125,7 +125,7 @@ struct ccl_rpn_attr { struct ccl_rpn_node { int kind; union { - struct ccl_rpn_node *p[2]; + struct ccl_rpn_node *p[3]; struct { char *term; struct ccl_rpn_attr *attr_list; diff --git a/zutil/logrpn.c b/zutil/logrpn.c index 57d2d24..95460cb 100644 --- a/zutil/logrpn.c +++ b/zutil/logrpn.c @@ -2,15 +2,34 @@ * Copyright (C) 1995-2001, Index Data * All rights reserved. 
* - * $Id: logrpn.c,v 1.7 2002-07-25 12:48:54 adam Exp $ + * $Id: logrpn.c,v 1.8 2002-12-28 12:13:03 adam Exp $ */ #include #include #include +static const char *relToStr(int v) +{ + const char *str = 0; + switch (v) + { + case 1: str = "Less than"; break; + case 2: str = "Less than or equal"; break; + case 3: str = "Equal"; break; + case 4: str = "Greater or equal"; break; + case 5: str = "Greater than"; break; + case 6: str = "Not equal"; break; + case 100: str = "Phonetic"; break; + case 101: str = "Stem"; break; + case 102: str = "Relevance"; break; + case 103: str = "AlwaysMatches"; break; + } + return str; +} static void attrStr (int type, int value, enum oid_value ast, char *str) { + const char *rstr; *str = '\0'; switch (ast) { @@ -23,41 +42,11 @@ static void attrStr (int type, int value, enum oid_value ast, char *str) sprintf (str, "use"); break; case 2: - switch (value) - { - case 1: - sprintf (str, "relation=Less than"); - break; - case 2: - sprintf (str, "relation=Less than or equal"); - break; - case 3: - sprintf (str, "relation=Equal"); - break; - case 4: - sprintf (str, "relation=Greater or equal"); - break; - case 5: - sprintf (str, "relation=Greater than"); - break; - case 6: - sprintf (str, "relation=Not equal"); - break; - case 100: - sprintf (str, "relation=Phonetic"); - break; - case 101: - sprintf (str, "relation=Stem"); - break; - case 102: - sprintf (str, "relation=Relevance"); - break; - case 103: - sprintf (str, "relation=AlwaysMatches"); - break; - default: - sprintf (str, "relation"); - } + rstr = relToStr(value); + if (rstr) + sprintf (str, "relation=%s", rstr); + else + sprintf (str, "relation=%d", value); break; case 3: switch (value) @@ -239,7 +228,10 @@ static void zlog_structure (Z_RPNStructure *zs, int level, enum oid_value ast) { if (zs->which == Z_RPNStructure_complex) { - switch (zs->u.complex->roperator->which) + Z_Operator *op = zs->u.complex->roperator; + const char *rstr = 0; + const char *unit = "private"; + switch 
(op->which) { case Z_Operator_and: yaz_log (LOG_LOG, "%*.0s and", level, ""); @@ -251,7 +243,33 @@ static void zlog_structure (Z_RPNStructure *zs, int level, enum oid_value ast) yaz_log (LOG_LOG, "%*.0s and-not", level, ""); break; case Z_Operator_prox: - yaz_log (LOG_LOG, "%*.0s proximity", level, ""); + if (op->u.prox->which == Z_ProximityOperator_known) + { + switch(*op->u.prox->u.known) + { + case Z_ProxUnit_character: unit = "character"; break; + case Z_ProxUnit_word: unit = "word"; break; + case Z_ProxUnit_sentence: unit = "sentence"; break; + case Z_ProxUnit_paragraph: unit = "paragraph"; break; + case Z_ProxUnit_section: unit = "section"; break; + case Z_ProxUnit_chapter: unit = "chapter"; break; + case Z_ProxUnit_document: unit = "document"; break; + case Z_ProxUnit_element: unit = "element"; break; + case Z_ProxUnit_subelement: unit = "subelement"; break; + case Z_ProxUnit_elementType: unit = "elementType"; break; + case Z_ProxUnit_byte: unit = "byte"; break; + default: unit = "unknown"; break; + } + } + rstr = relToStr(*op->u.prox->relationType); + yaz_log (LOG_LOG, "%*.0s prox excl=%s dist=%d order=%s " + "rel=%s unit=%s", + level, "", op->u.prox->exclusion ? + (*op->u.prox->exclusion ? "T" : "F") : "N", + *op->u.prox->distance, + *op->u.prox->ordered ? "T" : "F", + rstr ? rstr : "unknown", + unit); break; default: yaz_log (LOG_LOG, "%*.0s unknown complex", level, ""); diff --git a/zutil/yaz-ccl.c b/zutil/yaz-ccl.c index f8b8050..45eeb37 100644 --- a/zutil/yaz-ccl.c +++ b/zutil/yaz-ccl.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 1996-2001, Index Data. + * Copyright (c) 1996-2002, Index Data. * See the file LICENSE for details. 
* - * $Id: yaz-ccl.c,v 1.15 2001-11-13 23:00:43 adam Exp $ + * $Id: yaz-ccl.c,v 1.16 2002-12-28 12:13:03 adam Exp $ */ #include @@ -10,183 +10,38 @@ #include #include +#include -static Z_RPNStructure *ccl_rpn_structure (ODR o, struct ccl_rpn_node *p); - -static Z_AttributesPlusTerm *ccl_rpn_term (ODR o, struct ccl_rpn_node *p) +Z_RPNQuery *ccl_rpn_query (ODR o, struct ccl_rpn_node *p) { - struct ccl_rpn_attr *attr; - int num = 0; - Z_AttributesPlusTerm *zapt; - Odr_oct *term_octet; - Z_Term *term; - Z_AttributeElement **elements; - - zapt = (Z_AttributesPlusTerm *)odr_malloc (o, sizeof(*zapt)); - - term_octet = (Odr_oct *)odr_malloc (o, sizeof(*term_octet)); + YAZ_PQF_Parser parser = yaz_pqf_create(); + WRBUF wr = wrbuf_alloc(); + Z_RPNQuery *q; - term = (Z_Term *)odr_malloc (o, sizeof(*term)); + ccl_pquery(wr, p); - for (attr = p->u.t.attr_list; attr; attr = attr->next) - num++; - if (!num) - elements = (Z_AttributeElement**)odr_nullval(); - else - { - int i = 0; - elements = (Z_AttributeElement **) - odr_malloc (o, num*sizeof(*elements)); - for (attr = p->u.t.attr_list; attr; attr = attr->next, i++) - { - elements[i] = (Z_AttributeElement *) - odr_malloc (o, sizeof(**elements)); - elements[i]->attributeType = - (int *)odr_malloc(o, sizeof(int)); - *elements[i]->attributeType = attr->type; - elements[i]->attributeSet = 0; - if (attr->set && *attr->set) - { - int value = oid_getvalbyname (attr->set); + printf ("pqf=%s\n", wrbuf_buf(wr)); + q = yaz_pqf_parse(parser, o, wrbuf_buf(wr)); - if (value != VAL_NONE) - { - elements[i]->attributeSet = - yaz_oidval_to_z3950oid(o, CLASS_ATTSET, value); - } - } - elements[i]->which = Z_AttributeValue_numeric; - elements[i]->value.numeric = - (int *)odr_malloc (o, sizeof(int)); - *elements[i]->value.numeric = attr->value; - } - } - zapt->attributes = (Z_AttributeList *) - odr_malloc (o, sizeof(*zapt->attributes)); - zapt->attributes->num_attributes = num; - zapt->attributes->attributes = elements; - zapt->term = term; - 
term->which = Z_Term_general; - term->u.general = term_octet; - term_octet->len = term_octet->size = strlen (p->u.t.term); - term_octet->buf = (unsigned char *)odr_malloc (o, term_octet->len+1); - strcpy ((char*) term_octet->buf, p->u.t.term); - return zapt; + wrbuf_free(wr, 1); + yaz_pqf_destroy(parser); + return q; } -static Z_Operand *ccl_rpn_simple (ODR o, struct ccl_rpn_node *p) -{ - Z_Operand *zo; - - zo = (Z_Operand *)odr_malloc (o, sizeof(*zo)); - - switch (p->kind) - { - case CCL_RPN_TERM: - zo->which = Z_Operand_APT; - zo->u.attributesPlusTerm = ccl_rpn_term (o, p); - break; - case CCL_RPN_SET: - zo->which = Z_Operand_resultSetId; - zo->u.resultSetId = odr_strdup (o, p->u.setname); - break; - default: - return 0; - } - return zo; -} - -static Z_Complex *ccl_rpn_complex (ODR o, struct ccl_rpn_node *p) -{ - Z_Complex *zc; - Z_Operator *zo; - - zc = (Z_Complex *)odr_malloc (o, sizeof(*zc)); - zo = (Z_Operator *)odr_malloc (o, sizeof(*zo)); - - zc->roperator = zo; - switch (p->kind) - { - case CCL_RPN_AND: - zo->which = Z_Operator_and; - zo->u.and_not = odr_nullval(); - break; - case CCL_RPN_OR: - zo->which = Z_Operator_or; - zo->u.and_not = odr_nullval(); - break; - case CCL_RPN_NOT: - zo->which = Z_Operator_and_not; - zo->u.and_not = odr_nullval(); - break; - case CCL_RPN_PROX: - zo->which = Z_Operator_prox; - zo->u.prox = (Z_ProximityOperator *) - odr_malloc (o, sizeof(*zo->u.prox)); - zo->u.prox->exclusion = 0; - - zo->u.prox->distance = (int *) - odr_malloc (o, sizeof(*zo->u.prox->distance)); - *zo->u.prox->distance = 2; - - zo->u.prox->ordered = (bool_t *) - odr_malloc (o, sizeof(*zo->u.prox->ordered)); - *zo->u.prox->ordered = 0; - - zo->u.prox->relationType = (int *) - odr_malloc (o, sizeof(*zo->u.prox->relationType)); - *zo->u.prox->relationType = Z_ProximityOperator_Prox_lessThan; - zo->u.prox->which = Z_ProximityOperator_known; - zo->u.prox->u.known = - (Z_ProxUnit *) odr_malloc (o, sizeof(*zo->u.prox->u.known)); - *zo->u.prox->u.known = 
Z_ProxUnit_word; - break; - default: - return 0; - } - zc->s1 = ccl_rpn_structure (o, p->u.p[0]); - zc->s2 = ccl_rpn_structure (o, p->u.p[1]); - return zc; -} - -static Z_RPNStructure *ccl_rpn_structure (ODR o, struct ccl_rpn_node *p) +Z_AttributesPlusTerm *ccl_scan_query (ODR o, struct ccl_rpn_node *p) { - Z_RPNStructure *zs; + YAZ_PQF_Parser parser = yaz_pqf_create(); + WRBUF wr = wrbuf_alloc(); + Z_AttributesPlusTerm *q; + Odr_oid *setp; - zs = (Z_RPNStructure *)odr_malloc (o, sizeof(*zs)); - switch (p->kind) - { - case CCL_RPN_AND: - case CCL_RPN_OR: - case CCL_RPN_NOT: - case CCL_RPN_PROX: - zs->which = Z_RPNStructure_complex; - zs->u.complex = ccl_rpn_complex (o, p); - break; - case CCL_RPN_TERM: - case CCL_RPN_SET: - zs->which = Z_RPNStructure_simple; - zs->u.simple = ccl_rpn_simple (o, p); - break; - default: - return 0; - } - return zs; -} + ccl_pquery(wr, p); -Z_RPNQuery *ccl_rpn_query (ODR o, struct ccl_rpn_node *p) -{ - Z_RPNQuery *zq = (Z_RPNQuery *)odr_malloc (o, sizeof(*zq)); - zq->attributeSetId = yaz_oidval_to_z3950oid (o, CLASS_ATTSET, VAL_BIB1); - zq->RPNStructure = ccl_rpn_structure (o, p); - return zq; -} + q = yaz_pqf_scan(parser, o, &setp, wrbuf_buf(wr)); -Z_AttributesPlusTerm *ccl_scan_query (ODR o, struct ccl_rpn_node *p) -{ - if (p->kind != CCL_RPN_TERM) - return NULL; - return ccl_rpn_term (o, p); + wrbuf_free(wr, 1); + yaz_pqf_destroy(parser); + return q; } static void ccl_pquery_complex (WRBUF w, struct ccl_rpn_node *p) @@ -194,7 +49,7 @@ static void ccl_pquery_complex (WRBUF w, struct ccl_rpn_node *p) switch (p->kind) { case CCL_RPN_AND: - wrbuf_puts (w, "@and "); + wrbuf_puts(w, "@and "); break; case CCL_RPN_OR: wrbuf_puts(w, "@or "); @@ -203,7 +58,29 @@ static void ccl_pquery_complex (WRBUF w, struct ccl_rpn_node *p) wrbuf_puts(w, "@not "); break; case CCL_RPN_PROX: - wrbuf_puts(w, "@prox 0 2 0 1 known 2 "); + if (p->u.p[2] && p->u.p[2]->kind == CCL_RPN_TERM) + { + const char *cp = p->u.p[2]->u.t.term; + /* exlusion distance ordered 
relation which-code unit-code */ + if (*cp == '!') + { + /* word order specified */ + if (isdigit(cp[1])) + wrbuf_printf(w, "@prox 0 %s 1 2 k 2 ", cp+1); + else + wrbuf_printf(w, "@prox 0 1 1 2 k 2 "); + } + else if (*cp == '%') + { + /* word order not specified */ + if (isdigit(cp[1])) + wrbuf_printf(w, "@prox 0 %s 0 2 k 2 ", cp+1); + else + wrbuf_printf(w, "@prox 0 1 0 2 k 2 "); + } + } + else + wrbuf_puts(w, "@prox 0 2 0 1 k 2 "); break; default: wrbuf_puts(w, "@ bad op (unknown) "); -- 1.7.10.4