From: Adam Dickmeiss Date: Thu, 25 Aug 2011 12:06:40 +0000 (+0200) Subject: ccl2rpn: Conversion to regexp-1 terms (trunc=102). X-Git-Tag: v4.2.11~2 X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=commitdiff_plain;h=835fe1fa5d34428ba2803cd4a2b1a9b9aec48ab0 ccl2rpn: Conversion to regexp-1 terms (trunc=102). For mode t=x, the CCL parser will map both # and ? to their regular expression equivalents (. and .*). --- diff --git a/doc/tools.xml b/doc/tools.xml index 7416b1d..cc3e3c5 100644 --- a/doc/tools.xml +++ b/doc/tools.xml @@ -731,6 +731,15 @@ set to both left&right. + + t=x + Allows masking anywhere in a term, thus fully supporting + # (mask one character) and ? (zero or more of any character). + If masking is used, truncation is set to 102 (regexp-1 in term) + and the term is converted accordingly to a regular expression. + + + diff --git a/include/yaz/ccl.h b/include/yaz/ccl.h index fa84877..d3e3032 100644 --- a/include/yaz/ccl.h +++ b/include/yaz/ccl.h @@ -352,6 +352,7 @@ int ccl_stop_words_info(ccl_stop_words_t csw, int idx, #define CCL_BIB1_TRU_CAN_RIGHT (-2) #define CCL_BIB1_TRU_CAN_BOTH (-3) #define CCL_BIB1_TRU_CAN_NONE (-4) +#define CCL_BIB1_TRU_CAN_REGEX (-5) diff --git a/src/cclfind.c b/src/cclfind.c index d5518df..f242169 100644 --- a/src/cclfind.c +++ b/src/cclfind.c @@ -17,6 +17,7 @@ #include #include +#include #include "cclp.h" @@ -258,6 +259,7 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, int len = 0; int left_trunc = 0; int right_trunc = 0; + int regex_trunc = 0; size_t max = 200; if (and_list || or_list || !multi) max = 1; @@ -356,26 +358,23 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, ccl_add_attr_numeric(p, attset, CCL_BIB1_STR, 1); } + if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_REGEX, + &attset)) + { + regex_trunc = 1; /* regex trunc (102) allowed */ + } + /* make the RPN token */ - p->u.t.term = (char *)xmalloc(len); + p->u.t.term = (char *)xmalloc(len * 2 + 2); ccl_assert(p->u.t.term); 
p->u.t.term[0] = '\0'; for (i = 0; ilook_token->name; size_t src_len = cclp->look_token->len; + int j; + int quote_mode = 0; - if (i == 0 && src_len > 0 && *src_str == '?') - { - src_len--; - src_str++; - left_trunc = 1; - } - if (i == no - 1 && src_len > 0 && src_str[src_len-1] == '?') - { - src_len--; - right_trunc = 1; - } if (p->u.t.term[0] && cclp->look_token->ws_prefix_len) { size_t len = strlen(p->u.t.term); @@ -383,7 +382,61 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, cclp->look_token->ws_prefix_len); p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0'; } - strxcat(p->u.t.term, src_str, src_len); + for (j = 0; j < src_len; j++) + { + if (j > 0 && src_str[j-1] == '\\') + { + if (regex_trunc && strchr("()[]?*.", src_str[j])) + { + regex_trunc = 2; + strcat(p->u.t.term, "\\\\"); + } + strxcat(p->u.t.term, src_str + j, 1); + } + else if (src_str[j] == '"') + quote_mode = !quote_mode; + else if (!quote_mode && src_str[j] == '?') + { + if (regex_trunc) + { + strcat(p->u.t.term, ".*"); + regex_trunc = 2; /* regex trunc is really needed */ + } + else if (i == 0 && j == 0) + left_trunc = 1; + else if (i == no - 1 && j == src_len - 1) + right_trunc = 1; + else + { + cclp->error_code = CCL_ERR_TRUNC_NOT_BOTH; + ccl_rpn_delete(p); + return NULL; + } + } + else if (!quote_mode && src_str[j] == '#') + { + if (regex_trunc) + { + strcat(p->u.t.term, "."); + regex_trunc = 2; /* regex trunc is really needed */ + } + else + { + cclp->error_code = CCL_ERR_TRUNC_NOT_BOTH; + ccl_rpn_delete(p); + return NULL; + } + } + else if (src_str[j] != '\\') + { + if (regex_trunc && strchr("()[]?*.", src_str[j])) + { + regex_trunc = 2; + strcat(p->u.t.term, "\\\\"); + } + strxcat(p->u.t.term, src_str + j, 1); + } + } ADVANCE; } @@ -440,6 +493,10 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, } ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 2); } + else if (regex_trunc == 2) + { + ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 102); + } else { if 
(qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_NONE, diff --git a/src/cclqfile.c b/src/cclqfile.c index c49df6c..16e658f 100644 --- a/src/cclqfile.c +++ b/src/cclqfile.c @@ -158,6 +158,8 @@ int ccl_qual_field2(CCL_bibset bibset, const char *cp, const char *qual_name, value = CCL_BIB1_TRU_CAN_BOTH; else if (!ccl_stricmp (value_str, "n")) value = CCL_BIB1_TRU_CAN_NONE; + else if (!ccl_stricmp (value_str, "x")) + value = CCL_BIB1_TRU_CAN_REGEX; break; case 'c': case 'C': diff --git a/src/ccltoken.c b/src/ccltoken.c index 6c74226..5211fe8 100644 --- a/src/ccltoken.c +++ b/src/ccltoken.c @@ -126,53 +126,60 @@ struct ccl_token *ccl_parser_tokenize(CCL_parser cclp, const char *command) default: --cp; --last->len; - if (*cp == '"') + + last->kind = CCL_TOK_TERM; + last->name = (const char *) cp; + while (*cp && !strchr("(),%!><= \t\n\r", *cp)) { - cp++; - last->kind = CCL_TOK_TERM; - last->name = (const char *) cp; - while (*cp && *cp != '"') + if (*cp == '\\' && cp[1]) { cp++; ++ last->len; } - if (*cp) - cp++; - } - else - { - last->kind = CCL_TOK_TERM; - last->name = (const char *) cp; - while (*cp && !strchr("(),%!><= \t\n\r", *cp)) + else if (*cp == '"') { - ++ last->len; - cp++; - } - aliases = ccl_qual_search_special(cclp->bibset, "and"); - if (!aliases) - aliases = cclp->ccl_token_and; - if (token_cmp(cclp, aliases, last)) - last->kind = CCL_TOK_AND; - - aliases = ccl_qual_search_special(cclp->bibset, "or"); - if (!aliases) - aliases = cclp->ccl_token_or; - if (token_cmp(cclp, aliases, last)) - last->kind = CCL_TOK_OR; - - aliases = ccl_qual_search_special(cclp->bibset, "not"); - if (!aliases) - aliases = cclp->ccl_token_not; - if (token_cmp(cclp, aliases, last)) - last->kind = CCL_TOK_NOT; - - aliases = ccl_qual_search_special(cclp->bibset, "set"); - if (!aliases) - aliases = cclp->ccl_token_set; - - if (token_cmp(cclp, aliases, last)) - last->kind = CCL_TOK_SET; + while (*cp) + { + cp++; + ++ last->len; + if (*cp == '\\' && cp[1]) + { + cp++; + ++ last->len; + 
} + else if (*cp == '"') + break; + } + } + if (!*cp) + break; + cp++; + ++ last->len; } + aliases = ccl_qual_search_special(cclp->bibset, "and"); + if (!aliases) + aliases = cclp->ccl_token_and; + if (token_cmp(cclp, aliases, last)) + last->kind = CCL_TOK_AND; + + aliases = ccl_qual_search_special(cclp->bibset, "or"); + if (!aliases) + aliases = cclp->ccl_token_or; + if (token_cmp(cclp, aliases, last)) + last->kind = CCL_TOK_OR; + + aliases = ccl_qual_search_special(cclp->bibset, "not"); + if (!aliases) + aliases = cclp->ccl_token_not; + if (token_cmp(cclp, aliases, last)) + last->kind = CCL_TOK_NOT; + + aliases = ccl_qual_search_special(cclp->bibset, "set"); + if (!aliases) + aliases = cclp->ccl_token_set; + + if (token_cmp(cclp, aliases, last)) + last->kind = CCL_TOK_SET; } } return first; diff --git a/test/test_ccl.c b/test/test_ccl.c index ae413ea..72370e3 100644 --- a/test/test_ccl.c +++ b/test/test_ccl.c @@ -79,7 +79,7 @@ void tst1(int pass) case 0: ccl_qual_fitem(bibset, "u=4 s=pw t=l,r", "ti"); ccl_qual_fitem(bibset, "1=1016 s=al,pw t=r", "term"); - ccl_qual_fitem(bibset, "1=/my/title", "dc.title"); + ccl_qual_fitem(bibset, "1=/my/title t=x", "dc.title"); ccl_qual_fitem(bibset, "r=r", "date"); ccl_qual_fitem(bibset, "r=o", "x"); ccl_qual_fitem(bibset, "dc.title", "title"); @@ -92,7 +92,7 @@ void tst1(int pass) strcpy(tstline, "term 1=1016 s=al,pw t=r # default term"); ccl_qual_line(bibset, tstline); - strcpy(tstline, "dc.title 1=/my/title"); + strcpy(tstline, "dc.title 1=/my/title t=x"); ccl_qual_line(bibset, tstline); strcpy(tstline, "date r=r # ordered relation"); @@ -111,7 +111,7 @@ void tst1(int pass) ccl_qual_buf(bibset, "ti u=4 s=pw t=l,r\n" "term 1=1016 s=al,pw t=r\r\n" "\n" - "dc.title 1=/my/title\n" + "dc.title 1=/my/title t=x\n" "date r=r\n" "x r=o\n" "title dc.title\n" @@ -139,6 +139,7 @@ void tst1(int pass) " \n" " \n" " \n" + " \n" " \n" " \n" " \n" @@ -250,14 +251,33 @@ void tst1(int pass) YAZ_CHECK(tst_ccl_query(bibset, "title=a", "@attr 
1=/my/title a ")); + YAZ_CHECK(tst_ccl_query(bibset, "title=a?b#\"c?\"", + "@attr 5=102 @attr 1=/my/title a.*b.c\\\\? ")); + + YAZ_CHECK(tst_ccl_query(bibset, "title=\\(", + "@attr 5=102 @attr 1=/my/title \\\\( ")); + + YAZ_CHECK(tst_ccl_query(bibset, "title=.", + "@attr 5=102 @attr 1=/my/title \\\\. ")); + + YAZ_CHECK(tst_ccl_query(bibset, "title=\\.", + "@attr 5=102 @attr 1=/my/title \\\\. ")); + + YAZ_CHECK(tst_ccl_query(bibset, "title=\".\"", + "@attr 5=102 @attr 1=/my/title \\\\. ")); + YAZ_CHECK(tst_ccl_query(bibset, "comb=a", "@or @attr 4=2 @attr 1=1016 a " "@attr 1=/my/title a ")); YAZ_CHECK(tst_ccl_query(bibset, "a? b?", - /* correct */ "@and @attr 5=1 @attr 4=2 @attr 1=1016 a " "@attr 5=1 @attr 4=2 @attr 1=1016 b ")); + + YAZ_CHECK(tst_ccl_query(bibset, "\"a\"? \"b?\"", + "@and @attr 5=1 @attr 4=2 @attr 1=1016 a " + "@attr 4=2 @attr 1=1016 b? ")); + ccl_qual_rm(&bibset); }