src/ccltoken.c

   1 /*
   2  * Copyright (c) 1995, the EUROPAGATE consortium (see below).
   3  *
   4  * The EUROPAGATE consortium members are:
   5  *
   6  *    University College Dublin
   7  *    Danmarks Teknologiske Videnscenter
   8  *    An Chomhairle Leabharlanna
   9  *    Consejo Superior de Investigaciones Cientificas
  10  *
  11  * Permission to use, copy, modify, distribute, and sell this software and
  12  * its documentation, in whole or in part, for any purpose, is hereby granted,
  13  * provided that:
  14  *
  15  * 1. This copyright and permission notice appear in all copies of the
  16  * software and its documentation. Notices of copyright or attribution
  17  * which appear at the beginning of any file must remain unchanged.
  18  *
  19  * 2. The names of EUROPAGATE or the project partners may not be used to
  20  * endorse or promote products derived from this software without specific
  21  * prior written permission.
  22  *
  23  * 3. Users of this software (implementors and gateway operators) agree to
  24  * inform the EUROPAGATE consortium of their use of the software. This
  25  * information will be used to evaluate the EUROPAGATE project and the
  26  * software, and to plan further developments. The consortium may use
  27  * the information in later publications.
  28  *
  29  * 4. Users of this software agree to make their best efforts, when
  30  * documenting their use of the software, to acknowledge the EUROPAGATE
  31  * consortium, and the role played by the software in their work.
  32  *
  33  * THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,
  34  * EXPRESS, IMPLIED, OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
  35  * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
  36  * IN NO EVENT SHALL THE EUROPAGATE CONSORTIUM OR ITS MEMBERS BE LIABLE
  37  * FOR ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF
  38  * ANY KIND, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
  39  * OR PROFITS, WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND
  40  * ON ANY THEORY OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE
  41  * USE OR PERFORMANCE OF THIS SOFTWARE.
  42  *
  43  */
  44 /**
  45  * \file ccltoken.c
  46  * \brief Implements CCL lexical analyzer (scanner)
  47  */
  48 /* CCL - lexical analysis
  49  * Europagate, 1995
  50  *
  51  * $Id: ccltoken.c,v 1.5 2004-10-15 00:19:00 adam Exp $
  52  *
  53  * Old Europagate Log:
  54  *
  55  * Revision 1.10  1995/07/11  12:28:31  adam
  56  * New function: ccl_token_simple (split into simple tokens) and
  57  *  ccl_token_del (delete tokens).
  58  *
  59  * Revision 1.9  1995/05/16  09:39:28  adam
  60  * LICENSE.
  61  *
  62  * Revision 1.8  1995/05/11  14:03:57  adam
  63  * Changes in the reading of qualifier(s). New function: ccl_qual_fitem.
  64  * New variable ccl_case_sensitive, which controls whether reserved
  65  * words and field names are case sensitive or not.
  66  *
  67  * Revision 1.7  1995/04/19  12:11:24  adam
  68  * Minor change.
  69  *
  70  * Revision 1.6  1995/04/17  09:31:48  adam
  71  * Improved handling of qualifiers. Aliases or reserved words.
  72  *
  73  * Revision 1.5  1995/02/23  08:32:00  adam
  74  * Changed header.
  75  *
  76  * Revision 1.3  1995/02/15  17:42:16  adam
  77  * Minor changes of the api of this module. FILE* argument added
  78  * to ccl_pr_tree.
  79  *
  80  * Revision 1.2  1995/02/14  19:55:13  adam
  81  * Header files ccl.h/cclp.h are gone! They have been merged and
  82  * moved to ../include/ccl.h.
  83  * Node kind(s) in ccl_rpn_node have changed names.
  84  *
  85  * Revision 1.1  1995/02/13  12:35:21  adam
  86  * First version of CCL. Qualifiers aren't handled yet.
  87  *
  88  */
  89
  90 #include <string.h>
  91 #include <stdlib.h>
  92 #include <ctype.h>
  93
  94 #include <yaz/ccl.h>
  95
  96 /*
  97  * token_cmp: Compare token with keyword(s)
  98  * kw:     Keyword list. Each keyword is separated by space.
  99  * token:  CCL token.
 100  * return: 1 if token string matches one of the keywords in list;
 101  *         0 otherwise.
 102  */
 103 static int token_cmp (CCL_parser cclp, const char *kw, struct ccl_token *token)
 104 {
 105     const char *cp1 = kw;
 106     const char *cp2;
 107     const char *aliases;
 108     int case_sensitive = cclp->ccl_case_sensitive;
 109
 110     aliases = ccl_qual_search_special(cclp->bibset, "case");
 111     if (aliases)
 112         case_sensitive = atoi(aliases);
 113     if (!kw)
 114         return 0;
 115     while ((cp2 = strchr (cp1, ' ')))
 116     {
 117         if (token->len == (size_t) (cp2-cp1))
 118         {
 119             if (case_sensitive)
 120             {
 121                 if (!memcmp (cp1, token->name, token->len))
 122                     return 1;
 123             }
 124             else
 125             {
 126                 if (!ccl_memicmp (cp1, token->name, token->len))
 127                     return 1;
 128             }
 129         }
 130         cp1 = cp2+1;
 131     }
 132     if (case_sensitive)
 133         return token->len == strlen(cp1)
 134             && !memcmp (cp1, token->name, token->len);
 135     return token->len == strlen(cp1) &&
 136         !ccl_memicmp (cp1, token->name, token->len);
 137 }
 138
 139 /*
 140  * ccl_token_simple: tokenize CCL raw tokens
 141  */
 142 struct ccl_token *ccl_token_simple (const char *command)
 143 {
 144     const char *cp = command;
 145     struct ccl_token *first = NULL;
 146     struct ccl_token *last = NULL;
 147
 148     while (1)
 149     {
 150         while (*cp && strchr (" \t\r\n", *cp))
 151         {
 152             cp++;
 153             continue;
 154         }
 155         if (!first)
 156         {
 157             first = last = (struct ccl_token *)xmalloc (sizeof (*first));
 158             ccl_assert (first);
 159             last->prev = NULL;
 160         }
 161         else
 162         {
 163             last->next = (struct ccl_token *)xmalloc (sizeof(*first));
 164             ccl_assert (last->next);
 165             last->next->prev = last;
 166             last = last->next;
 167         }
 168         last->next = NULL;
 169         last->name = cp;
 170         last->len = 1;
 171         switch (*cp++)
 172         {
 173         case '\0':
 174             last->kind = CCL_TOK_EOL;
 175             return first;
 176         case '\"':
 177             last->kind = CCL_TOK_TERM;
 178             last->name = cp;
 179             last->len = 0;
 180             while (*cp && *cp != '\"')
 181             {
 182                 cp++;
 183                 ++ last->len;
 184             }
 185             if (*cp == '\"')
 186                 cp++;
 187             break;
 188         default:
 189             while (*cp && !strchr (" \t\n\r", *cp))
 190             {
 191                 cp++;
 192                 ++ last->len;
 193             }
 194             last->kind = CCL_TOK_TERM;
 195         }
 196     }
 197     return first;
 198 }
 199
 200
 201 /*
 202  * ccl_tokenize: tokenize CCL command string.
 203  * return: CCL token list.
 204  */
 205 struct ccl_token *ccl_parser_tokenize (CCL_parser cclp, const char *command)
 206 {
 207     const char *aliases;
 208     const unsigned char *cp = (const unsigned char *) command;
 209     struct ccl_token *first = NULL;
 210     struct ccl_token *last = NULL;
 211
 212     while (1)
 213     {
 214         while (*cp && strchr (" \t\r\n", *cp))
 215         {
 216             cp++;
 217             continue;
 218         }
 219         if (!first)
 220         {
 221             first = last = (struct ccl_token *)xmalloc (sizeof (*first));
 222             ccl_assert (first);
 223             last->prev = NULL;
 224         }
 225         else
 226         {
 227             last->next = (struct ccl_token *)xmalloc (sizeof(*first));
 228             ccl_assert (last->next);
 229             last->next->prev = last;
 230             last = last->next;
 231         }
 232         last->next = NULL;
 233         last->name = (const char *) cp;
 234         last->len = 1;
 235         switch (*cp++)
 236         {
 237         case '\0':
 238             last->kind = CCL_TOK_EOL;
 239             return first;
 240         case '(':
 241             last->kind = CCL_TOK_LP;
 242             break;
 243         case ')':
 244             last->kind = CCL_TOK_RP;
 245             break;
 246         case ',':
 247             last->kind = CCL_TOK_COMMA;
 248             break;
 249         case '%':
 250         case '!':
 251             last->kind = CCL_TOK_PROX;
 252             while (isdigit(*cp))
 253             {
 254                 ++ last->len;
 255                 cp++;
 256             }
 257             break;
 258         case '>':
 259         case '<':
 260         case '=':
 261             if (*cp == '=' || *cp == '<' || *cp == '>')
 262             {
 263                 cp++;
 264                 last->kind = CCL_TOK_REL;
 265                 ++ last->len;
 266             }
 267             else if (cp[-1] == '=')
 268                 last->kind = CCL_TOK_EQ;
 269             else
 270                 last->kind = CCL_TOK_REL;
 271             break;
 272         case '\"':
 273             last->kind = CCL_TOK_TERM;
 274             last->name = (const char *) cp;
 275             last->len = 0;
 276             while (*cp && *cp != '\"')
 277             {
 278                 cp++;
 279                 ++ last->len;
 280             }
 281             if (*cp == '\"')
 282                 cp++;
 283             break;
 284         default:
 285             if (!strchr ("(),%!><= \t\n\r", cp[-1]))
 286             {
 287                 while (*cp && !strchr ("(),%!><= \t\n\r", *cp))
 288                 {
 289                     cp++;
 290                     ++ last->len;
 291                 }
 292             }
 293             last->kind = CCL_TOK_TERM;
 294
 295             aliases = ccl_qual_search_special(cclp->bibset, "and");
 296             if (!aliases)
 297                 aliases = cclp->ccl_token_and;
 298             if (token_cmp (cclp, aliases, last))
 299                 last->kind = CCL_TOK_AND;
 300
 301             aliases = ccl_qual_search_special(cclp->bibset, "or");
 302             if (!aliases)
 303                 aliases = cclp->ccl_token_or;
 304             if (token_cmp (cclp, aliases, last))
 305                 last->kind = CCL_TOK_OR;
 306
 307             aliases = ccl_qual_search_special(cclp->bibset, "not");
 308             if (!aliases)
 309                 aliases = cclp->ccl_token_not;
 310             if (token_cmp (cclp, aliases, last))
 311                 last->kind = CCL_TOK_NOT;
 312
 313             aliases = ccl_qual_search_special(cclp->bibset, "set");
 314             if (!aliases)
 315                 aliases = cclp->ccl_token_set;
 316
 317             if (token_cmp (cclp, aliases, last))
 318                 last->kind = CCL_TOK_SET;
 319         }
 320     }
 321     return first;
 322 }
 323
 324 struct ccl_token *ccl_token_add (struct ccl_token *at)
 325 {
 326     struct ccl_token *n = (struct ccl_token *)xmalloc (sizeof(*n));
 327     ccl_assert(n);
 328     n->next = at->next;
 329     n->prev = at;
 330     at->next = n;
 331     if (n->next)
 332         n->next->prev = n;
 333
 334     n->kind = CCL_TOK_TERM;
 335     n->name = 0;
 336     n->len = 0;
 337     return n;
 338 }
 339
 340 struct ccl_token *ccl_tokenize (const char *command)
 341 {
 342     CCL_parser cclp = ccl_parser_create ();
 343     struct ccl_token *list;
 344
 345     list = ccl_parser_tokenize (cclp, command);
 346
 347     ccl_parser_destroy (cclp);
 348     return list;
 349 }
 350
 351 /*
 352  * ccl_token_del: delete CCL tokens
 353  */
 354 void ccl_token_del (struct ccl_token *list)
 355 {
 356     struct ccl_token *list1;
 357
 358     while (list)
 359     {
 360         list1 = list->next;
 361         xfree (list);
 362         list = list1;
 363     }
 364 }
 365
 /*
  * ccl_strdup: duplicate a string using xmalloc.
  * str:    NUL-terminated string to copy. May be NULL.
  * return: Newly allocated copy, to be released with xfree; NULL if
  *         str is NULL or the allocation returned NULL.
  */
 char *ccl_strdup (const char *str)
 {
     size_t size;
     char *copy;

     if (!str)
         return NULL;

     /* size_t, not int: the length must not be truncated. */
     size = strlen(str) + 1;
     copy = (char *) xmalloc (size);
     if (copy)
         memcpy (copy, str, size);   /* includes the terminating NUL */
     return copy;
 }
 373
 374 CCL_parser ccl_parser_create (void)
 375 {
 376     CCL_parser p = (CCL_parser)xmalloc (sizeof(*p));
 377     if (!p)
 378         return p;
 379     p->look_token = NULL;
 380     p->error_code = 0;
 381     p->error_pos = NULL;
 382     p->bibset = NULL;
 383
 384     p->ccl_token_and = ccl_strdup("and");
 385     p->ccl_token_or = ccl_strdup("or");
 386     p->ccl_token_not = ccl_strdup("not andnot");
 387     p->ccl_token_set = ccl_strdup("set");
 388     p->ccl_case_sensitive = 1;
 389
 390     return p;
 391 }
 392
 393 void ccl_parser_destroy (CCL_parser p)
 394 {
 395     if (!p)
 396         return;
 397     xfree (p->ccl_token_and);
 398     xfree (p->ccl_token_or);
 399     xfree (p->ccl_token_not);
 400     xfree (p->ccl_token_set);
 401     xfree (p);
 402 }
 403
 404 void ccl_parser_set_op_and (CCL_parser p, const char *op)
 405 {
 406     if (p && op)
 407     {
 408         if (p->ccl_token_and)
 409             xfree (p->ccl_token_and);
 410         p->ccl_token_and = ccl_strdup (op);
 411     }
 412 }
 413
 414 void ccl_parser_set_op_or (CCL_parser p, const char *op)
 415 {
 416     if (p && op)
 417     {
 418         if (p->ccl_token_or)
 419             xfree (p->ccl_token_or);
 420         p->ccl_token_or = ccl_strdup (op);
 421     }
 422 }
 423 void ccl_parser_set_op_not (CCL_parser p, const char *op)
 424 {
 425     if (p && op)
 426     {
 427         if (p->ccl_token_not)
 428             xfree (p->ccl_token_not);
 429         p->ccl_token_not = ccl_strdup (op);
 430     }
 431 }
 432 void ccl_parser_set_op_set (CCL_parser p, const char *op)
 433 {
 434     if (p && op)
 435     {
 436         if (p->ccl_token_set)
 437             xfree (p->ccl_token_set);
 438         p->ccl_token_set = ccl_strdup (op);
 439     }
 440 }
 441
 442 void ccl_parser_set_case (CCL_parser p, int case_sensitivity_flag)
 443 {
 444     if (p)
 445         p->ccl_case_sensitive = case_sensitivity_flag;
 446 }