src/ccltoken.c

   1 /*
   2  * Copyright (c) 1995, the EUROPAGATE consortium (see below).
   3  *
   4  * The EUROPAGATE consortium members are:
   5  *
   6  *    University College Dublin
   7  *    Danmarks Teknologiske Videnscenter
   8  *    An Chomhairle Leabharlanna
   9  *    Consejo Superior de Investigaciones Cientificas
  10  *
  11  * Permission to use, copy, modify, distribute, and sell this software and
  12  * its documentation, in whole or in part, for any purpose, is hereby granted,
  13  * provided that:
  14  *
  15  * 1. This copyright and permission notice appear in all copies of the
  16  * software and its documentation. Notices of copyright or attribution
  17  * which appear at the beginning of any file must remain unchanged.
  18  *
  19  * 2. The names of EUROPAGATE or the project partners may not be used to
  20  * endorse or promote products derived from this software without specific
  21  * prior written permission.
  22  *
  23  * 3. Users of this software (implementors and gateway operators) agree to
  24  * inform the EUROPAGATE consortium of their use of the software. This
  25  * information will be used to evaluate the EUROPAGATE project and the
  26  * software, and to plan further developments. The consortium may use
  27  * the information in later publications.
  28  *
  29  * 4. Users of this software agree to make their best efforts, when
  30  * documenting their use of the software, to acknowledge the EUROPAGATE
  31  * consortium, and the role played by the software in their work.
  32  *
  33  * THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,
  34  * EXPRESS, IMPLIED, OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
  35  * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
  36  * IN NO EVENT SHALL THE EUROPAGATE CONSORTIUM OR ITS MEMBERS BE LIABLE
  37  * FOR ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF
  38  * ANY KIND, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
  39  * OR PROFITS, WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND
  40  * ON ANY THEORY OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE
  41  * USE OR PERFORMANCE OF THIS SOFTWARE.
  42  *
  43  */
  44 /**
  45  * \file ccltoken.c
  46  * \brief Implements CCL lexical analyzer (scanner)
  47  */
  48 /* CCL - lexical analysis
  49  * Europagate, 1995
  50  *
  51  * $Id: ccltoken.c,v 1.6 2005-03-15 16:32:52 adam Exp $
  52  *
  53  * Old Europagate Log:
  54  *
  55  * Revision 1.10  1995/07/11  12:28:31  adam
  56  * New function: ccl_token_simple (split into simple tokens) and
  57  *  ccl_token_del (delete tokens).
  58  *
  59  * Revision 1.9  1995/05/16  09:39:28  adam
  60  * LICENSE.
  61  *
  62  * Revision 1.8  1995/05/11  14:03:57  adam
  63  * Changes in the reading of qualifier(s). New function: ccl_qual_fitem.
  64  * New variable ccl_case_sensitive, which controls whether reserved
  65  * words and field names are case sensitive or not.
  66  *
  67  * Revision 1.7  1995/04/19  12:11:24  adam
  68  * Minor change.
  69  *
  70  * Revision 1.6  1995/04/17  09:31:48  adam
  71  * Improved handling of qualifiers. Aliases or reserved words.
  72  *
  73  * Revision 1.5  1995/02/23  08:32:00  adam
  74  * Changed header.
  75  *
  76  * Revision 1.3  1995/02/15  17:42:16  adam
  77  * Minor changes of the api of this module. FILE* argument added
  78  * to ccl_pr_tree.
  79  *
  80  * Revision 1.2  1995/02/14  19:55:13  adam
  81  * Header files ccl.h/cclp.h are gone! They have been merged an
  82  * moved to ../include/ccl.h.
  83  * Node kind(s) in ccl_rpn_node have changed names.
  84  *
  85  * Revision 1.1  1995/02/13  12:35:21  adam
  86  * First version of CCL. Qualifiers aren't handled yet.
  87  *
  88  */
  89
  90 #include <string.h>
  91 #include <stdlib.h>
  92 #include <ctype.h>
  93
  94 #include <yaz/ccl.h>
  95
  96 /*
  97  * token_cmp: Compare token with keyword(s)
  98  * kw:     Keyword list. Each keyword is separated by space.
  99  * token:  CCL token.
 100  * return: 1 if token string matches one of the keywords in list;
 101  *         0 otherwise.
 102  */
 103 static int token_cmp (CCL_parser cclp, const char *kw, struct ccl_token *token)
 104 {
 105     const char *cp1 = kw;
 106     const char *cp2;
 107     const char *aliases;
 108     int case_sensitive = cclp->ccl_case_sensitive;
 109
 110     aliases = ccl_qual_search_special(cclp->bibset, "case");
 111     if (aliases)
 112         case_sensitive = atoi(aliases);
 113     if (!kw)
 114         return 0;
 115     while ((cp2 = strchr (cp1, ' ')))
 116     {
 117         if (token->len == (size_t) (cp2-cp1))
 118         {
 119             if (case_sensitive)
 120             {
 121                 if (!memcmp (cp1, token->name, token->len))
 122                     return 1;
 123             }
 124             else
 125             {
 126                 if (!ccl_memicmp (cp1, token->name, token->len))
 127                     return 1;
 128             }
 129         }
 130         cp1 = cp2+1;
 131     }
 132     if (case_sensitive)
 133         return token->len == strlen(cp1)
 134             && !memcmp (cp1, token->name, token->len);
 135     return token->len == strlen(cp1) &&
 136         !ccl_memicmp (cp1, token->name, token->len);
 137 }
 138
 139 /*
 140  * ccl_token_simple: tokenize CCL raw tokens
 141  */
 142 struct ccl_token *ccl_token_simple (const char *command)
 143 {
 144     const char *cp = command;
 145     struct ccl_token *first = NULL;
 146     struct ccl_token *last = NULL;
 147
 148     while (1)
 149     {
 150         while (*cp && strchr (" \t\r\n", *cp))
 151         {
 152             cp++;
 153             continue;
 154         }
 155         if (!first)
 156         {
 157             first = last = (struct ccl_token *)xmalloc (sizeof (*first));
 158             ccl_assert (first);
 159             last->prev = NULL;
 160         }
 161         else
 162         {
 163             last->next = (struct ccl_token *)xmalloc (sizeof(*first));
 164             ccl_assert (last->next);
 165             last->next->prev = last;
 166             last = last->next;
 167         }
 168         last->next = NULL;
 169         last->name = cp;
 170         last->len = 1;
 171         switch (*cp++)
 172         {
 173         case '\0':
 174             last->kind = CCL_TOK_EOL;
 175             return first;
 176         case '\"':
 177             last->kind = CCL_TOK_TERM;
 178             last->name = cp;
 179             last->len = 0;
 180             while (*cp && *cp != '\"')
 181             {
 182                 cp++;
 183                 ++ last->len;
 184             }
 185             if (*cp == '\"')
 186                 cp++;
 187             break;
 188         default:
 189             while (*cp && !strchr (" \t\n\r", *cp))
 190             {
 191                 cp++;
 192                 ++ last->len;
 193             }
 194             last->kind = CCL_TOK_TERM;
 195         }
 196     }
 197     return first;
 198 }
 199
 200
 201 /*
 202  * ccl_tokenize: tokenize CCL command string.
 203  * return: CCL token list.
 204  */
 205 struct ccl_token *ccl_parser_tokenize (CCL_parser cclp, const char *command)
 206 {
 207     const char *aliases;
 208     const unsigned char *cp = (const unsigned char *) command;
 209     struct ccl_token *first = NULL;
 210     struct ccl_token *last = NULL;
 211
 212     while (1)
 213     {
 214         const unsigned char *cp0 = cp;
 215         while (*cp && strchr (" \t\r\n", *cp))
 216         {
 217             cp++;
 218             continue;
 219         }
 220         if (!first)
 221         {
 222             first = last = (struct ccl_token *)xmalloc (sizeof (*first));
 223             ccl_assert (first);
 224             last->prev = NULL;
 225         }
 226         else
 227         {
 228             last->next = (struct ccl_token *)xmalloc (sizeof(*first));
 229             ccl_assert (last->next);
 230             last->next->prev = last;
 231             last = last->next;
 232         }
 233         last->ws_prefix_buf = cp0;
 234         last->ws_prefix_len = cp - cp0;
 235         last->next = NULL;
 236         last->name = (const char *) cp;
 237         last->len = 1;
 238         switch (*cp++)
 239         {
 240         case '\0':
 241             last->kind = CCL_TOK_EOL;
 242             return first;
 243         case '(':
 244             last->kind = CCL_TOK_LP;
 245             break;
 246         case ')':
 247             last->kind = CCL_TOK_RP;
 248             break;
 249         case ',':
 250             last->kind = CCL_TOK_COMMA;
 251             break;
 252         case '%':
 253         case '!':
 254             last->kind = CCL_TOK_PROX;
 255             while (isdigit(*cp))
 256             {
 257                 ++ last->len;
 258                 cp++;
 259             }
 260             break;
 261         case '>':
 262         case '<':
 263         case '=':
 264             if (*cp == '=' || *cp == '<' || *cp == '>')
 265             {
 266                 cp++;
 267                 last->kind = CCL_TOK_REL;
 268                 ++ last->len;
 269             }
 270             else if (cp[-1] == '=')
 271                 last->kind = CCL_TOK_EQ;
 272             else
 273                 last->kind = CCL_TOK_REL;
 274             break;
 275         case '\"':
 276             last->kind = CCL_TOK_TERM;
 277             last->name = (const char *) cp;
 278             last->len = 0;
 279             while (*cp && *cp != '\"')
 280             {
 281                 cp++;
 282                 ++ last->len;
 283             }
 284             if (*cp == '\"')
 285                 cp++;
 286             break;
 287         default:
 288             if (!strchr ("(),%!><= \t\n\r", cp[-1]))
 289             {
 290                 while (*cp && !strchr ("(),%!><= \t\n\r", *cp))
 291                 {
 292                     cp++;
 293                     ++ last->len;
 294                 }
 295             }
 296             last->kind = CCL_TOK_TERM;
 297
 298             aliases = ccl_qual_search_special(cclp->bibset, "and");
 299             if (!aliases)
 300                 aliases = cclp->ccl_token_and;
 301             if (token_cmp (cclp, aliases, last))
 302                 last->kind = CCL_TOK_AND;
 303
 304             aliases = ccl_qual_search_special(cclp->bibset, "or");
 305             if (!aliases)
 306                 aliases = cclp->ccl_token_or;
 307             if (token_cmp (cclp, aliases, last))
 308                 last->kind = CCL_TOK_OR;
 309
 310             aliases = ccl_qual_search_special(cclp->bibset, "not");
 311             if (!aliases)
 312                 aliases = cclp->ccl_token_not;
 313             if (token_cmp (cclp, aliases, last))
 314                 last->kind = CCL_TOK_NOT;
 315
 316             aliases = ccl_qual_search_special(cclp->bibset, "set");
 317             if (!aliases)
 318                 aliases = cclp->ccl_token_set;
 319
 320             if (token_cmp (cclp, aliases, last))
 321                 last->kind = CCL_TOK_SET;
 322         }
 323     }
 324     return first;
 325 }
 326
 327 struct ccl_token *ccl_token_add (struct ccl_token *at)
 328 {
 329     struct ccl_token *n = (struct ccl_token *)xmalloc (sizeof(*n));
 330     ccl_assert(n);
 331     n->next = at->next;
 332     n->prev = at;
 333     at->next = n;
 334     if (n->next)
 335         n->next->prev = n;
 336
 337     n->kind = CCL_TOK_TERM;
 338     n->name = 0;
 339     n->len = 0;
 340     return n;
 341 }
 342
 343 struct ccl_token *ccl_tokenize (const char *command)
 344 {
 345     CCL_parser cclp = ccl_parser_create ();
 346     struct ccl_token *list;
 347
 348     list = ccl_parser_tokenize (cclp, command);
 349
 350     ccl_parser_destroy (cclp);
 351     return list;
 352 }
 353
 354 /*
 355  * ccl_token_del: delete CCL tokens
 356  */
 357 void ccl_token_del (struct ccl_token *list)
 358 {
 359     struct ccl_token *list1;
 360
 361     while (list)
 362     {
 363         list1 = list->next;
 364         xfree (list);
 365         list = list1;
 366     }
 367 }
 368
 369 char *ccl_strdup (const char *str)
 370 {
 371     int len = strlen(str);
 372     char *p = (char*) xmalloc (len+1);
 373     strcpy (p, str);
 374     return p;
 375 }
 376
 377 CCL_parser ccl_parser_create (void)
 378 {
 379     CCL_parser p = (CCL_parser)xmalloc (sizeof(*p));
 380     if (!p)
 381         return p;
 382     p->look_token = NULL;
 383     p->error_code = 0;
 384     p->error_pos = NULL;
 385     p->bibset = NULL;
 386
 387     p->ccl_token_and = ccl_strdup("and");
 388     p->ccl_token_or = ccl_strdup("or");
 389     p->ccl_token_not = ccl_strdup("not andnot");
 390     p->ccl_token_set = ccl_strdup("set");
 391     p->ccl_case_sensitive = 1;
 392
 393     return p;
 394 }
 395
 396 void ccl_parser_destroy (CCL_parser p)
 397 {
 398     if (!p)
 399         return;
 400     xfree (p->ccl_token_and);
 401     xfree (p->ccl_token_or);
 402     xfree (p->ccl_token_not);
 403     xfree (p->ccl_token_set);
 404     xfree (p);
 405 }
 406
 407 void ccl_parser_set_op_and (CCL_parser p, const char *op)
 408 {
 409     if (p && op)
 410     {
 411         if (p->ccl_token_and)
 412             xfree (p->ccl_token_and);
 413         p->ccl_token_and = ccl_strdup (op);
 414     }
 415 }
 416
 417 void ccl_parser_set_op_or (CCL_parser p, const char *op)
 418 {
 419     if (p && op)
 420     {
 421         if (p->ccl_token_or)
 422             xfree (p->ccl_token_or);
 423         p->ccl_token_or = ccl_strdup (op);
 424     }
 425 }
 426 void ccl_parser_set_op_not (CCL_parser p, const char *op)
 427 {
 428     if (p && op)
 429     {
 430         if (p->ccl_token_not)
 431             xfree (p->ccl_token_not);
 432         p->ccl_token_not = ccl_strdup (op);
 433     }
 434 }
 435 void ccl_parser_set_op_set (CCL_parser p, const char *op)
 436 {
 437     if (p && op)
 438     {
 439         if (p->ccl_token_set)
 440             xfree (p->ccl_token_set);
 441         p->ccl_token_set = ccl_strdup (op);
 442     }
 443 }
 444
 445 void ccl_parser_set_case (CCL_parser p, int case_sensitivity_flag)
 446 {
 447     if (p)
 448         p->ccl_case_sensitive = case_sensitivity_flag;
 449 }