src/charsets.c

   1 /* $Id: charsets.c,v 1.1 2007-05-10 11:46:09 adam Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4 This file is part of Pazpar2.
   5
   6 Pazpar2 is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 2, or (at your option) any later
   9 version.
  10
  11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Pazpar2; see the file LICENSE.  If not, write to the
  18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19 02111-1307, USA.
  20  */
  21
  22 /** \file charsets.c
  23     \brief Pazpar2 Character set facilities
  24 */
  25
  26 #if HAVE_CONFIG_H
  27 #include "cconfig.h"
  28 #endif
  29
  30 #include <yaz/xmalloc.h>
  31 #include <yaz/wrbuf.h>
  32 #include <ctype.h>
  33 #include <assert.h>
  34 #include "charsets.h"
  35
  36 /* charset handle */
  37 struct pp2_charset_s {
  38     const char *(*token_next_handler)(pp2_relevance_token_t prt);
  39     /* other handlers will come as we see fit */
  40 };
  41
  42 static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt);
  43 /* in the future : */
  44 // static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt);
  45
  46 /* tokenzier handle */
  47 struct pp2_relevance_token_s {
  48     const char *cp;     /* unnormalized buffer we're tokenizing */
  49     pp2_charset_t pct;  /* our main charset handle (type+config) */
  50     WRBUF norm_str;     /* normized string we return (temporarily) */
  51 };
  52
  53 pp2_charset_t pp2_charset_create(void)
  54 {
  55     pp2_charset_t pct = xmalloc(sizeof(*pct));
  56
  57     pct->token_next_handler = pp2_relevance_token_a_to_z;
  58     return pct;
  59 }
  60
  61 void pp2_charset_destroy(pp2_charset_t pct)
  62 {
  63     xfree(pct);
  64 }
  65
  66 pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct,
  67                                              const char *buf)
  68 {
  69     pp2_relevance_token_t prt = xmalloc(sizeof(*prt));
  70
  71     assert(pct);
  72     prt->norm_str = wrbuf_alloc();
  73     prt->cp = buf;
  74     prt->pct = pct;
  75     return prt;
  76 }
  77
  78 void pp2_relevance_token_destroy(pp2_relevance_token_t prt)
  79 {
  80     assert(prt);
  81     wrbuf_destroy(prt->norm_str);
  82     xfree(prt);
  83 }
  84
  85 const char *pp2_relevance_token_next(pp2_relevance_token_t prt)
  86 {
  87     assert(prt);
  88     return (prt->pct->token_next_handler)(prt);
  89 }
  90
  91 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 1 : -1)
  92 /* original tokenizer with our tokenize interface, but we
  93    add +1 to ensure no '\0' are in our string (except for EOF)
  94 */
  95 static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt)
  96 {
  97     const char *cp = prt->cp;
  98     int c;
  99
 100     /* skip white space */
 101     while (*cp && (c = raw_char(tolower(*cp))) < 0)
 102         cp++;
 103     if (*cp == '\0')
 104     {
 105         prt->cp = cp;
 106         return 0;
 107     }
 108     /* now read the term itself */
 109     wrbuf_rewind(prt->norm_str);
 110     while (*cp && (c = raw_char(tolower(*cp))) >= 0)
 111     {
 112         wrbuf_putc(prt->norm_str, c);
 113         cp++;
 114     }
 115     prt->cp = cp;
 116     return wrbuf_cstr(prt->norm_str);
 117 }
 118
 119
 120 /*
 121  * Local variables:
 122  * c-basic-offset: 4
 123  * indent-tabs-mode: nil
 124  * End:
 125  * vim: shiftwidth=4 tabstop=8 expandtab
 126  */