Merge branch 'master' of ssh://git.indexdata.com/home/git/pub/yaz
[yaz-moved-to-github.git] / src / ccl_stop_words.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2010 Index Data
3  * See the file LICENSE for details.
4  */
5 /** 
6  * \file ccl_stop_words.c
7  * \brief Removes stop words from terms in RPN tree
8  */
9
10 #include <stdio.h>
11 #include <string.h>
12 #include <ctype.h>
13 #include <yaz/ccl.h>
14 #include <yaz/nmem.h>
15
/** \brief One removed stop word; items form a singly linked list */
struct ccl_stop_info {
    /** qualifier name the term was searched under (0 if none was given) */
    char *qualname;
    /** the removed word as a NUL-terminated string */
    char *term;
    /** next removed item (0 at end of list) */
    struct ccl_stop_info *next;
};
21
22 struct ccl_stop_words {
23     char *blank_chars;
24     NMEM nmem; /* memory for removed items */
25     struct ccl_stop_info *removed_items;
26 };
27     
28 static void append_removed_item(ccl_stop_words_t csw,
29                                 const char *qname,
30                                 const char *t, size_t len)
31 {
32     struct ccl_stop_info *csi = (struct ccl_stop_info *)
33         nmem_malloc(csw->nmem, sizeof(*csi));
34     struct ccl_stop_info **csip = &csw->removed_items;
35     if (qname)
36         csi->qualname = nmem_strdup(csw->nmem, qname);
37     else
38         csi->qualname = 0;
39
40     csi->term = (char *) nmem_malloc(csw->nmem, len+1);
41     memcpy(csi->term, t, len);
42     csi->term[len] = '\0';
43     csi->next = 0;
44
45     while (*csip)
46         csip = &(*csip)->next;
47     
48     *csip = csi;
49 }
50
51 ccl_stop_words_t ccl_stop_words_create(void)
52 {
53     NMEM nmem = nmem_create();
54     ccl_stop_words_t csw = (ccl_stop_words_t) xmalloc(sizeof(*csw));
55     csw->nmem = nmem;
56     csw->removed_items = 0;
57     csw->blank_chars = xstrdup(" \r\n\t");
58     return csw;
59 }
60
61 void ccl_stop_words_destroy(ccl_stop_words_t csw)
62 {
63     if (csw)
64     {
65         nmem_destroy(csw->nmem);
66         xfree(csw->blank_chars);
67         xfree(csw);
68     }
69 }
70
71 struct ccl_rpn_node *ccl_remove_stop_r(ccl_stop_words_t csw,
72                                        CCL_bibset bibset,
73                                        struct ccl_rpn_node *p)
74 {
75     struct ccl_rpn_node *left, *right;
76     switch (p->kind)
77     {
78     case CCL_RPN_AND:
79     case CCL_RPN_OR:
80     case CCL_RPN_NOT:
81     case CCL_RPN_PROX:
82         left = ccl_remove_stop_r(csw, bibset, p->u.p[0]);
83         right = ccl_remove_stop_r(csw, bibset, p->u.p[1]);
84         if (!left || !right)
85         {
86             /* we must delete our binary node and return child (if any) */
87             p->u.p[0] = 0;
88             p->u.p[1] = 0;
89             ccl_rpn_delete(p);
90             if (left)
91                 return left;
92             else
93                 return right;
94         }
95         break;
96     case CCL_RPN_SET:
97         break;
98     case CCL_RPN_TERM:
99         if (p->u.t.term)
100         {
101             int found = 1;
102             while (found)
103             {
104                 char *cp = p->u.t.term;
105                 found = 0;
106                 while (1)
107                 {
108                     while (*cp && strchr(csw->blank_chars, *cp))
109                         cp++;
110                     if (!*cp)
111                         break;
112                     else
113                     {
114                         char *cp0 = cp;
115                         while (*cp && !strchr(csw->blank_chars, *cp))
116                             cp++;
117                         if (cp != cp0)
118                         {
119                             size_t len = cp - cp0;
120                             if (ccl_search_stop(bibset, p->u.t.qual,
121                                                 cp0, len))
122                             {
123                                 append_removed_item(csw, p->u.t.qual,
124                                                     cp0, len);
125                                 while (*cp && strchr(csw->blank_chars, *cp))
126                                     cp++;
127                                 memmove(cp0, cp, strlen(cp)+1);
128                                 found = 1;
129                                 break;
130                             }
131                         }
132                     }
133                 }
134             }
135         }
136         /* chop right blanks .. and see if term it gets empty */
137         if (p->u.t.term && csw->removed_items)
138         {
139             char *cp = p->u.t.term + strlen(p->u.t.term);
140             while (1)
141             {
142                 if (cp == p->u.t.term)
143                 {
144                     /* term is empty / blank */
145                     ccl_rpn_delete(p);
146                     return 0;
147                 }
148                 if (!strchr(csw->blank_chars, cp[-1]))
149                     break;
150                 /* chop right */
151                 cp[-1] = 0;
152                 --cp;
153             }
154         }
155         break;
156     }
157     return p;
158 }
159
160 int ccl_stop_words_tree(ccl_stop_words_t csw,
161                         CCL_bibset bibset, struct ccl_rpn_node **t)
162 {
163     struct ccl_rpn_node *r;
164     
165     /* remove list items */
166     nmem_reset(csw->nmem);
167     csw->removed_items = 0;
168     
169     r = ccl_remove_stop_r(csw, bibset, *t);
170     *t = r;
171     if (csw->removed_items)
172         return 1;
173     return 0;
174 }
175
176 int ccl_stop_words_info(ccl_stop_words_t csw, int idx,
177                         const char **qualname, const char **term)
178 {
179     struct ccl_stop_info *csi = csw->removed_items;
180     int i = 0;
181     while (csi && i < idx)
182     {
183         csi = csi->next;
184         i++;
185     }
186     if (csi)
187     {
188         *qualname = csi->qualname;
189         *term = csi->term;
190         return 1;
191     }
192     return 0;
193 }
194
195 /*
196  * Local variables:
197  * c-basic-offset: 4
198  * c-file-style: "Stroustrup"
199  * indent-tabs-mode: nil
200  * End:
201  * vim: shiftwidth=4 tabstop=8 expandtab
202  */
203