Version 5.0.11
[yaz-moved-to-github.git] / src / ccl_stop_words.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file ccl_stop_words.c
7  * \brief Removes stop words from terms in RPN tree
8  */
9 #if HAVE_CONFIG_H
10 #include <config.h>
11 #endif
12
13 #include <stdio.h>
14 #include <string.h>
15 #include <yaz/ccl.h>
16 #include <yaz/nmem.h>
17
/** \brief one stop word that was taken out of a term (list element) */
struct ccl_stop_info {
    char *qualname;             /* copy of the qualifier name; 0 if none */
    char *term;                 /* NUL-terminated copy of the removed word */
    struct ccl_stop_info *next; /* following entry; 0 at the end of list */
};
23
24 struct ccl_stop_words {
25     char *blank_chars;
26     NMEM nmem; /* memory for removed items */
27     struct ccl_stop_info *removed_items;
28 };
29
30 static void append_removed_item(ccl_stop_words_t csw,
31                                 const char *qname,
32                                 const char *t, size_t len)
33 {
34     struct ccl_stop_info *csi = (struct ccl_stop_info *)
35         nmem_malloc(csw->nmem, sizeof(*csi));
36     struct ccl_stop_info **csip = &csw->removed_items;
37     if (qname)
38         csi->qualname = nmem_strdup(csw->nmem, qname);
39     else
40         csi->qualname = 0;
41
42     csi->term = (char *) nmem_malloc(csw->nmem, len+1);
43     memcpy(csi->term, t, len);
44     csi->term[len] = '\0';
45     csi->next = 0;
46
47     while (*csip)
48         csip = &(*csip)->next;
49
50     *csip = csi;
51 }
52
53 ccl_stop_words_t ccl_stop_words_create(void)
54 {
55     NMEM nmem = nmem_create();
56     ccl_stop_words_t csw = (ccl_stop_words_t) xmalloc(sizeof(*csw));
57     csw->nmem = nmem;
58     csw->removed_items = 0;
59     csw->blank_chars = xstrdup(" \r\n\t");
60     return csw;
61 }
62
63 void ccl_stop_words_destroy(ccl_stop_words_t csw)
64 {
65     if (csw)
66     {
67         nmem_destroy(csw->nmem);
68         xfree(csw->blank_chars);
69         xfree(csw);
70     }
71 }
72
73 struct ccl_rpn_node *ccl_remove_stop_r(ccl_stop_words_t csw,
74                                        CCL_bibset bibset,
75                                        struct ccl_rpn_node *p)
76 {
77     struct ccl_rpn_node *left, *right;
78     switch (p->kind)
79     {
80     case CCL_RPN_AND:
81     case CCL_RPN_OR:
82     case CCL_RPN_NOT:
83     case CCL_RPN_PROX:
84         left = ccl_remove_stop_r(csw, bibset, p->u.p[0]);
85         right = ccl_remove_stop_r(csw, bibset, p->u.p[1]);
86         if (!left || !right)
87         {
88             /* we must delete our binary node and return child (if any) */
89             p->u.p[0] = 0;
90             p->u.p[1] = 0;
91             ccl_rpn_delete(p);
92             if (left)
93                 return left;
94             else
95                 return right;
96         }
97         break;
98     case CCL_RPN_SET:
99         break;
100     case CCL_RPN_TERM:
101         if (p->u.t.term)
102         {
103             int found = 1;
104             while (found)
105             {
106                 char *cp = p->u.t.term;
107                 found = 0;
108                 while (1)
109                 {
110                     while (*cp && strchr(csw->blank_chars, *cp))
111                         cp++;
112                     if (!*cp)
113                         break;
114                     else
115                     {
116                         char *cp0 = cp;
117                         while (*cp && !strchr(csw->blank_chars, *cp))
118                             cp++;
119                         if (cp != cp0)
120                         {
121                             size_t len = cp - cp0;
122                             if (ccl_search_stop(bibset, p->u.t.qual,
123                                                 cp0, len))
124                             {
125                                 append_removed_item(csw, p->u.t.qual,
126                                                     cp0, len);
127                                 while (*cp && strchr(csw->blank_chars, *cp))
128                                     cp++;
129                                 memmove(cp0, cp, strlen(cp)+1);
130                                 found = 1;
131                                 break;
132                             }
133                         }
134                     }
135                 }
136             }
137         }
138         /* chop right blanks .. and see if term it gets empty */
139         if (p->u.t.term && csw->removed_items)
140         {
141             char *cp = p->u.t.term + strlen(p->u.t.term);
142             while (1)
143             {
144                 if (cp == p->u.t.term)
145                 {
146                     /* term is empty / blank */
147                     ccl_rpn_delete(p);
148                     return 0;
149                 }
150                 if (!strchr(csw->blank_chars, cp[-1]))
151                     break;
152                 /* chop right */
153                 cp[-1] = 0;
154                 --cp;
155             }
156         }
157         break;
158     }
159     return p;
160 }
161
162 int ccl_stop_words_tree(ccl_stop_words_t csw,
163                         CCL_bibset bibset, struct ccl_rpn_node **t)
164 {
165     struct ccl_rpn_node *r;
166
167     /* remove list items */
168     nmem_reset(csw->nmem);
169     csw->removed_items = 0;
170
171     r = ccl_remove_stop_r(csw, bibset, *t);
172     *t = r;
173     if (csw->removed_items)
174         return 1;
175     return 0;
176 }
177
178 int ccl_stop_words_info(ccl_stop_words_t csw, int idx,
179                         const char **qualname, const char **term)
180 {
181     struct ccl_stop_info *csi = csw->removed_items;
182     int i = 0;
183     while (csi && i < idx)
184     {
185         csi = csi->next;
186         i++;
187     }
188     if (csi)
189     {
190         *qualname = csi->qualname;
191         *term = csi->term;
192         return 1;
193     }
194     return 0;
195 }
196
197 /*
198  * Local variables:
199  * c-basic-offset: 4
200  * c-file-style: "Stroustrup"
201  * indent-tabs-mode: nil
202  * End:
203  * vim: shiftwidth=4 tabstop=8 expandtab
204  */
205