Oops... memory leak
[yaz-moved-to-github.git] / src / ccl_stop_words.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2010 Index Data
3  * See the file LICENSE for details.
4  */
5 /** 
6  * \file ccl_stop_words.c
7  * \brief Removes stop words from terms in RPN tree
8  */
9 #if HAVE_CONFIG_H
10 #include <config.h>
11 #endif
12
13 #include <stdio.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <yaz/ccl.h>
17 #include <yaz/nmem.h>
18
/** \brief One removed stop word; element of a singly linked list */
struct ccl_stop_info {
    char *qualname;             /**< qualifier name of the term, or NULL */
    char *term;                 /**< the removed word (NUL-terminated) */
    struct ccl_stop_info *next; /**< following item, NULL for the last one */
};
24
25 struct ccl_stop_words {
26     char *blank_chars;
27     NMEM nmem; /* memory for removed items */
28     struct ccl_stop_info *removed_items;
29 };
30     
31 static void append_removed_item(ccl_stop_words_t csw,
32                                 const char *qname,
33                                 const char *t, size_t len)
34 {
35     struct ccl_stop_info *csi = (struct ccl_stop_info *)
36         nmem_malloc(csw->nmem, sizeof(*csi));
37     struct ccl_stop_info **csip = &csw->removed_items;
38     if (qname)
39         csi->qualname = nmem_strdup(csw->nmem, qname);
40     else
41         csi->qualname = 0;
42
43     csi->term = (char *) nmem_malloc(csw->nmem, len+1);
44     memcpy(csi->term, t, len);
45     csi->term[len] = '\0';
46     csi->next = 0;
47
48     while (*csip)
49         csip = &(*csip)->next;
50     
51     *csip = csi;
52 }
53
54 ccl_stop_words_t ccl_stop_words_create(void)
55 {
56     NMEM nmem = nmem_create();
57     ccl_stop_words_t csw = (ccl_stop_words_t) xmalloc(sizeof(*csw));
58     csw->nmem = nmem;
59     csw->removed_items = 0;
60     csw->blank_chars = xstrdup(" \r\n\t");
61     return csw;
62 }
63
64 void ccl_stop_words_destroy(ccl_stop_words_t csw)
65 {
66     if (csw)
67     {
68         nmem_destroy(csw->nmem);
69         xfree(csw->blank_chars);
70         xfree(csw);
71     }
72 }
73
74 struct ccl_rpn_node *ccl_remove_stop_r(ccl_stop_words_t csw,
75                                        CCL_bibset bibset,
76                                        struct ccl_rpn_node *p)
77 {
78     struct ccl_rpn_node *left, *right;
79     switch (p->kind)
80     {
81     case CCL_RPN_AND:
82     case CCL_RPN_OR:
83     case CCL_RPN_NOT:
84     case CCL_RPN_PROX:
85         left = ccl_remove_stop_r(csw, bibset, p->u.p[0]);
86         right = ccl_remove_stop_r(csw, bibset, p->u.p[1]);
87         if (!left || !right)
88         {
89             /* we must delete our binary node and return child (if any) */
90             p->u.p[0] = 0;
91             p->u.p[1] = 0;
92             ccl_rpn_delete(p);
93             if (left)
94                 return left;
95             else
96                 return right;
97         }
98         break;
99     case CCL_RPN_SET:
100         break;
101     case CCL_RPN_TERM:
102         if (p->u.t.term)
103         {
104             int found = 1;
105             while (found)
106             {
107                 char *cp = p->u.t.term;
108                 found = 0;
109                 while (1)
110                 {
111                     while (*cp && strchr(csw->blank_chars, *cp))
112                         cp++;
113                     if (!*cp)
114                         break;
115                     else
116                     {
117                         char *cp0 = cp;
118                         while (*cp && !strchr(csw->blank_chars, *cp))
119                             cp++;
120                         if (cp != cp0)
121                         {
122                             size_t len = cp - cp0;
123                             if (ccl_search_stop(bibset, p->u.t.qual,
124                                                 cp0, len))
125                             {
126                                 append_removed_item(csw, p->u.t.qual,
127                                                     cp0, len);
128                                 while (*cp && strchr(csw->blank_chars, *cp))
129                                     cp++;
130                                 memmove(cp0, cp, strlen(cp)+1);
131                                 found = 1;
132                                 break;
133                             }
134                         }
135                     }
136                 }
137             }
138         }
139         /* chop right blanks .. and see if term it gets empty */
140         if (p->u.t.term && csw->removed_items)
141         {
142             char *cp = p->u.t.term + strlen(p->u.t.term);
143             while (1)
144             {
145                 if (cp == p->u.t.term)
146                 {
147                     /* term is empty / blank */
148                     ccl_rpn_delete(p);
149                     return 0;
150                 }
151                 if (!strchr(csw->blank_chars, cp[-1]))
152                     break;
153                 /* chop right */
154                 cp[-1] = 0;
155                 --cp;
156             }
157         }
158         break;
159     }
160     return p;
161 }
162
163 int ccl_stop_words_tree(ccl_stop_words_t csw,
164                         CCL_bibset bibset, struct ccl_rpn_node **t)
165 {
166     struct ccl_rpn_node *r;
167     
168     /* remove list items */
169     nmem_reset(csw->nmem);
170     csw->removed_items = 0;
171     
172     r = ccl_remove_stop_r(csw, bibset, *t);
173     *t = r;
174     if (csw->removed_items)
175         return 1;
176     return 0;
177 }
178
179 int ccl_stop_words_info(ccl_stop_words_t csw, int idx,
180                         const char **qualname, const char **term)
181 {
182     struct ccl_stop_info *csi = csw->removed_items;
183     int i = 0;
184     while (csi && i < idx)
185     {
186         csi = csi->next;
187         i++;
188     }
189     if (csi)
190     {
191         *qualname = csi->qualname;
192         *term = csi->term;
193         return 1;
194     }
195     return 0;
196 }
197
198 /*
199  * Local variables:
200  * c-basic-offset: 4
201  * c-file-style: "Stroustrup"
202  * indent-tabs-mode: nil
203  * End:
204  * vim: shiftwidth=4 tabstop=8 expandtab
205  */
206