http_rewrite: Avoid ?<group> regex'es.
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
1 /* This file is part of Metaproxy.
2    Copyright (C) 2005-2013 Index Data
3
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19 #include "config.hpp"
20 #include <metaproxy/filter.hpp>
21 #include <metaproxy/package.hpp>
22 #include <metaproxy/util.hpp>
23 #include "filter_http_rewrite.hpp"
24
25 #include <yaz/zgdu.h>
26 #include <yaz/log.h>
27
28 #include <boost/regex.hpp>
29 #include <boost/lexical_cast.hpp>
30
31 #include <vector>
32 #include <map>
33
34 #if HAVE_SYS_TYPES_H
35 #include <sys/types.h>
36 #endif
37
38 namespace mp = metaproxy_1;
39 namespace yf = mp::filter;
40
41 namespace metaproxy_1 {
42     namespace filter {
43         class HttpRewrite::RuleScope {
44         public:
45             std::vector<std::string> tags;
46             std::vector<std::string> attrs;
47             std::string content_type;
48         };
49         class HttpRewrite::Rule {
50         public:
51             enum Section { METHOD, HEADER, BODY };
52             std::string regex;
53             std::string recipe;
54             std::map<int, std::string> group_index;
55             std::vector<RuleScope> scopes;
56             Section section;
57             const std::string search_replace(
58                 std::map<std::string, std::string> & vars,
59                 const std::string & txt) const;
60             std::string sub_vars (
61                 const std::map<std::string, std::string> & vars) const;
62             void parse_groups();
63         };
64         class HttpRewrite::Rules {
65         public:
66             std::vector<Rule> rules;
67             void rewrite_reqline (mp::odr & o, Z_HTTP_Request *hreq,
68                 std::map<std::string, std::string> & vars) const;
69             void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
70                 std::map<std::string, std::string> & vars) const;
71             void rewrite_body (mp::odr & o,
72                 char **content_buf, int *content_len,
73                 std::map<std::string, std::string> & vars) const;
74             const std::string test_patterns(
75                 std::map<std::string, std::string> & vars,
76                 const std::string & txt) const;
77         };
78     }
79 }
80
81 yf::HttpRewrite::HttpRewrite() : req_rules(new Rules), res_rules(new Rules)
82 {
83 }
84
85 yf::HttpRewrite::~HttpRewrite()
86 {
87 }
88
89 void yf::HttpRewrite::process(mp::Package & package) const
90 {
91     yaz_log(YLOG_LOG, "HttpRewrite begins....");
92     Z_GDU *gdu = package.request().get();
93     //map of request/response vars
94     std::map<std::string, std::string> vars;
95     //we have an http req
96     if (gdu && gdu->which == Z_GDU_HTTP_Request)
97     {
98         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
99         mp::odr o;
100         req_rules->rewrite_reqline(o, hreq, vars);
101         yaz_log(YLOG_LOG, ">> Request headers");
102         req_rules->rewrite_headers(o, hreq->headers, vars);
103         req_rules->rewrite_body(o,
104                 &hreq->content_buf, &hreq->content_len,
105                 vars);
106         package.request() = gdu;
107     }
108     package.move();
109     gdu = package.response().get();
110     if (gdu && gdu->which == Z_GDU_HTTP_Response)
111     {
112         Z_HTTP_Response *hres = gdu->u.HTTP_Response;
113         yaz_log(YLOG_LOG, "Response code %d", hres->code);
114         mp::odr o;
115         yaz_log(YLOG_LOG, "<< Respose headers");
116         res_rules->rewrite_headers(o, hres->headers, vars);
117         res_rules->rewrite_body(o, &hres->content_buf,
118                 &hres->content_len, vars);
119         package.response() = gdu;
120     }
121 }
122
123 void yf::HttpRewrite::Rules::rewrite_reqline (mp::odr & o,
124         Z_HTTP_Request *hreq,
125         std::map<std::string, std::string> & vars) const
126 {
127     //rewrite the request line
128     std::string path;
129     if (strstr(hreq->path, "http://") == hreq->path)
130     {
131         yaz_log(YLOG_LOG, "Path in the method line is absolute, "
132             "possibly a proxy request");
133         path += hreq->path;
134     }
135     else
136     {
137         //TODO what about proto
138         path += "http://";
139         path += z_HTTP_header_lookup(hreq->headers, "Host");
140         path += hreq->path;
141     }
142     yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
143     std::string npath =
144         test_patterns(vars, path);
145     if (!npath.empty())
146     {
147         yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
148         hreq->path = odr_strdup(o, npath.c_str());
149     }
150 }
151
152 void yf::HttpRewrite::Rules::rewrite_headers(mp::odr & o,
153         Z_HTTP_Header *headers,
154         std::map<std::string, std::string> & vars) const
155 {
156     for (Z_HTTP_Header *header = headers;
157             header != 0;
158             header = header->next)
159     {
160         std::string sheader(header->name);
161         sheader += ": ";
162         sheader += header->value;
163         yaz_log(YLOG_LOG, "%s: %s", header->name, header->value);
164         std::string out = test_patterns(vars, sheader);
165         if (!out.empty())
166         {
167             size_t pos = out.find(": ");
168             if (pos == std::string::npos)
169             {
170                 yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
171                 continue;
172             }
173             header->name = odr_strdup(o, out.substr(0, pos).c_str());
174             header->value = odr_strdup(o, out.substr(pos+2,
175                         std::string::npos).c_str());
176         }
177     }
178 }
179
180 void yf::HttpRewrite::Rules::rewrite_body (mp::odr & o,
181         char **content_buf,
182         int *content_len,
183         std::map<std::string, std::string> & vars) const
184 {
185     if (*content_buf)
186     {
187         std::string body(*content_buf);
188         std::string nbody =
189             test_patterns(vars, body);
190         if (!nbody.empty())
191         {
192             *content_buf = odr_strdup(o, nbody.c_str());
193             *content_len = nbody.size();
194         }
195     }
196 }
197
198 /**
199  * Tests pattern from the vector in order and executes recipe on
200  the first match.
201  */
202 const std::string yf::HttpRewrite::Rules::test_patterns(
203         std::map<std::string, std::string> & vars,
204         const std::string & txt) const
205 {
206     for (size_t i = 0; i < rules.size(); i++)
207     {
208         std::string out = rules[i].search_replace(vars, txt);
209         if (!out.empty()) return out;
210     }
211     return "";
212 }
213
214 const std::string yf::HttpRewrite::Rule::search_replace(
215         std::map<std::string, std::string> & vars,
216         const std::string & txt) const
217 {
218     //exec regex against value
219     boost::regex re(regex);
220     boost::smatch what;
221     std::string::const_iterator start, end;
222     start = txt.begin();
223     end = txt.end();
224     std::string out;
225     while (regex_search(start, end, what, re)) //find next full match
226     {
227         size_t i;
228         for (i = 1; i < what.size(); ++i)
229         {
230             //check if the group is named
231             std::map<int, std::string>::const_iterator it
232                 = group_index.find(i);
233             if (it != group_index.end())
234             {   //it is
235                 if (!what[i].str().empty())
236                     vars[it->second] = what[i];
237             }
238
239         }
240         //prepare replacement string
241         std::string rvalue = sub_vars(vars);
242         yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
243                 what.str(0).c_str(), rvalue.c_str());
244         out.append(start, what[0].first);
245         out.append(rvalue);
246         start = what[0].second; //move search forward
247     }
248     //if we had a match cat the last part
249     if (start != txt.begin())
250         out.append(start, end);
251     return out;
252 }
253
254 void yf::HttpRewrite::Rule::parse_groups()
255 {
256     int gnum = 0;
257     bool esc = false;
258     const std::string & str = regex;
259     std::string res;
260     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
261     for (size_t i = 0; i < str.size(); ++i)
262     {
263         res += str[i];
264         if (!esc && str[i] == '\\')
265         {
266             esc = true;
267             continue;
268         }
269         if (!esc && str[i] == '(') //group starts
270         {
271             gnum++;
272             if (i+1 < str.size() && str[i+1] == '?') //group with attrs
273             {
274                 i++;
275                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
276                 {
277                     if (gnum > 0) gnum--;
278                     res += str[i];
279                     i++;
280                     res += str[i];
281                     continue;
282                 }
283                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
284                     i++;
285                 if (i+1 < str.size() && str[i+1] == '<') //named
286                 {
287                     i++;
288                     std::string gname;
289                     bool term = false;
290                     while (++i < str.size())
291                     {
292                         if (str[i] == '>') { term = true; break; }
293                         if (!isalnum(str[i]))
294                             throw mp::filter::FilterException
295                                 ("Only alphanumeric chars allowed, found "
296                                  " in '"
297                                  + str
298                                  + "' at "
299                                  + boost::lexical_cast<std::string>(i));
300                         gname += str[i];
301                     }
302                     if (!term)
303                         throw mp::filter::FilterException
304                             ("Unterminated group name '" + gname
305                              + " in '" + str +"'");
306                     group_index[gnum] = gname;
307                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
308                             gname.c_str(), gnum);
309                 }
310             }
311         }
312         esc = false;
313     }
314     regex = res;
315 }
316
317 std::string yf::HttpRewrite::Rule::sub_vars (
318         const std::map<std::string, std::string> & vars) const
319 {
320     std::string out;
321     bool esc = false;
322     const std::string & in = recipe;
323     for (size_t i = 0; i < in.size(); ++i)
324     {
325         if (!esc && in[i] == '\\')
326         {
327             esc = true;
328             continue;
329         }
330         if (!esc && in[i] == '$') //var
331         {
332             if (i+1 < in.size() && in[i+1] == '{') //ref prefix
333             {
334                 ++i;
335                 std::string name;
336                 bool term = false;
337                 while (++i < in.size())
338                 {
339                     if (in[i] == '}') { term = true; break; }
340                     name += in[i];
341                 }
342                 if (!term) throw mp::filter::FilterException
343                     ("Unterminated var ref in '"+in+"' at "
344                      + boost::lexical_cast<std::string>(i));
345                 std::map<std::string, std::string>::const_iterator it
346                     = vars.find(name);
347                 if (it != vars.end())
348                 {
349                     out += it->second;
350                 }
351             }
352             else
353             {
354                 throw mp::filter::FilterException
355                     ("Malformed or trimmed var ref in '"
356                      +in+"' at "+boost::lexical_cast<std::string>(i));
357             }
358             continue;
359         }
360         //passthru
361         out += in[i];
362         esc = false;
363     }
364     return out;
365 }
366
367 void yf::HttpRewrite::configure_rules(const xmlNode *ptr,
368         Rules & rules)
369 {
370     for (ptr = ptr->children; ptr; ptr = ptr->next)
371     {
372         if (ptr->type != XML_ELEMENT_NODE)
373             continue;
374         else if (!strcmp((const char *) ptr->name, "rewrite"))
375         {
376             Rule rule;
377             const struct _xmlAttr *attr;
378             for (attr = ptr->properties; attr; attr = attr->next)
379             {
380                 if (!strcmp((const char *) attr->name,  "from"))
381                     rule.regex = mp::xml::get_text(attr->children);
382                 else if (!strcmp((const char *) attr->name,  "to"))
383                     rule.recipe = mp::xml::get_text(attr->children);
384                 else
385                     throw mp::filter::FilterException
386                         ("Bad attribute "
387                          + std::string((const char *) attr->name)
388                          + " in rewrite section of http_rewrite");
389             }
390             yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
391                     rule.regex.c_str(), rule.recipe.c_str());
392             rule.parse_groups();
393             if (!rule.regex.empty())
394                 rules.rules.push_back(rule);
395         }
396         else
397         {
398             throw mp::filter::FilterException
399                 ("Bad element o"
400                  + std::string((const char *) ptr->name)
401                  + " in http_rewrite1 filter");
402         }
403     }
404 }
405
406 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
407         const char *path)
408 {
409     for (ptr = ptr->children; ptr; ptr = ptr->next)
410     {
411         if (ptr->type != XML_ELEMENT_NODE)
412             continue;
413         else if (!strcmp((const char *) ptr->name, "request"))
414         {
415             configure_rules(ptr, *req_rules);
416         }
417         else if (!strcmp((const char *) ptr->name, "response"))
418         {
419             configure_rules(ptr, *res_rules);
420         }
421         else
422         {
423             throw mp::filter::FilterException
424                 ("Bad element "
425                  + std::string((const char *) ptr->name)
426                  + " in http_rewrite1 filter");
427         }
428     }
429 }
430
431 static mp::filter::Base* filter_creator()
432 {
433     return new mp::filter::HttpRewrite;
434 }
435
436 extern "C" {
437     struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
438         0,
439         "http_rewrite",
440         filter_creator
441     };
442 }
443
444
445 /*
446  * Local variables:
447  * c-basic-offset: 4
448  * c-file-style: "Stroustrup"
449  * indent-tabs-mode: nil
450  * End:
451  * vim: shiftwidth=4 tabstop=8 expandtab
452  */
453