Refactor a bit
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
1 /* This file is part of Metaproxy.
2    Copyright (C) 2005-2013 Index Data
3
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19 #include "config.hpp"
20 #include <metaproxy/filter.hpp>
21 #include <metaproxy/package.hpp>
22 #include <metaproxy/util.hpp>
23 #include "filter_http_rewrite.hpp"
24 #include "html_parser.hpp"
25
26 #include <yaz/zgdu.h>
27 #include <yaz/log.h>
28
29 #include <boost/regex.hpp>
30 #include <boost/lexical_cast.hpp>
31
32 #include <map>
33
34 namespace mp = metaproxy_1;
35 namespace yf = mp::filter;
36
namespace metaproxy_1 {
    namespace filter {
        // One search/replace step: a regular expression plus the recipe
        // (replacement template) applied to every match of it.
        class HttpRewrite::Replace {
        public:
            // Pattern handed to boost::regex in search_replace();
            // parse_groups() rewrites it in place.
            std::string regex;
            // Replacement template; ${name} references are expanded by
            // sub_vars().
            std::string recipe;
            // Capture group number -> variable name, filled by
            // parse_groups().
            std::map<int, std::string> group_index;
            // Replaces every match of regex in txt with the expanded
            // recipe, storing non-empty named captures in vars. Returns
            // the rewritten text, or an empty string when nothing matched.
            const std::string search_replace(
                std::map<std::string, std::string> & vars,
                const std::string & txt) const;
            // Expands the recipe using the given variables.
            std::string sub_vars (
                const std::map<std::string, std::string> & vars) const;
            // Extracts named groups from regex into group_index.
            void parse_groups();
        };

        // Ordered collection of replacements.
        class HttpRewrite::Rule {
        public:
            std::list<Replace> replace_list;
            // Returns the output of the first replacement that yields a
            // non-empty result for txt, or an empty string if none does.
            const std::string test_patterns(
                std::map<std::string, std::string> & vars,
                const std::string & txt) const;
        };
        // Binds a rule to the values of the "header", "attr" and "tag"
        // attributes of a <within> configuration element.
        class HttpRewrite::Within {
        public:
            std::string header;
            std::string attr;
            std::string tag;
            RulePtr rule;
        };

        // Rewrite configuration for one direction (request or response).
        class HttpRewrite::Phase {
        public:
            std::list<Within> within_list;
            // Rewrites the request line (path) of hreq.
            void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
                std::map<std::string, std::string> & vars) const;
            // Rewrites each header of the linked list starting at headers.
            void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
                std::map<std::string, std::string> & vars) const;
            // Rewrites the message body in place, updating buffer and
            // length.
            void rewrite_body(mp::odr & o,
                char **content_buf, int *content_len,
                std::map<std::string, std::string> & vars) const;
        };
        // HTML parser callbacks; the definitions in this file are all
        // empty.
        class HttpRewrite::Event : public HTMLParserEvent {
        public:
            void openTagStart(const char *name);
            void anyTagEnd(const char *name);
            void attribute(const char *tagName, 
                           const char *name, 
                           const char *value,
                           int val_len);
            void closeTag(const char *name);
            void text(const char *value, int len);
        };
    }
}
91
// Creates the filter with an empty Phase for requests and one for
// responses; they are populated later by configure().
// NOTE(review): req_phase/res_phase are presumably owning smart pointers
// declared in filter_http_rewrite.hpp - confirm.
yf::HttpRewrite::HttpRewrite() :
    req_phase(new Phase), res_phase(new Phase)
{
}
96
// Nothing to release explicitly here; the body is intentionally empty.
yf::HttpRewrite::~HttpRewrite()
{
}
100
101 void yf::HttpRewrite::process(mp::Package & package) const
102 {
103     yaz_log(YLOG_LOG, "HttpRewrite begins....");
104     Z_GDU *gdu = package.request().get();
105     //map of request/response vars
106     std::map<std::string, std::string> vars;
107     //we have an http req
108     if (gdu && gdu->which == Z_GDU_HTTP_Request)
109     {
110         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
111         mp::odr o;
112         req_phase->rewrite_reqline(o, hreq, vars);
113         yaz_log(YLOG_LOG, ">> Request headers");
114         req_phase->rewrite_headers(o, hreq->headers, vars);
115         req_phase->rewrite_body(o,
116                 &hreq->content_buf, &hreq->content_len, vars);
117         package.request() = gdu;
118     }
119     package.move();
120     gdu = package.response().get();
121     if (gdu && gdu->which == Z_GDU_HTTP_Response)
122     {
123         Z_HTTP_Response *hres = gdu->u.HTTP_Response;
124         yaz_log(YLOG_LOG, "Response code %d", hres->code);
125         mp::odr o;
126         yaz_log(YLOG_LOG, "<< Respose headers");
127         res_phase->rewrite_headers(o, hres->headers, vars);
128         res_phase->rewrite_body(o, &hres->content_buf,
129                 &hres->content_len, vars);
130         package.response() = gdu;
131     }
132 }
133
134 void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
135         Z_HTTP_Request *hreq,
136         std::map<std::string, std::string> & vars) const
137 {
138     //rewrite the request line
139     std::string path;
140     if (strstr(hreq->path, "http://") == hreq->path)
141     {
142         yaz_log(YLOG_LOG, "Path in the method line is absolute, "
143             "possibly a proxy request");
144         path += hreq->path;
145     }
146     else
147     {
148         //TODO what about proto
149         path += "http://";
150         path += z_HTTP_header_lookup(hreq->headers, "Host");
151         path += hreq->path;
152     }
153
154     std::list<Within>::const_iterator it = within_list.begin();
155     if (it != within_list.end())
156     {
157         RulePtr rule = it->rule;
158
159         yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
160         std::string npath = rule->test_patterns(vars, path);
161         if (!npath.empty())
162         {
163             yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
164             hreq->path = odr_strdup(o, npath.c_str());
165         }
166     }
167 }
168
169 void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
170         Z_HTTP_Header *headers,
171         std::map<std::string, std::string> & vars) const
172 {
173     for (Z_HTTP_Header *header = headers;
174             header != 0;
175             header = header->next)
176     {
177         std::string sheader(header->name);
178         sheader += ": ";
179         sheader += header->value;
180         yaz_log(YLOG_LOG, "%s: %s", header->name, header->value);
181
182         std::list<Within>::const_iterator it = within_list.begin();
183         if (it == within_list.end())
184             continue;
185         RulePtr rule = it->rule;
186
187         std::string out = rule->test_patterns(vars, sheader);
188         if (!out.empty())
189         {
190             size_t pos = out.find(": ");
191             if (pos == std::string::npos)
192             {
193                 yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
194                 continue;
195             }
196             header->name = odr_strdup(o, out.substr(0, pos).c_str());
197             header->value = odr_strdup(o, out.substr(pos+2,
198                                                      std::string::npos).c_str());
199         }
200     }
201 }
202
203 void yf::HttpRewrite::Phase::rewrite_body(mp::odr & o,
204         char **content_buf,
205         int *content_len,
206         std::map<std::string, std::string> & vars) const
207 {
208     if (*content_buf)
209     {
210         HTMLParser parser;
211         Event ev;
212         std::string buf(*content_buf, *content_len);
213
214         parser.parse(ev, buf.c_str());
215         std::list<Within>::const_iterator it = within_list.begin();
216         if (it != within_list.end())
217         {
218             RulePtr rule = it->rule;
219
220             std::string body(*content_buf);
221             std::string nbody = rule->test_patterns(vars, body);
222             if (!nbody.empty())
223             {
224                 *content_buf = odr_strdup(o, nbody.c_str());
225                 *content_len = nbody.size();
226             }
227         }
228     }
229 }
230
231
// Handler for the start of an opening tag; currently does nothing.
void yf::HttpRewrite::Event::openTagStart(const char *name)
{
}
235
// Handler for the end of a tag; currently does nothing.
void yf::HttpRewrite::Event::anyTagEnd(const char *name)
{
}
239
// Handler for a tag attribute (tag name, attribute name, value and value
// length); currently does nothing.
void yf::HttpRewrite::Event::attribute(const char *tagName,
                                         const char *name,
                                         const char *value,
                                         int val_len)
{
}
246
247
// Handler for a closing tag; currently does nothing.
void yf::HttpRewrite::Event::closeTag(const char *name)
{
}
251
// Handler for text content (pointer and length); currently does nothing.
void yf::HttpRewrite::Event::text(const char *value, int len)
{
}
255
256
257 /**
258  * Tests pattern from the vector in order and executes recipe on
259  the first match.
260  */
261 const std::string yf::HttpRewrite::Rule::test_patterns(
262         std::map<std::string, std::string> & vars,
263         const std::string & txt) const
264 {
265     std::list<Replace>::const_iterator it = replace_list.begin();
266
267     for (; it != replace_list.end(); it++)
268     {
269         std::string out = it->search_replace(vars, txt);
270         if (!out.empty()) return out;
271     }
272     return "";
273 }
274
275 const std::string yf::HttpRewrite::Replace::search_replace(
276         std::map<std::string, std::string> & vars,
277         const std::string & txt) const
278 {
279     //exec regex against value
280     boost::regex re(regex);
281     boost::smatch what;
282     std::string::const_iterator start, end;
283     start = txt.begin();
284     end = txt.end();
285     std::string out;
286     while (regex_search(start, end, what, re)) //find next full match
287     {
288         size_t i;
289         for (i = 1; i < what.size(); ++i)
290         {
291             //check if the group is named
292             std::map<int, std::string>::const_iterator it
293                 = group_index.find(i);
294             if (it != group_index.end())
295             {   //it is
296                 if (!what[i].str().empty())
297                     vars[it->second] = what[i];
298             }
299
300         }
301         //prepare replacement string
302         std::string rvalue = sub_vars(vars);
303         yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
304                 what.str(0).c_str(), rvalue.c_str());
305         out.append(start, what[0].first);
306         out.append(rvalue);
307         start = what[0].second; //move search forward
308     }
309     //if we had a match cat the last part
310     if (start != txt.begin())
311         out.append(start, end);
312     return out;
313 }
314
315 void yf::HttpRewrite::Replace::parse_groups()
316 {
317     int gnum = 0;
318     bool esc = false;
319     const std::string & str = regex;
320     std::string res;
321     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
322     for (size_t i = 0; i < str.size(); ++i)
323     {
324         res += str[i];
325         if (!esc && str[i] == '\\')
326         {
327             esc = true;
328             continue;
329         }
330         if (!esc && str[i] == '(') //group starts
331         {
332             gnum++;
333             if (i+1 < str.size() && str[i+1] == '?') //group with attrs
334             {
335                 i++;
336                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
337                 {
338                     if (gnum > 0) gnum--;
339                     res += str[i];
340                     i++;
341                     res += str[i];
342                     continue;
343                 }
344                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
345                     i++;
346                 if (i+1 < str.size() && str[i+1] == '<') //named
347                 {
348                     i++;
349                     std::string gname;
350                     bool term = false;
351                     while (++i < str.size())
352                     {
353                         if (str[i] == '>') { term = true; break; }
354                         if (!isalnum(str[i]))
355                             throw mp::filter::FilterException
356                                 ("Only alphanumeric chars allowed, found "
357                                  " in '"
358                                  + str
359                                  + "' at "
360                                  + boost::lexical_cast<std::string>(i));
361                         gname += str[i];
362                     }
363                     if (!term)
364                         throw mp::filter::FilterException
365                             ("Unterminated group name '" + gname
366                              + " in '" + str +"'");
367                     group_index[gnum] = gname;
368                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
369                             gname.c_str(), gnum);
370                 }
371             }
372         }
373         esc = false;
374     }
375     regex = res;
376 }
377
378 std::string yf::HttpRewrite::Replace::sub_vars (
379         const std::map<std::string, std::string> & vars) const
380 {
381     std::string out;
382     bool esc = false;
383     const std::string & in = recipe;
384     for (size_t i = 0; i < in.size(); ++i)
385     {
386         if (!esc && in[i] == '\\')
387         {
388             esc = true;
389             continue;
390         }
391         if (!esc && in[i] == '$') //var
392         {
393             if (i+1 < in.size() && in[i+1] == '{') //ref prefix
394             {
395                 ++i;
396                 std::string name;
397                 bool term = false;
398                 while (++i < in.size())
399                 {
400                     if (in[i] == '}') { term = true; break; }
401                     name += in[i];
402                 }
403                 if (!term) throw mp::filter::FilterException
404                     ("Unterminated var ref in '"+in+"' at "
405                      + boost::lexical_cast<std::string>(i));
406                 std::map<std::string, std::string>::const_iterator it
407                     = vars.find(name);
408                 if (it != vars.end())
409                 {
410                     out += it->second;
411                 }
412             }
413             else
414             {
415                 throw mp::filter::FilterException
416                     ("Malformed or trimmed var ref in '"
417                      +in+"' at "+boost::lexical_cast<std::string>(i));
418             }
419             continue;
420         }
421         //passthru
422         out += in[i];
423         esc = false;
424     }
425     return out;
426 }
427
428
429 void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
430 {
431     std::map<std::string, RulePtr > rules;
432     for (ptr = ptr->children; ptr; ptr = ptr->next)
433     {
434         if (ptr->type != XML_ELEMENT_NODE)
435             continue;
436         else if (!strcmp((const char *) ptr->name, "rule"))
437         {
438             static const char *names[2] = { "name", 0 };
439             std::string values[1];
440             values[0] = "default";
441             mp::xml::parse_attr(ptr, names, values);
442
443             RulePtr rule(new Rule);
444             for (xmlNode *p = ptr->children; p; p = p->next)
445             {
446                 if (p->type != XML_ELEMENT_NODE)
447                     continue;
448                 if (!strcmp((const char *) p->name, "rewrite"))
449                 {
450                     Replace replace;
451                     const struct _xmlAttr *attr;
452                     for (attr = p->properties; attr; attr = attr->next)
453                     {
454                         if (!strcmp((const char *) attr->name,  "from"))
455                             replace.regex = mp::xml::get_text(attr->children);
456                         else if (!strcmp((const char *) attr->name,  "to"))
457                             replace.recipe = mp::xml::get_text(attr->children);
458                         else
459                             throw mp::filter::FilterException
460                                 ("Bad attribute "
461                                  + std::string((const char *) attr->name)
462                                  + " in rewrite section of http_rewrite");
463                     }
464                     yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
465                             replace.regex.c_str(), replace.recipe.c_str());
466                     replace.parse_groups();
467                     if (!replace.regex.empty())
468                         rule->replace_list.push_back(replace);
469                 }
470                 else
471                     throw mp::filter::FilterException
472                         ("Bad element "
473                          + std::string((const char *) p->name)
474                          + " in http_rewrite filter");
475             }
476             if (!rule->replace_list.empty())
477                 rules[values[0]] = rule;
478         }
479         else if (!strcmp((const char *) ptr->name, "within"))
480         {
481             static const char *names[5] =
482                 { "header", "attr", "tag", "rule", 0 };
483             std::string values[4];
484             mp::xml::parse_attr(ptr, names, values);
485             Within w;
486             w.header = values[0];
487             w.attr = values[1];
488             w.tag = values[2];
489             std::map<std::string,RulePtr>::const_iterator it =
490                 rules.find(values[3]);
491             if (it == rules.end())
492                 throw mp::filter::FilterException
493                     ("Reference to non-existing rule '" + values[3] +
494                      "' in http_rewrite filter");
495             w.rule = it->second;
496             phase.within_list.push_back(w);
497         }
498         else
499         {
500             throw mp::filter::FilterException
501                 ("Bad element "
502                  + std::string((const char *) ptr->name)
503                  + " in http_rewrite filter");
504         }
505     }
506 }
507
508 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
509         const char *path)
510 {
511     for (ptr = ptr->children; ptr; ptr = ptr->next)
512     {
513         if (ptr->type != XML_ELEMENT_NODE)
514             continue;
515         else if (!strcmp((const char *) ptr->name, "request"))
516         {
517             configure_phase(ptr, *req_phase);
518         }
519         else if (!strcmp((const char *) ptr->name, "response"))
520         {
521             configure_phase(ptr, *res_phase);
522         }
523         else
524         {
525             throw mp::filter::FilterException
526                 ("Bad element "
527                  + std::string((const char *) ptr->name)
528                  + " in http_rewrite1 filter");
529         }
530     }
531 }
532
// Factory referenced by the registration struct below: returns a newly
// allocated HttpRewrite filter.
// NOTE(review): the caller presumably takes ownership - confirm.
static mp::filter::Base* filter_creator()
{
    return new mp::filter::HttpRewrite;
}
537
// Registration record with C linkage: associates the filter name
// "http_rewrite" with its factory function.
// NOTE(review): the leading 0 is presumably a version field - confirm
// against the metaproxy_1_filter_struct declaration.
extern "C" {
    struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
        0,
        "http_rewrite",
        filter_creator
    };
}
545
546
547 /*
548  * Local variables:
549  * c-basic-offset: 4
550  * c-file-style: "Stroustrup"
551  * indent-tabs-mode: nil
552  * End:
553  * vim: shiftwidth=4 tabstop=8 expandtab
554  */
555