http_rewrite using HTML parser
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
1 /* This file is part of Metaproxy.
2    Copyright (C) 2005-2013 Index Data
3
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19 #include "config.hpp"
20 #include <metaproxy/filter.hpp>
21 #include <metaproxy/package.hpp>
22 #include <metaproxy/util.hpp>
23 #include "filter_http_rewrite.hpp"
24 #include "html_parser.hpp"
25
26 #include <yaz/zgdu.h>
27 #include <yaz/log.h>
28
29 #include <boost/regex.hpp>
30 #include <boost/lexical_cast.hpp>
31 #include <boost/algorithm/string.hpp>
32
33 #include <map>
34
35 namespace mp = metaproxy_1;
36 namespace yf = mp::filter;
37
38 namespace metaproxy_1 {
39     namespace filter {
40         class HttpRewrite::Replace {
41         public:
42             std::string regex;
43             std::string recipe;
44             std::map<int, std::string> group_index;
45             const std::string search_replace(
46                 std::map<std::string, std::string> & vars,
47                 const std::string & txt) const;
48             std::string sub_vars (
49                 const std::map<std::string, std::string> & vars) const;
50             void parse_groups();
51         };
52
53         class HttpRewrite::Rule {
54         public:
55             std::list<Replace> replace_list;
56             const std::string test_patterns(
57                 std::map<std::string, std::string> & vars,
58                 const std::string & txt) const;
59         };
60         class HttpRewrite::Within {
61         public:
62             std::string header;
63             std::string attr;
64             std::string tag;
65             bool reqline;
66             RulePtr rule;
67         };
68
69         class HttpRewrite::Phase {
70         public:
71             std::list<Within> within_list;
72             void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
73                 std::map<std::string, std::string> & vars) const;
74             void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
75                 std::map<std::string, std::string> & vars) const;
76             void rewrite_body(mp::odr & o,
77                 char **content_buf, int *content_len,
78                 std::map<std::string, std::string> & vars) const;
79         };
80         class HttpRewrite::Event : public HTMLParserEvent {
81             void openTagStart(const char *name);
82             void anyTagEnd(const char *name, int close_it);
83             void attribute(const char *tagName, 
84                            const char *name, 
85                            const char *value,
86                            int val_len);
87             void closeTag(const char *name);
88             void text(const char *value, int len);
89             const Phase *m_phase;
90             WRBUF m_w;
91             std::list<Within>::const_iterator enabled_within;
92             std::map<std::string, std::string> &m_vars;
93         public:
94             Event(const Phase *p, std::map<std::string, std::string> &vars);
95             ~Event();
96             const char *result();
97         };
98     }
99 }
100
101 yf::HttpRewrite::HttpRewrite() :
102     req_phase(new Phase), res_phase(new Phase)
103 {
104 }
105
106 yf::HttpRewrite::~HttpRewrite()
107 {
108 }
109
110 void yf::HttpRewrite::process(mp::Package & package) const
111 {
112     yaz_log(YLOG_LOG, "HttpRewrite begins....");
113     Z_GDU *gdu = package.request().get();
114     //map of request/response vars
115     std::map<std::string, std::string> vars;
116     //we have an http req
117     if (gdu && gdu->which == Z_GDU_HTTP_Request)
118     {
119         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
120         mp::odr o;
121         req_phase->rewrite_reqline(o, hreq, vars);
122         yaz_log(YLOG_LOG, ">> Request headers");
123         req_phase->rewrite_headers(o, hreq->headers, vars);
124         req_phase->rewrite_body(o,
125                 &hreq->content_buf, &hreq->content_len, vars);
126         package.request() = gdu;
127     }
128     package.move();
129     gdu = package.response().get();
130     if (gdu && gdu->which == Z_GDU_HTTP_Response)
131     {
132         Z_HTTP_Response *hres = gdu->u.HTTP_Response;
133         yaz_log(YLOG_LOG, "Response code %d", hres->code);
134         mp::odr o;
135         yaz_log(YLOG_LOG, "<< Respose headers");
136         res_phase->rewrite_headers(o, hres->headers, vars);
137         res_phase->rewrite_body(o, &hres->content_buf,
138                 &hres->content_len, vars);
139         package.response() = gdu;
140     }
141 }
142
143 void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
144         Z_HTTP_Request *hreq,
145         std::map<std::string, std::string> & vars) const
146 {
147     //rewrite the request line
148     std::string path;
149     if (strstr(hreq->path, "http://") == hreq->path)
150     {
151         yaz_log(YLOG_LOG, "Path in the method line is absolute, "
152             "possibly a proxy request");
153         path += hreq->path;
154     }
155     else
156     {
157         //TODO what about proto
158         path += "http://";
159         path += z_HTTP_header_lookup(hreq->headers, "Host");
160         path += hreq->path;
161     }
162
163
164     std::list<Within>::const_iterator it = within_list.begin();
165     for (; it != within_list.end(); it++)
166         if (it->reqline)
167         {
168             RulePtr rule = it->rule;
169             yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
170             std::string npath = rule->test_patterns(vars, path);
171             if (!npath.empty())
172             {
173                 yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
174                 hreq->path = odr_strdup(o, npath.c_str());
175             }
176         }
177 }
178
179 void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
180         Z_HTTP_Header *headers,
181         std::map<std::string, std::string> & vars) const
182 {
183     for (Z_HTTP_Header *header = headers; header; header = header->next)
184     {
185         std::list<Within>::const_iterator it = within_list.begin();
186         for (; it != within_list.end(); it++)
187         {
188             if (it->header.length() > 0 &&
189                 yaz_strcasecmp(it->header.c_str(), header->name) == 0)
190             {
191                 std::string sheader(header->name);
192                 sheader += ": ";
193                 sheader += header->value;
194
195                 RulePtr rule = it->rule;
196                 std::string out = rule->test_patterns(vars, sheader);
197                 if (!out.empty())
198                 {
199                     size_t pos = out.find(": ");
200                     if (pos == std::string::npos)
201                     {
202                         yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
203                         continue;
204                     }
205                     header->name = odr_strdup(o, out.substr(0, pos).c_str());
206                     header->value = odr_strdup(o,
207                                                out.substr(pos + 2,
208                                                           std::string::npos).c_str());
209                 }
210             }
211         }
212     }
213 }
214
215 void yf::HttpRewrite::Phase::rewrite_body(mp::odr & o,
216         char **content_buf,
217         int *content_len,
218         std::map<std::string, std::string> & vars) const
219 {
220     if (*content_buf)
221     {
222         int i;
223         for (i = 0; i < *content_len; i++)
224             if ((*content_buf)[i] == 0)
225                 return;  // binary content. skip
226
227         HTMLParser parser;
228         Event ev(this, vars);
229         std::string buf(*content_buf, *content_len);
230
231         parser.parse(ev, buf.c_str());
232         const char *res = ev.result();
233         *content_buf = odr_strdup(o, res);
234         *content_len = strlen(res);
235     }
236 }
237
238 yf::HttpRewrite::Event::Event(const Phase *p,
239                               std::map<std::string, std::string> & vars
240     ) : m_phase(p), m_vars(vars)
241 {
242     m_w = wrbuf_alloc();
243     enabled_within = m_phase->within_list.end();
244 }
245
246 yf::HttpRewrite::Event::~Event()
247 {
248     wrbuf_destroy(m_w);
249 }
250
251 const char *yf::HttpRewrite::Event::result()
252 {
253     return wrbuf_cstr(m_w);
254 }
255
256 void yf::HttpRewrite::Event::openTagStart(const char *name)
257 {
258     // check if there is <within tag="x" .. />
259     if (enabled_within == m_phase->within_list.end())
260     {
261         std::list<Within>::const_iterator it =
262             m_phase->within_list.begin();
263         for (; it != m_phase->within_list.end(); it++)
264         {
265             if (it->tag.length() > 0 && it->tag.compare(name) == 0)
266             {
267                 enabled_within = it;
268             }
269         }
270     }
271     wrbuf_putc(m_w, '<');
272     wrbuf_puts(m_w, name);
273 }
274
275 void yf::HttpRewrite::Event::anyTagEnd(const char *name, int close_it)
276 {
277     if (close_it)
278     {
279         std::list<Within>::const_iterator it = enabled_within;
280         if (it != m_phase->within_list.end())
281         {
282             if (it->tag.compare(name) == 0)
283             {
284                 enabled_within = m_phase->within_list.end();
285             }
286         }
287     }
288     if (close_it)
289         wrbuf_putc(m_w, '/');
290     wrbuf_putc(m_w, '>');
291 }
292
293 void yf::HttpRewrite::Event::attribute(const char *tagName,
294                                          const char *name,
295                                          const char *value,
296                                          int val_len)
297 {
298     std::list<Within>::const_iterator it = m_phase->within_list.begin();
299     bool subst = false;
300
301     for (; it != m_phase->within_list.end(); it++)
302     {
303         if (it->tag.length() == 0 || it->tag.compare(tagName) == 0)
304         {
305             std::vector<std::string> attr;
306             boost::split(attr, it->attr, boost::is_any_of(","));
307             size_t i;
308             for (i = 0; i < attr.size(); i++)
309             {
310                 if (attr[i].compare("#text") && attr[i].compare(name) == 0)
311                     subst = true;
312             }
313         }
314         if (subst)
315             break;
316     }
317
318     wrbuf_putc(m_w, ' ');
319     wrbuf_puts(m_w, name);
320     wrbuf_puts(m_w, "=\"");
321
322     std::string output;
323     if (subst)
324     {
325         std::string input(value, val_len);
326         output = it->rule->test_patterns(m_vars, input);
327     }
328     if (output.empty())
329         wrbuf_write(m_w, value, val_len);
330     else
331         wrbuf_puts(m_w, output.c_str());
332     wrbuf_puts(m_w, "\"");
333 }
334
335 void yf::HttpRewrite::Event::closeTag(const char *name)
336 {
337     std::list<Within>::const_iterator it = enabled_within;
338     if (it != m_phase->within_list.end())
339     {
340         if (it->tag.compare(name) == 0)
341         {
342             enabled_within = m_phase->within_list.end();
343         }
344     }
345     wrbuf_puts(m_w, "</");
346     wrbuf_puts(m_w, name);
347 }
348
349 void yf::HttpRewrite::Event::text(const char *value, int len)
350 {
351     std::list<Within>::const_iterator it = enabled_within;
352     bool subst = false;
353
354     if (it != m_phase->within_list.end())
355     {
356         subst = true;
357         if (it->attr.length() > 0)
358         {
359             subst = false;
360             std::vector<std::string> attr;
361             boost::split(attr, it->attr, boost::is_any_of(","));
362             size_t i;
363             for (i = 0; i < attr.size(); i++)
364             {
365                 if (attr[i].compare("#text") == 0)
366                 {
367                     subst = true;
368                 }
369             }
370         }
371     }
372     std::string output;
373     if (subst)
374     {
375         std::string input(value, len);
376         output = it->rule->test_patterns(m_vars, input);
377     }
378     if (output.empty())
379         wrbuf_write(m_w, value, len);
380     else
381         wrbuf_puts(m_w, output.c_str());
382 }
383
384
385 /**
386  * Tests pattern from the vector in order and executes recipe on
387  the first match.
388  */
389 const std::string yf::HttpRewrite::Rule::test_patterns(
390         std::map<std::string, std::string> & vars,
391         const std::string & txt) const
392 {
393     std::list<Replace>::const_iterator it = replace_list.begin();
394
395     for (; it != replace_list.end(); it++)
396     {
397         std::string out = it->search_replace(vars, txt);
398         if (!out.empty()) return out;
399     }
400     return "";
401 }
402
403 const std::string yf::HttpRewrite::Replace::search_replace(
404         std::map<std::string, std::string> & vars,
405         const std::string & txt) const
406 {
407     //exec regex against value
408     boost::regex re(regex);
409     boost::smatch what;
410     std::string::const_iterator start, end;
411     start = txt.begin();
412     end = txt.end();
413     std::string out;
414     while (regex_search(start, end, what, re)) //find next full match
415     {
416         size_t i;
417         for (i = 1; i < what.size(); ++i)
418         {
419             //check if the group is named
420             std::map<int, std::string>::const_iterator it
421                 = group_index.find(i);
422             if (it != group_index.end())
423             {   //it is
424                 if (!what[i].str().empty())
425                     vars[it->second] = what[i];
426             }
427
428         }
429         //prepare replacement string
430         std::string rvalue = sub_vars(vars);
431         yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
432                 what.str(0).c_str(), rvalue.c_str());
433         out.append(start, what[0].first);
434         out.append(rvalue);
435         start = what[0].second; //move search forward
436     }
437     //if we had a match cat the last part
438     if (start != txt.begin())
439         out.append(start, end);
440     return out;
441 }
442
443 void yf::HttpRewrite::Replace::parse_groups()
444 {
445     int gnum = 0;
446     bool esc = false;
447     const std::string & str = regex;
448     std::string res;
449     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
450     for (size_t i = 0; i < str.size(); ++i)
451     {
452         res += str[i];
453         if (!esc && str[i] == '\\')
454         {
455             esc = true;
456             continue;
457         }
458         if (!esc && str[i] == '(') //group starts
459         {
460             gnum++;
461             if (i+1 < str.size() && str[i+1] == '?') //group with attrs
462             {
463                 i++;
464                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
465                 {
466                     if (gnum > 0) gnum--;
467                     res += str[i];
468                     i++;
469                     res += str[i];
470                     continue;
471                 }
472                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
473                     i++;
474                 if (i+1 < str.size() && str[i+1] == '<') //named
475                 {
476                     i++;
477                     std::string gname;
478                     bool term = false;
479                     while (++i < str.size())
480                     {
481                         if (str[i] == '>') { term = true; break; }
482                         if (!isalnum(str[i]))
483                             throw mp::filter::FilterException
484                                 ("Only alphanumeric chars allowed, found "
485                                  " in '"
486                                  + str
487                                  + "' at "
488                                  + boost::lexical_cast<std::string>(i));
489                         gname += str[i];
490                     }
491                     if (!term)
492                         throw mp::filter::FilterException
493                             ("Unterminated group name '" + gname
494                              + " in '" + str +"'");
495                     group_index[gnum] = gname;
496                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
497                             gname.c_str(), gnum);
498                 }
499             }
500         }
501         esc = false;
502     }
503     regex = res;
504 }
505
506 std::string yf::HttpRewrite::Replace::sub_vars (
507         const std::map<std::string, std::string> & vars) const
508 {
509     std::string out;
510     bool esc = false;
511     const std::string & in = recipe;
512     for (size_t i = 0; i < in.size(); ++i)
513     {
514         if (!esc && in[i] == '\\')
515         {
516             esc = true;
517             continue;
518         }
519         if (!esc && in[i] == '$') //var
520         {
521             if (i+1 < in.size() && in[i+1] == '{') //ref prefix
522             {
523                 ++i;
524                 std::string name;
525                 bool term = false;
526                 while (++i < in.size())
527                 {
528                     if (in[i] == '}') { term = true; break; }
529                     name += in[i];
530                 }
531                 if (!term) throw mp::filter::FilterException
532                     ("Unterminated var ref in '"+in+"' at "
533                      + boost::lexical_cast<std::string>(i));
534                 std::map<std::string, std::string>::const_iterator it
535                     = vars.find(name);
536                 if (it != vars.end())
537                 {
538                     out += it->second;
539                 }
540             }
541             else
542             {
543                 throw mp::filter::FilterException
544                     ("Malformed or trimmed var ref in '"
545                      +in+"' at "+boost::lexical_cast<std::string>(i));
546             }
547             continue;
548         }
549         //passthru
550         out += in[i];
551         esc = false;
552     }
553     return out;
554 }
555
556
557 void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
558 {
559     std::map<std::string, RulePtr > rules;
560     for (ptr = ptr->children; ptr; ptr = ptr->next)
561     {
562         if (ptr->type != XML_ELEMENT_NODE)
563             continue;
564         else if (!strcmp((const char *) ptr->name, "rule"))
565         {
566             static const char *names[2] = { "name", 0 };
567             std::string values[1];
568             values[0] = "default";
569             mp::xml::parse_attr(ptr, names, values);
570
571             RulePtr rule(new Rule);
572             for (xmlNode *p = ptr->children; p; p = p->next)
573             {
574                 if (p->type != XML_ELEMENT_NODE)
575                     continue;
576                 if (!strcmp((const char *) p->name, "rewrite"))
577                 {
578                     Replace replace;
579                     const struct _xmlAttr *attr;
580                     for (attr = p->properties; attr; attr = attr->next)
581                     {
582                         if (!strcmp((const char *) attr->name,  "from"))
583                             replace.regex = mp::xml::get_text(attr->children);
584                         else if (!strcmp((const char *) attr->name,  "to"))
585                             replace.recipe = mp::xml::get_text(attr->children);
586                         else
587                             throw mp::filter::FilterException
588                                 ("Bad attribute "
589                                  + std::string((const char *) attr->name)
590                                  + " in rewrite section of http_rewrite");
591                     }
592                     yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
593                             replace.regex.c_str(), replace.recipe.c_str());
594                     replace.parse_groups();
595                     if (!replace.regex.empty())
596                         rule->replace_list.push_back(replace);
597                 }
598                 else
599                     throw mp::filter::FilterException
600                         ("Bad element "
601                          + std::string((const char *) p->name)
602                          + " in http_rewrite filter");
603             }
604             rules[values[0]] = rule;
605         }
606         else if (!strcmp((const char *) ptr->name, "within"))
607         {
608             static const char *names[6] =
609                 { "header", "attr", "tag", "rule", "reqline", 0 };
610             std::string values[5];
611             mp::xml::parse_attr(ptr, names, values);
612             Within w;
613             w.header = values[0];
614             w.attr = values[1];
615             w.tag = values[2];
616             std::map<std::string,RulePtr>::const_iterator it =
617                 rules.find(values[3]);
618             if (it == rules.end())
619                 throw mp::filter::FilterException
620                     ("Reference to non-existing rule '" + values[3] +
621                      "' in http_rewrite filter");
622             w.rule = it->second;
623             w.reqline = values[4] == "1";
624             phase.within_list.push_back(w);
625         }
626         else
627         {
628             throw mp::filter::FilterException
629                 ("Bad element "
630                  + std::string((const char *) ptr->name)
631                  + " in http_rewrite filter");
632         }
633     }
634 }
635
636 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
637         const char *path)
638 {
639     for (ptr = ptr->children; ptr; ptr = ptr->next)
640     {
641         if (ptr->type != XML_ELEMENT_NODE)
642             continue;
643         else if (!strcmp((const char *) ptr->name, "request"))
644         {
645             configure_phase(ptr, *req_phase);
646         }
647         else if (!strcmp((const char *) ptr->name, "response"))
648         {
649             configure_phase(ptr, *res_phase);
650         }
651         else
652         {
653             throw mp::filter::FilterException
654                 ("Bad element "
655                  + std::string((const char *) ptr->name)
656                  + " in http_rewrite1 filter");
657         }
658     }
659 }
660
661 static mp::filter::Base* filter_creator()
662 {
663     return new mp::filter::HttpRewrite;
664 }
665
666 extern "C" {
667     struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
668         0,
669         "http_rewrite",
670         filter_creator
671     };
672 }
673
674
675 /*
676  * Local variables:
677  * c-basic-offset: 4
678  * c-file-style: "Stroustrup"
679  * indent-tabs-mode: nil
680  * End:
681  * vim: shiftwidth=4 tabstop=8 expandtab
682  */
683