boost::smatch part of Replace object too
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
1 /* This file is part of Metaproxy.
2    Copyright (C) 2005-2013 Index Data
3
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19 #include "config.hpp"
20 #include <metaproxy/filter.hpp>
21 #include <metaproxy/package.hpp>
22 #include <metaproxy/util.hpp>
23 #include "filter_http_rewrite.hpp"
24 #include "html_parser.hpp"
25
26 #include <yaz/zgdu.h>
27 #include <yaz/log.h>
28
29 #include <stack>
30 #include <boost/regex.hpp>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/algorithm/string.hpp>
33
34 #include <map>
35
36 namespace mp = metaproxy_1;
37 namespace yf = mp::filter;
38
39 namespace metaproxy_1 {
40     namespace filter {
41         class HttpRewrite::Replace {
42         public:
43             boost::regex re;
44             boost::smatch what;
45             std::string recipe;
46             std::map<int, std::string> group_index;
47             const std::string search_replace(
48                 std::map<std::string, std::string> & vars,
49                 const std::string & txt);
50             std::string sub_vars(
51                 const std::map<std::string, std::string> & vars) const;
52             void parse_groups(std::string pattern);
53         };
54
55         class HttpRewrite::Rule {
56         public:
57             std::list<Replace> replace_list;
58             const std::string test_patterns(
59                 std::map<std::string, std::string> & vars,
60                 const std::string & txt);
61         };
62         class HttpRewrite::Within {
63         public:
64             std::string header;
65             std::string attr;
66             std::string tag;
67             bool reqline;
68             RulePtr rule;
69         };
70
71         class HttpRewrite::Phase {
72         public:
73             Phase();
74             std::list<Within> within_list;
75             int m_verbose;
76             void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
77                 std::map<std::string, std::string> & vars) const;
78             void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
79                 std::map<std::string, std::string> & vars) const;
80             void rewrite_body(mp::odr & o,
81                 char **content_buf, int *content_len,
82                 std::map<std::string, std::string> & vars) const;
83         };
84         class HttpRewrite::Event : public HTMLParserEvent {
85             void openTagStart(const char *tag, int tag_len);
86             void anyTagEnd(const char *tag, int tag_len, int close_it);
87             void attribute(const char *tag, int tag_len,
88                            const char *attr, int attr_len,
89                            const char *value, int val_len,
90                            const char *sep);
91             void closeTag(const char *tag, int tag_len);
92             void text(const char *value, int len);
93             const Phase *m_phase;
94             WRBUF m_w;
95             std::stack<std::list<Within>::const_iterator> s_within;
96             std::map<std::string, std::string> &m_vars;
97         public:
98             Event(const Phase *p, std::map<std::string, std::string> &vars);
99             ~Event();
100             const char *result();
101         };
102     }
103 }
104
105 yf::HttpRewrite::HttpRewrite() :
106     req_phase(new Phase), res_phase(new Phase)
107 {
108 }
109
110 yf::HttpRewrite::~HttpRewrite()
111 {
112 }
113
114 void yf::HttpRewrite::process(mp::Package & package) const
115 {
116     yaz_log(YLOG_LOG, "HttpRewrite begins....");
117     Z_GDU *gdu = package.request().get();
118     //map of request/response vars
119     std::map<std::string, std::string> vars;
120     //we have an http req
121     if (gdu && gdu->which == Z_GDU_HTTP_Request)
122     {
123         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
124         mp::odr o;
125         req_phase->rewrite_reqline(o, hreq, vars);
126         yaz_log(YLOG_LOG, ">> Request headers");
127         req_phase->rewrite_headers(o, hreq->headers, vars);
128         req_phase->rewrite_body(o,
129                 &hreq->content_buf, &hreq->content_len, vars);
130         package.request() = gdu;
131     }
132     package.move();
133     gdu = package.response().get();
134     if (gdu && gdu->which == Z_GDU_HTTP_Response)
135     {
136         Z_HTTP_Response *hres = gdu->u.HTTP_Response;
137         yaz_log(YLOG_LOG, "Response code %d", hres->code);
138         mp::odr o;
139         yaz_log(YLOG_LOG, "<< Respose headers");
140         res_phase->rewrite_headers(o, hres->headers, vars);
141         res_phase->rewrite_body(o, &hres->content_buf,
142                 &hres->content_len, vars);
143         package.response() = gdu;
144     }
145 }
146
147 void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
148         Z_HTTP_Request *hreq,
149         std::map<std::string, std::string> & vars) const
150 {
151     //rewrite the request line
152     std::string path;
153     if (strstr(hreq->path, "http://") == hreq->path)
154     {
155         yaz_log(YLOG_LOG, "Path in the method line is absolute, "
156             "possibly a proxy request");
157         path += hreq->path;
158     }
159     else
160     {
161         //TODO what about proto
162         path += "http://";
163         path += z_HTTP_header_lookup(hreq->headers, "Host");
164         path += hreq->path;
165     }
166
167
168     std::list<Within>::const_iterator it = within_list.begin();
169     for (; it != within_list.end(); it++)
170         if (it->reqline)
171         {
172             RulePtr rule = it->rule;
173             yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
174             std::string npath = rule->test_patterns(vars, path);
175             if (!npath.empty())
176             {
177                 yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
178                 hreq->path = odr_strdup(o, npath.c_str());
179             }
180         }
181 }
182
183 void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
184         Z_HTTP_Header *headers,
185         std::map<std::string, std::string> & vars) const
186 {
187     for (Z_HTTP_Header *header = headers; header; header = header->next)
188     {
189         std::list<Within>::const_iterator it = within_list.begin();
190         for (; it != within_list.end(); it++)
191         {
192             if (it->header.length() > 0 &&
193                 yaz_strcasecmp(it->header.c_str(), header->name) == 0)
194             {
195                 std::string sheader(header->name);
196                 sheader += ": ";
197                 sheader += header->value;
198
199                 RulePtr rule = it->rule;
200                 std::string out = rule->test_patterns(vars, sheader);
201                 if (!out.empty())
202                 {
203                     size_t pos = out.find(": ");
204                     if (pos == std::string::npos)
205                     {
206                         yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
207                         continue;
208                     }
209                     header->name = odr_strdup(o, out.substr(0, pos).c_str());
210                     header->value = odr_strdup(o,
211                                                out.substr(pos + 2,
212                                                           std::string::npos).c_str());
213                 }
214             }
215         }
216     }
217 }
218
219 void yf::HttpRewrite::Phase::rewrite_body(mp::odr & o,
220         char **content_buf,
221         int *content_len,
222         std::map<std::string, std::string> & vars) const
223 {
224     if (*content_buf)
225     {
226         int i;
227         for (i = 0; i < *content_len; i++)
228             if ((*content_buf)[i] == 0)
229                 return;  // binary content. skip
230
231         HTMLParser parser;
232         Event ev(this, vars);
233
234         parser.set_verbose(m_verbose);
235
236         std::string buf(*content_buf, *content_len);
237
238         parser.parse(ev, buf.c_str());
239         const char *res = ev.result();
240         *content_buf = odr_strdup(o, res);
241         *content_len = strlen(res);
242     }
243 }
244
245 yf::HttpRewrite::Event::Event(const Phase *p,
246                               std::map<std::string, std::string> & vars
247     ) : m_phase(p), m_vars(vars)
248 {
249     m_w = wrbuf_alloc();
250 }
251
252 yf::HttpRewrite::Event::~Event()
253 {
254     wrbuf_destroy(m_w);
255 }
256
257 const char *yf::HttpRewrite::Event::result()
258 {
259     return wrbuf_cstr(m_w);
260 }
261
262 void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len)
263 {
264     wrbuf_putc(m_w, '<');
265     wrbuf_write(m_w, tag, tag_len);
266
267     std::string t(tag, tag_len);
268     std::list<Within>::const_iterator it = m_phase->within_list.begin();
269     for (; it != m_phase->within_list.end(); it++)
270     {
271         if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(),
272                                                    t.c_str()) == 0)
273         {
274             std::vector<std::string> attr;
275             boost::split(attr, it->attr, boost::is_any_of(","));
276             size_t i;
277             for (i = 0; i < attr.size(); i++)
278             {
279                 if (attr[i].compare("#text") == 0)
280                 {
281                     s_within.push(it);
282                     return;
283                 }
284             }
285         }
286     }
287 }
288
289 void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len,
290                                        int close_it)
291 {
292     if (close_it)
293     {
294         if (!s_within.empty())
295         {
296             std::list<Within>::const_iterator it = s_within.top();
297             std::string t(tag, tag_len);
298             if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
299                 s_within.pop();
300         }
301     }
302     if (close_it)
303         wrbuf_putc(m_w, '/');
304     wrbuf_putc(m_w, '>');
305 }
306
307 void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len,
308                                        const char *attr, int attr_len,
309                                        const char *value, int val_len,
310                                        const char *sep)
311 {
312     std::list<Within>::const_iterator it = m_phase->within_list.begin();
313     bool subst = false;
314
315     for (; it != m_phase->within_list.end(); it++)
316     {
317         std::string t(tag, tag_len);
318         if (it->tag.length() == 0 ||
319             yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
320         {
321             std::string a(attr, attr_len);
322             std::vector<std::string> attr;
323             boost::split(attr, it->attr, boost::is_any_of(","));
324             size_t i;
325             for (i = 0; i < attr.size(); i++)
326             {
327                 if (attr[i].compare("#text") &&
328                     yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0)
329                     subst = true;
330             }
331         }
332         if (subst)
333             break;
334     }
335
336     wrbuf_putc(m_w, ' ');
337     wrbuf_write(m_w, attr, attr_len);
338     if (value)
339     {
340         wrbuf_puts(m_w, "=");
341         wrbuf_puts(m_w, sep);
342
343         std::string output;
344         if (subst)
345         {
346             std::string input(value, val_len);
347             output = it->rule->test_patterns(m_vars, input);
348         }
349         if (output.empty())
350             wrbuf_write(m_w, value, val_len);
351         else
352             wrbuf_puts(m_w, output.c_str());
353         wrbuf_puts(m_w, sep);
354     }
355 }
356
357 void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len)
358 {
359     if (!s_within.empty())
360     {
361         std::list<Within>::const_iterator it = s_within.top();
362         std::string t(tag, tag_len);
363         if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
364             s_within.pop();
365     }
366     wrbuf_puts(m_w, "</");
367     wrbuf_write(m_w, tag, tag_len);
368 }
369
370 void yf::HttpRewrite::Event::text(const char *value, int len)
371 {
372     std::list<Within>::const_iterator it = m_phase->within_list.end();
373     if (!s_within.empty())
374         it = s_within.top();
375     std::string output;
376     if (it != m_phase->within_list.end())
377     {
378         std::string input(value, len);
379         output = it->rule->test_patterns(m_vars, input);
380     }
381     if (output.empty())
382         wrbuf_write(m_w, value, len);
383     else
384         wrbuf_puts(m_w, output.c_str());
385 }
386
387 /**
388  * Tests pattern from the vector in order and executes recipe on
389  the first match.
390  */
391 const std::string yf::HttpRewrite::Rule::test_patterns(
392         std::map<std::string, std::string> & vars,
393         const std::string & txt)
394 {
395     std::list<Replace>::iterator it = replace_list.begin();
396
397     for (; it != replace_list.end(); it++)
398     {
399         std::string out = it->search_replace(vars, txt);
400         if (!out.empty()) return out;
401     }
402     return "";
403 }
404
405 const std::string yf::HttpRewrite::Replace::search_replace(
406         std::map<std::string, std::string> & vars,
407         const std::string & txt)
408 {
409     std::string::const_iterator start, end;
410     start = txt.begin();
411     end = txt.end();
412     std::string out;
413     while (regex_search(start, end, what, re)) //find next full match
414     {
415         size_t i;
416         for (i = 1; i < what.size(); ++i)
417         {
418             //check if the group is named
419             std::map<int, std::string>::const_iterator it
420                 = group_index.find(i);
421             if (it != group_index.end())
422             {   //it is
423                 vars[it->second] = what[i];
424             }
425
426         }
427         //prepare replacement string
428         std::string rvalue = sub_vars(vars);
429         yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
430                 what.str(0).c_str(), rvalue.c_str());
431         out.append(start, what[0].first);
432         out.append(rvalue);
433         start = what[0].second; //move search forward
434     }
435     //if we had a match cat the last part
436     if (start != txt.begin())
437         out.append(start, end);
438     return out;
439 }
440
441 void yf::HttpRewrite::Replace::parse_groups(std::string pattern)
442 {
443     int gnum = 0;
444     bool esc = false;
445     const std::string &str = pattern;
446     std::string res;
447     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
448     for (size_t i = 0; i < str.size(); ++i)
449     {
450         res += str[i];
451         if (!esc && str[i] == '\\')
452         {
453             esc = true;
454             continue;
455         }
456         if (!esc && str[i] == '(') //group starts
457         {
458             gnum++;
459             if (i+1 < str.size() && str[i+1] == '?') //group with attrs
460             {
461                 i++;
462                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
463                 {
464                     if (gnum > 0) gnum--;
465                     res += str[i];
466                     i++;
467                     res += str[i];
468                     continue;
469                 }
470                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
471                     i++;
472                 if (i+1 < str.size() && str[i+1] == '<') //named
473                 {
474                     i++;
475                     std::string gname;
476                     bool term = false;
477                     while (++i < str.size())
478                     {
479                         if (str[i] == '>') { term = true; break; }
480                         if (!isalnum(str[i]))
481                             throw mp::filter::FilterException
482                                 ("Only alphanumeric chars allowed, found "
483                                  " in '"
484                                  + str
485                                  + "' at "
486                                  + boost::lexical_cast<std::string>(i));
487                         gname += str[i];
488                     }
489                     if (!term)
490                         throw mp::filter::FilterException
491                             ("Unterminated group name '" + gname
492                              + " in '" + str +"'");
493                     group_index[gnum] = gname;
494                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
495                             gname.c_str(), gnum);
496                 }
497             }
498         }
499         esc = false;
500     }
501     re = res;
502 }
503
504 std::string yf::HttpRewrite::Replace::sub_vars(
505     const std::map<std::string, std::string> & vars) const
506 {
507     std::string out;
508     bool esc = false;
509     const std::string & in = recipe;
510     for (size_t i = 0; i < in.size(); ++i)
511     {
512         if (!esc && in[i] == '\\')
513         {
514             esc = true;
515             continue;
516         }
517         if (!esc && in[i] == '$') //var
518         {
519             if (i+1 < in.size() && in[i+1] == '{') //ref prefix
520             {
521                 ++i;
522                 std::string name;
523                 bool term = false;
524                 while (++i < in.size())
525                 {
526                     if (in[i] == '}') { term = true; break; }
527                     name += in[i];
528                 }
529                 if (!term) throw mp::filter::FilterException
530                     ("Unterminated var ref in '"+in+"' at "
531                      + boost::lexical_cast<std::string>(i));
532                 std::map<std::string, std::string>::const_iterator it
533                     = vars.find(name);
534                 if (it != vars.end())
535                 {
536                     out += it->second;
537                 }
538             }
539             else
540             {
541                 throw mp::filter::FilterException
542                     ("Malformed or trimmed var ref in '"
543                      +in+"' at "+boost::lexical_cast<std::string>(i));
544             }
545             continue;
546         }
547         //passthru
548         out += in[i];
549         esc = false;
550     }
551     return out;
552 }
553
554 yf::HttpRewrite::Phase::Phase() : m_verbose(0)
555 {
556 }
557
558 void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
559 {
560     static const char *names[2] = { "verbose", 0 };
561     std::string values[1];
562     values[0] = "0";
563     mp::xml::parse_attr(ptr, names, values);
564
565     phase.m_verbose = atoi(values[0].c_str());
566
567     std::map<std::string, RulePtr > rules;
568     for (ptr = ptr->children; ptr; ptr = ptr->next)
569     {
570         if (ptr->type != XML_ELEMENT_NODE)
571             continue;
572         else if (!strcmp((const char *) ptr->name, "rule"))
573         {
574             static const char *names[2] = { "name", 0 };
575             std::string values[1];
576             values[0] = "default";
577             mp::xml::parse_attr(ptr, names, values);
578
579             RulePtr rule(new Rule);
580             for (xmlNode *p = ptr->children; p; p = p->next)
581             {
582                 if (p->type != XML_ELEMENT_NODE)
583                     continue;
584                 if (!strcmp((const char *) p->name, "rewrite"))
585                 {
586                     Replace replace;
587                     std::string from;
588                     const struct _xmlAttr *attr;
589                     for (attr = p->properties; attr; attr = attr->next)
590                     {
591                         if (!strcmp((const char *) attr->name,  "from"))
592                             from = mp::xml::get_text(attr->children);
593                         else if (!strcmp((const char *) attr->name,  "to"))
594                             replace.recipe = mp::xml::get_text(attr->children);
595                         else
596                             throw mp::filter::FilterException
597                                 ("Bad attribute "
598                                  + std::string((const char *) attr->name)
599                                  + " in rewrite section of http_rewrite");
600                     }
601                     yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
602                             from.c_str(), replace.recipe.c_str());
603                     if (!from.empty())
604                     {
605                         replace.parse_groups(from);
606                         rule->replace_list.push_back(replace);
607                     }
608                 }
609                 else
610                     throw mp::filter::FilterException
611                         ("Bad element "
612                          + std::string((const char *) p->name)
613                          + " in http_rewrite filter");
614             }
615             rules[values[0]] = rule;
616         }
617         else if (!strcmp((const char *) ptr->name, "within"))
618         {
619             static const char *names[6] =
620                 { "header", "attr", "tag", "rule", "reqline", 0 };
621             std::string values[5];
622             mp::xml::parse_attr(ptr, names, values);
623             Within w;
624             w.header = values[0];
625             w.attr = values[1];
626             w.tag = values[2];
627             std::map<std::string,RulePtr>::const_iterator it =
628                 rules.find(values[3]);
629             if (it == rules.end())
630                 throw mp::filter::FilterException
631                     ("Reference to non-existing rule '" + values[3] +
632                      "' in http_rewrite filter");
633             w.rule = it->second;
634             w.reqline = values[4] == "1";
635             phase.within_list.push_back(w);
636         }
637         else
638         {
639             throw mp::filter::FilterException
640                 ("Bad element "
641                  + std::string((const char *) ptr->name)
642                  + " in http_rewrite filter");
643         }
644     }
645 }
646
647 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
648         const char *path)
649 {
650     for (ptr = ptr->children; ptr; ptr = ptr->next)
651     {
652         if (ptr->type != XML_ELEMENT_NODE)
653             continue;
654         else if (!strcmp((const char *) ptr->name, "request"))
655         {
656             configure_phase(ptr, *req_phase);
657         }
658         else if (!strcmp((const char *) ptr->name, "response"))
659         {
660             configure_phase(ptr, *res_phase);
661         }
662         else
663         {
664             throw mp::filter::FilterException
665                 ("Bad element "
666                  + std::string((const char *) ptr->name)
667                  + " in http_rewrite1 filter");
668         }
669     }
670 }
671
672 static mp::filter::Base* filter_creator()
673 {
674     return new mp::filter::HttpRewrite;
675 }
676
677 extern "C" {
678     struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
679         0,
680         "http_rewrite",
681         filter_creator
682     };
683 }
684
685
686 /*
687  * Local variables:
688  * c-basic-offset: 4
689  * c-file-style: "Stroustrup"
690  * indent-tabs-mode: nil
691  * End:
692  * vim: shiftwidth=4 tabstop=8 expandtab
693  */
694