Reformat a bit
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
1 /* This file is part of Metaproxy.
2    Copyright (C) 2005-2013 Index Data
3
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19 #include "config.hpp"
20 #include <metaproxy/filter.hpp>
21 #include <metaproxy/package.hpp>
22 #include <metaproxy/util.hpp>
23 #include "filter_http_rewrite.hpp"
24 #include "html_parser.hpp"
25
26 #include <yaz/zgdu.h>
27 #include <yaz/log.h>
28
29 #include <stack>
30 #include <boost/regex.hpp>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/algorithm/string.hpp>
33
34 #include <map>
35
36 namespace mp = metaproxy_1;
37 namespace yf = mp::filter;
38
39 namespace metaproxy_1 {
40     namespace filter {
41         class HttpRewrite::Replace {
42         public:
43             std::string regex;
44             std::string recipe;
45             std::map<int, std::string> group_index;
46             const std::string search_replace(
47                 std::map<std::string, std::string> & vars,
48                 const std::string & txt) const;
49             std::string sub_vars(
50                 const std::map<std::string, std::string> & vars) const;
51             void parse_groups();
52         };
53
54         class HttpRewrite::Rule {
55         public:
56             std::list<Replace> replace_list;
57             const std::string test_patterns(
58                 std::map<std::string, std::string> & vars,
59                 const std::string & txt) const;
60         };
61         class HttpRewrite::Within {
62         public:
63             std::string header;
64             std::string attr;
65             std::string tag;
66             bool reqline;
67             RulePtr rule;
68         };
69
70         class HttpRewrite::Phase {
71         public:
72             Phase();
73             std::list<Within> within_list;
74             int m_verbose;
75             void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
76                 std::map<std::string, std::string> & vars) const;
77             void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
78                 std::map<std::string, std::string> & vars) const;
79             void rewrite_body(mp::odr & o,
80                 char **content_buf, int *content_len,
81                 std::map<std::string, std::string> & vars) const;
82         };
83         class HttpRewrite::Event : public HTMLParserEvent {
84             void openTagStart(const char *tag, int tag_len);
85             void anyTagEnd(const char *tag, int tag_len, int close_it);
86             void attribute(const char *tag, int tag_len,
87                            const char *attr, int attr_len,
88                            const char *value, int val_len);
89             void closeTag(const char *tag, int tag_len);
90             void text(const char *value, int len);
91             const Phase *m_phase;
92             WRBUF m_w;
93             std::stack<std::list<Within>::const_iterator> s_within;
94             std::map<std::string, std::string> &m_vars;
95         public:
96             Event(const Phase *p, std::map<std::string, std::string> &vars);
97             ~Event();
98             const char *result();
99         };
100     }
101 }
102
103 yf::HttpRewrite::HttpRewrite() :
104     req_phase(new Phase), res_phase(new Phase)
105 {
106 }
107
108 yf::HttpRewrite::~HttpRewrite()
109 {
110 }
111
112 void yf::HttpRewrite::process(mp::Package & package) const
113 {
114     yaz_log(YLOG_LOG, "HttpRewrite begins....");
115     Z_GDU *gdu = package.request().get();
116     //map of request/response vars
117     std::map<std::string, std::string> vars;
118     //we have an http req
119     if (gdu && gdu->which == Z_GDU_HTTP_Request)
120     {
121         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
122         mp::odr o;
123         req_phase->rewrite_reqline(o, hreq, vars);
124         yaz_log(YLOG_LOG, ">> Request headers");
125         req_phase->rewrite_headers(o, hreq->headers, vars);
126         req_phase->rewrite_body(o,
127                 &hreq->content_buf, &hreq->content_len, vars);
128         package.request() = gdu;
129     }
130     package.move();
131     gdu = package.response().get();
132     if (gdu && gdu->which == Z_GDU_HTTP_Response)
133     {
134         Z_HTTP_Response *hres = gdu->u.HTTP_Response;
135         yaz_log(YLOG_LOG, "Response code %d", hres->code);
136         mp::odr o;
137         yaz_log(YLOG_LOG, "<< Respose headers");
138         res_phase->rewrite_headers(o, hres->headers, vars);
139         res_phase->rewrite_body(o, &hres->content_buf,
140                 &hres->content_len, vars);
141         package.response() = gdu;
142     }
143 }
144
145 void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
146         Z_HTTP_Request *hreq,
147         std::map<std::string, std::string> & vars) const
148 {
149     //rewrite the request line
150     std::string path;
151     if (strstr(hreq->path, "http://") == hreq->path)
152     {
153         yaz_log(YLOG_LOG, "Path in the method line is absolute, "
154             "possibly a proxy request");
155         path += hreq->path;
156     }
157     else
158     {
159         //TODO what about proto
160         path += "http://";
161         path += z_HTTP_header_lookup(hreq->headers, "Host");
162         path += hreq->path;
163     }
164
165
166     std::list<Within>::const_iterator it = within_list.begin();
167     for (; it != within_list.end(); it++)
168         if (it->reqline)
169         {
170             RulePtr rule = it->rule;
171             yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
172             std::string npath = rule->test_patterns(vars, path);
173             if (!npath.empty())
174             {
175                 yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
176                 hreq->path = odr_strdup(o, npath.c_str());
177             }
178         }
179 }
180
181 void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
182         Z_HTTP_Header *headers,
183         std::map<std::string, std::string> & vars) const
184 {
185     for (Z_HTTP_Header *header = headers; header; header = header->next)
186     {
187         std::list<Within>::const_iterator it = within_list.begin();
188         for (; it != within_list.end(); it++)
189         {
190             if (it->header.length() > 0 &&
191                 yaz_strcasecmp(it->header.c_str(), header->name) == 0)
192             {
193                 std::string sheader(header->name);
194                 sheader += ": ";
195                 sheader += header->value;
196
197                 RulePtr rule = it->rule;
198                 std::string out = rule->test_patterns(vars, sheader);
199                 if (!out.empty())
200                 {
201                     size_t pos = out.find(": ");
202                     if (pos == std::string::npos)
203                     {
204                         yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
205                         continue;
206                     }
207                     header->name = odr_strdup(o, out.substr(0, pos).c_str());
208                     header->value = odr_strdup(o,
209                                                out.substr(pos + 2,
210                                                           std::string::npos).c_str());
211                 }
212             }
213         }
214     }
215 }
216
217 void yf::HttpRewrite::Phase::rewrite_body(mp::odr & o,
218         char **content_buf,
219         int *content_len,
220         std::map<std::string, std::string> & vars) const
221 {
222     if (*content_buf)
223     {
224         int i;
225         for (i = 0; i < *content_len; i++)
226             if ((*content_buf)[i] == 0)
227                 return;  // binary content. skip
228
229         HTMLParser parser;
230         Event ev(this, vars);
231
232         parser.set_verbose(m_verbose);
233
234         std::string buf(*content_buf, *content_len);
235
236         parser.parse(ev, buf.c_str());
237         const char *res = ev.result();
238         *content_buf = odr_strdup(o, res);
239         *content_len = strlen(res);
240     }
241 }
242
243 yf::HttpRewrite::Event::Event(const Phase *p,
244                               std::map<std::string, std::string> & vars
245     ) : m_phase(p), m_vars(vars)
246 {
247     m_w = wrbuf_alloc();
248 }
249
250 yf::HttpRewrite::Event::~Event()
251 {
252     wrbuf_destroy(m_w);
253 }
254
255 const char *yf::HttpRewrite::Event::result()
256 {
257     return wrbuf_cstr(m_w);
258 }
259
260 void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len)
261 {
262     wrbuf_putc(m_w, '<');
263     wrbuf_write(m_w, tag, tag_len);
264
265     std::string t(tag, tag_len);
266     std::list<Within>::const_iterator it = m_phase->within_list.begin();
267     for (; it != m_phase->within_list.end(); it++)
268     {
269         if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(),
270                                                    t.c_str()) == 0)
271         {
272             std::vector<std::string> attr;
273             boost::split(attr, it->attr, boost::is_any_of(","));
274             size_t i;
275             for (i = 0; i < attr.size(); i++)
276             {
277                 if (attr[i].compare("#text") == 0)
278                 {
279                     s_within.push(it);
280                     return;
281                 }
282             }
283         }
284     }
285 }
286
287 void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len,
288                                        int close_it)
289 {
290     if (close_it)
291     {
292         if (!s_within.empty())
293         {
294             std::list<Within>::const_iterator it = s_within.top();
295             std::string t(tag, tag_len);
296             if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
297                 s_within.pop();
298         }
299     }
300     if (close_it)
301         wrbuf_putc(m_w, '/');
302     wrbuf_putc(m_w, '>');
303 }
304
305 void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len,
306                                        const char *attr, int attr_len,
307                                        const char *value, int val_len)
308 {
309     std::list<Within>::const_iterator it = m_phase->within_list.begin();
310     bool subst = false;
311
312     for (; it != m_phase->within_list.end(); it++)
313     {
314         std::string t(tag, tag_len);
315         if (it->tag.length() == 0 ||
316             yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
317         {
318             std::string a(attr, attr_len);
319             std::vector<std::string> attr;
320             boost::split(attr, it->attr, boost::is_any_of(","));
321             size_t i;
322             for (i = 0; i < attr.size(); i++)
323             {
324                 if (attr[i].compare("#text") &&
325                     yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0)
326                     subst = true;
327             }
328         }
329         if (subst)
330             break;
331     }
332
333     wrbuf_putc(m_w, ' ');
334     wrbuf_write(m_w, attr, attr_len);
335     wrbuf_puts(m_w, "=\"");
336
337     std::string output;
338     if (subst)
339     {
340         std::string input(value, val_len);
341         output = it->rule->test_patterns(m_vars, input);
342     }
343     if (output.empty())
344         wrbuf_write(m_w, value, val_len);
345     else
346         wrbuf_puts(m_w, output.c_str());
347     wrbuf_puts(m_w, "\"");
348 }
349
350 void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len)
351 {
352     if (!s_within.empty())
353     {
354         std::list<Within>::const_iterator it = s_within.top();
355         std::string t(tag, tag_len);
356         if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
357             s_within.pop();
358     }
359     wrbuf_puts(m_w, "</");
360     wrbuf_write(m_w, tag, tag_len);
361 }
362
363 void yf::HttpRewrite::Event::text(const char *value, int len)
364 {
365     std::list<Within>::const_iterator it = m_phase->within_list.end();
366     if (!s_within.empty())
367         it = s_within.top();
368     std::string output;
369     if (it != m_phase->within_list.end())
370     {
371         std::string input(value, len);
372         output = it->rule->test_patterns(m_vars, input);
373     }
374     if (output.empty())
375         wrbuf_write(m_w, value, len);
376     else
377         wrbuf_puts(m_w, output.c_str());
378 }
379
380
381 /**
382  * Tests pattern from the vector in order and executes recipe on
383  the first match.
384  */
385 const std::string yf::HttpRewrite::Rule::test_patterns(
386         std::map<std::string, std::string> & vars,
387         const std::string & txt) const
388 {
389     std::list<Replace>::const_iterator it = replace_list.begin();
390
391     for (; it != replace_list.end(); it++)
392     {
393         std::string out = it->search_replace(vars, txt);
394         if (!out.empty()) return out;
395     }
396     return "";
397 }
398
399 const std::string yf::HttpRewrite::Replace::search_replace(
400         std::map<std::string, std::string> & vars,
401         const std::string & txt) const
402 {
403     //exec regex against value
404     boost::regex re(regex);
405     boost::smatch what;
406     std::string::const_iterator start, end;
407     start = txt.begin();
408     end = txt.end();
409     std::string out;
410     while (regex_search(start, end, what, re)) //find next full match
411     {
412         size_t i;
413         for (i = 1; i < what.size(); ++i)
414         {
415             //check if the group is named
416             std::map<int, std::string>::const_iterator it
417                 = group_index.find(i);
418             if (it != group_index.end())
419             {   //it is
420                 if (!what[i].str().empty())
421                     vars[it->second] = what[i];
422             }
423
424         }
425         //prepare replacement string
426         std::string rvalue = sub_vars(vars);
427         yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
428                 what.str(0).c_str(), rvalue.c_str());
429         out.append(start, what[0].first);
430         out.append(rvalue);
431         start = what[0].second; //move search forward
432     }
433     //if we had a match cat the last part
434     if (start != txt.begin())
435         out.append(start, end);
436     return out;
437 }
438
439 void yf::HttpRewrite::Replace::parse_groups()
440 {
441     int gnum = 0;
442     bool esc = false;
443     const std::string & str = regex;
444     std::string res;
445     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
446     for (size_t i = 0; i < str.size(); ++i)
447     {
448         res += str[i];
449         if (!esc && str[i] == '\\')
450         {
451             esc = true;
452             continue;
453         }
454         if (!esc && str[i] == '(') //group starts
455         {
456             gnum++;
457             if (i+1 < str.size() && str[i+1] == '?') //group with attrs
458             {
459                 i++;
460                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
461                 {
462                     if (gnum > 0) gnum--;
463                     res += str[i];
464                     i++;
465                     res += str[i];
466                     continue;
467                 }
468                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
469                     i++;
470                 if (i+1 < str.size() && str[i+1] == '<') //named
471                 {
472                     i++;
473                     std::string gname;
474                     bool term = false;
475                     while (++i < str.size())
476                     {
477                         if (str[i] == '>') { term = true; break; }
478                         if (!isalnum(str[i]))
479                             throw mp::filter::FilterException
480                                 ("Only alphanumeric chars allowed, found "
481                                  " in '"
482                                  + str
483                                  + "' at "
484                                  + boost::lexical_cast<std::string>(i));
485                         gname += str[i];
486                     }
487                     if (!term)
488                         throw mp::filter::FilterException
489                             ("Unterminated group name '" + gname
490                              + " in '" + str +"'");
491                     group_index[gnum] = gname;
492                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
493                             gname.c_str(), gnum);
494                 }
495             }
496         }
497         esc = false;
498     }
499     regex = res;
500 }
501
502 std::string yf::HttpRewrite::Replace::sub_vars(
503     const std::map<std::string, std::string> & vars) const
504 {
505     std::string out;
506     bool esc = false;
507     const std::string & in = recipe;
508     for (size_t i = 0; i < in.size(); ++i)
509     {
510         if (!esc && in[i] == '\\')
511         {
512             esc = true;
513             continue;
514         }
515         if (!esc && in[i] == '$') //var
516         {
517             if (i+1 < in.size() && in[i+1] == '{') //ref prefix
518             {
519                 ++i;
520                 std::string name;
521                 bool term = false;
522                 while (++i < in.size())
523                 {
524                     if (in[i] == '}') { term = true; break; }
525                     name += in[i];
526                 }
527                 if (!term) throw mp::filter::FilterException
528                     ("Unterminated var ref in '"+in+"' at "
529                      + boost::lexical_cast<std::string>(i));
530                 std::map<std::string, std::string>::const_iterator it
531                     = vars.find(name);
532                 if (it != vars.end())
533                 {
534                     out += it->second;
535                 }
536             }
537             else
538             {
539                 throw mp::filter::FilterException
540                     ("Malformed or trimmed var ref in '"
541                      +in+"' at "+boost::lexical_cast<std::string>(i));
542             }
543             continue;
544         }
545         //passthru
546         out += in[i];
547         esc = false;
548     }
549     return out;
550 }
551
552 yf::HttpRewrite::Phase::Phase() : m_verbose(0)
553 {
554 }
555
556 void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
557 {
558     static const char *names[2] = { "verbose", 0 };
559     std::string values[1];
560     values[0] = "0";
561     mp::xml::parse_attr(ptr, names, values);
562
563     phase.m_verbose = atoi(values[0].c_str());
564
565     std::map<std::string, RulePtr > rules;
566     for (ptr = ptr->children; ptr; ptr = ptr->next)
567     {
568         if (ptr->type != XML_ELEMENT_NODE)
569             continue;
570         else if (!strcmp((const char *) ptr->name, "rule"))
571         {
572             static const char *names[2] = { "name", 0 };
573             std::string values[1];
574             values[0] = "default";
575             mp::xml::parse_attr(ptr, names, values);
576
577             RulePtr rule(new Rule);
578             for (xmlNode *p = ptr->children; p; p = p->next)
579             {
580                 if (p->type != XML_ELEMENT_NODE)
581                     continue;
582                 if (!strcmp((const char *) p->name, "rewrite"))
583                 {
584                     Replace replace;
585                     const struct _xmlAttr *attr;
586                     for (attr = p->properties; attr; attr = attr->next)
587                     {
588                         if (!strcmp((const char *) attr->name,  "from"))
589                             replace.regex = mp::xml::get_text(attr->children);
590                         else if (!strcmp((const char *) attr->name,  "to"))
591                             replace.recipe = mp::xml::get_text(attr->children);
592                         else
593                             throw mp::filter::FilterException
594                                 ("Bad attribute "
595                                  + std::string((const char *) attr->name)
596                                  + " in rewrite section of http_rewrite");
597                     }
598                     yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
599                             replace.regex.c_str(), replace.recipe.c_str());
600                     replace.parse_groups();
601                     if (!replace.regex.empty())
602                         rule->replace_list.push_back(replace);
603                 }
604                 else
605                     throw mp::filter::FilterException
606                         ("Bad element "
607                          + std::string((const char *) p->name)
608                          + " in http_rewrite filter");
609             }
610             rules[values[0]] = rule;
611         }
612         else if (!strcmp((const char *) ptr->name, "within"))
613         {
614             static const char *names[6] =
615                 { "header", "attr", "tag", "rule", "reqline", 0 };
616             std::string values[5];
617             mp::xml::parse_attr(ptr, names, values);
618             Within w;
619             w.header = values[0];
620             w.attr = values[1];
621             w.tag = values[2];
622             std::map<std::string,RulePtr>::const_iterator it =
623                 rules.find(values[3]);
624             if (it == rules.end())
625                 throw mp::filter::FilterException
626                     ("Reference to non-existing rule '" + values[3] +
627                      "' in http_rewrite filter");
628             w.rule = it->second;
629             w.reqline = values[4] == "1";
630             phase.within_list.push_back(w);
631         }
632         else
633         {
634             throw mp::filter::FilterException
635                 ("Bad element "
636                  + std::string((const char *) ptr->name)
637                  + " in http_rewrite filter");
638         }
639     }
640 }
641
642 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
643         const char *path)
644 {
645     for (ptr = ptr->children; ptr; ptr = ptr->next)
646     {
647         if (ptr->type != XML_ELEMENT_NODE)
648             continue;
649         else if (!strcmp((const char *) ptr->name, "request"))
650         {
651             configure_phase(ptr, *req_phase);
652         }
653         else if (!strcmp((const char *) ptr->name, "response"))
654         {
655             configure_phase(ptr, *res_phase);
656         }
657         else
658         {
659             throw mp::filter::FilterException
660                 ("Bad element "
661                  + std::string((const char *) ptr->name)
662                  + " in http_rewrite1 filter");
663         }
664     }
665 }
666
667 static mp::filter::Base* filter_creator()
668 {
669     return new mp::filter::HttpRewrite;
670 }
671
672 extern "C" {
673     struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
674         0,
675         "http_rewrite",
676         filter_creator
677     };
678 }
679
680
681 /*
682  * Local variables:
683  * c-basic-offset: 4
684  * c-file-style: "Stroustrup"
685  * indent-tabs-mode: nil
686  * End:
687  * vim: shiftwidth=4 tabstop=8 expandtab
688  */
689