filter_http_rewrite: don't crash if "Host" header is missing
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
1 /* This file is part of Metaproxy.
2    Copyright (C) 2005-2013 Index Data
3
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19 #include "config.hpp"
20 #include <metaproxy/filter.hpp>
21 #include <metaproxy/package.hpp>
22 #include <metaproxy/util.hpp>
23 #include "filter_http_rewrite.hpp"
24 #include "html_parser.hpp"
25
26 #include <yaz/zgdu.h>
27 #include <yaz/log.h>
28
29 #include <stack>
30 #include <boost/regex.hpp>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/algorithm/string.hpp>
33
34 #include <map>
35
36 namespace mp = metaproxy_1;
37 namespace yf = mp::filter;
38
39 namespace metaproxy_1 {
40     namespace filter {
41         class HttpRewrite::Replace {
42         public:
43             boost::regex re;
44             boost::smatch what;
45             std::string recipe;
46             std::map<int, std::string> group_index;
47             std::string sub_vars(
48                 const std::map<std::string, std::string> & vars) const;
49             void parse_groups(std::string pattern);
50         };
51
52         class HttpRewrite::Rule {
53         public:
54             std::list<Replace> replace_list;
55             const std::string test_patterns(
56                 std::map<std::string, std::string> & vars,
57                 const std::string & txt);
58         };
59         class HttpRewrite::Within {
60         public:
61             std::string header;
62             std::string attr;
63             std::string tag;
64             bool reqline;
65             RulePtr rule;
66         };
67
68         class HttpRewrite::Phase {
69         public:
70             Phase();
71             std::list<Within> within_list;
72             int m_verbose;
73             void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
74                 std::map<std::string, std::string> & vars) const;
75             void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
76                 std::map<std::string, std::string> & vars) const;
77             void rewrite_body(mp::odr & o,
78                 char **content_buf, int *content_len,
79                 std::map<std::string, std::string> & vars) const;
80         };
81         class HttpRewrite::Event : public HTMLParserEvent {
82             void openTagStart(const char *tag, int tag_len);
83             void anyTagEnd(const char *tag, int tag_len, int close_it);
84             void attribute(const char *tag, int tag_len,
85                            const char *attr, int attr_len,
86                            const char *value, int val_len,
87                            const char *sep);
88             void closeTag(const char *tag, int tag_len);
89             void text(const char *value, int len);
90             const Phase *m_phase;
91             WRBUF m_w;
92             std::stack<std::list<Within>::const_iterator> s_within;
93             std::map<std::string, std::string> &m_vars;
94         public:
95             Event(const Phase *p, std::map<std::string, std::string> &vars);
96             ~Event();
97             const char *result();
98         };
99     }
100 }
101
102 yf::HttpRewrite::HttpRewrite() :
103     req_phase(new Phase), res_phase(new Phase)
104 {
105 }
106
107 yf::HttpRewrite::~HttpRewrite()
108 {
109 }
110
111 void yf::HttpRewrite::process(mp::Package & package) const
112 {
113     yaz_log(YLOG_LOG, "HttpRewrite begins....");
114     Z_GDU *gdu = package.request().get();
115     //map of request/response vars
116     std::map<std::string, std::string> vars;
117     //we have an http req
118     if (gdu && gdu->which == Z_GDU_HTTP_Request)
119     {
120         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
121         mp::odr o;
122         req_phase->rewrite_reqline(o, hreq, vars);
123         yaz_log(YLOG_LOG, ">> Request headers");
124         req_phase->rewrite_headers(o, hreq->headers, vars);
125         req_phase->rewrite_body(o,
126                 &hreq->content_buf, &hreq->content_len, vars);
127         package.request() = gdu;
128     }
129     package.move();
130     gdu = package.response().get();
131     if (gdu && gdu->which == Z_GDU_HTTP_Response)
132     {
133         Z_HTTP_Response *hres = gdu->u.HTTP_Response;
134         yaz_log(YLOG_LOG, "Response code %d", hres->code);
135         mp::odr o;
136         yaz_log(YLOG_LOG, "<< Respose headers");
137         res_phase->rewrite_headers(o, hres->headers, vars);
138         res_phase->rewrite_body(o, &hres->content_buf,
139                 &hres->content_len, vars);
140         package.response() = gdu;
141     }
142 }
143
144 void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
145         Z_HTTP_Request *hreq,
146         std::map<std::string, std::string> & vars) const
147 {
148     //rewrite the request line
149     std::string path;
150     if (strstr(hreq->path, "http://") == hreq->path)
151     {
152         yaz_log(YLOG_LOG, "Path in the method line is absolute, "
153             "possibly a proxy request");
154         path += hreq->path;
155     }
156     else
157     {
158         //TODO what about proto
159         const char *host = z_HTTP_header_lookup(hreq->headers, "Host");
160         if (!host)
161             return;
162
163         path += "http://";
164         path += host;
165         path += hreq->path;
166     }
167
168
169     std::list<Within>::const_iterator it = within_list.begin();
170     for (; it != within_list.end(); it++)
171         if (it->reqline)
172         {
173             RulePtr rule = it->rule;
174             yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
175             std::string npath = rule->test_patterns(vars, path);
176             if (!npath.empty())
177             {
178                 yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
179                 hreq->path = odr_strdup(o, npath.c_str());
180             }
181         }
182 }
183
184 void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
185         Z_HTTP_Header *headers,
186         std::map<std::string, std::string> & vars) const
187 {
188     for (Z_HTTP_Header *header = headers; header; header = header->next)
189     {
190         std::list<Within>::const_iterator it = within_list.begin();
191         for (; it != within_list.end(); it++)
192         {
193             if (it->header.length() > 0 &&
194                 yaz_strcasecmp(it->header.c_str(), header->name) == 0)
195             {
196                 std::string sheader(header->name);
197                 sheader += ": ";
198                 sheader += header->value;
199
200                 RulePtr rule = it->rule;
201                 std::string out = rule->test_patterns(vars, sheader);
202                 if (!out.empty())
203                 {
204                     size_t pos = out.find(": ");
205                     if (pos == std::string::npos)
206                     {
207                         yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
208                         continue;
209                     }
210                     header->name = odr_strdup(o, out.substr(0, pos).c_str());
211                     header->value = odr_strdup(o,
212                                                out.substr(pos + 2,
213                                                           std::string::npos).c_str());
214                 }
215             }
216         }
217     }
218 }
219
220 void yf::HttpRewrite::Phase::rewrite_body(mp::odr & o,
221         char **content_buf,
222         int *content_len,
223         std::map<std::string, std::string> & vars) const
224 {
225     if (*content_buf)
226     {
227         int i;
228         for (i = 0; i < *content_len; i++)
229             if ((*content_buf)[i] == 0)
230                 return;  // binary content. skip
231
232         HTMLParser parser;
233         Event ev(this, vars);
234
235         parser.set_verbose(m_verbose);
236
237         std::string buf(*content_buf, *content_len);
238
239         parser.parse(ev, buf.c_str());
240         const char *res = ev.result();
241         *content_buf = odr_strdup(o, res);
242         *content_len = strlen(res);
243     }
244 }
245
246 yf::HttpRewrite::Event::Event(const Phase *p,
247                               std::map<std::string, std::string> & vars
248     ) : m_phase(p), m_vars(vars)
249 {
250     m_w = wrbuf_alloc();
251 }
252
253 yf::HttpRewrite::Event::~Event()
254 {
255     wrbuf_destroy(m_w);
256 }
257
258 const char *yf::HttpRewrite::Event::result()
259 {
260     return wrbuf_cstr(m_w);
261 }
262
263 void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len)
264 {
265     wrbuf_putc(m_w, '<');
266     wrbuf_write(m_w, tag, tag_len);
267
268     std::string t(tag, tag_len);
269     std::list<Within>::const_iterator it = m_phase->within_list.begin();
270     for (; it != m_phase->within_list.end(); it++)
271     {
272         if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(),
273                                                    t.c_str()) == 0)
274         {
275             std::vector<std::string> attr;
276             boost::split(attr, it->attr, boost::is_any_of(","));
277             size_t i;
278             for (i = 0; i < attr.size(); i++)
279             {
280                 if (attr[i].compare("#text") == 0)
281                 {
282                     s_within.push(it);
283                     return;
284                 }
285             }
286         }
287     }
288 }
289
290 void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len,
291                                        int close_it)
292 {
293     if (close_it)
294     {
295         if (!s_within.empty())
296         {
297             std::list<Within>::const_iterator it = s_within.top();
298             std::string t(tag, tag_len);
299             if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
300                 s_within.pop();
301         }
302     }
303     if (close_it)
304         wrbuf_putc(m_w, '/');
305     wrbuf_putc(m_w, '>');
306 }
307
308 void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len,
309                                        const char *attr, int attr_len,
310                                        const char *value, int val_len,
311                                        const char *sep)
312 {
313     std::list<Within>::const_iterator it = m_phase->within_list.begin();
314     bool subst = false;
315
316     for (; it != m_phase->within_list.end(); it++)
317     {
318         std::string t(tag, tag_len);
319         if (it->tag.length() == 0 ||
320             yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
321         {
322             std::string a(attr, attr_len);
323             std::vector<std::string> attr;
324             boost::split(attr, it->attr, boost::is_any_of(","));
325             size_t i;
326             for (i = 0; i < attr.size(); i++)
327             {
328                 if (attr[i].compare("#text") &&
329                     yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0)
330                     subst = true;
331             }
332         }
333         if (subst)
334             break;
335     }
336
337     wrbuf_putc(m_w, ' ');
338     wrbuf_write(m_w, attr, attr_len);
339     if (value)
340     {
341         wrbuf_puts(m_w, "=");
342         wrbuf_puts(m_w, sep);
343
344         std::string output;
345         if (subst)
346         {
347             std::string input(value, val_len);
348             output = it->rule->test_patterns(m_vars, input);
349         }
350         if (output.empty())
351             wrbuf_write(m_w, value, val_len);
352         else
353             wrbuf_puts(m_w, output.c_str());
354         wrbuf_puts(m_w, sep);
355     }
356 }
357
358 void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len)
359 {
360     if (!s_within.empty())
361     {
362         std::list<Within>::const_iterator it = s_within.top();
363         std::string t(tag, tag_len);
364         if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
365             s_within.pop();
366     }
367     wrbuf_puts(m_w, "</");
368     wrbuf_write(m_w, tag, tag_len);
369 }
370
371 void yf::HttpRewrite::Event::text(const char *value, int len)
372 {
373     std::list<Within>::const_iterator it = m_phase->within_list.end();
374     if (!s_within.empty())
375         it = s_within.top();
376     std::string output;
377     if (it != m_phase->within_list.end())
378     {
379         std::string input(value, len);
380         output = it->rule->test_patterns(m_vars, input);
381     }
382     if (output.empty())
383         wrbuf_write(m_w, value, len);
384     else
385         wrbuf_puts(m_w, output.c_str());
386 }
387
388 const std::string yf::HttpRewrite::Rule::test_patterns(
389         std::map<std::string, std::string> & vars,
390         const std::string & txt)
391 {
392     std::string out;
393     std::string::const_iterator start, end;
394     start = txt.begin();
395     end = txt.end();
396     while (1)
397     {
398         std::list<Replace>::iterator bit = replace_list.end();
399         {
400             std::string::const_iterator best_pos = txt.end();
401             std::list<Replace>::iterator it = replace_list.begin();
402             for (; it != replace_list.end(); it++)
403             {
404                 if (regex_search(start, end, it->what, it->re))
405                 {
406                     if (it->what[0].first < best_pos)
407                     {
408                         best_pos = it->what[0].first;
409                         bit = it;
410                     }
411                 }
412             }
413             if (bit == replace_list.end())
414                 break;
415         }
416
417         size_t i;
418         for (i = 1; i < bit->what.size(); ++i)
419         {
420             //check if the group is named
421             std::map<int, std::string>::const_iterator git
422                 = bit->group_index.find(i);
423             if (git != bit->group_index.end())
424             {   //it is
425                 vars[git->second] = bit->what[i];
426             }
427
428         }
429         //prepare replacement string
430         std::string rvalue = bit->sub_vars(vars);
431         yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
432                 bit->what.str(0).c_str(), rvalue.c_str());
433         out.append(start, bit->what[0].first);
434         out.append(rvalue);
435         start = bit->what[0].second; //move search forward
436     }
437     if (start != txt.begin())
438         out.append(start, end);
439     return out;
440 }
441
442 void yf::HttpRewrite::Replace::parse_groups(std::string pattern)
443 {
444     int gnum = 0;
445     bool esc = false;
446     const std::string &str = pattern;
447     std::string res;
448     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
449     for (size_t i = 0; i < str.size(); ++i)
450     {
451         res += str[i];
452         if (!esc && str[i] == '\\')
453         {
454             esc = true;
455             continue;
456         }
457         if (!esc && str[i] == '(') //group starts
458         {
459             gnum++;
460             if (i+1 < str.size() && str[i+1] == '?') //group with attrs
461             {
462                 i++;
463                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
464                 {
465                     if (gnum > 0) gnum--;
466                     res += str[i];
467                     i++;
468                     res += str[i];
469                     continue;
470                 }
471                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
472                     i++;
473                 if (i+1 < str.size() && str[i+1] == '<') //named
474                 {
475                     i++;
476                     std::string gname;
477                     bool term = false;
478                     while (++i < str.size())
479                     {
480                         if (str[i] == '>') { term = true; break; }
481                         if (!isalnum(str[i]))
482                             throw mp::filter::FilterException
483                                 ("Only alphanumeric chars allowed, found "
484                                  " in '"
485                                  + str
486                                  + "' at "
487                                  + boost::lexical_cast<std::string>(i));
488                         gname += str[i];
489                     }
490                     if (!term)
491                         throw mp::filter::FilterException
492                             ("Unterminated group name '" + gname
493                              + " in '" + str +"'");
494                     group_index[gnum] = gname;
495                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
496                             gname.c_str(), gnum);
497                 }
498             }
499         }
500         esc = false;
501     }
502     re = res;
503 }
504
505 std::string yf::HttpRewrite::Replace::sub_vars(
506     const std::map<std::string, std::string> & vars) const
507 {
508     std::string out;
509     bool esc = false;
510     const std::string & in = recipe;
511     for (size_t i = 0; i < in.size(); ++i)
512     {
513         if (!esc && in[i] == '\\')
514         {
515             esc = true;
516             continue;
517         }
518         if (!esc && in[i] == '$') //var
519         {
520             if (i+1 < in.size() && in[i+1] == '{') //ref prefix
521             {
522                 ++i;
523                 std::string name;
524                 bool term = false;
525                 while (++i < in.size())
526                 {
527                     if (in[i] == '}') { term = true; break; }
528                     name += in[i];
529                 }
530                 if (!term) throw mp::filter::FilterException
531                     ("Unterminated var ref in '"+in+"' at "
532                      + boost::lexical_cast<std::string>(i));
533                 std::map<std::string, std::string>::const_iterator it
534                     = vars.find(name);
535                 if (it != vars.end())
536                 {
537                     out += it->second;
538                 }
539             }
540             else
541             {
542                 throw mp::filter::FilterException
543                     ("Malformed or trimmed var ref in '"
544                      +in+"' at "+boost::lexical_cast<std::string>(i));
545             }
546             continue;
547         }
548         //passthru
549         out += in[i];
550         esc = false;
551     }
552     return out;
553 }
554
555 yf::HttpRewrite::Phase::Phase() : m_verbose(0)
556 {
557 }
558
559 void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
560 {
561     static const char *names[2] = { "verbose", 0 };
562     std::string values[1];
563     values[0] = "0";
564     mp::xml::parse_attr(ptr, names, values);
565
566     phase.m_verbose = atoi(values[0].c_str());
567
568     std::map<std::string, RulePtr > rules;
569     for (ptr = ptr->children; ptr; ptr = ptr->next)
570     {
571         if (ptr->type != XML_ELEMENT_NODE)
572             continue;
573         else if (!strcmp((const char *) ptr->name, "rule"))
574         {
575             static const char *names[2] = { "name", 0 };
576             std::string values[1];
577             values[0] = "default";
578             mp::xml::parse_attr(ptr, names, values);
579
580             RulePtr rule(new Rule);
581             for (xmlNode *p = ptr->children; p; p = p->next)
582             {
583                 if (p->type != XML_ELEMENT_NODE)
584                     continue;
585                 if (!strcmp((const char *) p->name, "rewrite"))
586                 {
587                     Replace replace;
588                     std::string from;
589                     const struct _xmlAttr *attr;
590                     for (attr = p->properties; attr; attr = attr->next)
591                     {
592                         if (!strcmp((const char *) attr->name,  "from"))
593                             from = mp::xml::get_text(attr->children);
594                         else if (!strcmp((const char *) attr->name,  "to"))
595                             replace.recipe = mp::xml::get_text(attr->children);
596                         else
597                             throw mp::filter::FilterException
598                                 ("Bad attribute "
599                                  + std::string((const char *) attr->name)
600                                  + " in rewrite section of http_rewrite");
601                     }
602                     yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
603                             from.c_str(), replace.recipe.c_str());
604                     if (!from.empty())
605                     {
606                         replace.parse_groups(from);
607                         rule->replace_list.push_back(replace);
608                     }
609                 }
610                 else
611                     throw mp::filter::FilterException
612                         ("Bad element "
613                          + std::string((const char *) p->name)
614                          + " in http_rewrite filter");
615             }
616             rules[values[0]] = rule;
617         }
618         else if (!strcmp((const char *) ptr->name, "within"))
619         {
620             static const char *names[6] =
621                 { "header", "attr", "tag", "rule", "reqline", 0 };
622             std::string values[5];
623             mp::xml::parse_attr(ptr, names, values);
624             Within w;
625             w.header = values[0];
626             w.attr = values[1];
627             w.tag = values[2];
628             std::map<std::string,RulePtr>::const_iterator it =
629                 rules.find(values[3]);
630             if (it == rules.end())
631                 throw mp::filter::FilterException
632                     ("Reference to non-existing rule '" + values[3] +
633                      "' in http_rewrite filter");
634             w.rule = it->second;
635             w.reqline = values[4] == "1";
636             phase.within_list.push_back(w);
637         }
638         else
639         {
640             throw mp::filter::FilterException
641                 ("Bad element "
642                  + std::string((const char *) ptr->name)
643                  + " in http_rewrite filter");
644         }
645     }
646 }
647
648 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
649         const char *path)
650 {
651     for (ptr = ptr->children; ptr; ptr = ptr->next)
652     {
653         if (ptr->type != XML_ELEMENT_NODE)
654             continue;
655         else if (!strcmp((const char *) ptr->name, "request"))
656         {
657             configure_phase(ptr, *req_phase);
658         }
659         else if (!strcmp((const char *) ptr->name, "response"))
660         {
661             configure_phase(ptr, *res_phase);
662         }
663         else
664         {
665             throw mp::filter::FilterException
666                 ("Bad element "
667                  + std::string((const char *) ptr->name)
668                  + " in http_rewrite1 filter");
669         }
670     }
671 }
672
673 static mp::filter::Base* filter_creator()
674 {
675     return new mp::filter::HttpRewrite;
676 }
677
678 extern "C" {
679     struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
680         0,
681         "http_rewrite",
682         filter_creator
683     };
684 }
685
686
687 /*
688  * Local variables:
689  * c-basic-offset: 4
690  * c-file-style: "Stroustrup"
691  * indent-tabs-mode: nil
692  * End:
693  * vim: shiftwidth=4 tabstop=8 expandtab
694  */
695