Revise HTML parser; keep spelling
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
1 /* This file is part of Metaproxy.
2    Copyright (C) 2005-2013 Index Data
3
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19 #include "config.hpp"
20 #include <metaproxy/filter.hpp>
21 #include <metaproxy/package.hpp>
22 #include <metaproxy/util.hpp>
23 #include "filter_http_rewrite.hpp"
24 #include "html_parser.hpp"
25
26 #include <yaz/zgdu.h>
27 #include <yaz/log.h>
28
29 #include <boost/regex.hpp>
30 #include <boost/lexical_cast.hpp>
31 #include <boost/algorithm/string.hpp>
32
33 #include <map>
34
35 namespace mp = metaproxy_1;
36 namespace yf = mp::filter;
37
38 namespace metaproxy_1 {
39     namespace filter {
40         class HttpRewrite::Replace {
41         public:
42             std::string regex;
43             std::string recipe;
44             std::map<int, std::string> group_index;
45             const std::string search_replace(
46                 std::map<std::string, std::string> & vars,
47                 const std::string & txt) const;
48             std::string sub_vars (
49                 const std::map<std::string, std::string> & vars) const;
50             void parse_groups();
51         };
52
53         class HttpRewrite::Rule {
54         public:
55             std::list<Replace> replace_list;
56             const std::string test_patterns(
57                 std::map<std::string, std::string> & vars,
58                 const std::string & txt) const;
59         };
60         class HttpRewrite::Within {
61         public:
62             std::string header;
63             std::string attr;
64             std::string tag;
65             bool reqline;
66             RulePtr rule;
67         };
68
69         class HttpRewrite::Phase {
70         public:
71             std::list<Within> within_list;
72             void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
73                 std::map<std::string, std::string> & vars) const;
74             void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
75                 std::map<std::string, std::string> & vars) const;
76             void rewrite_body(mp::odr & o,
77                 char **content_buf, int *content_len,
78                 std::map<std::string, std::string> & vars) const;
79         };
80         class HttpRewrite::Event : public HTMLParserEvent {
81             void openTagStart(const char *tag, int tag_len);
82             void anyTagEnd(const char *tag, int tag_len, int close_it);
83             void attribute(const char *tag, int tag_len,
84                            const char *attr, int attr_len,
85                            const char *value, int val_len);
86             void closeTag(const char *tag, int tag_len);
87             void text(const char *value, int len);
88             const Phase *m_phase;
89             WRBUF m_w;
90             std::list<Within>::const_iterator enabled_within;
91             std::map<std::string, std::string> &m_vars;
92         public:
93             Event(const Phase *p, std::map<std::string, std::string> &vars);
94             ~Event();
95             const char *result();
96         };
97     }
98 }
99
100 yf::HttpRewrite::HttpRewrite() :
101     req_phase(new Phase), res_phase(new Phase)
102 {
103 }
104
105 yf::HttpRewrite::~HttpRewrite()
106 {
107 }
108
109 void yf::HttpRewrite::process(mp::Package & package) const
110 {
111     yaz_log(YLOG_LOG, "HttpRewrite begins....");
112     Z_GDU *gdu = package.request().get();
113     //map of request/response vars
114     std::map<std::string, std::string> vars;
115     //we have an http req
116     if (gdu && gdu->which == Z_GDU_HTTP_Request)
117     {
118         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
119         mp::odr o;
120         req_phase->rewrite_reqline(o, hreq, vars);
121         yaz_log(YLOG_LOG, ">> Request headers");
122         req_phase->rewrite_headers(o, hreq->headers, vars);
123         req_phase->rewrite_body(o,
124                 &hreq->content_buf, &hreq->content_len, vars);
125         package.request() = gdu;
126     }
127     package.move();
128     gdu = package.response().get();
129     if (gdu && gdu->which == Z_GDU_HTTP_Response)
130     {
131         Z_HTTP_Response *hres = gdu->u.HTTP_Response;
132         yaz_log(YLOG_LOG, "Response code %d", hres->code);
133         mp::odr o;
134         yaz_log(YLOG_LOG, "<< Respose headers");
135         res_phase->rewrite_headers(o, hres->headers, vars);
136         res_phase->rewrite_body(o, &hres->content_buf,
137                 &hres->content_len, vars);
138         package.response() = gdu;
139     }
140 }
141
142 void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
143         Z_HTTP_Request *hreq,
144         std::map<std::string, std::string> & vars) const
145 {
146     //rewrite the request line
147     std::string path;
148     if (strstr(hreq->path, "http://") == hreq->path)
149     {
150         yaz_log(YLOG_LOG, "Path in the method line is absolute, "
151             "possibly a proxy request");
152         path += hreq->path;
153     }
154     else
155     {
156         //TODO what about proto
157         path += "http://";
158         path += z_HTTP_header_lookup(hreq->headers, "Host");
159         path += hreq->path;
160     }
161
162
163     std::list<Within>::const_iterator it = within_list.begin();
164     for (; it != within_list.end(); it++)
165         if (it->reqline)
166         {
167             RulePtr rule = it->rule;
168             yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
169             std::string npath = rule->test_patterns(vars, path);
170             if (!npath.empty())
171             {
172                 yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
173                 hreq->path = odr_strdup(o, npath.c_str());
174             }
175         }
176 }
177
178 void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
179         Z_HTTP_Header *headers,
180         std::map<std::string, std::string> & vars) const
181 {
182     for (Z_HTTP_Header *header = headers; header; header = header->next)
183     {
184         std::list<Within>::const_iterator it = within_list.begin();
185         for (; it != within_list.end(); it++)
186         {
187             if (it->header.length() > 0 &&
188                 yaz_strcasecmp(it->header.c_str(), header->name) == 0)
189             {
190                 std::string sheader(header->name);
191                 sheader += ": ";
192                 sheader += header->value;
193
194                 RulePtr rule = it->rule;
195                 std::string out = rule->test_patterns(vars, sheader);
196                 if (!out.empty())
197                 {
198                     size_t pos = out.find(": ");
199                     if (pos == std::string::npos)
200                     {
201                         yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
202                         continue;
203                     }
204                     header->name = odr_strdup(o, out.substr(0, pos).c_str());
205                     header->value = odr_strdup(o,
206                                                out.substr(pos + 2,
207                                                           std::string::npos).c_str());
208                 }
209             }
210         }
211     }
212 }
213
214 void yf::HttpRewrite::Phase::rewrite_body(mp::odr & o,
215         char **content_buf,
216         int *content_len,
217         std::map<std::string, std::string> & vars) const
218 {
219     if (*content_buf)
220     {
221         int i;
222         for (i = 0; i < *content_len; i++)
223             if ((*content_buf)[i] == 0)
224                 return;  // binary content. skip
225
226         HTMLParser parser;
227         Event ev(this, vars);
228         std::string buf(*content_buf, *content_len);
229
230         parser.parse(ev, buf.c_str());
231         const char *res = ev.result();
232         *content_buf = odr_strdup(o, res);
233         *content_len = strlen(res);
234     }
235 }
236
237 yf::HttpRewrite::Event::Event(const Phase *p,
238                               std::map<std::string, std::string> & vars
239     ) : m_phase(p), m_vars(vars)
240 {
241     m_w = wrbuf_alloc();
242     enabled_within = m_phase->within_list.end();
243 }
244
245 yf::HttpRewrite::Event::~Event()
246 {
247     wrbuf_destroy(m_w);
248 }
249
250 const char *yf::HttpRewrite::Event::result()
251 {
252     return wrbuf_cstr(m_w);
253 }
254
255 void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len)
256 {
257     // check if there is <within tag="x" .. />
258     if (enabled_within == m_phase->within_list.end())
259     {
260         std::string t(tag, tag_len);
261         std::list<Within>::const_iterator it =
262             m_phase->within_list.begin();
263         for (; it != m_phase->within_list.end(); it++)
264         {
265             if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(),
266                                                        t.c_str()) == 0)
267             {
268                 enabled_within = it;
269             }
270         }
271     }
272     wrbuf_putc(m_w, '<');
273     wrbuf_write(m_w, tag, tag_len);
274 }
275
276 void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len,
277                                        int close_it)
278 {
279     if (close_it)
280     {
281         std::list<Within>::const_iterator it = enabled_within;
282         if (it != m_phase->within_list.end())
283         {
284             std::string t(tag, tag_len);
285             if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
286             {
287                 enabled_within = m_phase->within_list.end();
288             }
289         }
290     }
291     if (close_it)
292         wrbuf_putc(m_w, '/');
293     wrbuf_putc(m_w, '>');
294 }
295
296 void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len,
297                                        const char *attr, int attr_len,
298                                        const char *value, int val_len)
299 {
300     std::list<Within>::const_iterator it = m_phase->within_list.begin();
301     bool subst = false;
302
303     for (; it != m_phase->within_list.end(); it++)
304     {
305         std::string t(tag, tag_len);
306         if (it->tag.length() == 0 ||
307             yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
308         {
309             std::string a(attr, attr_len);
310             std::vector<std::string> attr;
311             boost::split(attr, it->attr, boost::is_any_of(","));
312             size_t i;
313             for (i = 0; i < attr.size(); i++)
314             {
315                 if (attr[i].compare("#text") &&
316                     yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0)
317                     subst = true;
318             }
319         }
320         if (subst)
321             break;
322     }
323
324     wrbuf_putc(m_w, ' ');
325     wrbuf_write(m_w, attr, attr_len);
326     wrbuf_puts(m_w, "=\"");
327
328     std::string output;
329     if (subst)
330     {
331         std::string input(value, val_len);
332         output = it->rule->test_patterns(m_vars, input);
333     }
334     if (output.empty())
335         wrbuf_write(m_w, value, val_len);
336     else
337         wrbuf_puts(m_w, output.c_str());
338     wrbuf_puts(m_w, "\"");
339 }
340
341 void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len)
342 {
343     std::list<Within>::const_iterator it = enabled_within;
344     if (it != m_phase->within_list.end())
345     {
346         std::string t(tag, tag_len);
347         if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
348         {
349             enabled_within = m_phase->within_list.end();
350         }
351     }
352     wrbuf_puts(m_w, "</");
353     wrbuf_write(m_w, tag, tag_len);
354 }
355
356 void yf::HttpRewrite::Event::text(const char *value, int len)
357 {
358     std::list<Within>::const_iterator it = enabled_within;
359     bool subst = false;
360
361     if (it != m_phase->within_list.end())
362     {
363         subst = true;
364         if (it->attr.length() > 0)
365         {
366             subst = false;
367             std::vector<std::string> attr;
368             boost::split(attr, it->attr, boost::is_any_of(","));
369             size_t i;
370             for (i = 0; i < attr.size(); i++)
371             {
372                 if (attr[i].compare("#text") == 0)
373                 {
374                     subst = true;
375                 }
376             }
377         }
378     }
379     std::string output;
380     if (subst)
381     {
382         std::string input(value, len);
383         output = it->rule->test_patterns(m_vars, input);
384     }
385     if (output.empty())
386         wrbuf_write(m_w, value, len);
387     else
388         wrbuf_puts(m_w, output.c_str());
389 }
390
391
392 /**
393  * Tests pattern from the vector in order and executes recipe on
394  the first match.
395  */
396 const std::string yf::HttpRewrite::Rule::test_patterns(
397         std::map<std::string, std::string> & vars,
398         const std::string & txt) const
399 {
400     std::list<Replace>::const_iterator it = replace_list.begin();
401
402     for (; it != replace_list.end(); it++)
403     {
404         std::string out = it->search_replace(vars, txt);
405         if (!out.empty()) return out;
406     }
407     return "";
408 }
409
410 const std::string yf::HttpRewrite::Replace::search_replace(
411         std::map<std::string, std::string> & vars,
412         const std::string & txt) const
413 {
414     //exec regex against value
415     boost::regex re(regex);
416     boost::smatch what;
417     std::string::const_iterator start, end;
418     start = txt.begin();
419     end = txt.end();
420     std::string out;
421     while (regex_search(start, end, what, re)) //find next full match
422     {
423         size_t i;
424         for (i = 1; i < what.size(); ++i)
425         {
426             //check if the group is named
427             std::map<int, std::string>::const_iterator it
428                 = group_index.find(i);
429             if (it != group_index.end())
430             {   //it is
431                 if (!what[i].str().empty())
432                     vars[it->second] = what[i];
433             }
434
435         }
436         //prepare replacement string
437         std::string rvalue = sub_vars(vars);
438         yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
439                 what.str(0).c_str(), rvalue.c_str());
440         out.append(start, what[0].first);
441         out.append(rvalue);
442         start = what[0].second; //move search forward
443     }
444     //if we had a match cat the last part
445     if (start != txt.begin())
446         out.append(start, end);
447     return out;
448 }
449
450 void yf::HttpRewrite::Replace::parse_groups()
451 {
452     int gnum = 0;
453     bool esc = false;
454     const std::string & str = regex;
455     std::string res;
456     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
457     for (size_t i = 0; i < str.size(); ++i)
458     {
459         res += str[i];
460         if (!esc && str[i] == '\\')
461         {
462             esc = true;
463             continue;
464         }
465         if (!esc && str[i] == '(') //group starts
466         {
467             gnum++;
468             if (i+1 < str.size() && str[i+1] == '?') //group with attrs
469             {
470                 i++;
471                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
472                 {
473                     if (gnum > 0) gnum--;
474                     res += str[i];
475                     i++;
476                     res += str[i];
477                     continue;
478                 }
479                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
480                     i++;
481                 if (i+1 < str.size() && str[i+1] == '<') //named
482                 {
483                     i++;
484                     std::string gname;
485                     bool term = false;
486                     while (++i < str.size())
487                     {
488                         if (str[i] == '>') { term = true; break; }
489                         if (!isalnum(str[i]))
490                             throw mp::filter::FilterException
491                                 ("Only alphanumeric chars allowed, found "
492                                  " in '"
493                                  + str
494                                  + "' at "
495                                  + boost::lexical_cast<std::string>(i));
496                         gname += str[i];
497                     }
498                     if (!term)
499                         throw mp::filter::FilterException
500                             ("Unterminated group name '" + gname
501                              + " in '" + str +"'");
502                     group_index[gnum] = gname;
503                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
504                             gname.c_str(), gnum);
505                 }
506             }
507         }
508         esc = false;
509     }
510     regex = res;
511 }
512
513 std::string yf::HttpRewrite::Replace::sub_vars (
514         const std::map<std::string, std::string> & vars) const
515 {
516     std::string out;
517     bool esc = false;
518     const std::string & in = recipe;
519     for (size_t i = 0; i < in.size(); ++i)
520     {
521         if (!esc && in[i] == '\\')
522         {
523             esc = true;
524             continue;
525         }
526         if (!esc && in[i] == '$') //var
527         {
528             if (i+1 < in.size() && in[i+1] == '{') //ref prefix
529             {
530                 ++i;
531                 std::string name;
532                 bool term = false;
533                 while (++i < in.size())
534                 {
535                     if (in[i] == '}') { term = true; break; }
536                     name += in[i];
537                 }
538                 if (!term) throw mp::filter::FilterException
539                     ("Unterminated var ref in '"+in+"' at "
540                      + boost::lexical_cast<std::string>(i));
541                 std::map<std::string, std::string>::const_iterator it
542                     = vars.find(name);
543                 if (it != vars.end())
544                 {
545                     out += it->second;
546                 }
547             }
548             else
549             {
550                 throw mp::filter::FilterException
551                     ("Malformed or trimmed var ref in '"
552                      +in+"' at "+boost::lexical_cast<std::string>(i));
553             }
554             continue;
555         }
556         //passthru
557         out += in[i];
558         esc = false;
559     }
560     return out;
561 }
562
563
564 void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
565 {
566     std::map<std::string, RulePtr > rules;
567     for (ptr = ptr->children; ptr; ptr = ptr->next)
568     {
569         if (ptr->type != XML_ELEMENT_NODE)
570             continue;
571         else if (!strcmp((const char *) ptr->name, "rule"))
572         {
573             static const char *names[2] = { "name", 0 };
574             std::string values[1];
575             values[0] = "default";
576             mp::xml::parse_attr(ptr, names, values);
577
578             RulePtr rule(new Rule);
579             for (xmlNode *p = ptr->children; p; p = p->next)
580             {
581                 if (p->type != XML_ELEMENT_NODE)
582                     continue;
583                 if (!strcmp((const char *) p->name, "rewrite"))
584                 {
585                     Replace replace;
586                     const struct _xmlAttr *attr;
587                     for (attr = p->properties; attr; attr = attr->next)
588                     {
589                         if (!strcmp((const char *) attr->name,  "from"))
590                             replace.regex = mp::xml::get_text(attr->children);
591                         else if (!strcmp((const char *) attr->name,  "to"))
592                             replace.recipe = mp::xml::get_text(attr->children);
593                         else
594                             throw mp::filter::FilterException
595                                 ("Bad attribute "
596                                  + std::string((const char *) attr->name)
597                                  + " in rewrite section of http_rewrite");
598                     }
599                     yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
600                             replace.regex.c_str(), replace.recipe.c_str());
601                     replace.parse_groups();
602                     if (!replace.regex.empty())
603                         rule->replace_list.push_back(replace);
604                 }
605                 else
606                     throw mp::filter::FilterException
607                         ("Bad element "
608                          + std::string((const char *) p->name)
609                          + " in http_rewrite filter");
610             }
611             rules[values[0]] = rule;
612         }
613         else if (!strcmp((const char *) ptr->name, "within"))
614         {
615             static const char *names[6] =
616                 { "header", "attr", "tag", "rule", "reqline", 0 };
617             std::string values[5];
618             mp::xml::parse_attr(ptr, names, values);
619             Within w;
620             w.header = values[0];
621             w.attr = values[1];
622             w.tag = values[2];
623             std::map<std::string,RulePtr>::const_iterator it =
624                 rules.find(values[3]);
625             if (it == rules.end())
626                 throw mp::filter::FilterException
627                     ("Reference to non-existing rule '" + values[3] +
628                      "' in http_rewrite filter");
629             w.rule = it->second;
630             w.reqline = values[4] == "1";
631             phase.within_list.push_back(w);
632         }
633         else
634         {
635             throw mp::filter::FilterException
636                 ("Bad element "
637                  + std::string((const char *) ptr->name)
638                  + " in http_rewrite filter");
639         }
640     }
641 }
642
643 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
644         const char *path)
645 {
646     for (ptr = ptr->children; ptr; ptr = ptr->next)
647     {
648         if (ptr->type != XML_ELEMENT_NODE)
649             continue;
650         else if (!strcmp((const char *) ptr->name, "request"))
651         {
652             configure_phase(ptr, *req_phase);
653         }
654         else if (!strcmp((const char *) ptr->name, "response"))
655         {
656             configure_phase(ptr, *res_phase);
657         }
658         else
659         {
660             throw mp::filter::FilterException
661                 ("Bad element "
662                  + std::string((const char *) ptr->name)
663                  + " in http_rewrite1 filter");
664         }
665     }
666 }
667
668 static mp::filter::Base* filter_creator()
669 {
670     return new mp::filter::HttpRewrite;
671 }
672
673 extern "C" {
674     struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
675         0,
676         "http_rewrite",
677         filter_creator
678     };
679 }
680
681
682 /*
683  * Local variables:
684  * c-basic-offset: 4
685  * c-file-style: "Stroustrup"
686  * indent-tabs-mode: nil
687  * End:
688  * vim: shiftwidth=4 tabstop=8 expandtab
689  */
690