Option attribute values for HTML parser
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
1 /* This file is part of Metaproxy.
2    Copyright (C) 2005-2013 Index Data
3
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19 #include "config.hpp"
20 #include <metaproxy/filter.hpp>
21 #include <metaproxy/package.hpp>
22 #include <metaproxy/util.hpp>
23 #include "filter_http_rewrite.hpp"
24 #include "html_parser.hpp"
25
26 #include <yaz/zgdu.h>
27 #include <yaz/log.h>
28
29 #include <stack>
30 #include <boost/regex.hpp>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/algorithm/string.hpp>
33
34 #include <map>
35
36 namespace mp = metaproxy_1;
37 namespace yf = mp::filter;
38
39 namespace metaproxy_1 {
40     namespace filter {
41         class HttpRewrite::Replace {
42         public:
43             std::string regex;
44             std::string recipe;
45             std::map<int, std::string> group_index;
46             const std::string search_replace(
47                 std::map<std::string, std::string> & vars,
48                 const std::string & txt) const;
49             std::string sub_vars(
50                 const std::map<std::string, std::string> & vars) const;
51             void parse_groups();
52         };
53
54         class HttpRewrite::Rule {
55         public:
56             std::list<Replace> replace_list;
57             const std::string test_patterns(
58                 std::map<std::string, std::string> & vars,
59                 const std::string & txt) const;
60         };
61         class HttpRewrite::Within {
62         public:
63             std::string header;
64             std::string attr;
65             std::string tag;
66             bool reqline;
67             RulePtr rule;
68         };
69
70         class HttpRewrite::Phase {
71         public:
72             Phase();
73             std::list<Within> within_list;
74             int m_verbose;
75             void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
76                 std::map<std::string, std::string> & vars) const;
77             void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
78                 std::map<std::string, std::string> & vars) const;
79             void rewrite_body(mp::odr & o,
80                 char **content_buf, int *content_len,
81                 std::map<std::string, std::string> & vars) const;
82         };
83         class HttpRewrite::Event : public HTMLParserEvent {
84             void openTagStart(const char *tag, int tag_len);
85             void anyTagEnd(const char *tag, int tag_len, int close_it);
86             void attribute(const char *tag, int tag_len,
87                            const char *attr, int attr_len,
88                            const char *value, int val_len,
89                            const char *sep);
90             void closeTag(const char *tag, int tag_len);
91             void text(const char *value, int len);
92             const Phase *m_phase;
93             WRBUF m_w;
94             std::stack<std::list<Within>::const_iterator> s_within;
95             std::map<std::string, std::string> &m_vars;
96         public:
97             Event(const Phase *p, std::map<std::string, std::string> &vars);
98             ~Event();
99             const char *result();
100         };
101     }
102 }
103
104 yf::HttpRewrite::HttpRewrite() :
105     req_phase(new Phase), res_phase(new Phase)
106 {
107 }
108
109 yf::HttpRewrite::~HttpRewrite()
110 {
111 }
112
113 void yf::HttpRewrite::process(mp::Package & package) const
114 {
115     yaz_log(YLOG_LOG, "HttpRewrite begins....");
116     Z_GDU *gdu = package.request().get();
117     //map of request/response vars
118     std::map<std::string, std::string> vars;
119     //we have an http req
120     if (gdu && gdu->which == Z_GDU_HTTP_Request)
121     {
122         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
123         mp::odr o;
124         req_phase->rewrite_reqline(o, hreq, vars);
125         yaz_log(YLOG_LOG, ">> Request headers");
126         req_phase->rewrite_headers(o, hreq->headers, vars);
127         req_phase->rewrite_body(o,
128                 &hreq->content_buf, &hreq->content_len, vars);
129         package.request() = gdu;
130     }
131     package.move();
132     gdu = package.response().get();
133     if (gdu && gdu->which == Z_GDU_HTTP_Response)
134     {
135         Z_HTTP_Response *hres = gdu->u.HTTP_Response;
136         yaz_log(YLOG_LOG, "Response code %d", hres->code);
137         mp::odr o;
138         yaz_log(YLOG_LOG, "<< Respose headers");
139         res_phase->rewrite_headers(o, hres->headers, vars);
140         res_phase->rewrite_body(o, &hres->content_buf,
141                 &hres->content_len, vars);
142         package.response() = gdu;
143     }
144 }
145
146 void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
147         Z_HTTP_Request *hreq,
148         std::map<std::string, std::string> & vars) const
149 {
150     //rewrite the request line
151     std::string path;
152     if (strstr(hreq->path, "http://") == hreq->path)
153     {
154         yaz_log(YLOG_LOG, "Path in the method line is absolute, "
155             "possibly a proxy request");
156         path += hreq->path;
157     }
158     else
159     {
160         //TODO what about proto
161         path += "http://";
162         path += z_HTTP_header_lookup(hreq->headers, "Host");
163         path += hreq->path;
164     }
165
166
167     std::list<Within>::const_iterator it = within_list.begin();
168     for (; it != within_list.end(); it++)
169         if (it->reqline)
170         {
171             RulePtr rule = it->rule;
172             yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
173             std::string npath = rule->test_patterns(vars, path);
174             if (!npath.empty())
175             {
176                 yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
177                 hreq->path = odr_strdup(o, npath.c_str());
178             }
179         }
180 }
181
182 void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
183         Z_HTTP_Header *headers,
184         std::map<std::string, std::string> & vars) const
185 {
186     for (Z_HTTP_Header *header = headers; header; header = header->next)
187     {
188         std::list<Within>::const_iterator it = within_list.begin();
189         for (; it != within_list.end(); it++)
190         {
191             if (it->header.length() > 0 &&
192                 yaz_strcasecmp(it->header.c_str(), header->name) == 0)
193             {
194                 std::string sheader(header->name);
195                 sheader += ": ";
196                 sheader += header->value;
197
198                 RulePtr rule = it->rule;
199                 std::string out = rule->test_patterns(vars, sheader);
200                 if (!out.empty())
201                 {
202                     size_t pos = out.find(": ");
203                     if (pos == std::string::npos)
204                     {
205                         yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
206                         continue;
207                     }
208                     header->name = odr_strdup(o, out.substr(0, pos).c_str());
209                     header->value = odr_strdup(o,
210                                                out.substr(pos + 2,
211                                                           std::string::npos).c_str());
212                 }
213             }
214         }
215     }
216 }
217
218 void yf::HttpRewrite::Phase::rewrite_body(mp::odr & o,
219         char **content_buf,
220         int *content_len,
221         std::map<std::string, std::string> & vars) const
222 {
223     if (*content_buf)
224     {
225         int i;
226         for (i = 0; i < *content_len; i++)
227             if ((*content_buf)[i] == 0)
228                 return;  // binary content. skip
229
230         HTMLParser parser;
231         Event ev(this, vars);
232
233         parser.set_verbose(m_verbose);
234
235         std::string buf(*content_buf, *content_len);
236
237         parser.parse(ev, buf.c_str());
238         const char *res = ev.result();
239         *content_buf = odr_strdup(o, res);
240         *content_len = strlen(res);
241     }
242 }
243
244 yf::HttpRewrite::Event::Event(const Phase *p,
245                               std::map<std::string, std::string> & vars
246     ) : m_phase(p), m_vars(vars)
247 {
248     m_w = wrbuf_alloc();
249 }
250
251 yf::HttpRewrite::Event::~Event()
252 {
253     wrbuf_destroy(m_w);
254 }
255
256 const char *yf::HttpRewrite::Event::result()
257 {
258     return wrbuf_cstr(m_w);
259 }
260
261 void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len)
262 {
263     wrbuf_putc(m_w, '<');
264     wrbuf_write(m_w, tag, tag_len);
265
266     std::string t(tag, tag_len);
267     std::list<Within>::const_iterator it = m_phase->within_list.begin();
268     for (; it != m_phase->within_list.end(); it++)
269     {
270         if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(),
271                                                    t.c_str()) == 0)
272         {
273             std::vector<std::string> attr;
274             boost::split(attr, it->attr, boost::is_any_of(","));
275             size_t i;
276             for (i = 0; i < attr.size(); i++)
277             {
278                 if (attr[i].compare("#text") == 0)
279                 {
280                     s_within.push(it);
281                     return;
282                 }
283             }
284         }
285     }
286 }
287
288 void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len,
289                                        int close_it)
290 {
291     if (close_it)
292     {
293         if (!s_within.empty())
294         {
295             std::list<Within>::const_iterator it = s_within.top();
296             std::string t(tag, tag_len);
297             if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
298                 s_within.pop();
299         }
300     }
301     if (close_it)
302         wrbuf_putc(m_w, '/');
303     wrbuf_putc(m_w, '>');
304 }
305
306 void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len,
307                                        const char *attr, int attr_len,
308                                        const char *value, int val_len,
309                                        const char *sep)
310 {
311     std::list<Within>::const_iterator it = m_phase->within_list.begin();
312     bool subst = false;
313
314     for (; it != m_phase->within_list.end(); it++)
315     {
316         std::string t(tag, tag_len);
317         if (it->tag.length() == 0 ||
318             yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
319         {
320             std::string a(attr, attr_len);
321             std::vector<std::string> attr;
322             boost::split(attr, it->attr, boost::is_any_of(","));
323             size_t i;
324             for (i = 0; i < attr.size(); i++)
325             {
326                 if (attr[i].compare("#text") &&
327                     yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0)
328                     subst = true;
329             }
330         }
331         if (subst)
332             break;
333     }
334
335     wrbuf_putc(m_w, ' ');
336     wrbuf_write(m_w, attr, attr_len);
337     if (value)
338     {
339         wrbuf_puts(m_w, "=");
340         wrbuf_puts(m_w, sep);
341
342         std::string output;
343         if (subst)
344         {
345             std::string input(value, val_len);
346             output = it->rule->test_patterns(m_vars, input);
347         }
348         if (output.empty())
349             wrbuf_write(m_w, value, val_len);
350         else
351             wrbuf_puts(m_w, output.c_str());
352         wrbuf_puts(m_w, sep);
353     }
354 }
355
356 void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len)
357 {
358     if (!s_within.empty())
359     {
360         std::list<Within>::const_iterator it = s_within.top();
361         std::string t(tag, tag_len);
362         if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
363             s_within.pop();
364     }
365     wrbuf_puts(m_w, "</");
366     wrbuf_write(m_w, tag, tag_len);
367 }
368
369 void yf::HttpRewrite::Event::text(const char *value, int len)
370 {
371     std::list<Within>::const_iterator it = m_phase->within_list.end();
372     if (!s_within.empty())
373         it = s_within.top();
374     std::string output;
375     if (it != m_phase->within_list.end())
376     {
377         std::string input(value, len);
378         output = it->rule->test_patterns(m_vars, input);
379     }
380     if (output.empty())
381         wrbuf_write(m_w, value, len);
382     else
383         wrbuf_puts(m_w, output.c_str());
384 }
385
386
387 /**
388  * Tests pattern from the vector in order and executes recipe on
389  the first match.
390  */
391 const std::string yf::HttpRewrite::Rule::test_patterns(
392         std::map<std::string, std::string> & vars,
393         const std::string & txt) const
394 {
395     std::list<Replace>::const_iterator it = replace_list.begin();
396
397     for (; it != replace_list.end(); it++)
398     {
399         std::string out = it->search_replace(vars, txt);
400         if (!out.empty()) return out;
401     }
402     return "";
403 }
404
405 const std::string yf::HttpRewrite::Replace::search_replace(
406         std::map<std::string, std::string> & vars,
407         const std::string & txt) const
408 {
409     //exec regex against value
410     boost::regex re(regex);
411     boost::smatch what;
412     std::string::const_iterator start, end;
413     start = txt.begin();
414     end = txt.end();
415     std::string out;
416     while (regex_search(start, end, what, re)) //find next full match
417     {
418         size_t i;
419         for (i = 1; i < what.size(); ++i)
420         {
421             //check if the group is named
422             std::map<int, std::string>::const_iterator it
423                 = group_index.find(i);
424             if (it != group_index.end())
425             {   //it is
426                 vars[it->second] = what[i];
427             }
428
429         }
430         //prepare replacement string
431         std::string rvalue = sub_vars(vars);
432         yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
433                 what.str(0).c_str(), rvalue.c_str());
434         out.append(start, what[0].first);
435         out.append(rvalue);
436         start = what[0].second; //move search forward
437     }
438     //if we had a match cat the last part
439     if (start != txt.begin())
440         out.append(start, end);
441     return out;
442 }
443
444 void yf::HttpRewrite::Replace::parse_groups()
445 {
446     int gnum = 0;
447     bool esc = false;
448     const std::string & str = regex;
449     std::string res;
450     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
451     for (size_t i = 0; i < str.size(); ++i)
452     {
453         res += str[i];
454         if (!esc && str[i] == '\\')
455         {
456             esc = true;
457             continue;
458         }
459         if (!esc && str[i] == '(') //group starts
460         {
461             gnum++;
462             if (i+1 < str.size() && str[i+1] == '?') //group with attrs
463             {
464                 i++;
465                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
466                 {
467                     if (gnum > 0) gnum--;
468                     res += str[i];
469                     i++;
470                     res += str[i];
471                     continue;
472                 }
473                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
474                     i++;
475                 if (i+1 < str.size() && str[i+1] == '<') //named
476                 {
477                     i++;
478                     std::string gname;
479                     bool term = false;
480                     while (++i < str.size())
481                     {
482                         if (str[i] == '>') { term = true; break; }
483                         if (!isalnum(str[i]))
484                             throw mp::filter::FilterException
485                                 ("Only alphanumeric chars allowed, found "
486                                  " in '"
487                                  + str
488                                  + "' at "
489                                  + boost::lexical_cast<std::string>(i));
490                         gname += str[i];
491                     }
492                     if (!term)
493                         throw mp::filter::FilterException
494                             ("Unterminated group name '" + gname
495                              + " in '" + str +"'");
496                     group_index[gnum] = gname;
497                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
498                             gname.c_str(), gnum);
499                 }
500             }
501         }
502         esc = false;
503     }
504     regex = res;
505 }
506
507 std::string yf::HttpRewrite::Replace::sub_vars(
508     const std::map<std::string, std::string> & vars) const
509 {
510     std::string out;
511     bool esc = false;
512     const std::string & in = recipe;
513     for (size_t i = 0; i < in.size(); ++i)
514     {
515         if (!esc && in[i] == '\\')
516         {
517             esc = true;
518             continue;
519         }
520         if (!esc && in[i] == '$') //var
521         {
522             if (i+1 < in.size() && in[i+1] == '{') //ref prefix
523             {
524                 ++i;
525                 std::string name;
526                 bool term = false;
527                 while (++i < in.size())
528                 {
529                     if (in[i] == '}') { term = true; break; }
530                     name += in[i];
531                 }
532                 if (!term) throw mp::filter::FilterException
533                     ("Unterminated var ref in '"+in+"' at "
534                      + boost::lexical_cast<std::string>(i));
535                 std::map<std::string, std::string>::const_iterator it
536                     = vars.find(name);
537                 if (it != vars.end())
538                 {
539                     out += it->second;
540                 }
541             }
542             else
543             {
544                 throw mp::filter::FilterException
545                     ("Malformed or trimmed var ref in '"
546                      +in+"' at "+boost::lexical_cast<std::string>(i));
547             }
548             continue;
549         }
550         //passthru
551         out += in[i];
552         esc = false;
553     }
554     return out;
555 }
556
557 yf::HttpRewrite::Phase::Phase() : m_verbose(0)
558 {
559 }
560
561 void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
562 {
563     static const char *names[2] = { "verbose", 0 };
564     std::string values[1];
565     values[0] = "0";
566     mp::xml::parse_attr(ptr, names, values);
567
568     phase.m_verbose = atoi(values[0].c_str());
569
570     std::map<std::string, RulePtr > rules;
571     for (ptr = ptr->children; ptr; ptr = ptr->next)
572     {
573         if (ptr->type != XML_ELEMENT_NODE)
574             continue;
575         else if (!strcmp((const char *) ptr->name, "rule"))
576         {
577             static const char *names[2] = { "name", 0 };
578             std::string values[1];
579             values[0] = "default";
580             mp::xml::parse_attr(ptr, names, values);
581
582             RulePtr rule(new Rule);
583             for (xmlNode *p = ptr->children; p; p = p->next)
584             {
585                 if (p->type != XML_ELEMENT_NODE)
586                     continue;
587                 if (!strcmp((const char *) p->name, "rewrite"))
588                 {
589                     Replace replace;
590                     const struct _xmlAttr *attr;
591                     for (attr = p->properties; attr; attr = attr->next)
592                     {
593                         if (!strcmp((const char *) attr->name,  "from"))
594                             replace.regex = mp::xml::get_text(attr->children);
595                         else if (!strcmp((const char *) attr->name,  "to"))
596                             replace.recipe = mp::xml::get_text(attr->children);
597                         else
598                             throw mp::filter::FilterException
599                                 ("Bad attribute "
600                                  + std::string((const char *) attr->name)
601                                  + " in rewrite section of http_rewrite");
602                     }
603                     yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
604                             replace.regex.c_str(), replace.recipe.c_str());
605                     replace.parse_groups();
606                     if (!replace.regex.empty())
607                         rule->replace_list.push_back(replace);
608                 }
609                 else
610                     throw mp::filter::FilterException
611                         ("Bad element "
612                          + std::string((const char *) p->name)
613                          + " in http_rewrite filter");
614             }
615             rules[values[0]] = rule;
616         }
617         else if (!strcmp((const char *) ptr->name, "within"))
618         {
619             static const char *names[6] =
620                 { "header", "attr", "tag", "rule", "reqline", 0 };
621             std::string values[5];
622             mp::xml::parse_attr(ptr, names, values);
623             Within w;
624             w.header = values[0];
625             w.attr = values[1];
626             w.tag = values[2];
627             std::map<std::string,RulePtr>::const_iterator it =
628                 rules.find(values[3]);
629             if (it == rules.end())
630                 throw mp::filter::FilterException
631                     ("Reference to non-existing rule '" + values[3] +
632                      "' in http_rewrite filter");
633             w.rule = it->second;
634             w.reqline = values[4] == "1";
635             phase.within_list.push_back(w);
636         }
637         else
638         {
639             throw mp::filter::FilterException
640                 ("Bad element "
641                  + std::string((const char *) ptr->name)
642                  + " in http_rewrite filter");
643         }
644     }
645 }
646
647 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
648         const char *path)
649 {
650     for (ptr = ptr->children; ptr; ptr = ptr->next)
651     {
652         if (ptr->type != XML_ELEMENT_NODE)
653             continue;
654         else if (!strcmp((const char *) ptr->name, "request"))
655         {
656             configure_phase(ptr, *req_phase);
657         }
658         else if (!strcmp((const char *) ptr->name, "response"))
659         {
660             configure_phase(ptr, *res_phase);
661         }
662         else
663         {
664             throw mp::filter::FilterException
665                 ("Bad element "
666                  + std::string((const char *) ptr->name)
667                  + " in http_rewrite1 filter");
668         }
669     }
670 }
671
672 static mp::filter::Base* filter_creator()
673 {
674     return new mp::filter::HttpRewrite;
675 }
676
677 extern "C" {
678     struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
679         0,
680         "http_rewrite",
681         filter_creator
682     };
683 }
684
685
686 /*
687  * Local variables:
688  * c-basic-offset: 4
689  * c-file-style: "Stroustrup"
690  * indent-tabs-mode: nil
691  * End:
692  * vim: shiftwidth=4 tabstop=8 expandtab
693  */
694