http_rewrite: content areas
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
1 /* This file is part of Metaproxy.
2    Copyright (C) 2005-2013 Index Data
3
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
#include "config.hpp"
#include <metaproxy/filter.hpp>
#include <metaproxy/package.hpp>
#include <metaproxy/util.hpp>
#include "filter_http_rewrite.hpp"
#include "html_parser.hpp"

#include <yaz/zgdu.h>
#include <yaz/log.h>

#include <boost/regex.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/algorithm/string.hpp>

#include <cctype>
#include <cstdlib>
#include <cstring>
#include <list>
#include <map>
#include <stack>
#include <string>
#include <vector>
35
36 namespace mp = metaproxy_1;
37 namespace yf = mp::filter;
38
39 namespace metaproxy_1 {
40     namespace filter {
        // One rewrite step of a rule: a compiled pattern plus the recipe
        // used to build the replacement text.
        class HttpRewrite::Replace {
        public:
            // true when the source pattern starts with '^'; set by
            // parse_groups and consulted by Rule::test_patterns
            bool start_anchor;
            // compiled pattern, assigned at the end of parse_groups
            boost::regex re;
            // match results of the latest regex_search run with `re`
            boost::smatch what;
            // replacement template; "${name}" references are expanded
            // by sub_vars
            std::string recipe;
            // capture group number -> group name, filled by parse_groups
            std::map<int, std::string> group_index;
            // expand `recipe` with the values found in `vars`
            std::string sub_vars(
                const std::map<std::string, std::string> & vars) const;
            // scan `pattern` for named groups, record them in group_index
            // and compile the resulting expression into `re`
            void parse_groups(std::string pattern);
        };
52
        // A named rule: an ordered list of Replace steps.
        class HttpRewrite::Rule {
        public:
            // rewrite steps in configuration order
            std::list<Replace> replace_list;
            // Apply the steps to `txt`, storing named-group captures in
            // `vars`. Returns the rewritten text, or an empty string when
            // no step matched. `anchor` allows '^'-anchored steps to take
            // part in the first search.
            const std::string test_patterns(
                std::map<std::string, std::string> & vars,
                const std::string & txt, bool anchor);
        };
        // Selects where a rule applies: an HTTP header, the request line,
        // or attributes / text of an HTML tag.
        class HttpRewrite::Within {
        public:
            // header name, compared case-insensitively in rewrite_headers
            std::string header;
            // comma-separated attribute names; the entry "#text" selects
            // the text content of the tag instead of an attribute
            std::string attr;
            // tag name, compared case-insensitively; empty matches any
            // tag when rewriting attributes
            std::string tag;
            // true when the rule applies to the request line
            bool reqline;
            // rule to run for this selection
            RulePtr rule;
        };
68
        // One <content> section of a phase.
        class HttpRewrite::Content {
        public:
            // section type; "headers" and "html" are the values this file
            // tests for
            std::string type;
            // matched against the Content-Type value in rewrite_body
            boost::regex content_re;
            // <within> entries of this section, in configuration order
            std::list<Within> within_list;
            // read <within> elements starting at `ptr` (and its siblings),
            // resolving rule names through `rules`
            void configure(const xmlNode *ptr,
                           std::map<std::string, RulePtr > &rules);
        };
        // Configuration and rewrite entry points for one direction
        // (request or response).
        class HttpRewrite::Phase {
        public:
            Phase();
            // verbosity level handed to HTMLParser::set_verbose
            int m_verbose;
            // <content> sections in configuration order
            std::list<Content> content_list;
            // rewrite hreq->path using the reqline entries of the
            // "headers" section
            void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
                std::map<std::string, std::string> & vars) const;
            // rewrite each header in the list using the "headers" section
            void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
                std::map<std::string, std::string> & vars) const;
            // rewrite the body in place when a non-"headers" section
            // matches content_type
            void rewrite_body(mp::odr & o,
                              const char *content_type,
                              char **content_buf, int *content_len,
                              std::map<std::string, std::string> & vars) const;
        };
91         class HttpRewrite::Event : public HTMLParserEvent {
92             void openTagStart(const char *tag, int tag_len);
93             void anyTagEnd(const char *tag, int tag_len, int close_it);
94             void attribute(const char *tag, int tag_len,
95                            const char *attr, int attr_len,
96                            const char *value, int val_len,
97                            const char *sep);
98             void closeTag(const char *tag, int tag_len);
99             void text(const char *value, int len);
100             const Content *m_content;
101             WRBUF m_w;
102             std::stack<std::list<Within>::const_iterator> s_within;
103             std::map<std::string, std::string> &m_vars;
104         public:
105             Event(const Content *p, std::map<std::string, std::string> &vars);
106             ~Event();
107             const char *result();
108         };
109     }
110 }
111
112 yf::HttpRewrite::HttpRewrite() :
113     req_phase(new Phase), res_phase(new Phase)
114 {
115 }
116
117 yf::HttpRewrite::~HttpRewrite()
118 {
119 }
120
121 void yf::HttpRewrite::process(mp::Package & package) const
122 {
123     yaz_log(YLOG_LOG, "HttpRewrite begins....");
124     Z_GDU *gdu = package.request().get();
125     //map of request/response vars
126     std::map<std::string, std::string> vars;
127     //we have an http req
128     if (gdu && gdu->which == Z_GDU_HTTP_Request)
129     {
130         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
131         mp::odr o;
132         req_phase->rewrite_reqline(o, hreq, vars);
133         yaz_log(YLOG_LOG, ">> Request headers");
134         req_phase->rewrite_headers(o, hreq->headers, vars);
135         req_phase->rewrite_body(o,
136                                 z_HTTP_header_lookup(hreq->headers,
137                                                      "Content-Type"),
138                                 &hreq->content_buf, &hreq->content_len,
139                                 vars);
140         package.request() = gdu;
141     }
142     package.move();
143     gdu = package.response().get();
144     if (gdu && gdu->which == Z_GDU_HTTP_Response)
145     {
146         Z_HTTP_Response *hres = gdu->u.HTTP_Response;
147         yaz_log(YLOG_LOG, "Response code %d", hres->code);
148         mp::odr o;
149         yaz_log(YLOG_LOG, "<< Respose headers");
150         res_phase->rewrite_headers(o, hres->headers, vars);
151         res_phase->rewrite_body(o,
152                                 z_HTTP_header_lookup(hres->headers,
153                                                      "Content-Type"),
154                                 &hres->content_buf, &hres->content_len,
155                                 vars);
156         package.response() = gdu;
157     }
158 }
159
160 void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
161         Z_HTTP_Request *hreq,
162         std::map<std::string, std::string> & vars) const
163 {
164     //rewrite the request line
165     std::string path;
166     if (strstr(hreq->path, "http://") == hreq->path)
167     {
168         yaz_log(YLOG_LOG, "Path in the method line is absolute, "
169             "possibly a proxy request");
170         path += hreq->path;
171     }
172     else
173     {
174         //TODO what about proto
175         const char *host = z_HTTP_header_lookup(hreq->headers, "Host");
176         if (!host)
177             return;
178
179         path += "http://";
180         path += host;
181         path += hreq->path;
182     }
183
184     std::list<Content>::const_iterator cit = content_list.begin();
185     for (; cit != content_list.end(); cit++)
186         if (cit->type == "headers")
187             break;
188
189     if (cit == content_list.end())
190         return;
191
192     std::list<Within>::const_iterator it = cit->within_list.begin();
193     for (; it != cit->within_list.end(); it++)
194         if (it->reqline)
195         {
196             RulePtr rule = it->rule;
197             yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
198             std::string npath = rule->test_patterns(vars, path, true);
199             if (!npath.empty())
200             {
201                 yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
202                 hreq->path = odr_strdup(o, npath.c_str());
203             }
204         }
205 }
206
207 void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
208         Z_HTTP_Header *headers,
209         std::map<std::string, std::string> & vars) const
210 {
211     std::list<Content>::const_iterator cit = content_list.begin();
212     for (; cit != content_list.end(); cit++)
213         if (cit->type == "headers")
214             break;
215
216     if (cit == content_list.end())
217         return;
218
219     for (Z_HTTP_Header *header = headers; header; header = header->next)
220     {
221         std::list<Within>::const_iterator it = cit->within_list.begin();
222         for (; it != cit->within_list.end(); it++)
223         {
224             if (it->header.length() > 0 &&
225                 yaz_strcasecmp(it->header.c_str(), header->name) == 0)
226             {
227                 std::string sheader(header->name);
228                 sheader += ": ";
229                 sheader += header->value;
230
231                 RulePtr rule = it->rule;
232                 std::string out = rule->test_patterns(vars, sheader, true);
233                 if (!out.empty())
234                 {
235                     size_t pos = out.find(": ");
236                     if (pos == std::string::npos)
237                     {
238                         yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
239                         continue;
240                     }
241                     header->name = odr_strdup(o, out.substr(0, pos).c_str());
242                     header->value = odr_strdup(o,
243                                                out.substr(pos + 2,
244                                                           std::string::npos).c_str());
245                 }
246             }
247         }
248     }
249 }
250
251 void yf::HttpRewrite::Phase::rewrite_body(
252     mp::odr &o,
253     const char *content_type,
254     char **content_buf,
255     int *content_len,
256     std::map<std::string, std::string> & vars) const
257 {
258     std::list<Content>::const_iterator cit = content_list.begin();
259     for (; cit != content_list.end(); cit++)
260     {
261         yaz_log(YLOG_LOG, "rewrite_body: content_type=%s type=%s",
262                 content_type, cit->type.c_str());
263         if (cit->type != "headers"
264             && regex_match(content_type, cit->content_re))
265             break;
266     }
267     if (cit == content_list.end())
268         return;
269
270     if (*content_buf)
271     {
272         int i;
273         for (i = 0; i < *content_len; i++)
274             if ((*content_buf)[i] == 0)
275                 return;  // binary content. skip
276
277         if (cit->type == "html")
278         {
279             HTMLParser parser;
280             Event ev(&*cit, vars);
281
282             parser.set_verbose(m_verbose);
283
284             std::string buf(*content_buf, *content_len);
285
286             parser.parse(ev, buf.c_str());
287             const char *res = ev.result();
288             *content_buf = odr_strdup(o, res);
289             *content_len = strlen(res);
290         }
291     }
292 }
293
294 yf::HttpRewrite::Event::Event(const Content *p,
295                               std::map<std::string, std::string> & vars
296     ) : m_content(p), m_vars(vars)
297 {
298     m_w = wrbuf_alloc();
299 }
300
301 yf::HttpRewrite::Event::~Event()
302 {
303     wrbuf_destroy(m_w);
304 }
305
306 const char *yf::HttpRewrite::Event::result()
307 {
308     return wrbuf_cstr(m_w);
309 }
310
311 void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len)
312 {
313     wrbuf_putc(m_w, '<');
314     wrbuf_write(m_w, tag, tag_len);
315
316     std::string t(tag, tag_len);
317     std::list<Within>::const_iterator it = m_content->within_list.begin();
318     for (; it != m_content->within_list.end(); it++)
319     {
320         if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(),
321                                                    t.c_str()) == 0)
322         {
323             std::vector<std::string> attr;
324             boost::split(attr, it->attr, boost::is_any_of(","));
325             size_t i;
326             for (i = 0; i < attr.size(); i++)
327             {
328                 if (attr[i].compare("#text") == 0)
329                 {
330                     s_within.push(it);
331                     return;
332                 }
333             }
334         }
335     }
336 }
337
338 void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len,
339                                        int close_it)
340 {
341     if (close_it)
342     {
343         if (!s_within.empty())
344         {
345             std::list<Within>::const_iterator it = s_within.top();
346             std::string t(tag, tag_len);
347             if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
348                 s_within.pop();
349         }
350     }
351     if (close_it)
352         wrbuf_putc(m_w, '/');
353     wrbuf_putc(m_w, '>');
354 }
355
356 void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len,
357                                        const char *attr, int attr_len,
358                                        const char *value, int val_len,
359                                        const char *sep)
360 {
361     std::list<Within>::const_iterator it = m_content->within_list.begin();
362     bool subst = false;
363
364     for (; it != m_content->within_list.end(); it++)
365     {
366         std::string t(tag, tag_len);
367         if (it->tag.length() == 0 ||
368             yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
369         {
370             std::string a(attr, attr_len);
371             std::vector<std::string> attr;
372             boost::split(attr, it->attr, boost::is_any_of(","));
373             size_t i;
374             for (i = 0; i < attr.size(); i++)
375             {
376                 if (attr[i].compare("#text") &&
377                     yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0)
378                     subst = true;
379             }
380         }
381         if (subst)
382             break;
383     }
384
385     wrbuf_putc(m_w, ' ');
386     wrbuf_write(m_w, attr, attr_len);
387     if (value)
388     {
389         wrbuf_puts(m_w, "=");
390         wrbuf_puts(m_w, sep);
391
392         std::string output;
393         if (subst)
394         {
395             std::string input(value, val_len);
396             output = it->rule->test_patterns(m_vars, input, true);
397         }
398         if (output.empty())
399             wrbuf_write(m_w, value, val_len);
400         else
401             wrbuf_puts(m_w, output.c_str());
402         wrbuf_puts(m_w, sep);
403     }
404 }
405
406 void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len)
407 {
408     if (!s_within.empty())
409     {
410         std::list<Within>::const_iterator it = s_within.top();
411         std::string t(tag, tag_len);
412         if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
413             s_within.pop();
414     }
415     wrbuf_puts(m_w, "</");
416     wrbuf_write(m_w, tag, tag_len);
417 }
418
419 void yf::HttpRewrite::Event::text(const char *value, int len)
420 {
421     std::list<Within>::const_iterator it = m_content->within_list.end();
422     if (!s_within.empty())
423         it = s_within.top();
424     std::string output;
425     if (it != m_content->within_list.end())
426     {
427         std::string input(value, len);
428         output = it->rule->test_patterns(m_vars, input, false);
429     }
430     if (output.empty())
431         wrbuf_write(m_w, value, len);
432     else
433         wrbuf_puts(m_w, output.c_str());
434 }
435
436 const std::string yf::HttpRewrite::Rule::test_patterns(
437         std::map<std::string, std::string> & vars,
438         const std::string & txt, bool anchor)
439 {
440     bool first = anchor;
441     std::string out;
442     std::string::const_iterator start, end;
443     start = txt.begin();
444     end = txt.end();
445     while (1)
446     {
447         std::list<Replace>::iterator bit = replace_list.end();
448         {
449             std::string::const_iterator best_pos = txt.end();
450             std::list<Replace>::iterator it = replace_list.begin();
451             for (; it != replace_list.end(); it++)
452             {
453                 if (it->start_anchor && !first)
454                     continue;
455                 if (regex_search(start, end, it->what, it->re))
456                 {
457                     if (it->what[0].first < best_pos)
458                     {
459                         best_pos = it->what[0].first;
460                         bit = it;
461                     }
462                 }
463             }
464             if (bit == replace_list.end())
465                 break;
466         }
467         first = false;
468         size_t i;
469         for (i = 1; i < bit->what.size(); ++i)
470         {
471             //check if the group is named
472             std::map<int, std::string>::const_iterator git
473                 = bit->group_index.find(i);
474             if (git != bit->group_index.end())
475             {   //it is
476                 vars[git->second] = bit->what[i];
477             }
478
479         }
480         //prepare replacement string
481         std::string rvalue = bit->sub_vars(vars);
482         yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
483                 bit->what.str(0).c_str(), rvalue.c_str());
484         out.append(start, bit->what[0].first);
485         out.append(rvalue);
486         start = bit->what[0].second; //move search forward
487     }
488     if (start != txt.begin())
489         out.append(start, end);
490     return out;
491 }
492
493 void yf::HttpRewrite::Replace::parse_groups(std::string pattern)
494 {
495     int gnum = 0;
496     bool esc = false;
497     const std::string &str = pattern;
498     std::string res;
499     start_anchor = str[0] == '^';
500     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
501     for (size_t i = 0; i < str.size(); ++i)
502     {
503         res += str[i];
504         if (!esc && str[i] == '\\')
505         {
506             esc = true;
507             continue;
508         }
509         if (!esc && str[i] == '(') //group starts
510         {
511             gnum++;
512             if (i+1 < str.size() && str[i+1] == '?') //group with attrs
513             {
514                 i++;
515                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
516                 {
517                     if (gnum > 0) gnum--;
518                     res += str[i];
519                     i++;
520                     res += str[i];
521                     continue;
522                 }
523                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
524                     i++;
525                 if (i+1 < str.size() && str[i+1] == '<') //named
526                 {
527                     i++;
528                     std::string gname;
529                     bool term = false;
530                     while (++i < str.size())
531                     {
532                         if (str[i] == '>') { term = true; break; }
533                         if (!isalnum(str[i]))
534                             throw mp::filter::FilterException
535                                 ("Only alphanumeric chars allowed, found "
536                                  " in '"
537                                  + str
538                                  + "' at "
539                                  + boost::lexical_cast<std::string>(i));
540                         gname += str[i];
541                     }
542                     if (!term)
543                         throw mp::filter::FilterException
544                             ("Unterminated group name '" + gname
545                              + " in '" + str +"'");
546                     group_index[gnum] = gname;
547                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
548                             gname.c_str(), gnum);
549                 }
550             }
551         }
552         esc = false;
553     }
554     re = res;
555 }
556
557 std::string yf::HttpRewrite::Replace::sub_vars(
558     const std::map<std::string, std::string> & vars) const
559 {
560     std::string out;
561     bool esc = false;
562     const std::string & in = recipe;
563     for (size_t i = 0; i < in.size(); ++i)
564     {
565         if (!esc && in[i] == '\\')
566         {
567             esc = true;
568             continue;
569         }
570         if (!esc && in[i] == '$') //var
571         {
572             if (i+1 < in.size() && in[i+1] == '{') //ref prefix
573             {
574                 ++i;
575                 std::string name;
576                 bool term = false;
577                 while (++i < in.size())
578                 {
579                     if (in[i] == '}') { term = true; break; }
580                     name += in[i];
581                 }
582                 if (!term) throw mp::filter::FilterException
583                     ("Unterminated var ref in '"+in+"' at "
584                      + boost::lexical_cast<std::string>(i));
585                 std::map<std::string, std::string>::const_iterator it
586                     = vars.find(name);
587                 if (it != vars.end())
588                 {
589                     out += it->second;
590                 }
591             }
592             else
593             {
594                 throw mp::filter::FilterException
595                     ("Malformed or trimmed var ref in '"
596                      +in+"' at "+boost::lexical_cast<std::string>(i));
597             }
598             continue;
599         }
600         //passthru
601         out += in[i];
602         esc = false;
603     }
604     return out;
605 }
606
607 yf::HttpRewrite::Phase::Phase() : m_verbose(0)
608 {
609 }
610
611 void yf::HttpRewrite::Content::configure(
612     const xmlNode *ptr, std::map<std::string, RulePtr > &rules)
613 {
614     for (; ptr; ptr = ptr->next)
615     {
616         if (ptr->type != XML_ELEMENT_NODE)
617             continue;
618         if (!strcmp((const char *) ptr->name, "within"))
619         {
620             static const char *names[6] =
621                 { "header", "attr", "tag", "rule", "reqline", 0 };
622             std::string values[5];
623             mp::xml::parse_attr(ptr, names, values);
624             Within w;
625             w.header = values[0];
626             w.attr = values[1];
627             w.tag = values[2];
628             std::map<std::string,RulePtr>::const_iterator it =
629                 rules.find(values[3]);
630             if (it == rules.end())
631                 throw mp::filter::FilterException
632                     ("Reference to non-existing rule '" + values[3] +
633                      "' in http_rewrite filter");
634             w.rule = it->second;
635             w.reqline = values[4] == "1";
636             within_list.push_back(w);
637         }
638     }
639 }
640
641 void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
642 {
643     static const char *names[2] = { "verbose", 0 };
644     std::string values[1];
645     values[0] = "0";
646     mp::xml::parse_attr(ptr, names, values);
647
648     phase.m_verbose = atoi(values[0].c_str());
649
650     std::map<std::string, RulePtr > rules;
651     for (ptr = ptr->children; ptr; ptr = ptr->next)
652     {
653         if (ptr->type != XML_ELEMENT_NODE)
654             continue;
655         else if (!strcmp((const char *) ptr->name, "rule"))
656         {
657             static const char *names[2] = { "name", 0 };
658             std::string values[1];
659             values[0] = "default";
660             mp::xml::parse_attr(ptr, names, values);
661
662             RulePtr rule(new Rule);
663             for (xmlNode *p = ptr->children; p; p = p->next)
664             {
665                 if (p->type != XML_ELEMENT_NODE)
666                     continue;
667                 if (!strcmp((const char *) p->name, "rewrite"))
668                 {
669                     Replace replace;
670                     std::string from;
671                     const struct _xmlAttr *attr;
672                     for (attr = p->properties; attr; attr = attr->next)
673                     {
674                         if (!strcmp((const char *) attr->name,  "from"))
675                             from = mp::xml::get_text(attr->children);
676                         else if (!strcmp((const char *) attr->name,  "to"))
677                             replace.recipe = mp::xml::get_text(attr->children);
678                         else
679                             throw mp::filter::FilterException
680                                 ("Bad attribute "
681                                  + std::string((const char *) attr->name)
682                                  + " in rewrite section of http_rewrite");
683                     }
684                     yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
685                             from.c_str(), replace.recipe.c_str());
686                     if (!from.empty())
687                     {
688                         replace.parse_groups(from);
689                         rule->replace_list.push_back(replace);
690                     }
691                 }
692                 else
693                     throw mp::filter::FilterException
694                         ("Bad element "
695                          + std::string((const char *) p->name)
696                          + " in http_rewrite filter");
697             }
698             rules[values[0]] = rule;
699         }
700         else if (!strcmp((const char *) ptr->name, "content"))
701         {
702             static const char *names[3] =
703                 { "type", "mime", 0 };
704             std::string values[2];
705             mp::xml::parse_attr(ptr, names, values);
706             if (values[0].empty())
707             {
708                     throw mp::filter::FilterException
709                         ("Missing attribute, type for for element "
710                          + std::string((const char *) ptr->name)
711                          + " in http_rewrite filter");
712             }
713             Content c;
714
715             c.type = values[0];
716             // if (!values[1].empty())
717                 c.content_re = values[1];
718             c.configure(ptr->children, rules);
719             phase.content_list.push_back(c);
720         }
721         else
722         {
723             throw mp::filter::FilterException
724                 ("Bad element "
725                  + std::string((const char *) ptr->name)
726                  + " in http_rewrite filter");
727         }
728     }
729 }
730
731 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
732         const char *path)
733 {
734     for (ptr = ptr->children; ptr; ptr = ptr->next)
735     {
736         if (ptr->type != XML_ELEMENT_NODE)
737             continue;
738         else if (!strcmp((const char *) ptr->name, "request"))
739         {
740             configure_phase(ptr, *req_phase);
741         }
742         else if (!strcmp((const char *) ptr->name, "response"))
743         {
744             configure_phase(ptr, *res_phase);
745         }
746         else
747         {
748             throw mp::filter::FilterException
749                 ("Bad element "
750                  + std::string((const char *) ptr->name)
751                  + " in http_rewrite1 filter");
752         }
753     }
754 }
755
756 static mp::filter::Base* filter_creator()
757 {
758     return new mp::filter::HttpRewrite;
759 }
760
// Registration record for this filter, exported with C linkage.
// NOTE(review): presumably located by symbol name by the Metaproxy filter
// loader - confirm against the metaproxy_1_filter_struct declaration.
extern "C" {
    struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
        // first field: meaning not visible in this file (TODO confirm)
        0,
        // filter name
        "http_rewrite",
        // factory defined above
        filter_creator
    };
}
768
769
770 /*
771  * Local variables:
772  * c-basic-offset: 4
773  * c-file-style: "Stroustrup"
774  * indent-tabs-mode: nil
775  * End:
776  * vim: shiftwidth=4 tabstop=8 expandtab
777  */
778