Skip start anchor for text sections entirely
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
1 /* This file is part of Metaproxy.
2    Copyright (C) 2005-2013 Index Data
3
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19 #include "config.hpp"
20 #include <metaproxy/filter.hpp>
21 #include <metaproxy/package.hpp>
22 #include <metaproxy/util.hpp>
23 #include "filter_http_rewrite.hpp"
24 #include "html_parser.hpp"
25
26 #include <yaz/zgdu.h>
27 #include <yaz/log.h>
28
29 #include <stack>
30 #include <boost/regex.hpp>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/algorithm/string.hpp>
33
34 #include <map>
35
36 namespace mp = metaproxy_1;
37 namespace yf = mp::filter;
38
39 namespace metaproxy_1 {
40     namespace filter {
41         class HttpRewrite::Replace {
42         public:
43             bool start_anchor;
44             boost::regex re;
45             boost::smatch what;
46             std::string recipe;
47             std::map<int, std::string> group_index;
48             std::string sub_vars(
49                 const std::map<std::string, std::string> & vars) const;
50             void parse_groups(std::string pattern);
51         };
52
53         class HttpRewrite::Rule {
54         public:
55             std::list<Replace> replace_list;
56             const std::string test_patterns(
57                 std::map<std::string, std::string> & vars,
58                 const std::string & txt, bool anchor);
59         };
60         class HttpRewrite::Within {
61         public:
62             std::string header;
63             std::string attr;
64             std::string tag;
65             bool reqline;
66             RulePtr rule;
67         };
68
69         class HttpRewrite::Phase {
70         public:
71             Phase();
72             std::list<Within> within_list;
73             int m_verbose;
74             void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
75                 std::map<std::string, std::string> & vars) const;
76             void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
77                 std::map<std::string, std::string> & vars) const;
78             void rewrite_body(mp::odr & o,
79                 char **content_buf, int *content_len,
80                 std::map<std::string, std::string> & vars) const;
81         };
82         class HttpRewrite::Event : public HTMLParserEvent {
83             void openTagStart(const char *tag, int tag_len);
84             void anyTagEnd(const char *tag, int tag_len, int close_it);
85             void attribute(const char *tag, int tag_len,
86                            const char *attr, int attr_len,
87                            const char *value, int val_len,
88                            const char *sep);
89             void closeTag(const char *tag, int tag_len);
90             void text(const char *value, int len);
91             const Phase *m_phase;
92             WRBUF m_w;
93             std::stack<std::list<Within>::const_iterator> s_within;
94             std::map<std::string, std::string> &m_vars;
95         public:
96             Event(const Phase *p, std::map<std::string, std::string> &vars);
97             ~Event();
98             const char *result();
99         };
100     }
101 }
102
103 yf::HttpRewrite::HttpRewrite() :
104     req_phase(new Phase), res_phase(new Phase)
105 {
106 }
107
108 yf::HttpRewrite::~HttpRewrite()
109 {
110 }
111
112 void yf::HttpRewrite::process(mp::Package & package) const
113 {
114     yaz_log(YLOG_LOG, "HttpRewrite begins....");
115     Z_GDU *gdu = package.request().get();
116     //map of request/response vars
117     std::map<std::string, std::string> vars;
118     //we have an http req
119     if (gdu && gdu->which == Z_GDU_HTTP_Request)
120     {
121         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
122         mp::odr o;
123         req_phase->rewrite_reqline(o, hreq, vars);
124         yaz_log(YLOG_LOG, ">> Request headers");
125         req_phase->rewrite_headers(o, hreq->headers, vars);
126         req_phase->rewrite_body(o,
127                 &hreq->content_buf, &hreq->content_len, vars);
128         package.request() = gdu;
129     }
130     package.move();
131     gdu = package.response().get();
132     if (gdu && gdu->which == Z_GDU_HTTP_Response)
133     {
134         Z_HTTP_Response *hres = gdu->u.HTTP_Response;
135         yaz_log(YLOG_LOG, "Response code %d", hres->code);
136         mp::odr o;
137         yaz_log(YLOG_LOG, "<< Respose headers");
138         res_phase->rewrite_headers(o, hres->headers, vars);
139         res_phase->rewrite_body(o, &hres->content_buf,
140                 &hres->content_len, vars);
141         package.response() = gdu;
142     }
143 }
144
145 void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
146         Z_HTTP_Request *hreq,
147         std::map<std::string, std::string> & vars) const
148 {
149     //rewrite the request line
150     std::string path;
151     if (strstr(hreq->path, "http://") == hreq->path)
152     {
153         yaz_log(YLOG_LOG, "Path in the method line is absolute, "
154             "possibly a proxy request");
155         path += hreq->path;
156     }
157     else
158     {
159         //TODO what about proto
160         const char *host = z_HTTP_header_lookup(hreq->headers, "Host");
161         if (!host)
162             return;
163
164         path += "http://";
165         path += host;
166         path += hreq->path;
167     }
168
169
170     std::list<Within>::const_iterator it = within_list.begin();
171     for (; it != within_list.end(); it++)
172         if (it->reqline)
173         {
174             RulePtr rule = it->rule;
175             yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
176             std::string npath = rule->test_patterns(vars, path, true);
177             if (!npath.empty())
178             {
179                 yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
180                 hreq->path = odr_strdup(o, npath.c_str());
181             }
182         }
183 }
184
185 void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
186         Z_HTTP_Header *headers,
187         std::map<std::string, std::string> & vars) const
188 {
189     for (Z_HTTP_Header *header = headers; header; header = header->next)
190     {
191         std::list<Within>::const_iterator it = within_list.begin();
192         for (; it != within_list.end(); it++)
193         {
194             if (it->header.length() > 0 &&
195                 yaz_strcasecmp(it->header.c_str(), header->name) == 0)
196             {
197                 std::string sheader(header->name);
198                 sheader += ": ";
199                 sheader += header->value;
200
201                 RulePtr rule = it->rule;
202                 std::string out = rule->test_patterns(vars, sheader, true);
203                 if (!out.empty())
204                 {
205                     size_t pos = out.find(": ");
206                     if (pos == std::string::npos)
207                     {
208                         yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
209                         continue;
210                     }
211                     header->name = odr_strdup(o, out.substr(0, pos).c_str());
212                     header->value = odr_strdup(o,
213                                                out.substr(pos + 2,
214                                                           std::string::npos).c_str());
215                 }
216             }
217         }
218     }
219 }
220
221 void yf::HttpRewrite::Phase::rewrite_body(mp::odr & o,
222         char **content_buf,
223         int *content_len,
224         std::map<std::string, std::string> & vars) const
225 {
226     if (*content_buf)
227     {
228         int i;
229         for (i = 0; i < *content_len; i++)
230             if ((*content_buf)[i] == 0)
231                 return;  // binary content. skip
232
233         HTMLParser parser;
234         Event ev(this, vars);
235
236         parser.set_verbose(m_verbose);
237
238         std::string buf(*content_buf, *content_len);
239
240         parser.parse(ev, buf.c_str());
241         const char *res = ev.result();
242         *content_buf = odr_strdup(o, res);
243         *content_len = strlen(res);
244     }
245 }
246
247 yf::HttpRewrite::Event::Event(const Phase *p,
248                               std::map<std::string, std::string> & vars
249     ) : m_phase(p), m_vars(vars)
250 {
251     m_w = wrbuf_alloc();
252 }
253
254 yf::HttpRewrite::Event::~Event()
255 {
256     wrbuf_destroy(m_w);
257 }
258
259 const char *yf::HttpRewrite::Event::result()
260 {
261     return wrbuf_cstr(m_w);
262 }
263
264 void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len)
265 {
266     wrbuf_putc(m_w, '<');
267     wrbuf_write(m_w, tag, tag_len);
268
269     std::string t(tag, tag_len);
270     std::list<Within>::const_iterator it = m_phase->within_list.begin();
271     for (; it != m_phase->within_list.end(); it++)
272     {
273         if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(),
274                                                    t.c_str()) == 0)
275         {
276             std::vector<std::string> attr;
277             boost::split(attr, it->attr, boost::is_any_of(","));
278             size_t i;
279             for (i = 0; i < attr.size(); i++)
280             {
281                 if (attr[i].compare("#text") == 0)
282                 {
283                     s_within.push(it);
284                     return;
285                 }
286             }
287         }
288     }
289 }
290
291 void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len,
292                                        int close_it)
293 {
294     if (close_it)
295     {
296         if (!s_within.empty())
297         {
298             std::list<Within>::const_iterator it = s_within.top();
299             std::string t(tag, tag_len);
300             if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
301                 s_within.pop();
302         }
303     }
304     if (close_it)
305         wrbuf_putc(m_w, '/');
306     wrbuf_putc(m_w, '>');
307 }
308
309 void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len,
310                                        const char *attr, int attr_len,
311                                        const char *value, int val_len,
312                                        const char *sep)
313 {
314     std::list<Within>::const_iterator it = m_phase->within_list.begin();
315     bool subst = false;
316
317     for (; it != m_phase->within_list.end(); it++)
318     {
319         std::string t(tag, tag_len);
320         if (it->tag.length() == 0 ||
321             yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
322         {
323             std::string a(attr, attr_len);
324             std::vector<std::string> attr;
325             boost::split(attr, it->attr, boost::is_any_of(","));
326             size_t i;
327             for (i = 0; i < attr.size(); i++)
328             {
329                 if (attr[i].compare("#text") &&
330                     yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0)
331                     subst = true;
332             }
333         }
334         if (subst)
335             break;
336     }
337
338     wrbuf_putc(m_w, ' ');
339     wrbuf_write(m_w, attr, attr_len);
340     if (value)
341     {
342         wrbuf_puts(m_w, "=");
343         wrbuf_puts(m_w, sep);
344
345         std::string output;
346         if (subst)
347         {
348             std::string input(value, val_len);
349             output = it->rule->test_patterns(m_vars, input, true);
350         }
351         if (output.empty())
352             wrbuf_write(m_w, value, val_len);
353         else
354             wrbuf_puts(m_w, output.c_str());
355         wrbuf_puts(m_w, sep);
356     }
357 }
358
359 void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len)
360 {
361     if (!s_within.empty())
362     {
363         std::list<Within>::const_iterator it = s_within.top();
364         std::string t(tag, tag_len);
365         if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
366             s_within.pop();
367     }
368     wrbuf_puts(m_w, "</");
369     wrbuf_write(m_w, tag, tag_len);
370 }
371
372 void yf::HttpRewrite::Event::text(const char *value, int len)
373 {
374     std::list<Within>::const_iterator it = m_phase->within_list.end();
375     if (!s_within.empty())
376         it = s_within.top();
377     std::string output;
378     if (it != m_phase->within_list.end())
379     {
380         std::string input(value, len);
381         output = it->rule->test_patterns(m_vars, input, false);
382     }
383     if (output.empty())
384         wrbuf_write(m_w, value, len);
385     else
386         wrbuf_puts(m_w, output.c_str());
387 }
388
389 const std::string yf::HttpRewrite::Rule::test_patterns(
390         std::map<std::string, std::string> & vars,
391         const std::string & txt, bool anchor)
392 {
393     bool first = anchor;
394     std::string out;
395     std::string::const_iterator start, end;
396     start = txt.begin();
397     end = txt.end();
398     while (1)
399     {
400         std::list<Replace>::iterator bit = replace_list.end();
401         {
402             std::string::const_iterator best_pos = txt.end();
403             std::list<Replace>::iterator it = replace_list.begin();
404             for (; it != replace_list.end(); it++)
405             {
406                 if (it->start_anchor && !first)
407                     continue;
408                 if (regex_search(start, end, it->what, it->re))
409                 {
410                     if (it->what[0].first < best_pos)
411                     {
412                         best_pos = it->what[0].first;
413                         bit = it;
414                     }
415                 }
416             }
417             if (bit == replace_list.end())
418                 break;
419         }
420         first = false;
421         size_t i;
422         for (i = 1; i < bit->what.size(); ++i)
423         {
424             //check if the group is named
425             std::map<int, std::string>::const_iterator git
426                 = bit->group_index.find(i);
427             if (git != bit->group_index.end())
428             {   //it is
429                 vars[git->second] = bit->what[i];
430             }
431
432         }
433         //prepare replacement string
434         std::string rvalue = bit->sub_vars(vars);
435         yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
436                 bit->what.str(0).c_str(), rvalue.c_str());
437         out.append(start, bit->what[0].first);
438         out.append(rvalue);
439         start = bit->what[0].second; //move search forward
440     }
441     if (start != txt.begin())
442         out.append(start, end);
443     return out;
444 }
445
446 void yf::HttpRewrite::Replace::parse_groups(std::string pattern)
447 {
448     int gnum = 0;
449     bool esc = false;
450     const std::string &str = pattern;
451     std::string res;
452     start_anchor = str[0] == '^';
453     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
454     for (size_t i = 0; i < str.size(); ++i)
455     {
456         res += str[i];
457         if (!esc && str[i] == '\\')
458         {
459             esc = true;
460             continue;
461         }
462         if (!esc && str[i] == '(') //group starts
463         {
464             gnum++;
465             if (i+1 < str.size() && str[i+1] == '?') //group with attrs
466             {
467                 i++;
468                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
469                 {
470                     if (gnum > 0) gnum--;
471                     res += str[i];
472                     i++;
473                     res += str[i];
474                     continue;
475                 }
476                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
477                     i++;
478                 if (i+1 < str.size() && str[i+1] == '<') //named
479                 {
480                     i++;
481                     std::string gname;
482                     bool term = false;
483                     while (++i < str.size())
484                     {
485                         if (str[i] == '>') { term = true; break; }
486                         if (!isalnum(str[i]))
487                             throw mp::filter::FilterException
488                                 ("Only alphanumeric chars allowed, found "
489                                  " in '"
490                                  + str
491                                  + "' at "
492                                  + boost::lexical_cast<std::string>(i));
493                         gname += str[i];
494                     }
495                     if (!term)
496                         throw mp::filter::FilterException
497                             ("Unterminated group name '" + gname
498                              + " in '" + str +"'");
499                     group_index[gnum] = gname;
500                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
501                             gname.c_str(), gnum);
502                 }
503             }
504         }
505         esc = false;
506     }
507     re = res;
508 }
509
510 std::string yf::HttpRewrite::Replace::sub_vars(
511     const std::map<std::string, std::string> & vars) const
512 {
513     std::string out;
514     bool esc = false;
515     const std::string & in = recipe;
516     for (size_t i = 0; i < in.size(); ++i)
517     {
518         if (!esc && in[i] == '\\')
519         {
520             esc = true;
521             continue;
522         }
523         if (!esc && in[i] == '$') //var
524         {
525             if (i+1 < in.size() && in[i+1] == '{') //ref prefix
526             {
527                 ++i;
528                 std::string name;
529                 bool term = false;
530                 while (++i < in.size())
531                 {
532                     if (in[i] == '}') { term = true; break; }
533                     name += in[i];
534                 }
535                 if (!term) throw mp::filter::FilterException
536                     ("Unterminated var ref in '"+in+"' at "
537                      + boost::lexical_cast<std::string>(i));
538                 std::map<std::string, std::string>::const_iterator it
539                     = vars.find(name);
540                 if (it != vars.end())
541                 {
542                     out += it->second;
543                 }
544             }
545             else
546             {
547                 throw mp::filter::FilterException
548                     ("Malformed or trimmed var ref in '"
549                      +in+"' at "+boost::lexical_cast<std::string>(i));
550             }
551             continue;
552         }
553         //passthru
554         out += in[i];
555         esc = false;
556     }
557     return out;
558 }
559
560 yf::HttpRewrite::Phase::Phase() : m_verbose(0)
561 {
562 }
563
564 void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
565 {
566     static const char *names[2] = { "verbose", 0 };
567     std::string values[1];
568     values[0] = "0";
569     mp::xml::parse_attr(ptr, names, values);
570
571     phase.m_verbose = atoi(values[0].c_str());
572
573     std::map<std::string, RulePtr > rules;
574     for (ptr = ptr->children; ptr; ptr = ptr->next)
575     {
576         if (ptr->type != XML_ELEMENT_NODE)
577             continue;
578         else if (!strcmp((const char *) ptr->name, "rule"))
579         {
580             static const char *names[2] = { "name", 0 };
581             std::string values[1];
582             values[0] = "default";
583             mp::xml::parse_attr(ptr, names, values);
584
585             RulePtr rule(new Rule);
586             for (xmlNode *p = ptr->children; p; p = p->next)
587             {
588                 if (p->type != XML_ELEMENT_NODE)
589                     continue;
590                 if (!strcmp((const char *) p->name, "rewrite"))
591                 {
592                     Replace replace;
593                     std::string from;
594                     const struct _xmlAttr *attr;
595                     for (attr = p->properties; attr; attr = attr->next)
596                     {
597                         if (!strcmp((const char *) attr->name,  "from"))
598                             from = mp::xml::get_text(attr->children);
599                         else if (!strcmp((const char *) attr->name,  "to"))
600                             replace.recipe = mp::xml::get_text(attr->children);
601                         else
602                             throw mp::filter::FilterException
603                                 ("Bad attribute "
604                                  + std::string((const char *) attr->name)
605                                  + " in rewrite section of http_rewrite");
606                     }
607                     yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
608                             from.c_str(), replace.recipe.c_str());
609                     if (!from.empty())
610                     {
611                         replace.parse_groups(from);
612                         rule->replace_list.push_back(replace);
613                     }
614                 }
615                 else
616                     throw mp::filter::FilterException
617                         ("Bad element "
618                          + std::string((const char *) p->name)
619                          + " in http_rewrite filter");
620             }
621             rules[values[0]] = rule;
622         }
623         else if (!strcmp((const char *) ptr->name, "within"))
624         {
625             static const char *names[6] =
626                 { "header", "attr", "tag", "rule", "reqline", 0 };
627             std::string values[5];
628             mp::xml::parse_attr(ptr, names, values);
629             Within w;
630             w.header = values[0];
631             w.attr = values[1];
632             w.tag = values[2];
633             std::map<std::string,RulePtr>::const_iterator it =
634                 rules.find(values[3]);
635             if (it == rules.end())
636                 throw mp::filter::FilterException
637                     ("Reference to non-existing rule '" + values[3] +
638                      "' in http_rewrite filter");
639             w.rule = it->second;
640             w.reqline = values[4] == "1";
641             phase.within_list.push_back(w);
642         }
643         else
644         {
645             throw mp::filter::FilterException
646                 ("Bad element "
647                  + std::string((const char *) ptr->name)
648                  + " in http_rewrite filter");
649         }
650     }
651 }
652
653 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
654         const char *path)
655 {
656     for (ptr = ptr->children; ptr; ptr = ptr->next)
657     {
658         if (ptr->type != XML_ELEMENT_NODE)
659             continue;
660         else if (!strcmp((const char *) ptr->name, "request"))
661         {
662             configure_phase(ptr, *req_phase);
663         }
664         else if (!strcmp((const char *) ptr->name, "response"))
665         {
666             configure_phase(ptr, *res_phase);
667         }
668         else
669         {
670             throw mp::filter::FilterException
671                 ("Bad element "
672                  + std::string((const char *) ptr->name)
673                  + " in http_rewrite1 filter");
674         }
675     }
676 }
677
678 static mp::filter::Base* filter_creator()
679 {
680     return new mp::filter::HttpRewrite;
681 }
682
683 extern "C" {
684     struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
685         0,
686         "http_rewrite",
687         filter_creator
688     };
689 }
690
691
692 /*
693  * Local variables:
694  * c-basic-offset: 4
695  * c-file-style: "Stroustrup"
696  * indent-tabs-mode: nil
697  * End:
698  * vim: shiftwidth=4 tabstop=8 expandtab
699  */
700