1 /* This file is part of Metaproxy.
2 Copyright (C) 2005-2013 Index Data
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 #include <metaproxy/filter.hpp>
21 #include <metaproxy/package.hpp>
22 #include <metaproxy/util.hpp>
23 #include "filter_http_rewrite.hpp"
24 #include "html_parser.hpp"
30 #include <boost/regex.hpp>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/algorithm/string.hpp>
36 namespace mp = metaproxy_1;
37 namespace yf = mp::filter;
39 namespace metaproxy_1 {
// One pattern-to-replacement entry of a Rule.
// NOTE(review): only part of this class is visible in this view; other
// members used elsewhere in the file (re, what, recipe, start_anchor) are
// declared on lines not shown here.
41 class HttpRewrite::Replace {
// Maps a capture-group number to the name given to that group in the
// pattern (filled in by parse_groups).
47 std::map<int, std::string> group_index;
// Tail of a const member declaration taking the variable map; presumably
// sub_vars, whose definition has this same parameter list -- confirm.
49 const std::map<std::string, std::string> & vars) const;
// Scans `pattern` for named groups and records them in group_index.
50 void parse_groups(std::string pattern);
// A rewrite rule: a list of Replace entries that test_patterns applies to a
// piece of text.
53 class HttpRewrite::Rule {
// Candidate replacements; test_patterns searches all of them on each pass.
55 std::list<Replace> replace_list;
// Rewrites `txt` using replace_list. Named groups captured by a match are
// stored into `vars`.
// NOTE(review): the use of `anchor` is not visible in this view.
56 const std::string test_patterns(
57 std::map<std::string, std::string> & vars,
58 const std::string & txt, bool anchor);
// One "within" configuration entry: it names the rule to apply and where.
// NOTE(review): the member declarations are not visible in this view; code
// elsewhere in this file reads header, attr, tag, rule and reqline from it.
60 class HttpRewrite::Within {
// Rewrite configuration for one "content" section of a phase.
// NOTE(review): a `type` member is used elsewhere in this file but its
// declaration is not visible in this view.
69 class HttpRewrite::Content {
// Matched against the content_type argument in Phase::rewrite_body.
72 boost::regex content_re;
// Entries appended by configure(), one per "within" element.
73 std::list<Within> within_list;
// Walks the sibling list starting at `ptr`, turning each "within" element
// into a Within entry whose rule name is looked up in `rules`.
74 void configure(const xmlNode *ptr,
75 std::map<std::string, RulePtr > &rules);
// Rewrite configuration and operations for one direction of traffic; the
// filter keeps one Phase for requests and one for responses.
// NOTE(review): the constructor and the m_verbose member are used elsewhere
// in this file but declared on lines not visible in this view.
77 class HttpRewrite::Phase {
// Content sections added by HttpRewrite::configure_phase.
81 std::list<Content> content_list;
// Rewrites the request path.
82 void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
83 std::map<std::string, std::string> & vars) const;
// Rewrites the headers of the given list whose names match a configured
// "within" entry.
84 void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
85 std::map<std::string, std::string> & vars) const;
// Rewrites a message body in place through *content_buf / *content_len.
86 void rewrite_body(mp::odr & o,
87 const char *content_type,
88 char **content_buf, int *content_len,
89 std::map<std::string, std::string> & vars) const;
// HTML parser callback object used by Phase::rewrite_body. Its callbacks
// write the parsed document back out to a buffer (m_w), passing selected
// attribute values and text through rewrite rules.
// NOTE(review): m_w and the destructor are declared on lines not visible in
// this view.
91 class HttpRewrite::Event : public HTMLParserEvent {
92 void openTagStart(const char *tag, int tag_len);
93 void anyTagEnd(const char *tag, int tag_len, int close_it);
// NOTE(review): the last parameter of attribute() is on a line not visible
// here; the definition later in this file uses a value named `sep`.
94 void attribute(const char *tag, int tag_len,
95 const char *attr, int attr_len,
96 const char *value, int val_len,
98 void closeTag(const char *tag, int tag_len);
99 void text(const char *value, int len);
// Content section whose within_list drives the rewriting.
100 const Content *m_content;
// Iterators into m_content->within_list; the tag and text callbacks consult
// the top entry.
102 std::stack<std::list<Within>::const_iterator> s_within;
// Variable map shared with the caller; passed on to Rule::test_patterns.
103 std::map<std::string, std::string> &m_vars;
105 Event(const Content *p, std::map<std::string, std::string> &vars);
// Returns the output accumulated so far as a C string.
107 const char *result();
// Constructs the filter with a freshly allocated Phase for requests and
// another for responses.
112 yf::HttpRewrite::HttpRewrite() :
113 req_phase(new Phase), res_phase(new Phase)
// Destructor. NOTE(review): the body is not visible in this view.
117 yf::HttpRewrite::~HttpRewrite()
// Filter entry point. Runs the request phase over an HTTP request held by
// the package and the response phase over an HTTP response; the same `vars`
// map is handed to both phases.
// NOTE(review): several lines of this function are not visible in this view,
// including the declaration of `o`, the header name passed to
// z_HTTP_header_lookup, and whatever happens between the request and
// response halves.
121 void yf::HttpRewrite::process(mp::Package & package) const
123 yaz_log(YLOG_LOG, "HttpRewrite begins....");
124 Z_GDU *gdu = package.request().get();
125 //map of request/response vars
126 std::map<std::string, std::string> vars;
127 //we have an http req
128 if (gdu && gdu->which == Z_GDU_HTTP_Request)
130 Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
// Request side: request line first, then headers, then body.
132 req_phase->rewrite_reqline(o, hreq, vars);
133 yaz_log(YLOG_LOG, ">> Request headers");
134 req_phase->rewrite_headers(o, hreq->headers, vars);
135 req_phase->rewrite_body(o,
136 z_HTTP_header_lookup(hreq->headers,
138 &hreq->content_buf, &hreq->content_len,
// Store the (possibly modified) request back into the package.
140 package.request() = gdu;
// Response side: headers, then body.
143 gdu = package.response().get();
144 if (gdu && gdu->which == Z_GDU_HTTP_Response)
146 Z_HTTP_Response *hres = gdu->u.HTTP_Response;
147 yaz_log(YLOG_LOG, "Response code %d", hres->code);
149 yaz_log(YLOG_LOG, "<< Respose headers");
150 res_phase->rewrite_headers(o, hres->headers, vars);
151 res_phase->rewrite_body(o,
152 z_HTTP_header_lookup(hres->headers,
154 &hres->content_buf, &hres->content_len,
// Store the (possibly modified) response back into the package.
156 package.response() = gdu;
// Rewrites the path of the HTTP request line using rules taken from the
// content section of this phase whose type is "headers".
// NOTE(review): a number of lines are not visible in this view, including
// the declaration of `path` and the statements guarded by several of the
// conditions below.
160 void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
161 Z_HTTP_Request *hreq,
162 std::map<std::string, std::string> & vars) const
164 //rewrite the request line
// True when hreq->path begins with "http://": strstr returns the start of
// the string only for a match at offset 0.
166 if (strstr(hreq->path, "http://") == hreq->path)
168 yaz_log(YLOG_LOG, "Path in the method line is absolute, "
169 "possibly a proxy request");
174 //TODO what about proto
175 const char *host = z_HTTP_header_lookup(hreq->headers, "Host");
// Search content_list for a section of type "headers".
184 std::list<Content>::const_iterator cit = content_list.begin();
185 for (; cit != content_list.end(); cit++)
186 if (cit->type == "headers")
// cit == end() means the search ran off the end of the list (the statement
// that stops the search on a hit is not visible in this view).
189 if (cit == content_list.end())
192 std::list<Within>::const_iterator it = cit->within_list.begin();
193 for (; it != cit->within_list.end(); it++)
196 RulePtr rule = it->rule;
197 yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
198 std::string npath = rule->test_patterns(vars, path, true);
201 yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
// The new path is copied with odr_strdup using the ODR handle `o`.
202 hreq->path = odr_strdup(o, npath.c_str());
// Rewrites headers in the list `headers`. For each header whose name equals
// (case-insensitively) the `header` field of a "within" entry in the
// "headers" content section, a string built from the header's name and value
// is run through that entry's rule, and the result is split back into name
// and value at the first ": ".
// NOTE(review): some lines are not visible in this view, including the one
// between the construction of `sheader` and the append of the value.
207 void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
208 Z_HTTP_Header *headers,
209 std::map<std::string, std::string> & vars) const
// Search content_list for a section of type "headers".
211 std::list<Content>::const_iterator cit = content_list.begin();
212 for (; cit != content_list.end(); cit++)
213 if (cit->type == "headers")
216 if (cit == content_list.end())
219 for (Z_HTTP_Header *header = headers; header; header = header->next)
221 std::list<Within>::const_iterator it = cit->within_list.begin();
222 for (; it != cit->within_list.end(); it++)
// Entries with an empty `header` field never match here.
224 if (it->header.length() > 0 &&
225 yaz_strcasecmp(it->header.c_str(), header->name) == 0)
227 std::string sheader(header->name);
229 sheader += header->value;
231 RulePtr rule = it->rule;
232 std::string out = rule->test_patterns(vars, sheader, true);
// The rewritten text must still contain ": " to be split into name/value.
235 size_t pos = out.find(": ");
236 if (pos == std::string::npos)
238 yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
// Both the name and the value of the header are replaced with ODR copies.
241 header->name = odr_strdup(o, out.substr(0, pos).c_str());
242 header->value = odr_strdup(o,
244 std::string::npos).c_str());
// Rewrites a message body in place. The content section is chosen as one
// whose type is not "headers" and whose content_re matches `content_type`;
// a body containing a NUL byte is left untouched; for type "html" the body
// is run through the HTML parser with an Event and replaced by its output.
// NOTE(review): `content_type` is handed to yaz_log("%s") and regex_match
// without a visible null check, while process() passes the result of
// z_HTTP_header_lookup -- confirm a missing header cannot reach this point.
// NOTE(review): some parameter lines and statements are not visible in this
// view (e.g. the declarations of `i` and `parser`).
251 void yf::HttpRewrite::Phase::rewrite_body(
253 const char *content_type,
256 std::map<std::string, std::string> & vars) const
258 std::list<Content>::const_iterator cit = content_list.begin();
259 for (; cit != content_list.end(); cit++)
261 yaz_log(YLOG_LOG, "rewrite_body: content_type=%s type=%s",
262 content_type, cit->type.c_str());
263 if (cit->type != "headers"
264 && regex_match(content_type, cit->content_re))
267 if (cit == content_list.end())
// Scan the whole buffer for an embedded NUL byte.
273 for (i = 0; i < *content_len; i++)
274 if ((*content_buf)[i] == 0)
275 return; // binary content. skip
277 if (cit->type == "html")
280 Event ev(&*cit, vars);
282 parser.set_verbose(m_verbose);
// Copy into a std::string so the parser receives a NUL-terminated buffer
// of exactly *content_len bytes.
284 std::string buf(*content_buf, *content_len);
286 parser.parse(ev, buf.c_str());
287 const char *res = ev.result();
// Replace the caller's buffer and length with an ODR copy of the output.
288 *content_buf = odr_strdup(o, res);
289 *content_len = strlen(res);
// Binds the event handler to a content section and to the caller's variable
// map (held by reference).
// NOTE(review): the constructor body is not visible in this view.
294 yf::HttpRewrite::Event::Event(const Content *p,
295 std::map<std::string, std::string> & vars
296 ) : m_content(p), m_vars(vars)
// Destructor. NOTE(review): the body is not visible in this view.
301 yf::HttpRewrite::Event::~Event()
// Returns the contents of the output buffer m_w as a C string.
306 const char *yf::HttpRewrite::Event::result()
308 return wrbuf_cstr(m_w);
// Parser callback for the start of an opening tag. Writes "<" plus the tag
// name to the output, then looks through the "within" entries for one with a
// non-empty tag whose comma-separated attr list contains "#text".
// NOTE(review): the action taken when such an entry is found, the second
// argument of yaz_strcasecmp and the declaration of `i` are on lines not
// visible in this view.
311 void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len)
313 wrbuf_putc(m_w, '<');
314 wrbuf_write(m_w, tag, tag_len);
316 std::string t(tag, tag_len);
317 std::list<Within>::const_iterator it = m_content->within_list.begin();
318 for (; it != m_content->within_list.end(); it++)
320 if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(),
// it->attr is a comma-separated list; split it into individual names.
323 std::vector<std::string> attr;
324 boost::split(attr, it->attr, boost::is_any_of(","));
326 for (i = 0; i < attr.size(); i++)
328 if (attr[i].compare("#text") == 0)
// Parser callback for the end of a tag. Compares the tag with the entry on
// top of s_within (case-insensitively) and writes the closing characters of
// the tag to the output.
// NOTE(review): the rest of the signature, the action taken on a tag match
// and the condition (if any) guarding the '/' are not visible in this view.
338 void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len,
343 if (!s_within.empty())
345 std::list<Within>::const_iterator it = s_within.top();
346 std::string t(tag, tag_len);
347 if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
352 wrbuf_putc(m_w, '/');
353 wrbuf_putc(m_w, '>');
// Parser callback for one attribute. Looks for a "within" entry that applies
// to this tag (empty tag field, or a case-insensitive tag match) and lists
// this attribute name; writes " name", "=" and the separator to the output;
// runs the value through the entry's rule; then writes either the original
// value or the rule output, followed by the separator again.
// NOTE(review): the last parameter line, the bookkeeping that records
// whether an entry was found, and the conditions choosing between the
// original value and `output` are not visible in this view.
356 void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len,
357 const char *attr, int attr_len,
358 const char *value, int val_len,
361 std::list<Within>::const_iterator it = m_content->within_list.begin();
364 for (; it != m_content->within_list.end(); it++)
366 std::string t(tag, tag_len);
367 if (it->tag.length() == 0 ||
368 yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
// `a` captures the attribute name before the local vector named `attr`
// (next line) shadows the `attr` parameter within this scope.
370 std::string a(attr, attr_len);
371 std::vector<std::string> attr;
372 boost::split(attr, it->attr, boost::is_any_of(","));
374 for (i = 0; i < attr.size(); i++)
// compare() is non-zero when the list item is NOT "#text", so "#text"
// items are never treated as attribute names.
376 if (attr[i].compare("#text") &&
377 yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0)
385 wrbuf_putc(m_w, ' ');
386 wrbuf_write(m_w, attr, attr_len);
389 wrbuf_puts(m_w, "=");
390 wrbuf_puts(m_w, sep);
395 std::string input(value, val_len);
396 output = it->rule->test_patterns(m_vars, input, true);
399 wrbuf_write(m_w, value, val_len);
401 wrbuf_puts(m_w, output.c_str());
402 wrbuf_puts(m_w, sep);
// Parser callback for a closing tag. Compares the tag with the entry on top
// of s_within (case-insensitively) and writes "</" plus the tag name to the
// output.
// NOTE(review): the action taken on a tag match is not visible in this view.
406 void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len)
408 if (!s_within.empty())
410 std::list<Within>::const_iterator it = s_within.top();
411 std::string t(tag, tag_len);
412 if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
415 wrbuf_puts(m_w, "</");
416 wrbuf_write(m_w, tag, tag_len);
// Parser callback for a run of text. When a "within" entry is selected, the
// text is run through its rule (with anchor == false); the output receives
// either the original text or the rule output.
// NOTE(review): how `it` is set when s_within is non-empty, the declaration
// of `output`, and the conditions choosing between the two writes are not
// visible in this view.
419 void yf::HttpRewrite::Event::text(const char *value, int len)
// end() serves as the "no entry selected" value.
421 std::list<Within>::const_iterator it = m_content->within_list.end();
422 if (!s_within.empty())
425 if (it != m_content->within_list.end())
427 std::string input(value, len);
428 output = it->rule->test_patterns(m_vars, input, false);
431 wrbuf_write(m_w, value, len);
433 wrbuf_puts(m_w, output.c_str());
// Applies the rule's replacements to `txt`. On each pass every entry of
// replace_list is searched over [start, end) and the entry whose match
// begins earliest is selected; its named groups are stored into `vars`, its
// replacement text is produced by sub_vars, the text preceding the match is
// appended to `out`, and `start` moves to the end of the match.
// NOTE(review): the enclosing loop, the declarations of `out`, `first` and
// `i`, the initialisation of `start`/`end`, the use of `anchor`, and the
// return statement are on lines not visible in this view.
436 const std::string yf::HttpRewrite::Rule::test_patterns(
437 std::map<std::string, std::string> & vars,
438 const std::string & txt, bool anchor)
442 std::string::const_iterator start, end;
// bit == end() means "no entry has matched on this pass".
447 std::list<Replace>::iterator bit = replace_list.end();
449 std::string::const_iterator best_pos = txt.end();
450 std::list<Replace>::iterator it = replace_list.begin();
451 for (; it != replace_list.end(); it++)
// NOTE(review): the statement guarded by this condition is not visible;
// presumably entries anchored with '^' are skipped once `first` is false.
453 if (it->start_anchor && !first)
455 if (regex_search(start, end, it->what, it->re))
// Keep the match that starts closest to the beginning of the text.
457 if (it->what[0].first < best_pos)
459 best_pos = it->what[0].first;
464 if (bit == replace_list.end())
// Sub-match 0 is the whole match; capture groups start at index 1.
469 for (i = 1; i < bit->what.size(); ++i)
471 //check if the group is named
472 std::map<int, std::string>::const_iterator git
473 = bit->group_index.find(i);
474 if (git != bit->group_index.end())
476 vars[git->second] = bit->what[i];
480 //prepare replacement string
481 std::string rvalue = bit->sub_vars(vars);
482 yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
483 bit->what.str(0).c_str(), rvalue.c_str());
// Copy the unmatched text that precedes this match.
484 out.append(start, bit->what[0].first);
486 start = bit->what[0].second; //move search forward
// The remaining tail is appended only if `start` has moved away from the
// beginning of txt.
488 if (start != txt.begin())
489 out.append(start, end);
// Scans a regex pattern for groups, recording each named group's number and
// name in group_index, and sets start_anchor when the pattern begins with
// '^'. Throws mp::filter::FilterException for a non-alphanumeric character
// inside a group name or for an unterminated group name.
// NOTE(review): the declarations of `esc`, `gnum`, `gname` and `term`, the
// increments of `gnum`/`i`, and the condition guarding the "Unterminated
// group name" throw are on lines not visible in this view.
493 void yf::HttpRewrite::Replace::parse_groups(std::string pattern)
497 const std::string &str = pattern;
499 start_anchor = str[0] == '^';
500 yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
501 for (size_t i = 0; i < str.size(); ++i)
// An unescaped backslash is handled separately from other characters.
504 if (!esc && str[i] == '\\')
509 if (!esc && str[i] == '(') //group starts
512 if (i+1 < str.size() && str[i+1] == '?') //group with attrs
515 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
// A non-capturing group does not take a group number.
517 if (gnum > 0) gnum--;
523 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
525 if (i+1 < str.size() && str[i+1] == '<') //named
// Read the group name up to the closing '>'.
530 while (++i < str.size())
532 if (str[i] == '>') { term = true; break; }
// NOTE(review): str[i] is a plain char; where char is signed, bytes
// >= 0x80 are outside isalnum's defined input range -- consider casting
// to unsigned char.
533 if (!isalnum(str[i]))
534 throw mp::filter::FilterException
535 ("Only alphanumeric chars allowed, found "
539 + boost::lexical_cast<std::string>(i));
543 throw mp::filter::FilterException
544 ("Unterminated group name '" + gname
545 + " in '" + str +"'");
546 group_index[gnum] = gname;
547 yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
548 gname.c_str(), gnum);
// Builds the replacement text from `recipe`, resolving "${name}" references
// through `vars`. Throws mp::filter::FilterException for a reference with no
// closing '}' and for a '$' that is not followed by '{'.
// NOTE(review): the declarations of `esc`, `term` and the output string, the
// key used for the vars lookup, what is emitted for a found or missing
// variable, and the return statement are on lines not visible in this view.
557 std::string yf::HttpRewrite::Replace::sub_vars(
558 const std::map<std::string, std::string> & vars) const
562 const std::string & in = recipe;
563 for (size_t i = 0; i < in.size(); ++i)
// An unescaped backslash is handled separately from other characters.
565 if (!esc && in[i] == '\\')
570 if (!esc && in[i] == '$') //var
572 if (i+1 < in.size() && in[i+1] == '{') //ref prefix
// Advance to the closing '}' of the reference.
577 while (++i < in.size())
579 if (in[i] == '}') { term = true; break; }
582 if (!term) throw mp::filter::FilterException
583 ("Unterminated var ref in '"+in+"' at "
584 + boost::lexical_cast<std::string>(i));
585 std::map<std::string, std::string>::const_iterator it
587 if (it != vars.end())
594 throw mp::filter::FilterException
595 ("Malformed or trimmed var ref in '"
596 +in+"' at "+boost::lexical_cast<std::string>(i));
// Constructs a phase with m_verbose set to 0.
607 yf::HttpRewrite::Phase::Phase() : m_verbose(0)
// Reads the "within" elements in the sibling list starting at `ptr` and
// appends one Within entry per element to within_list. The element's "rule"
// attribute must name an entry of `rules`; otherwise
// mp::filter::FilterException is thrown.
// NOTE(review): the declaration of `w`, the assignments of its attr, tag and
// rule fields, and the handling of other element names are on lines not
// visible in this view.
611 void yf::HttpRewrite::Content::configure(
612 const xmlNode *ptr, std::map<std::string, RulePtr > &rules)
614 for (; ptr; ptr = ptr->next)
// Only element nodes are of interest.
616 if (ptr->type != XML_ELEMENT_NODE)
618 if (!strcmp((const char *) ptr->name, "within"))
// Five attribute names plus a terminating 0; values[] is indexed in the
// same order as names[].
620 static const char *names[6] =
621 { "header", "attr", "tag", "rule", "reqline", 0 };
622 std::string values[5];
623 mp::xml::parse_attr(ptr, names, values);
625 w.header = values[0];
// values[3] is the "rule" attribute: resolve it to a configured rule.
628 std::map<std::string,RulePtr>::const_iterator it =
629 rules.find(values[3]);
630 if (it == rules.end())
631 throw mp::filter::FilterException
632 ("Reference to non-existing rule '" + values[3] +
633 "' in http_rewrite filter");
// reqline is true only when the attribute text is exactly "1".
635 w.reqline = values[4] == "1";
636 within_list.push_back(w);
// Configures one phase from its XML element. Reads the "verbose" attribute
// into phase.m_verbose, then processes the child elements: each "rule"
// element becomes a Rule (stored in a local name-to-rule map) built from its
// "rewrite" children, and each "content" element becomes a Content entry
// appended to phase.content_list. Unexpected attributes or elements raise
// mp::filter::FilterException.
// NOTE(review): several lines are not visible in this view, including the
// declarations of `from`, `replace` and `c`, the assignment of the regex for
// a replacement, and the first part of some exception messages.
641 void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
643 static const char *names[2] = { "verbose", 0 };
644 std::string values[1];
646 mp::xml::parse_attr(ptr, names, values);
648 phase.m_verbose = atoi(values[0].c_str());
// Rules defined in this phase, keyed by name, for lookup by the "content"
// sections that come later.
650 std::map<std::string, RulePtr > rules;
651 for (ptr = ptr->children; ptr; ptr = ptr->next)
653 if (ptr->type != XML_ELEMENT_NODE)
655 else if (!strcmp((const char *) ptr->name, "rule"))
// A rule without a "name" attribute is stored under "default".
657 static const char *names[2] = { "name", 0 };
658 std::string values[1];
659 values[0] = "default";
660 mp::xml::parse_attr(ptr, names, values);
662 RulePtr rule(new Rule);
663 for (xmlNode *p = ptr->children; p; p = p->next)
665 if (p->type != XML_ELEMENT_NODE)
667 if (!strcmp((const char *) p->name, "rewrite"))
// A "rewrite" element carries a "from" pattern and a "to" recipe.
671 const struct _xmlAttr *attr;
672 for (attr = p->properties; attr; attr = attr->next)
674 if (!strcmp((const char *) attr->name, "from"))
675 from = mp::xml::get_text(attr->children);
676 else if (!strcmp((const char *) attr->name, "to"))
677 replace.recipe = mp::xml::get_text(attr->children);
679 throw mp::filter::FilterException
681 + std::string((const char *) attr->name)
682 + " in rewrite section of http_rewrite");
684 yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
685 from.c_str(), replace.recipe.c_str());
// Record the named groups of the pattern, then add the entry to the rule.
688 replace.parse_groups(from);
689 rule->replace_list.push_back(replace);
693 throw mp::filter::FilterException
695 + std::string((const char *) p->name)
696 + " in http_rewrite filter");
// A later rule with the same name overwrites the earlier map entry.
698 rules[values[0]] = rule;
700 else if (!strcmp((const char *) ptr->name, "content"))
702 static const char *names[3] =
703 { "type", "mime", 0 };
704 std::string values[2];
705 mp::xml::parse_attr(ptr, names, values);
// The "type" attribute is mandatory.
706 if (values[0].empty())
708 throw mp::filter::FilterException
709 ("Missing attribute, type for for element "
710 + std::string((const char *) ptr->name)
711 + " in http_rewrite filter");
// The "mime" attribute text is used as the content-type regex; with the
// check below commented out, it is assigned even when empty.
716 // if (!values[1].empty())
717 c.content_re = values[1];
718 c.configure(ptr->children, rules);
719 phase.content_list.push_back(c);
723 throw mp::filter::FilterException
725 + std::string((const char *) ptr->name)
726 + " in http_rewrite filter");
// Configures the filter from its XML element: a "request" child configures
// req_phase, a "response" child configures res_phase, and any other element
// raises mp::filter::FilterException.
// NOTE(review): the remaining parameter line(s) and the first part of the
// exception message are not visible in this view.
731 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
734 for (ptr = ptr->children; ptr; ptr = ptr->next)
736 if (ptr->type != XML_ELEMENT_NODE)
738 else if (!strcmp((const char *) ptr->name, "request"))
740 configure_phase(ptr, *req_phase);
742 else if (!strcmp((const char *) ptr->name, "response"))
744 configure_phase(ptr, *res_phase);
748 throw mp::filter::FilterException
750 + std::string((const char *) ptr->name)
751 + " in http_rewrite1 filter");
// Factory function: returns a newly allocated HttpRewrite filter as a
// pointer to the filter base class.
756 static mp::filter::Base* filter_creator()
758 return new mp::filter::HttpRewrite;
// Filter description record for the http_rewrite filter.
// NOTE(review): the initializer fields are on lines not visible in this
// view; presumably they reference filter_creator -- confirm.
762 struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
773 * c-file-style: "Stroustrup"
774 * indent-tabs-mode: nil
776 * vim: shiftwidth=4 tabstop=8 expandtab