X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=src%2Ffilter_http_rewrite.cpp;h=bb03d67a4e14d9f874b4dcec685e94cf5efc5f09;hb=2c02be2dd0d82a2ce2db3050ce3a6bfe0a3ef16b;hp=e64fb3642f9c315cca1dc053661c293c123bf569;hpb=67e481dac76e773799e3e18c87d29f0a210cbfb1;p=metaproxy-moved-to-github.git diff --git a/src/filter_http_rewrite.cpp b/src/filter_http_rewrite.cpp index e64fb36..bb03d67 100644 --- a/src/filter_http_rewrite.cpp +++ b/src/filter_http_rewrite.cpp @@ -21,24 +21,86 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include #include "filter_http_rewrite.hpp" +#include "html_parser.hpp" #include #include +#include #include #include +#include -#include #include -#if HAVE_SYS_TYPES_H -#include -#endif - namespace mp = metaproxy_1; namespace yf = mp::filter; -yf::HttpRewrite::HttpRewrite() +namespace metaproxy_1 { + namespace filter { + class HttpRewrite::Replace { + public: + boost::regex re; + boost::smatch what; + std::string recipe; + std::map group_index; + std::string sub_vars( + const std::map & vars) const; + void parse_groups(std::string pattern); + }; + + class HttpRewrite::Rule { + public: + std::list replace_list; + const std::string test_patterns( + std::map & vars, + const std::string & txt); + }; + class HttpRewrite::Within { + public: + std::string header; + std::string attr; + std::string tag; + bool reqline; + RulePtr rule; + }; + + class HttpRewrite::Phase { + public: + Phase(); + std::list within_list; + int m_verbose; + void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq, + std::map & vars) const; + void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers, + std::map & vars) const; + void rewrite_body(mp::odr & o, + char **content_buf, int *content_len, + std::map & vars) const; + }; + class HttpRewrite::Event : public HTMLParserEvent { + void openTagStart(const char *tag, int tag_len); + void anyTagEnd(const char *tag, int tag_len, int close_it); + void attribute(const char *tag, int tag_len, + const char *attr, int attr_len, + const char *value, int val_len, + const char *sep); + void closeTag(const char *tag, int tag_len); + void text(const char *value, int len); + const Phase *m_phase; + WRBUF m_w; + std::stack::const_iterator> s_within; + std::map &m_vars; + public: + Event(const Phase *p, std::map &vars); + ~Event(); + const char *result(); + }; + } +} + +yf::HttpRewrite::HttpRewrite() : + req_phase(new Phase), res_phase(new Phase) { } @@ -46,8 +108,9 @@ yf::HttpRewrite::~HttpRewrite() { } -void yf::HttpRewrite::process(mp::Package & package) const +void yf::HttpRewrite::process(mp::Package & package) const { + yaz_log(YLOG_LOG, "HttpRewrite begins...."); Z_GDU *gdu = package.request().get(); //map of request/response vars std::map vars; @@ -56,10 +119,11 @@ void yf::HttpRewrite::process(mp::Package & package) const { Z_HTTP_Request *hreq = gdu->u.HTTP_Request; mp::odr o; - std::cout << ">> Request headers" << std::endl; - rewrite_reqline(o, hreq, vars); - rewrite_headers(o, hreq->headers, vars); - rewrite_body(o, &hreq->content_buf, &hreq->content_len, vars); + req_phase->rewrite_reqline(o, hreq, vars); + yaz_log(YLOG_LOG, ">> Request headers"); + req_phase->rewrite_headers(o, hreq->headers, vars); + req_phase->rewrite_body(o, + &hreq->content_buf, &hreq->content_len, vars); package.request() = gdu; } package.move(); @@ -67,221 +131,380 @@ void yf::HttpRewrite::process(mp::Package & package) const if (gdu && gdu->which == Z_GDU_HTTP_Response) { Z_HTTP_Response *hres = gdu->u.HTTP_Response; - std::cout << "Response " << hres->code; - std::cout << "<< Respose headers" << std::endl; + yaz_log(YLOG_LOG, "Response code %d", hres->code); mp::odr o; - rewrite_headers(o, hres->headers, vars); - rewrite_body(o, &hres->content_buf, &hres->content_len, vars); + yaz_log(YLOG_LOG, "<< Respose headers"); + res_phase->rewrite_headers(o, hres->headers, vars); + res_phase->rewrite_body(o, &hres->content_buf, + &hres->content_len, vars); package.response() = gdu; } } -void yf::HttpRewrite::rewrite_reqline (mp::odr & o, Z_HTTP_Request *hreq, - std::map & vars) const +void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o, + Z_HTTP_Request *hreq, + std::map & vars) const { //rewrite the request line std::string path; if (strstr(hreq->path, "http://") == hreq->path) { - std::cout << "Path in the method line is absolute, " - "possibly a proxy request\n"; + yaz_log(YLOG_LOG, "Path in the method line is absolute, " + "possibly a proxy request"); path += hreq->path; } else { //TODO what about proto + path += "http://"; path += z_HTTP_header_lookup(hreq->headers, "Host"); - path += hreq->path; + path += hreq->path; } - std::cout << "Proxy request URL is " << path << std::endl; - std::string npath = - test_patterns(vars, path, req_uri_pats, req_groups_bynum); - std::cout << "Resp request URL is " << npath << std::endl; - if (!npath.empty()) - hreq->path = odr_strdup(o, npath.c_str()); + + + std::list::const_iterator it = within_list.begin(); + for (; it != within_list.end(); it++) + if (it->reqline) + { + RulePtr rule = it->rule; + yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str()); + std::string npath = rule->test_patterns(vars, path); + if (!npath.empty()) + { + yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str()); + hreq->path = odr_strdup(o, npath.c_str()); + } + } } -void yf::HttpRewrite::rewrite_headers (mp::odr & o, Z_HTTP_Header *headers, - std::map & vars) const +void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o, + Z_HTTP_Header *headers, + std::map & vars) const { - for (Z_HTTP_Header *header = headers; - header != 0; - header = header->next) + for (Z_HTTP_Header *header = headers; header; header = header->next) { - std::string sheader(header->name); - sheader += ": "; - sheader += header->value; - std::cout << header->name << ": " << header->value << std::endl; - std::string out = test_patterns(vars, - sheader, - req_uri_pats, req_groups_bynum); - if (!out.empty()) + std::list::const_iterator it = within_list.begin(); + for (; it != within_list.end(); it++) { - size_t pos = out.find(": "); - if (pos == std::string::npos) + if (it->header.length() > 0 && + yaz_strcasecmp(it->header.c_str(), header->name) == 0) { - std::cout << "Header malformed during rewrite, ignoring"; - continue; + std::string sheader(header->name); + sheader += ": "; + sheader += header->value; + + RulePtr rule = it->rule; + std::string out = rule->test_patterns(vars, sheader); + if (!out.empty()) + { + size_t pos = out.find(": "); + if (pos == std::string::npos) + { + yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring"); + continue; + } + header->name = odr_strdup(o, out.substr(0, pos).c_str()); + header->value = odr_strdup(o, + out.substr(pos + 2, + std::string::npos).c_str()); + } } - header->name = odr_strdup(o, out.substr(0, pos).c_str()); - header->value = odr_strdup(o, out.substr(pos+2, - std::string::npos).c_str()); } } } -void yf::HttpRewrite::rewrite_body (mp::odr & o, char **content_buf, int *content_len, - std::map & vars) const +void yf::HttpRewrite::Phase::rewrite_body(mp::odr & o, + char **content_buf, + int *content_len, + std::map & vars) const { if (*content_buf) { - std::string body(*content_buf); - std::string nbody = - test_patterns(vars, body, req_uri_pats, req_groups_bynum); - if (!nbody.empty()) + int i; + for (i = 0; i < *content_len; i++) + if ((*content_buf)[i] == 0) + return; // binary content. skip + + HTMLParser parser; + Event ev(this, vars); + + parser.set_verbose(m_verbose); + + std::string buf(*content_buf, *content_len); + + parser.parse(ev, buf.c_str()); + const char *res = ev.result(); + *content_buf = odr_strdup(o, res); + *content_len = strlen(res); + } +} + +yf::HttpRewrite::Event::Event(const Phase *p, + std::map & vars + ) : m_phase(p), m_vars(vars) +{ + m_w = wrbuf_alloc(); +} + +yf::HttpRewrite::Event::~Event() +{ + wrbuf_destroy(m_w); +} + +const char *yf::HttpRewrite::Event::result() +{ + return wrbuf_cstr(m_w); +} + +void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len) +{ + wrbuf_putc(m_w, '<'); + wrbuf_write(m_w, tag, tag_len); + + std::string t(tag, tag_len); + std::list::const_iterator it = m_phase->within_list.begin(); + for (; it != m_phase->within_list.end(); it++) + { + if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(), + t.c_str()) == 0) { - *content_buf = odr_strdup(o, nbody.c_str()); - *content_len = nbody.size(); + std::vector attr; + boost::split(attr, it->attr, boost::is_any_of(",")); + size_t i; + for (i = 0; i < attr.size(); i++) + { + if (attr[i].compare("#text") == 0) + { + s_within.push(it); + return; + } + } } } } -/** - * Tests pattern from the vector in order and executes recipe on - the first match. - */ -const std::string yf::HttpRewrite::test_patterns( - std::map & vars, - const std::string & txt, - const spair_vec & uri_pats, - const std::vector > & groups_bynum_vec) - const +void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len, + int close_it) +{ + if (close_it) + { + if (!s_within.empty()) + { + std::list::const_iterator it = s_within.top(); + std::string t(tag, tag_len); + if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0) + s_within.pop(); + } + } + if (close_it) + wrbuf_putc(m_w, '/'); + wrbuf_putc(m_w, '>'); +} + +void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len, + const char *attr, int attr_len, + const char *value, int val_len, + const char *sep) +{ + std::list::const_iterator it = m_phase->within_list.begin(); + bool subst = false; + + for (; it != m_phase->within_list.end(); it++) + { + std::string t(tag, tag_len); + if (it->tag.length() == 0 || + yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0) + { + std::string a(attr, attr_len); + std::vector attr; + boost::split(attr, it->attr, boost::is_any_of(",")); + size_t i; + for (i = 0; i < attr.size(); i++) + { + if (attr[i].compare("#text") && + yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0) + subst = true; + } + } + if (subst) + break; + } + + wrbuf_putc(m_w, ' '); + wrbuf_write(m_w, attr, attr_len); + if (value) + { + wrbuf_puts(m_w, "="); + wrbuf_puts(m_w, sep); + + std::string output; + if (subst) + { + std::string input(value, val_len); + output = it->rule->test_patterns(m_vars, input); + } + if (output.empty()) + wrbuf_write(m_w, value, val_len); + else + wrbuf_puts(m_w, output.c_str()); + wrbuf_puts(m_w, sep); + } +} + +void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len) { - for (int i = 0; i < uri_pats.size(); i++) + if (!s_within.empty()) { - std::string out = search_replace(vars, txt, - uri_pats[i].first, uri_pats[i].second, - groups_bynum_vec[i]); - if (!out.empty()) return out; + std::list::const_iterator it = s_within.top(); + std::string t(tag, tag_len); + if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0) + s_within.pop(); } - return ""; + wrbuf_puts(m_w, "::const_iterator it = m_phase->within_list.end(); + if (!s_within.empty()) + it = s_within.top(); + std::string output; + if (it != m_phase->within_list.end()) + { + std::string input(value, len); + output = it->rule->test_patterns(m_vars, input); + } + if (output.empty()) + wrbuf_write(m_w, value, len); + else + wrbuf_puts(m_w, output.c_str()); +} -const std::string yf::HttpRewrite::search_replace( +const std::string yf::HttpRewrite::Rule::test_patterns( std::map & vars, - const std::string & txt, - const std::string & uri_re, - const std::string & uri_pat, - const std::map & groups_bynum) const + const std::string & txt) { - //exec regex against value - boost::regex re(uri_re); - boost::smatch what; + std::string out; std::string::const_iterator start, end; start = txt.begin(); end = txt.end(); - std::string out; - while (regex_search(start, end, what, re)) //find next full match + while (1) { - unsigned i; - for (i = 1; i < what.size(); ++i) + std::list::iterator bit = replace_list.end(); + { + std::string::const_iterator best_pos = txt.end(); + std::list::iterator it = replace_list.begin(); + for (; it != replace_list.end(); it++) + { + if (regex_search(start, end, it->what, it->re)) + { + if (it->what[0].first < best_pos) + { + best_pos = it->what[0].first; + bit = it; + } + } + } + if (bit == replace_list.end()) + break; + } + + size_t i; + for (i = 1; i < bit->what.size(); ++i) { //check if the group is named - std::map::const_iterator it - = groups_bynum.find(i); - if (it != groups_bynum.end()) + std::map::const_iterator git + = bit->group_index.find(i); + if (git != bit->group_index.end()) { //it is - std::string name = it->second; - if (!what[i].str().empty()) - vars[name] = what[i]; + vars[git->second] = bit->what[i]; } } //prepare replacement string - std::string rvalue = sub_vars(uri_pat, vars); - //rewrite value - std::string rhvalue = what.prefix().str() - + rvalue + what.suffix().str(); - std::cout << "! Rewritten '"+what.str(0)+"' to '"+rvalue+"'\n"; - out += rhvalue; - start = what[0].second; //move search forward + std::string rvalue = bit->sub_vars(vars); + yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'", + bit->what.str(0).c_str(), rvalue.c_str()); + out.append(start, bit->what[0].first); + out.append(rvalue); + start = bit->what[0].second; //move search forward } + if (start != txt.begin()) + out.append(start, end); return out; } -void yf::HttpRewrite::parse_groups( - const spair_vec & uri_pats, - std::vector > & groups_bynum_vec) +void yf::HttpRewrite::Replace::parse_groups(std::string pattern) { - for (int h = 0; h < uri_pats.size(); h++) + int gnum = 0; + bool esc = false; + const std::string &str = pattern; + std::string res; + yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str()); + for (size_t i = 0; i < str.size(); ++i) { - int gnum = 0; - bool esc = false; - //regex is first, subpat is second - std::string str = uri_pats[h].first; - //for each pair we have an indexing map - std::map groups_bynum; - for (int i = 0; i < str.size(); ++i) + res += str[i]; + if (!esc && str[i] == '\\') { - if (!esc && str[i] == '\\') - { - esc = true; - continue; - } - if (!esc && str[i] == '(') //group starts + esc = true; + continue; + } + if (!esc && str[i] == '(') //group starts + { + gnum++; + if (i+1 < str.size() && str[i+1] == '?') //group with attrs { - gnum++; - if (i+1 < str.size() && str[i+1] == '?') //group with attrs + i++; + if (i+1 < str.size() && str[i+1] == ':') //non-capturing { + if (gnum > 0) gnum--; + res += str[i]; i++; - if (i+1 < str.size() && str[i+1] == ':') //non-capturing - { - if (gnum > 0) gnum--; - i++; - continue; - } - if (i+1 < str.size() && str[i+1] == 'P') //optional, python - i++; - if (i+1 < str.size() && str[i+1] == '<') //named + res += str[i]; + continue; + } + if (i+1 < str.size() && str[i+1] == 'P') //optional, python + i++; + if (i+1 < str.size() && str[i+1] == '<') //named + { + i++; + std::string gname; + bool term = false; + while (++i < str.size()) { - i++; - std::string gname; - bool term = false; - while (++i < str.size()) - { - if (str[i] == '>') { term = true; break; } - if (!isalnum(str[i])) - throw mp::filter::FilterException - ("Only alphanumeric chars allowed, found " - " in '" - + str - + "' at " - + boost::lexical_cast(i)); - gname += str[i]; - } - if (!term) + if (str[i] == '>') { term = true; break; } + if (!isalnum(str[i])) throw mp::filter::FilterException - ("Unterminated group name '" + gname - + " in '" + str +"'"); - groups_bynum[gnum] = gname; - std::cout << "Found named group '" << gname - << "' at $" << gnum << std::endl; + ("Only alphanumeric chars allowed, found " + " in '" + + str + + "' at " + + boost::lexical_cast(i)); + gname += str[i]; } + if (!term) + throw mp::filter::FilterException + ("Unterminated group name '" + gname + + " in '" + str +"'"); + group_index[gnum] = gname; + yaz_log(YLOG_LOG, "Found named group '%s' at $%d", + gname.c_str(), gnum); } } - esc = false; } - groups_bynum_vec.push_back(groups_bynum); + esc = false; } + re = res; } -std::string yf::HttpRewrite::sub_vars (const std::string & in, - const std::map & vars) +std::string yf::HttpRewrite::Replace::sub_vars( + const std::map & vars) const { std::string out; bool esc = false; - for (int i = 0; i < in.size(); ++i) + const std::string & in = recipe; + for (size_t i = 0; i < in.size(); ++i) { if (!esc && in[i] == '\\') { @@ -295,7 +518,7 @@ std::string yf::HttpRewrite::sub_vars (const std::string & in, ++i; std::string name; bool term = false; - while (++i < in.size()) + while (++i < in.size()) { if (in[i] == '}') { term = true; break; } name += in[i]; @@ -314,7 +537,7 @@ std::string yf::HttpRewrite::sub_vars (const std::string & in, { throw mp::filter::FilterException ("Malformed or trimmed var ref in '" - +in+"' at "+boost::lexical_cast(i)); + +in+"' at "+boost::lexical_cast(i)); } continue; } @@ -325,50 +548,95 @@ std::string yf::HttpRewrite::sub_vars (const std::string & in, return out; } -void yf::HttpRewrite::configure( - const spair_vec req_uri_pats, - const spair_vec res_uri_pats) +yf::HttpRewrite::Phase::Phase() : m_verbose(0) { - //TODO should we really copy them out? - this->req_uri_pats = req_uri_pats; - this->res_uri_pats = res_uri_pats; - //pick up names - parse_groups(req_uri_pats, req_groups_bynum); - parse_groups(res_uri_pats, res_groups_bynum); } - -static void configure_rules(const xmlNode *ptr, yf::HttpRewrite::spair_vec & dest) +void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase) { + static const char *names[2] = { "verbose", 0 }; + std::string values[1]; + values[0] = "0"; + mp::xml::parse_attr(ptr, names, values); + + phase.m_verbose = atoi(values[0].c_str()); + + std::map rules; for (ptr = ptr->children; ptr; ptr = ptr->next) { if (ptr->type != XML_ELEMENT_NODE) continue; - else if (!strcmp((const char *) ptr->name, "rewrite")) + else if (!strcmp((const char *) ptr->name, "rule")) { - std::string from, to; - const struct _xmlAttr *attr; - for (attr = ptr->properties; attr; attr = attr->next) + static const char *names[2] = { "name", 0 }; + std::string values[1]; + values[0] = "default"; + mp::xml::parse_attr(ptr, names, values); + + RulePtr rule(new Rule); + for (xmlNode *p = ptr->children; p; p = p->next) { - if (!strcmp((const char *) attr->name, "from")) - from = mp::xml::get_text(attr->children); - else if (!strcmp((const char *) attr->name, "to")) - to = mp::xml::get_text(attr->children); + if (p->type != XML_ELEMENT_NODE) + continue; + if (!strcmp((const char *) p->name, "rewrite")) + { + Replace replace; + std::string from; + const struct _xmlAttr *attr; + for (attr = p->properties; attr; attr = attr->next) + { + if (!strcmp((const char *) attr->name, "from")) + from = mp::xml::get_text(attr->children); + else if (!strcmp((const char *) attr->name, "to")) + replace.recipe = mp::xml::get_text(attr->children); + else + throw mp::filter::FilterException + ("Bad attribute " + + std::string((const char *) attr->name) + + " in rewrite section of http_rewrite"); + } + yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'", + from.c_str(), replace.recipe.c_str()); + if (!from.empty()) + { + replace.parse_groups(from); + rule->replace_list.push_back(replace); + } + } else throw mp::filter::FilterException - ("Bad attribute " - + std::string((const char *) attr->name) - + " in rewrite section of http_rewrite"); + ("Bad element " + + std::string((const char *) p->name) + + " in http_rewrite filter"); } - if (!from.empty()) - dest.push_back(std::make_pair(from, to)); + rules[values[0]] = rule; + } + else if (!strcmp((const char *) ptr->name, "within")) + { + static const char *names[6] = + { "header", "attr", "tag", "rule", "reqline", 0 }; + std::string values[5]; + mp::xml::parse_attr(ptr, names, values); + Within w; + w.header = values[0]; + w.attr = values[1]; + w.tag = values[2]; + std::map::const_iterator it = + rules.find(values[3]); + if (it == rules.end()) + throw mp::filter::FilterException + ("Reference to non-existing rule '" + values[3] + + "' in http_rewrite filter"); + w.rule = it->second; + w.reqline = values[4] == "1"; + phase.within_list.push_back(w); } else { throw mp::filter::FilterException ("Bad element " + std::string((const char *) ptr->name) - + " in http_rewrite1 filter"); + + " in http_rewrite filter"); } } } @@ -376,19 +644,17 @@ static void configure_rules(const xmlNode *ptr, yf::HttpRewrite::spair_vec & des void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only, const char *path) { - spair_vec req_uri_pats; - spair_vec res_uri_pats; for (ptr = ptr->children; ptr; ptr = ptr->next) { if (ptr->type != XML_ELEMENT_NODE) continue; else if (!strcmp((const char *) ptr->name, "request")) { - configure_rules(ptr->children, req_uri_pats); + configure_phase(ptr, *req_phase); } else if (!strcmp((const char *) ptr->name, "response")) { - configure_rules(ptr->children, res_uri_pats); + configure_phase(ptr, *res_phase); } else {