Remove unused code, fix printing
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
index 5259fc3..098c349 100644 (file)
@@ -21,6 +21,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #include <metaproxy/package.hpp>
 #include <metaproxy/util.hpp>
 #include "filter_http_rewrite.hpp"
+#include "html_parser.hpp"
 
 #include <yaz/zgdu.h>
 #include <yaz/log.h>
@@ -28,32 +29,18 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #include <boost/regex.hpp>
 #include <boost/lexical_cast.hpp>
 
-#include <vector>
 #include <map>
 
-#if HAVE_SYS_TYPES_H
-#include <sys/types.h>
-#endif
-
 namespace mp = metaproxy_1;
 namespace yf = mp::filter;
 
 namespace metaproxy_1 {
     namespace filter {
-        class HttpRewrite::RuleScope {
-        public:
-            std::vector<std::string> tags;
-            std::vector<std::string> attrs;
-            std::string content_type;
-        };
-        class HttpRewrite::Rule {
+        class HttpRewrite::Replace {
         public:
-            enum Section { METHOD, HEADER, BODY };
             std::string regex;
             std::string recipe;
             std::map<int, std::string> group_index;
-            std::vector<RuleScope> scopes;
-            Section section;
             const std::string search_replace(
                 std::map<std::string, std::string> & vars,
                 const std::string & txt) const;
@@ -61,24 +48,49 @@ namespace metaproxy_1 {
                 const std::map<std::string, std::string> & vars) const;
             void parse_groups();
         };
-        class HttpRewrite::Rules {
+
+        class HttpRewrite::Rule { // ordered group of Replace entries
+        public:
+            std::list<Replace> replace_list; // tried in list order
+            const std::string test_patterns( // first non-empty result, or ""
+                std::map<std::string, std::string> & vars,
+                const std::string & txt) const;
+        };
+        class HttpRewrite::Within { // built per <within> in configure_phase
+        public:
+            std::string header; // from the "header" attribute
+            std::string attr; // from the "attr" attribute
+            std::string tag; // from the "tag" attribute
+            RulePtr rule; // rule looked up by the "rule" attribute
+        };
+
+        class HttpRewrite::Phase { // rewrite setup for requests or responses
         public:
-            std::vector<Rule> rules;
-            void rewrite_reqline (mp::odr & o, Z_HTTP_Request *hreq,
+            std::list<Within> within_list; // rewrite_* use only the first entry
+            void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
                 std::map<std::string, std::string> & vars) const;
             void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
                 std::map<std::string, std::string> & vars) const;
-            void rewrite_body (mp::odr & o, 
+            void rewrite_body(mp::odr & o,
                 char **content_buf, int *content_len,
                 std::map<std::string, std::string> & vars) const;
-            const std::string test_patterns(
-                std::map<std::string, std::string> & vars,
-                const std::string & txt) const;
+        };
+        class HttpRewrite::Event : public HTMLParserEvent { // parser callbacks (all empty stubs so far)
+        public:
+            void openTagStart(const char *name);
+            void anyTagEnd(const char *name);
+            void attribute(const char *tagName,
+                           const char *name,
+                           const char *value,
+                           int val_len);
+            void closeTag(const char *name);
+            void text(const char *value, int len);
         };
     }
 }
 
-yf::HttpRewrite::HttpRewrite() : req_rules(new Rules), res_rules(new Rules)
+yf::HttpRewrite::HttpRewrite() :
+    req_phase(new Phase), res_phase(new Phase) // empty until configure() fills them
 {
 }
 
@@ -86,7 +98,7 @@ yf::HttpRewrite::~HttpRewrite()
 {
 }
 
-void yf::HttpRewrite::process(mp::Package & package) const 
+void yf::HttpRewrite::process(mp::Package & package) const
 {
     yaz_log(YLOG_LOG, "HttpRewrite begins....");
     Z_GDU *gdu = package.request().get();
@@ -97,12 +109,11 @@ void yf::HttpRewrite::process(mp::Package & package) const
     {
         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
         mp::odr o;
-        req_rules->rewrite_reqline(o, hreq, vars);
+        req_phase->rewrite_reqline(o, hreq, vars);
         yaz_log(YLOG_LOG, ">> Request headers");
-        req_rules->rewrite_headers(o, hreq->headers, vars);
-        req_rules->rewrite_body(o, 
-                &hreq->content_buf, &hreq->content_len, 
-                vars);
+        req_phase->rewrite_headers(o, hreq->headers, vars);
+        req_phase->rewrite_body(o,
+                &hreq->content_buf, &hreq->content_len, vars);
         package.request() = gdu;
     }
     package.move();
@@ -113,22 +124,22 @@ void yf::HttpRewrite::process(mp::Package & package) const
         yaz_log(YLOG_LOG, "Response code %d", hres->code);
         mp::odr o;
         yaz_log(YLOG_LOG, "<< Respose headers");
-        res_rules->rewrite_headers(o, hres->headers, vars);
-        res_rules->rewrite_body(o, &hres->content_buf, 
+        res_phase->rewrite_headers(o, hres->headers, vars);
+        res_phase->rewrite_body(o, &hres->content_buf,
                 &hres->content_len, vars);
         package.response() = gdu;
     }
 }
 
-void yf::HttpRewrite::Rules::rewrite_reqline (mp::odr & o, 
+void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
         Z_HTTP_Request *hreq,
-        std::map<std::string, std::string> & vars) const 
+        std::map<std::string, std::string> & vars) const
 {
     //rewrite the request line
     std::string path;
     if (strstr(hreq->path, "http://") == hreq->path)
     {
-        yaz_log(YLOG_LOG, "Path in the method line is absolute, " 
+        yaz_log(YLOG_LOG, "Path in the method line is absolute, "
             "possibly a proxy request");
         path += hreq->path;
     }
@@ -137,32 +148,44 @@ void yf::HttpRewrite::Rules::rewrite_reqline (mp::odr & o,
         //TODO what about proto
         path += "http://";
         path += z_HTTP_header_lookup(hreq->headers, "Host");
-        path += hreq->path; 
+        path += hreq->path;
     }
-    yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
-    std::string npath = 
-        test_patterns(vars, path);
-    if (!npath.empty())
+
+    std::list<Within>::const_iterator it = within_list.begin();
+    if (it != within_list.end())
     {
-        yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
-        hreq->path = odr_strdup(o, npath.c_str());
+        RulePtr rule = it->rule;
+
+        yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
+        std::string npath = rule->test_patterns(vars, path);
+        if (!npath.empty())
+        {
+            yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
+            hreq->path = odr_strdup(o, npath.c_str());
+        }
     }
 }
 
-void yf::HttpRewrite::Rules::rewrite_headers(mp::odr & o, 
+void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
         Z_HTTP_Header *headers,
-        std::map<std::string, std::string> & vars) const 
+        std::map<std::string, std::string> & vars) const
 {
     for (Z_HTTP_Header *header = headers;
-            header != 0; 
-            header = header->next) 
+            header != 0;
+            header = header->next)
     {
         std::string sheader(header->name);
         sheader += ": ";
         sheader += header->value;
         yaz_log(YLOG_LOG, "%s: %s", header->name, header->value);
-        std::string out = test_patterns(vars, sheader);
-        if (!out.empty()) 
+
+        std::list<Within>::const_iterator it = within_list.begin();
+        if (it == within_list.end())
+            continue;
+        RulePtr rule = it->rule;
+
+        std::string out = rule->test_patterns(vars, sheader);
+        if (!out.empty())
         {
             size_t pos = out.find(": ");
             if (pos == std::string::npos)
@@ -171,47 +194,85 @@ void yf::HttpRewrite::Rules::rewrite_headers(mp::odr & o,
                 continue;
             }
             header->name = odr_strdup(o, out.substr(0, pos).c_str());
-            header->value = odr_strdup(o, out.substr(pos+2, 
-                        std::string::npos).c_str());
+            header->value = odr_strdup(o, out.substr(pos+2,
+                                                     std::string::npos).c_str());
         }
     }
 }
 
-void yf::HttpRewrite::Rules::rewrite_body (mp::odr & o, 
-        char **content_buf, 
+// Rewrite an HTTP body in place with the first within_list rule, if any.
+void yf::HttpRewrite::Phase::rewrite_body(mp::odr & o, char **content_buf,
         int *content_len,
-        std::map<std::string, std::string> & vars) const 
+        std::map<std::string, std::string> & vars) const
 {
     if (*content_buf)
     {
-        std::string body(*content_buf);
-        std::string nbody = 
-            test_patterns(vars, body);
-        if (!nbody.empty())
+        HTMLParser parser;
+        Event ev;
+        std::string buf(*content_buf, *content_len);
+
+        parser.parse(ev, buf.c_str());
+        std::list<Within>::const_iterator it = within_list.begin();
+        if (it != within_list.end())
         {
-            *content_buf = odr_strdup(o, nbody.c_str());
-            *content_len = nbody.size();
+            RulePtr rule = it->rule;
+            // Match on buf, which holds exactly *content_len bytes; re-reading
+            // *content_buf as a C string would depend on a NUL terminator.
+            std::string nbody = rule->test_patterns(vars, buf);
+            if (!nbody.empty())
+            {
+                *content_buf = odr_strdup(o, nbody.c_str());
+                *content_len = nbody.size();
+            }
         }
     }
 }
 
+
+void yf::HttpRewrite::Event::openTagStart(const char *name)
+{   // no-op stub; presumably called by HTMLParser when an opening tag starts
+}
+
+void yf::HttpRewrite::Event::anyTagEnd(const char *name)
+{   // no-op stub; presumably called by HTMLParser when any tag ends
+}
+
+void yf::HttpRewrite::Event::attribute(const char *tagName,
+                                         const char *name,
+                                         const char *value,
+                                         int val_len)
+{   // no-op stub; presumably called by HTMLParser once per tag attribute
+}
+
+
+void yf::HttpRewrite::Event::closeTag(const char *name)
+{   // no-op stub; presumably called by HTMLParser for a closing tag
+}
+
+void yf::HttpRewrite::Event::text(const char *value, int len)
+{   // no-op stub; presumably called by HTMLParser for text between tags
+}
+
+
 /**
  * Tests pattern from the vector in order and executes recipe on
  the first match.
  */
-const std::string yf::HttpRewrite::Rules::test_patterns(
+const std::string yf::HttpRewrite::Rule::test_patterns(
         std::map<std::string, std::string> & vars,
         const std::string & txt) const
 {
-    for (size_t i = 0; i < rules.size(); i++) 
+    // Try each Replace in list order; the first non-empty result wins.
+    for (std::list<Replace>::const_iterator rep = replace_list.begin();
+         rep != replace_list.end(); ++rep)
     {
-        std::string out = rules[i].search_replace(vars, txt);
+        std::string out = rep->search_replace(vars, txt);
         if (!out.empty()) return out;
     }
     return "";
 }
 
-const std::string yf::HttpRewrite::Rule::search_replace(
+const std::string yf::HttpRewrite::Replace::search_replace(
         std::map<std::string, std::string> & vars,
         const std::string & txt) const
 {
@@ -230,7 +291,7 @@ const std::string yf::HttpRewrite::Rule::search_replace(
             //check if the group is named
             std::map<int, std::string>::const_iterator it
                 = group_index.find(i);
-            if (it != group_index.end()) 
+            if (it != group_index.end())
             {   //it is
                 if (!what[i].str().empty())
                     vars[it->second] = what[i];
@@ -239,7 +300,7 @@ const std::string yf::HttpRewrite::Rule::search_replace(
         }
         //prepare replacement string
         std::string rvalue = sub_vars(vars);
-        yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'", 
+        yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
                 what.str(0).c_str(), rvalue.c_str());
         out.append(start, what[0].first);
         out.append(rvalue);
@@ -251,14 +312,16 @@ const std::string yf::HttpRewrite::Rule::search_replace(
     return out;
 }
 
-void yf::HttpRewrite::Rule::parse_groups()
+void yf::HttpRewrite::Replace::parse_groups()
 {
     int gnum = 0;
     bool esc = false;
     const std::string & str = regex;
+    std::string res;
     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
     for (size_t i = 0; i < str.size(); ++i)
     {
+        res += str[i];
         if (!esc && str[i] == '\\')
         {
             esc = true;
@@ -267,13 +330,15 @@ void yf::HttpRewrite::Rule::parse_groups()
         if (!esc && str[i] == '(') //group starts
         {
             gnum++;
-            if (i+1 < str.size() && str[i+1] == '?') //group with attrs 
+            if (i+1 < str.size() && str[i+1] == '?') //group with attrs
             {
-                i++; 
+                i++;
                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
                 {
                     if (gnum > 0) gnum--;
+                    res += str[i];
                     i++;
+                    res += str[i];
                     continue;
                 }
                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
@@ -286,18 +351,18 @@ void yf::HttpRewrite::Rule::parse_groups()
                     while (++i < str.size())
                     {
                         if (str[i] == '>') { term = true; break; }
-                        if (!isalnum(str[i])) 
+                        if (!isalnum(str[i]))
                             throw mp::filter::FilterException
                                 ("Only alphanumeric chars allowed, found "
-                                 " in '" 
-                                 + str 
-                                 + "' at " 
-                                 + boost::lexical_cast<std::string>(i)); 
+                                 " in '"
+                                 + str
+                                 + "' at "
+                                 + boost::lexical_cast<std::string>(i));
                         gname += str[i];
                     }
                     if (!term)
                         throw mp::filter::FilterException
-                            ("Unterminated group name '" + gname 
+                            ("Unterminated group name '" + gname
                              + " in '" + str +"'");
                     group_index[gnum] = gname;
                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
@@ -307,9 +372,10 @@ void yf::HttpRewrite::Rule::parse_groups()
         }
         esc = false;
     }
+    regex = res;
 }
 
-std::string yf::HttpRewrite::Rule::sub_vars (
+std::string yf::HttpRewrite::Replace::sub_vars (
         const std::map<std::string, std::string> & vars) const
 {
     std::string out;
@@ -329,7 +395,7 @@ std::string yf::HttpRewrite::Rule::sub_vars (
                 ++i;
                 std::string name;
                 bool term = false;
-                while (++i < in.size()) 
+                while (++i < in.size())
                 {
                     if (in[i] == '}') { term = true; break; }
                     name += in[i];
@@ -348,7 +414,7 @@ std::string yf::HttpRewrite::Rule::sub_vars (
             {
                 throw mp::filter::FilterException
                     ("Malformed or trimmed var ref in '"
-                     +in+"' at "+boost::lexical_cast<std::string>(i)); 
+                     +in+"' at "+boost::lexical_cast<std::string>(i));
             }
             continue;
         }
@@ -359,41 +425,82 @@ std::string yf::HttpRewrite::Rule::sub_vars (
     return out;
 }
 
-void yf::HttpRewrite::configure_rules(const xmlNode *ptr, 
-        Rules & rules)
+// Load the <rule>/<within> children of ptr into phase (throws on bad input).
+void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
 {
+    std::map<std::string, RulePtr > rules; // rules seen so far, keyed by name
     for (ptr = ptr->children; ptr; ptr = ptr->next)
     {
         if (ptr->type != XML_ELEMENT_NODE)
             continue;
-        else if (!strcmp((const char *) ptr->name, "rewrite"))
+        else if (!strcmp((const char *) ptr->name, "rule"))
         {
-            Rule rule;
-            const struct _xmlAttr *attr;
-            for (attr = ptr->properties; attr; attr = attr->next)
+            static const char *names[2] = { "name", 0 };
+            std::string values[1];
+            values[0] = "default"; // presumably kept when "name" is absent - TODO confirm parse_attr
+            mp::xml::parse_attr(ptr, names, values);
+
+            RulePtr rule(new Rule);
+            for (xmlNode *p = ptr->children; p; p = p->next)
             {
-                if (!strcmp((const char *) attr->name,  "from"))
-                    rule.regex = mp::xml::get_text(attr->children);
-                else if (!strcmp((const char *) attr->name,  "to"))
-                    rule.recipe = mp::xml::get_text(attr->children);
+                if (p->type != XML_ELEMENT_NODE)
+                    continue;
+                if (!strcmp((const char *) p->name, "rewrite"))
+                {
+                    Replace replace;
+                    const struct _xmlAttr *attr;
+                    for (attr = p->properties; attr; attr = attr->next)
+                    {
+                        if (!strcmp((const char *) attr->name,  "from"))
+                            replace.regex = mp::xml::get_text(attr->children);
+                        else if (!strcmp((const char *) attr->name,  "to"))
+                            replace.recipe = mp::xml::get_text(attr->children);
+                        else
+                            throw mp::filter::FilterException
+                                ("Bad attribute "
+                                 + std::string((const char *) attr->name)
+                                 + " in rewrite section of http_rewrite");
+                    }
+                    yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
+                            replace.regex.c_str(), replace.recipe.c_str());
+                    replace.parse_groups();
+                    if (!replace.regex.empty()) // entries with an empty regex are dropped
+                        rule->replace_list.push_back(replace);
+                }
                 else
                     throw mp::filter::FilterException
-                        ("Bad attribute "
-                         + std::string((const char *) attr->name)
-                         + " in rewrite section of http_rewrite");
+                        ("Bad element "
+                         + std::string((const char *) p->name)
+                         + " in http_rewrite filter");
             }
-            yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'", 
-                    rule.regex.c_str(), rule.recipe.c_str());
-            rule.parse_groups();
-            if (!rule.regex.empty())
-                rules.rules.push_back(rule);
+            if (!rule->replace_list.empty()) // a rule with no rewrites is not registered
+                rules[values[0]] = rule;
+        }
+        else if (!strcmp((const char *) ptr->name, "within"))
+        {
+            static const char *names[5] =
+                { "header", "attr", "tag", "rule", 0 };
+            std::string values[4];
+            mp::xml::parse_attr(ptr, names, values);
+            Within w;
+            w.header = values[0];
+            w.attr = values[1];
+            w.tag = values[2];
+            std::map<std::string,RulePtr>::const_iterator it =
+                rules.find(values[3]); // only rules registered earlier in this loop are found
+            if (it == rules.end())
+                throw mp::filter::FilterException
+                    ("Reference to non-existing rule '" + values[3] +
+                     "' in http_rewrite filter");
+            w.rule = it->second;
+            phase.within_list.push_back(w);
         }
         else
         {
             throw mp::filter::FilterException
-                ("Bad element o"
+                ("Bad element "
                  + std::string((const char *) ptr->name)
-                 + " in http_rewrite1 filter");
+                 + " in http_rewrite filter");
         }
     }
 }
@@ -407,11 +514,11 @@ void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
             continue;
         else if (!strcmp((const char *) ptr->name, "request"))
         {
-            configure_rules(ptr, *req_rules);
+            configure_phase(ptr, *req_phase);
         }
         else if (!strcmp((const char *) ptr->name, "response"))
         {
-            configure_rules(ptr, *res_rules);
+            configure_phase(ptr, *res_phase);
         }
         else
         {