cql_rpn: use path for cql2rpn file MP-481
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
index 97c0755..58243bf 100644 (file)
@@ -53,17 +53,20 @@ namespace metaproxy_1 {
         class HttpRewrite::Rule {
         public:
             std::list<Replace> replace_list;
-            const std::string test_patterns(
-                std::map<std::string, std::string> & vars,
-                const std::string & txt, bool anchor);
+            bool test_patterns(  // new contract: txt is overwritten with the result; the bool reports whether a pattern was applied
+                std::map<std::string, std::string> &vars,
+                std::string &txt, bool anchor);  // anchor seeds the first-match flag in the definition
         };
         class HttpRewrite::Within {
         public:
-            std::string header;
-            std::string attr;
-            std::string tag;
+            boost::regex header;  // matched with regex_match against HTTP header names; left empty when not configured
+            boost::regex attr;  // matched against attribute names, or against the literal #text for element text
+            boost::regex tag;  // matched against tag names; an empty tag matches any tag in Event::attribute
+            std::string type;  // either empty or quoted-literal; Content::configure rejects other values
             bool reqline;
             RulePtr rule;
+            bool exec(std::map<std::string, std::string> &vars,  // applies rule to txt in place, choosing the method from type
+                      std::string &txt, bool anchor) const;
         };
 
         class HttpRewrite::Content {
@@ -75,6 +78,8 @@ namespace metaproxy_1 {
                            std::map<std::string, RulePtr > &rules);
             void quoted_literal(std::string &content,
                                 std::map<std::string, std::string> &vars) const;
+            void parse(int verbose, std::string &content,
+                       std::map<std::string, std::string> & vars) const;
         };
         class HttpRewrite::Phase {
         public:
@@ -195,13 +200,11 @@ void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
     for (; it != cit->within_list.end(); it++)
         if (it->reqline)
         {
-            RulePtr rule = it->rule;
             yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
-            std::string npath = rule->test_patterns(vars, path, true);
-            if (!npath.empty())
+            if (it->exec(vars, path, true))
             {
-                yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
-                hreq->path = odr_strdup(o, npath.c_str());
+                yaz_log(YLOG_LOG, "Rewritten request URL is %s", path.c_str());
+                hreq->path = odr_strdup(o, path.c_str());
             }
         }
 }
@@ -223,28 +226,41 @@ void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
         std::list<Within>::const_iterator it = cit->within_list.begin();
         for (; it != cit->within_list.end(); it++)
         {
-            if (it->header.length() > 0 &&
-                yaz_strcasecmp(it->header.c_str(), header->name) == 0)
+            if (!it->header.empty() &&
+                regex_match(header->name, it->header))
             {
+#ifdef OLDHEADERMATCH                
+                // Matches and replaces the whole header line.
+                // This is good if you want to play with the header name too,
+                // but useless for patterns that want to anchor to the beginning
+                // or end of the header value, as we want to do with host-relative
+                // links. This code should probably be removed.
                 std::string sheader(header->name);
                 sheader += ": ";
                 sheader += header->value;
 
-                RulePtr rule = it->rule;
-                std::string out = rule->test_patterns(vars, sheader, true);
-                if (!out.empty())
+                if (it->exec(vars, sheader, true))
                 {
-                    size_t pos = out.find(": ");
+                    size_t pos = sheader.find(": ");
                     if (pos == std::string::npos)
                     {
                         yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
                         continue;
                     }
-                    header->name = odr_strdup(o, out.substr(0, pos).c_str());
-                    header->value = odr_strdup(o,
-                                               out.substr(pos + 2,
-                                                          std::string::npos).c_str());
+                    header->name = odr_strdup(
+                        o, sheader.substr(0, pos).c_str());
+                    header->value = odr_strdup(
+                        o, sheader.substr(pos + 2, std::string::npos).c_str());
+                }
+#else
+                // Match and replace only the header value
+                std::string hval(header->value);
+                if (it->exec(vars, hval, true))
+                {
+                    header->value = odr_strdup(o, hval.c_str());
                 }
+                    
+#endif
             }
         }
     }
@@ -257,6 +273,12 @@ void yf::HttpRewrite::Phase::rewrite_body(
     int *content_len,
     std::map<std::string, std::string> & vars) const
 {
+    if (*content_len == 0)
+        return;
+    if (!content_type) {
+        yaz_log(YLOG_LOG, "rewrite_body: null content_type, can not rewrite");
+        return;
+    }
     std::list<Content>::const_iterator cit = content_list.begin();
     for (; cit != content_list.end(); cit++)
     {
@@ -266,38 +288,23 @@ void yf::HttpRewrite::Phase::rewrite_body(
             && regex_match(content_type, cit->content_re))
             break;
     }
-    if (cit == content_list.end())
+    if (cit == content_list.end()) {
+        yaz_log(YLOG_LOG,"rewrite_body: No content rule matched %s, not rewriting",
+                content_type );  
         return;
+    }
 
-    if (*content_buf)
-    {
-        int i;
-        for (i = 0; i < *content_len; i++)
-            if ((*content_buf)[i] == 0)
-                return;  // binary content. skip
-
-        if (cit->type == "html")
-        {
-            HTMLParser parser;
-            Event ev(&*cit, vars);
-
-            parser.set_verbose(m_verbose);
-
-            std::string buf(*content_buf, *content_len);
-
-            parser.parse(ev, buf.c_str());
-            const char *res = ev.result();
-            *content_buf = odr_strdup(o, res);
-            *content_len = strlen(res);
+    int i;
+    for (i = 0; i < *content_len; i++)
+        if ((*content_buf)[i] == 0) {
+            yaz_log(YLOG_LOG,"rewrite_body: Looks like binary stuff, not rewriting");
+            return;  // binary content. skip
         }
-        if (cit->type == "quoted-literal")
-        {
-            std::string content(*content_buf, *content_len);
-            cit->quoted_literal(content, vars);
-            *content_buf = odr_strdup(o, content.c_str());
-            *content_len = strlen(*content_buf);
-        }
-    }
+
+    std::string content(*content_buf, *content_len);
+    cit->parse(m_verbose, content, vars);
+    *content_buf = odr_strdup(o, content.c_str());
+    *content_len = strlen(*content_buf);
 }
 
 yf::HttpRewrite::Event::Event(const Content *p,
@@ -326,19 +333,12 @@ void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len)
     std::list<Within>::const_iterator it = m_content->within_list.begin();
     for (; it != m_content->within_list.end(); it++)
     {
-        if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(),
-                                                   t.c_str()) == 0)
+        if (!it->tag.empty() && regex_match(t, it->tag))
         {
-            std::vector<std::string> attr;
-            boost::split(attr, it->attr, boost::is_any_of(","));
-            size_t i;
-            for (i = 0; i < attr.size(); i++)
+            if (!it->attr.empty() && regex_match("#text", it->attr))
             {
-                if (attr[i].compare("#text") == 0)
-                {
-                    s_within.push(it);
-                    return;
-                }
+                s_within.push(it);
+                return;
             }
         }
     }
@@ -353,7 +353,7 @@ void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len,
         {
             std::list<Within>::const_iterator it = s_within.top();
             std::string t(tag, tag_len);
-            if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
+            if (regex_match(t, it->tag))
                 s_within.pop();
         }
     }
@@ -373,19 +373,11 @@ void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len,
     for (; it != m_content->within_list.end(); it++)
     {
         std::string t(tag, tag_len);
-        if (it->tag.length() == 0 ||
-            yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
+        if (it->tag.empty() || regex_match(t, it->tag))
         {
             std::string a(attr, attr_len);
-            std::vector<std::string> attr;
-            boost::split(attr, it->attr, boost::is_any_of(","));
-            size_t i;
-            for (i = 0; i < attr.size(); i++)
-            {
-                if (attr[i].compare("#text") &&
-                    yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0)
-                    subst = true;
-            }
+            if (!it->attr.empty() && regex_match(a, it->attr))
+                subst = true;
         }
         if (subst)
             break;
@@ -401,13 +393,12 @@ void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len,
         std::string output;
         if (subst)
         {
-            std::string input(value, val_len);
-            output = it->rule->test_patterns(m_vars, input, true);
+            std::string s(value, val_len);
+            it->exec(m_vars, s, true);
+            wrbuf_puts(m_w, s.c_str());
         }
-        if (output.empty())
-            wrbuf_write(m_w, value, val_len);
         else
-            wrbuf_puts(m_w, output.c_str());
+            wrbuf_write(m_w, value, val_len);
         wrbuf_puts(m_w, sep);
     }
 }
@@ -418,7 +409,7 @@ void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len)
     {
         std::list<Within>::const_iterator it = s_within.top();
         std::string t(tag, tag_len);
-        if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
+        if (regex_match(t, it->tag))
             s_within.pop();
     }
     wrbuf_puts(m_w, "</");
@@ -430,22 +421,102 @@ void yf::HttpRewrite::Event::text(const char *value, int len)
     std::list<Within>::const_iterator it = m_content->within_list.end();
     if (!s_within.empty())
         it = s_within.top();
-    std::string output;
     if (it != m_content->within_list.end())
     {
-        std::string input(value, len);
-        output = it->rule->test_patterns(m_vars, input, false);
+        std::string s(value, len);
+        it->exec(m_vars, s, false);
+        wrbuf_puts(m_w, s.c_str());
     }
-    if (output.empty())
+    else
         wrbuf_write(m_w, value, len);
+}
+
+static bool embed_quoted_literal(  // Runs ruleptr on the body of every quoted literal found in content.
+    std::string &content,  // in/out: text to scan; overwritten with the rewritten text on return
+    std::map<std::string, std::string> &vars,  // forwarded unchanged to Rule::test_patterns
+    mp::filter::HttpRewrite::RulePtr ruleptr,  // rule applied with anchor=true to each literal body
+    bool html_context)  // when true, a pair of &quot; entities also delimits a literal
+{  // returns true if any test_patterns call returned true
+    bool replace = false;
+    std::string res;  // output, built up piece by piece
+    const char *cp = content.c_str();  // scan cursor
+    const char *cp0 = cp;  // start of the span that has not been copied to res yet
+    while (*cp)
+    {
+        if (html_context && !strncmp(cp, "&quot;", 6))
+        {
+            cp += 6;
+            res.append(cp0, cp - cp0);  // flush pending text including the opening entity
+            cp0 = cp;
+            while (*cp)  // advance to the closing entity or to end of line, whichever comes first
+            {
+                if (!strncmp(cp, "&quot;", 6))
+                    break;
+                if (*cp == '\n')
+                    break;
+                cp++;
+            }
+            if (!*cp)
+                break;  // input ended inside the literal: the tail is copied verbatim after the loop
+            std::string s(cp0, cp - cp0);
+            if (ruleptr->test_patterns(vars, s, true))
+                replace = true;
+            cp0 = cp;  // the terminator itself is copied by a later flush
+            res.append(s);
+        }
+        else if (*cp == '"' || *cp == '\'')
+        {
+            int m = *cp;  // the quote character that opened this literal
+            cp++;
+            res.append(cp0, cp - cp0);  // flush pending text including the opening quote
+            cp0 = cp;
+            while (*cp)  // advance to a matching quote not preceded by a backslash, or to end of line
+            {
+                if (cp[-1] != '\\' && *cp == m)
+                    break;
+                if (*cp == '\n')
+                    break;
+                cp++;
+            }
+            if (!*cp)
+                break;  // input ended inside the literal: the tail is copied verbatim after the loop
+            std::string s(cp0, cp - cp0);
+            if (ruleptr->test_patterns(vars, s, true))
+                replace = true;
+            cp0 = cp;  // the terminator itself is copied by a later flush
+            res.append(s);
+        }
+        else if (*cp == '/' && cp[1] == '/')
+        {
+            while (cp[1] && cp[1] != '\n')  // skip to the last character of the line so quotes after a double slash are not scanned
+                cp++;
+        }
+        cp++;
+    }
+    res.append(cp0, cp - cp0);  // copy whatever follows the last flush
+    content = res;
+    return replace;
+}
+
+bool yf::HttpRewrite::Within::exec(  // Applies this entry's rule to txt in place and returns the callee's result.
+    std::map<std::string, std::string> & vars,
+    std::string & txt, bool anchor) const
+{
+    if (type == "quoted-literal")
+    {
+        return embed_quoted_literal(txt, vars, rule, true);  // html_context is true here; anchor is not consulted on this path
+    }
     else
-        wrbuf_puts(m_w, output.c_str());
+    {
+        return rule->test_patterns(vars, txt, anchor);  // plain case: run the rule on the whole text
+    }
 }
 
-const std::string yf::HttpRewrite::Rule::test_patterns(
-        std::map<std::string, std::string> & vars,
-        const std::string & txt, bool anchor)
+bool yf::HttpRewrite::Rule::test_patterns(
+    std::map<std::string, std::string> & vars,
+    std::string & txt, bool anchor)
 {
+    bool replaces = false;
     bool first = anchor;
     std::string out;
     std::string::const_iterator start, end;
@@ -474,6 +545,7 @@ const std::string yf::HttpRewrite::Rule::test_patterns(
                 break;
         }
         first = false;
+        replaces = true;
         size_t i;
         for (i = 1; i < bit->what.size(); ++i)
         {
@@ -494,9 +566,9 @@ const std::string yf::HttpRewrite::Rule::test_patterns(
         out.append(rvalue);
         start = bit->what[0].second; //move search forward
     }
-    if (start != txt.begin())
-        out.append(start, end);
-    return out;
+    out.append(start, end);
+    txt = out;
+    return replaces;
 }
 
 void yf::HttpRewrite::Replace::parse_groups(std::string pattern)
@@ -617,53 +689,34 @@ yf::HttpRewrite::Phase::Phase() : m_verbose(0)
 {
 }
 
-void yf::HttpRewrite::Content::quoted_literal(
+void yf::HttpRewrite::Content::parse(  // Rewrites content in place according to this content entry's type.
+    int verbose,  // passed to HTMLParser::set_verbose
     std::string &content,
     std::map<std::string, std::string> &vars) const
 {
-    std::string res;
-    const char *cp = content.c_str();
-    const char *cp0 = cp;
-    while (*cp)
+    if (type == "html")
     {
-        if (*cp == '"' || *cp == '\'')
-        {
-            int m = *cp;
-            cp++;
-            res.append(cp0, cp - cp0);
-            cp0 = cp;
-            while (*cp)
-            {
-                if (cp[-1] != '\\' && *cp == m)
-                    break;
-                if (*cp == '\n')
-                    break;
-                cp++;
-            }
-            if (!*cp)
-                break;
-            std::list<Within>::const_iterator it = within_list.begin();
-            std::string s(cp0, cp - cp0);
-            if (it != within_list.end())
-            {
-                RulePtr rule = it->rule;
-                std::string r;
-                r = rule->test_patterns(vars, s, true);
-                if (!r.empty())
-                    s = r;
-            }
-            cp0 = cp;
-            res.append(s);
-        }
-        else if (*cp == '/' && cp[1] == '/')
-        {
-            while (cp[1] && cp[1] != '\n')
-                cp++;
-        }
-        cp++;
+        HTMLParser parser;
+        Event ev(this, vars);  // the event handler is bound to this Content and the caller's vars
+
+        parser.set_verbose(verbose);
+
+        parser.parse(ev, content.c_str());
+        content = ev.result();  // replace content with what the event handler produced
     }
-    res.append(cp0, cp - cp0);
-    content = res;
+    if (type == "quoted-literal")
+    {
+        quoted_literal(content, vars);
+    }
+}  // any other type value leaves content unchanged
+
+void yf::HttpRewrite::Content::quoted_literal(  // Rewrites quoted literals in content via embed_quoted_literal.
+    std::string &content,
+    std::map<std::string, std::string> &vars) const
+{
+    std::list<Within>::const_iterator it = within_list.begin();  // only the first within entry is consulted
+    if (it != within_list.end())
+        embed_quoted_literal(content, vars, it->rule, false);  // html_context is false; the bool result is discarded
 }
 
 void yf::HttpRewrite::Content::configure(
@@ -675,22 +728,65 @@ void yf::HttpRewrite::Content::configure(
             continue;
         if (!strcmp((const char *) ptr->name, "within"))
         {
-            static const char *names[6] =
-                { "header", "attr", "tag", "rule", "reqline", 0 };
-            std::string values[5];
+            static const char *names[7] =
+                { "header", "attr", "tag", "rule", "reqline", "type", 0 };
+            std::string values[6];
             mp::xml::parse_attr(ptr, names, values);
             Within w;
-            w.header = values[0];
-            w.attr = values[1];
-            w.tag = values[2];
-            std::map<std::string,RulePtr>::const_iterator it =
-                rules.find(values[3]);
-            if (it == rules.end())
+            if (values[0].length() > 0)
+                w.header.assign(values[0], boost::regex_constants::icase);
+            if (values[1].length() > 0)
+                w.attr.assign(values[1], boost::regex_constants::icase);
+            if (values[2].length() > 0)
+                w.tag.assign(values[2], boost::regex_constants::icase);
+
+            std::vector<std::string> rulenames;
+            boost::split(rulenames, values[3], boost::is_any_of(","));
+            if (rulenames.size() == 0)
+            {
                 throw mp::filter::FilterException
-                    ("Reference to non-existing rule '" + values[3] +
+                    ("Empty rule in '" + values[3] +
                      "' in http_rewrite filter");
-            w.rule = it->second;
+            }
+            else if (rulenames.size() == 1)
+            {
+                std::map<std::string,RulePtr>::const_iterator it =
+                    rules.find(rulenames[0]);
+                if (it == rules.end())
+                    throw mp::filter::FilterException
+                        ("Reference to non-existing rule '" + rulenames[0] +
+                         "' in http_rewrite filter");
+                w.rule = it->second;
+
+            }
+            else
+            {
+                RulePtr rule(new Rule);
+                size_t i;
+                for (i = 0; i < rulenames.size(); i++)
+                {
+                    std::map<std::string,RulePtr>::const_iterator it =
+                        rules.find(rulenames[i]);
+                    if (it == rules.end())
+                        throw mp::filter::FilterException
+                            ("Reference to non-existing rule '" + rulenames[i] +
+                             "' in http_rewrite filter");
+                    RulePtr subRule = it->second;
+                    std::list<Replace>::iterator rit =
+                        subRule->replace_list.begin();
+                    for (; rit != subRule->replace_list.end(); rit++)
+                        rule->replace_list.push_back(*rit);
+                }
+                w.rule = rule;
+            }
             w.reqline = values[4] == "1";
+            w.type = values[5];
+            if (w.type.empty() || w.type == "quoted-literal")
+                ;
+            else
+                throw mp::filter::FilterException
+                    ("within type must be quoted-literal or none in "
+                     " in http_rewrite filter");
             within_list.push_back(w);
         }
     }
@@ -771,8 +867,8 @@ void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
             Content c;
 
             c.type = values[0];
-            // if (!values[1].empty())
-                c.content_re = values[1];
+            if (!values[1].empty())
+                c.content_re.assign(values[1], boost::regex::icase);
             c.configure(ptr->children, rules);
             phase.content_list.push_back(c);
         }