Merge remote-tracking branch 'origin/master' into rewrite-filter
authorHeikki Levanto <heikki@indexdata.dk>
Thu, 13 Jun 2013 10:16:03 +0000 (12:16 +0200)
committerHeikki Levanto <heikki@indexdata.dk>
Thu, 13 Jun 2013 10:16:03 +0000 (12:16 +0200)
src/.gitignore
src/Makefile.am
src/factory_static.cpp
src/filter_http_rewrite.cpp [new file with mode: 0644]
src/filter_http_rewrite.hpp [new file with mode: 0644]
src/html_parser.cpp [new file with mode: 0644]
src/html_parser.hpp [new file with mode: 0644]
src/test_filter_rewrite.cpp [new file with mode: 0644]
src/test_html_parser.cpp [new file with mode: 0644]

index caad553..a9d9820 100644 (file)
@@ -1,39 +1,5 @@
-.libs
-.deps
-*.lo
-*.la
-stamp-h*
+*
+!*.hpp
+!*.cpp
+!*.am
 config.hpp
-socket
-Makefile
-Makefile.in
-config.hpp.in
-ex_filter_frontend_net
-ex_router_flexml
-test_boost_threads
-test_boost_time
-test_filter_auth_simple
-test_filter1
-test_filter2
-test_filter_frontend_net
-test_filter_log
-test_filter_multi
-test_filter_query_rewrite
-test_package1
-test_pipe
-test_thread_pool_observer
-test_session1
-test_session2
-test_filter_factory
-test_filter_z3950_client
-test_filter_backend_test
-test_filter_virt_db
-test_router_flexml
-test_ses_map
-tstdl
-metaproxy
-test_filter_bounce
-test_filter_record_transform
-test_filter_sru_to_z3950
-*.o
-metaproxy-config
index 564f7b8..42a9577 100644 (file)
@@ -21,6 +21,7 @@ filter_src = \
        filter_http_client.cpp filter_http_client.hpp \
        filter_http_file.cpp filter_http_file.hpp \
        filter_http_rewrite1.cpp filter_http_rewrite1.hpp \
+       filter_http_rewrite.cpp filter_http_rewrite.hpp \
        filter_limit.cpp filter_limit.hpp \
        filter_load_balance.cpp filter_load_balance.hpp \
        filter_log.cpp filter_log.hpp \
@@ -57,6 +58,7 @@ libmetaproxy_la_SOURCES = \
        torus.cpp torus.hpp \
        url_recipe.cpp \
        util.cpp \
+       html_parser.hpp html_parser.cpp \
        router_chain.cpp \
         router_flexml.hpp router_flexml.cpp \
        router_xml.cpp \
@@ -109,8 +111,10 @@ check_PROGRAMS = \
        test_filter_record_transform \
        test_filter_sru_to_z3950 \
        test_filter_virt_db \
+       test_filter_rewrite \
        test_ses_map \
        test_router_flexml \
+       test_html_parser \
        test_xmlutil
 
 TESTS=$(check_PROGRAMS)
@@ -139,6 +143,9 @@ test_ses_map_SOURCES = test_ses_map.cpp
 test_router_flexml_SOURCES = test_router_flexml.cpp
 test_xmlutil_SOURCES = test_xmlutil.cpp
 
+test_html_parser_SOURCES = test_html_parser.cpp $(filter_src)
+test_filter_rewrite_SOURCES = test_filter_rewrite.cpp $(filter_src)
+
 # doxygen target
 dox:
        (cd $(top_srcdir) ; make dox) 
index 3d88a82..95d0a62 100644 (file)
@@ -54,6 +54,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #include "filter_z3950_client.hpp"
 #include "filter_zeerex_explain.hpp"
 #include "filter_zoom.hpp"
+#include "filter_http_rewrite.hpp"
 
 namespace mp = metaproxy_1;
 
@@ -87,6 +88,7 @@ mp::FactoryStatic::FactoryStatic()
         &metaproxy_1_filter_z3950_client,
         &metaproxy_1_filter_zeerex_explain,
         &metaproxy_1_filter_zoom,
+        &metaproxy_1_filter_http_rewrite,
         0
     };
     int i;
diff --git a/src/filter_http_rewrite.cpp b/src/filter_http_rewrite.cpp
new file mode 100644 (file)
index 0000000..55ae35c
--- /dev/null
@@ -0,0 +1,448 @@
+/* This file is part of Metaproxy.
+   Copyright (C) 2005-2013 Index Data
+
+Metaproxy is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include "config.hpp"
+#include <metaproxy/filter.hpp>
+#include <metaproxy/package.hpp>
+#include <metaproxy/util.hpp>
+#include "filter_http_rewrite.hpp"
+
+#include <yaz/zgdu.h>
+#include <yaz/log.h>
+
+#include <boost/regex.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include <vector>
+#include <map>
+
+#if HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+
+namespace mp = metaproxy_1;
+namespace yf = mp::filter;
+
+namespace metaproxy_1 {
+    namespace filter {
+        // Describes where a rule is meant to apply (tags, attributes,
+        // content type).
+        // NOTE(review): no code visible in this part of the file reads
+        // these fields yet; presumably groundwork for scoped rewriting.
+        class HttpRewrite::RuleScope {
+        public:
+            std::vector<std::string> tags;
+            std::vector<std::string> attrs;
+            std::string content_type;
+        };
+        // One rewrite rule: a regular expression ("from") and a
+        // replacement recipe ("to") that may reference ${name} variables.
+        class HttpRewrite::Rule {
+        public:
+            enum Section { METHOD, HEADER, BODY };
+            // pattern searched for in the text
+            std::string regex;
+            // replacement template, expanded by sub_vars()
+            std::string recipe;
+            // capture group number -> group name, filled by parse_groups()
+            std::map<int, std::string> group_index;
+            std::vector<RuleScope> scopes;
+            Section section;
+            // Replaces every match of regex in txt with the expanded recipe;
+            // named groups that matched non-empty text are stored in vars.
+            // Returns the rewritten text, or "" when nothing matched.
+            const std::string search_replace(
+                std::map<std::string, std::string> & vars,
+                const std::string & txt) const;
+            // Expands ${name} references in recipe using vars.
+            std::string sub_vars (
+                const std::map<std::string, std::string> & vars) const;
+            // Scans regex for named groups and records them in group_index.
+            void parse_groups();
+        };
+        // Ordered list of rules applied to one direction of traffic
+        // (request or response).
+        class HttpRewrite::Rules {
+        public:
+            std::vector<Rule> rules;
+            // Rewrites the URL of the request line.
+            void rewrite_reqline (mp::odr & o, Z_HTTP_Request *hreq,
+                std::map<std::string, std::string> & vars) const;
+            // Rewrites each "Name: value" header of the list in place.
+            void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
+                std::map<std::string, std::string> & vars) const;
+            // Rewrites the message body and updates its length.
+            void rewrite_body (mp::odr & o, 
+                char **content_buf, int *content_len,
+                std::map<std::string, std::string> & vars) const;
+            // Returns the result of the first rule that matches txt,
+            // or "" when no rule matches.
+            const std::string test_patterns(
+                std::map<std::string, std::string> & vars,
+                const std::string & txt) const;
+        };
+    }
+}
+
+// Creates the filter with two empty rule sets: one for requests and one
+// for responses. They are filled in by configure().
+yf::HttpRewrite::HttpRewrite() : req_rules(new Rules), res_rules(new Rules)
+{
+}
+
+// Defined here (not in the header) so that the scoped_ptr members are
+// destroyed where Rules is a complete type.
+yf::HttpRewrite::~HttpRewrite()
+{
+}
+
+/**
+ * Applies the request rules to an HTTP request, passes the package on to
+ * the next filter, then applies the response rules to the HTTP response.
+ * Variables captured while rewriting the request stay available when the
+ * response is rewritten. Non-HTTP packages are only passed on.
+ */
+void yf::HttpRewrite::process(mp::Package & package) const 
+{
+    yaz_log(YLOG_LOG, "HttpRewrite begins....");
+    //named-group values shared between request and response rewriting
+    std::map<std::string, std::string> vars;
+
+    Z_GDU *req_gdu = package.request().get();
+    if (req_gdu && req_gdu->which == Z_GDU_HTTP_Request)
+    {
+        mp::odr o;
+        Z_HTTP_Request *request = req_gdu->u.HTTP_Request;
+        req_rules->rewrite_reqline(o, request, vars);
+        yaz_log(YLOG_LOG, ">> Request headers");
+        req_rules->rewrite_headers(o, request->headers, vars);
+        req_rules->rewrite_body(o, &request->content_buf,
+                &request->content_len, vars);
+        //store the modified GDU while the ODR memory is still alive
+        package.request() = req_gdu;
+    }
+
+    package.move();
+
+    Z_GDU *res_gdu = package.response().get();
+    if (res_gdu && res_gdu->which == Z_GDU_HTTP_Response)
+    {
+        Z_HTTP_Response *response = res_gdu->u.HTTP_Response;
+        yaz_log(YLOG_LOG, "Response code %d", response->code);
+        mp::odr o;
+        yaz_log(YLOG_LOG, "<< Respose headers");
+        res_rules->rewrite_headers(o, response->headers, vars);
+        res_rules->rewrite_body(o, &response->content_buf,
+                &response->content_len, vars);
+        package.response() = res_gdu;
+    }
+}
+
+/**
+ * Rewrites the URL of the HTTP request line.
+ *
+ * The rules are matched against an absolute URL. If the path is not
+ * already absolute it is prefixed with "http://" and the value of the
+ * Host header. When no rule matches, or when a relative path comes
+ * without a Host header, the request line is left untouched.
+ *
+ * @param o    ODR memory that holds the rewritten path
+ * @param hreq request whose path is rewritten in place
+ * @param vars named-group values, updated by matching rules
+ */
+void yf::HttpRewrite::Rules::rewrite_reqline (mp::odr & o, 
+        Z_HTTP_Request *hreq,
+        std::map<std::string, std::string> & vars) const 
+{
+    std::string path;
+    if (strncmp(hreq->path, "http://", 7) == 0)
+    {
+        yaz_log(YLOG_LOG, "Path in the method line is absolute, " 
+            "possibly a proxy request");
+        path += hreq->path;
+    }
+    else
+    {
+        //the lookup returns NULL when the header is absent; appending
+        //NULL to a std::string is undefined behaviour
+        const char *host = z_HTTP_header_lookup(hreq->headers, "Host");
+        if (!host)
+        {
+            yaz_log(YLOG_LOG, "No Host header in request, "
+                "request line is not rewritten");
+            return;
+        }
+        //TODO what about proto
+        path += "http://";
+        path += host;
+        path += hreq->path; 
+    }
+    yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
+    std::string npath = 
+        test_patterns(vars, path);
+    if (!npath.empty())
+    {
+        yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
+        hreq->path = odr_strdup(o, npath.c_str());
+    }
+}
+
+/**
+ * Runs the rules over every header of the list. Each header is presented
+ * to the rules as "Name: value"; a rewritten line is split again at the
+ * first ": " and stored back into the header. A rewritten line without
+ * ": " is logged and the header is left unchanged.
+ */
+void yf::HttpRewrite::Rules::rewrite_headers(mp::odr & o, 
+        Z_HTTP_Header *headers,
+        std::map<std::string, std::string> & vars) const 
+{
+    for (Z_HTTP_Header *hdr = headers; hdr; hdr = hdr->next)
+    {
+        std::string line(hdr->name);
+        line.append(": ");
+        line.append(hdr->value);
+        yaz_log(YLOG_LOG, "%s: %s", hdr->name, hdr->value);
+
+        const std::string rewritten = test_patterns(vars, line);
+        if (rewritten.empty())
+            continue; //no rule matched this header
+
+        const size_t sep = rewritten.find(": ");
+        if (sep == std::string::npos)
+        {
+            yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
+            continue;
+        }
+        const std::string new_name = rewritten.substr(0, sep);
+        const std::string new_value = rewritten.substr(sep + 2);
+        hdr->name = odr_strdup(o, new_name.c_str());
+        hdr->value = odr_strdup(o, new_value.c_str());
+    }
+}
+
+/**
+ * Runs the rules over the message body.
+ *
+ * The body is read using content_len rather than up to the first NUL
+ * byte, and the rewritten body is copied with its full length, so the
+ * buffer and the reported length always agree.
+ *
+ * @param o           ODR memory that holds the rewritten body
+ * @param content_buf in/out: body buffer, replaced when a rule matches
+ * @param content_len in/out: body length, updated together with the buffer
+ * @param vars        named-group values, updated by matching rules
+ */
+void yf::HttpRewrite::Rules::rewrite_body (mp::odr & o, 
+        char **content_buf, 
+        int *content_len,
+        std::map<std::string, std::string> & vars) const 
+{
+    if (!*content_buf || *content_len <= 0)
+        return; //no body to rewrite
+
+    const std::string body(*content_buf, *content_len);
+    const std::string nbody = test_patterns(vars, body);
+    if (nbody.empty())
+        return; //no rule matched
+
+    char *buf = (char *) odr_malloc(o, nbody.size() + 1);
+    nbody.copy(buf, nbody.size());
+    buf[nbody.size()] = '\0'; //keep the buffer usable as a C string
+    *content_buf = buf;
+    *content_len = nbody.size();
+}
+
+/**
+ * Tries the rules in configuration order and returns the output of the
+ * first rule that produces a non-empty result. Returns an empty string
+ * when no rule matches.
+ */
+const std::string yf::HttpRewrite::Rules::test_patterns(
+        std::map<std::string, std::string> & vars,
+        const std::string & txt) const
+{
+    std::vector<Rule>::const_iterator rule = rules.begin();
+    for (; rule != rules.end(); ++rule)
+    {
+        const std::string rewritten = rule->search_replace(vars, txt);
+        if (rewritten.empty())
+            continue;
+        return rewritten;
+    }
+    return "";
+}
+
+/**
+ * Replaces every match of the rule's regex in txt with the expanded
+ * recipe.
+ *
+ * For each match, named groups that captured non-empty text are stored
+ * in vars before the recipe is expanded, so the recipe can refer to
+ * groups of this match as well as to values captured earlier.
+ *
+ * A zero-length match (for example from "a*") does not move the search
+ * position on its own, so one input character is copied to the output
+ * to guarantee the loop terminates.
+ *
+ * @param vars named-group values, read and updated
+ * @param txt  text to search
+ * @return the rewritten text, or "" when the regex did not match
+ */
+const std::string yf::HttpRewrite::Rule::search_replace(
+        std::map<std::string, std::string> & vars,
+        const std::string & txt) const
+{
+    boost::regex re(regex);
+    boost::smatch what;
+    std::string::const_iterator start = txt.begin();
+    const std::string::const_iterator end = txt.end();
+    std::string out;
+    bool matched = false;
+    while (regex_search(start, end, what, re)) //find next full match
+    {
+        matched = true;
+        for (unsigned i = 1; i < what.size(); ++i)
+        {
+            //check if the group is named
+            std::map<int, std::string>::const_iterator it
+                = group_index.find(i);
+            if (it != group_index.end() && !what[i].str().empty())
+                vars[it->second] = what[i];
+        }
+        //prepare replacement string
+        std::string rvalue = sub_vars(vars);
+        yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'", 
+                what.str(0).c_str(), rvalue.c_str());
+        out.append(start, what[0].first);
+        out.append(rvalue);
+        start = what[0].second; //move search forward
+        if (what[0].first == what[0].second)
+        {
+            //zero-length match: step over one char or we loop forever
+            if (start == end)
+                break;
+            out += *start;
+            ++start;
+        }
+    }
+    //if we had a match cat the last part
+    if (matched)
+        out.append(start, end);
+    return out;
+}
+
+/**
+ * Scans the rule's regex and records the number of every named capture
+ * group in group_index, so matches can be stored by name.
+ *
+ * Counted as capturing: plain "(" groups and the named forms "(?<name>"
+ * and "(?P<name>". Every other "(?" construct (non-capturing "(?:",
+ * look-around "(?=", "(?!", "(?<=", "(?<!", inline modifiers, ...) does
+ * not capture and is not counted. Backslash-escaped parentheses are
+ * ignored.
+ *
+ * NOTE(review): a literal '(' inside a character class such as "[(]" is
+ * still counted as a group; escape it as "\(" in rules.
+ *
+ * @throws FilterException if a group name is unterminated or contains a
+ *         character that is not alphanumeric
+ */
+void yf::HttpRewrite::Rule::parse_groups()
+{
+    int gnum = 0;
+    bool esc = false;
+    const std::string & str = regex;
+    yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
+    for (unsigned i = 0; i < str.size(); ++i)
+    {
+        if (!esc && str[i] == '\\')
+        {
+            esc = true;
+            continue;
+        }
+        if (!esc && str[i] == '(') //group starts
+        {
+            if (i+1 >= str.size() || str[i+1] != '?')
+            {
+                gnum++; //plain capturing group
+            }
+            else
+            {
+                //'(?' construct: find out whether it is a named group
+                unsigned j = i + 2;
+                if (j < str.size() && str[j] == 'P') //optional, python
+                    j++;
+                bool named = j < str.size() && str[j] == '<';
+                //'(?<=' and '(?<!' are look-behind, not names
+                if (named && j+1 < str.size()
+                    && (str[j+1] == '=' || str[j+1] == '!'))
+                    named = false;
+                if (!named)
+                {
+                    i++; //skip '?', the group does not capture
+                }
+                else
+                {
+                    gnum++;
+                    i = j; //at '<'
+                    std::string gname;
+                    bool term = false;
+                    while (++i < str.size())
+                    {
+                        if (str[i] == '>') { term = true; break; }
+                        //cast: isalnum is undefined for negative values
+                        if (!isalnum((unsigned char) str[i])) 
+                            throw mp::filter::FilterException
+                                ("Only alphanumeric chars allowed, found '"
+                                 + std::string(1, str[i])
+                                 + "' in '" 
+                                 + str 
+                                 + "' at " 
+                                 + boost::lexical_cast<std::string>(i)); 
+                        gname += str[i];
+                    }
+                    if (!term)
+                        throw mp::filter::FilterException
+                            ("Unterminated group name '" + gname 
+                             + "' in '" + str + "'");
+                    group_index[gnum] = gname;
+                    yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
+                            gname.c_str(), gnum);
+                }
+            }
+        }
+        esc = false;
+    }
+}
+
+/**
+ * Expands the rule's recipe: every ${name} reference is replaced with
+ * the value stored under that name in vars, or with nothing when the
+ * name is unknown. A backslash makes the following character literal,
+ * so "\$" yields a plain '$'.
+ *
+ * @throws FilterException for a '$' that is not followed by '{', or for
+ *         a "${" without a closing '}'
+ */
+std::string yf::HttpRewrite::Rule::sub_vars (
+        const std::map<std::string, std::string> & vars) const
+{
+    const std::string & tpl = recipe;
+    std::string result;
+    bool escaped = false;
+    unsigned pos = 0;
+    while (pos < tpl.size())
+    {
+        const char ch = tpl[pos];
+        if (!escaped && ch == '\\')
+        {
+            //remember the escape, drop the backslash itself
+            escaped = true;
+            ++pos;
+            continue;
+        }
+        if (escaped || ch != '$')
+        {
+            //ordinary (or escaped) character
+            result += ch;
+            escaped = false;
+            ++pos;
+            continue;
+        }
+        //unescaped '$': must start a ${name} reference
+        if (pos+1 >= tpl.size() || tpl[pos+1] != '{')
+        {
+            throw mp::filter::FilterException
+                ("Malformed or trimmed var ref in '"
+                 +tpl+"' at "+boost::lexical_cast<std::string>(pos)); 
+        }
+        ++pos; //now at '{'
+        std::string name;
+        bool closed = false;
+        while (++pos < tpl.size()) 
+        {
+            if (tpl[pos] == '}')
+            {
+                closed = true;
+                break;
+            }
+            name += tpl[pos];
+        }
+        if (!closed)
+            throw mp::filter::FilterException
+                ("Unterminated var ref in '"+tpl+"' at "
+                 + boost::lexical_cast<std::string>(pos));
+        std::map<std::string, std::string>::const_iterator found
+            = vars.find(name);
+        if (found != vars.end())
+            result += found->second;
+        ++pos; //step past '}'
+    }
+    return result;
+}
+
+/**
+ * Reads the <rewrite from="regex" to="recipe"/> children of a <request>
+ * or <response> element and appends them to rules.
+ *
+ * A <rewrite> element with a missing or empty "from" is not added.
+ *
+ * @param ptr   the <request> or <response> element
+ * @param rules rule set that receives the parsed rules
+ * @throws FilterException for an unknown attribute or child element, or
+ *         for a malformed group name in "from"
+ */
+void yf::HttpRewrite::configure_rules(const xmlNode *ptr, 
+        Rules & rules)
+{
+    for (ptr = ptr->children; ptr; ptr = ptr->next)
+    {
+        if (ptr->type != XML_ELEMENT_NODE)
+            continue;
+        else if (!strcmp((const char *) ptr->name, "rewrite"))
+        {
+            Rule rule;
+            const struct _xmlAttr *attr;
+            for (attr = ptr->properties; attr; attr = attr->next)
+            {
+                if (!strcmp((const char *) attr->name,  "from"))
+                    rule.regex = mp::xml::get_text(attr->children);
+                else if (!strcmp((const char *) attr->name,  "to"))
+                    rule.recipe = mp::xml::get_text(attr->children);
+                else
+                    throw mp::filter::FilterException
+                        ("Bad attribute "
+                         + std::string((const char *) attr->name)
+                         + " in rewrite section of http_rewrite");
+            }
+            yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'", 
+                    rule.regex.c_str(), rule.recipe.c_str());
+            rule.parse_groups();
+            if (!rule.regex.empty())
+                rules.rules.push_back(rule);
+        }
+        else
+        {
+            //name the filter as it is registered: "http_rewrite"
+            throw mp::filter::FilterException
+                ("Bad element "
+                 + std::string((const char *) ptr->name)
+                 + " in http_rewrite filter");
+        }
+    }
+}
+
+/**
+ * Configures the filter from its XML element. The <request> child holds
+ * the rules applied to requests, the <response> child those applied to
+ * responses.
+ *
+ * @param ptr       the filter's configuration element
+ * @param test_only not used by this filter
+ * @param path      not used by this filter
+ * @throws FilterException for any other child element
+ */
+void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
+        const char *path)
+{
+    for (ptr = ptr->children; ptr; ptr = ptr->next)
+    {
+        if (ptr->type != XML_ELEMENT_NODE)
+            continue;
+        else if (!strcmp((const char *) ptr->name, "request"))
+        {
+            configure_rules(ptr, *req_rules);
+        }
+        else if (!strcmp((const char *) ptr->name, "response"))
+        {
+            configure_rules(ptr, *res_rules);
+        }
+        else
+        {
+            //name the filter as it is registered: "http_rewrite"
+            throw mp::filter::FilterException
+                ("Bad element "
+                 + std::string((const char *) ptr->name)
+                 + " in http_rewrite filter");
+        }
+    }
+}
+
+// Factory function referenced by the filter descriptor below: returns a
+// new, unconfigured HttpRewrite filter.
+static mp::filter::Base* filter_creator()
+{
+    return new mp::filter::HttpRewrite;
+}
+
+// Filter descriptor listed in the static filter factory; it registers
+// this filter under the type name "http_rewrite".
+extern "C" {
+    struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
+        0,
+        "http_rewrite",
+        filter_creator
+    };
+}
+
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+
diff --git a/src/filter_http_rewrite.hpp b/src/filter_http_rewrite.hpp
new file mode 100644 (file)
index 0000000..d611142
--- /dev/null
@@ -0,0 +1,59 @@
+/* This file is part of Metaproxy.
+   Copyright (C) 2005-2013 Index Data
+
+Metaproxy is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#ifndef FILTER_HTTP_REWRITE_HPP
+#define FILTER_HTTP_REWRITE_HPP
+
+#include <metaproxy/filter.hpp>
+#include <boost/scoped_ptr.hpp>
+
+namespace mp = metaproxy_1;
+
+namespace metaproxy_1 {
+    namespace filter {
+        // Filter that rewrites HTTP requests and responses (request
+        // line, headers and body) using regex based rules.
+        class HttpRewrite : public Base {
+            // implementation classes, defined in filter_http_rewrite.cpp
+            class Rules;
+            class Rule;
+            class RuleScope;
+            // rules applied to requests
+            boost::scoped_ptr<Rules> req_rules;
+            // rules applied to responses
+            boost::scoped_ptr<Rules> res_rules;
+            // reads the <rewrite> children of ptr into rules
+            void configure_rules(const xmlNode *ptr, Rules & rules);
+        public:
+            HttpRewrite();
+            ~HttpRewrite();
+            // rewrites the request, moves the package on, then rewrites
+            // the response
+            void process(metaproxy_1::Package & package) const;
+            // reads the <request> and <response> sections of the filter
+            // configuration
+            void configure(const xmlNode * ptr, 
+                    bool test_only, const char *path);
+        };
+    }
+}
+
+extern "C" {
+    extern struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite;
+}
+
+#endif
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+
diff --git a/src/html_parser.cpp b/src/html_parser.cpp
new file mode 100644 (file)
index 0000000..8d91a2c
--- /dev/null
@@ -0,0 +1,248 @@
+/* This file is part of Metaproxy.
+   Copyright (C) 2005-2013 Index Data
+
+Metaproxy is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include "config.hpp"
+#include "html_parser.hpp"
+
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <stdio.h>
+
+#define TAG_MAX_LEN 64
+
+#define SPACECHR " \t\r\n\f"
+
+#define DEBUG(x) x
+
+#if HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+
+namespace mp = metaproxy_1;
+
+// The parser keeps no state of its own; construction does nothing.
+mp::HTMLParser::HTMLParser()
+{
+}
+
+// Nothing to release.
+mp::HTMLParser::~HTMLParser()
+{
+}
+
+// Forward declaration; the implementation is at the end of this file.
+static void parse_str(mp::HTMLParserEvent & event, const char * str);
+
+// Parses the NUL-terminated string str and reports what is found
+// (tags, attributes, text) through the callbacks of event.
+void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const
+{
+    parse_str(event, str);
+}
+
+// Static C-style helper functions follow; it would probably make sense to wrap these in a PIMPL.
+
+// Returns a malloc'ed, NUL-terminated copy of the first len bytes of
+// buff. The caller owns the result and must free() it.
+static char* dupe (const char *buff, int len)
+{
+    char *copy = (char *) malloc (len + 1);
+    assert (copy);
+    copy[len] = '\0';
+    memcpy (copy, buff, len);
+    return copy;
+}
+
+// Returns the number of leading whitespace characters (any of SPACECHR)
+// at cp; stops at the terminating NUL.
+static int skipSpace (const char *cp)
+{
+    const char *p = cp;
+    for (; *p && strchr (SPACECHR, *p); ++p)
+        ;
+    return (int) (p - cp);
+}
+
+// Reads a tag or attribute name at cp: everything up to whitespace,
+// '/', '>', '=' or the end of the string.
+// The lower-cased name is stored in dst, which must have room for
+// TAG_MAX_LEN bytes; longer names are truncated in dst but still
+// skipped completely.
+// Returns the number of characters the name occupies in cp.
+static int skipName (const char *cp, char *dst)
+{
+    int i;
+    int j = 0;
+    for (i = 0; cp[i] && !strchr (SPACECHR "/>=", cp[i]); i++)
+    {
+        //cast: tolower is undefined for negative char values
+        if (j < TAG_MAX_LEN-1)
+            dst[j++] = (char) tolower ((unsigned char) cp[i]);
+    }
+    dst[j] = '\0';
+    return i;
+}
+
+// Reads one attribute (name, optionally followed by =value) at cp.
+//
+// name receives the lower-cased attribute name (see skipName).
+// *value is set to the start of the value inside cp, or to NULL when
+// the attribute has no value; *val_len is set to the length of the
+// value, or to 0 when there is none. The value may be enclosed in
+// single or double quotes, which are not part of the reported value.
+//
+// Returns the number of characters consumed, including trailing
+// whitespace. When cp does not start with a name, only leading
+// whitespace is consumed and name is empty.
+static int skipAttribute (const char *cp, char *name, const char **value, int *val_len)
+{
+    int i = skipName (cp, name);   
+    *value = NULL;
+    *val_len = 0; //callers read this even when there is no value
+    if (!i)
+        return skipSpace (cp);
+    i += skipSpace (cp + i);
+    if (cp[i] == '=')
+    {
+        int v0, v1;
+        i++;
+        i += skipSpace (cp + i);
+        if (cp[i] == '\"' || cp[i] == '\'')
+        {
+            //quoted value: runs to the matching quote or end of string
+            char tr = cp[i];
+            v0 = ++i;
+            while (cp[i] != tr && cp[i])
+                i++; 
+            v1 = i;
+            if (cp[i])
+                i++;
+        }
+        else
+        {
+            //unquoted value: runs to whitespace, '>' or end of string
+            v0 = i;
+            while (cp[i] && !strchr (SPACECHR ">", cp[i]))
+                i++;
+            v1 = i;
+        }
+        *value = cp + v0;
+        *val_len = v1 - v0;
+    }
+    i += skipSpace (cp + i);
+    return i;
+}
+
+// Reads the attributes of an opening tag, starting right after the tag
+// name, and reports each one through event.attribute(). Stops at '>' or
+// at the end of the string.
+//
+// An attribute without a value is reported with value NULL and length
+// 0. Stray characters such as '/' or '=' are skipped, and nothing is
+// reported when only whitespace was consumed.
+//
+// Returns the number of characters consumed.
+static int tagAttrs (mp::HTMLParserEvent & event, 
+                     const char *tagName,
+                     const char *cp)
+{
+    char attr_name[TAG_MAX_LEN];
+    int i = skipSpace (cp);
+    while (cp[i] && cp[i] != '>')
+    {
+        const char *attr_value = NULL;
+        int val_len = 0;
+        int nor = skipAttribute (cp+i, attr_name, &attr_value, &val_len);
+        if (!nor)
+        {
+            i++; //not a name and not whitespace: step over it
+            continue;
+        }
+        i += nor;
+        if (!attr_name[0])
+            continue; //only whitespace was consumed
+        //print straight from the input, no temporary copy to leak
+        DEBUG(printf ("------ attr %s=%.*s\n", attr_name, val_len, attr_value ? attr_value : ""));
+        event.attribute(tagName, attr_name, attr_value, val_len);
+    }
+    return i;
+}
+
+// Reads the tag name at cp into tagName and reports the start of the
+// tag according to which: '/' is a closing tag, '!' a DTD/declaration,
+// '?' a processing instruction, anything else an opening tag. Only
+// closing and opening tags are reported to event.
+// Returns the number of characters the name occupies.
+static int tagStart (mp::HTMLParserEvent & event,
+        char *tagName, const char *cp, const char which)
+{
+    const int name_len = skipName (cp, tagName);
+    if (which == '/')
+    {
+        DEBUG(printf ("------ tag close %s\n", tagName));
+        event.closeTag(tagName);
+    }
+    else if (which == '!')
+    {
+        DEBUG(printf ("------ dtd %s\n", tagName)); 
+    }
+    else if (which == '?')
+    {
+        DEBUG(printf ("------ pi %s\n", tagName)); 
+    }
+    else
+    {
+        DEBUG(printf ("------ tag open %s\n", tagName));
+        event.openTagStart(tagName);
+    }
+    return name_len;
+}
+
+// Skips to the '>' that ends the current tag and reports it through
+// event.anyTagEnd(). Returns the number of characters consumed,
+// including the '>'. When the string ends before a '>' is found,
+// nothing is reported.
+static int tagEnd (mp::HTMLParserEvent & event, const char *tagName, const char *cp)
+{
+    int pos;
+    for (pos = 0; cp[pos]; pos++)
+    {
+        if (cp[pos] == '>')
+        {
+            event.anyTagEnd(tagName);
+            return pos + 1;
+        }
+    }
+    return pos;
+}
+
+// Reports the text between text_start and text_end (exclusive) through
+// event.text(); does nothing for an empty range.
+static void tagText (mp::HTMLParserEvent & event, const char *text_start, const char *text_end)
+{
+    const int len = (int) (text_end - text_start);
+    if (len) //got text to flush
+    {
+        //print straight from the input, no temporary copy to leak
+        DEBUG(printf ("------ text %.*s\n", len, text_start));
+        event.text(text_start, len);
+    }
+}
+
+// Walks the NUL-terminated string cp and reports its structure to
+// event: text runs, opening tags with their attributes, closing tags
+// and the end of every tag. A '<' that is followed by whitespace (or by
+// the end of the string) is treated as ordinary text.
+static void parse_str (mp::HTMLParserEvent & event, const char *cp)
+{
+    //[text_start, text_end) is the text collected since the last tag
+    const char *text_start = cp;
+    const char *text_end = cp;
+    while (*cp)
+    {
+        if (cp[0] == '<' && cp[1])  //tag?
+        {
+            char which = cp[1];
+            //for a closing tag look at the char after the '/'
+            if (which == '/') cp++;
+            if (!strchr (SPACECHR, cp[1])) //valid tag starts
+            {
+                tagText (event, text_start, text_end); //flush any text
+                char tagName[TAG_MAX_LEN];
+                //step onto the first char after '<' (or after "</")
+                cp++;
+                if (which == '/')
+                {
+                    cp += tagStart (event, tagName, cp, which);
+                }
+                else if (which == '!' || which == '?') //pi or dtd
+                {
+                    //step over the '!' or '?' itself
+                    cp++;
+                    cp += tagStart (event, tagName, cp, which);
+                }
+                else
+                {
+                    cp += tagStart (event, tagName, cp, which);
+                    cp += tagAttrs (event, tagName, cp);
+                }
+                //skip to and past the closing '>'
+                cp += tagEnd (event, tagName, cp);
+                text_start = cp;
+                text_end = cp;
+                continue;
+            }
+        }
+        //text
+        cp++;
+        text_end = cp;
+    }
+    tagText (event, text_start, text_end); //flush any text
+}
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+
diff --git a/src/html_parser.hpp b/src/html_parser.hpp
new file mode 100644 (file)
index 0000000..ad46061
--- /dev/null
@@ -0,0 +1,53 @@
+/* This file is part of Metaproxy.
+   Copyright (C) 2005-2013 Index Data
+
+Metaproxy is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#ifndef HTML_PARSER_HPP
+#define HTML_PARSER_HPP
+
+#include <boost/scoped_ptr.hpp>
+
+namespace metaproxy_1 {
+        // Callback interface through which HTMLParser reports what it
+        // finds. Text and attribute values are passed as pointer plus
+        // length and are not NUL-terminated at that length.
+        class HTMLParserEvent {
+        public:
+            // virtual, so implementations can be destroyed through a
+            // pointer to this interface
+            virtual ~HTMLParserEvent() {}
+            // an opening tag was found; its attributes follow
+            virtual void openTagStart(const char *name) = 0;
+            // the '>' that ends a tag was reached
+            virtual void anyTagEnd(const char *name) = 0;
+            // one attribute of an opening tag; value is the attribute
+            // value of val_len bytes
+            virtual void attribute(const char *tagName, 
+                    const char *name, 
+                    const char *value,
+                    int val_len) = 0;
+            // a closing tag was found
+            virtual void closeTag(const char *name) = 0;
+            // a run of text of len bytes between tags
+            virtual void text(const char *value, int len) = 0;
+        };
+        // Simple event-driven HTML parser.
+        class HTMLParser {
+        public:
+            HTMLParser();
+            ~HTMLParser();
+            // parses the NUL-terminated string str, reporting to event
+            void parse(HTMLParserEvent & event, const char *str) const;
+        };
+}
+
+#endif
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+
diff --git a/src/test_filter_rewrite.cpp b/src/test_filter_rewrite.cpp
new file mode 100644 (file)
index 0000000..90c17d6
--- /dev/null
@@ -0,0 +1,358 @@
+/* This file is part of Metaproxy.
+   Copyright (C) 2005-2013 Index Data
+
+Metaproxy is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include "config.hpp"
+#include <iostream>
+#include <stdexcept>
+
+#include "filter_http_client.hpp"
+#include "filter_http_rewrite.hpp"
+#include <metaproxy/util.hpp>
+#include <metaproxy/router_chain.hpp>
+#include <metaproxy/package.hpp>
+
+#include <boost/regex.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include <yaz/log.h>
+
+#define BOOST_AUTO_TEST_MAIN
+#define BOOST_TEST_DYN_LINK
+
+#include <boost/test/auto_unit_test.hpp>
+
+using namespace boost::unit_test;
+namespace mp = metaproxy_1;
+/*
+ * The global TestConfig fixture is commented out because it does not even
+ * compile on old CentOS 5 machines
+struct TestConfig {
+    TestConfig()   
+    {
+        std::cout << "global setup\n"; 
+        yaz_log_init_level(YLOG_ALL);
+    }
+    ~TestConfig() 
+    { 
+        std::cout << "global teardown\n"; 
+    }
+};
+
+BOOST_GLOBAL_FIXTURE( TestConfig );
+*/
+
+/*
+ * Configures an HttpRewrite filter from an inline XML snippet, sends a
+ * package holding a proxy-style request and a canned HTTP response through
+ * it, and compares the re-encoded response with the expected bytes.
+ *
+ * Anything that is dereferenced or handed on afterwards is guarded with
+ * BOOST_REQUIRE rather than BOOST_CHECK: BOOST_CHECK only records the
+ * failure and carries on, which would turn a failed expectation into a
+ * crash a few lines further down.
+ */
+BOOST_AUTO_TEST_CASE( test_filter_rewrite_1 )
+{
+    try
+    {
+        std::cout << "Running non-xml config test case" << std::endl;
+        mp::RouterChain router;
+        mp::filter::HttpRewrite fhr;
+
+        std::string xmlconf =
+            "<?xml version='1.0'?>\n"
+            "<filter xmlns='http://indexdata.com/metaproxy'\n"
+            "        id='rewrite1' type='http_rewrite'>\n"
+            " <request>\n"
+            "   <rewrite from='"
+    "(?&lt;proto>https?://)(?&lt;pxhost>[^ /?#]+)/(?&lt;pxpath>[^ /]+)"
+    "/(?&lt;host>[^ /]+)(?&lt;path>[^ ]*)'\n"
+            "            to='${proto}${host}${path}' />\n"
+            "   <rewrite from='(?:Host: )(.*)'\n"
+            "            to='Host: ${host}' />\n"
+            " </request>\n"
+            " <response>\n"
+            "   <rewrite from='"
+    "(?&lt;proto>https?://)(?&lt;host>[^/?# &quot;&apos;>]+)/(?&lt;path>[^  &quot;&apos;>]+)'\n"
+            "            to='${proto}${pxhost}/${pxpath}/${host}/${path}' />\n"
+            " </response>\n"
+            "</filter>\n"
+        ;
+
+        std::cout << xmlconf;
+
+        // reading and parsing XML conf; the document is used right away,
+        // so a parse failure has to stop the test here
+        xmlDocPtr doc = xmlParseMemory(xmlconf.c_str(), xmlconf.size());
+        BOOST_REQUIRE(doc);
+        xmlNode *root_element = xmlDocGetRootElement(doc);
+        fhr.configure(root_element, true, "");
+        xmlFreeDoc(doc);
+
+        router.append(fhr);
+
+        // create an http request
+        mp::Package pack;
+
+        mp::odr odr;
+        Z_GDU *gdu_req = z_get_HTTP_Request_uri(odr,
+        "http://proxyhost/proxypath/targetsite/page1.html", 0, 1);
+
+        pack.request() = gdu_req;
+
+        //create the http response
+
+        const char *resp_buf =
+            "HTTP/1.1 200 OK\r\n"
+            "Content-Length: 441\r\n"
+            "Content-Type: text/html\r\n"
+            "Link: <http://targetsite/file.xml>; rel=absolute\r\n"
+            "Link: </dir/file.xml>; rel=relative\r\n"
+            "\r\n"
+            "<html><head><title>Hello proxy!</title>"
+            "<style>"
+            "body {"
+            "  background-image:url('http://targetsite/images/bg.png');"
+            "}"
+            "</style>"
+            "</head>"
+            "<script>var jslink=\"http://targetsite/webservice.xml\";</script>"
+            "<body>"
+            "<p>Welcome to our website. It doesn't make it easy to get pro"
+            "xified"
+            "<a href=\"http://targetsite/page2.html\">"
+            "  An absolute link</a>"
+            "<a target=_blank href='http://targetsite/page3.html\">"
+            "  Another abs link</a>"
+            "<a href=\"/docs/page4.html\" />"
+            "</body></html>";
+
+        const char *resp_expected =
+            "HTTP/1.1 200 OK\r\n"
+            "Content-Length: 521\r\n"
+            "Content-Type: text/html\r\n"
+            "Link: <http://proxyhost/proxypath/targetsite/file.xml>; rel=absolute\r\n"
+            "Link: </dir/file.xml>; rel=relative\r\n"
+            "\r\n"
+            "<html><head><title>Hello proxy!</title>"
+            "<style>"
+            "body {"
+            "  background-image:url('http://proxyhost/proxypath/targetsite/images/bg.png');"
+            "}"
+            "</style>"
+            "</head>"
+            "<script>var jslink=\"http://proxyhost/proxypath/targetsite/webservice.xml\";</script>"
+            "<body>"
+            "<p>Welcome to our website. It doesn't make it easy to get pro"
+            "xified"
+            "<a href=\"http://proxyhost/proxypath/targetsite/page2.html\">"
+            "  An absolute link</a>"
+            "<a target=_blank href='http://proxyhost/proxypath/targetsite/page3.html\">"
+            "  Another abs link</a>"
+            "<a href=\"/docs/page4.html\" />"
+            "</body></html>";
+
+        // decode the canned response; gdu_res starts out null so that a
+        // failed decode can never leave an indeterminate pointer behind
+        Z_GDU *gdu_res = 0;
+        ODR dec = odr_createmem(ODR_DECODE);
+        odr_setbuf(dec, const_cast<char *>(resp_buf), strlen(resp_buf), 0);
+        int r = z_GDU(dec, &gdu_res, 0, 0);
+
+        BOOST_REQUIRE(r);
+        BOOST_REQUIRE(gdu_res);
+        BOOST_CHECK_EQUAL(gdu_res->which, Z_GDU_HTTP_Response);
+
+        pack.response() = gdu_res;
+
+        //feed to the router
+        pack.router(router).move();
+
+        //analyze the response
+        Z_GDU *gdu_res_rew = pack.response().get();
+        BOOST_REQUIRE(gdu_res_rew);
+        // the union member is only meaningful for an HTTP response
+        BOOST_REQUIRE_EQUAL(gdu_res_rew->which, Z_GDU_HTTP_Response);
+
+        Z_HTTP_Response *hres = gdu_res_rew->u.HTTP_Response;
+        BOOST_CHECK(hres);
+
+        //compare buffers
+        std::cout << "Expected result:\n" << resp_expected << std::endl;
+
+        ODR enc = odr_createmem(ODR_ENCODE);
+        int enc_ok = z_GDU(enc, &gdu_res_rew, 0, 0);
+        BOOST_CHECK(enc_ok);
+        int resp_result_len = 0;
+        char *resp_result = odr_getbuf(enc, &resp_result_len, 0);
+
+        BOOST_REQUIRE(resp_result);
+
+        // Copy both sides into std::string using explicit lengths: the
+        // encoded buffer is described by a length, not by a terminator,
+        // and a plain memcmp over resp_result_len bytes could read past
+        // the end of resp_expected when the result is the longer one.
+        const std::string expected(resp_expected);
+        const std::string result(resp_result, resp_result_len);
+        BOOST_CHECK_EQUAL(result.size(), expected.size());
+
+        std::cout << "Rewriten result:\n" << result << std::endl;
+        std::cout << "Rewriten result buf len: " << resp_result_len
+            << std::endl;
+
+        BOOST_CHECK(result == expected);
+
+        odr_destroy(dec);
+        odr_destroy(enc);
+    }
+    catch (std::exception & e) {
+        std::cout << e.what();
+        std::cout << std::endl;
+        BOOST_CHECK (false);
+    }
+}
+
+/*
+BOOST_AUTO_TEST_CASE( test_filter_rewrite_2 )
+{
+    try
+    {
+        std::cout << "Running xml config test case" << std::endl;
+        mp::RouterChain router;
+        mp::filter::HttpRewrite fhr;
+
+        std::string xmlconf =
+            "<?xml version='1.0'?>\n"
+            "<filter xmlns='http://indexdata.com/metaproxy'\n"
+            "        id='rewrite1' type='http_rewrite'>\n"
+            " <request>\n"
+            "   <rewrite from='"
+    "(?&lt;proto>https?://)(?&lt;pxhost>[^ /?#]+)/(?&lt;pxpath>[^ /]+)"
+    "/(?&lt;host>[^ /]+)(?&lt;path>[^ ]*)'\n"
+            "            to='${proto}${host}${path}' />\n"
+            "   <rewrite from='(?:Host: )(.*)'\n"
+            "            to='Host: ${host}' />\n" 
+            " </request>\n"
+            " <response>\n"
+            "   <rewrite from='"
+    "(?&lt;proto>https?://)(?&lt;host>[^/?# &quot;&apos;>]+)/(?&lt;path>[^  &quot;&apos;>]+)'\n"
+            "            to='${proto}${pxhost}/${pxpath}/${host}/${path}' />\n" 
+            " </response>\n"
+            "</filter>\n"
+        ;
+
+        std::cout << xmlconf;
+
+        // reading and parsing XML conf
+        xmlDocPtr doc = xmlParseMemory(xmlconf.c_str(), xmlconf.size());
+        BOOST_CHECK(doc);
+        xmlNode *root_element = xmlDocGetRootElement(doc);
+        fhr.configure(root_element, true, "");
+        xmlFreeDoc(doc);
+        
+        router.append(fhr);
+
+        // create an http request
+        mp::Package pack;
+
+        mp::odr odr;
+        Z_GDU *gdu_req = z_get_HTTP_Request_uri(odr, 
+        "http://proxyhost/proxypath/targetsite/page1.html", 0, 1);
+
+        pack.request() = gdu_req;
+
+        //create the http response
+
+        const char *resp_buf =
+            "HTTP/1.1 200 OK\r\n"
+            "Content-Length: 50\r\n"
+            "Content-Type: text/html\r\n"
+            "Link: <http://targetsite/file.xml>; rel=absolute\r\n"
+            "Link: </dir/file.xml>; rel=relative\r\n"
+            "\r\n"
+            "<html><head><title>Hello proxy!</title>"
+            "<style>"
+            "body {"
+            "  background-image:url('http://targetsite/images/bg.png');"
+            "}"
+            "</style>"
+            "</head>"
+            "<script>var jslink=\"http://targetsite/webservice.xml\";</script>"
+            "<body>"
+            "<p>Welcome to our website. It doesn't make it easy to get pro"
+            "xified"
+            "<a href=\"http://targetsite/page2.html\">"
+            "  An absolute link</a>"
+            "<a target=_blank href='http://targetsite/page3.html\">"
+            "  Another abs link</a>"
+            "<a href=\"/docs/page4.html\" />"
+            "</body></html>";
+
+        const char *resp_buf_rew =
+            "HTTP/1.1 200 OK\r\n"
+            "Content-Length: 50\r\n"
+            "Content-Type: text/html\r\n"
+            "Link: <http://proxyhost/proxypath/targetsite/file.xml>; rel=absolute\r\n"
+            "Link: </dir/file.xml>; rel=relative\r\n"
+            "\r\n"
+            "<html><head><title>Hello proxy!</title>"
+            "<style>"
+            "body {"
+            "  background-image:url('http://proxyhost/proxypath/targetsite/images/bg.png');"
+            "}"
+            "</style>"
+            "</head>"
+            "<script>var jslink=\"http://proxyhost/proxypath/targetsite/webservice.xml\";</script>"
+            "<body>"
+            "<p>Welcome to our website. It doesn't make it easy to get pro"
+            "xified"
+            "<a href=\"http://proxyhost/proxypath/targetsite/page.html\">"
+            "  An absolute link</a>"
+            "<a target=_blank href='http://proxyhost/proxypath/targetsite/anotherpage.html\">"
+            "  Another abs link</a>"
+            "<a href=\"/docs/page2.html\" />"
+            "</body></html>";
+
+        int r;
+        Z_GDU *gdu_res;
+        ODR odr2 = odr_createmem(ODR_DECODE);
+        odr_setbuf(odr2, (char *) resp_buf, strlen(resp_buf), 0);
+        r = z_GDU(odr2, &gdu_res, 0, 0);
+
+        BOOST_CHECK(r == 0);
+        if (r)
+        {
+            BOOST_CHECK_EQUAL(gdu_res->which, Z_GDU_HTTP_Response);
+        }
+
+        pack.response() = gdu_res;
+
+        //feed to the router
+        pack.router(router).move();
+
+        //analyze the response
+        Z_GDU *gdu_res_rew = pack.response().get();
+        BOOST_CHECK(gdu_res_rew);
+        BOOST_CHECK_EQUAL(gdu_res_rew->which, Z_GDU_HTTP_Response);
+        
+        Z_HTTP_Response *hres = gdu_res_rew->u.HTTP_Response;
+        BOOST_CHECK(hres);
+
+        //how to compare the buffers:
+
+        odr_destroy(odr2);
+    }
+    catch (std::exception & e) {
+        std::cout << e.what();
+        std::cout << std::endl;
+        BOOST_CHECK (false);
+    }
+}
+*/
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+
diff --git a/src/test_html_parser.cpp b/src/test_html_parser.cpp
new file mode 100644 (file)
index 0000000..aa818f9
--- /dev/null
@@ -0,0 +1,107 @@
+/* This file is part of Metaproxy.
+   Copyright (C) 2005-2013 Index Data
+
+Metaproxy is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include "config.hpp"
+#include <iostream>
+#include <stdexcept>
+
+#include "html_parser.hpp"
+#include <metaproxy/util.hpp>
+
+#include <boost/lexical_cast.hpp>
+
+#include <yaz/log.h>
+
+#define BOOST_AUTO_TEST_MAIN
+#define BOOST_TEST_DYN_LINK
+
+#include <boost/test/auto_unit_test.hpp>
+
+using namespace boost::unit_test;
+namespace mp = metaproxy_1;
+
+// Event sink for the test below: every callback appends a textual
+// rendering of what it was told to `out`, so that a whole parse can be
+// checked with a single string comparison.
+class MyEvent : public mp::HTMLParserEvent {
+    public:
+        // Accumulated rendering of all events received so far.
+        std::string out;
+
+        // Renders the start of an opening tag as "<name"; the closing
+        // '>' is left to anyTagEnd().
+        void openTagStart(const char *name)
+        {
+            out.append("<").append(name);
+        }
+
+        // Renders an attribute as ' name="value"'. The value is always
+        // wrapped in double quotes and exactly val_len bytes of it are
+        // copied; tagName is not used.
+        void attribute(const char *tagName,
+                const char *name, const char *value, int val_len)
+        {
+            out.append(" ").append(name).append("=\"");
+            out.append(value, val_len).append("\"");
+        }
+
+        // Renders the end of any tag as ">"; name is not used.
+        void anyTagEnd(const char *name)
+        {
+            out.append(">");
+        }
+
+        // Renders a closing tag as "</name"; the closing '>' is left to
+        // anyTagEnd().
+        void closeTag(const char *name)
+        {
+            out.append("</").append(name);
+        }
+
+        // Copies len bytes of text through unchanged.
+        void text(const char *value, int len)
+        {
+            out.append(value, len);
+        }
+};
+
+
+// Runs a small document through HTMLParser and compares what MyEvent
+// collected with the expected rendering. The input mixes unquoted,
+// single-quoted and double-quoted attribute values, has whitespace in
+// front of two '>' characters and ends without a final '>'; the expected
+// text has every value double-quoted and that whitespace gone.
+BOOST_AUTO_TEST_CASE( test_html_parser_1 )
+{
+    try
+    {
+        mp::HTMLParser parser;
+        const std::string input(
+            "<html><body><a t1=v1 t2='v2' t3=\"v3\">some text</a>"
+            "<hr><table ></table  ></body></html");
+        const std::string wanted(
+            "<html><body><a t1=\"v1\" t2=\"v2\" t3=\"v3\">some text</a>"
+            "<hr><table></table></body></html");
+        MyEvent collector;
+        parser.parse(collector, input.c_str());
+        BOOST_CHECK_EQUAL(wanted, collector.out);
+    }
+    catch (std::exception & ex)
+    {
+        // any exception from the parser is reported and fails the test
+        std::cout << ex.what() << std::endl;
+        BOOST_CHECK (false);
+    }
+}
+
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+