From a258482a3b53b90b0932f4a789ba66e8e1576c05 Mon Sep 17 00:00:00 2001 From: Jakub Skoczen Date: Fri, 10 May 2013 16:50:56 +0200 Subject: [PATCH] HTML push parser Based on Adam's tclrobot but more aligned with the HTML specs * all different forms of attrs supported * whitespaces after tag names allowed * pi/dtd passthru also various fixes. --- src/Makefile.am | 4 + src/html_parser.cpp | 250 ++++++++++++++++++++++++++++++++++++++++++++++ src/html_parser.hpp | 50 ++++++++++ src/test_html_parser.cpp | 107 ++++++++++++++++++++ 4 files changed, 411 insertions(+) create mode 100644 src/html_parser.cpp create mode 100644 src/html_parser.hpp create mode 100644 src/test_html_parser.cpp diff --git a/src/Makefile.am b/src/Makefile.am index 47a7b52..e5c40aa 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -59,6 +59,7 @@ libmetaproxy_la_SOURCES = \ torus.cpp torus.hpp \ url_recipe.cpp \ util.cpp \ + html_parser.hpp html_parser.cpp \ xmlutil.cpp @@ -108,6 +109,7 @@ check_PROGRAMS = \ test_filter_rewrite \ test_ses_map \ test_router_flexml \ + test_html_parser \ test_xmlutil TESTS=$(check_PROGRAMS) @@ -135,6 +137,7 @@ test_filter_virt_db_SOURCES = test_filter_virt_db.cpp test_filter_rewrite_SOURCES = test_filter_rewrite.cpp test_ses_map_SOURCES = test_ses_map.cpp test_router_flexml_SOURCES = test_router_flexml.cpp +test_html_parser_SOURCES = test_html_parser.cpp test_xmlutil_SOURCES = test_xmlutil.cpp TESTLDADD = $(LDADD) $(BOOST_TEST_LIB) @@ -162,6 +165,7 @@ test_filter_rewrite_LDADD = $(TESTLDADD) test_router_flexml_LDADD = $(TESTLDADD) test_ses_map_LDADD = $(TESTLDADD) test_thread_pool_observer_LDADD = $(TESTLDADD) +test_html_parser_LDADD = $(TESTLDADD) test_xmlutil_LDADD = $(TESTLDADD) # doxygen target diff --git a/src/html_parser.cpp b/src/html_parser.cpp new file mode 100644 index 0000000..ef8ad2b --- /dev/null +++ b/src/html_parser.cpp @@ -0,0 +1,250 @@ +/* This file is part of Metaproxy. + Copyright (C) 2005-2013 Index Data + +Metaproxy is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "config.hpp" +#include "html_parser.hpp" + +#include +#include +#include +#include +#include + +#define TAG_MAX_LEN 64 + +#define SPACECHR " \t\r\n\f" + +#define DEBUG(x) x + +#if HAVE_SYS_TYPES_H +#include +#endif + +namespace mp = metaproxy_1; + +mp::HTMLParser::HTMLParser() +{ +} + +mp::HTMLParser::~HTMLParser() +{ +} + +static void parse_str(mp::HTMLParserEvent & event, const char * str); + +void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const +{ + parse_str(event, str); +} + +//static C functions follow would probably make sense to wrap this in PIMPL? + +static int skipSpace (const char *cp) +{ + int i = 0; + while (cp[i] && strchr (SPACECHR, cp[i])) + i++; + return i; +} + +static int skipName (const char *cp, char *dst) +{ + int i; + int j = 0; + for (i=0; cp[i] && !strchr (SPACECHR "/>=", cp[i]); i++) + if (j < TAG_MAX_LEN-1) + { + dst[j] = tolower(cp[j]); + j++; + } + dst[j] = '\0'; + return i; +} + +static int skipAttribute (const char *cp, char *name, char **value) +{ + int i = skipName (cp, name); + *value = NULL; + if (!i) + return skipSpace (cp); + i += skipSpace (cp + i); + if (cp[i] == '=') + { + int v0, v1; + i++; + i += skipSpace (cp + i); + if (cp[i] == '\"' || cp[i] == '\'') + { + char tr = cp[i]; + v0 = ++i; + while (cp[i] != tr && cp[i]) + i++; + v1 = i; + if (cp[i]) + i++; + } + else + { + v0 = i; + while (cp[i] && !strchr (SPACECHR ">", cp[i])) + i++; + v1 = i; + } + *value = (char *) malloc (v1 - v0 + 1); + memcpy (*value, cp + v0, v1-v0); + (*value)[v1-v0] = '\0'; + } + i += skipSpace (cp + i); + return i; +} + +static int tagAttrs (mp::HTMLParserEvent & event, + const char *tagName, + const char *cp) +{ + int i; + char attr_name[TAG_MAX_LEN]; + char *attr_value; + i = skipSpace (cp); + while (cp[i] && cp[i] != '>') + { + int nor = skipAttribute (cp+i, attr_name, &attr_value); + i += nor; + if (nor) + { + DEBUG(printf ("------ attr %s=%s\n", attr_name, attr_value)); + event.attribute(tagName, attr_name, attr_value); + } + else + { + if (!nor) + i++; + } + } + return i; +} + +static int tagStart (mp::HTMLParserEvent & event, + char *tagName, const char *cp, const char which) +{ + int i = 0; + i = skipName (cp, tagName); + switch (which) + { + case '/' : + DEBUG(printf ("------ tag close %s\n", tagName)); + event.closeTag(tagName); + break; + case '!' : + DEBUG(printf ("------ dtd %s\n", tagName)); + break; + case '?' : + DEBUG(printf ("------ pi %s\n", tagName)); + break; + default : + DEBUG(printf ("------ tag open %s\n", tagName)); + event.openTagStart(tagName); + break; + } + return i; +} + +static int tagEnd (mp::HTMLParserEvent & event, const char *tagName, const char *cp) +{ + int i = 0; + while (cp[i] && cp[i] != '>') + i++; + if (cp[i] == '>') + { + event.anyTagEnd(tagName); + i++; + } + return i; +} + +static char* allocFromRange (const char *start, const char *end) +{ + char *value = (char *) malloc (end - start + 1); + assert (value); + memcpy (value, start, end - start); + value[end - start] = '\0'; + return value; +} + +static void tagText (mp::HTMLParserEvent & event, const char *text_start, const char *text_end) +{ + if (text_end - text_start) //got text to flush + { + char *temp = allocFromRange(text_start, text_end); + DEBUG(printf ("------ text %s\n", temp)); + event.text(text_start, text_end-text_start); + free(temp); + } +} + +static void parse_str (mp::HTMLParserEvent & event, const char *cp) +{ + const char *text_start = cp; + const char *text_end = cp; + while (*cp) + { + if (cp[0] == '<' && cp[1]) //tag? + { + char which = cp[1]; + if (which == '/') cp++; + if (!strchr (SPACECHR, cp[1])) //valid tag starts + { + tagText (event, text_start, text_end); //flush any text + char tagName[TAG_MAX_LEN]; + cp++; + if (which == '/') + { + cp += tagStart (event, tagName, cp, which); + } + else if (which == '!' || which == '?') //pi or dtd + { + cp++; + cp += tagStart (event, tagName, cp, which); + } + else + { + cp += tagStart (event, tagName, cp, which); + cp += tagAttrs (event, tagName, cp); + } + cp += tagEnd (event, tagName, cp); + text_start = cp; + text_end = cp; + continue; + } + } + //text + cp++; + text_end = cp; + } + tagText (event, text_start, text_end); //flush any text +} + +/* + * Local variables: + * c-basic-offset: 4 + * c-file-style: "Stroustrup" + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/src/html_parser.hpp b/src/html_parser.hpp new file mode 100644 index 0000000..a03044b --- /dev/null +++ b/src/html_parser.hpp @@ -0,0 +1,50 @@ +/* This file is part of Metaproxy. + Copyright (C) 2005-2013 Index Data + +Metaproxy is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#ifndef HTML_PARSER_HPP +#define HTML_PARSER_HPP + +#include + +namespace metaproxy_1 { + class HTMLParserEvent { + public: + virtual void openTagStart(const char *name) = 0; + virtual void anyTagEnd(const char *name) = 0; + virtual void attribute(const char *tagName, const char *name, const char *value) = 0; + virtual void closeTag(const char *name) = 0; + virtual void text(const char *value, int len) = 0; + }; + class HTMLParser { + public: + HTMLParser(); + ~HTMLParser(); + void parse(HTMLParserEvent & event, const char *str) const; + }; +} + +#endif +/* + * Local variables: + * c-basic-offset: 4 + * c-file-style: "Stroustrup" + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/src/test_html_parser.cpp b/src/test_html_parser.cpp new file mode 100644 index 0000000..5230117 --- /dev/null +++ b/src/test_html_parser.cpp @@ -0,0 +1,107 @@ +/* This file is part of Metaproxy. + Copyright (C) 2005-2013 Index Data + +Metaproxy is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "config.hpp" +#include +#include + +#include "html_parser.hpp" +#include + +#include + +#include + +#define BOOST_AUTO_TEST_MAIN +#define BOOST_TEST_DYN_LINK + +#include + +using namespace boost::unit_test; +namespace mp = metaproxy_1; + +class MyEvent : public mp::HTMLParserEvent { + public: + std::string out; + void openTagStart(const char *name) + { + out += "<"; + out += name; + } + + void attribute(const char *tagName, + const char *name, const char *value) + { + out += " "; + out += name; + out += "=\""; + out += value; + out += "\""; + } + + void anyTagEnd(const char *name) + { + out += ">"; + } + + void closeTag(const char *name) + { + out += "some text" + "
some text" + "