From: Adam Dickmeiss Date: Wed, 26 Jun 2013 14:58:10 +0000 (+0200) Subject: Revise HTML parser; keep spelling X-Git-Tag: v1.3.59~43 X-Git-Url: http://git.indexdata.com/?a=commitdiff_plain;h=3ef4df94516a136b7ee18ec8a45e740ef9e9dc05;p=metaproxy-moved-to-github.git Revise HTML parser; keep spelling Fixes for DTD/PIs handling. 0-copy for tags and attributes (no limit). No debug output, unless verbose is set for HTMLParser. --- diff --git a/src/filter_http_rewrite.cpp b/src/filter_http_rewrite.cpp index fd1b887..e69aba6 100644 --- a/src/filter_http_rewrite.cpp +++ b/src/filter_http_rewrite.cpp @@ -78,13 +78,12 @@ namespace metaproxy_1 { std::map & vars) const; }; class HttpRewrite::Event : public HTMLParserEvent { - void openTagStart(const char *name); - void anyTagEnd(const char *name, int close_it); - void attribute(const char *tagName, - const char *name, - const char *value, - int val_len); - void closeTag(const char *name); + void openTagStart(const char *tag, int tag_len); + void anyTagEnd(const char *tag, int tag_len, int close_it); + void attribute(const char *tag, int tag_len, + const char *attr, int attr_len, + const char *value, int val_len); + void closeTag(const char *tag, int tag_len); void text(const char *value, int len); const Phase *m_phase; WRBUF m_w; @@ -253,33 +252,37 @@ const char *yf::HttpRewrite::Event::result() return wrbuf_cstr(m_w); } -void yf::HttpRewrite::Event::openTagStart(const char *name) +void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len) { // check if there is if (enabled_within == m_phase->within_list.end()) { + std::string t(tag, tag_len); std::list::const_iterator it = m_phase->within_list.begin(); for (; it != m_phase->within_list.end(); it++) { - if (it->tag.length() > 0 && it->tag.compare(name) == 0) + if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(), + t.c_str()) == 0) { enabled_within = it; } } } wrbuf_putc(m_w, '<'); - wrbuf_puts(m_w, name); + wrbuf_write(m_w, tag, tag_len); } -void yf::HttpRewrite::Event::anyTagEnd(const char *name, int close_it) +void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len, + int close_it) { if (close_it) { std::list::const_iterator it = enabled_within; if (it != m_phase->within_list.end()) { - if (it->tag.compare(name) == 0) + std::string t(tag, tag_len); + if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0) { enabled_within = m_phase->within_list.end(); } @@ -290,24 +293,27 @@ void yf::HttpRewrite::Event::anyTagEnd(const char *name, int close_it) wrbuf_putc(m_w, '>'); } -void yf::HttpRewrite::Event::attribute(const char *tagName, - const char *name, - const char *value, - int val_len) +void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len, + const char *attr, int attr_len, + const char *value, int val_len) { std::list::const_iterator it = m_phase->within_list.begin(); bool subst = false; for (; it != m_phase->within_list.end(); it++) { - if (it->tag.length() == 0 || it->tag.compare(tagName) == 0) + std::string t(tag, tag_len); + if (it->tag.length() == 0 || + yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0) { + std::string a(attr, attr_len); std::vector attr; boost::split(attr, it->attr, boost::is_any_of(",")); size_t i; for (i = 0; i < attr.size(); i++) { - if (attr[i].compare("#text") && attr[i].compare(name) == 0) + if (attr[i].compare("#text") && + yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0) subst = true; } } @@ -316,7 +322,7 @@ void yf::HttpRewrite::Event::attribute(const char *tagName, } wrbuf_putc(m_w, ' '); - wrbuf_puts(m_w, name); + wrbuf_write(m_w, attr, attr_len); wrbuf_puts(m_w, "=\""); std::string output; @@ -332,18 +338,19 @@ void yf::HttpRewrite::Event::attribute(const char *tagName, wrbuf_puts(m_w, "\""); } -void yf::HttpRewrite::Event::closeTag(const char *name) +void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len) { std::list::const_iterator it = enabled_within; if (it != m_phase->within_list.end()) { - if (it->tag.compare(name) == 0) + std::string t(tag, tag_len); + if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0) { enabled_within = m_phase->within_list.end(); } } wrbuf_puts(m_w, " #include -#define TAG_MAX_LEN 64 - #define SPACECHR " \t\r\n\f" -#define DEBUG(x) x -#if HAVE_SYS_TYPES_H -#include -#endif +namespace metaproxy_1 { + class HTMLParser::Rep { + friend class HTMLParser; + public: + void parse_str(HTMLParserEvent &event, const char *cp); + void tagText(HTMLParserEvent &event, + const char *text_start, const char *text_end); + int tagEnd(HTMLParserEvent &event, + const char *tag, int tag_len, const char *cp); + int tagStart(HTMLParserEvent &event, + int *tag_len, const char *cp, const char which); + int tagAttrs(HTMLParserEvent &event, + const char *name, int len, + const char *cp); + Rep(); + ~Rep(); + int m_verbose; + }; +} namespace mp = metaproxy_1; -mp::HTMLParser::HTMLParser() +mp::HTMLParser::Rep::Rep() { + m_verbose = 0; } -mp::HTMLParser::~HTMLParser() +mp::HTMLParser::Rep::~Rep() +{ +} + +mp::HTMLParser::HTMLParser() : m_p(new Rep) { } -static void parse_str(mp::HTMLParserEvent & event, const char * str); +mp::HTMLParser::~HTMLParser() +{ +} void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const { - parse_str(event, str); + m_p->parse_str(event, str); } static int skipSpace(const char *cp) @@ -60,23 +80,19 @@ static int skipSpace(const char *cp) return i; } -static int skipName(const char *cp, char *dst) +static int skipName(const char *cp) { int i; - int j = 0; for (i = 0; cp[i] && !strchr(SPACECHR "/>=", cp[i]); i++) - if (j < TAG_MAX_LEN-1) - { - dst[j] = tolower(cp[j]); - j++; - } - dst[j] = '\0'; + ; return i; } -static int skipAttribute(const char *cp, char *name, const char **value, int *val_len) +static int skipAttribute(const char *cp, int *attr_len, + const char **value, int *val_len) { - int i = skipName(cp, name); + int i = skipName(cp); + *attr_len = i; *value = NULL; if (!i) return skipSpace(cp); @@ -110,57 +126,80 @@ static int skipAttribute(const char *cp, char *name, const char **value, int *va return i; } -static int tagAttrs(mp::HTMLParserEvent & event, - const char *tagName, - const char *cp) +int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event, + const char *name, int len, + const char *cp) { - char attr_name[TAG_MAX_LEN]; - const char *attr_value; - int val_len; int i = skipSpace(cp); while (cp[i] && cp[i] != '>' && cp[i] != '/') { - int nor = skipAttribute(cp+i, attr_name, &attr_value, &val_len); + const char *attr_name = cp + i; + int attr_len; + const char *value; + int val_len; + int nor = skipAttribute(cp+i, &attr_len, &value, &val_len); i += nor; if (nor) { - DEBUG(printf ("------ attr %s=%.*s\n", attr_name, val_len, attr_value)); - event.attribute(tagName, attr_name, attr_value, val_len); + if (m_verbose) + printf ("------ attr %.*s=%.*s\n", attr_len, attr_name, + val_len, value); + event.attribute(name, len, attr_name, attr_len, value, val_len); } else { - if (!nor) - i++; + i++; } } return i; } -static int tagStart(mp::HTMLParserEvent & event, - char *tagName, const char *cp, const char which) +int mp::HTMLParser::Rep::tagStart(HTMLParserEvent &event, + int *tag_len, + const char *cp, const char which) { - int i = skipName(cp, tagName); + int i; switch (which) { - case '/' : - DEBUG(printf("------ tag close %s\n", tagName)); - event.closeTag(tagName); + case '/': + i = skipName(cp); + *tag_len = i; + if (m_verbose) + printf("------ tag close %.*s\n", i, cp); + event.closeTag(cp, i); break; - case '!' : - DEBUG(printf("------ dtd %s\n", tagName)); + case '!': + for (i = 0; cp[i] && cp[i] != '>'; i++) + ; + *tag_len = i; + event.openTagStart(cp, i); + if (m_verbose) + printf("------ dtd %.*s\n", i, cp); break; - case '?' : - DEBUG(printf("------ pi %s\n", tagName)); + case '?': + for (i = 0; cp[i] && cp[i] != '>'; i++) + ; + *tag_len = i; + event.openTagStart(cp, i); + if (m_verbose) + printf("------ pi %.*s\n", i, cp); break; - default : - DEBUG(printf("------ tag open %s\n", tagName)); - event.openTagStart(tagName); + default: + i = skipName(cp); + *tag_len = i; + if (m_verbose) + printf("------ tag open %.*s\n", i, cp); + event.openTagStart(cp, i); + + i += tagAttrs(event, cp, i, cp + i); + break; } return i; } -static int tagEnd(mp::HTMLParserEvent & event, const char *tagName, const char *cp) +int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event, + const char *tag, int tag_len, const char *cp) { int i = 0; int close_it = 0; @@ -172,23 +211,25 @@ static int tagEnd(mp::HTMLParserEvent & event, const char *tagName, const char * } if (cp[i] == '>') { - event.anyTagEnd(tagName, close_it); + event.anyTagEnd(tag, tag_len, close_it); i++; } return i; } -static void tagText(mp::HTMLParserEvent & event, const char *text_start, const char *text_end) +void mp::HTMLParser::Rep::tagText(HTMLParserEvent &event, + const char *text_start, const char *text_end) { if (text_end - text_start) //got text to flush { - DEBUG(printf("------ text %.*s\n", - (int) (text_end - text_start), text_start)); + if (m_verbose) + printf("------ text %.*s\n", + (int) (text_end - text_start), text_start); event.text(text_start, text_end-text_start); } } -static void parse_str(mp::HTMLParserEvent & event, const char *cp) +void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) { const char *text_start = cp; const char *text_end = cp; @@ -201,24 +242,14 @@ static void parse_str(mp::HTMLParserEvent & event, const char *cp) cp++; if (!strchr(SPACECHR, cp[1])) //valid tag starts { + int i = 0; + int tag_len; + tagText(event, text_start, text_end); //flush any text - char tagName[TAG_MAX_LEN]; cp++; - if (which == '/') - { - cp += tagStart(event, tagName, cp, which); - } - else if (which == '!' || which == '?') //pi or dtd - { - cp++; - cp += tagStart(event, tagName, cp, which); - } - else - { - cp += tagStart(event, tagName, cp, which); - cp += tagAttrs(event, tagName, cp); - } - cp += tagEnd(event, tagName, cp); + i += tagStart(event, &tag_len, cp, which); + i += tagEnd(event, cp, tag_len, cp + i); + cp += i; text_start = cp; text_end = cp; continue; diff --git a/src/html_parser.hpp b/src/html_parser.hpp index 72ff60b..f754699 100644 --- a/src/html_parser.hpp +++ b/src/html_parser.hpp @@ -24,19 +24,23 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA namespace metaproxy_1 { class HTMLParserEvent { public: - virtual void openTagStart(const char *name) = 0; - virtual void anyTagEnd(const char *name, int close_it) = 0; - virtual void attribute(const char *tagName, const char *name, - const char *value, - int val_len) = 0; - virtual void closeTag(const char *name) = 0; + virtual void openTagStart(const char *tag, int tag_len) = 0; + virtual void anyTagEnd(const char *tag, int tag_len, + int close_it) = 0; + virtual void attribute(const char *tag, int tag_len, + const char *attr, int attr_len, + const char *value, int val_len) = 0; + virtual void closeTag(const char *tag, int tag_len) = 0; virtual void text(const char *value, int len) = 0; }; class HTMLParser { + class Rep; public: HTMLParser(); ~HTMLParser(); void parse(HTMLParserEvent &event, const char *str) const; + private: + boost::scoped_ptr m_p; }; } diff --git a/src/test_html_parser.cpp b/src/test_html_parser.cpp index f0ab641..c4ffe13 100644 --- a/src/test_html_parser.cpp +++ b/src/test_html_parser.cpp @@ -35,42 +35,36 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA using namespace boost::unit_test; namespace mp = metaproxy_1; -class MyEvent : public mp::HTMLParserEvent { - public: - std::string out; - void openTagStart(const char *name) - { - out += "<"; - out += name; - } - - void attribute(const char *tagName, - const char *name, const char *value, int val_len) - { - out += " "; - out += name; - out += "=\""; - out.append(value, val_len); - out += "\""; - } - - void anyTagEnd(const char *name, int close_it) - { - if (close_it) - out += "/"; - out += ">"; - } - - void closeTag(const char *name) - { - out += ""; + } + void closeTag(const char *tag, int tag_len) { + out += "\n" + "\n" + " \n" + " YAZ 4.2.60\n" + " \n" + " \n" + "

YAZ 4.2.60

\n" + "

Error: 404

\n" + "

Description: Not Found

\n" + " \n" + ""; + + const char* expected = html; + MyEvent e; + hp.parse(e, html); + + std::cout << "Expected" << std::endl; std::cout << expected << std::endl; + std::cout << "Got" << std::endl; std::cout << e.out << std::endl; + BOOST_CHECK_EQUAL(std::string(expected), e.out); } catch (std::exception & e)