X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=src%2Fhtml_parser.cpp;h=95db7c7fd41993645193ba617e1d5404993a8307;hb=586d78659d671683f33ec55f4a7d32b28e345ccd;hp=49ef67044a8287d27fd6055c42036318d2015337;hpb=8bcd4c3e063e932b2f80f9491ec0af66d3da5c2e;p=metaproxy-moved-to-github.git diff --git a/src/html_parser.cpp b/src/html_parser.cpp index 49ef670..95db7c7 100644 --- a/src/html_parser.cpp +++ b/src/html_parser.cpp @@ -1,5 +1,5 @@ /* This file is part of Metaproxy. - Copyright (C) 2005-2013 Index Data + Copyright (C) Index Data Metaproxy is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -24,20 +24,47 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include #include - -#define TAG_MAX_LEN 64 +#include #define SPACECHR " \t\r\n\f" -#define DEBUG(x) x +// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html -#if HAVE_SYS_TYPES_H -#include -#endif +namespace metaproxy_1 { + class HTMLParser::Rep { + friend class HTMLParser; + public: + void parse_str(HTMLParserEvent &event, const char *cp); + void tagText(HTMLParserEvent &event, + const char *text_start, const char *text_end); + int tagEnd(HTMLParserEvent &event, + const char *tag, int tag_len, const char *cp); + int tagAttrs(HTMLParserEvent &event, + const char *name, int len, + const char *cp); + int skipAttribute(HTMLParserEvent &event, + const char *cp, int *attr_len, + const char **value, int *val_len, int *tr); + Rep(); + ~Rep(); + int m_verbose; + bool nest; + }; +} namespace mp = metaproxy_1; -mp::HTMLParser::HTMLParser() +mp::HTMLParser::Rep::Rep() +{ + m_verbose = 0; + nest = true; +} + +mp::HTMLParser::Rep::~Rep() +{ +} + +mp::HTMLParser::HTMLParser() : m_p(new Rep) { } @@ -45,201 +72,247 @@ mp::HTMLParser::~HTMLParser() { } -static void parse_str(mp::HTMLParserEvent & event, const char * str); +void mp::HTMLParser::set_verbose(int v) +{ + m_p->m_verbose = v; +} + void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const { - parse_str(event, str); + m_p->parse_str(event, str); } -//static C functions follow would probably make sense to wrap this in PIMPL? - -static char* dupe (const char *buff, int len) +static int isAlpha(int c) { - char *value = (char *) malloc (len + 1); - assert (value); - memcpy (value, buff, len); - value[len] = '\0'; - return value; + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } -static int skipSpace (const char *cp) +static int skipSpace(const char *cp) { int i = 0; - while (cp[i] && strchr (SPACECHR, cp[i])) + while (cp[i] && strchr(SPACECHR, cp[i])) i++; return i; } -static int skipName (const char *cp, char *dst) +static int skipName(const char *cp) { int i; - int j = 0; - for (i=0; cp[i] && !strchr (SPACECHR "/>=", cp[i]); i++) - if (j < TAG_MAX_LEN-1) - { - dst[j] = tolower(cp[j]); - j++; - } - dst[j] = '\0'; + for (i = 0; cp[i] && !strchr(SPACECHR "/><=", cp[i]); i++) + ; return i; } -static int skipAttribute (const char *cp, char *name, const char **value, int *val_len) +int mp::HTMLParser::Rep::skipAttribute(HTMLParserEvent &event, + const char *cp, int *attr_len, + const char **value, int *val_len, + int *tr) { - int i = skipName (cp, name); + int v0, v1; + int i = skipName(cp); + *attr_len = i; *value = NULL; if (!i) - return skipSpace (cp); - i += skipSpace (cp + i); + return skipSpace(cp); + i += skipSpace(cp + i); if (cp[i] == '=') { - int v0, v1; i++; - i += skipSpace (cp + i); + i += skipSpace(cp + i); if (cp[i] == '\"' || cp[i] == '\'') { - char tr = cp[i]; + *tr = cp[i]; v0 = ++i; - while (cp[i] != tr && cp[i]) - i++; + while (cp[i] != *tr && cp[i]) + i++; v1 = i; if (cp[i]) i++; } else { + *tr = 0; v0 = i; - while (cp[i] && !strchr (SPACECHR ">", cp[i])) + while (cp[i] && !strchr(SPACECHR ">", cp[i])) i++; v1 = i; } *value = cp + v0; *val_len = v1 - v0; + i += skipSpace(cp + i); } - i += skipSpace (cp + i); return i; } -static int tagAttrs (mp::HTMLParserEvent & event, - const char *tagName, - const char *cp) +int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event, + const char *name, int len, + const char *cp) { - int i; - char attr_name[TAG_MAX_LEN]; - const char *attr_value; - int val_len; - i = skipSpace (cp); - while (cp[i] && cp[i] != '>' && cp[i] != '/') + int i = skipSpace(cp); + while (cp[i] && !strchr("/><", cp[i])) { - int nor = skipAttribute (cp+i, attr_name, &attr_value, &val_len); + const char *attr_name = cp + i; + int attr_len; + const char *value; + int val_len; + int tr; + char x[2]; + int nor = skipAttribute(event, cp+i, &attr_len, &value, &val_len, &tr); + if (!nor) + break; i += nor; - if (nor) - { - DEBUG(printf ("------ attr %s=%s\n", attr_name, dupe(attr_value, val_len))); - event.attribute(tagName, attr_name, attr_value, val_len); - } - else + + x[0] = tr; + x[1] = 0; + if (m_verbose) { - if (!nor) - i++; + printf("------ attr %.*s", attr_len, attr_name); + if (value) + printf("=%.*s", val_len, value); + printf("\n"); } + event.attribute(name, len, attr_name, attr_len, value, val_len, x); } return i; } -static int tagStart (mp::HTMLParserEvent & event, - char *tagName, const char *cp, const char which) +int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event, + const char *tag, int tag_len, const char *cp) { int i = 0; - i = skipName (cp, tagName); - switch (which) + int close_it = 0; + for (; cp[i] && !strchr("/><", cp[i]); i++) + ; + if (i > 0) { - case '/' : - DEBUG(printf ("------ tag close %s\n", tagName)); - event.closeTag(tagName); - break; - case '!' : - DEBUG(printf ("------ dtd %s\n", tagName)); - break; - case '?' : - DEBUG(printf ("------ pi %s\n", tagName)); - break; - default : - DEBUG(printf ("------ tag open %s\n", tagName)); - event.openTagStart(tagName); - break; + if (m_verbose) + printf("------ text %.*s\n", i, cp); + event.text(cp, i); } - return i; -} - -static int tagEnd (mp::HTMLParserEvent & event, const char *tagName, const char *cp) -{ - int i = 0; - int close_it = 0; - while (cp[i] && cp[i] != '>') + if (cp[i] == '/') { - if (cp[i] == '/') - close_it = 1; + close_it = 1; i++; } if (cp[i] == '>') { - event.anyTagEnd(tagName, close_it); + if (m_verbose) + printf("------ any tag %s %.*s\n", + close_it ? "close" : "end", tag_len, tag); + event.anyTagEnd(tag, tag_len, close_it); i++; } return i; } -static void tagText (mp::HTMLParserEvent & event, const char *text_start, const char *text_end) +void mp::HTMLParser::Rep::tagText(HTMLParserEvent &event, + const char *text_start, const char *text_end) { if (text_end - text_start) //got text to flush { - DEBUG(printf ("------ text %s\n", dupe(text_start, text_end-text_start))); + if (m_verbose) + printf("------ text %.*s\n", + (int) (text_end - text_start), text_start); event.text(text_start, text_end-text_start); } } -static void parse_str (mp::HTMLParserEvent & event, const char *cp) +void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) { const char *text_start = cp; - const char *text_end = cp; while (*cp) { - if (cp[0] == '<' && cp[1]) //tag? + if (*cp++ != '<') + continue; + + if (nest && *cp == '!') { - char which = cp[1]; - if (which == '/') cp++; - if (!strchr (SPACECHR, cp[1])) //valid tag starts + int i; + tagText(event, text_start, cp - 1); + if (cp[1] == '-' && cp[2] == '-') { - tagText (event, text_start, text_end); //flush any text - char tagName[TAG_MAX_LEN]; - cp++; - if (which == '/') - { - cp += tagStart (event, tagName, cp, which); - } - else if (which == '!' || which == '?') //pi or dtd - { - cp++; - cp += tagStart (event, tagName, cp, which); - } - else + for (i = 3; cp[i]; i++) + if (cp[i] == '-' && cp[i+1] == '-' && cp[i+2] == '>') + { + i+= 2; + event.openTagStart(cp, i); + break; + } + } + else + { + for (i = 1; cp[i] && cp[i] != '>'; i++) + ; + event.openTagStart(cp, i); + } + if (m_verbose) + printf("------ dtd %.*s\n", i, cp); + i += tagEnd(event, cp, i, cp + i); + cp += i; + text_start = cp; + } + else if (nest && *cp == '?') + { + int i; + tagText(event, text_start, cp - 1); + for (i = 1; cp[i] && cp[i] != '>'; i++) + ; + event.openTagStart(cp, i); + if (m_verbose) + printf("------ pi %.*s\n", i, cp); + i += tagEnd(event, cp, i, cp + i); + cp += i; + text_start = cp; + } + else if (*cp == '/' && isAlpha(cp[1])) + { + int i; + + i = skipName(++cp); + + if (!nest) + { + if (i == 6 && !yaz_strncasecmp(cp, "script", i)) { - cp += tagStart (event, tagName, cp, which); - cp += tagAttrs (event, tagName, cp); + int ws = skipSpace(cp + 6); + if (cp[ws + 6] == '>') + nest = true; /* really terminated */ } - cp += tagEnd (event, tagName, cp); - text_start = cp; - text_end = cp; - continue; + if (!nest) + continue; } + tagText(event, text_start, cp - 2); + event.closeTag(cp, i); + if (m_verbose) + printf("------ tag close %.*s\n", i, cp); + i += tagEnd(event, cp, i, cp + i); + cp += i; + text_start = cp; + } + else if (nest && isAlpha(*cp)) + { + int i, j; + tagText(event, text_start, cp - 1); + i = skipName(cp); + event.openTagStart(cp, i); + if (m_verbose) + printf("------ tag open %.*s\n", i, cp); + j = tagAttrs(event, cp, i, cp + i); + j += tagEnd(event, cp, i, cp + i + j); + + if (i == 6 && !yaz_strncasecmp(cp, "script", i)) + nest = false; + + cp += i + j; + text_start = cp; } - //text - cp++; - text_end = cp; } - tagText (event, text_start, text_end); //flush any text + tagText(event, text_start, cp); +} + +mp::HTMLParserEvent::~HTMLParserEvent() +{ } /*