X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=src%2Fhtml_parser.cpp;h=3f1a4f4fdba224f819527b7b0c612f631d0b5bc8;hb=ddbce5fc4d40b7fdfbcb2b39031d12be6191dc05;hp=4d4e3a770dd68c6dd3736da4ab92992d67fd6c2a;hpb=4f5fe9b621891d0d4eabbd7dbcba65f96f11f528;p=metaproxy-moved-to-github.git diff --git a/src/html_parser.cpp b/src/html_parser.cpp index 4d4e3a7..3f1a4f4 100644 --- a/src/html_parser.cpp +++ b/src/html_parser.cpp @@ -24,6 +24,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include #include +#include #define SPACECHR " \t\r\n\f" @@ -47,6 +48,7 @@ namespace metaproxy_1 { Rep(); ~Rep(); int m_verbose; + bool nest; }; } @@ -55,6 +57,7 @@ namespace mp = metaproxy_1; mp::HTMLParser::Rep::Rep() { m_verbose = 0; + nest = true; } mp::HTMLParser::Rep::~Rep() @@ -219,7 +222,7 @@ void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) if (*cp++ != '<') continue; - if (*cp == '!') + if (nest && *cp == '!') { int i; tagText(event, text_start, cp - 1); @@ -245,7 +248,7 @@ void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) cp += i; text_start = cp; } - else if (*cp == '?') + else if (nest && *cp == '?') { int i; tagText(event, text_start, cp - 1); @@ -261,9 +264,21 @@ void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) else if (*cp == '/' && isAlpha(cp[1])) { int i; - tagText(event, text_start, cp - 1); - + i = skipName(++cp); + + if (!nest) + { + if (i == 6 && !yaz_strncasecmp(cp, "script", i)) + { + int ws = skipSpace(cp + 6); + if (cp[ws + 6] == '>') + nest = true; /* really terminated */ + } + if (!nest) + continue; + } + tagText(event, text_start, cp - 2); event.closeTag(cp, i); if (m_verbose) printf("------ tag close %.*s\n", i, cp); @@ -271,7 +286,7 @@ void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) cp += i; text_start = cp; } - else if (isAlpha(*cp)) + else if (nest && isAlpha(*cp)) { int i, j; tagText(event, text_start, cp - 1); @@ -281,6 +296,10 @@ void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) printf("------ tag open %.*s\n", i, cp); j = tagAttrs(event, cp, i, cp + i); j += tagEnd(event, cp, i, cp + i + j); + + if (i == 6 && !yaz_strncasecmp(cp, "script", i)) + nest = false; + cp += i + j; text_start = cp; }