X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=src%2Fhtml_parser.cpp;h=95db7c7fd41993645193ba617e1d5404993a8307;hb=586d78659d671683f33ec55f4a7d32b28e345ccd;hp=22d4a31db94f9b4607b0e18e34348851052a9f25;hpb=dd17e564b39fa38a3d611853a3b13dacdff3d070;p=metaproxy-moved-to-github.git diff --git a/src/html_parser.cpp b/src/html_parser.cpp index 22d4a31..95db7c7 100644 --- a/src/html_parser.cpp +++ b/src/html_parser.cpp @@ -1,5 +1,5 @@ /* This file is part of Metaproxy. - Copyright (C) 2005-2013 Index Data + Copyright (C) Index Data Metaproxy is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -24,9 +24,11 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include #include +#include #define SPACECHR " \t\r\n\f" +// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html namespace metaproxy_1 { class HTMLParser::Rep { @@ -46,6 +48,7 @@ namespace metaproxy_1 { Rep(); ~Rep(); int m_verbose; + bool nest; }; } @@ -54,6 +57,7 @@ namespace mp = metaproxy_1; mp::HTMLParser::Rep::Rep() { m_verbose = 0; + nest = true; } mp::HTMLParser::Rep::~Rep() @@ -95,7 +99,7 @@ static int skipSpace(const char *cp) static int skipName(const char *cp) { int i; - for (i = 0; cp[i] && !strchr(SPACECHR "/>=", cp[i]); i++) + for (i = 0; cp[i] && !strchr(SPACECHR "/><=", cp[i]); i++) ; return i; } @@ -146,7 +150,7 @@ int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event, const char *cp) { int i = skipSpace(cp); - while (cp[i] && cp[i] != '>' && cp[i] != '/') + while (cp[i] && !strchr("/><", cp[i])) { const char *attr_name = cp + i; int attr_len; @@ -162,8 +166,12 @@ int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event, x[0] = tr; x[1] = 0; if (m_verbose) - printf ("------ attr %.*s=%.*s\n", attr_len, attr_name, - val_len, value); + { + printf("------ attr %.*s", attr_len, attr_name); + if (value) + printf("=%.*s", val_len, value); + printf("\n"); + } event.attribute(name, len, attr_name, attr_len, value, val_len, x); } return i; @@ -174,7 +182,7 @@ int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event, { int i = 0; int close_it = 0; - for (; cp[i] && cp[i] != '/' && cp[i] != '>'; i++) + for (; cp[i] && !strchr("/><", cp[i]); i++) ; if (i > 0) { @@ -191,7 +199,7 @@ int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event, { if (m_verbose) printf("------ any tag %s %.*s\n", - close_it ? " close" : "end", tag_len, tag); + close_it ? "close" : "end", tag_len, tag); event.anyTagEnd(tag, tag_len, close_it); i++; } @@ -218,20 +226,33 @@ void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) if (*cp++ != '<') continue; - if (*cp == '!') + if (nest && *cp == '!') { int i; tagText(event, text_start, cp - 1); - for (i = 1; cp[i] && cp[i] != '>'; i++) - ; - event.openTagStart(cp, i); + if (cp[1] == '-' && cp[2] == '-') + { + for (i = 3; cp[i]; i++) + if (cp[i] == '-' && cp[i+1] == '-' && cp[i+2] == '>') + { + i+= 2; + event.openTagStart(cp, i); + break; + } + } + else + { + for (i = 1; cp[i] && cp[i] != '>'; i++) + ; + event.openTagStart(cp, i); + } if (m_verbose) printf("------ dtd %.*s\n", i, cp); i += tagEnd(event, cp, i, cp + i); cp += i; text_start = cp; } - else if (*cp == '?') + else if (nest && *cp == '?') { int i; tagText(event, text_start, cp - 1); @@ -247,9 +268,21 @@ void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) else if (*cp == '/' && isAlpha(cp[1])) { int i; - tagText(event, text_start, cp - 1); - + i = skipName(++cp); + + if (!nest) + { + if (i == 6 && !yaz_strncasecmp(cp, "script", i)) + { + int ws = skipSpace(cp + 6); + if (cp[ws + 6] == '>') + nest = true; /* really terminated */ + } + if (!nest) + continue; + } + tagText(event, text_start, cp - 2); event.closeTag(cp, i); if (m_verbose) printf("------ tag close %.*s\n", i, cp); @@ -257,7 +290,7 @@ void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) cp += i; text_start = cp; } - else if (isAlpha(*cp)) + else if (nest && isAlpha(*cp)) { int i, j; tagText(event, text_start, cp - 1); @@ -267,6 +300,10 @@ void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) printf("------ tag open %.*s\n", i, cp); j = tagAttrs(event, cp, i, cp + i); j += tagEnd(event, cp, i, cp + i + j); + + if (i == 6 && !yaz_strncasecmp(cp, "script", i)) + nest = false; + cp += i + j; text_start = cp; } @@ -274,6 +311,10 @@ void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) tagText(event, text_start, cp); } +mp::HTMLParserEvent::~HTMLParserEvent() +{ +} + /* * Local variables: * c-basic-offset: 4