X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=src%2Fhtml_parser.cpp;h=36411950093c29c0a8d54a9d0744509ebd5dd156;hb=96a9d0598df0722dc31117fd4559bb4b23a2225e;hp=1436553f691048fa1b9cd66905ea2fa1c4d0ed5a;hpb=897639233e3a6232d039666ba38b393bf7ac0ef0;p=metaproxy-moved-to-github.git diff --git a/src/html_parser.cpp b/src/html_parser.cpp index 1436553..3641195 100644 --- a/src/html_parser.cpp +++ b/src/html_parser.cpp @@ -37,14 +37,12 @@ namespace metaproxy_1 { const char *text_start, const char *text_end); int tagEnd(HTMLParserEvent &event, const char *tag, int tag_len, const char *cp); - int tagStart(HTMLParserEvent &event, - int *tag_len, const char *cp, const char which); int tagAttrs(HTMLParserEvent &event, const char *name, int len, const char *cp); int skipAttribute(HTMLParserEvent &event, const char *cp, int *attr_len, - const char **value, int *val_len); + const char **value, int *val_len, int *tr); Rep(); ~Rep(); int m_verbose; @@ -81,6 +79,11 @@ void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const m_p->parse_str(event, str); } +static int isAlpha(int c) +{ + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + static int skipSpace(const char *cp) { int i = 0; @@ -99,8 +102,10 @@ static int skipName(const char *cp) int mp::HTMLParser::Rep::skipAttribute(HTMLParserEvent &event, const char *cp, int *attr_len, - const char **value, int *val_len) + const char **value, int *val_len, + int *tr) { + int v0, v1; int i = skipName(cp); *attr_len = i; *value = NULL; @@ -109,14 +114,13 @@ int mp::HTMLParser::Rep::skipAttribute(HTMLParserEvent &event, i += skipSpace(cp + i); if (cp[i] == '=') { - int v0, v1; i++; i += skipSpace(cp + i); if (cp[i] == '\"' || cp[i] == '\'') { - char tr = cp[i]; + *tr = cp[i]; v0 = ++i; - while (cp[i] != tr && cp[i]) + while (cp[i] != *tr && cp[i]) i++; v1 = i; if (cp[i]) @@ -124,6 +128,7 @@ int mp::HTMLParser::Rep::skipAttribute(HTMLParserEvent &event, } else { + *tr = 0; v0 = i; while (cp[i] && !strchr(SPACECHR ">", cp[i])) i++; @@ -131,8 +136,8 @@ int mp::HTMLParser::Rep::skipAttribute(HTMLParserEvent &event, } *value = cp + v0; *val_len = v1 - v0; + i += skipSpace(cp + i); } - i += skipSpace(cp + i); return i; } @@ -147,63 +152,19 @@ int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event, int attr_len; const char *value; int val_len; - int nor = skipAttribute(event, cp+i, &attr_len, &value, &val_len); + int tr; + char x[2]; + int nor = skipAttribute(event, cp+i, &attr_len, &value, &val_len, &tr); + if (!nor) + break; i += nor; - if (nor) - { - if (m_verbose) - printf ("------ attr %.*s=%.*s\n", attr_len, attr_name, - val_len, value); - event.attribute(name, len, attr_name, attr_len, value, val_len); - } - else - { - i++; - } - } - return i; -} -int mp::HTMLParser::Rep::tagStart(HTMLParserEvent &event, - int *tag_len, - const char *cp, const char which) -{ - int i; - switch (which) - { - case '/': - i = skipName(cp); - *tag_len = i; - if (m_verbose) - printf("------ tag close %.*s\n", i, cp); - event.closeTag(cp, i); - break; - case '!': - for (i = 0; cp[i] && cp[i] != '>'; i++) - ; - *tag_len = i; - event.openTagStart(cp, i); + x[0] = tr; + x[1] = 0; if (m_verbose) - printf("------ dtd %.*s\n", i, cp); - break; - case '?': - for (i = 0; cp[i] && cp[i] != '>'; i++) - ; - *tag_len = i; - event.openTagStart(cp, i); - if (m_verbose) - printf("------ pi %.*s\n", i, cp); - break; - default: - i = skipName(cp); - *tag_len = i; - if (m_verbose) - printf("------ tag open %.*s\n", i, cp); - event.openTagStart(cp, i); - - i += tagAttrs(event, cp, i, cp + i); - - break; + printf ("------ attr %.*s=%.*s\n", attr_len, attr_name, + val_len, value); + event.attribute(name, len, attr_name, attr_len, value, val_len, x); } return i; } @@ -216,7 +177,11 @@ int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event, for (; cp[i] && cp[i] != '/' && cp[i] != '>'; i++) ; if (i > 0) + { + if (m_verbose) + printf("------ text %.*s\n", i, cp); event.text(cp, i); + } if (cp[i] == '/') { close_it = 1; @@ -224,6 +189,9 @@ int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event, } if (cp[i] == '>') { + if (m_verbose) + printf("------ any tag %s %.*s\n", + close_it ? " close" : "end", tag_len, tag); event.anyTagEnd(tag, tag_len, close_it); i++; } @@ -245,34 +213,82 @@ void mp::HTMLParser::Rep::tagText(HTMLParserEvent &event, void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp) { const char *text_start = cp; - const char *text_end = cp; while (*cp) { - if (cp[0] == '<' && cp[1]) //tag? + if (*cp++ != '<') + continue; + + if (*cp == '!') { - char which = cp[1]; - if (which == '/') - cp++; - if (!strchr(SPACECHR, cp[1])) //valid tag starts + int i; + tagText(event, text_start, cp - 1); + if (cp[1] == '-' && cp[2] == '-') { - int i = 0; - int tag_len; - - tagText(event, text_start, text_end); //flush any text - cp++; - i += tagStart(event, &tag_len, cp, which); - i += tagEnd(event, cp, tag_len, cp + i); - cp += i; - text_start = cp; - text_end = cp; - continue; + for (i = 3; cp[i]; i++) + if (cp[i] == '-' && cp[i+1] == '-' && cp[i+2] == '>') + { + i+= 2; + event.openTagStart(cp, i); + break; + } } + else + { + for (i = 1; cp[i] && cp[i] != '>'; i++) + ; + event.openTagStart(cp, i); + } + if (m_verbose) + printf("------ dtd %.*s\n", i, cp); + i += tagEnd(event, cp, i, cp + i); + cp += i; + text_start = cp; + } + else if (*cp == '?') + { + int i; + tagText(event, text_start, cp - 1); + for (i = 1; cp[i] && cp[i] != '>'; i++) + ; + event.openTagStart(cp, i); + if (m_verbose) + printf("------ pi %.*s\n", i, cp); + i += tagEnd(event, cp, i, cp + i); + cp += i; + text_start = cp; + } + else if (*cp == '/' && isAlpha(cp[1])) + { + int i; + tagText(event, text_start, cp - 1); + + i = skipName(++cp); + event.closeTag(cp, i); + if (m_verbose) + printf("------ tag close %.*s\n", i, cp); + i += tagEnd(event, cp, i, cp + i); + cp += i; + text_start = cp; + } + else if (isAlpha(*cp)) + { + int i, j; + tagText(event, text_start, cp - 1); + i = skipName(cp); + event.openTagStart(cp, i); + if (m_verbose) + printf("------ tag open %.*s\n", i, cp); + j = tagAttrs(event, cp, i, cp + i); + j += tagEnd(event, cp, i, cp + i + j); + cp += i + j; + text_start = cp; } - //text - cp++; - text_end = cp; } - tagText(event, text_start, text_end); //flush any text + tagText(event, text_start, cp); +} + +mp::HTMLParserEvent::~HTMLParserEvent() +{ } /*