#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
+#include <yaz/matchstr.h>
#define SPACECHR " \t\r\n\f"
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
namespace metaproxy_1 {
// Private implementation state and helper methods of HTMLParser.
// NOTE(review): only part of this declaration is visible here; members
// declared in the elided portion are not described.
class HTMLParser::Rep {
const char *text_start, const char *text_end);
// Handles the end of a tag starting at cp; tag/tag_len identify the tag.
int tagEnd(HTMLParserEvent &event,
const char *tag, int tag_len, const char *cp);
- int tagStart(HTMLParserEvent &event,
- int *tag_len, const char *cp, const char which);
// Reports the attributes found at cp for the tag name/len to event.
int tagAttrs(HTMLParserEvent &event,
const char *name, int len,
const char *cp);
// Scans one attribute at cp; reports the name length, the value span and,
// through tr, the quote character that delimited the value (0 if unquoted).
+ int skipAttribute(HTMLParserEvent &event,
+ const char *cp, int *attr_len,
+ const char **value, int *val_len, int *tr);
Rep();
~Rep();
// Non-zero enables printf tracing of parse events.
int m_verbose;
// While false, parse_str ignores all markup except a closing script tag;
// it is cleared after an opening script tag and set again at its end tag.
+ bool nest;
};
}
// Initialises the parser state: tracing is off and `nest` is true, so
// parse_str recognises markup from the first character.
mp::HTMLParser::Rep::Rep()
{
m_verbose = 0;
+ nest = true;
}
mp::HTMLParser::Rep::~Rep()
m_p->parse_str(event, str);
}
// Returns non-zero when c is an ASCII letter (a-z or A-Z) and zero otherwise.
// Uses explicit range comparisons rather than a <ctype.h> classification call.
+static int isAlpha(int c)
+{
+ return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+}
+
static int skipSpace(const char *cp)
{
int i = 0;
// Returns the length of the name that starts at cp: the count of leading
// characters before the terminating NUL or the first delimiter.
// Delimiters are the SPACECHR whitespace set plus '/', '>' and '='; the
// replacement loop line additionally treats '<' as a delimiter.
static int skipName(const char *cp)
{
int i;
- for (i = 0; cp[i] && !strchr(SPACECHR "/>=", cp[i]); i++)
+ for (i = 0; cp[i] && !strchr(SPACECHR "/><=", cp[i]); i++)
;
return i;
}
// Scans a single attribute (name, optionally followed by =value) at cp.
//
// Outputs:
//   *attr_len  length of the attribute name, as measured by skipName.
//   *value     start of the value text, or NULL when no '=' follows the name.
//   *val_len   length of the value; written only when a value is present.
//   *tr        quote character that delimited the value, or 0 for an
//              unquoted value.
// Returns the number of characters consumed from cp.
//
// NOTE(review): *tr is not written when the attribute has no '=' part, so a
// caller that reads it for a valueless attribute sees whatever it held
// before the call - confirm callers initialise it.
// NOTE(review): `event` is not used in the visible lines.
// NOTE(review): one or more lines after `if (cp[i])` are elided in this view,
// and no assignment to v1 is visible in the unquoted branch - verify against
// the full file.
-static int skipAttribute(const char *cp, int *attr_len,
- const char **value, int *val_len)
+int mp::HTMLParser::Rep::skipAttribute(HTMLParserEvent &event,
+ const char *cp, int *attr_len,
+ const char **value, int *val_len,
+ int *tr)
{
+ int v0, v1;
int i = skipName(cp);
*attr_len = i;
*value = NULL;
// Whitespace is allowed between the name and '='.
i += skipSpace(cp + i);
if (cp[i] == '=')
{
- int v0, v1;
i++;
i += skipSpace(cp + i);
if (cp[i] == '\"' || cp[i] == '\'')
{
// Quoted value: remember the quote and scan to its match or to NUL.
- char tr = cp[i];
+ *tr = cp[i];
v0 = ++i;
- while (cp[i] != tr && cp[i])
+ while (cp[i] != *tr && cp[i])
i++;
v1 = i;
if (cp[i])
}
else
{
// Unquoted value: runs up to whitespace, '>' or NUL.
+ *tr = 0;
v0 = i;
while (cp[i] && !strchr(SPACECHR ">", cp[i]))
i++;
}
*value = cp + v0;
*val_len = v1 - v0;
// Trailing whitespace is now consumed only after a value was read.
+ i += skipSpace(cp + i);
}
- i += skipSpace(cp + i);
return i;
}
const char *cp)
{
int i = skipSpace(cp);
- while (cp[i] && cp[i] != '>' && cp[i] != '/')
+ while (cp[i] && !strchr("/><", cp[i]))
{
const char *attr_name = cp + i;
int attr_len;
const char *value;
int val_len;
- int nor = skipAttribute(cp+i, &attr_len, &value, &val_len);
+ int tr;
+ char x[2];
+ int nor = skipAttribute(event, cp+i, &attr_len, &value, &val_len, &tr);
+ if (!nor)
+ break;
i += nor;
- if (nor)
- {
- if (m_verbose)
- printf ("------ attr %.*s=%.*s\n", attr_len, attr_name,
- val_len, value);
- event.attribute(name, len, attr_name, attr_len, value, val_len);
- }
- else
- {
- i++;
- }
- }
- return i;
-}
-int mp::HTMLParser::Rep::tagStart(HTMLParserEvent &event,
- int *tag_len,
- const char *cp, const char which)
-{
- int i;
- switch (which)
- {
- case '/':
- i = skipName(cp);
- *tag_len = i;
- if (m_verbose)
- printf("------ tag close %.*s\n", i, cp);
- event.closeTag(cp, i);
- break;
- case '!':
- for (i = 0; cp[i] && cp[i] != '>'; i++)
- ;
- *tag_len = i;
- event.openTagStart(cp, i);
- if (m_verbose)
- printf("------ dtd %.*s\n", i, cp);
- break;
- case '?':
- for (i = 0; cp[i] && cp[i] != '>'; i++)
- ;
- *tag_len = i;
- event.openTagStart(cp, i);
- if (m_verbose)
- printf("------ pi %.*s\n", i, cp);
- break;
- default:
- i = skipName(cp);
- *tag_len = i;
+ x[0] = tr;
+ x[1] = 0;
if (m_verbose)
- printf("------ tag open %.*s\n", i, cp);
- event.openTagStart(cp, i);
-
- i += tagAttrs(event, cp, i, cp + i);
-
- break;
+ {
+ printf("------ attr %.*s", attr_len, attr_name);
+ if (value)
+ printf("=%.*s", val_len, value);
+ printf("\n");
+ }
+ event.attribute(name, len, attr_name, attr_len, value, val_len, x);
}
return i;
}
{
int i = 0;
int close_it = 0;
- while (cp[i] && cp[i] != '>')
+ for (; cp[i] && !strchr("/><", cp[i]); i++)
+ ;
+ if (i > 0)
+ {
+ if (m_verbose)
+ printf("------ text %.*s\n", i, cp);
+ event.text(cp, i);
+ }
+ if (cp[i] == '/')
{
- if (cp[i] == '/')
- close_it = 1;
+ close_it = 1;
i++;
}
if (cp[i] == '>')
{
+ if (m_verbose)
+ printf("------ any tag %s %.*s\n",
+ close_it ? "close" : "end", tag_len, tag);
event.anyTagEnd(tag, tag_len, close_it);
i++;
}
// Walks the NUL-terminated buffer cp and reports what it finds to event.
//
// text_start marks the beginning of the text that has not yet been handed to
// tagText. Each recognised construct first passes the text in front of its
// '<' to tagText, then reports the construct, then restarts text_start just
// after it. A '<' that matches none of the branches is left as part of the
// running text.
//
// While `nest` is false (set after an opening script tag) only a closing
// script tag is recognised; everything else stays in the running text.
void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp)
{
const char *text_start = cp;
- const char *text_end = cp;
while (*cp)
{
- if (cp[0] == '<' && cp[1]) //tag?
// Advance one character; anything other than '<' is plain text.
// After this test cp points at the character following the '<'.
+ if (*cp++ != '<')
+ continue;
+
// "<!" - a comment ("<!--" up to "-->") or another declaration (up to '>').
+ if (nest && *cp == '!')
+ {
+ int i;
+ tagText(event, text_start, cp - 1);
+ if (cp[1] == '-' && cp[2] == '-')
+ {
// Search for the closing "-->"; on a match i is moved onto the '>'.
// When no terminator exists the loop ends at NUL and openTagStart is
// not called for this construct.
+ for (i = 3; cp[i]; i++)
+ if (cp[i] == '-' && cp[i+1] == '-' && cp[i+2] == '>')
+ {
+ i+= 2;
+ event.openTagStart(cp, i);
+ break;
+ }
+ }
+ else
+ {
+ for (i = 1; cp[i] && cp[i] != '>'; i++)
+ ;
+ event.openTagStart(cp, i);
+ }
+ if (m_verbose)
+ printf("------ dtd %.*s\n", i, cp);
+ i += tagEnd(event, cp, i, cp + i);
+ cp += i;
+ text_start = cp;
+ }
// "<?" - processing instruction, reported up to (not including) '>'.
+ else if (nest && *cp == '?')
+ {
+ int i;
+ tagText(event, text_start, cp - 1);
+ for (i = 1; cp[i] && cp[i] != '>'; i++)
+ ;
+ event.openTagStart(cp, i);
+ if (m_verbose)
+ printf("------ pi %.*s\n", i, cp);
+ i += tagEnd(event, cp, i, cp + i);
+ cp += i;
+ text_start = cp;
+ }
// "</" followed by a letter - closing tag. This branch is deliberately not
// gated on `nest`, so that the end of a script element can be detected.
+ else if (*cp == '/' && isAlpha(cp[1]))
{
- char which = cp[1];
- if (which == '/')
- cp++;
- if (!strchr(SPACECHR, cp[1])) //valid tag starts
+ int i;
+
// Step over the '/' and measure the tag name.
+ i = skipName(++cp);
+
+ if (!nest)
{
- int i = 0;
- int tag_len;
-
- tagText(event, text_start, text_end); //flush any text
- cp++;
- i += tagStart(event, &tag_len, cp, which);
- i += tagEnd(event, cp, tag_len, cp + i);
- cp += i;
- text_start = cp;
- text_end = cp;
- continue;
// Inside a script element: only "script", optionally followed by
// whitespace and then '>', ends it. Any other closing tag is skipped
// and remains part of the running text.
+ if (i == 6 && !yaz_strncasecmp(cp, "script", i))
+ {
+ int ws = skipSpace(cp + 6);
+ if (cp[ws + 6] == '>')
+ nest = true; /* really terminated */
+ }
+ if (!nest)
+ continue;
}
// cp is now two characters past the '<', hence cp - 2 for the text end.
+ tagText(event, text_start, cp - 2);
+ event.closeTag(cp, i);
+ if (m_verbose)
+ printf("------ tag close %.*s\n", i, cp);
+ i += tagEnd(event, cp, i, cp + i);
+ cp += i;
+ text_start = cp;
+ }
// '<' followed by a letter - opening tag with optional attributes.
+ else if (nest && isAlpha(*cp))
+ {
+ int i, j;
+ tagText(event, text_start, cp - 1);
+ i = skipName(cp);
+ event.openTagStart(cp, i);
+ if (m_verbose)
+ printf("------ tag open %.*s\n", i, cp);
// i is the tag name length; j accumulates what tagAttrs and tagEnd return.
+ j = tagAttrs(event, cp, i, cp + i);
+ j += tagEnd(event, cp, i, cp + i + j);
+
// After an opening script tag, stop interpreting markup until its end tag.
+ if (i == 6 && !yaz_strncasecmp(cp, "script", i))
+ nest = false;
+
+ cp += i + j;
+ text_start = cp;
}
- //text
- cp++;
- text_end = cp;
}
- tagText(event, text_start, text_end); //flush any text
// Hand over whatever text remains after the last recognised construct.
+ tagText(event, text_start, cp);
+}
+
// Out-of-line definition of the HTMLParserEvent destructor; the body is empty.
+mp::HTMLParserEvent::~HTMLParserEvent()
+{
}
/*