/* This file is part of Metaproxy.
- Copyright (C) 2005-2013 Index Data
+ Copyright (C) Index Data
Metaproxy is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
+#include <yaz/matchstr.h>
#define SPACECHR " \t\r\n\f"
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
namespace metaproxy_1 {
class HTMLParser::Rep {
Rep();
~Rep();
int m_verbose;
+ bool nest;
};
}
/*
 * Set up the parser state with its defaults.
 *
 * m_verbose: 0 disables the diagnostic printf() output that the tag and
 *            attribute handlers emit when it is non-zero.
 * nest:      true means markup is interpreted normally. The parse loop
 *            sets it to false after a <script> open tag and back to true
 *            once the matching </script> close tag has been seen.
 */
mp::HTMLParser::Rep::Rep()
{
    m_verbose = 0;
    nest = true;
}
mp::HTMLParser::Rep::~Rep()
/*
 * Return the length of the name token that starts at cp.
 *
 * The scan stops at the terminating NUL, at any character listed in
 * SPACECHR, or at one of '/', '>', '<' and '='. The result is 0 when cp
 * already points at one of those terminators.
 *
 * cp must be a NUL-terminated string.
 */
static int skipName(const char *cp)
{
    int i;
    /* cp[i] is tested first so strchr() is never asked to match NUL,
       which it would find at the end of the delimiter string. */
    for (i = 0; cp[i] && !strchr(SPACECHR "/><=", cp[i]); i++)
        ;
    return i;
}
const char *cp)
{
int i = skipSpace(cp);
- while (cp[i] && cp[i] != '>' && cp[i] != '/')
+ while (cp[i] && !strchr("/><", cp[i]))
{
const char *attr_name = cp + i;
int attr_len;
x[0] = tr;
x[1] = 0;
if (m_verbose)
- printf ("------ attr %.*s=%.*s\n", attr_len, attr_name,
- val_len, value);
+ {
+ printf("------ attr %.*s", attr_len, attr_name);
+ if (value)
+ printf("=%.*s", val_len, value);
+ printf("\n");
+ }
event.attribute(name, len, attr_name, attr_len, value, val_len, x);
}
return i;
{
int i = 0;
int close_it = 0;
- for (; cp[i] && cp[i] != '/' && cp[i] != '>'; i++)
+ for (; cp[i] && !strchr("/><", cp[i]); i++)
;
if (i > 0)
{
{
if (m_verbose)
printf("------ any tag %s %.*s\n",
- close_it ? " close" : "end", tag_len, tag);
+ close_it ? "close" : "end", tag_len, tag);
event.anyTagEnd(tag, tag_len, close_it);
i++;
}
if (*cp++ != '<')
continue;
- if (*cp == '!')
+ if (nest && *cp == '!')
{
int i;
tagText(event, text_start, cp - 1);
- for (i = 1; cp[i] && cp[i] != '>'; i++)
- ;
- event.openTagStart(cp, i);
+ if (cp[1] == '-' && cp[2] == '-')
+ {
+ for (i = 3; cp[i]; i++)
+ if (cp[i] == '-' && cp[i+1] == '-' && cp[i+2] == '>')
+ {
+ i+= 2;
+ event.openTagStart(cp, i);
+ break;
+ }
+ }
+ else
+ {
+ for (i = 1; cp[i] && cp[i] != '>'; i++)
+ ;
+ event.openTagStart(cp, i);
+ }
if (m_verbose)
printf("------ dtd %.*s\n", i, cp);
i += tagEnd(event, cp, i, cp + i);
cp += i;
text_start = cp;
}
- else if (*cp == '?')
+ else if (nest && *cp == '?')
{
int i;
tagText(event, text_start, cp - 1);
else if (*cp == '/' && isAlpha(cp[1]))
{
int i;
- tagText(event, text_start, cp - 1);
-
+
i = skipName(++cp);
+
+ if (!nest)
+ {
+ if (i == 6 && !yaz_strncasecmp(cp, "script", i))
+ {
+ int ws = skipSpace(cp + 6);
+ if (cp[ws + 6] == '>')
+ nest = true; /* really terminated */
+ }
+ if (!nest)
+ continue;
+ }
+ tagText(event, text_start, cp - 2);
event.closeTag(cp, i);
if (m_verbose)
printf("------ tag close %.*s\n", i, cp);
cp += i;
text_start = cp;
}
- else if (isAlpha(*cp))
+ else if (nest && isAlpha(*cp))
{
int i, j;
tagText(event, text_start, cp - 1);
printf("------ tag open %.*s\n", i, cp);
j = tagAttrs(event, cp, i, cp + i);
j += tagEnd(event, cp, i, cp + i + j);
+
+ if (i == 6 && !yaz_strncasecmp(cp, "script", i))
+ nest = false;
+
cp += i + j;
text_start = cp;
}
tagText(event, text_start, cp);
}
+mp::HTMLParserEvent::~HTMLParserEvent()
+{
+}
+
/*
* Local variables:
* c-basic-offset: 4