HTMLParser more forgiving with bad attributes
author: Adam Dickmeiss <adam@indexdata.dk>
Thu, 27 Jun 2013 14:40:50 +0000 (16:40 +0200)
committer: Adam Dickmeiss <adam@indexdata.dk>
Thu, 27 Jun 2013 14:40:50 +0000 (16:40 +0200)
The idea is not to fix bad attributes, but at least to pass them through as text.

src/html_parser.cpp
src/test_html_parser.cpp

index ddbbbe0..e704620 100644 (file)
@@ -102,38 +102,39 @@ int mp::HTMLParser::Rep::skipAttribute(HTMLParserEvent &event,
                                        const char **value, int *val_len,
                                        int *tr)
 {
+    int v0, v1;
     int i = skipName(cp);
     *attr_len = i;
     *value = NULL;
     if (!i)
         return skipSpace(cp);
     i += skipSpace(cp + i);
-    if (cp[i] == '=')
+    if (cp[i] != '=')
+        return 0;
+
+    i++;
+    i += skipSpace(cp + i);
+    if (cp[i] == '\"' || cp[i] == '\'')
     {
-        int v0, v1;
-        i++;
-        i += skipSpace(cp + i);
-        if (cp[i] == '\"' || cp[i] == '\'')
-        {
-            *tr = cp[i];
-            v0 = ++i;
-            while (cp[i] != *tr && cp[i])
-                i++;
-            v1 = i;
-            if (cp[i])
-                i++;
-        }
-        else
-        {
-            *tr = 0;
-            v0 = i;
-            while (cp[i] && !strchr(SPACECHR ">", cp[i]))
-                i++;
-            v1 = i;
-        }
-        *value = cp + v0;
-        *val_len = v1 - v0;
+        *tr = cp[i];
+        v0 = ++i;
+        while (cp[i] != *tr && cp[i])
+            i++;
+        v1 = i;
+        if (cp[i])
+            i++;
+    }
+    else
+    {
+        *tr = 0;
+        v0 = i;
+        while (cp[i] && !strchr(SPACECHR ">", cp[i]))
+            i++;
+        v1 = i;
     }
+    *value = cp + v0;
+    *val_len = v1 - v0;
+
     i += skipSpace(cp + i);
     return i;
 }
@@ -150,22 +151,18 @@ int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event,
         const char *value;
         int val_len;
         int tr;
+        char x[2];
         int nor = skipAttribute(event, cp+i, &attr_len, &value, &val_len, &tr);
+        if (!nor)
+            break;
         i += nor;
-       if (nor)
-       {
-            char x[2];
-            x[0] = tr;
-            x[1] = 0;
-            if (m_verbose)
-                printf ("------ attr %.*s=%.*s\n", attr_len, attr_name,
-                        val_len, value);
-            event.attribute(name, len, attr_name, attr_len, value, val_len, x);
-       }
-        else
-        {
-            i++;
-        }
+
+        x[0] = tr;
+        x[1] = 0;
+        if (m_verbose)
+            printf ("------ attr %.*s=%.*s\n", attr_len, attr_name,
+                    val_len, value);
+        event.attribute(name, len, attr_name, attr_len, value, val_len, x);
     }
     return i;
 }
@@ -222,7 +219,11 @@ int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event,
     for (; cp[i] && cp[i] != '/' && cp[i] != '>'; i++)
         ;
     if (i > 0)
+    {
+        if (m_verbose)
+            printf("------ text %.*s\n", i, cp);
         event.text(cp, i);
+    }
     if (cp[i] == '/')
     {
         close_it = 1;
@@ -230,6 +231,9 @@ int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event,
     }
     if (cp[i] == '>')
     {
+        if (m_verbose)
+            printf("------ any tag %s %.*s\n",
+                   close_it ? " close" : "end", tag_len, tag);
         event.anyTagEnd(tag, tag_len, close_it);
         i++;
     }
index 6f5c134..baf42bc 100644 (file)
@@ -80,14 +80,17 @@ BOOST_AUTO_TEST_CASE( test_html_parser_1 )
             "<html><body><a t1=v1 t2='v2' t3=\"v3\">some text</a>"
             "<hr><table></table  ><a href=\"x\"/></body></html>";
         MyEvent e;
-        hp.set_verbose(1);
+        hp.set_verbose(0);
         hp.parse(e, html);
 
-        std::cout << "Expected" << std::endl;
-        std::cout << expected << std::endl;
-        std::cout << "Got" << std::endl;
-        std::cout << e.out << std::endl;
         BOOST_CHECK_EQUAL(std::string(expected), e.out);
+        if (std::string(expected) != e.out)
+        {
+            std::cout << "Expected" << std::endl;
+            std::cout << expected << std::endl;
+            std::cout << "Got" << std::endl;
+            std::cout << e.out << std::endl;
+        }
     }
     catch (std::exception & e)
     {
@@ -117,15 +120,17 @@ BOOST_AUTO_TEST_CASE( test_html_parser_2 )
 
         const char* expected = html;
         MyEvent e;
-        hp.set_verbose(1);
+        hp.set_verbose(0);
         hp.parse(e, html);
 
-        std::cout << "Expected" << std::endl;
-        std::cout << expected << std::endl;
-        std::cout << "Got" << std::endl;
-        std::cout << e.out << std::endl;
-
         BOOST_CHECK_EQUAL(std::string(expected), e.out);
+        if (std::string(expected) != e.out)
+        {
+            std::cout << "Expected" << std::endl;
+            std::cout << expected << std::endl;
+            std::cout << "Got" << std::endl;
+            std::cout << e.out << std::endl;
+        }
     }
     catch (std::exception & e) 
     {
@@ -152,15 +157,17 @@ BOOST_AUTO_TEST_CASE( test_html_parser_3 )
 
         const char* expected = html;
         MyEvent e;
-        hp.set_verbose(1);
+        hp.set_verbose(0);
         hp.parse(e, html);
 
-        std::cout << "Expected" << std::endl;
-        std::cout << expected << std::endl;
-        std::cout << "Got" << std::endl;
-        std::cout << e.out << std::endl;
-
         BOOST_CHECK_EQUAL(std::string(expected), e.out);
+        if (std::string(expected) != e.out)
+        {
+            std::cout << "Expected" << std::endl;
+            std::cout << expected << std::endl;
+            std::cout << "Got" << std::endl;
+            std::cout << e.out << std::endl;
+        }
     }
     catch (std::exception & e) 
     {
@@ -170,28 +177,28 @@ BOOST_AUTO_TEST_CASE( test_html_parser_3 )
     }
 }
 
-#if 0
-// null ptr exception
 BOOST_AUTO_TEST_CASE( test_html_parser_4 )
 {
     try
     {
         mp::HTMLParser hp;
         const char* html =
-            "<\"?xml version=\"1.0\" strandalone=\"no\"?>\n"
-            "<book></book>";
+            "<\"?xml version=\"1.0\" strandalone=\"no\"?  ax>\n"
+            "<book></book>";  // <book badboy></book> does not work
 
         const char* expected = html;
         MyEvent e;
         hp.set_verbose(1);
         hp.parse(e, html);
 
-        std::cout << "Expected" << std::endl;
-        std::cout << expected << std::endl;
-        std::cout << "Got" << std::endl;
-        std::cout << e.out << std::endl;
-
         BOOST_CHECK_EQUAL(std::string(expected), e.out);
+        if (std::string(expected) != e.out)
+        {
+            std::cout << "Expected" << std::endl;
+            std::cout << expected << std::endl;
+            std::cout << "Got" << std::endl;
+            std::cout << e.out << std::endl;
+        }
     }
     catch (std::exception & e) 
     {
@@ -200,7 +207,6 @@ BOOST_AUTO_TEST_CASE( test_html_parser_4 )
         BOOST_CHECK (false);
     }
 }
-#endif
 
 /*
  * Local variables: