287 lines
11 KiB
Diff
287 lines
11 KiB
Diff
From 8e219b154e9b938af84c4b009aefa692020103f9 Mon Sep 17 00:00:00 2001
|
|
From: Nick Wellnhofer <wellnhofer@aevum.de>
|
|
Date: Sun, 12 Jul 2020 21:43:44 +0200
|
|
Subject: [PATCH] Fix HTML push parser lookahead
|
|
|
|
The parsing rules when looking for terminating chars or sequences in
|
|
the push parser differed from the actual parsing code. This could
|
|
cause the lookahead to overshoot and data to be rescanned,
|
|
potentially leading to quadratic runtime.
|
|
|
|
Comments must never be handled during lookahead. Attribute values must
|
|
only be skipped for start tags and doctype declarations, not for end
|
|
tags, comments, PIs and script content.
|
|
---
|
|
HTMLparser.c | 88 +++++++++++++---------------------------------------
|
|
1 file changed, 21 insertions(+), 67 deletions(-)
|
|
|
|
diff --git a/HTMLparser.c b/HTMLparser.c
|
|
index 06d8c602..d10cf11f 100644
|
|
--- a/HTMLparser.c
|
|
+++ b/HTMLparser.c
|
|
@@ -5136,7 +5136,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
|
|
* @first: the first char to lookup
|
|
* @next: the next char to lookup or zero
|
|
* @third: the next char to lookup or zero
|
|
- * @comment: flag to force checking inside comments
|
|
+ * @ignoreattrval: skip over attribute values
|
|
*
|
|
* Try to find if a sequence (first, next, third) or just (first next) or
|
|
* (first) is available in the input stream.
|
|
@@ -5150,13 +5150,11 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
|
|
*/
|
|
static int
|
|
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
|
|
- xmlChar next, xmlChar third, int iscomment,
|
|
- int ignoreattrval)
|
|
+ xmlChar next, xmlChar third, int ignoreattrval)
|
|
{
|
|
int base, len;
|
|
htmlParserInputPtr in;
|
|
const xmlChar *buf;
|
|
- int incomment = 0;
|
|
int invalue = 0;
|
|
char valdellim = 0x0;
|
|
|
|
@@ -5171,8 +5169,7 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
|
|
if (ctxt->checkIndex > base) {
|
|
base = ctxt->checkIndex;
|
|
/* Abuse hasPErefs member to restore current state. */
|
|
- incomment = ctxt->hasPErefs & 1 ? 1 : 0;
|
|
- invalue = ctxt->hasPErefs & 2 ? 1 : 0;
|
|
+ invalue = ctxt->hasPErefs & 1 ? 1 : 0;
|
|
}
|
|
|
|
if (in->buf == NULL) {
|
|
@@ -5189,14 +5186,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
|
|
else if (next)
|
|
len--;
|
|
for (; base < len; base++) {
|
|
- if ((!incomment) && (base + 4 < len) && (!iscomment)) {
|
|
- if ((buf[base] == '<') && (buf[base + 1] == '!') &&
|
|
- (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
|
|
- incomment = 1;
|
|
- /* do not increment past <! - some people use <!--> */
|
|
- base += 2;
|
|
- }
|
|
- }
|
|
if (ignoreattrval) {
|
|
if (buf[base] == '"' || buf[base] == '\'') {
|
|
if (invalue) {
|
|
@@ -5213,16 +5202,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
|
|
continue;
|
|
}
|
|
}
|
|
- if (incomment) {
|
|
- if (base + 3 > len)
|
|
- break;
|
|
- if ((buf[base] == '-') && (buf[base + 1] == '-') &&
|
|
- (buf[base + 2] == '>')) {
|
|
- incomment = 0;
|
|
- base += 2;
|
|
- }
|
|
- continue;
|
|
- }
|
|
if (buf[base] == first) {
|
|
if (third != 0) {
|
|
if ((buf[base + 1] != next) || (buf[base + 2] != third))
|
|
@@ -5251,11 +5230,10 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
|
|
}
|
|
ctxt->checkIndex = base;
|
|
/* Abuse hasPErefs member to track current state. */
|
|
- ctxt->hasPErefs = 0;
|
|
- if (incomment)
|
|
- ctxt->hasPErefs |= 1;
|
|
if (invalue)
|
|
- ctxt->hasPErefs |= 2;
|
|
+ ctxt->hasPErefs |= 1;
|
|
+ else
|
|
+ ctxt->hasPErefs &= ~1;
|
|
#ifdef DEBUG_PUSH
|
|
if (next == 0)
|
|
xmlGenericError(xmlGenericErrorContext,
|
|
@@ -5293,7 +5271,6 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
|
|
int base, len;
|
|
htmlParserInputPtr in;
|
|
const xmlChar *buf;
|
|
- int incomment = 0;
|
|
int i;
|
|
|
|
in = ctxt->input;
|
|
@@ -5304,11 +5281,8 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
|
|
if (base < 0)
|
|
return (-1);
|
|
|
|
- if (ctxt->checkIndex > base) {
|
|
+ if (ctxt->checkIndex > base)
|
|
base = ctxt->checkIndex;
|
|
- /* Abuse hasPErefs member to restore current state. */
|
|
- incomment = ctxt->hasPErefs & 1 ? 1 : 0;
|
|
- }
|
|
|
|
if (in->buf == NULL) {
|
|
buf = in->base;
|
|
@@ -5319,24 +5293,6 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
|
|
}
|
|
|
|
for (; base < len; base++) {
|
|
- if (!incomment && (base + 4 < len)) {
|
|
- if ((buf[base] == '<') && (buf[base + 1] == '!') &&
|
|
- (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
|
|
- incomment = 1;
|
|
- /* do not increment past <! - some people use <!--> */
|
|
- base += 2;
|
|
- }
|
|
- }
|
|
- if (incomment) {
|
|
- if (base + 3 > len)
|
|
- break;
|
|
- if ((buf[base] == '-') && (buf[base + 1] == '-') &&
|
|
- (buf[base + 2] == '>')) {
|
|
- incomment = 0;
|
|
- base += 2;
|
|
- }
|
|
- continue;
|
|
- }
|
|
for (i = 0; i < stopLen; ++i) {
|
|
if (buf[base] == stop[i]) {
|
|
ctxt->checkIndex = 0;
|
|
@@ -5345,8 +5301,6 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
|
|
}
|
|
}
|
|
ctxt->checkIndex = base;
|
|
- /* Abuse hasPErefs member to track current state. */
|
|
- ctxt->hasPErefs = incomment;
|
|
return (-1);
|
|
}
|
|
|
|
@@ -5489,7 +5443,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
|
|
(UPP(8) == 'E')) {
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
|
|
goto done;
|
|
#ifdef DEBUG_PUSH
|
|
xmlGenericError(xmlGenericErrorContext,
|
|
@@ -5536,7 +5490,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
if ((cur == '<') && (next == '!') &&
|
|
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
|
|
goto done;
|
|
#ifdef DEBUG_PUSH
|
|
xmlGenericError(xmlGenericErrorContext,
|
|
@@ -5546,7 +5500,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
ctxt->instate = XML_PARSER_MISC;
|
|
} else if ((cur == '<') && (next == '?')) {
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
|
goto done;
|
|
#ifdef DEBUG_PUSH
|
|
xmlGenericError(xmlGenericErrorContext,
|
|
@@ -5560,7 +5514,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
|
|
(UPP(8) == 'E')) {
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
|
|
goto done;
|
|
#ifdef DEBUG_PUSH
|
|
xmlGenericError(xmlGenericErrorContext,
|
|
@@ -5597,7 +5551,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
if ((cur == '<') && (next == '!') &&
|
|
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
|
|
goto done;
|
|
#ifdef DEBUG_PUSH
|
|
xmlGenericError(xmlGenericErrorContext,
|
|
@@ -5607,7 +5561,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
ctxt->instate = XML_PARSER_PROLOG;
|
|
} else if ((cur == '<') && (next == '?')) {
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
|
goto done;
|
|
#ifdef DEBUG_PUSH
|
|
xmlGenericError(xmlGenericErrorContext,
|
|
@@ -5645,7 +5599,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
if ((cur == '<') && (next == '!') &&
|
|
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
|
|
goto done;
|
|
#ifdef DEBUG_PUSH
|
|
xmlGenericError(xmlGenericErrorContext,
|
|
@@ -5655,7 +5609,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
ctxt->instate = XML_PARSER_EPILOG;
|
|
} else if ((cur == '<') && (next == '?')) {
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
|
goto done;
|
|
#ifdef DEBUG_PUSH
|
|
xmlGenericError(xmlGenericErrorContext,
|
|
@@ -5719,7 +5673,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
break;
|
|
}
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
|
|
goto done;
|
|
|
|
/* Capture start position */
|
|
@@ -5866,7 +5820,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
int idx;
|
|
xmlChar val;
|
|
|
|
- idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
|
|
+ idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
|
|
if (idx < 0)
|
|
goto done;
|
|
val = in->cur[idx + 2];
|
|
@@ -5893,7 +5847,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
|
|
(UPP(8) == 'E')) {
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
|
|
goto done;
|
|
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
|
|
"Misplaced DOCTYPE declaration\n",
|
|
@@ -5903,7 +5857,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
|
if ((!terminate) &&
|
|
(htmlParseLookupSequence(
|
|
- ctxt, '-', '-', '>', 1, 1) < 0))
|
|
+ ctxt, '-', '-', '>', 0) < 0))
|
|
goto done;
|
|
#ifdef DEBUG_PUSH
|
|
xmlGenericError(xmlGenericErrorContext,
|
|
@@ -5913,7 +5867,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
ctxt->instate = XML_PARSER_CONTENT;
|
|
} else if ((cur == '<') && (next == '?')) {
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
|
goto done;
|
|
#ifdef DEBUG_PUSH
|
|
xmlGenericError(xmlGenericErrorContext,
|
|
@@ -5984,7 +5938,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
if (avail < 2)
|
|
goto done;
|
|
if ((!terminate) &&
|
|
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
|
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
|
goto done;
|
|
htmlParseEndTag(ctxt);
|
|
if (ctxt->nameNr == 0) {
|
|
--
|
|
2.27.0
|
|
|