From 0c79c441320e67584693ec9bf4bc9649a1c3e1cb Mon Sep 17 00:00:00 2001 From: panxiaohe Date: Fri, 12 Nov 2021 17:25:21 +0800 Subject: [PATCH] add backport bug fixes --- Fix-buffering-in-xmlOutputBufferWrite.patch | 63 ++++++ ...ression-in-xmlNodeDumpOutputInternal.patch | 46 ++++ ...hen-serializing-empty-HTML-documents.patch | 43 ++++ ...id-epsilon-reduction-of-final-states.patch | 60 +++++ Work-around-lxml-API-abuse.patch | 212 ++++++++++++++++++ libxml2.spec | 18 +- 6 files changed, 441 insertions(+), 1 deletion(-) create mode 100644 Fix-buffering-in-xmlOutputBufferWrite.patch create mode 100644 Fix-regression-in-xmlNodeDumpOutputInternal.patch create mode 100644 Fix-whitespace-when-serializing-empty-HTML-documents.patch create mode 100644 Patch-to-forbid-epsilon-reduction-of-final-states.patch create mode 100644 Work-around-lxml-API-abuse.patch diff --git a/Fix-buffering-in-xmlOutputBufferWrite.patch b/Fix-buffering-in-xmlOutputBufferWrite.patch new file mode 100644 index 0000000..eb81c72 --- /dev/null +++ b/Fix-buffering-in-xmlOutputBufferWrite.patch @@ -0,0 +1,63 @@ +From dea91c97debeac7c1aaf9c19f79029809e23a353 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 27 Jul 2021 16:12:54 +0200 +Subject: [PATCH] Fix buffering in xmlOutputBufferWrite + +Fix a regression introduced with commit a697ed1e which caused +xmlOutputBufferWrite to flush internal buffers too late. + +Fixes #296. +--- + xmlIO.c | 20 ++++++++++++++++---- + 1 file changed, 16 insertions(+), 4 deletions(-) + +diff --git a/xmlIO.c b/xmlIO.c +index 57312b9..f20c0fa 100644 +--- a/xmlIO.c ++++ b/xmlIO.c +@@ -3401,12 +3401,18 @@ xmlOutputBufferWrite(xmlOutputBufferPtr out, int len, const char *buf) { + out->error = XML_IO_ENCODER; + return(-1); + } +- nbchars = ret >= 0 ? ret : 0; ++ if (out->writecallback) ++ nbchars = xmlBufUse(out->conv); ++ else ++ nbchars = ret >= 0 ? ret : 0; + } else { + ret = xmlBufAdd(out->buffer, (const xmlChar *) buf, chunk); + if (ret != 0) + return(-1); +- nbchars = chunk; ++ if (out->writecallback) ++ nbchars = xmlBufUse(out->buffer); ++ else ++ nbchars = chunk; + } + buf += chunk; + len -= chunk; +@@ -3593,13 +3599,19 @@ xmlOutputBufferWriteEscape(xmlOutputBufferPtr out, const xmlChar *str, + out->error = XML_IO_ENCODER; + return(-1); + } +- nbchars = ret >= 0 ? ret : 0; ++ if (out->writecallback) ++ nbchars = xmlBufUse(out->conv); ++ else ++ nbchars = ret >= 0 ? ret : 0; + } else { + ret = escaping(xmlBufEnd(out->buffer), &chunk, str, &cons); + if ((ret < 0) || (chunk == 0)) /* chunk==0 => nothing done */ + return(-1); + xmlBufAddLen(out->buffer, chunk); +- nbchars = chunk; ++ if (out->writecallback) ++ nbchars = xmlBufUse(out->buffer); ++ else ++ nbchars = chunk; + } + str += cons; + len -= cons; +-- +1.8.3.1 + diff --git a/Fix-regression-in-xmlNodeDumpOutputInternal.patch b/Fix-regression-in-xmlNodeDumpOutputInternal.patch new file mode 100644 index 0000000..c3c2bc1 --- /dev/null +++ b/Fix-regression-in-xmlNodeDumpOutputInternal.patch @@ -0,0 +1,46 @@ +From 13ad8736d294536da4cbcd70a96b0a2fbf47070c Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 25 May 2021 10:55:25 +0200 +Subject: [PATCH] Fix regression in xmlNodeDumpOutputInternal + +Commit 85b1792e could cause additional whitespace if xmlNodeDump was +called with a non-zero starting level. +--- + xmlsave.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/xmlsave.c b/xmlsave.c +index aedbd5e..489505f 100644 +--- a/xmlsave.c ++++ b/xmlsave.c +@@ -890,6 +890,13 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + + case XML_ELEMENT_NODE: ++ if ((cur != root) && (ctxt->format == 1) && ++ (xmlIndentTreeOutput)) ++ xmlOutputBufferWrite(buf, ctxt->indent_size * ++ (ctxt->level > ctxt->indent_nr ? ++ ctxt->indent_nr : ctxt->level), ++ ctxt->indent); ++ + /* + * Some users like lxml are known to pass nodes with a corrupted + * tree structure. Fall back to a recursive call to handle this +@@ -900,13 +907,6 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + } + +- if ((ctxt->level > 0) && (ctxt->format == 1) && +- (xmlIndentTreeOutput)) +- xmlOutputBufferWrite(buf, ctxt->indent_size * +- (ctxt->level > ctxt->indent_nr ? +- ctxt->indent_nr : ctxt->level), +- ctxt->indent); +- + xmlOutputBufferWrite(buf, 1, "<"); + if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { + xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); +-- +1.8.3.1 + diff --git a/Fix-whitespace-when-serializing-empty-HTML-documents.patch b/Fix-whitespace-when-serializing-empty-HTML-documents.patch new file mode 100644 index 0000000..2041f99 --- /dev/null +++ b/Fix-whitespace-when-serializing-empty-HTML-documents.patch @@ -0,0 +1,43 @@ +From 92d9ab4c28842a09ca2b76d3ff2f933e01b6cd6f Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Mon, 7 Jun 2021 15:09:53 +0200 +Subject: [PATCH] Fix whitespace when serializing empty HTML documents + +The old, non-recursive HTML serialization code would always terminate +the output with a newline. The new implementation omitted the newline +if the document node had no children. Readd the newline when +serializing empty documents. + +Fixes #266. +--- + HTMLtree.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/HTMLtree.c b/HTMLtree.c +index bdd639c..7a2b855 100644 +--- a/HTMLtree.c ++++ b/HTMLtree.c +@@ -763,11 +763,15 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + if (((xmlDocPtr) cur)->intSubset != NULL) { + htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); + } +- /* Always validate cur->parent when descending. */ +- if ((cur->parent == parent) && (cur->children != NULL)) { +- parent = cur; +- cur = cur->children; +- continue; ++ if (cur->children != NULL) { ++ /* Always validate cur->parent when descending. */ ++ if (cur->parent == parent) { ++ parent = cur; ++ cur = cur->children; ++ continue; ++ } ++ } else { ++ xmlOutputBufferWriteString(buf, "\n"); + } + break; + +-- +1.8.3.1 + diff --git a/Patch-to-forbid-epsilon-reduction-of-final-states.patch b/Patch-to-forbid-epsilon-reduction-of-final-states.patch new file mode 100644 index 0000000..a9b1c0b --- /dev/null +++ b/Patch-to-forbid-epsilon-reduction-of-final-states.patch @@ -0,0 +1,60 @@ +From ec6e3efb06d7b15cf5a2328fabd3845acea4c815 Mon Sep 17 00:00:00 2001 +From: Arne Becker +Date: Tue, 6 Jul 2021 21:56:04 +0200 +Subject: [PATCH] Patch to forbid epsilon-reduction of final states + +When building the internal representation of a regexp, it is possible +that a lot of empty transitions are created. Therefore there is a step +to reduce them in the function xmlFAEliminateSimpleEpsilonTransitions. + +There is an error there for this case: + +* State 1 has a transition with an atom (in this case "a") to state 2. +* State 2 is final and has an epsilon transition to state 1. + +After reduction it looked like: +* State 1 has a transition with an atom (in this case "a") to itself + and is final. + +In other words, the empty string is accepted when it shouldn't be. + +The attached patch skips the reduction step for final states. +An alternative would be to insert or increment counters when reducing a +final state, but this seemed error prone and unnecessary, since there +aren't that many final states. + +Fixes #282 +--- + xmlregexp.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/xmlregexp.c b/xmlregexp.c +index 40dabb2..8d01c2b 100644 +--- a/xmlregexp.c ++++ b/xmlregexp.c +@@ -1892,6 +1892,12 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, + * then X and Y are semantically equivalent and X can be eliminated + * If X is the start state then make Y the start state, else replace the + * target of all transitions to X by transitions to Y. ++ * ++ * If X is a final state, skip it. ++ * Otherwise it would be necessary to manipulate counters for this case when ++ * eliminating state 2: ++ * State 1 has a transition with an atom to state 2. ++ * State 2 is final and has an epsilon transition to state 1. + */ + static void + xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { +@@ -1904,7 +1910,8 @@ xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { + continue; + if (state->nbTrans != 1) + continue; +- if (state->type == XML_REGEXP_UNREACH_STATE) ++ if (state->type == XML_REGEXP_UNREACH_STATE || ++ state->type == XML_REGEXP_FINAL_STATE) + continue; + /* is the only transition out a basic transition */ + if ((state->trans[0].atom == NULL) && +-- +1.8.3.1 + diff --git a/Work-around-lxml-API-abuse.patch b/Work-around-lxml-API-abuse.patch new file mode 100644 index 0000000..8bb91b4 --- /dev/null +++ b/Work-around-lxml-API-abuse.patch @@ -0,0 +1,212 @@ +From 85b1792e37b131e7a51af98a37f92472e8de5f3f Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 18 May 2021 20:08:28 +0200 +Subject: [PATCH] Work around lxml API abuse + +Make xmlNodeDumpOutput and htmlNodeDumpFormatOutput work with corrupted +parent pointers. This used to work with the old recursive code but the +non-recursive rewrite required parent pointers to be set correctly. + +Unfortunately, lxml relies on the old behavior and passes subtrees with +a corrupted structure. Fall back to a recursive function call if an +invalid parent pointer is detected. + +Fixes #255. +--- + HTMLtree.c | 46 ++++++++++++++++++++++++++++------------------ + xmlsave.c | 31 +++++++++++++++++++++---------- + 2 files changed, 49 insertions(+), 28 deletions(-) + +diff --git a/HTMLtree.c b/HTMLtree.c +index 24434d4..bdd639c 100644 +--- a/HTMLtree.c ++++ b/HTMLtree.c +@@ -744,7 +744,7 @@ void + htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, + int format) { +- xmlNodePtr root; ++ xmlNodePtr root, parent; + xmlAttrPtr attr; + const htmlElemDesc * info; + +@@ -755,6 +755,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + } + + root = cur; ++ parent = cur->parent; + while (1) { + switch (cur->type) { + case XML_HTML_DOCUMENT_NODE: +@@ -762,7 +763,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + if (((xmlDocPtr) cur)->intSubset != NULL) { + htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); + } +- if (cur->children != NULL) { ++ /* Always validate cur->parent when descending. */ ++ if ((cur->parent == parent) && (cur->children != NULL)) { ++ parent = cur; + cur = cur->children; + continue; + } +@@ -770,6 +773,16 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + + case XML_ELEMENT_NODE: + /* ++ * Some users like lxml are known to pass nodes with a corrupted ++ * tree structure. Fall back to a recursive call to handle this ++ * case. ++ */ ++ if ((cur->parent != parent) && (cur->children != NULL)) { ++ htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); ++ break; ++ } ++ ++ /* + * Get specific HTML info for that node. + */ + if (cur->ns == NULL) +@@ -817,6 +830,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + (cur->name != NULL) && + (cur->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); ++ parent = cur; + cur = cur->children; + continue; + } +@@ -825,9 +839,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + (info != NULL) && (!info->isinline)) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE) && +- (cur->parent != NULL) && +- (cur->parent->name != NULL) && +- (cur->parent->name[0] != 'p')) /* p, pre, param */ ++ (parent != NULL) && ++ (parent->name != NULL) && ++ (parent->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); + } + +@@ -842,9 +856,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + break; + if (((cur->name == (const xmlChar *)xmlStringText) || + (cur->name != (const xmlChar *)xmlStringTextNoenc)) && +- ((cur->parent == NULL) || +- ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && +- (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { ++ ((parent == NULL) || ++ ((xmlStrcasecmp(parent->name, BAD_CAST "script")) && ++ (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) { + xmlChar *buffer; + + buffer = xmlEncodeEntitiesReentrant(doc, cur->content); +@@ -902,13 +916,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + break; + } + +- /* +- * The parent should never be NULL here but we want to handle +- * corrupted documents gracefully. +- */ +- if (cur->parent == NULL) +- return; +- cur = cur->parent; ++ cur = parent; ++ /* cur->parent was validated when descending. */ ++ parent = cur->parent; + + if ((cur->type == XML_HTML_DOCUMENT_NODE) || + (cur->type == XML_DOCUMENT_NODE)) { +@@ -939,9 +949,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + (cur->next != NULL)) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE) && +- (cur->parent != NULL) && +- (cur->parent->name != NULL) && +- (cur->parent->name[0] != 'p')) /* p, pre, param */ ++ (parent != NULL) && ++ (parent->name != NULL) && ++ (parent->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); + } + } +diff --git a/xmlsave.c b/xmlsave.c +index 61a4045..aedbd5e 100644 +--- a/xmlsave.c ++++ b/xmlsave.c +@@ -847,7 +847,7 @@ htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + static void + xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + int format = ctxt->format; +- xmlNodePtr tmp, root, unformattedNode = NULL; ++ xmlNodePtr tmp, root, unformattedNode = NULL, parent; + xmlAttrPtr attr; + xmlChar *start, *end; + xmlOutputBufferPtr buf; +@@ -856,6 +856,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + buf = ctxt->buf; + + root = cur; ++ parent = cur->parent; + while (1) { + switch (cur->type) { + case XML_DOCUMENT_NODE: +@@ -868,7 +869,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + + case XML_DOCUMENT_FRAG_NODE: +- if (cur->children != NULL) { ++ /* Always validate cur->parent when descending. */ ++ if ((cur->parent == parent) && (cur->children != NULL)) { ++ parent = cur; + cur = cur->children; + continue; + } +@@ -887,7 +890,18 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + + case XML_ELEMENT_NODE: +- if ((cur != root) && (ctxt->format == 1) && (xmlIndentTreeOutput)) ++ /* ++ * Some users like lxml are known to pass nodes with a corrupted ++ * tree structure. Fall back to a recursive call to handle this ++ * case. ++ */ ++ if ((cur->parent != parent) && (cur->children != NULL)) { ++ xmlNodeDumpOutputInternal(ctxt, cur); ++ break; ++ } ++ ++ if ((ctxt->level > 0) && (ctxt->format == 1) && ++ (xmlIndentTreeOutput)) + xmlOutputBufferWrite(buf, ctxt->indent_size * + (ctxt->level > ctxt->indent_nr ? + ctxt->indent_nr : ctxt->level), +@@ -942,6 +956,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + xmlOutputBufferWrite(buf, 1, ">"); + if (ctxt->format == 1) xmlOutputBufferWrite(buf, 1, "\n"); + if (ctxt->level >= 0) ctxt->level++; ++ parent = cur; + cur = cur->children; + continue; + } +@@ -1058,13 +1073,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + } + +- /* +- * The parent should never be NULL here but we want to handle +- * corrupted documents gracefully. +- */ +- if (cur->parent == NULL) +- return; +- cur = cur->parent; ++ cur = parent; ++ /* cur->parent was validated when descending. */ ++ parent = cur->parent; + + if (cur->type == XML_ELEMENT_NODE) { + if (ctxt->level > 0) ctxt->level--; +-- +1.8.3.1 + diff --git a/libxml2.spec b/libxml2.spec index 10427a2..f4aa441 100644 --- a/libxml2.spec +++ b/libxml2.spec @@ -1,7 +1,7 @@ Summary: Library providing XML and HTML support Name: libxml2 Version: 2.9.12 -Release: 2 +Release: 3 License: MIT Group: Development/Libraries Source: ftp://xmlsoft.org/libxml2/libxml2-%{version}.tar.gz @@ -11,6 +11,11 @@ Patch1: Fix-XPath-recursion-limit.patch Patch2: Fix-Null-deref-in-xmlSchemaGetComponentTargetNs.patch Patch3: Fix-memleaks-in-xmlXIncludeProcessFlags.patch Patch4: Fix-heap-use-after-free-in-xmlAddNextSibling-and-xmlAddChild.patch +Patch5: Work-around-lxml-API-abuse.patch +Patch6: Fix-regression-in-xmlNodeDumpOutputInternal.patch +Patch7: Fix-whitespace-when-serializing-empty-HTML-documents.patch +Patch8: Patch-to-forbid-epsilon-reduction-of-final-states.patch +Patch9: Fix-buffering-in-xmlOutputBufferWrite.patch BuildRoot: %{_tmppath}/%{name}-%{version}-root BuildRequires: python3-devel @@ -171,6 +176,17 @@ rm -fr %{buildroot} %changelog +* Fri Nov 12 2021 panxiaohe - 2.9.12-3 +- Type:bugfix +- ID:NA +- SUG:NA +- DESC:add backport bug fixes. + work around lxml API abuse + fix regression in xmlNodeDumpOutputInternal + fix whitespace when serializing empty HTML documents + forbid epsilon-reduction of final states + fix buffering in xmlOutputBufferWrite + * Thu Nov 11 2021 panxiaohe - 2.9.12-2 - Type:bugfix - ID:NA