apache-commons-io/XmlStreamReader-can-t-parse-XML-document-with-multi-.patch

102 lines
4.9 KiB
Diff
Raw Normal View History

From 17f8b44d50372f4b540059232ed0ffa189eceb62 Mon Sep 17 00:00:00 2001
From: Gary Gregory <garydgregory@gmail.com>
Date: Tue, 2 Jan 2024 09:08:58 -0500
Subject: [PATCH] XmlStreamReader can't parse XML document with multi-line
prolog #550
- Apply PR #550, which was not merged on its own because it would have caused the build to fail.
- Implement fix
Origin:
https://github.com/apache/commons-io/commit/17f8b44d50372f4b540059232ed0ffa189eceb62
---
.../apache/commons/io/input/XmlStreamReader.java | 16 +++++++++++-----
.../commons/io/input/XmlStreamReaderTest.java | 10 ++++++++++
2 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/src/main/java/org/apache/commons/io/input/XmlStreamReader.java b/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
index 2b9b379..ff16987 100644
--- a/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
+++ b/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
@@ -214,6 +214,16 @@ public class XmlStreamReader extends Reader {
* <p>
* See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">XML specification</a>.
* </p>
+ * <p>
+ * Note the documented pattern is:
+ * </p>
+ * <pre>
+ * EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
+ * </pre>
+ * <p>
+ * However this does not match all the aliases that are supported by Java.
+ * For example, '437', 'ISO_8859-1:1987' and 'ebcdic-de-273+euro'.
+ * </p>
*/
public static final Pattern ENCODING_PATTERN = Pattern.compile(
// @formatter:off
@@ -223,10 +233,6 @@ public class XmlStreamReader extends Reader {
+ "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")" // double-quoted
+ "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
Pattern.MULTILINE);
- // N.B. the documented pattern is
- // EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
- // However this does not match all the aliases that are supported by Java.
- // e.g. '437', 'ISO_8859-1:1987' and 'ebcdic-de-273+euro'
// @formatter:on
private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
@@ -325,7 +331,7 @@ public class XmlStreamReader extends Reader {
inputStream.reset();
final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
final StringBuilder prolog = new StringBuilder();
- IOConsumer.forEach(bReader.lines(), prolog::append);
+ IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
final Matcher m = ENCODING_PATTERN.matcher(prolog);
if (m.find()) {
encoding = m.group(1).toUpperCase(Locale.ROOT);
diff --git a/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java b/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java
index 63d587a..de986c9 100644
--- a/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java
+++ b/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java
@@ -60,6 +60,8 @@ public class XmlStreamReaderTest {
private static final String UTF_32LE = "UTF-32LE";
private static final String UTF_32BE = "UTF-32BE";
private static final String UTF_8 = StandardCharsets.UTF_8.name();
+
+ private static final String XML6 = "xml-prolog-encoding-new-line";
private static final String XML5 = "xml-prolog-encoding-spaced-single-quotes";
private static final String XML4 = "xml-prolog-encoding-single-quotes";
private static final String XML3 = "xml-prolog-encoding-double-quotes";
@@ -102,6 +104,8 @@ public class XmlStreamReaderTest {
private static final MessageFormat XML_WITH_PROLOG = new MessageFormat(
"<?xml version=\"1.0\"?>\n<root>{2}</root>");
+ private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_NEW_LINES = new MessageFormat(
+ "<?xml\nversion\n=\n\"1.0\"\nencoding\n=\n\"{1}\"\n?>\n<root>{2}</root>");
private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES = new MessageFormat(
"<?xml version=\"1.0\" encoding=\"{1}\"?>\n<root>{2}</root>");
@@ -123,6 +127,7 @@ public class XmlStreamReaderTest {
XMLs.put(XML3, XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES);
XMLs.put(XML4, XML_WITH_PROLOG_AND_ENCODING_SINGLE_QUOTES);
XMLs.put(XML5, XML_WITH_PROLOG_AND_ENCODING_SPACED_SINGLE_QUOTES);
+ XMLs.put(XML6, XML_WITH_PROLOG_AND_ENCODING_NEW_LINES);
}
/**
@@ -624,5 +629,10 @@ public class XmlStreamReaderTest {
xmlReader = new XmlStreamReader(is);
assertEquals(xmlReader.getEncoding(), encoding);
xmlReader.close();
+
+ is = getXmlInputStream("no-bom", XML6, encoding, encoding);
+ xmlReader = new XmlStreamReader(is);
+ assertEquals(xmlReader.getEncoding(), encoding);
+ xmlReader.close();
}
}
--
2.47.0