102 lines
4.9 KiB
Diff
102 lines
4.9 KiB
Diff
|
|
From 17f8b44d50372f4b540059232ed0ffa189eceb62 Mon Sep 17 00:00:00 2001
|
||
|
|
From: Gary Gregory <garydgregory@gmail.com>
|
||
|
|
Date: Tue, 2 Jan 2024 09:08:58 -0500
|
||
|
|
Subject: [PATCH] XmlStreamReader can't parse XML document with multi-line
|
||
|
|
prolog #550
|
||
|
|
|
||
|
|
- Apply PR #550, which was not merged as-is because it would have caused the build to fail.
|
||
|
|
- Implement fix
|
||
|
|
|
||
|
|
Origin:
|
||
|
|
https://github.com/apache/commons-io/commit/17f8b44d50372f4b540059232ed0ffa189eceb62
|
||
|
|
---
|
||
|
|
.../apache/commons/io/input/XmlStreamReader.java | 16 +++++++++++-----
|
||
|
|
.../commons/io/input/XmlStreamReaderTest.java | 10 ++++++++++
|
||
|
|
2 files changed, 21 insertions(+), 5 deletions(-)
|
||
|
|
|
||
|
|
diff --git a/src/main/java/org/apache/commons/io/input/XmlStreamReader.java b/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
|
||
|
|
index 2b9b379..ff16987 100644
|
||
|
|
--- a/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
|
||
|
|
+++ b/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
|
||
|
|
@@ -214,6 +214,16 @@ public class XmlStreamReader extends Reader {
|
||
|
|
* <p>
|
||
|
|
* See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">XML specification</a>.
|
||
|
|
* </p>
|
||
|
|
+ * <p>
|
||
|
|
+ * Note that the documented pattern is:
|
||
|
|
+ * </p>
|
||
|
|
+ * <pre>
|
||
|
|
+ * EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
|
||
|
|
+ * </pre>
|
||
|
|
+ * <p>
|
||
|
|
+ * However, this does not match all the aliases that are supported by Java.
|
||
|
|
+ * For example, '437', 'ISO_8859-1:1987' and 'ebcdic-de-273+euro'.
|
||
|
|
+ * </p>
|
||
|
|
*/
|
||
|
|
public static final Pattern ENCODING_PATTERN = Pattern.compile(
|
||
|
|
// @formatter:off
|
||
|
|
@@ -223,10 +233,6 @@ public class XmlStreamReader extends Reader {
|
||
|
|
+ "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")" // double-quoted
|
||
|
|
+ "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
|
||
|
|
Pattern.MULTILINE);
|
||
|
|
- // N.B. the documented pattern is
|
||
|
|
- // EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
|
||
|
|
- // However this does not match all the aliases that are supported by Java.
|
||
|
|
- // e.g. '437', 'ISO_8859-1:1987' and 'ebcdic-de-273+euro'
|
||
|
|
// @formatter:on
|
||
|
|
|
||
|
|
private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
|
||
|
|
@@ -325,7 +331,7 @@ public class XmlStreamReader extends Reader {
|
||
|
|
inputStream.reset();
|
||
|
|
final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
|
||
|
|
final StringBuilder prolog = new StringBuilder();
|
||
|
|
- IOConsumer.forEach(bReader.lines(), prolog::append);
|
||
|
|
+ IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
|
||
|
|
final Matcher m = ENCODING_PATTERN.matcher(prolog);
|
||
|
|
if (m.find()) {
|
||
|
|
encoding = m.group(1).toUpperCase(Locale.ROOT);
|
||
|
|
diff --git a/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java b/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java
|
||
|
|
index 63d587a..de986c9 100644
|
||
|
|
--- a/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java
|
||
|
|
+++ b/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java
|
||
|
|
@@ -60,6 +60,8 @@ public class XmlStreamReaderTest {
|
||
|
|
private static final String UTF_32LE = "UTF-32LE";
|
||
|
|
private static final String UTF_32BE = "UTF-32BE";
|
||
|
|
private static final String UTF_8 = StandardCharsets.UTF_8.name();
|
||
|
|
+
|
||
|
|
+ private static final String XML6 = "xml-prolog-encoding-new-line";
|
||
|
|
private static final String XML5 = "xml-prolog-encoding-spaced-single-quotes";
|
||
|
|
private static final String XML4 = "xml-prolog-encoding-single-quotes";
|
||
|
|
private static final String XML3 = "xml-prolog-encoding-double-quotes";
|
||
|
|
@@ -102,6 +104,8 @@ public class XmlStreamReaderTest {
|
||
|
|
|
||
|
|
private static final MessageFormat XML_WITH_PROLOG = new MessageFormat(
|
||
|
|
"<?xml version=\"1.0\"?>\n<root>{2}</root>");
|
||
|
|
+ private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_NEW_LINES = new MessageFormat(
|
||
|
|
+ "<?xml\nversion\n=\n\"1.0\"\nencoding\n=\n\"{1}\"\n?>\n<root>{2}</root>");
|
||
|
|
|
||
|
|
private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES = new MessageFormat(
|
||
|
|
"<?xml version=\"1.0\" encoding=\"{1}\"?>\n<root>{2}</root>");
|
||
|
|
@@ -123,6 +127,7 @@ public class XmlStreamReaderTest {
|
||
|
|
XMLs.put(XML3, XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES);
|
||
|
|
XMLs.put(XML4, XML_WITH_PROLOG_AND_ENCODING_SINGLE_QUOTES);
|
||
|
|
XMLs.put(XML5, XML_WITH_PROLOG_AND_ENCODING_SPACED_SINGLE_QUOTES);
|
||
|
|
+ XMLs.put(XML6, XML_WITH_PROLOG_AND_ENCODING_NEW_LINES);
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
@@ -624,5 +629,10 @@ public class XmlStreamReaderTest {
|
||
|
|
xmlReader = new XmlStreamReader(is);
|
||
|
|
assertEquals(xmlReader.getEncoding(), encoding);
|
||
|
|
xmlReader.close();
|
||
|
|
+
|
||
|
|
+ is = getXmlInputStream("no-bom", XML6, encoding, encoding);
|
||
|
|
+ xmlReader = new XmlStreamReader(is);
|
||
|
|
+ assertEquals(xmlReader.getEncoding(), encoding);
|
||
|
|
+ xmlReader.close();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
--
|
||
|
|
2.47.0
|
||
|
|
|