224 lines
9.4 KiB
Diff
224 lines
9.4 KiB
Diff
From 6cc9677838ce4e68680f7877d71032ca6481ee56 Mon Sep 17 00:00:00 2001
|
|
From: Snild Dolkow <snild@sony.com>
|
|
Date: Thu, 17 Aug 2023 16:25:26 +0200
|
|
Subject: [PATCH] Skip parsing after repeated partials on the same token
|
|
|
|
Reference: https://github.com/libexpat/libexpat/pull/789/commits/9cdf9b8d77d5c2c2a27d15fb68dd3f83cafb45a1
|
|
Conflict: remove basic_test.c
|
|
change xmlparse.c
|
|
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=utf-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
When the parse buffer contains the starting bytes of a token but not
|
|
all of them, we cannot parse the token to completion. We call this a
|
|
partial token. When this happens, the parse position is reset to the
|
|
start of the token, and the parse() call returns. The client is then
|
|
expected to provide more data and call parse() again.
|
|
|
|
In extreme cases, this means that the bytes of a token may be parsed
|
|
many times: once for every buffer refill required before the full token
|
|
is present in the buffer.
|
|
|
|
Math:
|
|
Assume there's a token of T bytes
|
|
Assume the client fills the buffer in chunks of X bytes
|
|
We'll try to parse X, 2X, 3X, 4X ... until mX == T (technically >=)
|
|
That's (m²+m)X/2 = (T²/X+T)/2 bytes parsed (arithmetic progression)
|
|
While it is alleviated by larger refills, this amounts to O(T²)
|
|
|
|
Expat grows its internal buffer by doubling it when necessary, but has
|
|
no way to inform the client about how much space is available. Instead,
|
|
we add a heuristic that skips parsing when we've repeatedly stopped on
|
|
an incomplete token. Specifically:
|
|
|
|
* Only try to parse if we have a certain amount of data buffered
|
|
* Every time we stop on an incomplete token, double the threshold
|
|
* As soon as any token completes, the threshold is reset
|
|
|
|
This means that when we get stuck on an incomplete token, the threshold
|
|
grows exponentially, effectively making the client perform larger buffer
|
|
fills, limiting how many times we can end up re-parsing the same bytes.
|
|
|
|
Math:
|
|
Assume there's a token of T bytes
|
|
Assume the client fills the buffer in chunks of X bytes
|
|
We'll try to parse X, 2X, 4X, 8X ... until (2^k)X == T (or larger)
|
|
That's (2^(k+1)-1)X bytes parsed -- e.g. 15X if T = 8X
|
|
This is equal to 2T-X, which amounts to O(T)
|
|
|
|
We could've chosen a faster growth rate, e.g. 4 or 8. Those seem to
|
|
increase performance further, at the cost of further increasing the
|
|
risk of growing the buffer more than necessary. This can easily be
|
|
adjusted in the future, if desired.
|
|
|
|
This is all completely transparent to the client, except for:
|
|
1. possible delay of some callbacks (when our heuristic overshoots)
|
|
2. apps that never do isFinal=XML_TRUE could miss data at the end
|
|
|
|
For the affected testdata, this change shows a 100-400x speedup.
|
|
The recset.xml benchmark shows no clear change either way.
|
|
|
|
Before:
|
|
benchmark -n ../testdata/largefiles/recset.xml 65535 3
|
|
3 loops, with buffer size 65535. Average time per loop: 0.270223
|
|
benchmark -n ../testdata/largefiles/aaaaaa_attr.xml 4096 3
|
|
3 loops, with buffer size 4096. Average time per loop: 15.033048
|
|
benchmark -n ../testdata/largefiles/aaaaaa_cdata.xml 4096 3
|
|
3 loops, with buffer size 4096. Average time per loop: 0.018027
|
|
benchmark -n ../testdata/largefiles/aaaaaa_comment.xml 4096 3
|
|
3 loops, with buffer size 4096. Average time per loop: 11.775362
|
|
benchmark -n ../testdata/largefiles/aaaaaa_tag.xml 4096 3
|
|
3 loops, with buffer size 4096. Average time per loop: 11.711414
|
|
benchmark -n ../testdata/largefiles/aaaaaa_text.xml 4096 3
|
|
3 loops, with buffer size 4096. Average time per loop: 0.019362
|
|
|
|
After:
|
|
./run.sh benchmark -n ../testdata/largefiles/recset.xml 65535 3
|
|
3 loops, with buffer size 65535. Average time per loop: 0.269030
|
|
./run.sh benchmark -n ../testdata/largefiles/aaaaaa_attr.xml 4096 3
|
|
3 loops, with buffer size 4096. Average time per loop: 0.044794
|
|
./run.sh benchmark -n ../testdata/largefiles/aaaaaa_cdata.xml 4096 3
|
|
3 loops, with buffer size 4096. Average time per loop: 0.016377
|
|
./run.sh benchmark -n ../testdata/largefiles/aaaaaa_comment.xml 4096 3
|
|
3 loops, with buffer size 4096. Average time per loop: 0.027022
|
|
./run.sh benchmark -n ../testdata/largefiles/aaaaaa_tag.xml 4096 3
|
|
3 loops, with buffer size 4096. Average time per loop: 0.099360
|
|
./run.sh benchmark -n ../testdata/largefiles/aaaaaa_text.xml 4096 3
|
|
3 loops, with buffer size 4096. Average time per loop: 0.017956
|
|
---
|
|
lib/xmlparse.c | 58 +++++++++++++++++++++++++++++---------------
|
|
1 file changed, 39 insertions(+), 19 deletions(-)
|
|
|
|
diff --git a/lib/xmlparse.c b/lib/xmlparse.c
|
|
index 5ba56eae..32df1eb9 100644
|
|
--- a/lib/xmlparse.c
|
|
+++ b/lib/xmlparse.c
|
|
@@ -65,6 +65,7 @@
|
|
# endif
|
|
#endif
|
|
|
|
+#include <stdbool.h>
|
|
#include <stddef.h>
|
|
#include <string.h> /* memset(), memcpy() */
|
|
#include <assert.h>
|
|
@@ -613,6 +614,7 @@ struct XML_ParserStruct {
|
|
const char *m_bufferLim;
|
|
XML_Index m_parseEndByteIndex;
|
|
const char *m_parseEndPtr;
|
|
+ size_t m_partialTokenBytesBefore; /* used in heuristic to avoid O(n^2) */
|
|
XML_Char *m_dataBuf;
|
|
XML_Char *m_dataBufEnd;
|
|
XML_StartElementHandler m_startElementHandler;
|
|
@@ -944,6 +946,32 @@ get_hash_secret_salt(XML_Parser parser) {
|
|
return parser->m_hash_secret_salt;
|
|
}
|
|
|
|
+static enum XML_Error
|
|
+callProcessor(XML_Parser parser, const char *start, const char *end,
|
|
+ const char **endPtr) {
|
|
+ const size_t have_now = EXPAT_SAFE_PTR_DIFF(end, start);
|
|
+
|
|
+ if (! parser->m_parsingStatus.finalBuffer) {
|
|
+ // Heuristic: don't try to parse a partial token again until the amount of
|
|
+ // available data has increased significantly.
|
|
+ const size_t had_before = parser->m_partialTokenBytesBefore;
|
|
+ const bool enough = (have_now >= 2 * had_before);
|
|
+
|
|
+ if (! enough) {
|
|
+ *endPtr = start; // callers may expect this to be set
|
|
+ return XML_ERROR_NONE;
|
|
+ }
|
|
+ }
|
|
+ const enum XML_Error ret = parser->m_processor(parser, start, end, endPtr);
|
|
+ // if we consumed nothing, remember what we had on this parse attempt.
|
|
+ if (*endPtr == start) {
|
|
+ parser->m_partialTokenBytesBefore = have_now;
|
|
+ } else {
|
|
+ parser->m_partialTokenBytesBefore = 0;
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
static XML_Bool /* only valid for root parser */
|
|
startParsing(XML_Parser parser) {
|
|
/* hash functions must be initialized before setContext() is called */
|
|
@@ -1117,6 +1145,7 @@ parserInit(XML_Parser parser, const XML_Char *encodingName) {
|
|
parser->m_bufferEnd = parser->m_buffer;
|
|
parser->m_parseEndByteIndex = 0;
|
|
parser->m_parseEndPtr = NULL;
|
|
+ parser->m_partialTokenBytesBefore = 0;
|
|
parser->m_declElementType = NULL;
|
|
parser->m_declAttributeId = NULL;
|
|
parser->m_declEntity = NULL;
|
|
@@ -1849,29 +1878,20 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
|
|
to detect errors based on that fact.
|
|
*/
|
|
parser->m_errorCode
|
|
- = parser->m_processor(parser, parser->m_bufferPtr,
|
|
- parser->m_parseEndPtr, &parser->m_bufferPtr);
|
|
+ = callProcessor(parser, parser->m_bufferPtr, parser->m_parseEndPtr,
|
|
+ &parser->m_bufferPtr);
|
|
|
|
if (parser->m_errorCode == XML_ERROR_NONE) {
|
|
switch (parser->m_parsingStatus.parsing) {
|
|
case XML_SUSPENDED:
|
|
- /* It is hard to be certain, but it seems that this case
|
|
- * cannot occur. This code is cleaning up a previous parse
|
|
- * with no new data (since len == 0). Changing the parsing
|
|
- * state requires getting to execute a handler function, and
|
|
- * there doesn't seem to be an opportunity for that while in
|
|
- * this circumstance.
|
|
- *
|
|
- * Given the uncertainty, we retain the code but exclude it
|
|
- * from coverage tests.
|
|
- *
|
|
- * LCOV_EXCL_START
|
|
- */
|
|
+ /* While we added no new data, the finalBuffer flag may have caused
|
|
+ * us to parse previously-unparsed data in the internal buffer.
|
|
+ * If that triggered a callback to the application, it would have
|
|
+ * had an opportunity to suspend parsing. */
|
|
XmlUpdatePosition(parser->m_encoding, parser->m_positionPtr,
|
|
parser->m_bufferPtr, &parser->m_position);
|
|
parser->m_positionPtr = parser->m_bufferPtr;
|
|
return XML_STATUS_SUSPENDED;
|
|
- /* LCOV_EXCL_STOP */
|
|
case XML_INITIALIZED:
|
|
case XML_PARSING:
|
|
parser->m_parsingStatus.parsing = XML_FINISHED;
|
|
@@ -1901,7 +1921,7 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
|
|
parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
|
|
|
|
parser->m_errorCode
|
|
- = parser->m_processor(parser, s, parser->m_parseEndPtr = s + len, &end);
|
|
+ = callProcessor(parser, s, parser->m_parseEndPtr = s + len, &end);
|
|
|
|
if (parser->m_errorCode != XML_ERROR_NONE) {
|
|
parser->m_eventEndPtr = parser->m_eventPtr;
|
|
@@ -2004,8 +2024,8 @@ XML_ParseBuffer(XML_Parser parser, int len, int isFinal) {
|
|
parser->m_parseEndByteIndex += len;
|
|
parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
|
|
|
|
- parser->m_errorCode = parser->m_processor(
|
|
- parser, start, parser->m_parseEndPtr, &parser->m_bufferPtr);
|
|
+ parser->m_errorCode = callProcessor(parser, start, parser->m_parseEndPtr,
|
|
+ &parser->m_bufferPtr);
|
|
|
|
if (parser->m_errorCode != XML_ERROR_NONE) {
|
|
parser->m_eventEndPtr = parser->m_eventPtr;
|
|
@@ -2192,7 +2212,7 @@ XML_ResumeParser(XML_Parser parser) {
|
|
}
|
|
parser->m_parsingStatus.parsing = XML_PARSING;
|
|
|
|
- parser->m_errorCode = parser->m_processor(
|
|
+ parser->m_errorCode = callProcessor(
|
|
parser, parser->m_bufferPtr, parser->m_parseEndPtr, &parser->m_bufferPtr);
|
|
|
|
if (parser->m_errorCode != XML_ERROR_NONE) {
|
|
--
|
|
2.33.0
|
|
|
|
|