pcre2/backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch
2024-12-10 11:41:16 +08:00

319 lines
9.3 KiB
Diff

From 16d7edb56757e5294eeeecc9a19135aab89a50ba Mon Sep 17 00:00:00 2001
From: Nicholas Wilson <niwilson@microsoft.com>
Date: Fri, 1 Nov 2024 17:13:34 +0000
Subject: [PATCH] Fix the lookahead after [\d or [[:posix] to skip whitespace
(#544)
Conflict:don't modify alt_extended_class because fc38d9e784 is not merged;
don't modify class_op_state because class_op_state is not merged; adapt context
Reference:https://github.com/PCRE2Project/pcre2/commit/16d7edb56757e5294eeeecc9a19135aab89a50ba
---
src/pcre2_compile.c | 88 +++++++++++++++++++++++++++---------------
src/pcre2_intmodedep.h | 2 +-
testdata/testinput1 | 20 +++++++---
testdata/testinput2 | 8 ++++
testdata/testoutput1 | 30 ++++++++++----
testdata/testoutput2 | 12 ++++++
6 files changed, 113 insertions(+), 47 deletions(-)
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 2493c871..9be26b07 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -2681,7 +2681,14 @@ the main compiling phase. */
/* States used for analyzing ranges in character classes. The two OK values
must be last. */
-enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };
+enum {
+ RANGE_NO, /* State after '[' (initial), or '[a-z'; hyphen is literal */
+ RANGE_STARTED, /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
+ RANGE_FORBID_NO, /* State after '[\d'; '-]' is allowed but not '-1]' */
+ RANGE_FORBID_STARTED, /* State after '[\d-' */
+ RANGE_OK_ESCAPED, /* State after '[\1'; hyphen may be a range */
+ RANGE_OK_LITERAL /* State after '[1'; hyphen may be a range */
+};
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
@@ -2734,6 +2741,7 @@ PCRE2_SPTR thisptr;
PCRE2_SPTR name;
PCRE2_SPTR ptrend = cb->end_pattern;
PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
+PCRE2_SPTR class_range_forbid_ptr = NULL;
named_group *ng;
nest_save *top_nest, *end_nests;
@@ -3559,6 +3567,21 @@ while (ptr < ptrend)
goto FAILED;
}
+ /* Perl treats a hyphen after a POSIX class as a literal, not the
+ start of a range. However, it gives a warning in its warning mode
+ unless the hyphen is the last character in the class. PCRE does not
+ have a warning mode, so we give an error, because this is likely an
+ error on the user's part.
+
+ Roll back to the hyphen for the error position. */
+
+ if (class_range_state == RANGE_FORBID_STARTED)
+ {
+ ptr = class_range_forbid_ptr;
+ errorcode = ERR50;
+ goto FAILED;
+ }
+
if (*ptr != CHAR_COLON)
{
errorcode = ERR13;
@@ -3579,26 +3602,12 @@ while (ptr < ptrend)
}
ptr = tempptr + 2;
- /* Perl treats a hyphen after a POSIX class as a literal, not the
- start of a range. However, it gives a warning in its warning mode
- unless the hyphen is the last character in the class. PCRE does not
- have a warning mode, so we give an error, because this is likely an
- error on the user's part. */
-
- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
- {
- errorcode = ERR50;
- goto FAILED;
- }
-
- /* Set "a hyphen is not the start of a range" for the -] case, and also
- in case the POSIX class is followed by \E or \Q\E (possibly repeated -
- fuzzers do that kind of thing) and *then* a hyphen. This causes that
- hyphen to be treated as a literal. I don't think it's worth setting up
- special apparatus to do otherwise. */
+ /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
+ case, the hyphen is treated as a literal, but for '-1' it is disallowed
+ (because it would be interpreted as a range). */
- class_range_state = RANGE_NO;
+ class_range_state = RANGE_FORBID_NO;
+ class_range_forbid_ptr = ptr;
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
of the POSIX classes are converted to use Unicode properties \p or \P
@@ -3648,6 +3657,14 @@ while (ptr < ptrend)
class_range_state = RANGE_STARTED;
}
+ /* Handle forbidden start of range */
+
+ else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
+ {
+ *parsed_pattern++ = CHAR_MINUS;
+ class_range_state = RANGE_FORBID_STARTED;
+ }
+
/* Handle a literal character */
else if (c != CHAR_BACKSLASH)
@@ -3670,6 +3687,12 @@ while (ptr < ptrend)
}
class_range_state = RANGE_NO;
}
+ else if (class_range_state == RANGE_FORBID_STARTED)
+ {
+ ptr = class_range_forbid_ptr;
+ errorcode = ERR50;
+ goto FAILED;
+ }
else /* Potential start of range */
{
class_range_state = char_is_literal?
@@ -3733,13 +3756,23 @@ while (ptr < ptrend)
if (class_range_state == RANGE_STARTED)
{
errorcode = ERR50;
- goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
+ goto FAILED;
+ }
+ /* Perl gives a warning unless the hyphen following a multi-character
+ escape is the last character in the class. PCRE throws an error. */
+ if (class_range_state == RANGE_FORBID_STARTED)
+ {
+ ptr = class_range_forbid_ptr;
+ errorcode = ERR50;
+ goto FAILED;
}
/* Of the remaining escapes, only those that define characters are
allowed in a class. None may start a range. */
- class_range_state = RANGE_NO;
+ class_range_state = RANGE_FORBID_NO;
+ class_range_forbid_ptr = ptr;
+
switch(escape)
{
case ESC_N:
@@ -3779,6 +3812,7 @@ while (ptr < ptrend)
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
*parsed_pattern++ = META_ESCAPE + escape;
*parsed_pattern++ = (ptype << 16) | pdata;
+ class_range_forbid_ptr = ptr;
}
#else
errorcode = ERR45;
@@ -3791,16 +3825,6 @@ while (ptr < ptrend)
ptr--;
goto FAILED;
}
-
- /* Perl gives a warning unless a following hyphen is the last character
- in the class. PCRE throws an error. */
-
- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
- {
- errorcode = ERR50;
- goto FAILED;
- }
}
/* Proceed to next thing in the class. */
diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h
index 598060c9..a11b4faa 100644
--- a/src/pcre2_intmodedep.h
+++ b/src/pcre2_intmodedep.h
@@ -435,7 +435,7 @@ UTF-16 mode. */
c = *eptr; \
if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
-/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the
+/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode. */
diff --git a/testdata/testinput1 b/testdata/testinput1
index 0794502e..1e50369f 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -5787,12 +5787,6 @@ ef) x/x,mark
/(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/
-/[s[:digit:]\E-H]+/
- s09-H
-
-/[s[:digit:]\Q\E-H]+/
- s09-H
-
/a+(?:|b)a/
aaaa
@@ -6435,4 +6429,18 @@ ef) x/x,mark
/(a\K.(?1)*)/
abac
+/[[:digit:]- ]/xx
+ 1
+ -
+\= Expect no match
+ z
+ \ \
+
+/[\d- ]/xx
+ 1
+ -
+\= Expect no match
+ z
+ \ \
+
# End of testinput1
diff --git a/testdata/testinput2 b/testdata/testinput2
index b6464a0b..61b94e69 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5981,4 +5981,12 @@ a)"xI
a
a\=noteol
+/[[:digit:] -Z]/xx
+
+/[\d -Z]/xx
+
+/[[:digit:]\E-H]/
+
+/[[:digit:]\Q\E-H]+/
+
# End of testinput2
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 8daf8362..6f927729 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9246,14 +9246,6 @@ No match
/(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/
-/[s[:digit:]\E-H]+/
- s09-H
- 0: s09-H
-
-/[s[:digit:]\Q\E-H]+/
- s09-H
- 0: s09-H
-
/a+(?:|b)a/
aaaa
0: aaaa
@@ -10197,4 +10189,26 @@ No match
0: c
1: abac
+/[[:digit:]- ]/xx
+ 1
+ 0: 1
+ -
+ 0: -
+\= Expect no match
+ z
+No match
+ \ \
+No match
+
+/[\d- ]/xx
+ 1
+ 0: 1
+ -
+ 0: -
+\= Expect no match
+ z
+No match
+ \ \
+No match
+
# End of testinput1
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 1075b4d4..86bfe964 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -17815,6 +17815,18 @@ Subject length lower bound = 2
a\=noteol
0: a
+/[[:digit:] -Z]/xx
+Failed: error 150 at offset 10: invalid range in character class
+
+/[\d -Z]/xx
+Failed: error 150 at offset 3: invalid range in character class
+
+/[[:digit:]\E-H]/
+Failed: error 150 at offset 10: invalid range in character class
+
+/[[:digit:]\Q\E-H]+/
+Failed: error 150 at offset 10: invalid range in character class
+
# End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
--
2.33.0