319 lines
9.3 KiB
Diff
319 lines
9.3 KiB
Diff
|
|
From 16d7edb56757e5294eeeecc9a19135aab89a50ba Mon Sep 17 00:00:00 2001
|
||
|
|
From: Nicholas Wilson <niwilson@microsoft.com>
|
||
|
|
Date: Fri, 1 Nov 2024 17:13:34 +0000
|
||
|
|
Subject: [PATCH] Fix the lookahead after [\d or [[:posix] to skip whitespace
|
||
|
|
(#544)
|
||
|
|
|
||
|
|
Conflict:don't modify alt_extended_class because fc38d9e784 is not merged;
|
||
|
|
don't modify class_op_state because class_op_state is not merged; adapt context
|
||
|
|
Reference:https://github.com/PCRE2Project/pcre2/commit/16d7edb56757e5294eeeecc9a19135aab89a50ba
|
||
|
|
|
||
|
|
---
|
||
|
|
src/pcre2_compile.c | 88 +++++++++++++++++++++++++++---------------
|
||
|
|
src/pcre2_intmodedep.h | 2 +-
|
||
|
|
testdata/testinput1 | 20 +++++++---
|
||
|
|
testdata/testinput2 | 8 ++++
|
||
|
|
testdata/testoutput1 | 30 ++++++++++----
|
||
|
|
testdata/testoutput2 | 12 ++++++
|
||
|
|
6 files changed, 113 insertions(+), 47 deletions(-)
|
||
|
|
|
||
|
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||
|
|
index 2493c871..9be26b07 100644
|
||
|
|
--- a/src/pcre2_compile.c
|
||
|
|
+++ b/src/pcre2_compile.c
|
||
|
|
@@ -2681,7 +2681,14 @@ the main compiling phase. */
|
||
|
|
/* States used for analyzing ranges in character classes. The two OK values
|
||
|
|
must be last. */
|
||
|
|
|
||
|
|
-enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };
|
||
|
|
+enum {
|
||
|
|
+ RANGE_NO, /* State after '[' (initial), or '[a-z'; hyphen is literal */
|
||
|
|
+ RANGE_STARTED, /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
|
||
|
|
+ RANGE_FORBID_NO, /* State after '[\d'; '-]' is allowed but not '-1]' */
|
||
|
|
+ RANGE_FORBID_STARTED, /* State after '[\d-' */
|
||
|
|
+ RANGE_OK_ESCAPED, /* State after '[\1'; hyphen may be a range */
|
||
|
|
+ RANGE_OK_LITERAL /* State after '[1'; hyphen may be a range */
|
||
|
|
+};
|
||
|
|
|
||
|
|
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
|
||
|
|
the storing of literal values in the main parsed pattern, where they can always
|
||
|
|
@@ -2734,6 +2741,7 @@ PCRE2_SPTR thisptr;
|
||
|
|
PCRE2_SPTR name;
|
||
|
|
PCRE2_SPTR ptrend = cb->end_pattern;
|
||
|
|
PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
|
||
|
|
+PCRE2_SPTR class_range_forbid_ptr = NULL;
|
||
|
|
named_group *ng;
|
||
|
|
nest_save *top_nest, *end_nests;
|
||
|
|
|
||
|
|
@@ -3559,6 +3567,21 @@ while (ptr < ptrend)
|
||
|
|
goto FAILED;
|
||
|
|
}
|
||
|
|
|
||
|
|
+ /* Perl treats a hyphen after a POSIX class as a literal, not the
|
||
|
|
+ start of a range. However, it gives a warning in its warning mode
|
||
|
|
+ unless the hyphen is the last character in the class. PCRE does not
|
||
|
|
+ have a warning mode, so we give an error, because this is likely an
|
||
|
|
+ error on the user's part.
|
||
|
|
+
|
||
|
|
+ Roll back to the hyphen for the error position. */
|
||
|
|
+
|
||
|
|
+ if (class_range_state == RANGE_FORBID_STARTED)
|
||
|
|
+ {
|
||
|
|
+ ptr = class_range_forbid_ptr;
|
||
|
|
+ errorcode = ERR50;
|
||
|
|
+ goto FAILED;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
if (*ptr != CHAR_COLON)
|
||
|
|
{
|
||
|
|
errorcode = ERR13;
|
||
|
|
@@ -3579,26 +3602,12 @@ while (ptr < ptrend)
|
||
|
|
}
|
||
|
|
ptr = tempptr + 2;
|
||
|
|
|
||
|
|
- /* Perl treats a hyphen after a POSIX class as a literal, not the
|
||
|
|
- start of a range. However, it gives a warning in its warning mode
|
||
|
|
- unless the hyphen is the last character in the class. PCRE does not
|
||
|
|
- have a warning mode, so we give an error, because this is likely an
|
||
|
|
- error on the user's part. */
|
||
|
|
-
|
||
|
|
- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
|
||
|
|
- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
|
||
|
|
- {
|
||
|
|
- errorcode = ERR50;
|
||
|
|
- goto FAILED;
|
||
|
|
- }
|
||
|
|
-
|
||
|
|
- /* Set "a hyphen is not the start of a range" for the -] case, and also
|
||
|
|
- in case the POSIX class is followed by \E or \Q\E (possibly repeated -
|
||
|
|
- fuzzers do that kind of thing) and *then* a hyphen. This causes that
|
||
|
|
- hyphen to be treated as a literal. I don't think it's worth setting up
|
||
|
|
- special apparatus to do otherwise. */
|
||
|
|
+ /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
|
||
|
|
+ case, the hyphen is treated as a literal, but for '-1' it is disallowed
|
||
|
|
+ (because it would be interpreted as range). */
|
||
|
|
|
||
|
|
- class_range_state = RANGE_NO;
|
||
|
|
+ class_range_state = RANGE_FORBID_NO;
|
||
|
|
+ class_range_forbid_ptr = ptr;
|
||
|
|
|
||
|
|
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
|
||
|
|
of the POSIX classes are converted to use Unicode properties \p or \P
|
||
|
|
@@ -3648,6 +3657,14 @@ while (ptr < ptrend)
|
||
|
|
class_range_state = RANGE_STARTED;
|
||
|
|
}
|
||
|
|
|
||
|
|
+ /* Handle forbidden start of range */
|
||
|
|
+
|
||
|
|
+ else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
|
||
|
|
+ {
|
||
|
|
+ *parsed_pattern++ = CHAR_MINUS;
|
||
|
|
+ class_range_state = RANGE_FORBID_STARTED;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
/* Handle a literal character */
|
||
|
|
|
||
|
|
else if (c != CHAR_BACKSLASH)
|
||
|
|
@@ -3670,6 +3687,12 @@ while (ptr < ptrend)
|
||
|
|
}
|
||
|
|
class_range_state = RANGE_NO;
|
||
|
|
}
|
||
|
|
+ else if (class_range_state == RANGE_FORBID_STARTED)
|
||
|
|
+ {
|
||
|
|
+ ptr = class_range_forbid_ptr;
|
||
|
|
+ errorcode = ERR50;
|
||
|
|
+ goto FAILED;
|
||
|
|
+ }
|
||
|
|
else /* Potential start of range */
|
||
|
|
{
|
||
|
|
class_range_state = char_is_literal?
|
||
|
|
@@ -3733,13 +3756,23 @@ while (ptr < ptrend)
|
||
|
|
if (class_range_state == RANGE_STARTED)
|
||
|
|
{
|
||
|
|
errorcode = ERR50;
|
||
|
|
- goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
|
||
|
|
+ goto FAILED;
|
||
|
|
+ }
|
||
|
|
+ /* Perl gives a warning unless the hyphen following a multi-character
|
||
|
|
+ escape is the last character in the class. PCRE throws an error. */
|
||
|
|
+ if (class_range_state == RANGE_FORBID_STARTED)
|
||
|
|
+ {
|
||
|
|
+ ptr = class_range_forbid_ptr;
|
||
|
|
+ errorcode = ERR50;
|
||
|
|
+ goto FAILED;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Of the remaining escapes, only those that define characters are
|
||
|
|
allowed in a class. None may start a range. */
|
||
|
|
|
||
|
|
- class_range_state = RANGE_NO;
|
||
|
|
+ class_range_state = RANGE_FORBID_NO;
|
||
|
|
+ class_range_forbid_ptr = ptr;
|
||
|
|
+
|
||
|
|
switch(escape)
|
||
|
|
{
|
||
|
|
case ESC_N:
|
||
|
|
@@ -3779,6 +3812,7 @@ while (ptr < ptrend)
|
||
|
|
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
|
||
|
|
*parsed_pattern++ = META_ESCAPE + escape;
|
||
|
|
*parsed_pattern++ = (ptype << 16) | pdata;
|
||
|
|
+ class_range_forbid_ptr = ptr;
|
||
|
|
}
|
||
|
|
#else
|
||
|
|
errorcode = ERR45;
|
||
|
|
@@ -3791,16 +3825,6 @@ while (ptr < ptrend)
|
||
|
|
ptr--;
|
||
|
|
goto FAILED;
|
||
|
|
}
|
||
|
|
-
|
||
|
|
- /* Perl gives a warning unless a following hyphen is the last character
|
||
|
|
- in the class. PCRE throws an error. */
|
||
|
|
-
|
||
|
|
- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
|
||
|
|
- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
|
||
|
|
- {
|
||
|
|
- errorcode = ERR50;
|
||
|
|
- goto FAILED;
|
||
|
|
- }
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Proceed to next thing in the class. */
|
||
|
|
diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h
|
||
|
|
index 598060c9..a11b4faa 100644
|
||
|
|
--- a/src/pcre2_intmodedep.h
|
||
|
|
+++ b/src/pcre2_intmodedep.h
|
||
|
|
@@ -435,7 +435,7 @@ UTF-16 mode. */
|
||
|
|
c = *eptr; \
|
||
|
|
if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
|
||
|
|
|
||
|
|
-/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the
|
||
|
|
+/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
|
||
|
|
pointer, incrementing length if there is a low surrogate. This is called when
|
||
|
|
we do not know if we are in UTF-16 mode. */
|
||
|
|
|
||
|
|
diff --git a/testdata/testinput1 b/testdata/testinput1
|
||
|
|
index 0794502e..1e50369f 100644
|
||
|
|
--- a/testdata/testinput1
|
||
|
|
+++ b/testdata/testinput1
|
||
|
|
@@ -5787,12 +5787,6 @@ ef) x/x,mark
|
||
|
|
|
||
|
|
/(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/
|
||
|
|
|
||
|
|
-/[s[:digit:]\E-H]+/
|
||
|
|
- s09-H
|
||
|
|
-
|
||
|
|
-/[s[:digit:]\Q\E-H]+/
|
||
|
|
- s09-H
|
||
|
|
-
|
||
|
|
/a+(?:|b)a/
|
||
|
|
aaaa
|
||
|
|
|
||
|
|
@@ -6435,4 +6429,18 @@ ef) x/x,mark
|
||
|
|
/(a\K.(?1)*)/
|
||
|
|
abac
|
||
|
|
|
||
|
|
+/[[:digit:]- ]/xx
|
||
|
|
+ 1
|
||
|
|
+ -
|
||
|
|
+\= Expect no match
|
||
|
|
+ z
|
||
|
|
+ \ \
|
||
|
|
+
|
||
|
|
+/[\d- ]/xx
|
||
|
|
+ 1
|
||
|
|
+ -
|
||
|
|
+\= Expect no match
|
||
|
|
+ z
|
||
|
|
+ \ \
|
||
|
|
+
|
||
|
|
# End of testinput1
|
||
|
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||
|
|
index b6464a0b..61b94e69 100644
|
||
|
|
--- a/testdata/testinput2
|
||
|
|
+++ b/testdata/testinput2
|
||
|
|
@@ -5981,4 +5981,12 @@ a)"xI
|
||
|
|
a
|
||
|
|
a\=noteol
|
||
|
|
|
||
|
|
+/[[:digit:] -Z]/xx
|
||
|
|
+
|
||
|
|
+/[\d -Z]/xx
|
||
|
|
+
|
||
|
|
+/[[:digit:]\E-H]/
|
||
|
|
+
|
||
|
|
+/[[:digit:]\Q\E-H]+/
|
||
|
|
+
|
||
|
|
# End of testinput2
|
||
|
|
diff --git a/testdata/testoutput1 b/testdata/testoutput1
|
||
|
|
index 8daf8362..6f927729 100644
|
||
|
|
--- a/testdata/testoutput1
|
||
|
|
+++ b/testdata/testoutput1
|
||
|
|
@@ -9246,14 +9246,6 @@ No match
|
||
|
|
|
||
|
|
/(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/
|
||
|
|
|
||
|
|
-/[s[:digit:]\E-H]+/
|
||
|
|
- s09-H
|
||
|
|
- 0: s09-H
|
||
|
|
-
|
||
|
|
-/[s[:digit:]\Q\E-H]+/
|
||
|
|
- s09-H
|
||
|
|
- 0: s09-H
|
||
|
|
-
|
||
|
|
/a+(?:|b)a/
|
||
|
|
aaaa
|
||
|
|
0: aaaa
|
||
|
|
@@ -10197,4 +10189,26 @@ No match
|
||
|
|
0: c
|
||
|
|
1: abac
|
||
|
|
|
||
|
|
+/[[:digit:]- ]/xx
|
||
|
|
+ 1
|
||
|
|
+ 0: 1
|
||
|
|
+ -
|
||
|
|
+ 0: -
|
||
|
|
+\= Expect no match
|
||
|
|
+ z
|
||
|
|
+No match
|
||
|
|
+ \ \
|
||
|
|
+No match
|
||
|
|
+
|
||
|
|
+/[\d- ]/xx
|
||
|
|
+ 1
|
||
|
|
+ 0: 1
|
||
|
|
+ -
|
||
|
|
+ 0: -
|
||
|
|
+\= Expect no match
|
||
|
|
+ z
|
||
|
|
+No match
|
||
|
|
+ \ \
|
||
|
|
+No match
|
||
|
|
+
|
||
|
|
# End of testinput1
|
||
|
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||
|
|
index 1075b4d4..86bfe964 100644
|
||
|
|
--- a/testdata/testoutput2
|
||
|
|
+++ b/testdata/testoutput2
|
||
|
|
@@ -17815,6 +17815,18 @@ Subject length lower bound = 2
|
||
|
|
a\=noteol
|
||
|
|
0: a
|
||
|
|
|
||
|
|
+/[[:digit:] -Z]/xx
|
||
|
|
+Failed: error 150 at offset 10: invalid range in character class
|
||
|
|
+
|
||
|
|
+/[\d -Z]/xx
|
||
|
|
+Failed: error 150 at offset 3: invalid range in character class
|
||
|
|
+
|
||
|
|
+/[[:digit:]\E-H]/
|
||
|
|
+Failed: error 150 at offset 10: invalid range in character class
|
||
|
|
+
|
||
|
|
+/[[:digit:]\Q\E-H]+/
|
||
|
|
+Failed: error 150 at offset 10: invalid range in character class
|
||
|
|
+
|
||
|
|
# End of testinput2
|
||
|
|
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
||
|
|
Error -62: bad serialized data
|
||
|
|
--
|
||
|
|
2.33.0
|
||
|
|
|