From 16d7edb56757e5294eeeecc9a19135aab89a50ba Mon Sep 17 00:00:00 2001 From: Nicholas Wilson Date: Fri, 1 Nov 2024 17:13:34 +0000 Subject: [PATCH] Fix the lookahead after [\d or [[:posix] to skip whitespace (#544) Conflict:don't modify alt_extended_class because fc38d9e784 is not merged; don't modify class_op_state because class_op_state is not merged; adapt context Reference:https://github.com/PCRE2Project/pcre2/commit/16d7edb56757e5294eeeecc9a19135aab89a50ba --- src/pcre2_compile.c | 88 +++++++++++++++++++++++++++--------------- src/pcre2_intmodedep.h | 2 +- testdata/testinput1 | 20 +++++++--- testdata/testinput2 | 8 ++++ testdata/testoutput1 | 30 ++++++++++---- testdata/testoutput2 | 12 ++++++ 6 files changed, 113 insertions(+), 47 deletions(-) diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 2493c871..9be26b07 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -2681,7 +2681,14 @@ the main compiling phase. */ /* States used for analyzing ranges in character classes. The two OK values must be last. */ -enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL }; +enum { + RANGE_NO, /* State after '[' (initial), or '[a-z'; hyphen is literal */ + RANGE_STARTED, /* State after '[1-'; last-emitted code is META_RANGE_XYZ */ + RANGE_FORBID_NO, /* State after '[\d'; '-]' is allowed but not '-1]' */ + RANGE_FORBID_STARTED, /* State after '[\d-' */ + RANGE_OK_ESCAPED, /* State after '[\1'; hyphen may be a range */ + RANGE_OK_LITERAL /* State after '[1'; hyphen may be a range */ +}; /* Only in 32-bit mode can there be literals > META_END. 
A macro encapsulates the storing of literal values in the main parsed pattern, where they can always @@ -2734,6 +2741,7 @@ PCRE2_SPTR thisptr; PCRE2_SPTR name; PCRE2_SPTR ptrend = cb->end_pattern; PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */ +PCRE2_SPTR class_range_forbid_ptr = NULL; named_group *ng; nest_save *top_nest, *end_nests; @@ -3559,6 +3567,21 @@ while (ptr < ptrend) goto FAILED; } + /* Perl treats a hyphen after a POSIX class as a literal, not the + start of a range. However, it gives a warning in its warning mode + unless the hyphen is the last character in the class. PCRE does not + have a warning mode, so we give an error, because this is likely an + error on the user's part. + + Roll back to the hyphen for the error position. */ + + if (class_range_state == RANGE_FORBID_STARTED) + { + ptr = class_range_forbid_ptr; + errorcode = ERR50; + goto FAILED; + } + if (*ptr != CHAR_COLON) { errorcode = ERR13; @@ -3579,26 +3602,12 @@ while (ptr < ptrend) } ptr = tempptr + 2; - /* Perl treats a hyphen after a POSIX class as a literal, not the - start of a range. However, it gives a warning in its warning mode - unless the hyphen is the last character in the class. PCRE does not - have a warning mode, so we give an error, because this is likely an - error on the user's part. */ - - if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && - ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) - { - errorcode = ERR50; - goto FAILED; - } - - /* Set "a hyphen is not the start of a range" for the -] case, and also - in case the POSIX class is followed by \E or \Q\E (possibly repeated - - fuzzers do that kind of thing) and *then* a hyphen. This causes that - hyphen to be treated as a literal. I don't think it's worth setting up - special apparatus to do otherwise. */ + /* Set "a hyphen is forbidden to be the start of a range". For the '-]' + case, the hyphen is treated as a literal, but for '-1' it is disallowed + (because it would be interpreted as range). 
*/ - class_range_state = RANGE_NO; + class_range_state = RANGE_FORBID_NO; + class_range_forbid_ptr = ptr; /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some of the POSIX classes are converted to use Unicode properties \p or \P @@ -3648,6 +3657,14 @@ while (ptr < ptrend) class_range_state = RANGE_STARTED; } + /* Handle forbidden start of range */ + + else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO) + { + *parsed_pattern++ = CHAR_MINUS; + class_range_state = RANGE_FORBID_STARTED; + } + /* Handle a literal character */ else if (c != CHAR_BACKSLASH) @@ -3670,6 +3687,12 @@ while (ptr < ptrend) } class_range_state = RANGE_NO; } + else if (class_range_state == RANGE_FORBID_STARTED) + { + ptr = class_range_forbid_ptr; + errorcode = ERR50; + goto FAILED; + } else /* Potential start of range */ { class_range_state = char_is_literal? @@ -3733,13 +3756,23 @@ while (ptr < ptrend) if (class_range_state == RANGE_STARTED) { errorcode = ERR50; - goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */ + goto FAILED; + } + /* Perl gives a warning unless the hyphen following a multi-character + escape is the last character in the class. PCRE throws an error. */ + if (class_range_state == RANGE_FORBID_STARTED) + { + ptr = class_range_forbid_ptr; + errorcode = ERR50; + goto FAILED; } /* Of the remaining escapes, only those that define characters are allowed in a class. None may start a range. */ - class_range_state = RANGE_NO; + class_range_state = RANGE_FORBID_NO; + class_range_forbid_ptr = ptr; + switch(escape) { case ESC_N: @@ -3779,6 +3812,7 @@ while (ptr < ptrend) if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; *parsed_pattern++ = META_ESCAPE + escape; *parsed_pattern++ = (ptype << 16) | pdata; + class_range_forbid_ptr = ptr; } #else errorcode = ERR45; @@ -3791,16 +3825,6 @@ while (ptr < ptrend) ptr--; goto FAILED; } - - /* Perl gives a warning unless a following hyphen is the last character - in the class. PCRE throws an error. 
*/ - - if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && - ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) - { - errorcode = ERR50; - goto FAILED; - } } /* Proceed to next thing in the class. */ diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index 598060c9..a11b4faa 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -435,7 +435,7 @@ UTF-16 mode. */ c = *eptr; \ if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len); -/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the +/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the pointer, incrementing length if there is a low surrogate. This is called when we do not know if we are in UTF-16 mode. */ diff --git a/testdata/testinput1 b/testdata/testinput1 index 0794502e..1e50369f 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -5787,12 +5787,6 @@ ef) x/x,mark /(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/ -/[s[:digit:]\E-H]+/ - s09-H - -/[s[:digit:]\Q\E-H]+/ - s09-H - /a+(?:|b)a/ aaaa @@ -6435,4 +6429,18 @@ ef) x/x,mark /(a\K.(?1)*)/ abac +/[[:digit:]- ]/xx + 1 + - +\= Expect no match + z + \ \ + +/[\d- ]/xx + 1 + - +\= Expect no match + z + \ \ + # End of testinput1 diff --git a/testdata/testinput2 b/testdata/testinput2 index b6464a0b..61b94e69 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -5981,4 +5981,12 @@ a)"xI a a\=noteol +/[[:digit:] -Z]/xx + +/[\d -Z]/xx + +/[[:digit:]\E-H]/ + +/[[:digit:]\Q\E-H]+/ + # End of testinput2 diff --git a/testdata/testoutput1 b/testdata/testoutput1 index 8daf8362..6f927729 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -9246,14 +9246,6 @@ No match /(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/ -/[s[:digit:]\E-H]+/ - s09-H - 0: s09-H - -/[s[:digit:]\Q\E-H]+/ - s09-H - 0: s09-H - /a+(?:|b)a/ aaaa 0: aaaa @@ -10197,4 +10189,26 @@ No match 0: c 1: abac +/[[:digit:]- ]/xx + 1 + 0: 1 + - + 0: - +\= Expect no match + z +No match + \ \ +No match + +/[\d- ]/xx + 1 + 0: 1 + - + 0: - +\= Expect no 
match + z +No match + \ \ +No match + # End of testinput1 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 1075b4d4..86bfe964 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -17815,6 +17815,18 @@ Subject length lower bound = 2 a\=noteol 0: a +/[[:digit:] -Z]/xx +Failed: error 150 at offset 10: invalid range in character class + +/[\d -Z]/xx +Failed: error 150 at offset 3: invalid range in character class + +/[[:digit:]\E-H]/ +Failed: error 150 at offset 10: invalid range in character class + +/[[:digit:]\Q\E-H]+/ +Failed: error 150 at offset 10: invalid range in character class + # End of testinput2 Error -70: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data -- 2.33.0