426 lines
14 KiB
Diff
426 lines
14 KiB
Diff
From 6185344ed8617ff84a08764e808e5b3667c34a7a Mon Sep 17 00:00:00 2001
|
|
From: Nicholas Wilson <niwilson@microsoft.com>
|
|
Date: Wed, 6 Nov 2024 08:45:46 +0000
|
|
Subject: [PATCH] Improve error offsets for character classes (#548)
|
|
|
|
Conflict: alt_extended_class is not modified because fc38d9e784 is not merged;
|
|
class_op_state is not modified because the change that introduces it is not merged; context adapted
|
|
Reference: https://github.com/PCRE2Project/pcre2/commit/6185344ed8617ff84a08764e808e5b3667c34a7a
|
|
|
|
* Error offset should be advanced by one character for "[\d-z]"
|
|
invalid range error
|
|
|
|
The code does a 1-char lookahead for a hyphen, but then doesn't
|
|
advance the pointer to consume the hyphen when returning the error.
|
|
|
|
Perl's error message (with "use warnings") does advance to just
|
|
after the hyphen, so PCRE2 should match.
|
|
|
|
Fixes #545.
|
|
|
|
* Also improve error offsets for [[:bad:]], [[=...=]] and [z-\p{...}]
|
|
cases
|
|
---
|
|
src/pcre2_compile.c | 67 +++++++++++++++++++-------------------
|
|
testdata/testinput2 | 8 +++++
|
|
testdata/testinput5 | 8 +++++
|
|
testdata/testoutput2 | 76 +++++++++++++++++++++++++-------------------
|
|
testdata/testoutput5 | 14 +++++++-
|
|
5 files changed, 106 insertions(+), 67 deletions(-)
|
|
|
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
|
index 32db44db..290e759b 100644
|
|
--- a/src/pcre2_compile.c
|
|
+++ b/src/pcre2_compile.c
|
|
@@ -3563,6 +3563,7 @@ while (ptr < ptrend)
|
|
|
|
if (class_range_state == RANGE_STARTED)
|
|
{
|
|
+ ptr = tempptr + 2;
|
|
errorcode = ERR50;
|
|
goto FAILED;
|
|
}
|
|
@@ -3584,8 +3585,9 @@ while (ptr < ptrend)
|
|
|
|
if (*ptr != CHAR_COLON)
|
|
{
|
|
+ ptr = tempptr + 2;
|
|
errorcode = ERR13;
|
|
- goto FAILED_BACK;
|
|
+ goto FAILED;
|
|
}
|
|
|
|
if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
|
|
@@ -3595,19 +3597,18 @@ while (ptr < ptrend)
|
|
}
|
|
|
|
posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
|
|
+ ptr = tempptr + 2;
|
|
if (posix_class < 0)
|
|
{
|
|
errorcode = ERR30;
|
|
goto FAILED;
|
|
}
|
|
- ptr = tempptr + 2;
|
|
|
|
/* Set "a hyphen is forbidden to be the start of a range". For the '-]'
|
|
case, the hyphen is treated as a literal, but for '-1' it is disallowed
|
|
(because it would be interpreted as range). */
|
|
|
|
class_range_state = RANGE_FORBID_NO;
|
|
- class_range_forbid_ptr = ptr;
|
|
|
|
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
|
|
of the POSIX classes are converted to use Unicode properties \p or \P
|
|
@@ -3664,6 +3665,7 @@ while (ptr < ptrend)
|
|
{
|
|
*parsed_pattern++ = CHAR_MINUS;
|
|
class_range_state = RANGE_FORBID_STARTED;
|
|
+ class_range_forbid_ptr = ptr;
|
|
}
|
|
|
|
/* Handle a literal character */
|
|
@@ -3746,37 +3748,8 @@ while (ptr < ptrend)
|
|
errorcode = ERR7;
|
|
ptr--;
|
|
goto FAILED;
|
|
- }
|
|
|
|
- /* The second part of a range can be a single-character escape
|
|
- sequence (detected above), but not any of the other escapes. Perl
|
|
- treats a hyphen as a literal in such circumstances. However, in Perl's
|
|
- warning mode, a warning is given, so PCRE now faults it, as it is
|
|
- almost certainly a mistake on the user's part. */
|
|
-
|
|
- if (class_range_state == RANGE_STARTED)
|
|
- {
|
|
- errorcode = ERR50;
|
|
- goto FAILED;
|
|
- }
|
|
- /* Perl gives a warning unless the hyphen following a multi-character
|
|
- escape is the last character in the class. PCRE throws an error. */
|
|
- if (class_range_state == RANGE_FORBID_STARTED)
|
|
- {
|
|
- ptr = class_range_forbid_ptr;
|
|
- errorcode = ERR50;
|
|
- goto FAILED;
|
|
- }
|
|
-
|
|
- /* Of the remaining escapes, only those that define characters are
|
|
- allowed in a class. None may start a range. */
|
|
-
|
|
- class_range_state = RANGE_FORBID_NO;
|
|
- class_range_forbid_ptr = ptr;
|
|
-
|
|
- switch(escape)
|
|
- {
|
|
- case ESC_N:
|
|
+ case ESC_N: /* Not permitted by Perl either */
|
|
errorcode = ERR71;
|
|
goto FAILED;
|
|
|
|
@@ -3813,7 +3786,6 @@ while (ptr < ptrend)
|
|
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
|
|
*parsed_pattern++ = META_ESCAPE + escape;
|
|
*parsed_pattern++ = (ptype << 16) | pdata;
|
|
- class_range_forbid_ptr = ptr;
|
|
}
|
|
#else
|
|
errorcode = ERR45;
|
|
@@ -3826,6 +3798,33 @@ while (ptr < ptrend)
|
|
ptr--;
|
|
goto FAILED;
|
|
}
|
|
+
|
|
+ /* All the switch-cases above which end in "break" describe a set
|
|
+ of characters. None may start a range. */
|
|
+
|
|
+ /* The second part of a range can be a single-character escape
|
|
+ sequence (detected above), but not any of the other escapes. Perl
|
|
+ treats a hyphen as a literal in such circumstances. However, in Perl's
|
|
+ warning mode, a warning is given, so PCRE now faults it, as it is
|
|
+ almost certainly a mistake on the user's part. */
|
|
+
|
|
+ if (class_range_state == RANGE_STARTED)
|
|
+ {
|
|
+ errorcode = ERR50;
|
|
+ goto FAILED;
|
|
+ }
|
|
+
|
|
+ /* Perl gives a warning unless the hyphen following a multi-character
|
|
+ escape is the last character in the class. PCRE throws an error. */
|
|
+
|
|
+ if (class_range_state == RANGE_FORBID_STARTED)
|
|
+ {
|
|
+ ptr = class_range_forbid_ptr;
|
|
+ errorcode = ERR50;
|
|
+ goto FAILED;
|
|
+ }
|
|
+
|
|
+ class_range_state = RANGE_FORBID_NO;
|
|
}
|
|
|
|
/* Proceed to next thing in the class. */
|
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
|
index 61b94e69..1fbb778e 100644
|
|
--- a/testdata/testinput2
|
|
+++ b/testdata/testinput2
|
|
@@ -7008,4 +7008,12 @@ a)"xI
|
|
|
|
/[[:digit:]\Q\E-H]+/
|
|
|
|
+/[z-[:space:]]/
|
|
+
|
|
+/[z-\d]/
|
|
+
|
|
+/[[:space:]-z]/
|
|
+
|
|
+/[\d-z]/
|
|
+
|
|
# End of testinput2
|
|
diff --git a/testdata/testinput5 b/testdata/testinput5
|
|
index 494371b5..f3faeb8f 100644
|
|
--- a/testdata/testinput5
|
|
+++ b/testdata/testinput5
|
|
@@ -2458,4 +2458,12 @@
|
|
/abc/utf,substitute_extended,replace=>\777<
|
|
abc
|
|
|
|
+/[z-\p{Lu}]/
|
|
+
|
|
+/[z-\pL]/
|
|
+
|
|
+/[\p{Lu}-z]/
|
|
+
|
|
+/[\pL-z]/
|
|
+
|
|
# End of testinput5
|
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
|
index 86bfe964..99714596 100644
|
|
--- a/testdata/testoutput2
|
|
+++ b/testdata/testoutput2
|
|
@@ -2176,13 +2176,13 @@ Starting code units: % 0 1 A B C D E F G H I J K L M N O P Q R S T U V W
|
|
Subject length lower bound = 1
|
|
|
|
/[[.ch.]]/I
|
|
-Failed: error 113 at offset 1: POSIX collating elements are not supported
|
|
+Failed: error 113 at offset 7: POSIX collating elements are not supported
|
|
|
|
/[[=ch=]]/I
|
|
-Failed: error 113 at offset 1: POSIX collating elements are not supported
|
|
+Failed: error 113 at offset 7: POSIX collating elements are not supported
|
|
|
|
/[[:rhubarb:]]/I
|
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
|
+Failed: error 130 at offset 12: unknown POSIX class name
|
|
|
|
/[[:upper:]]/Ii
|
|
Capture group count = 0
|
|
@@ -8722,31 +8722,31 @@ Failed: error 162 at offset 4: subpattern name expected
|
|
Failed: error 162 at offset 4: subpattern name expected
|
|
|
|
/[[:foo:]]/
|
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
|
+Failed: error 130 at offset 8: unknown POSIX class name
|
|
|
|
/[[:1234:]]/
|
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
|
+Failed: error 130 at offset 9: unknown POSIX class name
|
|
|
|
/[[:f\oo:]]/
|
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
|
+Failed: error 130 at offset 9: unknown POSIX class name
|
|
|
|
/[[: :]]/
|
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
|
+Failed: error 130 at offset 6: unknown POSIX class name
|
|
|
|
/[[:...:]]/
|
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
|
+Failed: error 130 at offset 8: unknown POSIX class name
|
|
|
|
/[[:l\ower:]]/
|
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
|
+Failed: error 130 at offset 11: unknown POSIX class name
|
|
|
|
/[[:abc\:]]/
|
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
|
+Failed: error 130 at offset 9: unknown POSIX class name
|
|
|
|
/[abc[:x\]pqr:]]/
|
|
-Failed: error 130 at offset 6: unknown POSIX class name
|
|
+Failed: error 130 at offset 14: unknown POSIX class name
|
|
|
|
/[[:a\dz:]]/
|
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
|
+Failed: error 130 at offset 9: unknown POSIX class name
|
|
|
|
/(^(a|b\g<-1'c))/
|
|
Failed: error 157 at offset 8: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number
|
|
@@ -11409,7 +11409,7 @@ Failed: error 171 at offset 4: \N is not supported in a class
|
|
aNc
|
|
|
|
/a[B-\Nc]/
|
|
-Failed: error 150 at offset 6: invalid range in character class
|
|
+Failed: error 171 at offset 6: \N is not supported in a class
|
|
|
|
/a[B\Nc]/
|
|
Failed: error 171 at offset 5: \N is not supported in a class
|
|
@@ -13232,16 +13232,16 @@ Failed: error 178 at offset 5: digits missing in \x{} or \o{} or \N{U+}
|
|
------------------------------------------------------------------
|
|
|
|
/[a-[:digit:]]+/
|
|
-Failed: error 150 at offset 4: invalid range in character class
|
|
+Failed: error 150 at offset 12: invalid range in character class
|
|
|
|
/[A-[:digit:]]+/
|
|
-Failed: error 150 at offset 4: invalid range in character class
|
|
+Failed: error 150 at offset 12: invalid range in character class
|
|
|
|
/[a-[.xxx.]]+/
|
|
-Failed: error 150 at offset 4: invalid range in character class
|
|
+Failed: error 150 at offset 10: invalid range in character class
|
|
|
|
/[a-[=xxx=]]+/
|
|
-Failed: error 150 at offset 4: invalid range in character class
|
|
+Failed: error 150 at offset 10: invalid range in character class
|
|
|
|
/[a-[!xxx!]]+/
|
|
Failed: error 108 at offset 3: range out of order in character class
|
|
@@ -13362,7 +13362,7 @@ No match
|
|
No match
|
|
|
|
/[a[:<:]] should give error/
|
|
-Failed: error 130 at offset 4: unknown POSIX class name
|
|
+Failed: error 130 at offset 7: unknown POSIX class name
|
|
|
|
/(?=ab\K)/aftertext,allow_lookaround_bsk
|
|
abcd\=startchar
|
|
@@ -15510,11 +15510,11 @@ Failed: error 125 at offset 13: lookbehind assertion is not fixed length
|
|
# Perl accepts these, but gives a warning. We can't warn, so give an error.
|
|
|
|
/[a-[:digit:]]+/
|
|
-Failed: error 150 at offset 4: invalid range in character class
|
|
+Failed: error 150 at offset 12: invalid range in character class
|
|
a-a9-a
|
|
|
|
/[A-[:digit:]]+/
|
|
-Failed: error 150 at offset 4: invalid range in character class
|
|
+Failed: error 150 at offset 12: invalid range in character class
|
|
A-A9-A
|
|
|
|
/[a-\d]+/
|
|
@@ -15651,7 +15651,7 @@ Failed: error 128 at offset 63: assertion expected after (?( or (?(?C)
|
|
.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X
|
|
|
|
/[:[:alnum:]-[[a:lnum:]+/
|
|
-Failed: error 150 at offset 11: invalid range in character class
|
|
+Failed: error 150 at offset 12: invalid range in character class
|
|
|
|
/((?(?C'')\QX\E(?!((?(?C'')(?!X=X));=)r*X=X));=)/
|
|
Failed: error 128 at offset 11: assertion expected after (?( or (?(?C)
|
|
@@ -16285,10 +16285,10 @@ Subject length lower bound = 3
|
|
------------------------------------------------------------------
|
|
|
|
/[Q-\N]/B,bad_escape_is_literal
|
|
-Failed: error 150 at offset 5: invalid range in character class
|
|
+Failed: error 171 at offset 5: \N is not supported in a class
|
|
|
|
/[\s-_]/bad_escape_is_literal
|
|
-Failed: error 150 at offset 3: invalid range in character class
|
|
+Failed: error 150 at offset 4: invalid range in character class
|
|
|
|
/[_-\s]/bad_escape_is_literal
|
|
Failed: error 150 at offset 5: invalid range in character class
|
|
@@ -16443,19 +16443,19 @@ No match
|
|
No match
|
|
|
|
/[[:digit:]-a]/
|
|
-Failed: error 150 at offset 10: invalid range in character class
|
|
+Failed: error 150 at offset 11: invalid range in character class
|
|
|
|
/[[:digit:]-[:print:]]/
|
|
-Failed: error 150 at offset 10: invalid range in character class
|
|
+Failed: error 150 at offset 11: invalid range in character class
|
|
|
|
/[\d-a]/
|
|
-Failed: error 150 at offset 3: invalid range in character class
|
|
+Failed: error 150 at offset 4: invalid range in character class
|
|
|
|
/[\H-z]/
|
|
-Failed: error 150 at offset 3: invalid range in character class
|
|
+Failed: error 150 at offset 4: invalid range in character class
|
|
|
|
/[\d-[:print:]]/
|
|
-Failed: error 150 at offset 3: invalid range in character class
|
|
+Failed: error 150 at offset 4: invalid range in character class
|
|
|
|
# Perl gets the second of these wrong, giving no match.
|
|
|
|
@@ -17816,16 +17816,28 @@ Subject length lower bound = 2
|
|
0: a
|
|
|
|
/[[:digit:] -Z]/xx
|
|
-Failed: error 150 at offset 10: invalid range in character class
|
|
+Failed: error 150 at offset 14: invalid range in character class
|
|
|
|
/[\d -Z]/xx
|
|
-Failed: error 150 at offset 3: invalid range in character class
|
|
+Failed: error 150 at offset 7: invalid range in character class
|
|
|
|
/[[:digit:]\E-H]/
|
|
-Failed: error 150 at offset 10: invalid range in character class
|
|
+Failed: error 150 at offset 13: invalid range in character class
|
|
|
|
/[[:digit:]\Q\E-H]+/
|
|
-Failed: error 150 at offset 10: invalid range in character class
|
|
+Failed: error 150 at offset 15: invalid range in character class
|
|
+
|
|
+/[z-[:space:]]/
|
|
+Failed: error 150 at offset 12: invalid range in character class
|
|
+
|
|
+/[z-\d]/
|
|
+Failed: error 150 at offset 5: invalid range in character class
|
|
+
|
|
+/[[:space:]-z]/
|
|
+Failed: error 150 at offset 11: invalid range in character class
|
|
+
|
|
+/[\d-z]/
|
|
+Failed: error 150 at offset 4: invalid range in character class
|
|
|
|
# End of testinput2
|
|
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
|
diff --git a/testdata/testoutput5 b/testdata/testoutput5
|
|
index bf06ee12..0dba11c6 100644
|
|
--- a/testdata/testoutput5
|
|
+++ b/testdata/testoutput5
|
|
@@ -795,7 +795,7 @@ No match
|
|
No match
|
|
|
|
/[[:a\x{100}b:]]/utf
|
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
|
+Failed: error 130 at offset 14: unknown POSIX class name
|
|
|
|
/a[^]b/utf,allow_empty_class,match_unset_backref
|
|
a\x{1234}b
|
|
@@ -5403,4 +5403,16 @@ No match
|
|
abc
|
|
1: >\x{1ff}<
|
|
|
|
+/[z-\p{Lu}]/
|
|
+Failed: error 150 at offset 9: invalid range in character class
|
|
+
|
|
+/[z-\pL]/
|
|
+Failed: error 150 at offset 6: invalid range in character class
|
|
+
|
|
+/[\p{Lu}-z]/
|
|
+Failed: error 150 at offset 8: invalid range in character class
|
|
+
|
|
+/[\pL-z]/
|
|
+Failed: error 150 at offset 5: invalid range in character class
|
|
+
|
|
# End of testinput5
|
|
--
|
|
2.33.0
|
|
|