From 6185344ed8617ff84a08764e808e5b3667c34a7a Mon Sep 17 00:00:00 2001 From: Nicholas Wilson Date: Wed, 6 Nov 2024 08:45:46 +0000 Subject: [PATCH] Improve error offsets for character classes (#548) Conflict:don't modify alt_extended_class because fc38d9e784 is not merged; don't modify class_op_state because class_op_state is not merged; adapt context Reference:https://github.com/PCRE2Project/pcre2/commit/6185344ed8617ff84a08764e808e5b3667c34a7a * Error offset should be advanced by one character for "[\d-z]" invalid range error The code does a 1-char lookahead for a hyphen, but then doesn't advance the pointer to consume the hyphen when returning the error. Perl's error message (with "use warnings") does advance to just after the hyphen, so PCRE2 should match. Fixes #545. * Also improve error offsets for [[:bad:]], [[=...=]] and [z-\p{...}] cases --- src/pcre2_compile.c | 67 +++++++++++++++++++------------------- testdata/testinput2 | 8 +++++ testdata/testinput5 | 8 +++++ testdata/testoutput2 | 76 +++++++++++++++++++++++++------------------- testdata/testoutput5 | 14 +++++++- 5 files changed, 106 insertions(+), 67 deletions(-) diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 32db44db..290e759b 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -3563,6 +3563,7 @@ while (ptr < ptrend) if (class_range_state == RANGE_STARTED) { + ptr = tempptr + 2; errorcode = ERR50; goto FAILED; } @@ -3584,8 +3585,9 @@ while (ptr < ptrend) if (*ptr != CHAR_COLON) { + ptr = tempptr + 2; errorcode = ERR13; - goto FAILED_BACK; + goto FAILED; } if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT) @@ -3595,19 +3597,18 @@ while (ptr < ptrend) } posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); + ptr = tempptr + 2; if (posix_class < 0) { errorcode = ERR30; goto FAILED; } - ptr = tempptr + 2; /* Set "a hyphen is forbidden to be the start of a range". For the '-]' case, the hyphen is treated as a literal, but for '-1' it is disallowed (because it would be interpreted as range). */ class_range_state = RANGE_FORBID_NO; - class_range_forbid_ptr = ptr; /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some of the POSIX classes are converted to use Unicode properties \p or \P @@ -3664,6 +3665,7 @@ while (ptr < ptrend) { *parsed_pattern++ = CHAR_MINUS; class_range_state = RANGE_FORBID_STARTED; + class_range_forbid_ptr = ptr; } /* Handle a literal character */ @@ -3746,37 +3748,8 @@ while (ptr < ptrend) errorcode = ERR7; ptr--; goto FAILED; - } - /* The second part of a range can be a single-character escape - sequence (detected above), but not any of the other escapes. Perl - treats a hyphen as a literal in such circumstances. However, in Perl's - warning mode, a warning is given, so PCRE now faults it, as it is - almost certainly a mistake on the user's part. */ - - if (class_range_state == RANGE_STARTED) - { - errorcode = ERR50; - goto FAILED; - } - /* Perl gives a warning unless the hyphen following a multi-character - escape is the last character in the class. PCRE throws an error. */ - if (class_range_state == RANGE_FORBID_STARTED) - { - ptr = class_range_forbid_ptr; - errorcode = ERR50; - goto FAILED; - } - - /* Of the remaining escapes, only those that define characters are - allowed in a class. None may start a range. */ - - class_range_state = RANGE_FORBID_NO; - class_range_forbid_ptr = ptr; - - switch(escape) - { - case ESC_N: + case ESC_N: /* Not permitted by Perl either */ errorcode = ERR71; goto FAILED; @@ -3813,7 +3786,6 @@ while (ptr < ptrend) if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; *parsed_pattern++ = META_ESCAPE + escape; *parsed_pattern++ = (ptype << 16) | pdata; - class_range_forbid_ptr = ptr; } #else errorcode = ERR45; @@ -3826,6 +3798,33 @@ while (ptr < ptrend) ptr--; goto FAILED; } + + /* All the switch-cases above which end in "break" describe a set + of characters. None may start a range. */ + + /* The second part of a range can be a single-character escape + sequence (detected above), but not any of the other escapes. Perl + treats a hyphen as a literal in such circumstances. However, in Perl's + warning mode, a warning is given, so PCRE now faults it, as it is + almost certainly a mistake on the user's part. */ + + if (class_range_state == RANGE_STARTED) + { + errorcode = ERR50; + goto FAILED; + } + + /* Perl gives a warning unless the hyphen following a multi-character + escape is the last character in the class. PCRE throws an error. */ + + if (class_range_state == RANGE_FORBID_STARTED) + { + ptr = class_range_forbid_ptr; + errorcode = ERR50; + goto FAILED; + } + + class_range_state = RANGE_FORBID_NO; } /* Proceed to next thing in the class. */ diff --git a/testdata/testinput2 b/testdata/testinput2 index 61b94e69..1fbb778e 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -7008,4 +7008,12 @@ a)"xI /[[:digit:]\Q\E-H]+/ +/[z-[:space:]]/ + +/[z-\d]/ + +/[[:space:]-z]/ + +/[\d-z]/ + # End of testinput2 diff --git a/testdata/testinput5 b/testdata/testinput5 index 494371b5..f3faeb8f 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -2458,4 +2458,12 @@ /abc/utf,substitute_extended,replace=>\777< abc +/[z-\p{Lu}]/ + +/[z-\pL]/ + +/[\p{Lu}-z]/ + +/[\pL-z]/ + # End of testinput5 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 86bfe964..99714596 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -2176,13 +2176,13 @@ Starting code units: % 0 1 A B C D E F G H I J K L M N O P Q R S T U V W Subject length lower bound = 1 /[[.ch.]]/I -Failed: error 113 at offset 1: POSIX collating elements are not supported +Failed: error 113 at offset 7: POSIX collating elements are not supported /[[=ch=]]/I -Failed: error 113 at offset 1: POSIX collating elements are not supported +Failed: error 113 at offset 7: POSIX collating elements are not supported /[[:rhubarb:]]/I -Failed: error 130 at offset 3: unknown POSIX class name +Failed: error 130 at offset 12: unknown POSIX class name /[[:upper:]]/Ii Capture group count = 0 @@ -8722,31 +8722,31 @@ Failed: error 162 at offset 4: subpattern name expected Failed: error 162 at offset 4: subpattern name expected /[[:foo:]]/ -Failed: error 130 at offset 3: unknown POSIX class name +Failed: error 130 at offset 8: unknown POSIX class name /[[:1234:]]/ -Failed: error 130 at offset 3: unknown POSIX class name +Failed: error 130 at offset 9: unknown POSIX class name /[[:f\oo:]]/ -Failed: error 130 at offset 3: unknown POSIX class name +Failed: error 130 at offset 9: unknown POSIX class name /[[: :]]/ -Failed: error 130 at offset 3: unknown POSIX class name +Failed: error 130 at offset 6: unknown POSIX class name /[[:...:]]/ -Failed: error 130 at offset 3: unknown POSIX class name +Failed: error 130 at offset 8: unknown POSIX class name /[[:l\ower:]]/ -Failed: error 130 at offset 3: unknown POSIX class name +Failed: error 130 at offset 11: unknown POSIX class name /[[:abc\:]]/ -Failed: error 130 at offset 3: unknown POSIX class name +Failed: error 130 at offset 9: unknown POSIX class name /[abc[:x\]pqr:]]/ -Failed: error 130 at offset 6: unknown POSIX class name +Failed: error 130 at offset 14: unknown POSIX class name /[[:a\dz:]]/ -Failed: error 130 at offset 3: unknown POSIX class name +Failed: error 130 at offset 9: unknown POSIX class name /(^(a|b\g<-1'c))/ Failed: error 157 at offset 8: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number @@ -11409,7 +11409,7 @@ Failed: error 171 at offset 4: \N is not supported in a class aNc /a[B-\Nc]/ -Failed: error 150 at offset 6: invalid range in character class +Failed: error 171 at offset 6: \N is not supported in a class /a[B\Nc]/ Failed: error 171 at offset 5: \N is not supported in a class @@ -13232,16 +13232,16 @@ Failed: error 178 at offset 5: digits missing in \x{} or \o{} or \N{U+} ------------------------------------------------------------------ /[a-[:digit:]]+/ -Failed: error 150 at offset 4: invalid range in character class +Failed: error 150 at offset 12: invalid range in character class /[A-[:digit:]]+/ -Failed: error 150 at offset 4: invalid range in character class +Failed: error 150 at offset 12: invalid range in character class /[a-[.xxx.]]+/ -Failed: error 150 at offset 4: invalid range in character class +Failed: error 150 at offset 10: invalid range in character class /[a-[=xxx=]]+/ -Failed: error 150 at offset 4: invalid range in character class +Failed: error 150 at offset 10: invalid range in character class /[a-[!xxx!]]+/ Failed: error 108 at offset 3: range out of order in character class @@ -13362,7 +13362,7 @@ No match No match /[a[:<:]] should give error/ -Failed: error 130 at offset 4: unknown POSIX class name +Failed: error 130 at offset 7: unknown POSIX class name /(?=ab\K)/aftertext,allow_lookaround_bsk abcd\=startchar @@ -15510,11 +15510,11 @@ Failed: error 125 at offset 13: lookbehind assertion is not fixed length # Perl accepts these, but gives a warning. We can't warn, so give an error. /[a-[:digit:]]+/ -Failed: error 150 at offset 4: invalid range in character class +Failed: error 150 at offset 12: invalid range in character class a-a9-a /[A-[:digit:]]+/ -Failed: error 150 at offset 4: invalid range in character class +Failed: error 150 at offset 12: invalid range in character class A-A9-A /[a-\d]+/ @@ -15651,7 +15651,7 @@ Failed: error 128 at offset 63: assertion expected after (?( or (?(?C) .+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X /[:[:alnum:]-[[a:lnum:]+/ -Failed: error 150 at offset 11: invalid range in character class +Failed: error 150 at offset 12: invalid range in character class /((?(?C'')\QX\E(?!((?(?C'')(?!X=X));=)r*X=X));=)/ Failed: error 128 at offset 11: assertion expected after (?( or (?(?C) @@ -16285,10 +16285,10 @@ Subject length lower bound = 3 ------------------------------------------------------------------ /[Q-\N]/B,bad_escape_is_literal -Failed: error 150 at offset 5: invalid range in character class +Failed: error 171 at offset 5: \N is not supported in a class /[\s-_]/bad_escape_is_literal -Failed: error 150 at offset 3: invalid range in character class +Failed: error 150 at offset 4: invalid range in character class /[_-\s]/bad_escape_is_literal Failed: error 150 at offset 5: invalid range in character class @@ -16443,19 +16443,19 @@ No match No match /[[:digit:]-a]/ -Failed: error 150 at offset 10: invalid range in character class +Failed: error 150 at offset 11: invalid range in character class /[[:digit:]-[:print:]]/ -Failed: error 150 at offset 10: invalid range in character class +Failed: error 150 at offset 11: invalid range in character class /[\d-a]/ -Failed: error 150 at offset 3: invalid range in character class +Failed: error 150 at offset 4: invalid range in character class /[\H-z]/ -Failed: error 150 at offset 3: invalid range in character class +Failed: error 150 at offset 4: invalid range in character class /[\d-[:print:]]/ -Failed: error 150 at offset 3: invalid range in character class +Failed: error 150 at offset 4: invalid range in character class # Perl gets the second of these wrong, giving no match. @@ -17816,16 +17816,28 @@ Subject length lower bound = 2 0: a /[[:digit:] -Z]/xx -Failed: error 150 at offset 10: invalid range in character class +Failed: error 150 at offset 14: invalid range in character class /[\d -Z]/xx -Failed: error 150 at offset 3: invalid range in character class +Failed: error 150 at offset 7: invalid range in character class /[[:digit:]\E-H]/ -Failed: error 150 at offset 10: invalid range in character class +Failed: error 150 at offset 13: invalid range in character class /[[:digit:]\Q\E-H]+/ -Failed: error 150 at offset 10: invalid range in character class +Failed: error 150 at offset 15: invalid range in character class + +/[z-[:space:]]/ +Failed: error 150 at offset 12: invalid range in character class + +/[z-\d]/ +Failed: error 150 at offset 5: invalid range in character class + +/[[:space:]-z]/ +Failed: error 150 at offset 11: invalid range in character class + +/[\d-z]/ +Failed: error 150 at offset 4: invalid range in character class # End of testinput2 Error -70: PCRE2_ERROR_BADDATA (unknown error number) diff --git a/testdata/testoutput5 b/testdata/testoutput5 index bf06ee12..0dba11c6 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -795,7 +795,7 @@ No match No match /[[:a\x{100}b:]]/utf -Failed: error 130 at offset 3: unknown POSIX class name +Failed: error 130 at offset 14: unknown POSIX class name /a[^]b/utf,allow_empty_class,match_unset_backref a\x{1234}b @@ -5403,4 +5403,16 @@ No match abc 1: >\x{1ff}< +/[z-\p{Lu}]/ +Failed: error 150 at offset 9: invalid range in character class + +/[z-\pL]/ +Failed: error 150 at offset 6: invalid range in character class + +/[\p{Lu}-z]/ +Failed: error 150 at offset 8: invalid range in character class + +/[\pL-z]/ +Failed: error 150 at offset 5: invalid range in character class + # End of testinput5 -- 2.33.0