!126 sync patches from upstream

From: @hugel 
Reviewed-by: @dillon_chen 
Signed-off-by: @dillon_chen
This commit is contained in:
openeuler-ci-bot 2024-12-10 04:15:08 +00:00 committed by Gitee
commit 0ec5d05b58
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
6 changed files with 1590 additions and 1 deletions

View File

@ -0,0 +1,318 @@
From 16d7edb56757e5294eeeecc9a19135aab89a50ba Mon Sep 17 00:00:00 2001
From: Nicholas Wilson <niwilson@microsoft.com>
Date: Fri, 1 Nov 2024 17:13:34 +0000
Subject: [PATCH] Fix the lookahead after [\d or [[:posix] to skip whitespace
(#544)
Conflict:don't modify alt_extended_class because fc38d9e784 is not merged;
don't modify class_op_state because class_op_state is not merged; adapt context
Reference:https://github.com/PCRE2Project/pcre2/commit/16d7edb56757e5294eeeecc9a19135aab89a50ba
---
src/pcre2_compile.c | 88 +++++++++++++++++++++++++++---------------
src/pcre2_intmodedep.h | 2 +-
testdata/testinput1 | 20 +++++++---
testdata/testinput2 | 8 ++++
testdata/testoutput1 | 30 ++++++++++----
testdata/testoutput2 | 12 ++++++
6 files changed, 113 insertions(+), 47 deletions(-)
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 2493c871..9be26b07 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -2681,7 +2681,14 @@ the main compiling phase. */
/* States used for analyzing ranges in character classes. The two OK values
must be last. */
-enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };
+enum {
+ RANGE_NO, /* State after '[' (initial), or '[a-z'; hyphen is literal */
+ RANGE_STARTED, /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
+ RANGE_FORBID_NO, /* State after '[\d'; '-]' is allowed but not '-1]' */
+ RANGE_FORBID_STARTED, /* State after '[\d-'*/
+ RANGE_OK_ESCAPED, /* State after '[\1'; hyphen may be a range */
+ RANGE_OK_LITERAL /* State after '[1'; hyphen may be a range */
+};
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
@@ -2734,6 +2741,7 @@ PCRE2_SPTR thisptr;
PCRE2_SPTR name;
PCRE2_SPTR ptrend = cb->end_pattern;
PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
+PCRE2_SPTR class_range_forbid_ptr = NULL;
named_group *ng;
nest_save *top_nest, *end_nests;
@@ -3559,6 +3567,21 @@ while (ptr < ptrend)
goto FAILED;
}
+ /* Perl treats a hyphen after a POSIX class as a literal, not the
+ start of a range. However, it gives a warning in its warning mode
+ unless the hyphen is the last character in the class. PCRE does not
+ have a warning mode, so we give an error, because this is likely an
+ error on the user's part.
+
+ Roll back to the hyphen for the error position. */
+
+ if (class_range_state == RANGE_FORBID_STARTED)
+ {
+ ptr = class_range_forbid_ptr;
+ errorcode = ERR50;
+ goto FAILED;
+ }
+
if (*ptr != CHAR_COLON)
{
errorcode = ERR13;
@@ -3579,26 +3602,12 @@ while (ptr < ptrend)
}
ptr = tempptr + 2;
- /* Perl treats a hyphen after a POSIX class as a literal, not the
- start of a range. However, it gives a warning in its warning mode
- unless the hyphen is the last character in the class. PCRE does not
- have a warning mode, so we give an error, because this is likely an
- error on the user's part. */
-
- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
- {
- errorcode = ERR50;
- goto FAILED;
- }
-
- /* Set "a hyphen is not the start of a range" for the -] case, and also
- in case the POSIX class is followed by \E or \Q\E (possibly repeated -
- fuzzers do that kind of thing) and *then* a hyphen. This causes that
- hyphen to be treated as a literal. I don't think it's worth setting up
- special apparatus to do otherwise. */
+ /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
+ case, the hyphen is treated as a literal, but for '-1' it is disallowed
+ (because it would be interpreted as range). */
- class_range_state = RANGE_NO;
+ class_range_state = RANGE_FORBID_NO;
+ class_range_forbid_ptr = ptr;
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
of the POSIX classes are converted to use Unicode properties \p or \P
@@ -3648,6 +3657,14 @@ while (ptr < ptrend)
class_range_state = RANGE_STARTED;
}
+ /* Handle forbidden start of range */
+
+ else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
+ {
+ *parsed_pattern++ = CHAR_MINUS;
+ class_range_state = RANGE_FORBID_STARTED;
+ }
+
/* Handle a literal character */
else if (c != CHAR_BACKSLASH)
@@ -3670,6 +3687,12 @@ while (ptr < ptrend)
}
class_range_state = RANGE_NO;
}
+ else if (class_range_state == RANGE_FORBID_STARTED)
+ {
+ ptr = class_range_forbid_ptr;
+ errorcode = ERR50;
+ goto FAILED;
+ }
else /* Potential start of range */
{
class_range_state = char_is_literal?
@@ -3733,13 +3756,23 @@ while (ptr < ptrend)
if (class_range_state == RANGE_STARTED)
{
errorcode = ERR50;
- goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
+ goto FAILED;
+ }
+ /* Perl gives a warning unless the hyphen following a multi-character
+ escape is the last character in the class. PCRE throws an error. */
+ if (class_range_state == RANGE_FORBID_STARTED)
+ {
+ ptr = class_range_forbid_ptr;
+ errorcode = ERR50;
+ goto FAILED;
}
/* Of the remaining escapes, only those that define characters are
allowed in a class. None may start a range. */
- class_range_state = RANGE_NO;
+ class_range_state = RANGE_FORBID_NO;
+ class_range_forbid_ptr = ptr;
+
switch(escape)
{
case ESC_N:
@@ -3779,6 +3812,7 @@ while (ptr < ptrend)
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
*parsed_pattern++ = META_ESCAPE + escape;
*parsed_pattern++ = (ptype << 16) | pdata;
+ class_range_forbid_ptr = ptr;
}
#else
errorcode = ERR45;
@@ -3791,16 +3825,6 @@ while (ptr < ptrend)
ptr--;
goto FAILED;
}
-
- /* Perl gives a warning unless a following hyphen is the last character
- in the class. PCRE throws an error. */
-
- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
- {
- errorcode = ERR50;
- goto FAILED;
- }
}
/* Proceed to next thing in the class. */
diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h
index 598060c9..a11b4faa 100644
--- a/src/pcre2_intmodedep.h
+++ b/src/pcre2_intmodedep.h
@@ -435,7 +435,7 @@ UTF-16 mode. */
c = *eptr; \
if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
-/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the
+/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode. */
diff --git a/testdata/testinput1 b/testdata/testinput1
index 0794502e..1e50369f 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -5787,12 +5787,6 @@ ef) x/x,mark
/(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/
-/[s[:digit:]\E-H]+/
- s09-H
-
-/[s[:digit:]\Q\E-H]+/
- s09-H
-
/a+(?:|b)a/
aaaa
@@ -6435,4 +6429,18 @@ ef) x/x,mark
/(a\K.(?1)*)/
abac
+/[[:digit:]- ]/xx
+ 1
+ -
+\= Expect no match
+ z
+ \ \
+
+/[\d- ]/xx
+ 1
+ -
+\= Expect no match
+ z
+ \ \
+
# End of testinput1
diff --git a/testdata/testinput2 b/testdata/testinput2
index b6464a0b..61b94e69 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5981,4 +5981,12 @@ a)"xI
a
a\=noteol
+/[[:digit:] -Z]/xx
+
+/[\d -Z]/xx
+
+/[[:digit:]\E-H]/
+
+/[[:digit:]\Q\E-H]+/
+
# End of testinput2
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 8daf8362..6f927729 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9246,14 +9246,6 @@ No match
/(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/
-/[s[:digit:]\E-H]+/
- s09-H
- 0: s09-H
-
-/[s[:digit:]\Q\E-H]+/
- s09-H
- 0: s09-H
-
/a+(?:|b)a/
aaaa
0: aaaa
@@ -10197,4 +10189,26 @@ No match
0: c
1: abac
+/[[:digit:]- ]/xx
+ 1
+ 0: 1
+ -
+ 0: -
+\= Expect no match
+ z
+No match
+ \ \
+No match
+
+/[\d- ]/xx
+ 1
+ 0: 1
+ -
+ 0: -
+\= Expect no match
+ z
+No match
+ \ \
+No match
+
# End of testinput1
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 1075b4d4..86bfe964 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -17815,6 +17815,18 @@ Subject length lower bound = 2
a\=noteol
0: a
+/[[:digit:] -Z]/xx
+Failed: error 150 at offset 10: invalid range in character class
+
+/[\d -Z]/xx
+Failed: error 150 at offset 3: invalid range in character class
+
+/[[:digit:]\E-H]/
+Failed: error 150 at offset 10: invalid range in character class
+
+/[[:digit:]\Q\E-H]+/
+Failed: error 150 at offset 10: invalid range in character class
+
# End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
--
2.33.0

View File

@ -0,0 +1,104 @@
From fc56fd790c1a3ba8f2890fc2b6afba21250923de Mon Sep 17 00:00:00 2001
From: Philip Hazel <Philip.Hazel@gmail.com>
Date: Thu, 2 Feb 2023 17:19:45 +0000
Subject: [PATCH] Further ASCII tests and minor bugfix plus ChangeLog update
Conflict:don't modify ChangeLog
Reference:https://github.com/PCRE2Project/pcre2/commit/fc56fd790c1a3ba8f2890fc2b6afba21250923de
---
src/pcre2_compile.c | 5 ++---
testdata/testinput5 | 5 +++++
testdata/testinput7 | 5 +++++
testdata/testoutput5 | 7 +++++++
testdata/testoutput7 | 7 +++++++
5 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index b8a9e098..64a35bda 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -2660,10 +2660,9 @@ the main compiling phase. */
PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
PCRE2_UNGREEDY)
-#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT)
-
#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \
- PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW)
+ PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
+ PCRE2_EXTRA_ASCII_POSIX)
/* States used for analyzing ranges in character classes. The two OK values
must be last. */
diff --git a/testdata/testinput5 b/testdata/testinput5
index 6e186cf0..49b46f82 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -2434,6 +2434,11 @@
/(?aP)[[:alnum:]\d]+/i,ucp,utf
abc\x{660}xyz
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
+ \x{660}A\x{660}
+\= Expect no match
+ \x{660}\x{660}\x{660}
+
# VARIOUS
/[\d\s\w]+/a,ucp,utf
diff --git a/testdata/testinput7 b/testdata/testinput7
index 64a37ad2..a2b7fb8d 100644
--- a/testdata/testinput7
+++ b/testdata/testinput7
@@ -2453,6 +2453,11 @@
/(?aP)[[:alnum:]\d]+/i,ucp,utf
abc\x{660}xyz
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
+ \x{660}A\x{660}
+\= Expect no match
+ \x{660}\x{660}\x{660}
+
# VARIOUS
/[\d\s\w]+/a,ucp,utf
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 26972f70..4f845c84 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -5365,6 +5365,13 @@ No match
abc\x{660}xyz
0: abc\x{660}xyz
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
+ \x{660}A\x{660}
+ 0: \x{660}A\x{660}
+\= Expect no match
+ \x{660}\x{660}\x{660}
+No match
+
# VARIOUS
/[\d\s\w]+/a,ucp,utf
diff --git a/testdata/testoutput7 b/testdata/testoutput7
index c830748c..4065981d 100644
--- a/testdata/testoutput7
+++ b/testdata/testoutput7
@@ -4105,6 +4105,13 @@ No match
abc\x{660}xyz
0: abc\x{660}xyz
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
+ \x{660}A\x{660}
+ 0: \x{660}A\x{660}
+\= Expect no match
+ \x{660}\x{660}\x{660}
+No match
+
# VARIOUS
/[\d\s\w]+/a,ucp,utf
--
2.33.0

View File

@ -0,0 +1,425 @@
From 6185344ed8617ff84a08764e808e5b3667c34a7a Mon Sep 17 00:00:00 2001
From: Nicholas Wilson <niwilson@microsoft.com>
Date: Wed, 6 Nov 2024 08:45:46 +0000
Subject: [PATCH] Improve error offsets for character classes (#548)
Conflict:don't modify alt_extended_class because fc38d9e784 is not merged;
don't modify class_op_state because class_op_state is not merged; adapt context
Reference:https://github.com/PCRE2Project/pcre2/commit/6185344ed8617ff84a08764e808e5b3667c34a7a
* Error offset should be advanced by one character for "[\d-z]"
invalid range error
The code does a 1-char lookahead for a hyphen, but then doesn't
advance the pointer to consume the hyphen when returning the error.
Perl's error message (with "use warnings") does advance to just
after the hyphen, so PCRE2 should match.
Fixes #545.
* Also improve error offsets for [[:bad:]], [[=...=]] and [z-\p{...}]
cases
---
src/pcre2_compile.c | 67 +++++++++++++++++++-------------------
testdata/testinput2 | 8 +++++
testdata/testinput5 | 8 +++++
testdata/testoutput2 | 76 +++++++++++++++++++++++++-------------------
testdata/testoutput5 | 14 +++++++-
5 files changed, 106 insertions(+), 67 deletions(-)
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 32db44db..290e759b 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -3563,6 +3563,7 @@ while (ptr < ptrend)
if (class_range_state == RANGE_STARTED)
{
+ ptr = tempptr + 2;
errorcode = ERR50;
goto FAILED;
}
@@ -3584,8 +3585,9 @@ while (ptr < ptrend)
if (*ptr != CHAR_COLON)
{
+ ptr = tempptr + 2;
errorcode = ERR13;
- goto FAILED_BACK;
+ goto FAILED;
}
if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
@@ -3595,19 +3597,18 @@ while (ptr < ptrend)
}
posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
+ ptr = tempptr + 2;
if (posix_class < 0)
{
errorcode = ERR30;
goto FAILED;
}
- ptr = tempptr + 2;
/* Set "a hyphen is forbidden to be the start of a range". For the '-]'
case, the hyphen is treated as a literal, but for '-1' it is disallowed
(because it would be interpreted as range). */
class_range_state = RANGE_FORBID_NO;
- class_range_forbid_ptr = ptr;
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
of the POSIX classes are converted to use Unicode properties \p or \P
@@ -3664,6 +3665,7 @@ while (ptr < ptrend)
{
*parsed_pattern++ = CHAR_MINUS;
class_range_state = RANGE_FORBID_STARTED;
+ class_range_forbid_ptr = ptr;
}
/* Handle a literal character */
@@ -3746,37 +3748,8 @@ while (ptr < ptrend)
errorcode = ERR7;
ptr--;
goto FAILED;
- }
- /* The second part of a range can be a single-character escape
- sequence (detected above), but not any of the other escapes. Perl
- treats a hyphen as a literal in such circumstances. However, in Perl's
- warning mode, a warning is given, so PCRE now faults it, as it is
- almost certainly a mistake on the user's part. */
-
- if (class_range_state == RANGE_STARTED)
- {
- errorcode = ERR50;
- goto FAILED;
- }
- /* Perl gives a warning unless the hyphen following a multi-character
- escape is the last character in the class. PCRE throws an error. */
- if (class_range_state == RANGE_FORBID_STARTED)
- {
- ptr = class_range_forbid_ptr;
- errorcode = ERR50;
- goto FAILED;
- }
-
- /* Of the remaining escapes, only those that define characters are
- allowed in a class. None may start a range. */
-
- class_range_state = RANGE_FORBID_NO;
- class_range_forbid_ptr = ptr;
-
- switch(escape)
- {
- case ESC_N:
+ case ESC_N: /* Not permitted by Perl either */
errorcode = ERR71;
goto FAILED;
@@ -3813,7 +3786,6 @@ while (ptr < ptrend)
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
*parsed_pattern++ = META_ESCAPE + escape;
*parsed_pattern++ = (ptype << 16) | pdata;
- class_range_forbid_ptr = ptr;
}
#else
errorcode = ERR45;
@@ -3826,6 +3798,33 @@ while (ptr < ptrend)
ptr--;
goto FAILED;
}
+
+ /* All the switch-cases above which end in "break" describe a set
+ of characters. None may start a range. */
+
+ /* The second part of a range can be a single-character escape
+ sequence (detected above), but not any of the other escapes. Perl
+ treats a hyphen as a literal in such circumstances. However, in Perl's
+ warning mode, a warning is given, so PCRE now faults it, as it is
+ almost certainly a mistake on the user's part. */
+
+ if (class_range_state == RANGE_STARTED)
+ {
+ errorcode = ERR50;
+ goto FAILED;
+ }
+
+ /* Perl gives a warning unless the hyphen following a multi-character
+ escape is the last character in the class. PCRE throws an error. */
+
+ if (class_range_state == RANGE_FORBID_STARTED)
+ {
+ ptr = class_range_forbid_ptr;
+ errorcode = ERR50;
+ goto FAILED;
+ }
+
+ class_range_state = RANGE_FORBID_NO;
}
/* Proceed to next thing in the class. */
diff --git a/testdata/testinput2 b/testdata/testinput2
index 61b94e69..1fbb778e 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -7008,4 +7008,12 @@ a)"xI
/[[:digit:]\Q\E-H]+/
+/[z-[:space:]]/
+
+/[z-\d]/
+
+/[[:space:]-z]/
+
+/[\d-z]/
+
# End of testinput2
diff --git a/testdata/testinput5 b/testdata/testinput5
index 494371b5..f3faeb8f 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -2458,4 +2458,12 @@
/abc/utf,substitute_extended,replace=>\777<
abc
+/[z-\p{Lu}]/
+
+/[z-\pL]/
+
+/[\p{Lu}-z]/
+
+/[\pL-z]/
+
# End of testinput5
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 86bfe964..99714596 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -2176,13 +2176,13 @@ Starting code units: % 0 1 A B C D E F G H I J K L M N O P Q R S T U V W
Subject length lower bound = 1
/[[.ch.]]/I
-Failed: error 113 at offset 1: POSIX collating elements are not supported
+Failed: error 113 at offset 7: POSIX collating elements are not supported
/[[=ch=]]/I
-Failed: error 113 at offset 1: POSIX collating elements are not supported
+Failed: error 113 at offset 7: POSIX collating elements are not supported
/[[:rhubarb:]]/I
-Failed: error 130 at offset 3: unknown POSIX class name
+Failed: error 130 at offset 12: unknown POSIX class name
/[[:upper:]]/Ii
Capture group count = 0
@@ -8722,31 +8722,31 @@ Failed: error 162 at offset 4: subpattern name expected
Failed: error 162 at offset 4: subpattern name expected
/[[:foo:]]/
-Failed: error 130 at offset 3: unknown POSIX class name
+Failed: error 130 at offset 8: unknown POSIX class name
/[[:1234:]]/
-Failed: error 130 at offset 3: unknown POSIX class name
+Failed: error 130 at offset 9: unknown POSIX class name
/[[:f\oo:]]/
-Failed: error 130 at offset 3: unknown POSIX class name
+Failed: error 130 at offset 9: unknown POSIX class name
/[[: :]]/
-Failed: error 130 at offset 3: unknown POSIX class name
+Failed: error 130 at offset 6: unknown POSIX class name
/[[:...:]]/
-Failed: error 130 at offset 3: unknown POSIX class name
+Failed: error 130 at offset 8: unknown POSIX class name
/[[:l\ower:]]/
-Failed: error 130 at offset 3: unknown POSIX class name
+Failed: error 130 at offset 11: unknown POSIX class name
/[[:abc\:]]/
-Failed: error 130 at offset 3: unknown POSIX class name
+Failed: error 130 at offset 9: unknown POSIX class name
/[abc[:x\]pqr:]]/
-Failed: error 130 at offset 6: unknown POSIX class name
+Failed: error 130 at offset 14: unknown POSIX class name
/[[:a\dz:]]/
-Failed: error 130 at offset 3: unknown POSIX class name
+Failed: error 130 at offset 9: unknown POSIX class name
/(^(a|b\g<-1'c))/
Failed: error 157 at offset 8: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number
@@ -11409,7 +11409,7 @@ Failed: error 171 at offset 4: \N is not supported in a class
aNc
/a[B-\Nc]/
-Failed: error 150 at offset 6: invalid range in character class
+Failed: error 171 at offset 6: \N is not supported in a class
/a[B\Nc]/
Failed: error 171 at offset 5: \N is not supported in a class
@@ -13232,16 +13232,16 @@ Failed: error 178 at offset 5: digits missing in \x{} or \o{} or \N{U+}
------------------------------------------------------------------
/[a-[:digit:]]+/
-Failed: error 150 at offset 4: invalid range in character class
+Failed: error 150 at offset 12: invalid range in character class
/[A-[:digit:]]+/
-Failed: error 150 at offset 4: invalid range in character class
+Failed: error 150 at offset 12: invalid range in character class
/[a-[.xxx.]]+/
-Failed: error 150 at offset 4: invalid range in character class
+Failed: error 150 at offset 10: invalid range in character class
/[a-[=xxx=]]+/
-Failed: error 150 at offset 4: invalid range in character class
+Failed: error 150 at offset 10: invalid range in character class
/[a-[!xxx!]]+/
Failed: error 108 at offset 3: range out of order in character class
@@ -13362,7 +13362,7 @@ No match
No match
/[a[:<:]] should give error/
-Failed: error 130 at offset 4: unknown POSIX class name
+Failed: error 130 at offset 7: unknown POSIX class name
/(?=ab\K)/aftertext,allow_lookaround_bsk
abcd\=startchar
@@ -15510,11 +15510,11 @@ Failed: error 125 at offset 13: lookbehind assertion is not fixed length
# Perl accepts these, but gives a warning. We can't warn, so give an error.
/[a-[:digit:]]+/
-Failed: error 150 at offset 4: invalid range in character class
+Failed: error 150 at offset 12: invalid range in character class
a-a9-a
/[A-[:digit:]]+/
-Failed: error 150 at offset 4: invalid range in character class
+Failed: error 150 at offset 12: invalid range in character class
A-A9-A
/[a-\d]+/
@@ -15651,7 +15651,7 @@ Failed: error 128 at offset 63: assertion expected after (?( or (?(?C)
.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X
/[:[:alnum:]-[[a:lnum:]+/
-Failed: error 150 at offset 11: invalid range in character class
+Failed: error 150 at offset 12: invalid range in character class
/((?(?C'')\QX\E(?!((?(?C'')(?!X=X));=)r*X=X));=)/
Failed: error 128 at offset 11: assertion expected after (?( or (?(?C)
@@ -16285,10 +16285,10 @@ Subject length lower bound = 3
------------------------------------------------------------------
/[Q-\N]/B,bad_escape_is_literal
-Failed: error 150 at offset 5: invalid range in character class
+Failed: error 171 at offset 5: \N is not supported in a class
/[\s-_]/bad_escape_is_literal
-Failed: error 150 at offset 3: invalid range in character class
+Failed: error 150 at offset 4: invalid range in character class
/[_-\s]/bad_escape_is_literal
Failed: error 150 at offset 5: invalid range in character class
@@ -16443,19 +16443,19 @@ No match
No match
/[[:digit:]-a]/
-Failed: error 150 at offset 10: invalid range in character class
+Failed: error 150 at offset 11: invalid range in character class
/[[:digit:]-[:print:]]/
-Failed: error 150 at offset 10: invalid range in character class
+Failed: error 150 at offset 11: invalid range in character class
/[\d-a]/
-Failed: error 150 at offset 3: invalid range in character class
+Failed: error 150 at offset 4: invalid range in character class
/[\H-z]/
-Failed: error 150 at offset 3: invalid range in character class
+Failed: error 150 at offset 4: invalid range in character class
/[\d-[:print:]]/
-Failed: error 150 at offset 3: invalid range in character class
+Failed: error 150 at offset 4: invalid range in character class
# Perl gets the second of these wrong, giving no match.
@@ -17816,16 +17816,28 @@ Subject length lower bound = 2
0: a
/[[:digit:] -Z]/xx
-Failed: error 150 at offset 10: invalid range in character class
+Failed: error 150 at offset 14: invalid range in character class
/[\d -Z]/xx
-Failed: error 150 at offset 3: invalid range in character class
+Failed: error 150 at offset 7: invalid range in character class
/[[:digit:]\E-H]/
-Failed: error 150 at offset 10: invalid range in character class
+Failed: error 150 at offset 13: invalid range in character class
/[[:digit:]\Q\E-H]+/
-Failed: error 150 at offset 10: invalid range in character class
+Failed: error 150 at offset 15: invalid range in character class
+
+/[z-[:space:]]/
+Failed: error 150 at offset 12: invalid range in character class
+
+/[z-\d]/
+Failed: error 150 at offset 5: invalid range in character class
+
+/[[:space:]-z]/
+Failed: error 150 at offset 11: invalid range in character class
+
+/[\d-z]/
+Failed: error 150 at offset 4: invalid range in character class
# End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index bf06ee12..0dba11c6 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -795,7 +795,7 @@ No match
No match
/[[:a\x{100}b:]]/utf
-Failed: error 130 at offset 3: unknown POSIX class name
+Failed: error 130 at offset 14: unknown POSIX class name
/a[^]b/utf,allow_empty_class,match_unset_backref
a\x{1234}b
@@ -5403,4 +5403,16 @@ No match
abc
1: >\x{1ff}<
+/[z-\p{Lu}]/
+Failed: error 150 at offset 9: invalid range in character class
+
+/[z-\pL]/
+Failed: error 150 at offset 6: invalid range in character class
+
+/[\p{Lu}-z]/
+Failed: error 150 at offset 8: invalid range in character class
+
+/[\pL-z]/
+Failed: error 150 at offset 5: invalid range in character class
+
# End of testinput5
--
2.33.0

View File

@ -0,0 +1,459 @@
From 6f2da25f009ff463cd9357ae5ebe452fbec8ab5c Mon Sep 17 00:00:00 2001
From: Zoltan Herczeg <zherczeg7@gmail.com>
Date: Fri, 15 Nov 2024 13:21:03 +0100
Subject: [PATCH] Non-recursive scan prefix in JIT (#560)
Conflict:NA
Reference:https://github.com/PCRE2Project/pcre2/commit/6f2da25f009ff463cd9357ae5ebe452fbec8ab5c
---
src/pcre2_jit_compile.c | 238 ++++++++++++++++++++++++++++------------
src/pcre2_jit_test.c | 1 +
2 files changed, 168 insertions(+), 71 deletions(-)
diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c
index 127c393d..4449d59f 100644
--- a/src/pcre2_jit_compile.c
+++ b/src/pcre2_jit_compile.c
@@ -5670,11 +5670,38 @@ if (last)
chars->last_count++;
}
-static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars, int max_chars, sljit_u32 *rec_count)
+/* Value can be increased if needed. Patterns
+such as /(a|){33}b/ can exhaust the stack.
+
+Note: /(a|){29}b/ already stops scan_prefix()
+because it reaches the maximum step_count. */
+#define SCAN_PREFIX_STACK_END 32
+
+/*
+Scan prefix stores the prefix string in the chars array.
+Each element of the chars array is either a small character
+set or "any" (count is set to 255).
+
+Examples (the chars array is represented by a simple regex):
+
+/(abc|xbyd)/ prefix: /[ax]b[cy]/ (length: 3)
+/a[a-z]b+c/ prefix: a.b (length: 3)
+/ab?cd/ prefix: a[bc][cd] (length: 3)
+/(ab|cd)|(ef|gh)/ prefix: [aceg][bdfh] (length: 2)
+
+The length is returned by scan_prefix(). The length is
+less than or equal to the minimum length of the pattern.
+*/
+
+static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars)
{
-/* Recursive function, which scans prefix literals. */
+fast_forward_char_data *chars_start = chars;
+fast_forward_char_data *chars_end = chars + MAX_N_CHARS;
+PCRE2_SPTR cc_stack[SCAN_PREFIX_STACK_END];
+fast_forward_char_data *chars_stack[SCAN_PREFIX_STACK_END];
+sljit_u8 next_alternative_stack[SCAN_PREFIX_STACK_END];
BOOL last, any, class, caseless;
-int len, repeat, len_save, consumed = 0;
+int stack_ptr, step_count, repeat, len, len_save;
sljit_u32 chr; /* Any unicode character. */
sljit_u8 *bytes, *bytes_end, byte;
PCRE2_SPTR alternative, cc_save, oc;
@@ -5687,11 +5714,44 @@ PCRE2_UCHAR othercase[1];
#endif
repeat = 1;
+stack_ptr = 0;
+step_count = 10000;
while (TRUE)
{
- if (*rec_count == 0)
+ if (--step_count == 0)
return 0;
- (*rec_count)--;
+
+ SLJIT_ASSERT(chars <= chars_start + MAX_N_CHARS);
+
+ if (chars >= chars_end)
+ {
+ if (stack_ptr == 0)
+ return (int)(chars_end - chars_start);
+
+ --stack_ptr;
+ cc = cc_stack[stack_ptr];
+ chars = chars_stack[stack_ptr];
+
+ if (chars >= chars_end)
+ continue;
+
+ if (next_alternative_stack[stack_ptr] != 0)
+ {
+ /* When an alternative is processed, the
+ next alternative is pushed onto the stack. */
+ SLJIT_ASSERT(*cc == OP_ALT);
+ alternative = cc + GET(cc, 1);
+ if (*alternative == OP_ALT)
+ {
+ SLJIT_ASSERT(stack_ptr < SCAN_PREFIX_STACK_END);
+ SLJIT_ASSERT(chars_stack[stack_ptr] == chars);
+ SLJIT_ASSERT(next_alternative_stack[stack_ptr] == 1);
+ cc_stack[stack_ptr] = alternative;
+ stack_ptr++;
+ }
+ cc += 1 + LINK_SIZE;
+ }
+ }
last = TRUE;
any = FALSE;
@@ -5768,9 +5828,17 @@ while (TRUE)
#ifdef SUPPORT_UNICODE
if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc);
#endif
- max_chars = scan_prefix(common, cc + len, chars, max_chars, rec_count);
- if (max_chars == 0)
- return consumed;
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
+ {
+ chars_end = chars;
+ continue;
+ }
+
+ cc_stack[stack_ptr] = cc + len;
+ chars_stack[stack_ptr] = chars;
+ next_alternative_stack[stack_ptr] = 0;
+ stack_ptr++;
+
last = FALSE;
break;
@@ -5788,12 +5856,18 @@ while (TRUE)
case OP_CBRA:
case OP_CBRAPOS:
alternative = cc + GET(cc, 1);
- while (*alternative == OP_ALT)
+ if (*alternative == OP_ALT)
{
- max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, max_chars, rec_count);
- if (max_chars == 0)
- return consumed;
- alternative += GET(alternative, 1);
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
+ {
+ chars_end = chars;
+ continue;
+ }
+
+ cc_stack[stack_ptr] = alternative;
+ chars_stack[stack_ptr] = chars;
+ next_alternative_stack[stack_ptr] = 1;
+ stack_ptr++;
}
if (*cc == OP_CBRA || *cc == OP_CBRAPOS)
@@ -5804,14 +5878,21 @@ while (TRUE)
case OP_CLASS:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && !is_char7_bitset((const sljit_u8 *)(cc + 1), FALSE))
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
class = TRUE;
break;
case OP_NCLASS:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
- if (common->utf) return consumed;
+ if (common->utf)
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
class = TRUE;
break;
@@ -5819,7 +5900,11 @@ while (TRUE)
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
case OP_XCLASS:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
- if (common->utf) return consumed;
+ if (common->utf)
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc += GET(cc, 1);
@@ -5829,7 +5914,10 @@ while (TRUE)
case OP_DIGIT:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_digit, FALSE))
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc++;
@@ -5838,7 +5926,10 @@ while (TRUE)
case OP_WHITESPACE:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_space, FALSE))
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc++;
@@ -5847,7 +5938,10 @@ while (TRUE)
case OP_WORDCHAR:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_word, FALSE))
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc++;
@@ -5863,7 +5957,11 @@ while (TRUE)
case OP_ANY:
case OP_ALLANY:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
- if (common->utf) return consumed;
+ if (common->utf)
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc++;
@@ -5873,7 +5971,11 @@ while (TRUE)
case OP_NOTPROP:
case OP_PROP:
#if PCRE2_CODE_UNIT_WIDTH != 32
- if (common->utf) return consumed;
+ if (common->utf)
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc += 1 + 2;
@@ -5888,7 +5990,11 @@ while (TRUE)
case OP_NOTEXACT:
case OP_NOTEXACTI:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
- if (common->utf) return consumed;
+ if (common->utf)
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
repeat = GET2(cc, 1);
@@ -5896,21 +6002,20 @@ while (TRUE)
break;
default:
- return consumed;
+ chars_end = chars;
+ continue;
}
+ SLJIT_ASSERT(chars < chars_end);
+
if (any)
{
do
{
chars->count = 255;
-
- consumed++;
- if (--max_chars == 0)
- return consumed;
chars++;
}
- while (--repeat > 0);
+ while (--repeat > 0 && chars < chars_end);
repeat = 1;
continue;
@@ -5921,17 +6026,27 @@ while (TRUE)
bytes = (sljit_u8*) (cc + 1);
cc += 1 + 32 / sizeof(PCRE2_UCHAR);
+ SLJIT_ASSERT(last == TRUE && repeat == 1);
switch (*cc)
{
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPOSSTAR:
case OP_CRQUERY:
case OP_CRMINQUERY:
case OP_CRPOSQUERY:
- max_chars = scan_prefix(common, cc + 1, chars, max_chars, rec_count);
- if (max_chars == 0)
- return consumed;
+ last = FALSE;
+ /* Fall through */
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPOSSTAR:
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
+ {
+ chars_end = chars;
+ continue;
+ }
+
+ cc_stack[stack_ptr] = ++cc;
+ chars_stack[stack_ptr] = chars;
+ next_alternative_stack[stack_ptr] = 0;
+ stack_ptr++;
break;
default:
@@ -5945,7 +6060,13 @@ while (TRUE)
case OP_CRPOSRANGE:
repeat = GET2(cc, 1);
if (repeat <= 0)
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
+
+ last = (repeat != (int)GET2(cc, 1 + IMM2_SIZE));
+ cc += 1 + 2 * IMM2_SIZE;
break;
}
@@ -5980,36 +6101,13 @@ while (TRUE)
bytes = bytes_end - 32;
}
- consumed++;
- if (--max_chars == 0)
- return consumed;
chars++;
}
- while (--repeat > 0);
-
- switch (*cc)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPOSSTAR:
- return consumed;
-
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- case OP_CRPOSQUERY:
- cc++;
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- case OP_CRPOSRANGE:
- if (GET2(cc, 1) != GET2(cc, 1 + IMM2_SIZE))
- return consumed;
- cc += 1 + 2 * IMM2_SIZE;
- break;
- }
+ while (--repeat > 0 && chars < chars_end);
repeat = 1;
+ if (last)
+ chars_end = chars;
continue;
}
@@ -6025,7 +6123,10 @@ while (TRUE)
{
GETCHAR(chr, cc);
if ((int)PRIV(ord2utf)(char_othercase(common, chr), othercase) != len)
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
}
else
#endif
@@ -6056,7 +6157,6 @@ while (TRUE)
do
{
len--;
- consumed++;
chr = *cc;
add_prefix_char(*cc, chars, len == 0);
@@ -6064,15 +6164,13 @@ while (TRUE)
if (caseless)
add_prefix_char(*oc, chars, len == 0);
- if (--max_chars == 0)
- return consumed;
chars++;
cc++;
oc++;
}
- while (len > 0);
+ while (len > 0 && chars < chars_end);
- if (--repeat == 0)
+ if (--repeat == 0 || chars >= chars_end)
break;
len = len_save;
@@ -6081,7 +6179,7 @@ while (TRUE)
repeat = 1;
if (last)
- return consumed;
+ chars_end = chars;
}
}
@@ -6251,7 +6349,6 @@ int i, max, from;
int range_right = -1, range_len;
sljit_u8 *update_table = NULL;
BOOL in_range;
-sljit_u32 rec_count;
for (i = 0; i < MAX_N_CHARS; i++)
{
@@ -6259,8 +6356,7 @@ for (i = 0; i < MAX_N_CHARS; i++)
chars[i].last_count = 0;
}
-rec_count = 10000;
-max = scan_prefix(common, common->start, chars, MAX_N_CHARS, &rec_count);
+max = scan_prefix(common, common->start, chars);
if (max < 1)
return FALSE;
diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c
index 28bc7af9..066095fe 100644
--- a/src/pcre2_jit_test.c
+++ b/src/pcre2_jit_test.c
@@ -286,6 +286,7 @@ static struct regression_test_case regression_test_cases[] = {
{ CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" },
{ MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" },
{ MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" },
+ { M, A, 0, 0, "(?:a?|a)b", "ba" },
/* Greedy and non-greedy + operators */
{ MU, A, 0, 0, "(aa)+aa", "aaaaaaa" },
--
2.33.0

View File

@ -0,0 +1,270 @@
From 64549346f044dec18d18d06c2d08a68a68e26817 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <carenas@gmail.com>
Date: Sun, 9 Apr 2023 04:29:46 -0700
Subject: [PATCH] avoid inconsistency between \d and [:digit:] when using /a
(#223)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Conflict:don't modify Changelog and doc/*; keep pcre2.h.generic consistent
with pcre2.h.in according to 1de7291
Reference:https://github.com/PCRE2Project/pcre2/commit/64549346f044dec18d18d06c2d08a68a68e26817
Since a608946 (Additional PCRE2_EXTRA_ASCII_xxx code, 2023-02-01),
PCRE2_EXTRA_ASCII_BSD could be used to restrict \d to ASCII, causing
the following inconsistent behaviour in UCP mode:
PCRE2 version 10.43-DEV 2023-01-15
re> /\d/utf,ucp,ascii_bsd
data> ٣
No match
data>
re> /[[:digit:]]/utf,ucp,ascii_bsd
data> ٣
0: \x{663}
It has been suggested[1] that the change to match \p{Nd} when Unicode
is enabled for [:digit:] might have been unintentional and a bug, as
[:digit:] should be able to be POSIX compatible, so add a new flag
PCRE2_EXTRA_ASCII_DIGIT to avoid changing its definition in UCP mode.
[1] https://lore.kernel.org/git/CANgJU+U+xXsh9psd0z5Xjr+Se5QgdKkjQ7LUQ-PdUULSN3n4+g@mail.gmail.com/
---
src/pcre2.h.generic | 6 ++++++
src/pcre2.h.in | 1 +
src/pcre2_compile.c | 6 ++++--
src/pcre2test.c | 4 +++-
testdata/testinput5 | 10 +++++++++-
testdata/testinput7 | 10 ++++++++--
testdata/testoutput5 | 19 ++++++++++++++++++-
testdata/testoutput7 | 13 +++++++++++--
8 files changed, 60 insertions(+), 9 deletions(-)
diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic
index dad774ce..05cf9bc1 100644
--- a/src/pcre2.h.generic
+++ b/src/pcre2.h.generic
@@ -153,6 +153,12 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */
#define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */
+#define PCRE2_EXTRA_CASELESS_RESTRICT 0x00000080u /* C */
+#define PCRE2_EXTRA_ASCII_BSD 0x00000100u /* C */
+#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
+#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
+#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
+#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
/* These are for pcre2_jit_compile(). */
diff --git a/src/pcre2.h.in b/src/pcre2.h.in
index 7202c633..cd7fdcf2 100644
--- a/src/pcre2.h.in
+++ b/src/pcre2.h.in
@@ -158,6 +158,7 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
+#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
/* These are for pcre2_jit_compile(). */
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 95c4a79d..634360b7 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -786,7 +786,8 @@ are allowed. */
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
- PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX)
+ PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
+ PCRE2_EXTRA_ASCII_DIGIT)
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
@@ -3581,7 +3582,8 @@ while (ptr < ptrend)
#ifdef SUPPORT_UNICODE
if ((options & PCRE2_UCP) != 0 &&
- (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
+ (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
+ !(posix_class == 7 && (xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0))
{
int ptype = posix_substitutes[2*posix_class];
int pvalue = posix_substitutes[2*posix_class + 1];
diff --git a/src/pcre2test.c b/src/pcre2test.c
index 4da3ef90..21b19370 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@@ -651,6 +651,7 @@ static modstruct modlist[] = {
{ "ascii_bsd", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSD, CO(extra_options) },
{ "ascii_bss", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSS, CO(extra_options) },
{ "ascii_bsw", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSW, CO(extra_options) },
+ { "ascii_digit", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_DIGIT, CO(extra_options) },
{ "ascii_posix", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_POSIX, CO(extra_options) },
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
{ "bad_escape_is_literal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL, CO(extra_options) },
@@ -4294,13 +4295,14 @@ show_compile_extra_options(uint32_t options, const char *before,
const char *after)
{
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
-else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s",
+else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
before,
((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "",
((options & PCRE2_EXTRA_ALT_BSUX) != 0)? " alt_bsux" : "",
((options & PCRE2_EXTRA_ASCII_BSD) != 0)? " ascii_bsd" : "",
((options & PCRE2_EXTRA_ASCII_BSS) != 0)? " ascii_bss" : "",
((options & PCRE2_EXTRA_ASCII_BSW) != 0)? " ascii_bsw" : "",
+ ((options & PCRE2_EXTRA_ASCII_DIGIT) != 0)? " ascii_digit" : "",
((options & PCRE2_EXTRA_ASCII_POSIX) != 0)? " ascii_posix" : "",
((options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) != 0)? " bad_escape_is_literal" : "",
((options & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)? " caseless_restrict" : "",
diff --git a/testdata/testinput5 b/testdata/testinput5
index 0f105408..0624a0c3 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -1215,6 +1215,8 @@
/[[:digit:]]/B,ucp
+/[[:digit:]]/B,ucp,ascii_digit
+
/[[:graph:]]/B,ucp
/[[:print:]]/B,ucp
@@ -1227,7 +1229,7 @@
/[[:xdigit:]]/B,ucp
-# Unicode properties for \b abd \B
+# Unicode properties for \b and \B
/\b...\B/utf,ucp
abc_
@@ -2431,6 +2433,12 @@
/[[:digit:]]+/utf,ucp
123\x{660}456
+/[[:digit:]]+/utf,ucp,ascii_digit
+ 123\x{660}456
+
+/[[:digit:]]+/g,utf,ucp,ascii_digit
+ 123\x{660}456
+
/[[:digit:]]+/utf,ucp,ascii_posix
123\x{660}456
diff --git a/testdata/testinput7 b/testdata/testinput7
index a2b7fb8d..96deaa30 100644
--- a/testdata/testinput7
+++ b/testdata/testinput7
@@ -1657,7 +1657,7 @@
/^[\p{Xwd}]+/utf
ABCD1234\x{6ca}\x{a6c}\x{10a7}_
-# Unicode properties for \b abd \B
+# Unicode properties for \b and \B
/\b...\B/utf,ucp
abc_
@@ -2435,9 +2435,15 @@
/[[:digit:]]+/utf,ucp
123\x{660}456
+/[[:digit:]]+/utf,ucp,ascii_digit
+ 123\x{660}456
+
+/[[:digit:]]+/g,utf,ucp,ascii_digit
+ 123\x{660}456
+
/[[:digit:]]+/utf,ucp,ascii_posix
123\x{660}456
-
+
/>[[:space:]]+</utf,ucp
>\x{a0} \x{a0}<
>\x{a0}\x{a0}\x{a0}<
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 3cee990e..febcc954 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -2520,6 +2520,14 @@ No match
End
------------------------------------------------------------------
+/[[:digit:]]/B,ucp,ascii_digit
+------------------------------------------------------------------
+ Bra
+ [0-9]
+ Ket
+ End
+------------------------------------------------------------------
+
/[[:graph:]]/B,ucp
------------------------------------------------------------------
Bra
@@ -2568,7 +2576,7 @@ No match
End
------------------------------------------------------------------
-# Unicode properties for \b abd \B
+# Unicode properties for \b and \B
/\b...\B/utf,ucp
abc_
@@ -5359,6 +5367,15 @@ No match
123\x{660}456
0: 123\x{660}456
+/[[:digit:]]+/utf,ucp,ascii_digit
+ 123\x{660}456
+ 0: 123
+
+/[[:digit:]]+/g,utf,ucp,ascii_digit
+ 123\x{660}456
+ 0: 123
+ 0: 456
+
/[[:digit:]]+/utf,ucp,ascii_posix
123\x{660}456
0: 123
diff --git a/testdata/testoutput7 b/testdata/testoutput7
index 4065981d..d98178e6 100644
--- a/testdata/testoutput7
+++ b/testdata/testoutput7
@@ -2853,7 +2853,7 @@ No match
ABCD1234\x{6ca}\x{a6c}\x{10a7}_
0: ABCD1234\x{6ca}\x{a6c}\x{10a7}_
-# Unicode properties for \b abd \B
+# Unicode properties for \b and \B
/\b...\B/utf,ucp
abc_
@@ -4080,10 +4080,19 @@ No match
123\x{660}456
0: 123\x{660}456
+/[[:digit:]]+/utf,ucp,ascii_digit
+ 123\x{660}456
+ 0: 123
+
+/[[:digit:]]+/g,utf,ucp,ascii_digit
+ 123\x{660}456
+ 0: 123
+ 0: 456
+
/[[:digit:]]+/utf,ucp,ascii_posix
123\x{660}456
0: 123
-
+
/>[[:space:]]+</utf,ucp
>\x{a0} \x{a0}<
0: >\x{a0} \x{a0}<
--
2.33.0

View File

@ -1,6 +1,6 @@
Name: pcre2
Version: 10.42
Release: 11
Release: 12
Summary: Perl Compatible Regular Expressions
License: BSD
URL: http://www.pcre.org/
@ -39,6 +39,11 @@ Patch6027: backport-Add-Perl-titlecasing-475.patch
Patch6028: backport-Fix-incorrect-positive-error-code-from-pcre2_substitute.patch
Patch6029: backport-pcre2_compile-avoid-1-byte-buffer-overread-parsing-V.patch
Patch6030: backport-Improve-error-message-for-N-name-in-character-classes.patch
Patch6031: backport-Further-ASCII-tests-and-minor-bugfix-plus-ChangeLog-.patch
Patch6032: backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch
Patch6033: backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch
Patch6034: backport-Improve-error-offsets-for-character-classes-548.patch
Patch6035: backport-Non-recursive-scan-prefix-in-JIT-560.patch
BuildRequires: autoconf libtool automake coreutils gcc make readline-devel
Obsoletes: pcre2-utf16 pcre2-utf32 pcre2-tools
@ -156,6 +161,14 @@ make check
%{_pkgdocdir}/html/
%changelog
* Tue Dec 10 2024 hugel <gengqihu2@h-partners.com> - 10.42-12
- DESC:sync patches from upstream
backport-Further-ASCII-tests-and-minor-bugfix-plus-ChangeLog-.patch
backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch
backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch
backport-Improve-error-offsets-for-character-classes-548.patch
backport-Non-recursive-scan-prefix-in-JIT-560.patch
* Tue Nov 19 2024 yanglongkang <yanglongkang@h-partners.com> - 10.42-11
- DESC:sync patches from upstream