sync patches from upstream
This commit is contained in:
parent
3822b2533a
commit
abeb907f1d
@ -0,0 +1,318 @@
|
||||
From 16d7edb56757e5294eeeecc9a19135aab89a50ba Mon Sep 17 00:00:00 2001
|
||||
From: Nicholas Wilson <niwilson@microsoft.com>
|
||||
Date: Fri, 1 Nov 2024 17:13:34 +0000
|
||||
Subject: [PATCH] Fix the lookahead after [\d or [[:posix] to skip whitespace
|
||||
(#544)
|
||||
|
||||
Conflict:don't modify alt_extended_class because fc38d9e784 is not merged;
|
||||
don't modify class_op_state because class_op_state is not merged; adapt context
|
||||
Reference:https://github.com/PCRE2Project/pcre2/commit/16d7edb56757e5294eeeecc9a19135aab89a50ba
|
||||
|
||||
---
|
||||
src/pcre2_compile.c | 88 +++++++++++++++++++++++++++---------------
|
||||
src/pcre2_intmodedep.h | 2 +-
|
||||
testdata/testinput1 | 20 +++++++---
|
||||
testdata/testinput2 | 8 ++++
|
||||
testdata/testoutput1 | 30 ++++++++++----
|
||||
testdata/testoutput2 | 12 ++++++
|
||||
6 files changed, 113 insertions(+), 47 deletions(-)
|
||||
|
||||
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||
index 2493c871..9be26b07 100644
|
||||
--- a/src/pcre2_compile.c
|
||||
+++ b/src/pcre2_compile.c
|
||||
@@ -2681,7 +2681,14 @@ the main compiling phase. */
|
||||
/* States used for analyzing ranges in character classes. The two OK values
|
||||
must be last. */
|
||||
|
||||
-enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };
|
||||
+enum {
|
||||
+ RANGE_NO, /* State after '[' (initial), or '[a-z'; hyphen is literal */
|
||||
+ RANGE_STARTED, /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
|
||||
+ RANGE_FORBID_NO, /* State after '[\d'; '-]' is allowed but not '-1]' */
|
||||
+ RANGE_FORBID_STARTED, /* State after '[\d-' */
|
||||
+ RANGE_OK_ESCAPED, /* State after '[\1'; hyphen may be a range */
|
||||
+ RANGE_OK_LITERAL /* State after '[1'; hyphen may be a range */
|
||||
+};
|
||||
|
||||
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
|
||||
the storing of literal values in the main parsed pattern, where they can always
|
||||
@@ -2734,6 +2741,7 @@ PCRE2_SPTR thisptr;
|
||||
PCRE2_SPTR name;
|
||||
PCRE2_SPTR ptrend = cb->end_pattern;
|
||||
PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
|
||||
+PCRE2_SPTR class_range_forbid_ptr = NULL;
|
||||
named_group *ng;
|
||||
nest_save *top_nest, *end_nests;
|
||||
|
||||
@@ -3559,6 +3567,21 @@ while (ptr < ptrend)
|
||||
goto FAILED;
|
||||
}
|
||||
|
||||
+ /* Perl treats a hyphen after a POSIX class as a literal, not the
|
||||
+ start of a range. However, it gives a warning in its warning mode
|
||||
+ unless the hyphen is the last character in the class. PCRE does not
|
||||
+ have a warning mode, so we give an error, because this is likely an
|
||||
+ error on the user's part.
|
||||
+
|
||||
+ Roll back to the hyphen for the error position. */
|
||||
+
|
||||
+ if (class_range_state == RANGE_FORBID_STARTED)
|
||||
+ {
|
||||
+ ptr = class_range_forbid_ptr;
|
||||
+ errorcode = ERR50;
|
||||
+ goto FAILED;
|
||||
+ }
|
||||
+
|
||||
if (*ptr != CHAR_COLON)
|
||||
{
|
||||
errorcode = ERR13;
|
||||
@@ -3579,26 +3602,12 @@ while (ptr < ptrend)
|
||||
}
|
||||
ptr = tempptr + 2;
|
||||
|
||||
- /* Perl treats a hyphen after a POSIX class as a literal, not the
|
||||
- start of a range. However, it gives a warning in its warning mode
|
||||
- unless the hyphen is the last character in the class. PCRE does not
|
||||
- have a warning mode, so we give an error, because this is likely an
|
||||
- error on the user's part. */
|
||||
-
|
||||
- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
|
||||
- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
|
||||
- {
|
||||
- errorcode = ERR50;
|
||||
- goto FAILED;
|
||||
- }
|
||||
-
|
||||
- /* Set "a hyphen is not the start of a range" for the -] case, and also
|
||||
- in case the POSIX class is followed by \E or \Q\E (possibly repeated -
|
||||
- fuzzers do that kind of thing) and *then* a hyphen. This causes that
|
||||
- hyphen to be treated as a literal. I don't think it's worth setting up
|
||||
- special apparatus to do otherwise. */
|
||||
+ /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
|
||||
+ case, the hyphen is treated as a literal, but for '-1' it is disallowed
|
||||
+ (because it would be interpreted as range). */
|
||||
|
||||
- class_range_state = RANGE_NO;
|
||||
+ class_range_state = RANGE_FORBID_NO;
|
||||
+ class_range_forbid_ptr = ptr;
|
||||
|
||||
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
|
||||
of the POSIX classes are converted to use Unicode properties \p or \P
|
||||
@@ -3648,6 +3657,14 @@ while (ptr < ptrend)
|
||||
class_range_state = RANGE_STARTED;
|
||||
}
|
||||
|
||||
+ /* Handle forbidden start of range */
|
||||
+
|
||||
+ else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
|
||||
+ {
|
||||
+ *parsed_pattern++ = CHAR_MINUS;
|
||||
+ class_range_state = RANGE_FORBID_STARTED;
|
||||
+ }
|
||||
+
|
||||
/* Handle a literal character */
|
||||
|
||||
else if (c != CHAR_BACKSLASH)
|
||||
@@ -3670,6 +3687,12 @@ while (ptr < ptrend)
|
||||
}
|
||||
class_range_state = RANGE_NO;
|
||||
}
|
||||
+ else if (class_range_state == RANGE_FORBID_STARTED)
|
||||
+ {
|
||||
+ ptr = class_range_forbid_ptr;
|
||||
+ errorcode = ERR50;
|
||||
+ goto FAILED;
|
||||
+ }
|
||||
else /* Potential start of range */
|
||||
{
|
||||
class_range_state = char_is_literal?
|
||||
@@ -3733,13 +3756,23 @@ while (ptr < ptrend)
|
||||
if (class_range_state == RANGE_STARTED)
|
||||
{
|
||||
errorcode = ERR50;
|
||||
- goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
|
||||
+ goto FAILED;
|
||||
+ }
|
||||
+ /* Perl gives a warning unless the hyphen following a multi-character
|
||||
+ escape is the last character in the class. PCRE throws an error. */
|
||||
+ if (class_range_state == RANGE_FORBID_STARTED)
|
||||
+ {
|
||||
+ ptr = class_range_forbid_ptr;
|
||||
+ errorcode = ERR50;
|
||||
+ goto FAILED;
|
||||
}
|
||||
|
||||
/* Of the remaining escapes, only those that define characters are
|
||||
allowed in a class. None may start a range. */
|
||||
|
||||
- class_range_state = RANGE_NO;
|
||||
+ class_range_state = RANGE_FORBID_NO;
|
||||
+ class_range_forbid_ptr = ptr;
|
||||
+
|
||||
switch(escape)
|
||||
{
|
||||
case ESC_N:
|
||||
@@ -3779,6 +3812,7 @@ while (ptr < ptrend)
|
||||
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
|
||||
*parsed_pattern++ = META_ESCAPE + escape;
|
||||
*parsed_pattern++ = (ptype << 16) | pdata;
|
||||
+ class_range_forbid_ptr = ptr;
|
||||
}
|
||||
#else
|
||||
errorcode = ERR45;
|
||||
@@ -3791,16 +3825,6 @@ while (ptr < ptrend)
|
||||
ptr--;
|
||||
goto FAILED;
|
||||
}
|
||||
-
|
||||
- /* Perl gives a warning unless a following hyphen is the last character
|
||||
- in the class. PCRE throws an error. */
|
||||
-
|
||||
- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
|
||||
- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
|
||||
- {
|
||||
- errorcode = ERR50;
|
||||
- goto FAILED;
|
||||
- }
|
||||
}
|
||||
|
||||
/* Proceed to next thing in the class. */
|
||||
diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h
|
||||
index 598060c9..a11b4faa 100644
|
||||
--- a/src/pcre2_intmodedep.h
|
||||
+++ b/src/pcre2_intmodedep.h
|
||||
@@ -435,7 +435,7 @@ UTF-16 mode. */
|
||||
c = *eptr; \
|
||||
if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
|
||||
|
||||
-/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the
|
||||
+/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
|
||||
pointer, incrementing length if there is a low surrogate. This is called when
|
||||
we do not know if we are in UTF-16 mode. */
|
||||
|
||||
diff --git a/testdata/testinput1 b/testdata/testinput1
|
||||
index 0794502e..1e50369f 100644
|
||||
--- a/testdata/testinput1
|
||||
+++ b/testdata/testinput1
|
||||
@@ -5787,12 +5787,6 @@ ef) x/x,mark
|
||||
|
||||
/(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/
|
||||
|
||||
-/[s[:digit:]\E-H]+/
|
||||
- s09-H
|
||||
-
|
||||
-/[s[:digit:]\Q\E-H]+/
|
||||
- s09-H
|
||||
-
|
||||
/a+(?:|b)a/
|
||||
aaaa
|
||||
|
||||
@@ -6435,4 +6429,18 @@ ef) x/x,mark
|
||||
/(a\K.(?1)*)/
|
||||
abac
|
||||
|
||||
+/[[:digit:]- ]/xx
|
||||
+ 1
|
||||
+ -
|
||||
+\= Expect no match
|
||||
+ z
|
||||
+ \ \
|
||||
+
|
||||
+/[\d- ]/xx
|
||||
+ 1
|
||||
+ -
|
||||
+\= Expect no match
|
||||
+ z
|
||||
+ \ \
|
||||
+
|
||||
# End of testinput1
|
||||
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||||
index b6464a0b..61b94e69 100644
|
||||
--- a/testdata/testinput2
|
||||
+++ b/testdata/testinput2
|
||||
@@ -5981,4 +5981,12 @@ a)"xI
|
||||
a
|
||||
a\=noteol
|
||||
|
||||
+/[[:digit:] -Z]/xx
|
||||
+
|
||||
+/[\d -Z]/xx
|
||||
+
|
||||
+/[[:digit:]\E-H]/
|
||||
+
|
||||
+/[[:digit:]\Q\E-H]+/
|
||||
+
|
||||
# End of testinput2
|
||||
diff --git a/testdata/testoutput1 b/testdata/testoutput1
|
||||
index 8daf8362..6f927729 100644
|
||||
--- a/testdata/testoutput1
|
||||
+++ b/testdata/testoutput1
|
||||
@@ -9246,14 +9246,6 @@ No match
|
||||
|
||||
/(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/
|
||||
|
||||
-/[s[:digit:]\E-H]+/
|
||||
- s09-H
|
||||
- 0: s09-H
|
||||
-
|
||||
-/[s[:digit:]\Q\E-H]+/
|
||||
- s09-H
|
||||
- 0: s09-H
|
||||
-
|
||||
/a+(?:|b)a/
|
||||
aaaa
|
||||
0: aaaa
|
||||
@@ -10197,4 +10189,26 @@ No match
|
||||
0: c
|
||||
1: abac
|
||||
|
||||
+/[[:digit:]- ]/xx
|
||||
+ 1
|
||||
+ 0: 1
|
||||
+ -
|
||||
+ 0: -
|
||||
+\= Expect no match
|
||||
+ z
|
||||
+No match
|
||||
+ \ \
|
||||
+No match
|
||||
+
|
||||
+/[\d- ]/xx
|
||||
+ 1
|
||||
+ 0: 1
|
||||
+ -
|
||||
+ 0: -
|
||||
+\= Expect no match
|
||||
+ z
|
||||
+No match
|
||||
+ \ \
|
||||
+No match
|
||||
+
|
||||
# End of testinput1
|
||||
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||||
index 1075b4d4..86bfe964 100644
|
||||
--- a/testdata/testoutput2
|
||||
+++ b/testdata/testoutput2
|
||||
@@ -17815,6 +17815,18 @@ Subject length lower bound = 2
|
||||
a\=noteol
|
||||
0: a
|
||||
|
||||
+/[[:digit:] -Z]/xx
|
||||
+Failed: error 150 at offset 10: invalid range in character class
|
||||
+
|
||||
+/[\d -Z]/xx
|
||||
+Failed: error 150 at offset 3: invalid range in character class
|
||||
+
|
||||
+/[[:digit:]\E-H]/
|
||||
+Failed: error 150 at offset 10: invalid range in character class
|
||||
+
|
||||
+/[[:digit:]\Q\E-H]+/
|
||||
+Failed: error 150 at offset 10: invalid range in character class
|
||||
+
|
||||
# End of testinput2
|
||||
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
Error -62: bad serialized data
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -0,0 +1,104 @@
|
||||
From fc56fd790c1a3ba8f2890fc2b6afba21250923de Mon Sep 17 00:00:00 2001
|
||||
From: Philip Hazel <Philip.Hazel@gmail.com>
|
||||
Date: Thu, 2 Feb 2023 17:19:45 +0000
|
||||
Subject: [PATCH] Further ASCII tests and minor bugfix plus ChangeLog update
|
||||
|
||||
Conflict:don't modify ChangeLog
|
||||
Reference:https://github.com/PCRE2Project/pcre2/commit/fc56fd790c1a3ba8f2890fc2b6afba21250923de
|
||||
|
||||
---
|
||||
src/pcre2_compile.c | 5 ++---
|
||||
testdata/testinput5 | 5 +++++
|
||||
testdata/testinput7 | 5 +++++
|
||||
testdata/testoutput5 | 7 +++++++
|
||||
testdata/testoutput7 | 7 +++++++
|
||||
5 files changed, 26 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||
index b8a9e098..64a35bda 100644
|
||||
--- a/src/pcre2_compile.c
|
||||
+++ b/src/pcre2_compile.c
|
||||
@@ -2660,10 +2660,9 @@ the main compiling phase. */
|
||||
PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
|
||||
PCRE2_UNGREEDY)
|
||||
|
||||
-#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT)
|
||||
-
|
||||
#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \
|
||||
- PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW)
|
||||
+ PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
|
||||
+ PCRE2_EXTRA_ASCII_POSIX)
|
||||
|
||||
/* States used for analyzing ranges in character classes. The two OK values
|
||||
must be last. */
|
||||
diff --git a/testdata/testinput5 b/testdata/testinput5
|
||||
index 6e186cf0..49b46f82 100644
|
||||
--- a/testdata/testinput5
|
||||
+++ b/testdata/testinput5
|
||||
@@ -2434,6 +2434,11 @@
|
||||
/(?aP)[[:alnum:]\d]+/i,ucp,utf
|
||||
abc\x{660}xyz
|
||||
|
||||
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
|
||||
+ \x{660}A\x{660}
|
||||
+\= Expect no match
|
||||
+ \x{660}\x{660}\x{660}
|
||||
+
|
||||
# VARIOUS
|
||||
|
||||
/[\d\s\w]+/a,ucp,utf
|
||||
diff --git a/testdata/testinput7 b/testdata/testinput7
|
||||
index 64a37ad2..a2b7fb8d 100644
|
||||
--- a/testdata/testinput7
|
||||
+++ b/testdata/testinput7
|
||||
@@ -2453,6 +2453,11 @@
|
||||
/(?aP)[[:alnum:]\d]+/i,ucp,utf
|
||||
abc\x{660}xyz
|
||||
|
||||
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
|
||||
+ \x{660}A\x{660}
|
||||
+\= Expect no match
|
||||
+ \x{660}\x{660}\x{660}
|
||||
+
|
||||
# VARIOUS
|
||||
|
||||
/[\d\s\w]+/a,ucp,utf
|
||||
diff --git a/testdata/testoutput5 b/testdata/testoutput5
|
||||
index 26972f70..4f845c84 100644
|
||||
--- a/testdata/testoutput5
|
||||
+++ b/testdata/testoutput5
|
||||
@@ -5365,6 +5365,13 @@ No match
|
||||
abc\x{660}xyz
|
||||
0: abc\x{660}xyz
|
||||
|
||||
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
|
||||
+ \x{660}A\x{660}
|
||||
+ 0: \x{660}A\x{660}
|
||||
+\= Expect no match
|
||||
+ \x{660}\x{660}\x{660}
|
||||
+No match
|
||||
+
|
||||
# VARIOUS
|
||||
|
||||
/[\d\s\w]+/a,ucp,utf
|
||||
diff --git a/testdata/testoutput7 b/testdata/testoutput7
|
||||
index c830748c..4065981d 100644
|
||||
--- a/testdata/testoutput7
|
||||
+++ b/testdata/testoutput7
|
||||
@@ -4105,6 +4105,13 @@ No match
|
||||
abc\x{660}xyz
|
||||
0: abc\x{660}xyz
|
||||
|
||||
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
|
||||
+ \x{660}A\x{660}
|
||||
+ 0: \x{660}A\x{660}
|
||||
+\= Expect no match
|
||||
+ \x{660}\x{660}\x{660}
|
||||
+No match
|
||||
+
|
||||
# VARIOUS
|
||||
|
||||
/[\d\s\w]+/a,ucp,utf
|
||||
--
|
||||
2.33.0
|
||||
|
||||
425
backport-Improve-error-offsets-for-character-classes-548.patch
Normal file
425
backport-Improve-error-offsets-for-character-classes-548.patch
Normal file
@ -0,0 +1,425 @@
|
||||
From 6185344ed8617ff84a08764e808e5b3667c34a7a Mon Sep 17 00:00:00 2001
|
||||
From: Nicholas Wilson <niwilson@microsoft.com>
|
||||
Date: Wed, 6 Nov 2024 08:45:46 +0000
|
||||
Subject: [PATCH] Improve error offsets for character classes (#548)
|
||||
|
||||
Conflict:don't modify alt_extended_class because fc38d9e784 is not merged;
|
||||
don't modify class_op_state because class_op_state is not merged; adapt context
|
||||
Reference:https://github.com/PCRE2Project/pcre2/commit/6185344ed8617ff84a08764e808e5b3667c34a7a
|
||||
|
||||
* Error offset should be advanced by one character for "[\d-z]"
|
||||
invalid range error
|
||||
|
||||
The code does a 1-char lookahead for a hyphen, but then doesn't
|
||||
advance the pointer to consume the hyphen when returning the error.
|
||||
|
||||
Perl's error message (with "use warnings") does advance to just
|
||||
after the hyphen, so PCRE2 should match.
|
||||
|
||||
Fixes #545.
|
||||
|
||||
* Also improve error offsets for [[:bad:]], [[=...=]] and [z-\p{...}]
|
||||
cases
|
||||
---
|
||||
src/pcre2_compile.c | 67 +++++++++++++++++++-------------------
|
||||
testdata/testinput2 | 8 +++++
|
||||
testdata/testinput5 | 8 +++++
|
||||
testdata/testoutput2 | 76 +++++++++++++++++++++++++-------------------
|
||||
testdata/testoutput5 | 14 +++++++-
|
||||
5 files changed, 106 insertions(+), 67 deletions(-)
|
||||
|
||||
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||
index 32db44db..290e759b 100644
|
||||
--- a/src/pcre2_compile.c
|
||||
+++ b/src/pcre2_compile.c
|
||||
@@ -3563,6 +3563,7 @@ while (ptr < ptrend)
|
||||
|
||||
if (class_range_state == RANGE_STARTED)
|
||||
{
|
||||
+ ptr = tempptr + 2;
|
||||
errorcode = ERR50;
|
||||
goto FAILED;
|
||||
}
|
||||
@@ -3584,8 +3585,9 @@ while (ptr < ptrend)
|
||||
|
||||
if (*ptr != CHAR_COLON)
|
||||
{
|
||||
+ ptr = tempptr + 2;
|
||||
errorcode = ERR13;
|
||||
- goto FAILED_BACK;
|
||||
+ goto FAILED;
|
||||
}
|
||||
|
||||
if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
|
||||
@@ -3595,19 +3597,18 @@ while (ptr < ptrend)
|
||||
}
|
||||
|
||||
posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
|
||||
+ ptr = tempptr + 2;
|
||||
if (posix_class < 0)
|
||||
{
|
||||
errorcode = ERR30;
|
||||
goto FAILED;
|
||||
}
|
||||
- ptr = tempptr + 2;
|
||||
|
||||
/* Set "a hyphen is forbidden to be the start of a range". For the '-]'
|
||||
case, the hyphen is treated as a literal, but for '-1' it is disallowed
|
||||
(because it would be interpreted as range). */
|
||||
|
||||
class_range_state = RANGE_FORBID_NO;
|
||||
- class_range_forbid_ptr = ptr;
|
||||
|
||||
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
|
||||
of the POSIX classes are converted to use Unicode properties \p or \P
|
||||
@@ -3664,6 +3665,7 @@ while (ptr < ptrend)
|
||||
{
|
||||
*parsed_pattern++ = CHAR_MINUS;
|
||||
class_range_state = RANGE_FORBID_STARTED;
|
||||
+ class_range_forbid_ptr = ptr;
|
||||
}
|
||||
|
||||
/* Handle a literal character */
|
||||
@@ -3746,37 +3748,8 @@ while (ptr < ptrend)
|
||||
errorcode = ERR7;
|
||||
ptr--;
|
||||
goto FAILED;
|
||||
- }
|
||||
|
||||
- /* The second part of a range can be a single-character escape
|
||||
- sequence (detected above), but not any of the other escapes. Perl
|
||||
- treats a hyphen as a literal in such circumstances. However, in Perl's
|
||||
- warning mode, a warning is given, so PCRE now faults it, as it is
|
||||
- almost certainly a mistake on the user's part. */
|
||||
-
|
||||
- if (class_range_state == RANGE_STARTED)
|
||||
- {
|
||||
- errorcode = ERR50;
|
||||
- goto FAILED;
|
||||
- }
|
||||
- /* Perl gives a warning unless the hyphen following a multi-character
|
||||
- escape is the last character in the class. PCRE throws an error. */
|
||||
- if (class_range_state == RANGE_FORBID_STARTED)
|
||||
- {
|
||||
- ptr = class_range_forbid_ptr;
|
||||
- errorcode = ERR50;
|
||||
- goto FAILED;
|
||||
- }
|
||||
-
|
||||
- /* Of the remaining escapes, only those that define characters are
|
||||
- allowed in a class. None may start a range. */
|
||||
-
|
||||
- class_range_state = RANGE_FORBID_NO;
|
||||
- class_range_forbid_ptr = ptr;
|
||||
-
|
||||
- switch(escape)
|
||||
- {
|
||||
- case ESC_N:
|
||||
+ case ESC_N: /* Not permitted by Perl either */
|
||||
errorcode = ERR71;
|
||||
goto FAILED;
|
||||
|
||||
@@ -3813,7 +3786,6 @@ while (ptr < ptrend)
|
||||
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
|
||||
*parsed_pattern++ = META_ESCAPE + escape;
|
||||
*parsed_pattern++ = (ptype << 16) | pdata;
|
||||
- class_range_forbid_ptr = ptr;
|
||||
}
|
||||
#else
|
||||
errorcode = ERR45;
|
||||
@@ -3826,6 +3798,33 @@ while (ptr < ptrend)
|
||||
ptr--;
|
||||
goto FAILED;
|
||||
}
|
||||
+
|
||||
+ /* All the switch-cases above which end in "break" describe a set
|
||||
+ of characters. None may start a range. */
|
||||
+
|
||||
+ /* The second part of a range can be a single-character escape
|
||||
+ sequence (detected above), but not any of the other escapes. Perl
|
||||
+ treats a hyphen as a literal in such circumstances. However, in Perl's
|
||||
+ warning mode, a warning is given, so PCRE now faults it, as it is
|
||||
+ almost certainly a mistake on the user's part. */
|
||||
+
|
||||
+ if (class_range_state == RANGE_STARTED)
|
||||
+ {
|
||||
+ errorcode = ERR50;
|
||||
+ goto FAILED;
|
||||
+ }
|
||||
+
|
||||
+ /* Perl gives a warning unless the hyphen following a multi-character
|
||||
+ escape is the last character in the class. PCRE throws an error. */
|
||||
+
|
||||
+ if (class_range_state == RANGE_FORBID_STARTED)
|
||||
+ {
|
||||
+ ptr = class_range_forbid_ptr;
|
||||
+ errorcode = ERR50;
|
||||
+ goto FAILED;
|
||||
+ }
|
||||
+
|
||||
+ class_range_state = RANGE_FORBID_NO;
|
||||
}
|
||||
|
||||
/* Proceed to next thing in the class. */
|
||||
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||||
index 61b94e69..1fbb778e 100644
|
||||
--- a/testdata/testinput2
|
||||
+++ b/testdata/testinput2
|
||||
@@ -7008,4 +7008,12 @@ a)"xI
|
||||
|
||||
/[[:digit:]\Q\E-H]+/
|
||||
|
||||
+/[z-[:space:]]/
|
||||
+
|
||||
+/[z-\d]/
|
||||
+
|
||||
+/[[:space:]-z]/
|
||||
+
|
||||
+/[\d-z]/
|
||||
+
|
||||
# End of testinput2
|
||||
diff --git a/testdata/testinput5 b/testdata/testinput5
|
||||
index 494371b5..f3faeb8f 100644
|
||||
--- a/testdata/testinput5
|
||||
+++ b/testdata/testinput5
|
||||
@@ -2458,4 +2458,12 @@
|
||||
/abc/utf,substitute_extended,replace=>\777<
|
||||
abc
|
||||
|
||||
+/[z-\p{Lu}]/
|
||||
+
|
||||
+/[z-\pL]/
|
||||
+
|
||||
+/[\p{Lu}-z]/
|
||||
+
|
||||
+/[\pL-z]/
|
||||
+
|
||||
# End of testinput5
|
||||
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||||
index 86bfe964..99714596 100644
|
||||
--- a/testdata/testoutput2
|
||||
+++ b/testdata/testoutput2
|
||||
@@ -2176,13 +2176,13 @@ Starting code units: % 0 1 A B C D E F G H I J K L M N O P Q R S T U V W
|
||||
Subject length lower bound = 1
|
||||
|
||||
/[[.ch.]]/I
|
||||
-Failed: error 113 at offset 1: POSIX collating elements are not supported
|
||||
+Failed: error 113 at offset 7: POSIX collating elements are not supported
|
||||
|
||||
/[[=ch=]]/I
|
||||
-Failed: error 113 at offset 1: POSIX collating elements are not supported
|
||||
+Failed: error 113 at offset 7: POSIX collating elements are not supported
|
||||
|
||||
/[[:rhubarb:]]/I
|
||||
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||
+Failed: error 130 at offset 12: unknown POSIX class name
|
||||
|
||||
/[[:upper:]]/Ii
|
||||
Capture group count = 0
|
||||
@@ -8722,31 +8722,31 @@ Failed: error 162 at offset 4: subpattern name expected
|
||||
Failed: error 162 at offset 4: subpattern name expected
|
||||
|
||||
/[[:foo:]]/
|
||||
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||
+Failed: error 130 at offset 8: unknown POSIX class name
|
||||
|
||||
/[[:1234:]]/
|
||||
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||
+Failed: error 130 at offset 9: unknown POSIX class name
|
||||
|
||||
/[[:f\oo:]]/
|
||||
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||
+Failed: error 130 at offset 9: unknown POSIX class name
|
||||
|
||||
/[[: :]]/
|
||||
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||
+Failed: error 130 at offset 6: unknown POSIX class name
|
||||
|
||||
/[[:...:]]/
|
||||
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||
+Failed: error 130 at offset 8: unknown POSIX class name
|
||||
|
||||
/[[:l\ower:]]/
|
||||
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||
+Failed: error 130 at offset 11: unknown POSIX class name
|
||||
|
||||
/[[:abc\:]]/
|
||||
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||
+Failed: error 130 at offset 9: unknown POSIX class name
|
||||
|
||||
/[abc[:x\]pqr:]]/
|
||||
-Failed: error 130 at offset 6: unknown POSIX class name
|
||||
+Failed: error 130 at offset 14: unknown POSIX class name
|
||||
|
||||
/[[:a\dz:]]/
|
||||
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||
+Failed: error 130 at offset 9: unknown POSIX class name
|
||||
|
||||
/(^(a|b\g<-1'c))/
|
||||
Failed: error 157 at offset 8: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number
|
||||
@@ -11409,7 +11409,7 @@ Failed: error 171 at offset 4: \N is not supported in a class
|
||||
aNc
|
||||
|
||||
/a[B-\Nc]/
|
||||
-Failed: error 150 at offset 6: invalid range in character class
|
||||
+Failed: error 171 at offset 6: \N is not supported in a class
|
||||
|
||||
/a[B\Nc]/
|
||||
Failed: error 171 at offset 5: \N is not supported in a class
|
||||
@@ -13232,16 +13232,16 @@ Failed: error 178 at offset 5: digits missing in \x{} or \o{} or \N{U+}
|
||||
------------------------------------------------------------------
|
||||
|
||||
/[a-[:digit:]]+/
|
||||
-Failed: error 150 at offset 4: invalid range in character class
|
||||
+Failed: error 150 at offset 12: invalid range in character class
|
||||
|
||||
/[A-[:digit:]]+/
|
||||
-Failed: error 150 at offset 4: invalid range in character class
|
||||
+Failed: error 150 at offset 12: invalid range in character class
|
||||
|
||||
/[a-[.xxx.]]+/
|
||||
-Failed: error 150 at offset 4: invalid range in character class
|
||||
+Failed: error 150 at offset 10: invalid range in character class
|
||||
|
||||
/[a-[=xxx=]]+/
|
||||
-Failed: error 150 at offset 4: invalid range in character class
|
||||
+Failed: error 150 at offset 10: invalid range in character class
|
||||
|
||||
/[a-[!xxx!]]+/
|
||||
Failed: error 108 at offset 3: range out of order in character class
|
||||
@@ -13362,7 +13362,7 @@ No match
|
||||
No match
|
||||
|
||||
/[a[:<:]] should give error/
|
||||
-Failed: error 130 at offset 4: unknown POSIX class name
|
||||
+Failed: error 130 at offset 7: unknown POSIX class name
|
||||
|
||||
/(?=ab\K)/aftertext,allow_lookaround_bsk
|
||||
abcd\=startchar
|
||||
@@ -15510,11 +15510,11 @@ Failed: error 125 at offset 13: lookbehind assertion is not fixed length
|
||||
# Perl accepts these, but gives a warning. We can't warn, so give an error.
|
||||
|
||||
/[a-[:digit:]]+/
|
||||
-Failed: error 150 at offset 4: invalid range in character class
|
||||
+Failed: error 150 at offset 12: invalid range in character class
|
||||
a-a9-a
|
||||
|
||||
/[A-[:digit:]]+/
|
||||
-Failed: error 150 at offset 4: invalid range in character class
|
||||
+Failed: error 150 at offset 12: invalid range in character class
|
||||
A-A9-A
|
||||
|
||||
/[a-\d]+/
|
||||
@@ -15651,7 +15651,7 @@ Failed: error 128 at offset 63: assertion expected after (?( or (?(?C)
|
||||
.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X
|
||||
|
||||
/[:[:alnum:]-[[a:lnum:]+/
|
||||
-Failed: error 150 at offset 11: invalid range in character class
|
||||
+Failed: error 150 at offset 12: invalid range in character class
|
||||
|
||||
/((?(?C'')\QX\E(?!((?(?C'')(?!X=X));=)r*X=X));=)/
|
||||
Failed: error 128 at offset 11: assertion expected after (?( or (?(?C)
|
||||
@@ -16285,10 +16285,10 @@ Subject length lower bound = 3
|
||||
------------------------------------------------------------------
|
||||
|
||||
/[Q-\N]/B,bad_escape_is_literal
|
||||
-Failed: error 150 at offset 5: invalid range in character class
|
||||
+Failed: error 171 at offset 5: \N is not supported in a class
|
||||
|
||||
/[\s-_]/bad_escape_is_literal
|
||||
-Failed: error 150 at offset 3: invalid range in character class
|
||||
+Failed: error 150 at offset 4: invalid range in character class
|
||||
|
||||
/[_-\s]/bad_escape_is_literal
|
||||
Failed: error 150 at offset 5: invalid range in character class
|
||||
@@ -16443,19 +16443,19 @@ No match
|
||||
No match
|
||||
|
||||
/[[:digit:]-a]/
|
||||
-Failed: error 150 at offset 10: invalid range in character class
|
||||
+Failed: error 150 at offset 11: invalid range in character class
|
||||
|
||||
/[[:digit:]-[:print:]]/
|
||||
-Failed: error 150 at offset 10: invalid range in character class
|
||||
+Failed: error 150 at offset 11: invalid range in character class
|
||||
|
||||
/[\d-a]/
|
||||
-Failed: error 150 at offset 3: invalid range in character class
|
||||
+Failed: error 150 at offset 4: invalid range in character class
|
||||
|
||||
/[\H-z]/
|
||||
-Failed: error 150 at offset 3: invalid range in character class
|
||||
+Failed: error 150 at offset 4: invalid range in character class
|
||||
|
||||
/[\d-[:print:]]/
|
||||
-Failed: error 150 at offset 3: invalid range in character class
|
||||
+Failed: error 150 at offset 4: invalid range in character class
|
||||
|
||||
# Perl gets the second of these wrong, giving no match.
|
||||
|
||||
@@ -17816,16 +17816,28 @@ Subject length lower bound = 2
|
||||
0: a
|
||||
|
||||
/[[:digit:] -Z]/xx
|
||||
-Failed: error 150 at offset 10: invalid range in character class
|
||||
+Failed: error 150 at offset 14: invalid range in character class
|
||||
|
||||
/[\d -Z]/xx
|
||||
-Failed: error 150 at offset 3: invalid range in character class
|
||||
+Failed: error 150 at offset 7: invalid range in character class
|
||||
|
||||
/[[:digit:]\E-H]/
|
||||
-Failed: error 150 at offset 10: invalid range in character class
|
||||
+Failed: error 150 at offset 13: invalid range in character class
|
||||
|
||||
/[[:digit:]\Q\E-H]+/
|
||||
-Failed: error 150 at offset 10: invalid range in character class
|
||||
+Failed: error 150 at offset 15: invalid range in character class
|
||||
+
|
||||
+/[z-[:space:]]/
|
||||
+Failed: error 150 at offset 12: invalid range in character class
|
||||
+
|
||||
+/[z-\d]/
|
||||
+Failed: error 150 at offset 5: invalid range in character class
|
||||
+
|
||||
+/[[:space:]-z]/
|
||||
+Failed: error 150 at offset 11: invalid range in character class
|
||||
+
|
||||
+/[\d-z]/
|
||||
+Failed: error 150 at offset 4: invalid range in character class
|
||||
|
||||
# End of testinput2
|
||||
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
diff --git a/testdata/testoutput5 b/testdata/testoutput5
|
||||
index bf06ee12..0dba11c6 100644
|
||||
--- a/testdata/testoutput5
|
||||
+++ b/testdata/testoutput5
|
||||
@@ -795,7 +795,7 @@ No match
|
||||
No match
|
||||
|
||||
/[[:a\x{100}b:]]/utf
|
||||
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||
+Failed: error 130 at offset 14: unknown POSIX class name
|
||||
|
||||
/a[^]b/utf,allow_empty_class,match_unset_backref
|
||||
a\x{1234}b
|
||||
@@ -5403,4 +5403,16 @@ No match
|
||||
abc
|
||||
1: >\x{1ff}<
|
||||
|
||||
+/[z-\p{Lu}]/
|
||||
+Failed: error 150 at offset 9: invalid range in character class
|
||||
+
|
||||
+/[z-\pL]/
|
||||
+Failed: error 150 at offset 6: invalid range in character class
|
||||
+
|
||||
+/[\p{Lu}-z]/
|
||||
+Failed: error 150 at offset 8: invalid range in character class
|
||||
+
|
||||
+/[\pL-z]/
|
||||
+Failed: error 150 at offset 5: invalid range in character class
|
||||
+
|
||||
# End of testinput5
|
||||
--
|
||||
2.33.0
|
||||
|
||||
459
backport-Non-recursive-scan-prefix-in-JIT-560.patch
Normal file
459
backport-Non-recursive-scan-prefix-in-JIT-560.patch
Normal file
@ -0,0 +1,459 @@
|
||||
From 6f2da25f009ff463cd9357ae5ebe452fbec8ab5c Mon Sep 17 00:00:00 2001
|
||||
From: Zoltan Herczeg <zherczeg7@gmail.com>
|
||||
Date: Fri, 15 Nov 2024 13:21:03 +0100
|
||||
Subject: [PATCH] Non-recursive scan prefix in JIT (#560)
|
||||
|
||||
Conflict:NA
|
||||
Reference:https://github.com/PCRE2Project/pcre2/commit/6f2da25f009ff463cd9357ae5ebe452fbec8ab5c
|
||||
|
||||
---
|
||||
src/pcre2_jit_compile.c | 238 ++++++++++++++++++++++++++++------------
|
||||
src/pcre2_jit_test.c | 1 +
|
||||
2 files changed, 168 insertions(+), 71 deletions(-)
|
||||
|
||||
diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c
|
||||
index 127c393d..4449d59f 100644
|
||||
--- a/src/pcre2_jit_compile.c
|
||||
+++ b/src/pcre2_jit_compile.c
|
||||
@@ -5670,11 +5670,38 @@ if (last)
|
||||
chars->last_count++;
|
||||
}
|
||||
|
||||
-static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars, int max_chars, sljit_u32 *rec_count)
|
||||
+/* Value can be increased if needed. Patterns
|
||||
+such as /(a|){33}b/ can exhaust the stack.
|
||||
+
|
||||
+Note: /(a|){29}b/ already stops scan_prefix()
|
||||
+because it reaches the maximum step_count. */
|
||||
+#define SCAN_PREFIX_STACK_END 32
|
||||
+
|
||||
+/*
|
||||
+Scan prefix stores the prefix string in the chars array.
|
||||
+The elements of the chars array are either small character
|
||||
+sets or "any" (count is set to 255).
|
||||
+
|
||||
+Examples (the chars array is represented by a simple regex):
|
||||
+
|
||||
+/(abc|xbyd)/ prefix: /[ax]b[cy]/ (length: 3)
|
||||
+/a[a-z]b+c/ prefix: a.b (length: 3)
|
||||
+/ab?cd/ prefix: a[bc][cd] (length: 3)
|
||||
+/(ab|cd)|(ef|gh)/ prefix: [aceg][bdfh] (length: 2)
|
||||
+
|
||||
+The length is returned by scan_prefix(). The length is
|
||||
+less than or equal to the minimum length of the pattern.
|
||||
+*/
|
||||
+
|
||||
+static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars)
|
||||
{
|
||||
-/* Recursive function, which scans prefix literals. */
|
||||
+fast_forward_char_data *chars_start = chars;
|
||||
+fast_forward_char_data *chars_end = chars + MAX_N_CHARS;
|
||||
+PCRE2_SPTR cc_stack[SCAN_PREFIX_STACK_END];
|
||||
+fast_forward_char_data *chars_stack[SCAN_PREFIX_STACK_END];
|
||||
+sljit_u8 next_alternative_stack[SCAN_PREFIX_STACK_END];
|
||||
BOOL last, any, class, caseless;
|
||||
-int len, repeat, len_save, consumed = 0;
|
||||
+int stack_ptr, step_count, repeat, len, len_save;
|
||||
sljit_u32 chr; /* Any unicode character. */
|
||||
sljit_u8 *bytes, *bytes_end, byte;
|
||||
PCRE2_SPTR alternative, cc_save, oc;
|
||||
@@ -5687,11 +5714,44 @@ PCRE2_UCHAR othercase[1];
|
||||
#endif
|
||||
|
||||
repeat = 1;
|
||||
+stack_ptr = 0;
|
||||
+step_count = 10000;
|
||||
while (TRUE)
|
||||
{
|
||||
- if (*rec_count == 0)
|
||||
+ if (--step_count == 0)
|
||||
return 0;
|
||||
- (*rec_count)--;
|
||||
+
|
||||
+ SLJIT_ASSERT(chars <= chars_start + MAX_N_CHARS);
|
||||
+
|
||||
+ if (chars >= chars_end)
|
||||
+ {
|
||||
+ if (stack_ptr == 0)
|
||||
+ return (int)(chars_end - chars_start);
|
||||
+
|
||||
+ --stack_ptr;
|
||||
+ cc = cc_stack[stack_ptr];
|
||||
+ chars = chars_stack[stack_ptr];
|
||||
+
|
||||
+ if (chars >= chars_end)
|
||||
+ continue;
|
||||
+
|
||||
+ if (next_alternative_stack[stack_ptr] != 0)
|
||||
+ {
|
||||
+ /* When an alternative is processed, the
|
||||
+ next alternative is pushed onto the stack. */
|
||||
+ SLJIT_ASSERT(*cc == OP_ALT);
|
||||
+ alternative = cc + GET(cc, 1);
|
||||
+ if (*alternative == OP_ALT)
|
||||
+ {
|
||||
+ SLJIT_ASSERT(stack_ptr < SCAN_PREFIX_STACK_END);
|
||||
+ SLJIT_ASSERT(chars_stack[stack_ptr] == chars);
|
||||
+ SLJIT_ASSERT(next_alternative_stack[stack_ptr] == 1);
|
||||
+ cc_stack[stack_ptr] = alternative;
|
||||
+ stack_ptr++;
|
||||
+ }
|
||||
+ cc += 1 + LINK_SIZE;
|
||||
+ }
|
||||
+ }
|
||||
|
||||
last = TRUE;
|
||||
any = FALSE;
|
||||
@@ -5768,9 +5828,17 @@ while (TRUE)
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc);
|
||||
#endif
|
||||
- max_chars = scan_prefix(common, cc + len, chars, max_chars, rec_count);
|
||||
- if (max_chars == 0)
|
||||
- return consumed;
|
||||
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ cc_stack[stack_ptr] = cc + len;
|
||||
+ chars_stack[stack_ptr] = chars;
|
||||
+ next_alternative_stack[stack_ptr] = 0;
|
||||
+ stack_ptr++;
|
||||
+
|
||||
last = FALSE;
|
||||
break;
|
||||
|
||||
@@ -5788,12 +5856,18 @@ while (TRUE)
|
||||
case OP_CBRA:
|
||||
case OP_CBRAPOS:
|
||||
alternative = cc + GET(cc, 1);
|
||||
- while (*alternative == OP_ALT)
|
||||
+ if (*alternative == OP_ALT)
|
||||
{
|
||||
- max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, max_chars, rec_count);
|
||||
- if (max_chars == 0)
|
||||
- return consumed;
|
||||
- alternative += GET(alternative, 1);
|
||||
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ cc_stack[stack_ptr] = alternative;
|
||||
+ chars_stack[stack_ptr] = chars;
|
||||
+ next_alternative_stack[stack_ptr] = 1;
|
||||
+ stack_ptr++;
|
||||
}
|
||||
|
||||
if (*cc == OP_CBRA || *cc == OP_CBRAPOS)
|
||||
@@ -5804,14 +5878,21 @@ while (TRUE)
|
||||
case OP_CLASS:
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||
if (common->utf && !is_char7_bitset((const sljit_u8 *)(cc + 1), FALSE))
|
||||
- return consumed;
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
#endif
|
||||
class = TRUE;
|
||||
break;
|
||||
|
||||
case OP_NCLASS:
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
- if (common->utf) return consumed;
|
||||
+ if (common->utf)
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
#endif
|
||||
class = TRUE;
|
||||
break;
|
||||
@@ -5819,7 +5900,11 @@ while (TRUE)
|
||||
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
|
||||
case OP_XCLASS:
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
- if (common->utf) return consumed;
|
||||
+ if (common->utf)
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
#endif
|
||||
any = TRUE;
|
||||
cc += GET(cc, 1);
|
||||
@@ -5829,7 +5914,10 @@ while (TRUE)
|
||||
case OP_DIGIT:
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_digit, FALSE))
|
||||
- return consumed;
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
#endif
|
||||
any = TRUE;
|
||||
cc++;
|
||||
@@ -5838,7 +5926,10 @@ while (TRUE)
|
||||
case OP_WHITESPACE:
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_space, FALSE))
|
||||
- return consumed;
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
#endif
|
||||
any = TRUE;
|
||||
cc++;
|
||||
@@ -5847,7 +5938,10 @@ while (TRUE)
|
||||
case OP_WORDCHAR:
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_word, FALSE))
|
||||
- return consumed;
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
#endif
|
||||
any = TRUE;
|
||||
cc++;
|
||||
@@ -5863,7 +5957,11 @@ while (TRUE)
|
||||
case OP_ANY:
|
||||
case OP_ALLANY:
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
- if (common->utf) return consumed;
|
||||
+ if (common->utf)
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
#endif
|
||||
any = TRUE;
|
||||
cc++;
|
||||
@@ -5873,7 +5971,11 @@ while (TRUE)
|
||||
case OP_NOTPROP:
|
||||
case OP_PROP:
|
||||
#if PCRE2_CODE_UNIT_WIDTH != 32
|
||||
- if (common->utf) return consumed;
|
||||
+ if (common->utf)
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
#endif
|
||||
any = TRUE;
|
||||
cc += 1 + 2;
|
||||
@@ -5888,7 +5990,11 @@ while (TRUE)
|
||||
case OP_NOTEXACT:
|
||||
case OP_NOTEXACTI:
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
- if (common->utf) return consumed;
|
||||
+ if (common->utf)
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
#endif
|
||||
any = TRUE;
|
||||
repeat = GET2(cc, 1);
|
||||
@@ -5896,21 +6002,20 @@ while (TRUE)
|
||||
break;
|
||||
|
||||
default:
|
||||
- return consumed;
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
}
|
||||
|
||||
+ SLJIT_ASSERT(chars < chars_end);
|
||||
+
|
||||
if (any)
|
||||
{
|
||||
do
|
||||
{
|
||||
chars->count = 255;
|
||||
-
|
||||
- consumed++;
|
||||
- if (--max_chars == 0)
|
||||
- return consumed;
|
||||
chars++;
|
||||
}
|
||||
- while (--repeat > 0);
|
||||
+ while (--repeat > 0 && chars < chars_end);
|
||||
|
||||
repeat = 1;
|
||||
continue;
|
||||
@@ -5921,17 +6026,27 @@ while (TRUE)
|
||||
bytes = (sljit_u8*) (cc + 1);
|
||||
cc += 1 + 32 / sizeof(PCRE2_UCHAR);
|
||||
|
||||
+ SLJIT_ASSERT(last == TRUE && repeat == 1);
|
||||
switch (*cc)
|
||||
{
|
||||
- case OP_CRSTAR:
|
||||
- case OP_CRMINSTAR:
|
||||
- case OP_CRPOSSTAR:
|
||||
case OP_CRQUERY:
|
||||
case OP_CRMINQUERY:
|
||||
case OP_CRPOSQUERY:
|
||||
- max_chars = scan_prefix(common, cc + 1, chars, max_chars, rec_count);
|
||||
- if (max_chars == 0)
|
||||
- return consumed;
|
||||
+ last = FALSE;
|
||||
+ /* Fall through */
|
||||
+ case OP_CRSTAR:
|
||||
+ case OP_CRMINSTAR:
|
||||
+ case OP_CRPOSSTAR:
|
||||
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ cc_stack[stack_ptr] = ++cc;
|
||||
+ chars_stack[stack_ptr] = chars;
|
||||
+ next_alternative_stack[stack_ptr] = 0;
|
||||
+ stack_ptr++;
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -5945,7 +6060,13 @@ while (TRUE)
|
||||
case OP_CRPOSRANGE:
|
||||
repeat = GET2(cc, 1);
|
||||
if (repeat <= 0)
|
||||
- return consumed;
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ last = (repeat != (int)GET2(cc, 1 + IMM2_SIZE));
|
||||
+ cc += 1 + 2 * IMM2_SIZE;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -5980,36 +6101,13 @@ while (TRUE)
|
||||
bytes = bytes_end - 32;
|
||||
}
|
||||
|
||||
- consumed++;
|
||||
- if (--max_chars == 0)
|
||||
- return consumed;
|
||||
chars++;
|
||||
}
|
||||
- while (--repeat > 0);
|
||||
-
|
||||
- switch (*cc)
|
||||
- {
|
||||
- case OP_CRSTAR:
|
||||
- case OP_CRMINSTAR:
|
||||
- case OP_CRPOSSTAR:
|
||||
- return consumed;
|
||||
-
|
||||
- case OP_CRQUERY:
|
||||
- case OP_CRMINQUERY:
|
||||
- case OP_CRPOSQUERY:
|
||||
- cc++;
|
||||
- break;
|
||||
-
|
||||
- case OP_CRRANGE:
|
||||
- case OP_CRMINRANGE:
|
||||
- case OP_CRPOSRANGE:
|
||||
- if (GET2(cc, 1) != GET2(cc, 1 + IMM2_SIZE))
|
||||
- return consumed;
|
||||
- cc += 1 + 2 * IMM2_SIZE;
|
||||
- break;
|
||||
- }
|
||||
+ while (--repeat > 0 && chars < chars_end);
|
||||
|
||||
repeat = 1;
|
||||
+ if (last)
|
||||
+ chars_end = chars;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -6025,7 +6123,10 @@ while (TRUE)
|
||||
{
|
||||
GETCHAR(chr, cc);
|
||||
if ((int)PRIV(ord2utf)(char_othercase(common, chr), othercase) != len)
|
||||
- return consumed;
|
||||
+ {
|
||||
+ chars_end = chars;
|
||||
+ continue;
|
||||
+ }
|
||||
}
|
||||
else
|
||||
#endif
|
||||
@@ -6056,7 +6157,6 @@ while (TRUE)
|
||||
do
|
||||
{
|
||||
len--;
|
||||
- consumed++;
|
||||
|
||||
chr = *cc;
|
||||
add_prefix_char(*cc, chars, len == 0);
|
||||
@@ -6064,15 +6164,13 @@ while (TRUE)
|
||||
if (caseless)
|
||||
add_prefix_char(*oc, chars, len == 0);
|
||||
|
||||
- if (--max_chars == 0)
|
||||
- return consumed;
|
||||
chars++;
|
||||
cc++;
|
||||
oc++;
|
||||
}
|
||||
- while (len > 0);
|
||||
+ while (len > 0 && chars < chars_end);
|
||||
|
||||
- if (--repeat == 0)
|
||||
+ if (--repeat == 0 || chars >= chars_end)
|
||||
break;
|
||||
|
||||
len = len_save;
|
||||
@@ -6081,7 +6179,7 @@ while (TRUE)
|
||||
|
||||
repeat = 1;
|
||||
if (last)
|
||||
- return consumed;
|
||||
+ chars_end = chars;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6251,7 +6349,6 @@ int i, max, from;
|
||||
int range_right = -1, range_len;
|
||||
sljit_u8 *update_table = NULL;
|
||||
BOOL in_range;
|
||||
-sljit_u32 rec_count;
|
||||
|
||||
for (i = 0; i < MAX_N_CHARS; i++)
|
||||
{
|
||||
@@ -6259,8 +6356,7 @@ for (i = 0; i < MAX_N_CHARS; i++)
|
||||
chars[i].last_count = 0;
|
||||
}
|
||||
|
||||
-rec_count = 10000;
|
||||
-max = scan_prefix(common, common->start, chars, MAX_N_CHARS, &rec_count);
|
||||
+max = scan_prefix(common, common->start, chars);
|
||||
|
||||
if (max < 1)
|
||||
return FALSE;
|
||||
diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c
|
||||
index 28bc7af9..066095fe 100644
|
||||
--- a/src/pcre2_jit_test.c
|
||||
+++ b/src/pcre2_jit_test.c
|
||||
@@ -286,6 +286,7 @@ static struct regression_test_case regression_test_cases[] = {
|
||||
{ CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" },
|
||||
{ MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" },
|
||||
{ MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" },
|
||||
+ { M, A, 0, 0, "(?:a?|a)b", "ba" },
|
||||
|
||||
/* Greedy and non-greedy + operators */
|
||||
{ MU, A, 0, 0, "(aa)+aa", "aaaaaaa" },
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -0,0 +1,270 @@
|
||||
From 64549346f044dec18d18d06c2d08a68a68e26817 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <carenas@gmail.com>
|
||||
Date: Sun, 9 Apr 2023 04:29:46 -0700
|
||||
Subject: [PATCH] avoid inconsistency between \d and [:digit:] when using /a
|
||||
(#223)
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Conflict:don't modify Changelog and doc/*; keep pcre2.h.generic consistent
|
||||
with pcre2.h.in according to 1de7291
|
||||
Reference:https://github.com/PCRE2Project/pcre2/commit/64549346f044dec18d18d06c2d08a68a68e26817
|
||||
|
||||
Since a608946 (Additional PCRE2_EXTRA_ASCII_xxx code, 2023-02-01)
|
||||
PCRE2_EXTRA_ASCII_BSD could be used to restrict \d to ASCII causing
|
||||
the following inconsistent behaviour in UCP mode.
|
||||
|
||||
PCRE2 version 10.43-DEV 2023-01-15
|
||||
re> /\d/utf,ucp,ascii_bsd
|
||||
data> ٣
|
||||
No match
|
||||
data>
|
||||
re> /[[:digit:]]/utf,ucp,ascii_bsd
|
||||
data> ٣
|
||||
0: \x{663}
|
||||
|
||||
It has been suggested[1] that the change to match \p{Nd} when Unicode
|
||||
is enabled for [:digit:] might have been unintentional and a bug, as
|
||||
[:digit:] should be able to be POSIX compatible, so add a new flag
|
||||
PCRE2_EXTRA_ASCII_DIGIT to avoid changing its definition in UCP mode.
|
||||
|
||||
[1] https://lore.kernel.org/git/CANgJU+U+xXsh9psd0z5Xjr+Se5QgdKkjQ7LUQ-PdUULSN3n4+g@mail.gmail.com/
|
||||
---
|
||||
src/pcre2.h.generic | 6 ++++++
|
||||
src/pcre2.h.in | 1 +
|
||||
src/pcre2_compile.c | 6 ++++--
|
||||
src/pcre2test.c | 4 +++-
|
||||
testdata/testinput5 | 10 +++++++++-
|
||||
testdata/testinput7 | 10 ++++++++--
|
||||
testdata/testoutput5 | 19 ++++++++++++++++++-
|
||||
testdata/testoutput7 | 13 +++++++++++--
|
||||
8 files changed, 60 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic
|
||||
index dad774ce..05cf9bc1 100644
|
||||
--- a/src/pcre2.h.generic
|
||||
+++ b/src/pcre2.h.generic
|
||||
@@ -153,6 +153,12 @@ D is inspected during pcre2_dfa_match() execution
|
||||
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
|
||||
#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */
|
||||
#define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */
|
||||
+#define PCRE2_EXTRA_CASELESS_RESTRICT 0x00000080u /* C */
|
||||
+#define PCRE2_EXTRA_ASCII_BSD 0x00000100u /* C */
|
||||
+#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
|
||||
+#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
|
||||
+#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
|
||||
+#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
diff --git a/src/pcre2.h.in b/src/pcre2.h.in
|
||||
index 7202c633..cd7fdcf2 100644
|
||||
--- a/src/pcre2.h.in
|
||||
+++ b/src/pcre2.h.in
|
||||
@@ -158,6 +158,7 @@ D is inspected during pcre2_dfa_match() execution
|
||||
#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
|
||||
#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
|
||||
#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
|
||||
+#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||
index 95c4a79d..634360b7 100644
|
||||
--- a/src/pcre2_compile.c
|
||||
+++ b/src/pcre2_compile.c
|
||||
@@ -786,7 +786,8 @@ are allowed. */
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
|
||||
- PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX)
|
||||
+ PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
|
||||
+ PCRE2_EXTRA_ASCII_DIGIT)
|
||||
|
||||
/* Compile time error code numbers. They are given names so that they can more
|
||||
easily be tracked. When a new number is added, the tables called eint1 and
|
||||
@@ -3581,7 +3582,8 @@ while (ptr < ptrend)
|
||||
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if ((options & PCRE2_UCP) != 0 &&
|
||||
- (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
|
||||
+ (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
|
||||
+ !(posix_class == 7 && (xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0))
|
||||
{
|
||||
int ptype = posix_substitutes[2*posix_class];
|
||||
int pvalue = posix_substitutes[2*posix_class + 1];
|
||||
diff --git a/src/pcre2test.c b/src/pcre2test.c
|
||||
index 4da3ef90..21b19370 100644
|
||||
--- a/src/pcre2test.c
|
||||
+++ b/src/pcre2test.c
|
||||
@@ -651,6 +651,7 @@ static modstruct modlist[] = {
|
||||
{ "ascii_bsd", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSD, CO(extra_options) },
|
||||
{ "ascii_bss", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSS, CO(extra_options) },
|
||||
{ "ascii_bsw", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSW, CO(extra_options) },
|
||||
+ { "ascii_digit", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_DIGIT, CO(extra_options) },
|
||||
{ "ascii_posix", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_POSIX, CO(extra_options) },
|
||||
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
|
||||
{ "bad_escape_is_literal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL, CO(extra_options) },
|
||||
@@ -4294,13 +4295,14 @@ show_compile_extra_options(uint32_t options, const char *before,
|
||||
const char *after)
|
||||
{
|
||||
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
||||
-else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
+else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
before,
|
||||
((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "",
|
||||
((options & PCRE2_EXTRA_ALT_BSUX) != 0)? " alt_bsux" : "",
|
||||
((options & PCRE2_EXTRA_ASCII_BSD) != 0)? " ascii_bsd" : "",
|
||||
((options & PCRE2_EXTRA_ASCII_BSS) != 0)? " ascii_bss" : "",
|
||||
((options & PCRE2_EXTRA_ASCII_BSW) != 0)? " ascii_bsw" : "",
|
||||
+ ((options & PCRE2_EXTRA_ASCII_DIGIT) != 0)? " ascii_digit" : "",
|
||||
((options & PCRE2_EXTRA_ASCII_POSIX) != 0)? " ascii_posix" : "",
|
||||
((options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) != 0)? " bad_escape_is_literal" : "",
|
||||
((options & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)? " caseless_restrict" : "",
|
||||
diff --git a/testdata/testinput5 b/testdata/testinput5
|
||||
index 0f105408..0624a0c3 100644
|
||||
--- a/testdata/testinput5
|
||||
+++ b/testdata/testinput5
|
||||
@@ -1215,6 +1215,8 @@
|
||||
|
||||
/[[:digit:]]/B,ucp
|
||||
|
||||
+/[[:digit:]]/B,ucp,ascii_digit
|
||||
+
|
||||
/[[:graph:]]/B,ucp
|
||||
|
||||
/[[:print:]]/B,ucp
|
||||
@@ -1227,7 +1229,7 @@
|
||||
|
||||
/[[:xdigit:]]/B,ucp
|
||||
|
||||
-# Unicode properties for \b abd \B
|
||||
+# Unicode properties for \b and \B
|
||||
|
||||
/\b...\B/utf,ucp
|
||||
abc_
|
||||
@@ -2431,6 +2433,12 @@
|
||||
/[[:digit:]]+/utf,ucp
|
||||
123\x{660}456
|
||||
|
||||
+/[[:digit:]]+/utf,ucp,ascii_digit
|
||||
+ 123\x{660}456
|
||||
+
|
||||
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
||||
+ 123\x{660}456
|
||||
+
|
||||
/[[:digit:]]+/utf,ucp,ascii_posix
|
||||
123\x{660}456
|
||||
|
||||
diff --git a/testdata/testinput7 b/testdata/testinput7
|
||||
index a2b7fb8d..96deaa30 100644
|
||||
--- a/testdata/testinput7
|
||||
+++ b/testdata/testinput7
|
||||
@@ -1657,7 +1657,7 @@
|
||||
/^[\p{Xwd}]+/utf
|
||||
ABCD1234\x{6ca}\x{a6c}\x{10a7}_
|
||||
|
||||
-# Unicode properties for \b abd \B
|
||||
+# Unicode properties for \b and \B
|
||||
|
||||
/\b...\B/utf,ucp
|
||||
abc_
|
||||
@@ -2435,9 +2435,15 @@
|
||||
/[[:digit:]]+/utf,ucp
|
||||
123\x{660}456
|
||||
|
||||
+/[[:digit:]]+/utf,ucp,ascii_digit
|
||||
+ 123\x{660}456
|
||||
+
|
||||
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
||||
+ 123\x{660}456
|
||||
+
|
||||
/[[:digit:]]+/utf,ucp,ascii_posix
|
||||
123\x{660}456
|
||||
-
|
||||
+
|
||||
/>[[:space:]]+</utf,ucp
|
||||
>\x{a0} \x{a0}<
|
||||
>\x{a0}\x{a0}\x{a0}<
|
||||
diff --git a/testdata/testoutput5 b/testdata/testoutput5
|
||||
index 3cee990e..febcc954 100644
|
||||
--- a/testdata/testoutput5
|
||||
+++ b/testdata/testoutput5
|
||||
@@ -2520,6 +2520,14 @@ No match
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
+/[[:digit:]]/B,ucp,ascii_digit
|
||||
+------------------------------------------------------------------
|
||||
+ Bra
|
||||
+ [0-9]
|
||||
+ Ket
|
||||
+ End
|
||||
+------------------------------------------------------------------
|
||||
+
|
||||
/[[:graph:]]/B,ucp
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
@@ -2568,7 +2576,7 @@ No match
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
-# Unicode properties for \b abd \B
|
||||
+# Unicode properties for \b and \B
|
||||
|
||||
/\b...\B/utf,ucp
|
||||
abc_
|
||||
@@ -5359,6 +5367,15 @@ No match
|
||||
123\x{660}456
|
||||
0: 123\x{660}456
|
||||
|
||||
+/[[:digit:]]+/utf,ucp,ascii_digit
|
||||
+ 123\x{660}456
|
||||
+ 0: 123
|
||||
+
|
||||
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
||||
+ 123\x{660}456
|
||||
+ 0: 123
|
||||
+ 0: 456
|
||||
+
|
||||
/[[:digit:]]+/utf,ucp,ascii_posix
|
||||
123\x{660}456
|
||||
0: 123
|
||||
diff --git a/testdata/testoutput7 b/testdata/testoutput7
|
||||
index 4065981d..d98178e6 100644
|
||||
--- a/testdata/testoutput7
|
||||
+++ b/testdata/testoutput7
|
||||
@@ -2853,7 +2853,7 @@ No match
|
||||
ABCD1234\x{6ca}\x{a6c}\x{10a7}_
|
||||
0: ABCD1234\x{6ca}\x{a6c}\x{10a7}_
|
||||
|
||||
-# Unicode properties for \b abd \B
|
||||
+# Unicode properties for \b and \B
|
||||
|
||||
/\b...\B/utf,ucp
|
||||
abc_
|
||||
@@ -4080,10 +4080,19 @@ No match
|
||||
123\x{660}456
|
||||
0: 123\x{660}456
|
||||
|
||||
+/[[:digit:]]+/utf,ucp,ascii_digit
|
||||
+ 123\x{660}456
|
||||
+ 0: 123
|
||||
+
|
||||
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
||||
+ 123\x{660}456
|
||||
+ 0: 123
|
||||
+ 0: 456
|
||||
+
|
||||
/[[:digit:]]+/utf,ucp,ascii_posix
|
||||
123\x{660}456
|
||||
0: 123
|
||||
-
|
||||
+
|
||||
/>[[:space:]]+</utf,ucp
|
||||
>\x{a0} \x{a0}<
|
||||
0: >\x{a0} \x{a0}<
|
||||
--
|
||||
2.33.0
|
||||
|
||||
15
pcre2.spec
15
pcre2.spec
@ -1,6 +1,6 @@
|
||||
Name: pcre2
|
||||
Version: 10.42
|
||||
Release: 11
|
||||
Release: 12
|
||||
Summary: Perl Compatible Regular Expressions
|
||||
License: BSD
|
||||
URL: http://www.pcre.org/
|
||||
@ -39,6 +39,11 @@ Patch6027: backport-Add-Perl-titlecasing-475.patch
|
||||
Patch6028: backport-Fix-incorrect-positive-error-code-from-pcre2_substitute.patch
|
||||
Patch6029: backport-pcre2_compile-avoid-1-byte-buffer-overread-parsing-V.patch
|
||||
Patch6030: backport-Improve-error-message-for-N-name-in-character-classes.patch
|
||||
Patch6031: backport-Further-ASCII-tests-and-minor-bugfix-plus-ChangeLog-.patch
|
||||
Patch6032: backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch
|
||||
Patch6033: backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch
|
||||
Patch6034: backport-Improve-error-offsets-for-character-classes-548.patch
|
||||
Patch6035: backport-Non-recursive-scan-prefix-in-JIT-560.patch
|
||||
|
||||
BuildRequires: autoconf libtool automake coreutils gcc make readline-devel
|
||||
Obsoletes: pcre2-utf16 pcre2-utf32 pcre2-tools
|
||||
@ -156,6 +161,14 @@ make check
|
||||
%{_pkgdocdir}/html/
|
||||
|
||||
%changelog
|
||||
* Tue Dec 10 2024 hugel <gengqihu2@h-partners.com> - 10.42-12
|
||||
- DESC:sync patches from upstream
|
||||
backport-Further-ASCII-tests-and-minor-bugfix-plus-ChangeLog-.patch
|
||||
backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch
|
||||
backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch
|
||||
backport-Improve-error-offsets-for-character-classes-548.patch
|
||||
backport-Non-recursive-scan-prefix-in-JIT-560.patch
|
||||
|
||||
* Tue Nov 19 2024 yanglongkang <yanglongkang@h-partners.com> - 10.42-11
|
||||
- DESC:sync patches from upstream
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user