Compare commits
10 Commits
4d9e18c8d6
...
c8eb1717e3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c8eb1717e3 | ||
|
|
9cb3575357 | ||
|
|
0ec5d05b58 | ||
|
|
abeb907f1d | ||
|
|
3822b2533a | ||
|
|
f01ff6070f | ||
|
|
8be0b453f3 | ||
|
|
b64fc32b63 | ||
|
|
4f635d0fa7 | ||
|
|
96bb7bb183 |
69
backport-Add-Perl-titlecasing-475.patch
Normal file
69
backport-Add-Perl-titlecasing-475.patch
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
From f334e76dc765f23670e957413bae18c9d20b1d82 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Nicholas Wilson <nicholas@nicholaswilson.me.uk>
|
||||||
|
Date: Mon, 16 Sep 2024 17:38:40 +0100
|
||||||
|
Subject: [PATCH] Add Perl titlecasing (#475)
|
||||||
|
|
||||||
|
---
|
||||||
|
src/pcre2_substitute.c | 11 +++++++++++
|
||||||
|
testdata/testinput2 | 3 +++
|
||||||
|
testdata/testoutput2 | 4 ++++
|
||||||
|
3 files changed, 18 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c
|
||||||
|
index 1ccef0660..83ddb8364 100644
|
||||||
|
--- a/src/pcre2_substitute.c
|
||||||
|
+++ b/src/pcre2_substitute.c
|
||||||
|
@@ -839,6 +839,12 @@ do
|
||||||
|
forcecase = -1;
|
||||||
|
forcecasereset = 0;
|
||||||
|
ptr += 2;
|
||||||
|
+ if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_U)
|
||||||
|
+ {
|
||||||
|
+ /* Perl title-casing feature for \l\U (and \u\L) */
|
||||||
|
+ forcecasereset = 1;
|
||||||
|
+ ptr += 2;
|
||||||
|
+ }
|
||||||
|
continue;
|
||||||
|
|
||||||
|
case CHAR_U:
|
||||||
|
@@ -850,6 +856,11 @@ do
|
||||||
|
forcecase = 1;
|
||||||
|
forcecasereset = 0;
|
||||||
|
ptr += 2;
|
||||||
|
+ if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_L)
|
||||||
|
+ {
|
||||||
|
+ forcecasereset = -1;
|
||||||
|
+ ptr += 2;
|
||||||
|
+ }
|
||||||
|
continue;
|
||||||
|
|
||||||
|
default:
|
||||||
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||||||
|
index 51e2095c8..7a836c994 100644
|
||||||
|
--- a/testdata/testinput2
|
||||||
|
+++ b/testdata/testinput2
|
||||||
|
@@ -4612,6 +4612,9 @@ B)x/alt_verbnames,mark
|
||||||
|
/a(bc)(DE)/replace=a\u$1\U$1\E$1\l$2\L$2\Eab\Uab\LYZ\EDone,substitute_extended
|
||||||
|
abcDE
|
||||||
|
|
||||||
|
+/(Hello)|wORLD/g,replace=>${1:+\l\U$0:\u\L$0}<,substitute_extended
|
||||||
|
+ Hello between wORLD
|
||||||
|
+
|
||||||
|
/abcd/replace=xy\kz,substitute_extended
|
||||||
|
abcd
|
||||||
|
|
||||||
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||||||
|
index eeb635d6d..7c71866b7 100644
|
||||||
|
--- a/testdata/testoutput2
|
||||||
|
+++ b/testdata/testoutput2
|
||||||
|
@@ -14854,6 +14854,10 @@ No match
|
||||||
|
abcDE
|
||||||
|
1: aBcBCbcdEdeabAByzDone
|
||||||
|
|
||||||
|
+/(Hello)|wORLD/g,replace=>${1:+\l\U$0:\u\L$0}<,substitute_extended
|
||||||
|
+ Hello between wORLD
|
||||||
|
+ 2: >hELLO< between >World<
|
||||||
|
+
|
||||||
|
/abcd/replace=xy\kz,substitute_extended
|
||||||
|
abcd
|
||||||
|
Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string
|
||||||
1489
backport-Additional-PCRE2_EXTRA_ASCII_xxx-code.patch
Normal file
1489
backport-Additional-PCRE2_EXTRA_ASCII_xxx-code.patch
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,65 @@
|
|||||||
|
From 829414f8e549fe7e4b1a6696ca70664e89e5e7f0 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Nicholas Wilson <niwilson@microsoft.com>
|
||||||
|
Date: Wed, 18 Sep 2024 16:39:22 +0100
|
||||||
|
Subject: [PATCH] Fix incorrect positive error code from pcre2_substitute()
|
||||||
|
(#481)
|
||||||
|
|
||||||
|
---
|
||||||
|
src/pcre2_substitute.c | 4 +++-
|
||||||
|
testdata/testinput2 | 6 ++++++
|
||||||
|
testdata/testoutput2 | 10 ++++++++++
|
||||||
|
3 files changed, 19 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c
|
||||||
|
index 86c1d1e69..862ea9f73 100644
|
||||||
|
--- a/src/pcre2_substitute.c
|
||||||
|
+++ b/src/pcre2_substitute.c
|
||||||
|
@@ -134,7 +134,9 @@ for (; ptr < ptrend; ptr++)
|
||||||
|
ptr -= 1; /* Back to last code unit of escape */
|
||||||
|
if (errorcode != 0)
|
||||||
|
{
|
||||||
|
- rc = errorcode;
|
||||||
|
+ /* errorcode from check_escape is positive, so must not be returned by
|
||||||
|
+ pcre2_substitute(). */
|
||||||
|
+ rc = PCRE2_ERROR_BADREPESCAPE;
|
||||||
|
goto EXIT;
|
||||||
|
}
|
||||||
|
|
||||||
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||||||
|
index c2abdb890..8be78ff50 100644
|
||||||
|
--- a/testdata/testinput2
|
||||||
|
+++ b/testdata/testinput2
|
||||||
|
@@ -4201,6 +4201,12 @@
|
||||||
|
123abc123\=substitute_overflow_length,replace=[1]x$1z
|
||||||
|
123abc123\=substitute_overflow_length,replace=[0]x$1z
|
||||||
|
|
||||||
|
+/a(b)c/substitute_extended
|
||||||
|
+ ZabcZ\=replace=>${1:+ yes : no }
|
||||||
|
+ ZabcZ\=replace=>${1:+ \o{100} : \o{100} }
|
||||||
|
+ ZabcZ\=replace=>${1:+ \o{Z} : no }
|
||||||
|
+ ZabcZ\=replace=>${1:+ yes : \o{Z} }
|
||||||
|
+
|
||||||
|
"((?=(?(?=(?(?=(?(?=()))))))))"
|
||||||
|
a
|
||||||
|
|
||||||
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||||||
|
index 7a582cd23..ccf209b5c 100644
|
||||||
|
--- a/testdata/testoutput2
|
||||||
|
+++ b/testdata/testoutput2
|
||||||
|
@@ -13818,6 +13818,16 @@ Failed: error -48: no more memory: 10 code units are needed
|
||||||
|
123abc123\=substitute_overflow_length,replace=[0]x$1z
|
||||||
|
Failed: error -48: no more memory: 10 code units are needed
|
||||||
|
|
||||||
|
+/a(b)c/substitute_extended
|
||||||
|
+ ZabcZ\=replace=>${1:+ yes : no }
|
||||||
|
+ 1: Z> yes Z
|
||||||
|
+ ZabcZ\=replace=>${1:+ \o{100} : \o{100} }
|
||||||
|
+ 1: Z> @ Z
|
||||||
|
+ ZabcZ\=replace=>${1:+ \o{Z} : no }
|
||||||
|
+Failed: error -57 at offset 9 in replacement: bad escape sequence in replacement string
|
||||||
|
+ ZabcZ\=replace=>${1:+ yes : \o{Z} }
|
||||||
|
+Failed: error -57 at offset 15 in replacement: bad escape sequence in replacement string
|
||||||
|
+
|
||||||
|
"((?=(?(?=(?(?=(?(?=()))))))))"
|
||||||
|
a
|
||||||
|
0:
|
||||||
@ -0,0 +1,263 @@
|
|||||||
|
From d29e729000a3724e2aebaa64318dfd7530a55370 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Philip Hazel <Philip.Hazel@gmail.com>
|
||||||
|
Date: Wed, 4 Sep 2024 16:18:35 +0100
|
||||||
|
Subject: [PATCH] Fix non-recognition of some octal escapes in substitute
|
||||||
|
replacement strings
|
||||||
|
|
||||||
|
---
|
||||||
|
src/pcre2_compile.c | 15 ++++++++-------
|
||||||
|
src/pcre2_substitute.c | 4 ++--
|
||||||
|
testdata/testinput11 | 6 ++++++
|
||||||
|
testdata/testinput2 | 12 ++++++++++++
|
||||||
|
testdata/testinput5 | 3 +++
|
||||||
|
testdata/testinput9 | 8 ++++++++
|
||||||
|
testdata/testoutput11-16 | 8 ++++++++
|
||||||
|
testdata/testoutput11-32 | 8 ++++++++
|
||||||
|
testdata/testoutput2 | 16 ++++++++++++++++
|
||||||
|
testdata/testoutput5 | 4 ++++
|
||||||
|
testdata/testoutput9 | 10 ++++++++++
|
||||||
|
11 files changed, 85 insertions(+), 9 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||||
|
index ad2baf8..80a1a48 100644
|
||||||
|
--- a/src/pcre2_compile.c
|
||||||
|
+++ b/src/pcre2_compile.c
|
||||||
|
@@ -1480,8 +1480,8 @@ final code unit of the escape sequence.
|
||||||
|
This function is also called from pcre2_substitute() to handle escape sequences
|
||||||
|
in replacement strings. In this case, the cb argument is NULL, and in the case
|
||||||
|
of escapes that have further processing, only sequences that define a data
|
||||||
|
-character are recognised. The isclass argument is not relevant; the options
|
||||||
|
-argument is the final value of the compiled pattern's options.
|
||||||
|
+character are recognised. The options argument is the final value of the
|
||||||
|
+compiled pattern's options.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
ptrptr points to the input position pointer
|
||||||
|
@@ -1496,7 +1496,7 @@ Arguments:
|
||||||
|
errorcodeptr points to the errorcode variable (containing zero)
|
||||||
|
options the current options bits
|
||||||
|
xoptions the current extra options bits
|
||||||
|
- isclass TRUE if inside a character class
|
||||||
|
+ isclassorsub TRUE if in a character class or called from pcre2_substitute()
|
||||||
|
cb compile data block or NULL when called from pcre2_substitute()
|
||||||
|
|
||||||
|
Returns: zero => a data character
|
||||||
|
@@ -1507,7 +1507,7 @@ Returns: zero => a data character
|
||||||
|
|
||||||
|
int
|
||||||
|
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
|
||||||
|
- int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass,
|
||||||
|
+ int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclassorsub,
|
||||||
|
compile_block *cb)
|
||||||
|
{
|
||||||
|
BOOL utf = (options & PCRE2_UTF) != 0;
|
||||||
|
@@ -1607,7 +1607,8 @@ else
|
||||||
|
|
||||||
|
if (cb == NULL)
|
||||||
|
{
|
||||||
|
- if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
|
||||||
|
+ if (c < CHAR_0 ||
|
||||||
|
+ (c > CHAR_9 && (c != CHAR_c && c != CHAR_o && c != CHAR_x)))
|
||||||
|
{
|
||||||
|
*errorcodeptr = ERR3;
|
||||||
|
return 0;
|
||||||
|
@@ -1719,7 +1720,7 @@ else
|
||||||
|
*/
|
||||||
|
|
||||||
|
case CHAR_g:
|
||||||
|
- if (isclass) break;
|
||||||
|
+ if (isclassorsub) break;
|
||||||
|
|
||||||
|
if (ptr >= ptrend)
|
||||||
|
{
|
||||||
|
@@ -1791,7 +1792,7 @@ else
|
||||||
|
case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
|
||||||
|
case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
|
||||||
|
|
||||||
|
- if (!isclass)
|
||||||
|
+ if (!isclassorsub)
|
||||||
|
{
|
||||||
|
oldptr = ptr;
|
||||||
|
ptr--; /* Back to the digit */
|
||||||
|
diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c
|
||||||
|
index d1f17eb05..1ccef0660 100644
|
||||||
|
--- a/src/pcre2_substitute.c
|
||||||
|
+++ b/src/pcre2_substitute.c
|
||||||
|
@@ -130,7 +130,7 @@ for (; ptr < ptrend; ptr++)
|
||||||
|
|
||||||
|
ptr += 1; /* Must point after \ */
|
||||||
|
erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
|
||||||
|
- code->overall_options, code->extra_options, FALSE, NULL);
|
||||||
|
+ code->overall_options, code->extra_options, TRUE, NULL);
|
||||||
|
ptr -= 1; /* Back to last code unit of escape */
|
||||||
|
if (errorcode != 0)
|
||||||
|
{
|
||||||
|
@@ -858,7 +858,7 @@ do
|
||||||
|
|
||||||
|
ptr++; /* Point after \ */
|
||||||
|
rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
|
||||||
|
- code->overall_options, code->extra_options, FALSE, NULL);
|
||||||
|
+ code->overall_options, code->extra_options, TRUE, NULL);
|
||||||
|
if (errorcode != 0) goto BADESCAPE;
|
||||||
|
|
||||||
|
switch(rc)
|
||||||
|
diff --git a/testdata/testinput11 b/testdata/testinput11
|
||||||
|
index 2bc8a25e3..69aea351b 100644
|
||||||
|
--- a/testdata/testinput11
|
||||||
|
+++ b/testdata/testinput11
|
||||||
|
@@ -371,4 +371,10 @@
|
||||||
|
/(?i:A{1,}\6666666666)/
|
||||||
|
A\x{1b6}6666666
|
||||||
|
|
||||||
|
+/abc/substitute_extended,replace=>\777<
|
||||||
|
+ abc
|
||||||
|
+
|
||||||
|
+/abc/substitute_extended,replace=>\o{012345}<
|
||||||
|
+ abc
|
||||||
|
+
|
||||||
|
# End of testinput11
|
||||||
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||||||
|
index 7d8dfc149..51e2095c8 100644
|
||||||
|
--- a/testdata/testinput2
|
||||||
|
+++ b/testdata/testinput2
|
||||||
|
@@ -4668,6 +4668,18 @@ B)x/alt_verbnames,mark
|
||||||
|
/abcd/g
|
||||||
|
>abcd1234abcd5678<\=replace=wxyz,substitute_matched
|
||||||
|
|
||||||
|
+/abc/substitute_extended,replace=>\045<
|
||||||
|
+ abc
|
||||||
|
+
|
||||||
|
+/abc/substitute_extended,replace=>\45<
|
||||||
|
+ abc
|
||||||
|
+
|
||||||
|
+/abc/substitute_extended,replace=>\o{45}<
|
||||||
|
+ abc
|
||||||
|
+
|
||||||
|
+/abc/substitute_extended,replace=>\845<
|
||||||
|
+ abc
|
||||||
|
+
|
||||||
|
/^(o(\1{72}{\"{\\{00000059079}\d*){74}}){19}/I
|
||||||
|
|
||||||
|
/((p(?'K/
|
||||||
|
diff --git a/testdata/testinput5 b/testdata/testinput5
|
||||||
|
index 9126236..da2830d 100644
|
||||||
|
--- a/testdata/testinput5
|
||||||
|
+++ b/testdata/testinput5
|
||||||
|
@@ -2442,4 +2442,7 @@
|
||||||
|
|
||||||
|
# End PCRE2_EXTRA_ASCII_xxx tests
|
||||||
|
|
||||||
|
+/abc/utf,substitute_extended,replace=>\777<
|
||||||
|
+ abc
|
||||||
|
+
|
||||||
|
# End of testinput5
|
||||||
|
diff --git a/testdata/testinput9 b/testdata/testinput9
|
||||||
|
index 4eb228afe..f2f50033f 100644
|
||||||
|
--- a/testdata/testinput9
|
||||||
|
+++ b/testdata/testinput9
|
||||||
|
@@ -263,4 +263,12 @@
|
||||||
|
/(?i:A{1,}\6666666666)/
|
||||||
|
A\x{1b6}6666666
|
||||||
|
|
||||||
|
+# Should cause an error
|
||||||
|
+/abc/substitute_extended,replace=>\777<
|
||||||
|
+ abc
|
||||||
|
+
|
||||||
|
+# Should cause an error
|
||||||
|
+/abc/substitute_extended,replace=>\o{012345}<
|
||||||
|
+ abc
|
||||||
|
+
|
||||||
|
# End of testinput9
|
||||||
|
diff --git a/testdata/testoutput11-16 b/testdata/testoutput11-16
|
||||||
|
index f70d89ee9..806f6b3e0 100644
|
||||||
|
--- a/testdata/testoutput11-16
|
||||||
|
+++ b/testdata/testoutput11-16
|
||||||
|
@@ -665,4 +665,12 @@ Subject length lower bound = 1
|
||||||
|
A\x{1b6}6666666
|
||||||
|
0: A\x{1b6}6666666
|
||||||
|
|
||||||
|
+/abc/substitute_extended,replace=>\777<
|
||||||
|
+ abc
|
||||||
|
+ 1: >\x{1ff}<
|
||||||
|
+
|
||||||
|
+/abc/substitute_extended,replace=>\o{012345}<
|
||||||
|
+ abc
|
||||||
|
+ 1: >\x{14e5}<
|
||||||
|
+
|
||||||
|
# End of testinput11
|
||||||
|
diff --git a/testdata/testoutput11-32 b/testdata/testoutput11-32
|
||||||
|
index 961c4cd05..c5f5c8a42 100644
|
||||||
|
--- a/testdata/testoutput11-32
|
||||||
|
+++ b/testdata/testoutput11-32
|
||||||
|
@@ -671,4 +671,12 @@ Subject length lower bound = 1
|
||||||
|
A\x{1b6}6666666
|
||||||
|
0: A\x{1b6}6666666
|
||||||
|
|
||||||
|
+/abc/substitute_extended,replace=>\777<
|
||||||
|
+ abc
|
||||||
|
+ 1: >\x{1ff}<
|
||||||
|
+
|
||||||
|
+/abc/substitute_extended,replace=>\o{012345}<
|
||||||
|
+ abc
|
||||||
|
+ 1: >\x{14e5}<
|
||||||
|
+
|
||||||
|
# End of testinput11
|
||||||
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||||||
|
index 1cffe6a36..eeb635d6d 100644
|
||||||
|
--- a/testdata/testoutput2
|
||||||
|
+++ b/testdata/testoutput2
|
||||||
|
@@ -14934,6 +14934,22 @@ Failed: error -55 at offset 3 in replacement: requested value is not set
|
||||||
|
>abcd1234abcd5678<\=replace=wxyz,substitute_matched
|
||||||
|
2: >wxyz1234wxyz5678<
|
||||||
|
|
||||||
|
+/abc/substitute_extended,replace=>\045<
|
||||||
|
+ abc
|
||||||
|
+ 1: >%<
|
||||||
|
+
|
||||||
|
+/abc/substitute_extended,replace=>\45<
|
||||||
|
+ abc
|
||||||
|
+ 1: >%<
|
||||||
|
+
|
||||||
|
+/abc/substitute_extended,replace=>\o{45}<
|
||||||
|
+ abc
|
||||||
|
+ 1: >%<
|
||||||
|
+
|
||||||
|
+/abc/substitute_extended,replace=>\845<
|
||||||
|
+ abc
|
||||||
|
+ 1: >845<
|
||||||
|
+
|
||||||
|
/^(o(\1{72}{\"{\\{00000059079}\d*){74}}){19}/I
|
||||||
|
Capture group count = 2
|
||||||
|
Max back reference = 1
|
||||||
|
diff --git a/testdata/testoutput5 b/testdata/testoutput5
|
||||||
|
index b1842df..24d849c 100644
|
||||||
|
--- a/testdata/testoutput5
|
||||||
|
+++ b/testdata/testoutput5
|
||||||
|
@@ -5375,4 +5375,8 @@ No match
|
||||||
|
|
||||||
|
# End PCRE2_EXTRA_ASCII_xxx tests
|
||||||
|
|
||||||
|
+/abc/utf,substitute_extended,replace=>\777<
|
||||||
|
+ abc
|
||||||
|
+ 1: >\x{1ff}<
|
||||||
|
+
|
||||||
|
# End of testinput5
|
||||||
|
diff --git a/testdata/testoutput9 b/testdata/testoutput9
|
||||||
|
index 3613703e0..8556c9e14 100644
|
||||||
|
--- a/testdata/testoutput9
|
||||||
|
+++ b/testdata/testoutput9
|
||||||
|
@@ -371,4 +371,14 @@ Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP),
|
||||||
|
Failed: error 151 at offset 13: octal value is greater than \377 in 8-bit non-UTF-8 mode
|
||||||
|
A\x{1b6}6666666
|
||||||
|
|
||||||
|
+# Should cause an error
|
||||||
|
+/abc/substitute_extended,replace=>\777<
|
||||||
|
+ abc
|
||||||
|
+Failed: error -57 at offset 5 in replacement: bad escape sequence in replacement string
|
||||||
|
+
|
||||||
|
+# Should cause an error
|
||||||
|
+/abc/substitute_extended,replace=>\o{012345}<
|
||||||
|
+ abc
|
||||||
|
+Failed: error -57 at offset 10 in replacement: bad escape sequence in replacement string
|
||||||
|
+
|
||||||
|
# End of testinput9
|
||||||
49
backport-Fix-oversight-in-adding-new-pcre2grep-test.patch
Normal file
49
backport-Fix-oversight-in-adding-new-pcre2grep-test.patch
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
From ace78dc460e7e80592d86216cfdd36a62b083bb3 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Philip Hazel <Philip.Hazel@gmail.com>
|
||||||
|
Date: Wed, 27 Nov 2024 15:50:34 +0000
|
||||||
|
Subject: [PATCH] Fix oversight in adding new pcre2grep test
|
||||||
|
|
||||||
|
Conflict:NA
|
||||||
|
Reference:https://github.com/PCRE2Project/pcre2/commit/ace78dc460e7e80592d86216cfdd36a62b083bb3
|
||||||
|
|
||||||
|
---
|
||||||
|
testdata/grepinput | 2 +-
|
||||||
|
testdata/grepoutput | 3 +--
|
||||||
|
2 files changed, 2 insertions(+), 3 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/testdata/grepinput b/testdata/grepinput
|
||||||
|
index 91d3db88..1a0a9c0f 100644
|
||||||
|
--- a/testdata/grepinput
|
||||||
|
+++ b/testdata/grepinput
|
||||||
|
@@ -630,7 +630,7 @@ asd
|
||||||
|
dfg
|
||||||
|
ghj
|
||||||
|
jkl
|
||||||
|
-abc
|
||||||
|
+abx
|
||||||
|
def
|
||||||
|
ghi
|
||||||
|
xyz
|
||||||
|
diff --git a/testdata/grepoutput b/testdata/grepoutput
|
||||||
|
index 58ea858d..abfabe15 100644
|
||||||
|
--- a/testdata/grepoutput
|
||||||
|
+++ b/testdata/grepoutput
|
||||||
|
@@ -104,7 +104,6 @@ pcre2grep: Error in command-line regex at offset 4: quantifier does not follow a
|
||||||
|
RC=2
|
||||||
|
---------------------------- Test 16 -----------------------------
|
||||||
|
pcre2grep: Failed to open ./testdata/nonexistfile: No such file or directory
|
||||||
|
-./testdata/grepinput:abc
|
||||||
|
RC=2
|
||||||
|
---------------------------- Test 17 -----------------------------
|
||||||
|
features should be added at the end, because some of the tests involve the
|
||||||
|
@@ -1306,7 +1305,7 @@ RC=0
|
||||||
|
630-dfg
|
||||||
|
631-ghj
|
||||||
|
632:jkl
|
||||||
|
-633-abc
|
||||||
|
+633-abx
|
||||||
|
634-def
|
||||||
|
635-ghi
|
||||||
|
RC=0
|
||||||
|
--
|
||||||
|
2.23.0
|
||||||
@ -0,0 +1,318 @@
|
|||||||
|
From 16d7edb56757e5294eeeecc9a19135aab89a50ba Mon Sep 17 00:00:00 2001
|
||||||
|
From: Nicholas Wilson <niwilson@microsoft.com>
|
||||||
|
Date: Fri, 1 Nov 2024 17:13:34 +0000
|
||||||
|
Subject: [PATCH] Fix the lookahead after [\d or [[:posix] to skip whitespace
|
||||||
|
(#544)
|
||||||
|
|
||||||
|
Conflict:don't modify alt_extended_class because fc38d9e784 is not merged;
|
||||||
|
don't modify class_op_state because class_op_state is not merged; adapt context
|
||||||
|
Reference:https://github.com/PCRE2Project/pcre2/commit/16d7edb56757e5294eeeecc9a19135aab89a50ba
|
||||||
|
|
||||||
|
---
|
||||||
|
src/pcre2_compile.c | 88 +++++++++++++++++++++++++++---------------
|
||||||
|
src/pcre2_intmodedep.h | 2 +-
|
||||||
|
testdata/testinput1 | 20 +++++++---
|
||||||
|
testdata/testinput2 | 8 ++++
|
||||||
|
testdata/testoutput1 | 30 ++++++++++----
|
||||||
|
testdata/testoutput2 | 12 ++++++
|
||||||
|
6 files changed, 113 insertions(+), 47 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||||
|
index 2493c871..9be26b07 100644
|
||||||
|
--- a/src/pcre2_compile.c
|
||||||
|
+++ b/src/pcre2_compile.c
|
||||||
|
@@ -2681,7 +2681,14 @@ the main compiling phase. */
|
||||||
|
/* States used for analyzing ranges in character classes. The two OK values
|
||||||
|
must be last. */
|
||||||
|
|
||||||
|
-enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };
|
||||||
|
+enum {
|
||||||
|
+ RANGE_NO, /* State after '[' (initial), or '[a-z'; hyphen is literal */
|
||||||
|
+ RANGE_STARTED, /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
|
||||||
|
+ RANGE_FORBID_NO, /* State after '[\d'; '-]' is allowed but not '-1]' */
|
||||||
|
+ RANGE_FORBID_STARTED, /* State after '[\d-'*/
|
||||||
|
+ RANGE_OK_ESCAPED, /* State after '[\1'; hyphen may be a range */
|
||||||
|
+ RANGE_OK_LITERAL /* State after '[1'; hyphen may be a range */
|
||||||
|
+};
|
||||||
|
|
||||||
|
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
|
||||||
|
the storing of literal values in the main parsed pattern, where they can always
|
||||||
|
@@ -2734,6 +2741,7 @@ PCRE2_SPTR thisptr;
|
||||||
|
PCRE2_SPTR name;
|
||||||
|
PCRE2_SPTR ptrend = cb->end_pattern;
|
||||||
|
PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
|
||||||
|
+PCRE2_SPTR class_range_forbid_ptr = NULL;
|
||||||
|
named_group *ng;
|
||||||
|
nest_save *top_nest, *end_nests;
|
||||||
|
|
||||||
|
@@ -3559,6 +3567,21 @@ while (ptr < ptrend)
|
||||||
|
goto FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
|
+ /* Perl treats a hyphen after a POSIX class as a literal, not the
|
||||||
|
+ start of a range. However, it gives a warning in its warning mode
|
||||||
|
+ unless the hyphen is the last character in the class. PCRE does not
|
||||||
|
+ have a warning mode, so we give an error, because this is likely an
|
||||||
|
+ error on the user's part.
|
||||||
|
+
|
||||||
|
+ Roll back to the hyphen for the error position. */
|
||||||
|
+
|
||||||
|
+ if (class_range_state == RANGE_FORBID_STARTED)
|
||||||
|
+ {
|
||||||
|
+ ptr = class_range_forbid_ptr;
|
||||||
|
+ errorcode = ERR50;
|
||||||
|
+ goto FAILED;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
if (*ptr != CHAR_COLON)
|
||||||
|
{
|
||||||
|
errorcode = ERR13;
|
||||||
|
@@ -3579,26 +3602,12 @@ while (ptr < ptrend)
|
||||||
|
}
|
||||||
|
ptr = tempptr + 2;
|
||||||
|
|
||||||
|
- /* Perl treats a hyphen after a POSIX class as a literal, not the
|
||||||
|
- start of a range. However, it gives a warning in its warning mode
|
||||||
|
- unless the hyphen is the last character in the class. PCRE does not
|
||||||
|
- have a warning mode, so we give an error, because this is likely an
|
||||||
|
- error on the user's part. */
|
||||||
|
-
|
||||||
|
- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
|
||||||
|
- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
|
||||||
|
- {
|
||||||
|
- errorcode = ERR50;
|
||||||
|
- goto FAILED;
|
||||||
|
- }
|
||||||
|
-
|
||||||
|
- /* Set "a hyphen is not the start of a range" for the -] case, and also
|
||||||
|
- in case the POSIX class is followed by \E or \Q\E (possibly repeated -
|
||||||
|
- fuzzers do that kind of thing) and *then* a hyphen. This causes that
|
||||||
|
- hyphen to be treated as a literal. I don't think it's worth setting up
|
||||||
|
- special apparatus to do otherwise. */
|
||||||
|
+ /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
|
||||||
|
+ case, the hyphen is treated as a literal, but for '-1' it is disallowed
|
||||||
|
+ (because it would be interpreted as range). */
|
||||||
|
|
||||||
|
- class_range_state = RANGE_NO;
|
||||||
|
+ class_range_state = RANGE_FORBID_NO;
|
||||||
|
+ class_range_forbid_ptr = ptr;
|
||||||
|
|
||||||
|
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
|
||||||
|
of the POSIX classes are converted to use Unicode properties \p or \P
|
||||||
|
@@ -3648,6 +3657,14 @@ while (ptr < ptrend)
|
||||||
|
class_range_state = RANGE_STARTED;
|
||||||
|
}
|
||||||
|
|
||||||
|
+ /* Handle forbidden start of range */
|
||||||
|
+
|
||||||
|
+ else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
|
||||||
|
+ {
|
||||||
|
+ *parsed_pattern++ = CHAR_MINUS;
|
||||||
|
+ class_range_state = RANGE_FORBID_STARTED;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
/* Handle a literal character */
|
||||||
|
|
||||||
|
else if (c != CHAR_BACKSLASH)
|
||||||
|
@@ -3670,6 +3687,12 @@ while (ptr < ptrend)
|
||||||
|
}
|
||||||
|
class_range_state = RANGE_NO;
|
||||||
|
}
|
||||||
|
+ else if (class_range_state == RANGE_FORBID_STARTED)
|
||||||
|
+ {
|
||||||
|
+ ptr = class_range_forbid_ptr;
|
||||||
|
+ errorcode = ERR50;
|
||||||
|
+ goto FAILED;
|
||||||
|
+ }
|
||||||
|
else /* Potential start of range */
|
||||||
|
{
|
||||||
|
class_range_state = char_is_literal?
|
||||||
|
@@ -3733,13 +3756,23 @@ while (ptr < ptrend)
|
||||||
|
if (class_range_state == RANGE_STARTED)
|
||||||
|
{
|
||||||
|
errorcode = ERR50;
|
||||||
|
- goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
|
||||||
|
+ goto FAILED;
|
||||||
|
+ }
|
||||||
|
+ /* Perl gives a warning unless the hyphen following a multi-character
|
||||||
|
+ escape is the last character in the class. PCRE throws an error. */
|
||||||
|
+ if (class_range_state == RANGE_FORBID_STARTED)
|
||||||
|
+ {
|
||||||
|
+ ptr = class_range_forbid_ptr;
|
||||||
|
+ errorcode = ERR50;
|
||||||
|
+ goto FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Of the remaining escapes, only those that define characters are
|
||||||
|
allowed in a class. None may start a range. */
|
||||||
|
|
||||||
|
- class_range_state = RANGE_NO;
|
||||||
|
+ class_range_state = RANGE_FORBID_NO;
|
||||||
|
+ class_range_forbid_ptr = ptr;
|
||||||
|
+
|
||||||
|
switch(escape)
|
||||||
|
{
|
||||||
|
case ESC_N:
|
||||||
|
@@ -3779,6 +3812,7 @@ while (ptr < ptrend)
|
||||||
|
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
|
||||||
|
*parsed_pattern++ = META_ESCAPE + escape;
|
||||||
|
*parsed_pattern++ = (ptype << 16) | pdata;
|
||||||
|
+ class_range_forbid_ptr = ptr;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
errorcode = ERR45;
|
||||||
|
@@ -3791,16 +3825,6 @@ while (ptr < ptrend)
|
||||||
|
ptr--;
|
||||||
|
goto FAILED;
|
||||||
|
}
|
||||||
|
-
|
||||||
|
- /* Perl gives a warning unless a following hyphen is the last character
|
||||||
|
- in the class. PCRE throws an error. */
|
||||||
|
-
|
||||||
|
- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
|
||||||
|
- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
|
||||||
|
- {
|
||||||
|
- errorcode = ERR50;
|
||||||
|
- goto FAILED;
|
||||||
|
- }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Proceed to next thing in the class. */
|
||||||
|
diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h
|
||||||
|
index 598060c9..a11b4faa 100644
|
||||||
|
--- a/src/pcre2_intmodedep.h
|
||||||
|
+++ b/src/pcre2_intmodedep.h
|
||||||
|
@@ -435,7 +435,7 @@ UTF-16 mode. */
|
||||||
|
c = *eptr; \
|
||||||
|
if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
|
||||||
|
|
||||||
|
-/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the
|
||||||
|
+/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
|
||||||
|
pointer, incrementing length if there is a low surrogate. This is called when
|
||||||
|
we do not know if we are in UTF-16 mode. */
|
||||||
|
|
||||||
|
diff --git a/testdata/testinput1 b/testdata/testinput1
|
||||||
|
index 0794502e..1e50369f 100644
|
||||||
|
--- a/testdata/testinput1
|
||||||
|
+++ b/testdata/testinput1
|
||||||
|
@@ -5787,12 +5787,6 @@ ef) x/x,mark
|
||||||
|
|
||||||
|
/(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/
|
||||||
|
|
||||||
|
-/[s[:digit:]\E-H]+/
|
||||||
|
- s09-H
|
||||||
|
-
|
||||||
|
-/[s[:digit:]\Q\E-H]+/
|
||||||
|
- s09-H
|
||||||
|
-
|
||||||
|
/a+(?:|b)a/
|
||||||
|
aaaa
|
||||||
|
|
||||||
|
@@ -6435,4 +6429,18 @@ ef) x/x,mark
|
||||||
|
/(a\K.(?1)*)/
|
||||||
|
abac
|
||||||
|
|
||||||
|
+/[[:digit:]- ]/xx
|
||||||
|
+ 1
|
||||||
|
+ -
|
||||||
|
+\= Expect no match
|
||||||
|
+ z
|
||||||
|
+ \ \
|
||||||
|
+
|
||||||
|
+/[\d- ]/xx
|
||||||
|
+ 1
|
||||||
|
+ -
|
||||||
|
+\= Expect no match
|
||||||
|
+ z
|
||||||
|
+ \ \
|
||||||
|
+
|
||||||
|
# End of testinput1
|
||||||
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||||||
|
index b6464a0b..61b94e69 100644
|
||||||
|
--- a/testdata/testinput2
|
||||||
|
+++ b/testdata/testinput2
|
||||||
|
@@ -5981,4 +5981,12 @@ a)"xI
|
||||||
|
a
|
||||||
|
a\=noteol
|
||||||
|
|
||||||
|
+/[[:digit:] -Z]/xx
|
||||||
|
+
|
||||||
|
+/[\d -Z]/xx
|
||||||
|
+
|
||||||
|
+/[[:digit:]\E-H]/
|
||||||
|
+
|
||||||
|
+/[[:digit:]\Q\E-H]+/
|
||||||
|
+
|
||||||
|
# End of testinput2
|
||||||
|
diff --git a/testdata/testoutput1 b/testdata/testoutput1
|
||||||
|
index 8daf8362..6f927729 100644
|
||||||
|
--- a/testdata/testoutput1
|
||||||
|
+++ b/testdata/testoutput1
|
||||||
|
@@ -9246,14 +9246,6 @@ No match
|
||||||
|
|
||||||
|
/(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/
|
||||||
|
|
||||||
|
-/[s[:digit:]\E-H]+/
|
||||||
|
- s09-H
|
||||||
|
- 0: s09-H
|
||||||
|
-
|
||||||
|
-/[s[:digit:]\Q\E-H]+/
|
||||||
|
- s09-H
|
||||||
|
- 0: s09-H
|
||||||
|
-
|
||||||
|
/a+(?:|b)a/
|
||||||
|
aaaa
|
||||||
|
0: aaaa
|
||||||
|
@@ -10197,4 +10189,26 @@ No match
|
||||||
|
0: c
|
||||||
|
1: abac
|
||||||
|
|
||||||
|
+/[[:digit:]- ]/xx
|
||||||
|
+ 1
|
||||||
|
+ 0: 1
|
||||||
|
+ -
|
||||||
|
+ 0: -
|
||||||
|
+\= Expect no match
|
||||||
|
+ z
|
||||||
|
+No match
|
||||||
|
+ \ \
|
||||||
|
+No match
|
||||||
|
+
|
||||||
|
+/[\d- ]/xx
|
||||||
|
+ 1
|
||||||
|
+ 0: 1
|
||||||
|
+ -
|
||||||
|
+ 0: -
|
||||||
|
+\= Expect no match
|
||||||
|
+ z
|
||||||
|
+No match
|
||||||
|
+ \ \
|
||||||
|
+No match
|
||||||
|
+
|
||||||
|
# End of testinput1
|
||||||
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||||||
|
index 1075b4d4..86bfe964 100644
|
||||||
|
--- a/testdata/testoutput2
|
||||||
|
+++ b/testdata/testoutput2
|
||||||
|
@@ -17815,6 +17815,18 @@ Subject length lower bound = 2
|
||||||
|
a\=noteol
|
||||||
|
0: a
|
||||||
|
|
||||||
|
+/[[:digit:] -Z]/xx
|
||||||
|
+Failed: error 150 at offset 10: invalid range in character class
|
||||||
|
+
|
||||||
|
+/[\d -Z]/xx
|
||||||
|
+Failed: error 150 at offset 3: invalid range in character class
|
||||||
|
+
|
||||||
|
+/[[:digit:]\E-H]/
|
||||||
|
+Failed: error 150 at offset 10: invalid range in character class
|
||||||
|
+
|
||||||
|
+/[[:digit:]\Q\E-H]+/
|
||||||
|
+Failed: error 150 at offset 10: invalid range in character class
|
||||||
|
+
|
||||||
|
# End of testinput2
|
||||||
|
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
||||||
|
Error -62: bad serialized data
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
@ -0,0 +1,104 @@
|
|||||||
|
From fc56fd790c1a3ba8f2890fc2b6afba21250923de Mon Sep 17 00:00:00 2001
|
||||||
|
From: Philip Hazel <Philip.Hazel@gmail.com>
|
||||||
|
Date: Thu, 2 Feb 2023 17:19:45 +0000
|
||||||
|
Subject: [PATCH] Further ASCII tests and minor bugfix plus ChangeLog update
|
||||||
|
|
||||||
|
Conflict:don't modify ChangeLog
|
||||||
|
Reference:https://github.com/PCRE2Project/pcre2/commit/fc56fd790c1a3ba8f2890fc2b6afba21250923de
|
||||||
|
|
||||||
|
---
|
||||||
|
src/pcre2_compile.c | 5 ++---
|
||||||
|
testdata/testinput5 | 5 +++++
|
||||||
|
testdata/testinput7 | 5 +++++
|
||||||
|
testdata/testoutput5 | 7 +++++++
|
||||||
|
testdata/testoutput7 | 7 +++++++
|
||||||
|
5 files changed, 26 insertions(+), 3 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||||
|
index b8a9e098..64a35bda 100644
|
||||||
|
--- a/src/pcre2_compile.c
|
||||||
|
+++ b/src/pcre2_compile.c
|
||||||
|
@@ -2660,10 +2660,9 @@ the main compiling phase. */
|
||||||
|
PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
|
||||||
|
PCRE2_UNGREEDY)
|
||||||
|
|
||||||
|
-#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT)
|
||||||
|
-
|
||||||
|
#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \
|
||||||
|
- PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW)
|
||||||
|
+ PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
|
||||||
|
+ PCRE2_EXTRA_ASCII_POSIX)
|
||||||
|
|
||||||
|
/* States used for analyzing ranges in character classes. The two OK values
|
||||||
|
must be last. */
|
||||||
|
diff --git a/testdata/testinput5 b/testdata/testinput5
|
||||||
|
index 6e186cf0..49b46f82 100644
|
||||||
|
--- a/testdata/testinput5
|
||||||
|
+++ b/testdata/testinput5
|
||||||
|
@@ -2434,6 +2434,11 @@
|
||||||
|
/(?aP)[[:alnum:]\d]+/i,ucp,utf
|
||||||
|
abc\x{660}xyz
|
||||||
|
|
||||||
|
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
|
||||||
|
+ \x{660}A\x{660}
|
||||||
|
+\= Expect no match
|
||||||
|
+ \x{660}\x{660}\x{660}
|
||||||
|
+
|
||||||
|
# VARIOUS
|
||||||
|
|
||||||
|
/[\d\s\w]+/a,ucp,utf
|
||||||
|
diff --git a/testdata/testinput7 b/testdata/testinput7
|
||||||
|
index 64a37ad2..a2b7fb8d 100644
|
||||||
|
--- a/testdata/testinput7
|
||||||
|
+++ b/testdata/testinput7
|
||||||
|
@@ -2453,6 +2453,11 @@
|
||||||
|
/(?aP)[[:alnum:]\d]+/i,ucp,utf
|
||||||
|
abc\x{660}xyz
|
||||||
|
|
||||||
|
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
|
||||||
|
+ \x{660}A\x{660}
|
||||||
|
+\= Expect no match
|
||||||
|
+ \x{660}\x{660}\x{660}
|
||||||
|
+
|
||||||
|
# VARIOUS
|
||||||
|
|
||||||
|
/[\d\s\w]+/a,ucp,utf
|
||||||
|
diff --git a/testdata/testoutput5 b/testdata/testoutput5
|
||||||
|
index 26972f70..4f845c84 100644
|
||||||
|
--- a/testdata/testoutput5
|
||||||
|
+++ b/testdata/testoutput5
|
||||||
|
@@ -5365,6 +5365,13 @@ No match
|
||||||
|
abc\x{660}xyz
|
||||||
|
0: abc\x{660}xyz
|
||||||
|
|
||||||
|
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
|
||||||
|
+ \x{660}A\x{660}
|
||||||
|
+ 0: \x{660}A\x{660}
|
||||||
|
+\= Expect no match
|
||||||
|
+ \x{660}\x{660}\x{660}
|
||||||
|
+No match
|
||||||
|
+
|
||||||
|
# VARIOUS
|
||||||
|
|
||||||
|
/[\d\s\w]+/a,ucp,utf
|
||||||
|
diff --git a/testdata/testoutput7 b/testdata/testoutput7
|
||||||
|
index c830748c..4065981d 100644
|
||||||
|
--- a/testdata/testoutput7
|
||||||
|
+++ b/testdata/testoutput7
|
||||||
|
@@ -4105,6 +4105,13 @@ No match
|
||||||
|
abc\x{660}xyz
|
||||||
|
0: abc\x{660}xyz
|
||||||
|
|
||||||
|
+/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/
|
||||||
|
+ \x{660}A\x{660}
|
||||||
|
+ 0: \x{660}A\x{660}
|
||||||
|
+\= Expect no match
|
||||||
|
+ \x{660}\x{660}\x{660}
|
||||||
|
+No match
|
||||||
|
+
|
||||||
|
# VARIOUS
|
||||||
|
|
||||||
|
/[\d\s\w]+/a,ucp,utf
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
@ -0,0 +1,233 @@
|
|||||||
|
From ef218fbba60bfe5b0a8ac9ea4445eac5fb0847e5 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Alex Dowad <alexinbeijing@gmail.com>
|
||||||
|
Date: Sat, 7 Sep 2024 00:16:03 +0900
|
||||||
|
Subject: [PATCH] Guard against out-of-bounds memory access when parsing
|
||||||
|
LIMIT_HEAP et al (#463)
|
||||||
|
|
||||||
|
Patterns passed to pcre2_compile are not guaranteed to be
|
||||||
|
null-terminated. Also, it can happen that there is an invalid
|
||||||
|
pattern like this:
|
||||||
|
|
||||||
|
(*LIMIT_HEAP=123
|
||||||
|
|
||||||
|
If the next byte of memory after the end of the pattern happens
|
||||||
|
to be a digit, it will be parsed as part of the limit value. Or,
|
||||||
|
if the next byte is a right parenthesis character, it will be taken
|
||||||
|
as the end of the (*LIMIT_HEAP=nnn) construct.
|
||||||
|
|
||||||
|
This will result in `skipatstart` being larger than `patlen`, which
|
||||||
|
will result in underflow and an erroneous call to malloc requesting
|
||||||
|
a huge number of bytes.
|
||||||
|
---
|
||||||
|
src/pcre2_compile.c | 7 ++-
|
||||||
|
src/pcre2_internal.h | 3 +
|
||||||
|
src/pcre2_util.h | 132 ++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
testdata/testoutput15 | 4 +-
|
||||||
|
4 files changed, 141 insertions(+), 5 deletions(-)
|
||||||
|
create mode 100644 src/pcre2_util.h
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||||
|
index e6843bb13..410f220b3 100644
|
||||||
|
--- a/src/pcre2_compile.c
|
||||||
|
+++ b/src/pcre2_compile.c
|
||||||
|
@@ -10552,12 +10552,12 @@ if ((options & PCRE2_LITERAL) == 0)
|
||||||
|
ptr += pp;
|
||||||
|
goto HAD_EARLY_ERROR;
|
||||||
|
}
|
||||||
|
- while (IS_DIGIT(ptr[pp]))
|
||||||
|
+ while (pp < patlen && IS_DIGIT(ptr[pp]))
|
||||||
|
{
|
||||||
|
if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
|
||||||
|
c = c*10 + (ptr[pp++] - CHAR_0);
|
||||||
|
}
|
||||||
|
- if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
|
||||||
|
+ if (pp >= patlen || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
|
||||||
|
{
|
||||||
|
errorcode = ERR60;
|
||||||
|
ptr += pp;
|
||||||
|
@@ -10566,7 +10566,7 @@ if ((options & PCRE2_LITERAL) == 0)
|
||||||
|
if (p->type == PSO_LIMH) limit_heap = c;
|
||||||
|
else if (p->type == PSO_LIMM) limit_match = c;
|
||||||
|
else limit_depth = c;
|
||||||
|
- skipatstart += pp - skipatstart;
|
||||||
|
+ skipatstart = ++pp;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
break; /* Out of the table scan loop */
|
||||||
|
@@ -10574,6 +10574,7 @@ if ((options & PCRE2_LITERAL) == 0)
|
||||||
|
}
|
||||||
|
if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
|
||||||
|
}
|
||||||
|
+ PCRE2_ASSERT(skipatstart <= patlen);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* End of pattern-start options; advance to start of real regex. */
|
||||||
|
diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h
|
||||||
|
index d8fad1e..edb36ca 100644
|
||||||
|
--- a/src/pcre2_internal.h
|
||||||
|
+++ b/src/pcre2_internal.h
|
||||||
|
@@ -1999,6 +1999,9 @@ extern void * _pcre2_memmove(void *, const void *, size_t);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* PCRE2_CODE_UNIT_WIDTH */
|
||||||
|
+
|
||||||
|
+#include "pcre2_util.h"
|
||||||
|
+
|
||||||
|
#endif /* PCRE2_INTERNAL_H_IDEMPOTENT_GUARD */
|
||||||
|
|
||||||
|
/* End of pcre2_internal.h */
|
||||||
|
diff --git a/src/pcre2_util.h b/src/pcre2_util.h
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..ea86355
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/src/pcre2_util.h
|
||||||
|
@@ -0,0 +1,132 @@
|
||||||
|
+/*************************************************
|
||||||
|
+* Perl-Compatible Regular Expressions *
|
||||||
|
+*************************************************/
|
||||||
|
+
|
||||||
|
+/* PCRE2 is a library of functions to support regular expressions whose syntax
|
||||||
|
+and semantics are as close as possible to those of the Perl 5 language.
|
||||||
|
+
|
||||||
|
+ Written by Philip Hazel
|
||||||
|
+ Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||||
|
+ New API code Copyright (c) 2016-2024 University of Cambridge
|
||||||
|
+
|
||||||
|
+-----------------------------------------------------------------------------
|
||||||
|
+Redistribution and use in source and binary forms, with or without
|
||||||
|
+modification, are permitted provided that the following conditions are met:
|
||||||
|
+
|
||||||
|
+ * Redistributions of source code must retain the above copyright notice,
|
||||||
|
+ this list of conditions and the following disclaimer.
|
||||||
|
+
|
||||||
|
+ * Redistributions in binary form must reproduce the above copyright
|
||||||
|
+ notice, this list of conditions and the following disclaimer in the
|
||||||
|
+ documentation and/or other materials provided with the distribution.
|
||||||
|
+
|
||||||
|
+ * Neither the name of the University of Cambridge nor the names of its
|
||||||
|
+ contributors may be used to endorse or promote products derived from
|
||||||
|
+ this software without specific prior written permission.
|
||||||
|
+
|
||||||
|
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
+POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
+-----------------------------------------------------------------------------
|
||||||
|
+*/
|
||||||
|
+
|
||||||
|
+#ifndef PCRE2_UTIL_H_IDEMPOTENT_GUARD
|
||||||
|
+#define PCRE2_UTIL_H_IDEMPOTENT_GUARD
|
||||||
|
+
|
||||||
|
+/* Assertion macros */
|
||||||
|
+
|
||||||
|
+#ifdef PCRE2_DEBUG
|
||||||
|
+
|
||||||
|
+#if defined(HAVE_ASSERT_H) && !defined(NDEBUG)
|
||||||
|
+#include <assert.h>
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+/* PCRE2_ASSERT(x) can be used to inject an assert() for conditions
|
||||||
|
+that the code below doesn't support. It is a NOP for non debug builds
|
||||||
|
+but in debug builds will print information about the location of the
|
||||||
|
+code where it triggered and crash.
|
||||||
|
+
|
||||||
|
+It is meant to work like assert(), and therefore the expression used
|
||||||
|
+should indicate what the expected state is, and shouldn't have any
|
||||||
|
+side-effects. */
|
||||||
|
+
|
||||||
|
+#if defined(HAVE_ASSERT_H) && !defined(NDEBUG)
|
||||||
|
+#define PCRE2_ASSERT(x) assert(x)
|
||||||
|
+#else
|
||||||
|
+#define PCRE2_ASSERT(x) do \
|
||||||
|
+{ \
|
||||||
|
+ if (!(x)) \
|
||||||
|
+ { \
|
||||||
|
+ fprintf(stderr, "Assertion failed at " __FILE__ ":%d\n", __LINE__); \
|
||||||
|
+ abort(); \
|
||||||
|
+ } \
|
||||||
|
+} while(0)
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+/* PCRE2_UNREACHABLE() can be used to mark locations on the code that
|
||||||
|
+shouldn't be reached. In non debug builds is defined as a hint for
|
||||||
|
+the compiler to eliminate any code after it, so it is useful also for
|
||||||
|
+performance reasons, but should be used with care because if it is
|
||||||
|
+ever reached will trigger Undefined Behaviour and if you are lucky a
|
||||||
|
+crash. In debug builds it will report the location where it was triggered
|
||||||
|
+and crash. One important point to consider when using this macro, is
|
||||||
|
+that it is only implemented for a few compilers, and therefore can't
|
||||||
|
+be relied on to always be active either, so if it is followed by some
|
||||||
|
+code it is important to make sure that the whole thing is safe to
|
||||||
|
+use even if the macro is not there (ex: make sure there is a `break`
|
||||||
|
+after it if used at the end of a `case`) and to test your code also
|
||||||
|
+with a configuration where the macro will be a NOP. */
|
||||||
|
+
|
||||||
|
+#if defined(HAVE_ASSERT_H) && !defined(NDEBUG)
|
||||||
|
+#define PCRE2_UNREACHABLE() \
|
||||||
|
+assert(((void)"Execution reached unexpected point", 0))
|
||||||
|
+#else
|
||||||
|
+#define PCRE2_UNREACHABLE() do \
|
||||||
|
+{ \
|
||||||
|
+fprintf(stderr, "Execution reached unexpected point at " __FILE__ \
|
||||||
|
+ ":%d\n", __LINE__); \
|
||||||
|
+abort(); \
|
||||||
|
+} while(0)
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+/* PCRE2_DEBUG_UNREACHABLE() is a debug only version of the previous
|
||||||
|
+macro. It is meant to be used in places where the code is handling
|
||||||
|
+an error situation in code that shouldn't be reached, but that has
|
||||||
|
+some sort of fallback code to normally handle the error. When in
|
||||||
|
+doubt you should use this instead of the previous macro. Like in
|
||||||
|
+the previous case, it is a good idea to document as much as possible
|
||||||
|
+the reason and the actions that should be taken if it ever triggers. */
|
||||||
|
+
|
||||||
|
+#define PCRE2_DEBUG_UNREACHABLE() PCRE2_UNREACHABLE()
|
||||||
|
+
|
||||||
|
+#endif /* PCRE2_DEBUG */
|
||||||
|
+
|
||||||
|
+#ifndef PCRE2_DEBUG_UNREACHABLE
|
||||||
|
+#define PCRE2_DEBUG_UNREACHABLE() do {} while(0)
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+#ifndef PCRE2_UNREACHABLE
|
||||||
|
+#ifdef HAVE_BUILTIN_UNREACHABLE
|
||||||
|
+#define PCRE2_UNREACHABLE() __builtin_unreachable()
|
||||||
|
+#elif defined(HAVE_BUILTIN_ASSUME)
|
||||||
|
+#define PCRE2_UNREACHABLE() __assume(0)
|
||||||
|
+#else
|
||||||
|
+#define PCRE2_UNREACHABLE() do {} while(0)
|
||||||
|
+#endif
|
||||||
|
+#endif /* !PCRE2_UNREACHABLE */
|
||||||
|
+
|
||||||
|
+#ifndef PCRE2_ASSERT
|
||||||
|
+#define PCRE2_ASSERT(x) do {} while(0)
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+#endif /* PCRE2_UTIL_H_IDEMPOTENT_GUARD */
|
||||||
|
+
|
||||||
|
+/* End of pcre2_util.h */
|
||||||
|
diff --git a/testdata/testoutput15 b/testdata/testoutput15
|
||||||
|
index aa9c5c930..f36faeeaf 100644
|
||||||
|
--- a/testdata/testoutput15
|
||||||
|
+++ b/testdata/testoutput15
|
||||||
|
@@ -111,10 +111,10 @@ Minimum depth limit = 10
|
||||||
|
3: ee
|
||||||
|
|
||||||
|
/(*LIMIT_MATCH=12bc)abc/
|
||||||
|
-Failed: error 160 at offset 17: (*VERB) not recognized or malformed
|
||||||
|
+Failed: error 160 at offset 16: (*VERB) not recognized or malformed
|
||||||
|
|
||||||
|
/(*LIMIT_MATCH=4294967290)abc/
|
||||||
|
-Failed: error 160 at offset 24: (*VERB) not recognized or malformed
|
||||||
|
+Failed: error 160 at offset 23: (*VERB) not recognized or malformed
|
||||||
|
|
||||||
|
/(*LIMIT_DEPTH=4294967280)abc/I
|
||||||
|
Capture group count = 0
|
||||||
1649
backport-Implement-PCRE2_EXTRA_CASELESS_RESTRICT-and-related-.patch
Normal file
1649
backport-Implement-PCRE2_EXTRA_CASELESS_RESTRICT-and-related-.patch
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,68 @@
|
|||||||
|
From d704ee40c5324e5ff6c08f009a7aaa3b67b71565 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Nicholas Wilson <niwilson@microsoft.com>
|
||||||
|
Date: Fri, 27 Sep 2024 16:31:01 +0100
|
||||||
|
Subject: [PATCH] Improve error message for \N{name} in character classes
|
||||||
|
(#502)
|
||||||
|
|
||||||
|
---
|
||||||
|
src/pcre2_compile.c | 8 ++++++++
|
||||||
|
testdata/testinput2 | 6 ++++++
|
||||||
|
testdata/testoutput2 | 9 +++++++++
|
||||||
|
3 files changed, 23 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||||
|
index ec4940e63..fd554f1d2 100644
|
||||||
|
--- a/src/pcre2_compile.c
|
||||||
|
+++ b/src/pcre2_compile.c
|
||||||
|
@@ -1542,6 +1542,14 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
+ /* Give an error in contexts where quantifiers are not allowed
|
||||||
|
+ (character classes; substitution strings). */
|
||||||
|
+
|
||||||
|
+ else if (isclassorsub || cb == NULL)
|
||||||
|
+ {
|
||||||
|
+ *errorcodeptr = ERR37;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
/* Give an error if what follows is not a quantifier, but don't override
|
||||||
|
an error set by the quantifier reader (e.g. number overflow). */
|
||||||
|
|
||||||
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||||||
|
index c6ee980..a33d987 100644
|
||||||
|
--- a/testdata/testinput2
|
||||||
|
+++ b/testdata/testinput2
|
||||||
|
@@ -913,6 +913,12 @@
|
||||||
|
|
||||||
|
/\U/I
|
||||||
|
|
||||||
|
+/[\N]/
|
||||||
|
+
|
||||||
|
+/[\N{4}]/
|
||||||
|
+
|
||||||
|
+/[\N{name}]/
|
||||||
|
+
|
||||||
|
/a{1,3}b/ungreedy
|
||||||
|
ab
|
||||||
|
|
||||||
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||||||
|
index 2f2b3d1..4c07b72 100644
|
||||||
|
--- a/testdata/testoutput2
|
||||||
|
+++ b/testdata/testoutput2
|
||||||
|
@@ -3245,6 +3245,15 @@ Failed: error 137 at offset 2: PCRE2 does not support \F, \L, \l, \N{name}, \U,
|
||||||
|
/\U/I
|
||||||
|
Failed: error 137 at offset 2: PCRE2 does not support \F, \L, \l, \N{name}, \U, or \u
|
||||||
|
|
||||||
|
+/[\N]/
|
||||||
|
+Failed: error 171 at offset 3: \N is not supported in a class
|
||||||
|
+
|
||||||
|
+/[\N{4}]/
|
||||||
|
+Failed: error 137 at offset 3: PCRE2 does not support \F, \L, \l, \N{name}, \U, or \u
|
||||||
|
+
|
||||||
|
+/[\N{name}]/
|
||||||
|
+Failed: error 137 at offset 3: PCRE2 does not support \F, \L, \l, \N{name}, \U, or \u
|
||||||
|
+
|
||||||
|
/a{1,3}b/ungreedy
|
||||||
|
ab
|
||||||
|
0: ab
|
||||||
425
backport-Improve-error-offsets-for-character-classes-548.patch
Normal file
425
backport-Improve-error-offsets-for-character-classes-548.patch
Normal file
@ -0,0 +1,425 @@
|
|||||||
|
From 6185344ed8617ff84a08764e808e5b3667c34a7a Mon Sep 17 00:00:00 2001
|
||||||
|
From: Nicholas Wilson <niwilson@microsoft.com>
|
||||||
|
Date: Wed, 6 Nov 2024 08:45:46 +0000
|
||||||
|
Subject: [PATCH] Improve error offsets for character classes (#548)
|
||||||
|
|
||||||
|
Conflict:don't modify alt_extended_class because fc38d9e784 is not merged;
|
||||||
|
don't modify class_op_state because class_op_state is not merged; adapt context
|
||||||
|
Reference:https://github.com/PCRE2Project/pcre2/commit/6185344ed8617ff84a08764e808e5b3667c34a7a
|
||||||
|
|
||||||
|
* Error offset should be advanced by one character for "[\d-z]"
|
||||||
|
invalid range error
|
||||||
|
|
||||||
|
The code does a 1-char lookahead for a hyphen, but then doesn't
|
||||||
|
advance the pointer to consume the hyphen when returning the error.
|
||||||
|
|
||||||
|
Perl's error message (with "use warnings") does advance to just
|
||||||
|
after the hyphen, so PCRE2 should match.
|
||||||
|
|
||||||
|
Fixes #545.
|
||||||
|
|
||||||
|
* Also improve error offsets for [[:bad:]], [[=...=]] and [z-\p{...}]
|
||||||
|
cases
|
||||||
|
---
|
||||||
|
src/pcre2_compile.c | 67 +++++++++++++++++++-------------------
|
||||||
|
testdata/testinput2 | 8 +++++
|
||||||
|
testdata/testinput5 | 8 +++++
|
||||||
|
testdata/testoutput2 | 76 +++++++++++++++++++++++++-------------------
|
||||||
|
testdata/testoutput5 | 14 +++++++-
|
||||||
|
5 files changed, 106 insertions(+), 67 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||||
|
index 32db44db..290e759b 100644
|
||||||
|
--- a/src/pcre2_compile.c
|
||||||
|
+++ b/src/pcre2_compile.c
|
||||||
|
@@ -3563,6 +3563,7 @@ while (ptr < ptrend)
|
||||||
|
|
||||||
|
if (class_range_state == RANGE_STARTED)
|
||||||
|
{
|
||||||
|
+ ptr = tempptr + 2;
|
||||||
|
errorcode = ERR50;
|
||||||
|
goto FAILED;
|
||||||
|
}
|
||||||
|
@@ -3584,8 +3585,9 @@ while (ptr < ptrend)
|
||||||
|
|
||||||
|
if (*ptr != CHAR_COLON)
|
||||||
|
{
|
||||||
|
+ ptr = tempptr + 2;
|
||||||
|
errorcode = ERR13;
|
||||||
|
- goto FAILED_BACK;
|
||||||
|
+ goto FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
|
||||||
|
@@ -3595,19 +3597,18 @@ while (ptr < ptrend)
|
||||||
|
}
|
||||||
|
|
||||||
|
posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
|
||||||
|
+ ptr = tempptr + 2;
|
||||||
|
if (posix_class < 0)
|
||||||
|
{
|
||||||
|
errorcode = ERR30;
|
||||||
|
goto FAILED;
|
||||||
|
}
|
||||||
|
- ptr = tempptr + 2;
|
||||||
|
|
||||||
|
/* Set "a hyphen is forbidden to be the start of a range". For the '-]'
|
||||||
|
case, the hyphen is treated as a literal, but for '-1' it is disallowed
|
||||||
|
(because it would be interpreted as range). */
|
||||||
|
|
||||||
|
class_range_state = RANGE_FORBID_NO;
|
||||||
|
- class_range_forbid_ptr = ptr;
|
||||||
|
|
||||||
|
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
|
||||||
|
of the POSIX classes are converted to use Unicode properties \p or \P
|
||||||
|
@@ -3664,6 +3665,7 @@ while (ptr < ptrend)
|
||||||
|
{
|
||||||
|
*parsed_pattern++ = CHAR_MINUS;
|
||||||
|
class_range_state = RANGE_FORBID_STARTED;
|
||||||
|
+ class_range_forbid_ptr = ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Handle a literal character */
|
||||||
|
@@ -3746,37 +3748,8 @@ while (ptr < ptrend)
|
||||||
|
errorcode = ERR7;
|
||||||
|
ptr--;
|
||||||
|
goto FAILED;
|
||||||
|
- }
|
||||||
|
|
||||||
|
- /* The second part of a range can be a single-character escape
|
||||||
|
- sequence (detected above), but not any of the other escapes. Perl
|
||||||
|
- treats a hyphen as a literal in such circumstances. However, in Perl's
|
||||||
|
- warning mode, a warning is given, so PCRE now faults it, as it is
|
||||||
|
- almost certainly a mistake on the user's part. */
|
||||||
|
-
|
||||||
|
- if (class_range_state == RANGE_STARTED)
|
||||||
|
- {
|
||||||
|
- errorcode = ERR50;
|
||||||
|
- goto FAILED;
|
||||||
|
- }
|
||||||
|
- /* Perl gives a warning unless the hyphen following a multi-character
|
||||||
|
- escape is the last character in the class. PCRE throws an error. */
|
||||||
|
- if (class_range_state == RANGE_FORBID_STARTED)
|
||||||
|
- {
|
||||||
|
- ptr = class_range_forbid_ptr;
|
||||||
|
- errorcode = ERR50;
|
||||||
|
- goto FAILED;
|
||||||
|
- }
|
||||||
|
-
|
||||||
|
- /* Of the remaining escapes, only those that define characters are
|
||||||
|
- allowed in a class. None may start a range. */
|
||||||
|
-
|
||||||
|
- class_range_state = RANGE_FORBID_NO;
|
||||||
|
- class_range_forbid_ptr = ptr;
|
||||||
|
-
|
||||||
|
- switch(escape)
|
||||||
|
- {
|
||||||
|
- case ESC_N:
|
||||||
|
+ case ESC_N: /* Not permitted by Perl either */
|
||||||
|
errorcode = ERR71;
|
||||||
|
goto FAILED;
|
||||||
|
|
||||||
|
@@ -3813,7 +3786,6 @@ while (ptr < ptrend)
|
||||||
|
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
|
||||||
|
*parsed_pattern++ = META_ESCAPE + escape;
|
||||||
|
*parsed_pattern++ = (ptype << 16) | pdata;
|
||||||
|
- class_range_forbid_ptr = ptr;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
errorcode = ERR45;
|
||||||
|
@@ -3826,6 +3798,33 @@ while (ptr < ptrend)
|
||||||
|
ptr--;
|
||||||
|
goto FAILED;
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+ /* All the switch-cases above which end in "break" describe a set
|
||||||
|
+ of characters. None may start a range. */
|
||||||
|
+
|
||||||
|
+ /* The second part of a range can be a single-character escape
|
||||||
|
+ sequence (detected above), but not any of the other escapes. Perl
|
||||||
|
+ treats a hyphen as a literal in such circumstances. However, in Perl's
|
||||||
|
+ warning mode, a warning is given, so PCRE now faults it, as it is
|
||||||
|
+ almost certainly a mistake on the user's part. */
|
||||||
|
+
|
||||||
|
+ if (class_range_state == RANGE_STARTED)
|
||||||
|
+ {
|
||||||
|
+ errorcode = ERR50;
|
||||||
|
+ goto FAILED;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* Perl gives a warning unless the hyphen following a multi-character
|
||||||
|
+ escape is the last character in the class. PCRE throws an error. */
|
||||||
|
+
|
||||||
|
+ if (class_range_state == RANGE_FORBID_STARTED)
|
||||||
|
+ {
|
||||||
|
+ ptr = class_range_forbid_ptr;
|
||||||
|
+ errorcode = ERR50;
|
||||||
|
+ goto FAILED;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ class_range_state = RANGE_FORBID_NO;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Proceed to next thing in the class. */
|
||||||
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||||||
|
index 61b94e69..1fbb778e 100644
|
||||||
|
--- a/testdata/testinput2
|
||||||
|
+++ b/testdata/testinput2
|
||||||
|
@@ -7008,4 +7008,12 @@ a)"xI
|
||||||
|
|
||||||
|
/[[:digit:]\Q\E-H]+/
|
||||||
|
|
||||||
|
+/[z-[:space:]]/
|
||||||
|
+
|
||||||
|
+/[z-\d]/
|
||||||
|
+
|
||||||
|
+/[[:space:]-z]/
|
||||||
|
+
|
||||||
|
+/[\d-z]/
|
||||||
|
+
|
||||||
|
# End of testinput2
|
||||||
|
diff --git a/testdata/testinput5 b/testdata/testinput5
|
||||||
|
index 494371b5..f3faeb8f 100644
|
||||||
|
--- a/testdata/testinput5
|
||||||
|
+++ b/testdata/testinput5
|
||||||
|
@@ -2458,4 +2458,12 @@
|
||||||
|
/abc/utf,substitute_extended,replace=>\777<
|
||||||
|
abc
|
||||||
|
|
||||||
|
+/[z-\p{Lu}]/
|
||||||
|
+
|
||||||
|
+/[z-\pL]/
|
||||||
|
+
|
||||||
|
+/[\p{Lu}-z]/
|
||||||
|
+
|
||||||
|
+/[\pL-z]/
|
||||||
|
+
|
||||||
|
# End of testinput5
|
||||||
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||||||
|
index 86bfe964..99714596 100644
|
||||||
|
--- a/testdata/testoutput2
|
||||||
|
+++ b/testdata/testoutput2
|
||||||
|
@@ -2176,13 +2176,13 @@ Starting code units: % 0 1 A B C D E F G H I J K L M N O P Q R S T U V W
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[[.ch.]]/I
|
||||||
|
-Failed: error 113 at offset 1: POSIX collating elements are not supported
|
||||||
|
+Failed: error 113 at offset 7: POSIX collating elements are not supported
|
||||||
|
|
||||||
|
/[[=ch=]]/I
|
||||||
|
-Failed: error 113 at offset 1: POSIX collating elements are not supported
|
||||||
|
+Failed: error 113 at offset 7: POSIX collating elements are not supported
|
||||||
|
|
||||||
|
/[[:rhubarb:]]/I
|
||||||
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 12: unknown POSIX class name
|
||||||
|
|
||||||
|
/[[:upper:]]/Ii
|
||||||
|
Capture group count = 0
|
||||||
|
@@ -8722,31 +8722,31 @@ Failed: error 162 at offset 4: subpattern name expected
|
||||||
|
Failed: error 162 at offset 4: subpattern name expected
|
||||||
|
|
||||||
|
/[[:foo:]]/
|
||||||
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 8: unknown POSIX class name
|
||||||
|
|
||||||
|
/[[:1234:]]/
|
||||||
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 9: unknown POSIX class name
|
||||||
|
|
||||||
|
/[[:f\oo:]]/
|
||||||
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 9: unknown POSIX class name
|
||||||
|
|
||||||
|
/[[: :]]/
|
||||||
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 6: unknown POSIX class name
|
||||||
|
|
||||||
|
/[[:...:]]/
|
||||||
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 8: unknown POSIX class name
|
||||||
|
|
||||||
|
/[[:l\ower:]]/
|
||||||
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 11: unknown POSIX class name
|
||||||
|
|
||||||
|
/[[:abc\:]]/
|
||||||
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 9: unknown POSIX class name
|
||||||
|
|
||||||
|
/[abc[:x\]pqr:]]/
|
||||||
|
-Failed: error 130 at offset 6: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 14: unknown POSIX class name
|
||||||
|
|
||||||
|
/[[:a\dz:]]/
|
||||||
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 9: unknown POSIX class name
|
||||||
|
|
||||||
|
/(^(a|b\g<-1'c))/
|
||||||
|
Failed: error 157 at offset 8: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number
|
||||||
|
@@ -11409,7 +11409,7 @@ Failed: error 171 at offset 4: \N is not supported in a class
|
||||||
|
aNc
|
||||||
|
|
||||||
|
/a[B-\Nc]/
|
||||||
|
-Failed: error 150 at offset 6: invalid range in character class
|
||||||
|
+Failed: error 171 at offset 6: \N is not supported in a class
|
||||||
|
|
||||||
|
/a[B\Nc]/
|
||||||
|
Failed: error 171 at offset 5: \N is not supported in a class
|
||||||
|
@@ -13232,16 +13232,16 @@ Failed: error 178 at offset 5: digits missing in \x{} or \o{} or \N{U+}
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[a-[:digit:]]+/
|
||||||
|
-Failed: error 150 at offset 4: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 12: invalid range in character class
|
||||||
|
|
||||||
|
/[A-[:digit:]]+/
|
||||||
|
-Failed: error 150 at offset 4: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 12: invalid range in character class
|
||||||
|
|
||||||
|
/[a-[.xxx.]]+/
|
||||||
|
-Failed: error 150 at offset 4: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 10: invalid range in character class
|
||||||
|
|
||||||
|
/[a-[=xxx=]]+/
|
||||||
|
-Failed: error 150 at offset 4: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 10: invalid range in character class
|
||||||
|
|
||||||
|
/[a-[!xxx!]]+/
|
||||||
|
Failed: error 108 at offset 3: range out of order in character class
|
||||||
|
@@ -13362,7 +13362,7 @@ No match
|
||||||
|
No match
|
||||||
|
|
||||||
|
/[a[:<:]] should give error/
|
||||||
|
-Failed: error 130 at offset 4: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 7: unknown POSIX class name
|
||||||
|
|
||||||
|
/(?=ab\K)/aftertext,allow_lookaround_bsk
|
||||||
|
abcd\=startchar
|
||||||
|
@@ -15510,11 +15510,11 @@ Failed: error 125 at offset 13: lookbehind assertion is not fixed length
|
||||||
|
# Perl accepts these, but gives a warning. We can't warn, so give an error.
|
||||||
|
|
||||||
|
/[a-[:digit:]]+/
|
||||||
|
-Failed: error 150 at offset 4: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 12: invalid range in character class
|
||||||
|
a-a9-a
|
||||||
|
|
||||||
|
/[A-[:digit:]]+/
|
||||||
|
-Failed: error 150 at offset 4: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 12: invalid range in character class
|
||||||
|
A-A9-A
|
||||||
|
|
||||||
|
/[a-\d]+/
|
||||||
|
@@ -15651,7 +15651,7 @@ Failed: error 128 at offset 63: assertion expected after (?( or (?(?C)
|
||||||
|
.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X
|
||||||
|
|
||||||
|
/[:[:alnum:]-[[a:lnum:]+/
|
||||||
|
-Failed: error 150 at offset 11: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 12: invalid range in character class
|
||||||
|
|
||||||
|
/((?(?C'')\QX\E(?!((?(?C'')(?!X=X));=)r*X=X));=)/
|
||||||
|
Failed: error 128 at offset 11: assertion expected after (?( or (?(?C)
|
||||||
|
@@ -16285,10 +16285,10 @@ Subject length lower bound = 3
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[Q-\N]/B,bad_escape_is_literal
|
||||||
|
-Failed: error 150 at offset 5: invalid range in character class
|
||||||
|
+Failed: error 171 at offset 5: \N is not supported in a class
|
||||||
|
|
||||||
|
/[\s-_]/bad_escape_is_literal
|
||||||
|
-Failed: error 150 at offset 3: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 4: invalid range in character class
|
||||||
|
|
||||||
|
/[_-\s]/bad_escape_is_literal
|
||||||
|
Failed: error 150 at offset 5: invalid range in character class
|
||||||
|
@@ -16443,19 +16443,19 @@ No match
|
||||||
|
No match
|
||||||
|
|
||||||
|
/[[:digit:]-a]/
|
||||||
|
-Failed: error 150 at offset 10: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 11: invalid range in character class
|
||||||
|
|
||||||
|
/[[:digit:]-[:print:]]/
|
||||||
|
-Failed: error 150 at offset 10: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 11: invalid range in character class
|
||||||
|
|
||||||
|
/[\d-a]/
|
||||||
|
-Failed: error 150 at offset 3: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 4: invalid range in character class
|
||||||
|
|
||||||
|
/[\H-z]/
|
||||||
|
-Failed: error 150 at offset 3: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 4: invalid range in character class
|
||||||
|
|
||||||
|
/[\d-[:print:]]/
|
||||||
|
-Failed: error 150 at offset 3: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 4: invalid range in character class
|
||||||
|
|
||||||
|
# Perl gets the second of these wrong, giving no match.
|
||||||
|
|
||||||
|
@@ -17816,16 +17816,28 @@ Subject length lower bound = 2
|
||||||
|
0: a
|
||||||
|
|
||||||
|
/[[:digit:] -Z]/xx
|
||||||
|
-Failed: error 150 at offset 10: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 14: invalid range in character class
|
||||||
|
|
||||||
|
/[\d -Z]/xx
|
||||||
|
-Failed: error 150 at offset 3: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 7: invalid range in character class
|
||||||
|
|
||||||
|
/[[:digit:]\E-H]/
|
||||||
|
-Failed: error 150 at offset 10: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 13: invalid range in character class
|
||||||
|
|
||||||
|
/[[:digit:]\Q\E-H]+/
|
||||||
|
-Failed: error 150 at offset 10: invalid range in character class
|
||||||
|
+Failed: error 150 at offset 15: invalid range in character class
|
||||||
|
+
|
||||||
|
+/[z-[:space:]]/
|
||||||
|
+Failed: error 150 at offset 12: invalid range in character class
|
||||||
|
+
|
||||||
|
+/[z-\d]/
|
||||||
|
+Failed: error 150 at offset 5: invalid range in character class
|
||||||
|
+
|
||||||
|
+/[[:space:]-z]/
|
||||||
|
+Failed: error 150 at offset 11: invalid range in character class
|
||||||
|
+
|
||||||
|
+/[\d-z]/
|
||||||
|
+Failed: error 150 at offset 4: invalid range in character class
|
||||||
|
|
||||||
|
# End of testinput2
|
||||||
|
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
||||||
|
diff --git a/testdata/testoutput5 b/testdata/testoutput5
|
||||||
|
index bf06ee12..0dba11c6 100644
|
||||||
|
--- a/testdata/testoutput5
|
||||||
|
+++ b/testdata/testoutput5
|
||||||
|
@@ -795,7 +795,7 @@ No match
|
||||||
|
No match
|
||||||
|
|
||||||
|
/[[:a\x{100}b:]]/utf
|
||||||
|
-Failed: error 130 at offset 3: unknown POSIX class name
|
||||||
|
+Failed: error 130 at offset 14: unknown POSIX class name
|
||||||
|
|
||||||
|
/a[^]b/utf,allow_empty_class,match_unset_backref
|
||||||
|
a\x{1234}b
|
||||||
|
@@ -5403,4 +5403,16 @@ No match
|
||||||
|
abc
|
||||||
|
1: >\x{1ff}<
|
||||||
|
|
||||||
|
+/[z-\p{Lu}]/
|
||||||
|
+Failed: error 150 at offset 9: invalid range in character class
|
||||||
|
+
|
||||||
|
+/[z-\pL]/
|
||||||
|
+Failed: error 150 at offset 6: invalid range in character class
|
||||||
|
+
|
||||||
|
+/[\p{Lu}-z]/
|
||||||
|
+Failed: error 150 at offset 8: invalid range in character class
|
||||||
|
+
|
||||||
|
+/[\pL-z]/
|
||||||
|
+Failed: error 150 at offset 5: invalid range in character class
|
||||||
|
+
|
||||||
|
# End of testinput5
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
@ -0,0 +1,223 @@
|
|||||||
|
From f34fc0a34ab18d7cb0ff27eacaea43912d797a27 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Philip Hazel <Philip.Hazel@gmail.com>
|
||||||
|
Date: Wed, 27 Nov 2024 15:15:45 +0000
|
||||||
|
Subject: [PATCH] Mend a bug in pcre2grep that caused separator lines to
|
||||||
|
be
|
||||||
|
incorrectly inserted in some cases when above/below context lines are
|
||||||
|
contiguous. Reported by Alejandro Colomar <alx@kernel.org>. Fixes
|
||||||
|
GitHub
|
||||||
|
issue #577.
|
||||||
|
|
||||||
|
Conflict:adapt context; don't modify ChangeLog; don't use
|
||||||
|
group_separator because e179a4b8c is not merged
|
||||||
|
Reference:https://github.com/PCRE2Project/pcre2/commit/f34fc0a34ab18d7cb0ff27eacaea43912d797a27
|
||||||
|
|
||||||
|
---
|
||||||
|
RunGrepTest | 6 +++++-
|
||||||
|
src/pcre2grep.c | 19 ++++++++++++++++--
|
||||||
|
testdata/grepinput | 19 ++++++++++++++++++
|
||||||
|
testdata/grepoutput | 48 ++++++++++++++++++++++++++++++++++++---------
|
||||||
|
4 files changed, 80 insertions(+), 12 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/RunGrepTest b/RunGrepTest
|
||||||
|
index 0a00e82..0d57707 100755
|
||||||
|
--- a/RunGrepTest
|
||||||
|
+++ b/RunGrepTest
|
||||||
|
@@ -853,7 +853,11 @@ fi
|
||||||
|
echo "---------------------------- Test 151 -----------------------------" >>testtrygrep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep --colour=always -e this -e The -e 'The wo' testdata/grepinputv) >>testtrygrep
|
||||||
|
|
||||||
|
-
|
||||||
|
+echo "---------------------------- Test 160 -----------------------------" >>testtrygrep
|
||||||
|
+(cd $srcdir; $valgrind $vjs $pcre2grep -nC3 '^(ert|jkl)' ./testdata/grepinput) >>testtrygrep
|
||||||
|
+echo "RC=$?" >>testtrygrep
|
||||||
|
+(cd $srcdir; $valgrind $vjs $pcre2grep -n -B4 -A2 '^(ert|dfg)' ./testdata/grepinput) >>testtrygrep
|
||||||
|
+echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
|
||||||
|
# Now compare the results.
|
||||||
|
diff --git a/src/pcre2grep.c b/src/pcre2grep.c
|
||||||
|
index 6a5841c..3b79f26 100644
|
||||||
|
--- a/src/pcre2grep.c
|
||||||
|
+++ b/src/pcre2grep.c
|
||||||
|
@@ -2940,12 +2940,15 @@ while (ptr < endptr)
|
||||||
|
FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
||||||
|
lastmatchrestart = pp;
|
||||||
|
}
|
||||||
|
+
|
||||||
|
if (lastmatchrestart != ptr) hyphenpending = TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
- /* If there were non-contiguous lines printed above, insert hyphens. */
|
||||||
|
+ /* If hyphenpending is TRUE when there is no "after" context, it means we
|
||||||
|
+ are at the start of a new file, having output something from the previous
|
||||||
|
+ file. Output a separator if enabled.*/
|
||||||
|
|
||||||
|
- if (hyphenpending)
|
||||||
|
+ else if (hyphenpending)
|
||||||
|
{
|
||||||
|
fprintf(stdout, "--" STDOUT_NL);
|
||||||
|
hyphenpending = FALSE;
|
||||||
|
@@ -2970,6 +2973,7 @@ while (ptr < endptr)
|
||||||
|
|
||||||
|
if (lastmatchnumber > 0 && p > lastmatchrestart && !hyphenprinted)
|
||||||
|
fprintf(stdout, "--" STDOUT_NL);
|
||||||
|
+ hyphenpending = FALSE;
|
||||||
|
|
||||||
|
while (p < ptr)
|
||||||
|
{
|
||||||
|
@@ -2984,12 +2988,23 @@ while (ptr < endptr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
+ /* If hyphenpending is TRUE here, it was set after outputting some
|
||||||
|
+ "after" lines (and there are no "before" lines). */
|
||||||
|
+
|
||||||
|
+ else if (hyphenpending)
|
||||||
|
+ {
|
||||||
|
+ fprintf(stdout, "--" STDOUT_NL);
|
||||||
|
+ hyphenpending = FALSE;
|
||||||
|
+ hyphenprinted = TRUE;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
/* Now print the matching line(s); ensure we set hyphenpending at the end
|
||||||
|
of the file if any context lines are being output. */
|
||||||
|
|
||||||
|
if (after_context > 0 || before_context > 0)
|
||||||
|
endhyphenpending = TRUE;
|
||||||
|
|
||||||
|
+
|
||||||
|
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||||
|
printname_colon);
|
||||||
|
if (number) fprintf(stdout, "%lu:", linenumber);
|
||||||
|
diff --git a/testdata/grepinput b/testdata/grepinput
|
||||||
|
index 1e2ceb4..91d3db8 100644
|
||||||
|
--- a/testdata/grepinput
|
||||||
|
+++ b/testdata/grepinput
|
||||||
|
@@ -617,6 +617,25 @@ match 5:
|
||||||
|
Rhubarb
|
||||||
|
Custard Tart
|
||||||
|
|
||||||
|
+zxc
|
||||||
|
+cvb
|
||||||
|
+bnm
|
||||||
|
+asd
|
||||||
|
+qwe
|
||||||
|
+ert
|
||||||
|
+tyu
|
||||||
|
+uio
|
||||||
|
+ggg
|
||||||
|
+asd
|
||||||
|
+dfg
|
||||||
|
+ghj
|
||||||
|
+jkl
|
||||||
|
+abc
|
||||||
|
+def
|
||||||
|
+ghi
|
||||||
|
+xyz
|
||||||
|
+
|
||||||
|
+
|
||||||
|
PUT NEW DATA ABOVE THIS LINE.
|
||||||
|
=============================
|
||||||
|
|
||||||
|
diff --git a/testdata/grepoutput b/testdata/grepoutput
|
||||||
|
index aa53aab..df658ed 100644
|
||||||
|
--- a/testdata/grepoutput
|
||||||
|
+++ b/testdata/grepoutput
|
||||||
|
@@ -10,7 +10,7 @@ RC=0
|
||||||
|
7:PATTERN at the start of a line.
|
||||||
|
8:In the middle of a line, PATTERN appears.
|
||||||
|
10:This pattern is in lower case.
|
||||||
|
-623:Check up on PATTERN near the end.
|
||||||
|
+642:Check up on PATTERN near the end.
|
||||||
|
RC=0
|
||||||
|
---------------------------- Test 4 ------------------------------
|
||||||
|
4
|
||||||
|
@@ -19,7 +19,7 @@ RC=0
|
||||||
|
./testdata/grepinput:7:PATTERN at the start of a line.
|
||||||
|
./testdata/grepinput:8:In the middle of a line, PATTERN appears.
|
||||||
|
./testdata/grepinput:10:This pattern is in lower case.
|
||||||
|
-./testdata/grepinput:623:Check up on PATTERN near the end.
|
||||||
|
+./testdata/grepinput:642:Check up on PATTERN near the end.
|
||||||
|
./testdata/grepinputx:3:Here is the pattern again.
|
||||||
|
./testdata/grepinputx:5:Pattern
|
||||||
|
./testdata/grepinputx:42:This line contains pattern not on a line by itself.
|
||||||
|
@@ -28,7 +28,7 @@ RC=0
|
||||||
|
7:PATTERN at the start of a line.
|
||||||
|
8:In the middle of a line, PATTERN appears.
|
||||||
|
10:This pattern is in lower case.
|
||||||
|
-623:Check up on PATTERN near the end.
|
||||||
|
+642:Check up on PATTERN near the end.
|
||||||
|
3:Here is the pattern again.
|
||||||
|
5:Pattern
|
||||||
|
42:This line contains pattern not on a line by itself.
|
||||||
|
@@ -104,6 +104,7 @@ pcre2grep: Error in command-line regex at offset 4: quantifier does not follow a
|
||||||
|
RC=2
|
||||||
|
---------------------------- Test 16 -----------------------------
|
||||||
|
pcre2grep: Failed to open ./testdata/nonexistfile: No such file or directory
|
||||||
|
+./testdata/grepinput:abc
|
||||||
|
RC=2
|
||||||
|
---------------------------- Test 17 -----------------------------
|
||||||
|
features should be added at the end, because some of the tests involve the
|
||||||
|
@@ -324,10 +325,10 @@ RC=0
|
||||||
|
./testdata/grepinput-9-
|
||||||
|
./testdata/grepinput:10:This pattern is in lower case.
|
||||||
|
--
|
||||||
|
-./testdata/grepinput-620-PUT NEW DATA ABOVE THIS LINE.
|
||||||
|
-./testdata/grepinput-621-=============================
|
||||||
|
-./testdata/grepinput-622-
|
||||||
|
-./testdata/grepinput:623:Check up on PATTERN near the end.
|
||||||
|
+./testdata/grepinput-639-PUT NEW DATA ABOVE THIS LINE.
|
||||||
|
+./testdata/grepinput-640-=============================
|
||||||
|
+./testdata/grepinput-641-
|
||||||
|
+./testdata/grepinput:642:Check up on PATTERN near the end.
|
||||||
|
--
|
||||||
|
./testdata/grepinputx-1-This is a second file of input for the pcregrep tests.
|
||||||
|
./testdata/grepinputx-2-
|
||||||
|
@@ -349,8 +350,8 @@ RC=0
|
||||||
|
./testdata/grepinput-12-Here follows a whole lot of stuff that makes the file over 24KiB long.
|
||||||
|
./testdata/grepinput-13-
|
||||||
|
--
|
||||||
|
-./testdata/grepinput:623:Check up on PATTERN near the end.
|
||||||
|
-./testdata/grepinput-624-This is the last line of this file.
|
||||||
|
+./testdata/grepinput:642:Check up on PATTERN near the end.
|
||||||
|
+./testdata/grepinput-643-This is the last line of this file.
|
||||||
|
--
|
||||||
|
./testdata/grepinputx:3:Here is the pattern again.
|
||||||
|
./testdata/grepinputx-4-
|
||||||
|
@@ -1232,3 +1233,32 @@ RC=2
|
||||||
|
[1;31mThe wo[0mrd is cat in [1;31mthis[0m line
|
||||||
|
[1;31mThe[0m caterpillar sat on the mat
|
||||||
|
[1;31mThe[0m snowcat is not an animal
|
||||||
|
+---------------------------- Test 160 -----------------------------
|
||||||
|
+622-bnm
|
||||||
|
+623-asd
|
||||||
|
+624-qwe
|
||||||
|
+625:ert
|
||||||
|
+626-tyu
|
||||||
|
+627-uio
|
||||||
|
+628-ggg
|
||||||
|
+629-asd
|
||||||
|
+630-dfg
|
||||||
|
+631-ghj
|
||||||
|
+632:jkl
|
||||||
|
+633-abc
|
||||||
|
+634-def
|
||||||
|
+635-ghi
|
||||||
|
+RC=0
|
||||||
|
+621-cvb
|
||||||
|
+622-bnm
|
||||||
|
+623-asd
|
||||||
|
+624-qwe
|
||||||
|
+625:ert
|
||||||
|
+626-tyu
|
||||||
|
+627-uio
|
||||||
|
+628-ggg
|
||||||
|
+629-asd
|
||||||
|
+630:dfg
|
||||||
|
+631-ghj
|
||||||
|
+632-jkl
|
||||||
|
+RC=0
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
459
backport-Non-recursive-scan-prefix-in-JIT-560.patch
Normal file
459
backport-Non-recursive-scan-prefix-in-JIT-560.patch
Normal file
@ -0,0 +1,459 @@
|
|||||||
|
From 6f2da25f009ff463cd9357ae5ebe452fbec8ab5c Mon Sep 17 00:00:00 2001
|
||||||
|
From: Zoltan Herczeg <zherczeg7@gmail.com>
|
||||||
|
Date: Fri, 15 Nov 2024 13:21:03 +0100
|
||||||
|
Subject: [PATCH] Non-recursive scan prefix in JIT (#560)
|
||||||
|
|
||||||
|
Conflict:NA
|
||||||
|
Reference:https://github.com/PCRE2Project/pcre2/commit/6f2da25f009ff463cd9357ae5ebe452fbec8ab5c
|
||||||
|
|
||||||
|
---
|
||||||
|
src/pcre2_jit_compile.c | 238 ++++++++++++++++++++++++++++------------
|
||||||
|
src/pcre2_jit_test.c | 1 +
|
||||||
|
2 files changed, 168 insertions(+), 71 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c
|
||||||
|
index 127c393d..4449d59f 100644
|
||||||
|
--- a/src/pcre2_jit_compile.c
|
||||||
|
+++ b/src/pcre2_jit_compile.c
|
||||||
|
@@ -5670,11 +5670,38 @@ if (last)
|
||||||
|
chars->last_count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
-static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars, int max_chars, sljit_u32 *rec_count)
|
||||||
|
+/* Value can be increased if needed. Patterns
|
||||||
|
+such as /(a|){33}b/ can exhaust the stack.
|
||||||
|
+
|
||||||
|
+Note: /(a|){29}b/ already stops scan_prefix()
|
||||||
|
+because it reaches the maximum step_count. */
|
||||||
|
+#define SCAN_PREFIX_STACK_END 32
|
||||||
|
+
|
||||||
|
+/*
|
||||||
|
+Scan prefix stores the prefix string in the chars array.
|
||||||
|
+The elements of the chars array are either small character
|
||||||
|
+sets or "any" (count is set to 255).
|
||||||
|
+
|
||||||
|
+Examples (the chars array is represented by a simple regex):
|
||||||
|
+
|
||||||
|
+/(abc|xbyd)/ prefix: /[ax]b[cy]/ (length: 3)
|
||||||
|
+/a[a-z]b+c/ prefix: a.b (length: 3)
|
||||||
|
+/ab?cd/ prefix: a[bc][cd] (length: 3)
|
||||||
|
+/(ab|cd)|(ef|gh)/ prefix: [aceg][bdfh] (length: 2)
|
||||||
|
+
|
||||||
|
+The length is returned by scan_prefix(). The length is
|
||||||
|
+less than or equal to the minimum length of the pattern.
|
||||||
|
+*/
|
||||||
|
+
|
||||||
|
+static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars)
|
||||||
|
{
|
||||||
|
-/* Recursive function, which scans prefix literals. */
|
||||||
|
+fast_forward_char_data *chars_start = chars;
|
||||||
|
+fast_forward_char_data *chars_end = chars + MAX_N_CHARS;
|
||||||
|
+PCRE2_SPTR cc_stack[SCAN_PREFIX_STACK_END];
|
||||||
|
+fast_forward_char_data *chars_stack[SCAN_PREFIX_STACK_END];
|
||||||
|
+sljit_u8 next_alternative_stack[SCAN_PREFIX_STACK_END];
|
||||||
|
BOOL last, any, class, caseless;
|
||||||
|
-int len, repeat, len_save, consumed = 0;
|
||||||
|
+int stack_ptr, step_count, repeat, len, len_save;
|
||||||
|
sljit_u32 chr; /* Any unicode character. */
|
||||||
|
sljit_u8 *bytes, *bytes_end, byte;
|
||||||
|
PCRE2_SPTR alternative, cc_save, oc;
|
||||||
|
@@ -5687,11 +5714,44 @@ PCRE2_UCHAR othercase[1];
|
||||||
|
#endif
|
||||||
|
|
||||||
|
repeat = 1;
|
||||||
|
+stack_ptr = 0;
|
||||||
|
+step_count = 10000;
|
||||||
|
while (TRUE)
|
||||||
|
{
|
||||||
|
- if (*rec_count == 0)
|
||||||
|
+ if (--step_count == 0)
|
||||||
|
return 0;
|
||||||
|
- (*rec_count)--;
|
||||||
|
+
|
||||||
|
+ SLJIT_ASSERT(chars <= chars_start + MAX_N_CHARS);
|
||||||
|
+
|
||||||
|
+ if (chars >= chars_end)
|
||||||
|
+ {
|
||||||
|
+ if (stack_ptr == 0)
|
||||||
|
+ return (int)(chars_end - chars_start);
|
||||||
|
+
|
||||||
|
+ --stack_ptr;
|
||||||
|
+ cc = cc_stack[stack_ptr];
|
||||||
|
+ chars = chars_stack[stack_ptr];
|
||||||
|
+
|
||||||
|
+ if (chars >= chars_end)
|
||||||
|
+ continue;
|
||||||
|
+
|
||||||
|
+ if (next_alternative_stack[stack_ptr] != 0)
|
||||||
|
+ {
|
||||||
|
+ /* When an alternative is processed, the
|
||||||
|
+ next alternative is pushed onto the stack. */
|
||||||
|
+ SLJIT_ASSERT(*cc == OP_ALT);
|
||||||
|
+ alternative = cc + GET(cc, 1);
|
||||||
|
+ if (*alternative == OP_ALT)
|
||||||
|
+ {
|
||||||
|
+ SLJIT_ASSERT(stack_ptr < SCAN_PREFIX_STACK_END);
|
||||||
|
+ SLJIT_ASSERT(chars_stack[stack_ptr] == chars);
|
||||||
|
+ SLJIT_ASSERT(next_alternative_stack[stack_ptr] == 1);
|
||||||
|
+ cc_stack[stack_ptr] = alternative;
|
||||||
|
+ stack_ptr++;
|
||||||
|
+ }
|
||||||
|
+ cc += 1 + LINK_SIZE;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
|
||||||
|
last = TRUE;
|
||||||
|
any = FALSE;
|
||||||
|
@@ -5768,9 +5828,17 @@ while (TRUE)
|
||||||
|
#ifdef SUPPORT_UNICODE
|
||||||
|
if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc);
|
||||||
|
#endif
|
||||||
|
- max_chars = scan_prefix(common, cc + len, chars, max_chars, rec_count);
|
||||||
|
- if (max_chars == 0)
|
||||||
|
- return consumed;
|
||||||
|
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ cc_stack[stack_ptr] = cc + len;
|
||||||
|
+ chars_stack[stack_ptr] = chars;
|
||||||
|
+ next_alternative_stack[stack_ptr] = 0;
|
||||||
|
+ stack_ptr++;
|
||||||
|
+
|
||||||
|
last = FALSE;
|
||||||
|
break;
|
||||||
|
|
||||||
|
@@ -5788,12 +5856,18 @@ while (TRUE)
|
||||||
|
case OP_CBRA:
|
||||||
|
case OP_CBRAPOS:
|
||||||
|
alternative = cc + GET(cc, 1);
|
||||||
|
- while (*alternative == OP_ALT)
|
||||||
|
+ if (*alternative == OP_ALT)
|
||||||
|
{
|
||||||
|
- max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, max_chars, rec_count);
|
||||||
|
- if (max_chars == 0)
|
||||||
|
- return consumed;
|
||||||
|
- alternative += GET(alternative, 1);
|
||||||
|
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ cc_stack[stack_ptr] = alternative;
|
||||||
|
+ chars_stack[stack_ptr] = chars;
|
||||||
|
+ next_alternative_stack[stack_ptr] = 1;
|
||||||
|
+ stack_ptr++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*cc == OP_CBRA || *cc == OP_CBRAPOS)
|
||||||
|
@@ -5804,14 +5878,21 @@ while (TRUE)
|
||||||
|
case OP_CLASS:
|
||||||
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
if (common->utf && !is_char7_bitset((const sljit_u8 *)(cc + 1), FALSE))
|
||||||
|
- return consumed;
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
#endif
|
||||||
|
class = TRUE;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case OP_NCLASS:
|
||||||
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||||
|
- if (common->utf) return consumed;
|
||||||
|
+ if (common->utf)
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
#endif
|
||||||
|
class = TRUE;
|
||||||
|
break;
|
||||||
|
@@ -5819,7 +5900,11 @@ while (TRUE)
|
||||||
|
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
|
||||||
|
case OP_XCLASS:
|
||||||
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||||
|
- if (common->utf) return consumed;
|
||||||
|
+ if (common->utf)
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
#endif
|
||||||
|
any = TRUE;
|
||||||
|
cc += GET(cc, 1);
|
||||||
|
@@ -5829,7 +5914,10 @@ while (TRUE)
|
||||||
|
case OP_DIGIT:
|
||||||
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_digit, FALSE))
|
||||||
|
- return consumed;
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
#endif
|
||||||
|
any = TRUE;
|
||||||
|
cc++;
|
||||||
|
@@ -5838,7 +5926,10 @@ while (TRUE)
|
||||||
|
case OP_WHITESPACE:
|
||||||
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_space, FALSE))
|
||||||
|
- return consumed;
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
#endif
|
||||||
|
any = TRUE;
|
||||||
|
cc++;
|
||||||
|
@@ -5847,7 +5938,10 @@ while (TRUE)
|
||||||
|
case OP_WORDCHAR:
|
||||||
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_word, FALSE))
|
||||||
|
- return consumed;
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
#endif
|
||||||
|
any = TRUE;
|
||||||
|
cc++;
|
||||||
|
@@ -5863,7 +5957,11 @@ while (TRUE)
|
||||||
|
case OP_ANY:
|
||||||
|
case OP_ALLANY:
|
||||||
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||||
|
- if (common->utf) return consumed;
|
||||||
|
+ if (common->utf)
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
#endif
|
||||||
|
any = TRUE;
|
||||||
|
cc++;
|
||||||
|
@@ -5873,7 +5971,11 @@ while (TRUE)
|
||||||
|
case OP_NOTPROP:
|
||||||
|
case OP_PROP:
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH != 32
|
||||||
|
- if (common->utf) return consumed;
|
||||||
|
+ if (common->utf)
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
#endif
|
||||||
|
any = TRUE;
|
||||||
|
cc += 1 + 2;
|
||||||
|
@@ -5888,7 +5990,11 @@ while (TRUE)
|
||||||
|
case OP_NOTEXACT:
|
||||||
|
case OP_NOTEXACTI:
|
||||||
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||||
|
- if (common->utf) return consumed;
|
||||||
|
+ if (common->utf)
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
#endif
|
||||||
|
any = TRUE;
|
||||||
|
repeat = GET2(cc, 1);
|
||||||
|
@@ -5896,21 +6002,20 @@ while (TRUE)
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
- return consumed;
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
+ SLJIT_ASSERT(chars < chars_end);
|
||||||
|
+
|
||||||
|
if (any)
|
||||||
|
{
|
||||||
|
do
|
||||||
|
{
|
||||||
|
chars->count = 255;
|
||||||
|
-
|
||||||
|
- consumed++;
|
||||||
|
- if (--max_chars == 0)
|
||||||
|
- return consumed;
|
||||||
|
chars++;
|
||||||
|
}
|
||||||
|
- while (--repeat > 0);
|
||||||
|
+ while (--repeat > 0 && chars < chars_end);
|
||||||
|
|
||||||
|
repeat = 1;
|
||||||
|
continue;
|
||||||
|
@@ -5921,17 +6026,27 @@ while (TRUE)
|
||||||
|
bytes = (sljit_u8*) (cc + 1);
|
||||||
|
cc += 1 + 32 / sizeof(PCRE2_UCHAR);
|
||||||
|
|
||||||
|
+ SLJIT_ASSERT(last == TRUE && repeat == 1);
|
||||||
|
switch (*cc)
|
||||||
|
{
|
||||||
|
- case OP_CRSTAR:
|
||||||
|
- case OP_CRMINSTAR:
|
||||||
|
- case OP_CRPOSSTAR:
|
||||||
|
case OP_CRQUERY:
|
||||||
|
case OP_CRMINQUERY:
|
||||||
|
case OP_CRPOSQUERY:
|
||||||
|
- max_chars = scan_prefix(common, cc + 1, chars, max_chars, rec_count);
|
||||||
|
- if (max_chars == 0)
|
||||||
|
- return consumed;
|
||||||
|
+ last = FALSE;
|
||||||
|
+ /* Fall through */
|
||||||
|
+ case OP_CRSTAR:
|
||||||
|
+ case OP_CRMINSTAR:
|
||||||
|
+ case OP_CRPOSSTAR:
|
||||||
|
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ cc_stack[stack_ptr] = ++cc;
|
||||||
|
+ chars_stack[stack_ptr] = chars;
|
||||||
|
+ next_alternative_stack[stack_ptr] = 0;
|
||||||
|
+ stack_ptr++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
@@ -5945,7 +6060,13 @@ while (TRUE)
|
||||||
|
case OP_CRPOSRANGE:
|
||||||
|
repeat = GET2(cc, 1);
|
||||||
|
if (repeat <= 0)
|
||||||
|
- return consumed;
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ last = (repeat != (int)GET2(cc, 1 + IMM2_SIZE));
|
||||||
|
+ cc += 1 + 2 * IMM2_SIZE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -5980,36 +6101,13 @@ while (TRUE)
|
||||||
|
bytes = bytes_end - 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
- consumed++;
|
||||||
|
- if (--max_chars == 0)
|
||||||
|
- return consumed;
|
||||||
|
chars++;
|
||||||
|
}
|
||||||
|
- while (--repeat > 0);
|
||||||
|
-
|
||||||
|
- switch (*cc)
|
||||||
|
- {
|
||||||
|
- case OP_CRSTAR:
|
||||||
|
- case OP_CRMINSTAR:
|
||||||
|
- case OP_CRPOSSTAR:
|
||||||
|
- return consumed;
|
||||||
|
-
|
||||||
|
- case OP_CRQUERY:
|
||||||
|
- case OP_CRMINQUERY:
|
||||||
|
- case OP_CRPOSQUERY:
|
||||||
|
- cc++;
|
||||||
|
- break;
|
||||||
|
-
|
||||||
|
- case OP_CRRANGE:
|
||||||
|
- case OP_CRMINRANGE:
|
||||||
|
- case OP_CRPOSRANGE:
|
||||||
|
- if (GET2(cc, 1) != GET2(cc, 1 + IMM2_SIZE))
|
||||||
|
- return consumed;
|
||||||
|
- cc += 1 + 2 * IMM2_SIZE;
|
||||||
|
- break;
|
||||||
|
- }
|
||||||
|
+ while (--repeat > 0 && chars < chars_end);
|
||||||
|
|
||||||
|
repeat = 1;
|
||||||
|
+ if (last)
|
||||||
|
+ chars_end = chars;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -6025,7 +6123,10 @@ while (TRUE)
|
||||||
|
{
|
||||||
|
GETCHAR(chr, cc);
|
||||||
|
if ((int)PRIV(ord2utf)(char_othercase(common, chr), othercase) != len)
|
||||||
|
- return consumed;
|
||||||
|
+ {
|
||||||
|
+ chars_end = chars;
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
@@ -6056,7 +6157,6 @@ while (TRUE)
|
||||||
|
do
|
||||||
|
{
|
||||||
|
len--;
|
||||||
|
- consumed++;
|
||||||
|
|
||||||
|
chr = *cc;
|
||||||
|
add_prefix_char(*cc, chars, len == 0);
|
||||||
|
@@ -6064,15 +6164,13 @@ while (TRUE)
|
||||||
|
if (caseless)
|
||||||
|
add_prefix_char(*oc, chars, len == 0);
|
||||||
|
|
||||||
|
- if (--max_chars == 0)
|
||||||
|
- return consumed;
|
||||||
|
chars++;
|
||||||
|
cc++;
|
||||||
|
oc++;
|
||||||
|
}
|
||||||
|
- while (len > 0);
|
||||||
|
+ while (len > 0 && chars < chars_end);
|
||||||
|
|
||||||
|
- if (--repeat == 0)
|
||||||
|
+ if (--repeat == 0 || chars >= chars_end)
|
||||||
|
break;
|
||||||
|
|
||||||
|
len = len_save;
|
||||||
|
@@ -6081,7 +6179,7 @@ while (TRUE)
|
||||||
|
|
||||||
|
repeat = 1;
|
||||||
|
if (last)
|
||||||
|
- return consumed;
|
||||||
|
+ chars_end = chars;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -6251,7 +6349,6 @@ int i, max, from;
|
||||||
|
int range_right = -1, range_len;
|
||||||
|
sljit_u8 *update_table = NULL;
|
||||||
|
BOOL in_range;
|
||||||
|
-sljit_u32 rec_count;
|
||||||
|
|
||||||
|
for (i = 0; i < MAX_N_CHARS; i++)
|
||||||
|
{
|
||||||
|
@@ -6259,8 +6356,7 @@ for (i = 0; i < MAX_N_CHARS; i++)
|
||||||
|
chars[i].last_count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
-rec_count = 10000;
|
||||||
|
-max = scan_prefix(common, common->start, chars, MAX_N_CHARS, &rec_count);
|
||||||
|
+max = scan_prefix(common, common->start, chars);
|
||||||
|
|
||||||
|
if (max < 1)
|
||||||
|
return FALSE;
|
||||||
|
diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c
|
||||||
|
index 28bc7af9..066095fe 100644
|
||||||
|
--- a/src/pcre2_jit_test.c
|
||||||
|
+++ b/src/pcre2_jit_test.c
|
||||||
|
@@ -286,6 +286,7 @@ static struct regression_test_case regression_test_cases[] = {
|
||||||
|
{ CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" },
|
||||||
|
{ MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" },
|
||||||
|
{ MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" },
|
||||||
|
+ { M, A, 0, 0, "(?:a?|a)b", "ba" },
|
||||||
|
|
||||||
|
/* Greedy and non-greedy + operators */
|
||||||
|
{ MU, A, 0, 0, "(aa)+aa", "aaaaaaa" },
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
@ -0,0 +1,118 @@
|
|||||||
|
From 674b6640e702705e8e90125f972197fae3aa364d Mon Sep 17 00:00:00 2001
|
||||||
|
From: Philip Hazel <Philip.Hazel@gmail.com>
|
||||||
|
Date: Sat, 3 Aug 2024 17:18:56 +0100
|
||||||
|
Subject: [PATCH] Remove incorrect optimization in DFA matching when partial
|
||||||
|
matching and (*F) are involved
|
||||||
|
|
||||||
|
Conflict:don't modify ChangeLog; adapt context
|
||||||
|
Reference:https://github.com/PCRE2Project/pcre2/commit/674b6640e702705e8e90125f972197fae3aa364d
|
||||||
|
|
||||||
|
---
|
||||||
|
src/pcre2_dfa_match.c | 10 +---------
|
||||||
|
testdata/testinput6 | 9 +++++++++
|
||||||
|
testdata/testoutput6 | 13 +++++++++++++
|
||||||
|
3 files changed, 23 insertions(+), 9 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
|
||||||
|
index 60f6b4f..1c4495f 100644
|
||||||
|
--- a/src/pcre2_dfa_match.c
|
||||||
|
+++ b/src/pcre2_dfa_match.c
|
||||||
|
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||||
|
|
||||||
|
Written by Philip Hazel
|
||||||
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||||
|
- New API code Copyright (c) 2016-2022 University of Cambridge
|
||||||
|
+ New API code Copyright (c) 2016-2024 University of Cambridge
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@@ -693,7 +693,6 @@ for (;;)
|
||||||
|
int i, j;
|
||||||
|
int clen, dlen;
|
||||||
|
uint32_t c, d;
|
||||||
|
- int forced_fail = 0;
|
||||||
|
BOOL partial_newline = FALSE;
|
||||||
|
BOOL could_continue = reset_could_continue;
|
||||||
|
reset_could_continue = FALSE;
|
||||||
|
@@ -2765,7 +2764,6 @@ for (;;)
|
||||||
|
though the other "backtracking verbs" are not supported. */
|
||||||
|
|
||||||
|
case OP_FAIL:
|
||||||
|
- forced_fail++; /* Count FAILs for multiple states */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case OP_ASSERT:
|
||||||
|
@@ -3247,18 +3245,12 @@ for (;;)
|
||||||
|
matches that we are going to find. If partial matching has been requested,
|
||||||
|
check for appropriate conditions.
|
||||||
|
|
||||||
|
- The "forced_ fail" variable counts the number of (*F) encountered for the
|
||||||
|
- character. If it is equal to the original active_count (saved in
|
||||||
|
- workspace[1]) it means that (*F) was found on every active state. In this
|
||||||
|
- case we don't want to give a partial match.
|
||||||
|
-
|
||||||
|
The "could_continue" variable is true if a state could have continued but
|
||||||
|
for the fact that the end of the subject was reached. */
|
||||||
|
|
||||||
|
if (new_count <= 0)
|
||||||
|
{
|
||||||
|
if (could_continue && /* Some could go on, and */
|
||||||
|
- forced_fail != workspace[1] && /* Not all forced fail & */
|
||||||
|
( /* either... */
|
||||||
|
(mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
|
||||||
|
|| /* or... */
|
||||||
|
diff --git a/testdata/testinput6 b/testdata/testinput6
|
||||||
|
index 0ca0d23..b71a69c 100644
|
||||||
|
--- a/testdata/testinput6
|
||||||
|
+++ b/testdata/testinput6
|
||||||
|
@@ -4392,9 +4392,18 @@
|
||||||
|
|
||||||
|
/Z(*F)Q|ZXY/
|
||||||
|
Z\=ps
|
||||||
|
+ XY\=dfa_restart
|
||||||
|
\= Expect no match
|
||||||
|
ZA\=ps
|
||||||
|
X\=ps
|
||||||
|
+
|
||||||
|
+/Z(?:(*F)Q|XY)/
|
||||||
|
+ Z\=ps
|
||||||
|
+ XY\=dfa_restart
|
||||||
|
+
|
||||||
|
+/Z(*F)Q|Z(*F)XY/
|
||||||
|
+\= Expect no match
|
||||||
|
+ Z\=ps
|
||||||
|
|
||||||
|
/\bthe cat\b/
|
||||||
|
the cat\=ps
|
||||||
|
diff --git a/testdata/testoutput6 b/testdata/testoutput6
|
||||||
|
index 607b572..38c653e 100644
|
||||||
|
--- a/testdata/testoutput6
|
||||||
|
+++ b/testdata/testoutput6
|
||||||
|
@@ -6769,11 +6769,24 @@ Partial match: dogs
|
||||||
|
/Z(*F)Q|ZXY/
|
||||||
|
Z\=ps
|
||||||
|
Partial match: Z
|
||||||
|
+ XY\=dfa_restart
|
||||||
|
+ 0: XY
|
||||||
|
\= Expect no match
|
||||||
|
ZA\=ps
|
||||||
|
No match
|
||||||
|
X\=ps
|
||||||
|
No match
|
||||||
|
+
|
||||||
|
+/Z(?:(*F)Q|XY)/
|
||||||
|
+ Z\=ps
|
||||||
|
+Partial match: Z
|
||||||
|
+ XY\=dfa_restart
|
||||||
|
+ 0: XY
|
||||||
|
+
|
||||||
|
+/Z(*F)Q|Z(*F)XY/
|
||||||
|
+\= Expect no match
|
||||||
|
+ Z\=ps
|
||||||
|
+No match
|
||||||
|
|
||||||
|
/\bthe cat\b/
|
||||||
|
the cat\=ps
|
||||||
|
--
|
||||||
|
2.43.0
|
||||||
|
|
||||||
@ -0,0 +1,270 @@
|
|||||||
|
From 64549346f044dec18d18d06c2d08a68a68e26817 Mon Sep 17 00:00:00 2001
|
||||||
|
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <carenas@gmail.com>
|
||||||
|
Date: Sun, 9 Apr 2023 04:29:46 -0700
|
||||||
|
Subject: [PATCH] avoid inconsistency between \d and [:digit:] when using /a
|
||||||
|
(#223)
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
Conflict:don't modify Changelog and doc/*; keep pcre2.h.generic consistent
|
||||||
|
with pcre2.h.in according to 1de7291
|
||||||
|
Reference:https://github.com/PCRE2Project/pcre2/commit/64549346f044dec18d18d06c2d08a68a68e26817
|
||||||
|
|
||||||
|
Since a608946 (Additional PCRE2_EXTRA_ASCII_xxx code, 2023-02-01)
|
||||||
|
PCRE2_EXTRA_ASCII_BSD could be used to restrict \d to ASCII causing
|
||||||
|
the following inconsistent behaviour in UCP mode.
|
||||||
|
|
||||||
|
PCRE2 version 10.43-DEV 2023-01-15
|
||||||
|
re> /\d/utf,ucp,ascii_bsd
|
||||||
|
data> ٣
|
||||||
|
No match
|
||||||
|
data>
|
||||||
|
re> /[[:digit:]]/utf,ucp,ascii_bsd
|
||||||
|
data> ٣
|
||||||
|
0: \x{663}
|
||||||
|
|
||||||
|
It has been suggested[1] that the change to match \p{Nd} when Unicode
|
||||||
|
is enabled for [:digit:] might have been unintentional and a bug, as
|
||||||
|
[:digit:] should be able to be POSIX compatible, so add a new flag
|
||||||
|
PCRE2_EXTRA_ASCII_DIGIT to avoid changing its definition in UCP mode.
|
||||||
|
|
||||||
|
[1] https://lore.kernel.org/git/CANgJU+U+xXsh9psd0z5Xjr+Se5QgdKkjQ7LUQ-PdUULSN3n4+g@mail.gmail.com/
|
||||||
|
---
|
||||||
|
src/pcre2.h.generic | 6 ++++++
|
||||||
|
src/pcre2.h.in | 1 +
|
||||||
|
src/pcre2_compile.c | 6 ++++--
|
||||||
|
src/pcre2test.c | 4 +++-
|
||||||
|
testdata/testinput5 | 10 +++++++++-
|
||||||
|
testdata/testinput7 | 10 ++++++++--
|
||||||
|
testdata/testoutput5 | 19 ++++++++++++++++++-
|
||||||
|
testdata/testoutput7 | 13 +++++++++++--
|
||||||
|
8 files changed, 60 insertions(+), 9 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic
|
||||||
|
index dad774ce..05cf9bc1 100644
|
||||||
|
--- a/src/pcre2.h.generic
|
||||||
|
+++ b/src/pcre2.h.generic
|
||||||
|
@@ -153,6 +153,12 @@ D is inspected during pcre2_dfa_match() execution
|
||||||
|
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
|
||||||
|
#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */
|
||||||
|
#define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */
|
||||||
|
+#define PCRE2_EXTRA_CASELESS_RESTRICT 0x00000080u /* C */
|
||||||
|
+#define PCRE2_EXTRA_ASCII_BSD 0x00000100u /* C */
|
||||||
|
+#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
|
||||||
|
+#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
|
||||||
|
+#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
|
||||||
|
+#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
|
||||||
|
|
||||||
|
/* These are for pcre2_jit_compile(). */
|
||||||
|
|
||||||
|
diff --git a/src/pcre2.h.in b/src/pcre2.h.in
|
||||||
|
index 7202c633..cd7fdcf2 100644
|
||||||
|
--- a/src/pcre2.h.in
|
||||||
|
+++ b/src/pcre2.h.in
|
||||||
|
@@ -158,6 +158,7 @@ D is inspected during pcre2_dfa_match() execution
|
||||||
|
#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
|
||||||
|
#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
|
||||||
|
#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
|
||||||
|
+#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
|
||||||
|
|
||||||
|
/* These are for pcre2_jit_compile(). */
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||||
|
index 95c4a79d..634360b7 100644
|
||||||
|
--- a/src/pcre2_compile.c
|
||||||
|
+++ b/src/pcre2_compile.c
|
||||||
|
@@ -786,7 +786,8 @@ are allowed. */
|
||||||
|
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
|
||||||
|
PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
|
||||||
|
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
|
||||||
|
- PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX)
|
||||||
|
+ PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
|
||||||
|
+ PCRE2_EXTRA_ASCII_DIGIT)
|
||||||
|
|
||||||
|
/* Compile time error code numbers. They are given names so that they can more
|
||||||
|
easily be tracked. When a new number is added, the tables called eint1 and
|
||||||
|
@@ -3581,7 +3582,8 @@ while (ptr < ptrend)
|
||||||
|
|
||||||
|
#ifdef SUPPORT_UNICODE
|
||||||
|
if ((options & PCRE2_UCP) != 0 &&
|
||||||
|
- (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
|
||||||
|
+ (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
|
||||||
|
+ !(posix_class == 7 && (xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0))
|
||||||
|
{
|
||||||
|
int ptype = posix_substitutes[2*posix_class];
|
||||||
|
int pvalue = posix_substitutes[2*posix_class + 1];
|
||||||
|
diff --git a/src/pcre2test.c b/src/pcre2test.c
|
||||||
|
index 4da3ef90..21b19370 100644
|
||||||
|
--- a/src/pcre2test.c
|
||||||
|
+++ b/src/pcre2test.c
|
||||||
|
@@ -651,6 +651,7 @@ static modstruct modlist[] = {
|
||||||
|
{ "ascii_bsd", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSD, CO(extra_options) },
|
||||||
|
{ "ascii_bss", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSS, CO(extra_options) },
|
||||||
|
{ "ascii_bsw", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSW, CO(extra_options) },
|
||||||
|
+ { "ascii_digit", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_DIGIT, CO(extra_options) },
|
||||||
|
{ "ascii_posix", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_POSIX, CO(extra_options) },
|
||||||
|
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
|
||||||
|
{ "bad_escape_is_literal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL, CO(extra_options) },
|
||||||
|
@@ -4294,13 +4295,14 @@ show_compile_extra_options(uint32_t options, const char *before,
|
||||||
|
const char *after)
|
||||||
|
{
|
||||||
|
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
||||||
|
-else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||||
|
+else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||||
|
before,
|
||||||
|
((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "",
|
||||||
|
((options & PCRE2_EXTRA_ALT_BSUX) != 0)? " alt_bsux" : "",
|
||||||
|
((options & PCRE2_EXTRA_ASCII_BSD) != 0)? " ascii_bsd" : "",
|
||||||
|
((options & PCRE2_EXTRA_ASCII_BSS) != 0)? " ascii_bss" : "",
|
||||||
|
((options & PCRE2_EXTRA_ASCII_BSW) != 0)? " ascii_bsw" : "",
|
||||||
|
+ ((options & PCRE2_EXTRA_ASCII_DIGIT) != 0)? " ascii_digit" : "",
|
||||||
|
((options & PCRE2_EXTRA_ASCII_POSIX) != 0)? " ascii_posix" : "",
|
||||||
|
((options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) != 0)? " bad_escape_is_literal" : "",
|
||||||
|
((options & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)? " caseless_restrict" : "",
|
||||||
|
diff --git a/testdata/testinput5 b/testdata/testinput5
|
||||||
|
index 0f105408..0624a0c3 100644
|
||||||
|
--- a/testdata/testinput5
|
||||||
|
+++ b/testdata/testinput5
|
||||||
|
@@ -1215,6 +1215,8 @@
|
||||||
|
|
||||||
|
/[[:digit:]]/B,ucp
|
||||||
|
|
||||||
|
+/[[:digit:]]/B,ucp,ascii_digit
|
||||||
|
+
|
||||||
|
/[[:graph:]]/B,ucp
|
||||||
|
|
||||||
|
/[[:print:]]/B,ucp
|
||||||
|
@@ -1227,7 +1229,7 @@
|
||||||
|
|
||||||
|
/[[:xdigit:]]/B,ucp
|
||||||
|
|
||||||
|
-# Unicode properties for \b abd \B
|
||||||
|
+# Unicode properties for \b and \B
|
||||||
|
|
||||||
|
/\b...\B/utf,ucp
|
||||||
|
abc_
|
||||||
|
@@ -2431,6 +2433,12 @@
|
||||||
|
/[[:digit:]]+/utf,ucp
|
||||||
|
123\x{660}456
|
||||||
|
|
||||||
|
+/[[:digit:]]+/utf,ucp,ascii_digit
|
||||||
|
+ 123\x{660}456
|
||||||
|
+
|
||||||
|
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
||||||
|
+ 123\x{660}456
|
||||||
|
+
|
||||||
|
/[[:digit:]]+/utf,ucp,ascii_posix
|
||||||
|
123\x{660}456
|
||||||
|
|
||||||
|
diff --git a/testdata/testinput7 b/testdata/testinput7
|
||||||
|
index a2b7fb8d..96deaa30 100644
|
||||||
|
--- a/testdata/testinput7
|
||||||
|
+++ b/testdata/testinput7
|
||||||
|
@@ -1657,7 +1657,7 @@
|
||||||
|
/^[\p{Xwd}]+/utf
|
||||||
|
ABCD1234\x{6ca}\x{a6c}\x{10a7}_
|
||||||
|
|
||||||
|
-# Unicode properties for \b abd \B
|
||||||
|
+# Unicode properties for \b and \B
|
||||||
|
|
||||||
|
/\b...\B/utf,ucp
|
||||||
|
abc_
|
||||||
|
@@ -2435,9 +2435,15 @@
|
||||||
|
/[[:digit:]]+/utf,ucp
|
||||||
|
123\x{660}456
|
||||||
|
|
||||||
|
+/[[:digit:]]+/utf,ucp,ascii_digit
|
||||||
|
+ 123\x{660}456
|
||||||
|
+
|
||||||
|
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
||||||
|
+ 123\x{660}456
|
||||||
|
+
|
||||||
|
/[[:digit:]]+/utf,ucp,ascii_posix
|
||||||
|
123\x{660}456
|
||||||
|
-
|
||||||
|
+
|
||||||
|
/>[[:space:]]+</utf,ucp
|
||||||
|
>\x{a0} \x{a0}<
|
||||||
|
>\x{a0}\x{a0}\x{a0}<
|
||||||
|
diff --git a/testdata/testoutput5 b/testdata/testoutput5
|
||||||
|
index 3cee990e..febcc954 100644
|
||||||
|
--- a/testdata/testoutput5
|
||||||
|
+++ b/testdata/testoutput5
|
||||||
|
@@ -2520,6 +2520,14 @@ No match
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
+/[[:digit:]]/B,ucp,ascii_digit
|
||||||
|
+------------------------------------------------------------------
|
||||||
|
+ Bra
|
||||||
|
+ [0-9]
|
||||||
|
+ Ket
|
||||||
|
+ End
|
||||||
|
+------------------------------------------------------------------
|
||||||
|
+
|
||||||
|
/[[:graph:]]/B,ucp
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
@@ -2568,7 +2576,7 @@ No match
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
-# Unicode properties for \b abd \B
|
||||||
|
+# Unicode properties for \b and \B
|
||||||
|
|
||||||
|
/\b...\B/utf,ucp
|
||||||
|
abc_
|
||||||
|
@@ -5359,6 +5367,15 @@ No match
|
||||||
|
123\x{660}456
|
||||||
|
0: 123\x{660}456
|
||||||
|
|
||||||
|
+/[[:digit:]]+/utf,ucp,ascii_digit
|
||||||
|
+ 123\x{660}456
|
||||||
|
+ 0: 123
|
||||||
|
+
|
||||||
|
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
||||||
|
+ 123\x{660}456
|
||||||
|
+ 0: 123
|
||||||
|
+ 0: 456
|
||||||
|
+
|
||||||
|
/[[:digit:]]+/utf,ucp,ascii_posix
|
||||||
|
123\x{660}456
|
||||||
|
0: 123
|
||||||
|
diff --git a/testdata/testoutput7 b/testdata/testoutput7
|
||||||
|
index 4065981d..d98178e6 100644
|
||||||
|
--- a/testdata/testoutput7
|
||||||
|
+++ b/testdata/testoutput7
|
||||||
|
@@ -2853,7 +2853,7 @@ No match
|
||||||
|
ABCD1234\x{6ca}\x{a6c}\x{10a7}_
|
||||||
|
0: ABCD1234\x{6ca}\x{a6c}\x{10a7}_
|
||||||
|
|
||||||
|
-# Unicode properties for \b abd \B
|
||||||
|
+# Unicode properties for \b and \B
|
||||||
|
|
||||||
|
/\b...\B/utf,ucp
|
||||||
|
abc_
|
||||||
|
@@ -4080,10 +4080,19 @@ No match
|
||||||
|
123\x{660}456
|
||||||
|
0: 123\x{660}456
|
||||||
|
|
||||||
|
+/[[:digit:]]+/utf,ucp,ascii_digit
|
||||||
|
+ 123\x{660}456
|
||||||
|
+ 0: 123
|
||||||
|
+
|
||||||
|
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
||||||
|
+ 123\x{660}456
|
||||||
|
+ 0: 123
|
||||||
|
+ 0: 456
|
||||||
|
+
|
||||||
|
/[[:digit:]]+/utf,ucp,ascii_posix
|
||||||
|
123\x{660}456
|
||||||
|
0: 123
|
||||||
|
-
|
||||||
|
+
|
||||||
|
/>[[:space:]]+</utf,ucp
|
||||||
|
>\x{a0} \x{a0}<
|
||||||
|
0: >\x{a0} \x{a0}<
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
@ -0,0 +1,118 @@
|
|||||||
|
From bc367f1880ae5ccc771d5780e35df4c42744a9c4 Mon Sep 17 00:00:00 2001
|
||||||
|
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <carenas@gmail.com>
|
||||||
|
Date: Sun, 22 Sep 2024 01:49:03 -0700
|
||||||
|
Subject: [PATCH] pcre2_compile: avoid 1 byte buffer overread parsing VERBs
|
||||||
|
(#487)
|
||||||
|
|
||||||
|
As reported recently by ef218fb (Guard against out-of-bounds memory
|
||||||
|
access when parsing LIMIT_HEAP et al (#463), 2024-09-07), a malformed
|
||||||
|
pattern could result in reading 1 byte past its end.
|
||||||
|
|
||||||
|
Fix a similar issue that affects all VERBs and add test cases to
|
||||||
|
ensure the original bug and all its siblings are no longer an issue.
|
||||||
|
|
||||||
|
While at it fix the wording of the related documentation.
|
||||||
|
---
|
||||||
|
doc/pcre2syntax.3 | 4 ++--
|
||||||
|
src/pcre2_compile.c | 11 +++--------
|
||||||
|
testdata/testinput2 | 8 ++++++++
|
||||||
|
testdata/testoutput2 | 12 ++++++++++++
|
||||||
|
4 files changed, 25 insertions(+), 10 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3
|
||||||
|
index 232125b82..db0bb6586 100644
|
||||||
|
--- a/doc/pcre2syntax.3
|
||||||
|
+++ b/doc/pcre2syntax.3
|
||||||
|
@@ -408,8 +408,8 @@ only one hyphen. Setting (but no unsetting) is allowed after (?^ for example
|
||||||
|
example (?i:...).
|
||||||
|
.P
|
||||||
|
The following are recognized only at the very start of a pattern or after one
|
||||||
|
-of the newline or \eR options with similar syntax. More than one of them may
|
||||||
|
-appear. For the first three, d is a decimal number.
|
||||||
|
+of the newline or \eR sequences or options with similar syntax. More than one
|
||||||
|
+of them may appear. For the first three, d is a decimal number.
|
||||||
|
.sp
|
||||||
|
(*LIMIT_DEPTH=d) set the backtracking limit to d
|
||||||
|
(*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes
|
||||||
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||||
|
index 7e48b26..3d9a500 100644
|
||||||
|
--- a/src/pcre2_compile.c
|
||||||
|
+++ b/src/pcre2_compile.c
|
||||||
|
@@ -9877,13 +9877,14 @@ if ((options & PCRE2_LITERAL) == 0)
|
||||||
|
{
|
||||||
|
for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
|
||||||
|
{
|
||||||
|
- uint32_t c, pp;
|
||||||
|
pso *p = pso_list + i;
|
||||||
|
|
||||||
|
if (patlen - skipatstart - 2 >= p->length &&
|
||||||
|
PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
|
||||||
|
p->length) == 0)
|
||||||
|
{
|
||||||
|
+ uint32_t c, pp;
|
||||||
|
+
|
||||||
|
skipatstart += p->length + 2;
|
||||||
|
switch(p->type)
|
||||||
|
{
|
||||||
|
@@ -9910,18 +9911,12 @@ if ((options & PCRE2_LITERAL) == 0)
|
||||||
|
case PSO_LIMH:
|
||||||
|
c = 0;
|
||||||
|
pp = skipatstart;
|
||||||
|
- if (!IS_DIGIT(ptr[pp]))
|
||||||
|
- {
|
||||||
|
- errorcode = ERR60;
|
||||||
|
- ptr += pp;
|
||||||
|
- goto HAD_EARLY_ERROR;
|
||||||
|
- }
|
||||||
|
while (pp < patlen && IS_DIGIT(ptr[pp]))
|
||||||
|
{
|
||||||
|
if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
|
||||||
|
c = c*10 + (ptr[pp++] - CHAR_0);
|
||||||
|
}
|
||||||
|
- if (pp >= patlen || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
|
||||||
|
+ if (pp >= patlen || pp == skipatstart || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
|
||||||
|
{
|
||||||
|
errorcode = ERR60;
|
||||||
|
ptr += pp;
|
||||||
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||||||
|
index a869c5bc2..542d14520 100644
|
||||||
|
--- a/testdata/testinput2
|
||||||
|
+++ b/testdata/testinput2
|
||||||
|
@@ -5261,6 +5261,14 @@ a)"xI
|
||||||
|
|
||||||
|
/(*LIMIT_HEAP=0)xxx/I
|
||||||
|
|
||||||
|
+/(*LIMIT_HEAP=123/use_length
|
||||||
|
+
|
||||||
|
+/(*LIMIT_MATCH=/use_length
|
||||||
|
+
|
||||||
|
+/(*CRLF)(*LIMIT_DEPTH=/use_length
|
||||||
|
+
|
||||||
|
+/(*CRLF)(*LIMIT_RECURSION=1)(*BOGUS/use_length
|
||||||
|
+
|
||||||
|
/\d{0,3}(*:abc)(?C1)xxx/callout_info
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||||||
|
index bf7b7620e..b99d64781 100644
|
||||||
|
--- a/testdata/testoutput2
|
||||||
|
+++ b/testdata/testoutput2
|
||||||
|
@@ -16220,6 +16220,18 @@ First code unit = 'x'
|
||||||
|
Last code unit = 'x'
|
||||||
|
Subject length lower bound = 3
|
||||||
|
|
||||||
|
+/(*LIMIT_HEAP=123/use_length
|
||||||
|
+Failed: error 160 at offset 16: (*VERB) not recognized or malformed
|
||||||
|
+
|
||||||
|
+/(*LIMIT_MATCH=/use_length
|
||||||
|
+Failed: error 160 at offset 14: (*VERB) not recognized or malformed
|
||||||
|
+
|
||||||
|
+/(*CRLF)(*LIMIT_DEPTH=/use_length
|
||||||
|
+Failed: error 160 at offset 21: (*VERB) not recognized or malformed
|
||||||
|
+
|
||||||
|
+/(*CRLF)(*LIMIT_RECURSION=1)(*BOGUS/use_length
|
||||||
|
+Failed: error 160 at offset 34: (*VERB) not recognized or malformed
|
||||||
|
+
|
||||||
|
/\d{0,3}(*:abc)(?C1)xxx/callout_info
|
||||||
|
Callout 1 x
|
||||||
|
|
||||||
38
pcre2.spec
38
pcre2.spec
@ -1,6 +1,6 @@
|
|||||||
Name: pcre2
|
Name: pcre2
|
||||||
Version: 10.42
|
Version: 10.42
|
||||||
Release: 8
|
Release: 13
|
||||||
Summary: Perl Compatible Regular Expressions
|
Summary: Perl Compatible Regular Expressions
|
||||||
License: BSD
|
License: BSD
|
||||||
URL: http://www.pcre.org/
|
URL: http://www.pcre.org/
|
||||||
@ -30,6 +30,22 @@ Patch6018: backport-Sanity-checks-for-ctype-functions-342.patch
|
|||||||
Patch6019: backport-Fix-incorrect-class-character-matches-in-JIT.patch
|
Patch6019: backport-Fix-incorrect-class-character-matches-in-JIT.patch
|
||||||
Patch6020: backport-Fixing-an-issue-using-empty-character-sets-in-jit.patch
|
Patch6020: backport-Fixing-an-issue-using-empty-character-sets-in-jit.patch
|
||||||
Patch6021: backport-pcre2grep-document-better-possible-multiline-matchin.patch
|
Patch6021: backport-pcre2grep-document-better-possible-multiline-matchin.patch
|
||||||
|
Patch6022: backport-Remove-incorrect-optimization-in-DFA-matching-when-p.patch
|
||||||
|
Patch6023: backport-Implement-PCRE2_EXTRA_CASELESS_RESTRICT-and-related-.patch
|
||||||
|
Patch6024: backport-Additional-PCRE2_EXTRA_ASCII_xxx-code.patch
|
||||||
|
Patch6025: backport-Fix-non-recognition-of-some-octal-escapes-in-substitute.patch
|
||||||
|
Patch6026: backport-Guard-against-out-of-bounds-memory-access-when-parsing.patch
|
||||||
|
Patch6027: backport-Add-Perl-titlecasing-475.patch
|
||||||
|
Patch6028: backport-Fix-incorrect-positive-error-code-from-pcre2_substitute.patch
|
||||||
|
Patch6029: backport-pcre2_compile-avoid-1-byte-buffer-overread-parsing-V.patch
|
||||||
|
Patch6030: backport-Improve-error-message-for-N-name-in-character-classes.patch
|
||||||
|
Patch6031: backport-Further-ASCII-tests-and-minor-bugfix-plus-ChangeLog-.patch
|
||||||
|
Patch6032: backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch
|
||||||
|
Patch6033: backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch
|
||||||
|
Patch6034: backport-Improve-error-offsets-for-character-classes-548.patch
|
||||||
|
Patch6035: backport-Non-recursive-scan-prefix-in-JIT-560.patch
|
||||||
|
Patch6036: backport-Mend-a-bug-in-pcre2grep-that-caused-separator-lines-.patch
|
||||||
|
Patch6037: backport-Fix-oversight-in-adding-new-pcre2grep-test.patch
|
||||||
|
|
||||||
BuildRequires: autoconf libtool automake coreutils gcc make readline-devel
|
BuildRequires: autoconf libtool automake coreutils gcc make readline-devel
|
||||||
Obsoletes: pcre2-utf16 pcre2-utf32 pcre2-tools
|
Obsoletes: pcre2-utf16 pcre2-utf32 pcre2-tools
|
||||||
@ -147,6 +163,26 @@ make check
|
|||||||
%{_pkgdocdir}/html/
|
%{_pkgdocdir}/html/
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Thu Mar 13 2025 Linux_zhang <zhangruifang@h-partners.com> - 10.42-13
|
||||||
|
- DESC:sync patches from upstream to fix a bug in pcre2grep
|
||||||
|
|
||||||
|
* Tue Dec 10 2024 hugel <gengqihu2@h-partners.com> - 10.42-12
|
||||||
|
- DESC:sync patches from upstream
|
||||||
|
backport-Further-ASCII-tests-and-minor-bugfix-plus-ChangeLog-.patch
|
||||||
|
backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch
|
||||||
|
backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch
|
||||||
|
backport-Improve-error-offsets-for-character-classes-548.patch
|
||||||
|
backport-Non-recursive-scan-prefix-in-JIT-560.patch
|
||||||
|
|
||||||
|
* Tue Nov 19 2024 yanglongkang <yanglongkang@h-partners.com> - 10.42-11
|
||||||
|
- DESC:sync patches from upstream
|
||||||
|
|
||||||
|
* Thu Oct 31 2024 xujing <xujing125@huawei.com> - 10.42-10
|
||||||
|
- DESC:sync patches to fix grep testcase failed
|
||||||
|
|
||||||
|
* Tue Sep 03 2024 dongyuzhen <dongyuzhen@h-partners.com> - 10.42-9
|
||||||
|
- DESC:Remove incorrect optimization in DFA matching when partial matching and (*F) are involved
|
||||||
|
|
||||||
* Tue Jul 23 2024 fuanan <fuanan3@h-partners.com> - 10.42-8
|
* Tue Jul 23 2024 fuanan <fuanan3@h-partners.com> - 10.42-8
|
||||||
- DESC:document better possible multiline matching misses
|
- DESC:document better possible multiline matching misses
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user