pcre2/backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch
2024-12-10 11:41:16 +08:00

271 lines
8.9 KiB
Diff

From 64549346f044dec18d18d06c2d08a68a68e26817 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <carenas@gmail.com>
Date: Sun, 9 Apr 2023 04:29:46 -0700
Subject: [PATCH] avoid inconsistency between \d and [:digit:] when using /a
(#223)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Conflict:don't modify Changelog and doc/*; keep pcre2.h.generic consistent
with pcre2.h.in according to 1de7291
Reference:https://github.com/PCRE2Project/pcre2/commit/64549346f044dec18d18d06c2d08a68a68e26817
Since a608946 (Additional PCRE2_EXTRA_ASCII_xxx code, 2023-02-01)
PCRE2_EXTRA_ASCII_BSD could be used to restrict \d to ASCII, causing
the following inconsistent behaviour in UCP mode.
PCRE2 version 10.43-DEV 2023-01-15
re> /\d/utf,ucp,ascii_bsd
data> ٣
No match
data>
re> /[[:digit:]]/utf,ucp,ascii_bsd
data> ٣
0: \x{663}
It has been suggested[1] that the change to match \p{Nd} when Unicode
is enabled for [:digit:] might have been unintentional and a bug, as
[:digit:] should be able to be POSIX compatible, so add a new flag
PCRE2_EXTRA_ASCII_DIGIT to avoid changing its definition in UCP mode.
[1] https://lore.kernel.org/git/CANgJU+U+xXsh9psd0z5Xjr+Se5QgdKkjQ7LUQ-PdUULSN3n4+g@mail.gmail.com/
---
src/pcre2.h.generic | 6 ++++++
src/pcre2.h.in | 1 +
src/pcre2_compile.c | 6 ++++--
src/pcre2test.c | 4 +++-
testdata/testinput5 | 10 +++++++++-
testdata/testinput7 | 10 ++++++++--
testdata/testoutput5 | 19 ++++++++++++++++++-
testdata/testoutput7 | 13 +++++++++++--
8 files changed, 60 insertions(+), 9 deletions(-)
diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic
index dad774ce..05cf9bc1 100644
--- a/src/pcre2.h.generic
+++ b/src/pcre2.h.generic
@@ -153,6 +153,12 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */
#define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */
+#define PCRE2_EXTRA_CASELESS_RESTRICT 0x00000080u /* C */
+#define PCRE2_EXTRA_ASCII_BSD 0x00000100u /* C */
+#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
+#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
+#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
+#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
/* These are for pcre2_jit_compile(). */
diff --git a/src/pcre2.h.in b/src/pcre2.h.in
index 7202c633..cd7fdcf2 100644
--- a/src/pcre2.h.in
+++ b/src/pcre2.h.in
@@ -158,6 +158,7 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
+#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
/* These are for pcre2_jit_compile(). */
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 95c4a79d..634360b7 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -786,7 +786,8 @@ are allowed. */
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
- PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX)
+ PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
+ PCRE2_EXTRA_ASCII_DIGIT)
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
@@ -3581,7 +3582,8 @@ while (ptr < ptrend)
#ifdef SUPPORT_UNICODE
if ((options & PCRE2_UCP) != 0 &&
- (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
+ (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
+ !(posix_class == 7 && (xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0))
{
int ptype = posix_substitutes[2*posix_class];
int pvalue = posix_substitutes[2*posix_class + 1];
diff --git a/src/pcre2test.c b/src/pcre2test.c
index 4da3ef90..21b19370 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@@ -651,6 +651,7 @@ static modstruct modlist[] = {
{ "ascii_bsd", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSD, CO(extra_options) },
{ "ascii_bss", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSS, CO(extra_options) },
{ "ascii_bsw", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSW, CO(extra_options) },
+ { "ascii_digit", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_DIGIT, CO(extra_options) },
{ "ascii_posix", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_POSIX, CO(extra_options) },
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
{ "bad_escape_is_literal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL, CO(extra_options) },
@@ -4294,13 +4295,14 @@ show_compile_extra_options(uint32_t options, const char *before,
const char *after)
{
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
-else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s",
+else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
before,
((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "",
((options & PCRE2_EXTRA_ALT_BSUX) != 0)? " alt_bsux" : "",
((options & PCRE2_EXTRA_ASCII_BSD) != 0)? " ascii_bsd" : "",
((options & PCRE2_EXTRA_ASCII_BSS) != 0)? " ascii_bss" : "",
((options & PCRE2_EXTRA_ASCII_BSW) != 0)? " ascii_bsw" : "",
+ ((options & PCRE2_EXTRA_ASCII_DIGIT) != 0)? " ascii_digit" : "",
((options & PCRE2_EXTRA_ASCII_POSIX) != 0)? " ascii_posix" : "",
((options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) != 0)? " bad_escape_is_literal" : "",
((options & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)? " caseless_restrict" : "",
diff --git a/testdata/testinput5 b/testdata/testinput5
index 0f105408..0624a0c3 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -1215,6 +1215,8 @@
/[[:digit:]]/B,ucp
+/[[:digit:]]/B,ucp,ascii_digit
+
/[[:graph:]]/B,ucp
/[[:print:]]/B,ucp
@@ -1227,7 +1229,7 @@
/[[:xdigit:]]/B,ucp
-# Unicode properties for \b abd \B
+# Unicode properties for \b and \B
/\b...\B/utf,ucp
abc_
@@ -2431,6 +2433,12 @@
/[[:digit:]]+/utf,ucp
123\x{660}456
+/[[:digit:]]+/utf,ucp,ascii_digit
+ 123\x{660}456
+
+/[[:digit:]]+/g,utf,ucp,ascii_digit
+ 123\x{660}456
+
/[[:digit:]]+/utf,ucp,ascii_posix
123\x{660}456
diff --git a/testdata/testinput7 b/testdata/testinput7
index a2b7fb8d..96deaa30 100644
--- a/testdata/testinput7
+++ b/testdata/testinput7
@@ -1657,7 +1657,7 @@
/^[\p{Xwd}]+/utf
ABCD1234\x{6ca}\x{a6c}\x{10a7}_
-# Unicode properties for \b abd \B
+# Unicode properties for \b and \B
/\b...\B/utf,ucp
abc_
@@ -2435,9 +2435,15 @@
/[[:digit:]]+/utf,ucp
123\x{660}456
+/[[:digit:]]+/utf,ucp,ascii_digit
+ 123\x{660}456
+
+/[[:digit:]]+/g,utf,ucp,ascii_digit
+ 123\x{660}456
+
/[[:digit:]]+/utf,ucp,ascii_posix
123\x{660}456
-
+
/>[[:space:]]+</utf,ucp
>\x{a0} \x{a0}<
>\x{a0}\x{a0}\x{a0}<
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 3cee990e..febcc954 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -2520,6 +2520,14 @@ No match
End
------------------------------------------------------------------
+/[[:digit:]]/B,ucp,ascii_digit
+------------------------------------------------------------------
+ Bra
+ [0-9]
+ Ket
+ End
+------------------------------------------------------------------
+
/[[:graph:]]/B,ucp
------------------------------------------------------------------
Bra
@@ -2568,7 +2576,7 @@ No match
End
------------------------------------------------------------------
-# Unicode properties for \b abd \B
+# Unicode properties for \b and \B
/\b...\B/utf,ucp
abc_
@@ -5359,6 +5367,15 @@ No match
123\x{660}456
0: 123\x{660}456
+/[[:digit:]]+/utf,ucp,ascii_digit
+ 123\x{660}456
+ 0: 123
+
+/[[:digit:]]+/g,utf,ucp,ascii_digit
+ 123\x{660}456
+ 0: 123
+ 0: 456
+
/[[:digit:]]+/utf,ucp,ascii_posix
123\x{660}456
0: 123
diff --git a/testdata/testoutput7 b/testdata/testoutput7
index 4065981d..d98178e6 100644
--- a/testdata/testoutput7
+++ b/testdata/testoutput7
@@ -2853,7 +2853,7 @@ No match
ABCD1234\x{6ca}\x{a6c}\x{10a7}_
0: ABCD1234\x{6ca}\x{a6c}\x{10a7}_
-# Unicode properties for \b abd \B
+# Unicode properties for \b and \B
/\b...\B/utf,ucp
abc_
@@ -4080,10 +4080,19 @@ No match
123\x{660}456
0: 123\x{660}456
+/[[:digit:]]+/utf,ucp,ascii_digit
+ 123\x{660}456
+ 0: 123
+
+/[[:digit:]]+/g,utf,ucp,ascii_digit
+ 123\x{660}456
+ 0: 123
+ 0: 456
+
/[[:digit:]]+/utf,ucp,ascii_posix
123\x{660}456
0: 123
-
+
/>[[:space:]]+</utf,ucp
>\x{a0} \x{a0}<
0: >\x{a0} \x{a0}<
--
2.33.0