271 lines
8.9 KiB
Diff
271 lines
8.9 KiB
Diff
From 64549346f044dec18d18d06c2d08a68a68e26817 Mon Sep 17 00:00:00 2001
|
|
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <carenas@gmail.com>
|
|
Date: Sun, 9 Apr 2023 04:29:46 -0700
|
|
Subject: [PATCH] avoid inconsistency between \d and [:digit:] when using /a
|
|
(#223)
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Conflict:don't modify Changelog and doc/*; keep pcre2.h.generic consistent
|
|
with pcre2.h.in according to 1de7291
|
|
Reference:https://github.com/PCRE2Project/pcre2/commit/64549346f044dec18d18d06c2d08a68a68e26817
|
|
|
|
Since a608946 (Additional PCRE2_EXTRA_ASCII_xxx code, 2023-02-01)
|
|
PCRE2_EXTRA_ASCII_BSD could be used to restrict \d to ASCII causing
|
|
the following inconsistent behaviour in UCP mode.
|
|
|
|
PCRE2 version 10.43-DEV 2023-01-15
|
|
re> /\d/utf,ucp,ascii_bsd
|
|
data> ٣
|
|
No match
|
|
data>
|
|
re> /[[:digit:]]/utf,ucp,ascii_bsd
|
|
data> ٣
|
|
0: \x{663}
|
|
|
|
It has been suggested[1] that the change to match \p{Nd} when Unicode
|
|
is enabled for [:digit:] might have been unintentional and a bug, as
|
|
[:digit:] should be able to be POSIX compatible, so add a new flag
|
|
PCRE2_EXTRA_ASCII_DIGIT to avoid changing its definition in UCP mode.
|
|
|
|
[1] https://lore.kernel.org/git/CANgJU+U+xXsh9psd0z5Xjr+Se5QgdKkjQ7LUQ-PdUULSN3n4+g@mail.gmail.com/
|
|
---
|
|
src/pcre2.h.generic | 6 ++++++
|
|
src/pcre2.h.in | 1 +
|
|
src/pcre2_compile.c | 6 ++++--
|
|
src/pcre2test.c | 4 +++-
|
|
testdata/testinput5 | 10 +++++++++-
|
|
testdata/testinput7 | 10 ++++++++--
|
|
testdata/testoutput5 | 19 ++++++++++++++++++-
|
|
testdata/testoutput7 | 13 +++++++++++--
|
|
8 files changed, 60 insertions(+), 9 deletions(-)
|
|
|
|
diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic
|
|
index dad774ce..05cf9bc1 100644
|
|
--- a/src/pcre2.h.generic
|
|
+++ b/src/pcre2.h.generic
|
|
@@ -153,6 +153,12 @@ D is inspected during pcre2_dfa_match() execution
|
|
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
|
|
#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */
|
|
#define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */
|
|
+#define PCRE2_EXTRA_CASELESS_RESTRICT 0x00000080u /* C */
|
|
+#define PCRE2_EXTRA_ASCII_BSD 0x00000100u /* C */
|
|
+#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
|
|
+#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
|
|
+#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
|
|
+#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
|
|
|
|
/* These are for pcre2_jit_compile(). */
|
|
|
|
diff --git a/src/pcre2.h.in b/src/pcre2.h.in
|
|
index 7202c633..cd7fdcf2 100644
|
|
--- a/src/pcre2.h.in
|
|
+++ b/src/pcre2.h.in
|
|
@@ -158,6 +158,7 @@ D is inspected during pcre2_dfa_match() execution
|
|
#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
|
|
#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
|
|
#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
|
|
+#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
|
|
|
|
/* These are for pcre2_jit_compile(). */
|
|
|
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
|
index 95c4a79d..634360b7 100644
|
|
--- a/src/pcre2_compile.c
|
|
+++ b/src/pcre2_compile.c
|
|
@@ -786,7 +786,8 @@ are allowed. */
|
|
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
|
|
PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
|
|
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
|
|
- PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX)
|
|
+ PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
|
|
+ PCRE2_EXTRA_ASCII_DIGIT)
|
|
|
|
/* Compile time error code numbers. They are given names so that they can more
|
|
easily be tracked. When a new number is added, the tables called eint1 and
|
|
@@ -3581,7 +3582,8 @@ while (ptr < ptrend)
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
if ((options & PCRE2_UCP) != 0 &&
|
|
- (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
|
|
+ (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
|
|
+ !(posix_class == 7 && (xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0))
|
|
{
|
|
int ptype = posix_substitutes[2*posix_class];
|
|
int pvalue = posix_substitutes[2*posix_class + 1];
|
|
diff --git a/src/pcre2test.c b/src/pcre2test.c
|
|
index 4da3ef90..21b19370 100644
|
|
--- a/src/pcre2test.c
|
|
+++ b/src/pcre2test.c
|
|
@@ -651,6 +651,7 @@ static modstruct modlist[] = {
|
|
{ "ascii_bsd", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSD, CO(extra_options) },
|
|
{ "ascii_bss", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSS, CO(extra_options) },
|
|
{ "ascii_bsw", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSW, CO(extra_options) },
|
|
+ { "ascii_digit", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_DIGIT, CO(extra_options) },
|
|
{ "ascii_posix", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_POSIX, CO(extra_options) },
|
|
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
|
|
{ "bad_escape_is_literal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL, CO(extra_options) },
|
|
@@ -4294,13 +4295,14 @@ show_compile_extra_options(uint32_t options, const char *before,
|
|
const char *after)
|
|
{
|
|
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
|
-else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
|
+else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
|
before,
|
|
((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "",
|
|
((options & PCRE2_EXTRA_ALT_BSUX) != 0)? " alt_bsux" : "",
|
|
((options & PCRE2_EXTRA_ASCII_BSD) != 0)? " ascii_bsd" : "",
|
|
((options & PCRE2_EXTRA_ASCII_BSS) != 0)? " ascii_bss" : "",
|
|
((options & PCRE2_EXTRA_ASCII_BSW) != 0)? " ascii_bsw" : "",
|
|
+ ((options & PCRE2_EXTRA_ASCII_DIGIT) != 0)? " ascii_digit" : "",
|
|
((options & PCRE2_EXTRA_ASCII_POSIX) != 0)? " ascii_posix" : "",
|
|
((options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) != 0)? " bad_escape_is_literal" : "",
|
|
((options & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)? " caseless_restrict" : "",
|
|
diff --git a/testdata/testinput5 b/testdata/testinput5
|
|
index 0f105408..0624a0c3 100644
|
|
--- a/testdata/testinput5
|
|
+++ b/testdata/testinput5
|
|
@@ -1215,6 +1215,8 @@
|
|
|
|
/[[:digit:]]/B,ucp
|
|
|
|
+/[[:digit:]]/B,ucp,ascii_digit
|
|
+
|
|
/[[:graph:]]/B,ucp
|
|
|
|
/[[:print:]]/B,ucp
|
|
@@ -1227,7 +1229,7 @@
|
|
|
|
/[[:xdigit:]]/B,ucp
|
|
|
|
-# Unicode properties for \b abd \B
|
|
+# Unicode properties for \b and \B
|
|
|
|
/\b...\B/utf,ucp
|
|
abc_
|
|
@@ -2431,6 +2433,12 @@
|
|
/[[:digit:]]+/utf,ucp
|
|
123\x{660}456
|
|
|
|
+/[[:digit:]]+/utf,ucp,ascii_digit
|
|
+ 123\x{660}456
|
|
+
|
|
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
|
+ 123\x{660}456
|
|
+
|
|
/[[:digit:]]+/utf,ucp,ascii_posix
|
|
123\x{660}456
|
|
|
|
diff --git a/testdata/testinput7 b/testdata/testinput7
|
|
index a2b7fb8d..96deaa30 100644
|
|
--- a/testdata/testinput7
|
|
+++ b/testdata/testinput7
|
|
@@ -1657,7 +1657,7 @@
|
|
/^[\p{Xwd}]+/utf
|
|
ABCD1234\x{6ca}\x{a6c}\x{10a7}_
|
|
|
|
-# Unicode properties for \b abd \B
|
|
+# Unicode properties for \b and \B
|
|
|
|
/\b...\B/utf,ucp
|
|
abc_
|
|
@@ -2435,9 +2435,15 @@
|
|
/[[:digit:]]+/utf,ucp
|
|
123\x{660}456
|
|
|
|
+/[[:digit:]]+/utf,ucp,ascii_digit
|
|
+ 123\x{660}456
|
|
+
|
|
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
|
+ 123\x{660}456
|
|
+
|
|
/[[:digit:]]+/utf,ucp,ascii_posix
|
|
123\x{660}456
|
|
-
|
|
+
|
|
/>[[:space:]]+</utf,ucp
|
|
>\x{a0} \x{a0}<
|
|
>\x{a0}\x{a0}\x{a0}<
|
|
diff --git a/testdata/testoutput5 b/testdata/testoutput5
|
|
index 3cee990e..febcc954 100644
|
|
--- a/testdata/testoutput5
|
|
+++ b/testdata/testoutput5
|
|
@@ -2520,6 +2520,14 @@ No match
|
|
End
|
|
------------------------------------------------------------------
|
|
|
|
+/[[:digit:]]/B,ucp,ascii_digit
|
|
+------------------------------------------------------------------
|
|
+ Bra
|
|
+ [0-9]
|
|
+ Ket
|
|
+ End
|
|
+------------------------------------------------------------------
|
|
+
|
|
/[[:graph:]]/B,ucp
|
|
------------------------------------------------------------------
|
|
Bra
|
|
@@ -2568,7 +2576,7 @@ No match
|
|
End
|
|
------------------------------------------------------------------
|
|
|
|
-# Unicode properties for \b abd \B
|
|
+# Unicode properties for \b and \B
|
|
|
|
/\b...\B/utf,ucp
|
|
abc_
|
|
@@ -5359,6 +5367,15 @@ No match
|
|
123\x{660}456
|
|
0: 123\x{660}456
|
|
|
|
+/[[:digit:]]+/utf,ucp,ascii_digit
|
|
+ 123\x{660}456
|
|
+ 0: 123
|
|
+
|
|
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
|
+ 123\x{660}456
|
|
+ 0: 123
|
|
+ 0: 456
|
|
+
|
|
/[[:digit:]]+/utf,ucp,ascii_posix
|
|
123\x{660}456
|
|
0: 123
|
|
diff --git a/testdata/testoutput7 b/testdata/testoutput7
|
|
index 4065981d..d98178e6 100644
|
|
--- a/testdata/testoutput7
|
|
+++ b/testdata/testoutput7
|
|
@@ -2853,7 +2853,7 @@ No match
|
|
ABCD1234\x{6ca}\x{a6c}\x{10a7}_
|
|
0: ABCD1234\x{6ca}\x{a6c}\x{10a7}_
|
|
|
|
-# Unicode properties for \b abd \B
|
|
+# Unicode properties for \b and \B
|
|
|
|
/\b...\B/utf,ucp
|
|
abc_
|
|
@@ -4080,10 +4080,19 @@ No match
|
|
123\x{660}456
|
|
0: 123\x{660}456
|
|
|
|
+/[[:digit:]]+/utf,ucp,ascii_digit
|
|
+ 123\x{660}456
|
|
+ 0: 123
|
|
+
|
|
+/[[:digit:]]+/g,utf,ucp,ascii_digit
|
|
+ 123\x{660}456
|
|
+ 0: 123
|
|
+ 0: 456
|
|
+
|
|
/[[:digit:]]+/utf,ucp,ascii_posix
|
|
123\x{660}456
|
|
0: 123
|
|
-
|
|
+
|
|
/>[[:space:]]+</utf,ucp
|
|
>\x{a0} \x{a0}<
|
|
0: >\x{a0} \x{a0}<
|
|
--
|
|
2.33.0
|
|
|