From 64549346f044dec18d18d06c2d08a68a68e26817 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= Date: Sun, 9 Apr 2023 04:29:46 -0700 Subject: [PATCH] avoid inconsistency between \d and [:digit:] when using /a (#223) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Conflict:don't modify Changelog and doc/*; keep pcre2.h.generic consistent with pcre2.h.in according to 1de7291 Reference:https://github.com/PCRE2Project/pcre2/commit/64549346f044dec18d18d06c2d08a68a68e26817 Since a608946 (Additional PCRE2_EXTRA_ASCII_xxx code, 2023-02-01) PCRE2_EXTRA_ASCII_BSD could be used to restrict \d to ASCII causing the following inconsistent behaviour in UCP mode. PCRE2 version 10.43-DEV 2023-01-15 re> /\d/utf,ucp,ascii_bsd data> ٣ No match data> re> /[[:digit:]]/utf,ucp,ascii_bsd data> ٣ 0: \x{663} It has been suggested[1] that the change to match \p{Nd} when Unicode is enabled for [:digit:] might have been unintentional and a bug, as [:digit:] should be able to be POSIX compatible, so add a new flag PCRE2_EXTRA_ASCII_DIGIT to avoid changing its definition in UCP mode. 
[1] https://lore.kernel.org/git/CANgJU+U+xXsh9psd0z5Xjr+Se5QgdKkjQ7LUQ-PdUULSN3n4+g@mail.gmail.com/ --- src/pcre2.h.generic | 6 ++++++ src/pcre2.h.in | 1 + src/pcre2_compile.c | 6 ++++-- src/pcre2test.c | 4 +++- testdata/testinput5 | 10 +++++++++- testdata/testinput7 | 10 ++++++++-- testdata/testoutput5 | 19 ++++++++++++++++++- testdata/testoutput7 | 13 +++++++++++-- 8 files changed, 60 insertions(+), 9 deletions(-) diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic index dad774ce..05cf9bc1 100644 --- a/src/pcre2.h.generic +++ b/src/pcre2.h.generic @@ -153,6 +153,12 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */ #define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */ #define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */ +#define PCRE2_EXTRA_CASELESS_RESTRICT 0x00000080u /* C */ +#define PCRE2_EXTRA_ASCII_BSD 0x00000100u /* C */ +#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */ +#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */ +#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */ +#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */ /* These are for pcre2_jit_compile(). */ diff --git a/src/pcre2.h.in b/src/pcre2.h.in index 7202c633..cd7fdcf2 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -158,6 +158,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */ #define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */ #define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */ +#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */ /* These are for pcre2_jit_compile(). */ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 95c4a79d..634360b7 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -786,7 +786,8 @@ are allowed. 
*/ PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \ PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \ PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \ - PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX) + PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \ + PCRE2_EXTRA_ASCII_DIGIT) /* Compile time error code numbers. They are given names so that they can more easily be tracked. When a new number is added, the tables called eint1 and @@ -3581,7 +3582,8 @@ while (ptr < ptrend) #ifdef SUPPORT_UNICODE if ((options & PCRE2_UCP) != 0 && - (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0) + (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 && + !(posix_class == 7 && (xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0)) { int ptype = posix_substitutes[2*posix_class]; int pvalue = posix_substitutes[2*posix_class + 1]; diff --git a/src/pcre2test.c b/src/pcre2test.c index 4da3ef90..21b19370 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -651,6 +651,7 @@ static modstruct modlist[] = { { "ascii_bsd", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSD, CO(extra_options) }, { "ascii_bss", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSS, CO(extra_options) }, { "ascii_bsw", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSW, CO(extra_options) }, + { "ascii_digit", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_DIGIT, CO(extra_options) }, { "ascii_posix", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_POSIX, CO(extra_options) }, { "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) }, { "bad_escape_is_literal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL, CO(extra_options) }, @@ -4294,13 +4295,14 @@ show_compile_extra_options(uint32_t options, const char *before, const char *after) { if (options == 0) fprintf(outfile, "%s %s", before, after); -else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s", +else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, ((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? 
" allow_surrogate_escapes" : "", ((options & PCRE2_EXTRA_ALT_BSUX) != 0)? " alt_bsux" : "", ((options & PCRE2_EXTRA_ASCII_BSD) != 0)? " ascii_bsd" : "", ((options & PCRE2_EXTRA_ASCII_BSS) != 0)? " ascii_bss" : "", ((options & PCRE2_EXTRA_ASCII_BSW) != 0)? " ascii_bsw" : "", + ((options & PCRE2_EXTRA_ASCII_DIGIT) != 0)? " ascii_digit" : "", ((options & PCRE2_EXTRA_ASCII_POSIX) != 0)? " ascii_posix" : "", ((options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) != 0)? " bad_escape_is_literal" : "", ((options & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)? " caseless_restrict" : "", diff --git a/testdata/testinput5 b/testdata/testinput5 index 0f105408..0624a0c3 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -1215,6 +1215,8 @@ /[[:digit:]]/B,ucp +/[[:digit:]]/B,ucp,ascii_digit + /[[:graph:]]/B,ucp /[[:print:]]/B,ucp @@ -1227,7 +1229,7 @@ /[[:xdigit:]]/B,ucp -# Unicode properties for \b abd \B +# Unicode properties for \b and \B /\b...\B/utf,ucp abc_ @@ -2431,6 +2433,12 @@ /[[:digit:]]+/utf,ucp 123\x{660}456 +/[[:digit:]]+/utf,ucp,ascii_digit + 123\x{660}456 + +/[[:digit:]]+/g,utf,ucp,ascii_digit + 123\x{660}456 + /[[:digit:]]+/utf,ucp,ascii_posix 123\x{660}456 diff --git a/testdata/testinput7 b/testdata/testinput7 index a2b7fb8d..96deaa30 100644 --- a/testdata/testinput7 +++ b/testdata/testinput7 @@ -1657,7 +1657,7 @@ /^[\p{Xwd}]+/utf ABCD1234\x{6ca}\x{a6c}\x{10a7}_ -# Unicode properties for \b abd \B +# Unicode properties for \b and \B /\b...\B/utf,ucp abc_ @@ -2435,9 +2435,15 @@ /[[:digit:]]+/utf,ucp 123\x{660}456 +/[[:digit:]]+/utf,ucp,ascii_digit + 123\x{660}456 + +/[[:digit:]]+/g,utf,ucp,ascii_digit + 123\x{660}456 + /[[:digit:]]+/utf,ucp,ascii_posix 123\x{660}456 - + />[[:space:]]+\x{a0} \x{a0}< >\x{a0}\x{a0}\x{a0}< diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 3cee990e..febcc954 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -2520,6 +2520,14 @@ No match End ------------------------------------------------------------------ 
+/[[:digit:]]/B,ucp,ascii_digit +------------------------------------------------------------------ + Bra + [0-9] + Ket + End +------------------------------------------------------------------ + /[[:graph:]]/B,ucp ------------------------------------------------------------------ Bra @@ -2568,7 +2576,7 @@ No match End ------------------------------------------------------------------ -# Unicode properties for \b abd \B +# Unicode properties for \b and \B /\b...\B/utf,ucp abc_ @@ -5359,6 +5367,15 @@ No match 123\x{660}456 0: 123\x{660}456 +/[[:digit:]]+/utf,ucp,ascii_digit + 123\x{660}456 + 0: 123 + +/[[:digit:]]+/g,utf,ucp,ascii_digit + 123\x{660}456 + 0: 123 + 0: 456 + /[[:digit:]]+/utf,ucp,ascii_posix 123\x{660}456 0: 123 diff --git a/testdata/testoutput7 b/testdata/testoutput7 index 4065981d..d98178e6 100644 --- a/testdata/testoutput7 +++ b/testdata/testoutput7 @@ -2853,7 +2853,7 @@ No match ABCD1234\x{6ca}\x{a6c}\x{10a7}_ 0: ABCD1234\x{6ca}\x{a6c}\x{10a7}_ -# Unicode properties for \b abd \B +# Unicode properties for \b and \B /\b...\B/utf,ucp abc_ @@ -4080,10 +4080,19 @@ No match 123\x{660}456 0: 123\x{660}456 +/[[:digit:]]+/utf,ucp,ascii_digit + 123\x{660}456 + 0: 123 + +/[[:digit:]]+/g,utf,ucp,ascii_digit + 123\x{660}456 + 0: 123 + 0: 456 + /[[:digit:]]+/utf,ucp,ascii_posix 123\x{660}456 0: 123 - + />[[:space:]]+\x{a0} \x{a0}< 0: >\x{a0} \x{a0}< -- 2.33.0