jq/Correct-UTF-8-and-UTF-16-errors-during-concatenation.patch

389 lines
16 KiB
Diff

From 8829368f14943b8d2674c75805b27e56a569ad2c Mon Sep 17 00:00:00 2001
From: Max Zerzouri <maxdamantus@gmail.com>
Date: Tue, 25 May 2021 22:59:59 +1200
Subject: [PATCH] Correct UTF-8 and UTF-16 errors during concatenation
UTF-8 errors and UTF-16 errors that were previously encoded into the ends of
strings will now potentially be used to form correct code points.
This is mostly a matter of making string equality behave as expected, since
without this normalisation, it is possible to produce `jv` strings that are
converted to UTF-8 or UTF-16 the same way but are not equal due to well-formed
code units that may or may not be encoded as errors.
---
src/jv.c | 13 ++-
src/jv_unicode.c | 248 ++++++++++++++++++++++++++++++++++++++---------
src/jv_unicode.h | 3 +
tests/jq.test | 15 +++
4 files changed, 230 insertions(+), 49 deletions(-)
diff --git a/src/jv.c b/src/jv.c
index e979cc6..67d86fb 100644
--- a/src/jv.c
+++ b/src/jv.c
@@ -522,20 +522,27 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) {
jvp_string* s = jvp_string_ptr(string);
uint32_t currlen = jvp_string_length(s);
+ char join_buf[4];
+ int join_len = jvp_utf8_extended_join(s->data, &currlen, &data, &len, join_buf);
+
if (jvp_refcnt_unshared(string.u.ptr) &&
- jvp_string_remaining_space(s) >= len) {
+ jvp_string_remaining_space(s) >= join_len + len) {
// the next string fits at the end of a
+ memcpy(s->data + currlen, join_buf, join_len);
+ currlen += join_len;
memcpy(s->data + currlen, data, len);
s->data[currlen + len] = 0;
s->length_hashed = (currlen + len) << 1;
return string;
} else {
// allocate a bigger buffer and copy
- uint32_t allocsz = (currlen + len) * 2;
+ uint32_t allocsz = (currlen + join_len + len) * 2;
if (allocsz < 32) allocsz = 32;
jvp_string* news = jvp_string_alloc(allocsz);
- news->length_hashed = (currlen + len) << 1;
+ news->length_hashed = (currlen + join_len + len) << 1;
memcpy(news->data, s->data, currlen);
+ memcpy(news->data + currlen, join_buf, join_len);
+ currlen += join_len;
memcpy(news->data + currlen, data, len);
news->data[currlen + len] = 0;
jvp_string_free(string);
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
index 8c47536..7d67300 100644
--- a/src/jv_unicode.c
+++ b/src/jv_unicode.c
@@ -1,8 +1,72 @@
#include <stdio.h>
+#include <string.h>
#include <assert.h>
#include "jv_unicode.h"
#include "jv_utf8_tables.h"
+// length of encoding of erroneous UTF-8 byte
+#define UTF8_ERR_LEN 2
+// length of encoding of erroneous UTF-16 surrogate
+#define UTF16_ERR_LEN 3
+
+#define U32(a, b, c, d) ( \
+ (uint32_t) (a) << 0 | \
+ (uint32_t) (b) << 8 | \
+ (uint32_t) (c) << 16 | \
+ (uint32_t) (d) << 24 \
+)
+
+#define BYTE(u32, n) ((uint32_t) (((u32) >> (n)*8) & 0xFF))
+
+#define B0 0x00 // 00000000
+#define B1 0x80 // 10000000
+#define B2 0xC0 // 11000000
+#define B3 0xE0 // 11100000
+#define B4 0xF0 // 11110000
+#define B5 0xF8 // 11111000
+
+// bit flags selecting which coding patterns `decode` tries; NOTE: likely to be optimised out as `decode` gets inlined
+enum decode_flags {
+ DECODE_1 = 1,
+ DECODE_2 = 2,
+ DECODE_3 = 8,
+ DECODE_4 = 16
+};
+
+// decode one sequence of "generalised UTF-8" from up to 4 bytes packed into `data`
+// (first byte in the low 8 bits); no checking for overlong codings or out-of-range
+// code points. each pattern enabled in `flags` is tried by testing its fixed bits, then
+// shifting the value bits. returns the length; on no match, 1 with `*codepoint_ret = -1`
+static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) {
+ if((flags & DECODE_1) && (data & U32(B1, B0, B0, B0)) == 0){
+ *codepoint_ret = BYTE(data, 0);
+ return 1;
+ }
+ if((flags & DECODE_2) && (data & U32(B3, B2, B0, B0)) == U32(B2, B1, B0, B0)){
+ *codepoint_ret =
+ (BYTE(data, 0) & ~B3) << 6 |
+ (BYTE(data, 1) & ~B2) << 0;
+ return 2;
+ }
+ if((flags & DECODE_3) && (data & U32(B4, B2, B2, B0)) == U32(B3, B1, B1, B0)){
+ *codepoint_ret =
+ (BYTE(data, 0) & ~B4) << 12 |
+ (BYTE(data, 1) & ~B2) << 6 |
+ (BYTE(data, 2) & ~B2) << 0;
+ return 3;
+ }
+ if((flags & DECODE_4) && (data & U32(B5, B2, B2, B2)) == U32(B4, B1, B1, B1)){
+ *codepoint_ret =
+ (BYTE(data, 0) & ~B5) << 18 |
+ (BYTE(data, 1) & ~B2) << 12 |
+ (BYTE(data, 2) & ~B2) << 6 |
+ (BYTE(data, 3) & ~B2) << 0;
+ return 4;
+ }
+ *codepoint_ret = -1;
+ return 1;
+}
+
// jvp_utf8_backtrack returns the beginning of the last codepoint in the
// string, assuming that start is the last byte in the string.
// If the last codepoint is incomplete, returns the number of missing bytes via
@@ -81,56 +145,42 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf
if (in == end) {
return 0;
}
- int codepoint = -1;
- unsigned char first = (unsigned char)in[0];
- int length = utf8_coding_length[first];
- if ((first & 0x80) == 0) {
+ uint32_t data = in[0] & 0xFF;
+ if ((data & B1) == 0) {
/* Fast-path for ASCII */
- codepoint = first;
- length = 1;
- } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
- /* Bad single byte - either an invalid byte or an out-of-place continuation byte */
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte");
- length = 1;
- } else if (in + length > end) {
- /* String ends before UTF8 sequence ends */
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun");
- length = end - in;
- } else {
- codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
- for (int i=1; i<length; i++) {
- unsigned ch = (unsigned char)in[i];
- if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){
- /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: wrong bytes");
- codepoint = -1;
- length = i;
- break;
- }
- codepoint = (codepoint << 6) | (ch & 0x3f);
- }
- if (codepoint < utf8_first_codepoint[length]) {
- /* Overlong UTF8 sequence */
- if ((flags & JVP_UTF8_ERRORS_UTF8) && 0x00 <= codepoint && codepoint <= 0x7F) {
- /* UTF-8 error is emitted as a negative codepoint */
- codepoint = -(codepoint + 0x80);
- } else {
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
- codepoint = -1;
- }
- }
- if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
- /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
- if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
- /* Surrogate codepoints can't be encoded in UTF8 */
- codepoint = -1;
- }
+ *codepoint_ret = data;
+ return in + 1;
+ }
+ switch (end - in) {
+ default: // fall through
+ case 4: data |= (uint32_t)(in[3] & 0xFF) << 24; // fall through
+ case 3: data |= (uint32_t)(in[2] & 0xFF) << 16; // fall through
+ case 2: data |= (uint32_t)(in[1] & 0xFF) << 8; // fall through
+ case 1: break;
+ }
+ int codepoint;
+ int length = decode(DECODE_2 | DECODE_3 | DECODE_4, data, &codepoint);
+ if (codepoint == -1) {
+ if (flags & JVP_UTF8_ERRORS_UTF8) assert(0 && "Invalid WTF-8b sequence: no match");
+ } else if (codepoint < utf8_first_codepoint[length]) {
+ /* Overlong UTF-8 sequence */
+ if ((flags & JVP_UTF8_ERRORS_UTF8) && length == UTF8_ERR_LEN && 0x00 <= codepoint && codepoint <= 0x7F) {
+ /* UTF-8 error is emitted as a negative codepoint */
+ codepoint = -(codepoint + 0x80);
+ } else {
+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
+ codepoint = -1;
}
- if (codepoint > 0x10FFFF) {
- /* Outside Unicode range */
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
+ } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
+ /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
+ if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
+ /* Surrogate codepoints can't be encoded in UTF8 */
codepoint = -1;
}
+ } else if (codepoint > 0x10FFFF) {
+ /* Outside Unicode range */
+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
+ codepoint = -1;
}
if (codepoint == -1 && (flags & JVP_UTF8_REPLACE))
codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
@@ -139,6 +189,112 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf
return in + length;
}
+// returns the original byte (0x80..0xFF) encoded as a UTF-8 error at `in`, or -1 if there is none; assumes two bytes are readable from `in`
+static int decode_utf8_error(const char* in) {
+ uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, 0, 0);
+ int codepoint;
+ if (decode(DECODE_2, data, &codepoint) == UTF8_ERR_LEN && codepoint < 0x80)
+ return codepoint + 0x80;
+ return -1;
+}
+
+// assumes three bytes are readable from `in`
+static int decode_utf16_error(const char* in) {
+ uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, in[2] & 0xFF, 0);
+ int codepoint;
+ if (decode(DECODE_3, data, &codepoint) == UTF16_ERR_LEN && codepoint >= 0xD800 && codepoint < 0xDFFF)
+ return codepoint;
+ return -1;
+}
+
+// jvp_utf8_extended_join attempts to turn errors at the end of `a` and the
+// beginning of `b` into a valid code point: either a lead/tail surrogate pair
+// or encoded UTF-8 error bytes that together form a strict UTF-8 sequence. if
+// a correction is possible, `*alen_io`, `*bstart_io` and `*blen_io` are updated
+// to exclude the existing errors, and the UTF-8 encoding of the code point to
+// insert is stored in `out` (which must have room for 4 bytes). returns the number
+// of bytes to insert from `out` between the two strings, or 0 if there are none.
+int jvp_utf8_extended_join(const char* astart, uint32_t* alen_io, const char** bstart_io, uint32_t* blen_io, char* out) {
+ const char* aend = astart + *alen_io;
+ const char* bstart = *bstart_io;
+ const char* bend = bstart + *blen_io;
+ int bcp;
+ bstart = jvp_utf8_extended_next(bstart, bend, JVP_UTF8_ERRORS_ALL, &bcp);
+ if (!bstart) {
+ // `b` is empty, so there is nothing to join
+ return 0;
+ }
+ if (bcp >= 0xDC00 && bcp <= 0xDFFF) {
+ // `b` starts with a UTF-16 tail surrogate, look for a lead surrogate at the end of `a`
+ assert(bstart == *bstart_io + UTF16_ERR_LEN);
+ if (aend - astart < UTF16_ERR_LEN)
+ return 0;
+ int acp = decode_utf16_error(aend - UTF16_ERR_LEN);
+ if (acp >= 0xD800 && acp <= 0xDBFF) {
+ // UTF-16 lead surrogate: drop both encoded surrogates and emit the code point of the pair
+ *alen_io -= UTF16_ERR_LEN;
+ *blen_io -= UTF16_ERR_LEN;
+ *bstart_io += UTF16_ERR_LEN;
+ int codepoint = 0x10000 + (((acp - 0xD800) << 10) | (bcp - 0xDC00));
+ return jvp_utf8_encode(codepoint, out);
+ }
+ return 0;
+ }
+ if (bcp >= -0xFF && bcp <= -0x80) {
+ // `b` starts with a UTF-8 error (negative code point); if the byte is a continuation byte, search backwards in `a` for the leading byte
+ bcp = -bcp;
+ assert(bstart == *bstart_io + UTF8_ERR_LEN);
+ if (utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE)
+ return 0;
+ // raw bytes are collected around the middle of `buf`: up to 3 taken from `a` grow downwards from `buf + 3`, up to 3 taken from `b` grow upwards
+ unsigned char buf[6];
+ unsigned char* bufstart = buf + 3;
+ unsigned char* bufend = bufstart;
+ *bufend++ = bcp;
+ int length;
+ // walk backwards over encoded error bytes in `a` until a leading byte is found
+ for (;;) {
+ if (aend - astart < UTF8_ERR_LEN)
+ return 0; // `a` is too short to hold another encoded error
+ int acp = decode_utf8_error(aend - UTF8_ERR_LEN);
+ if (acp == -1)
+ return 0; // `a` does not end with a UTF-8 error
+ aend -= UTF8_ERR_LEN;
+ length = utf8_coding_length[acp];
+ if (length == 0)
+ return 0; // byte can never appear in UTF-8
+ *--bufstart = acp;
+ if (length != UTF8_CONTINUATION_BYTE)
+ break; // found leading byte, `length` is the sequence length it announces
+ if (bufstart == buf)
+ return 0; // three continuation bytes taken from `a` and still no leading byte
+ }
+ if (bufend - bufstart > length)
+ return 0; // more bytes collected than the leading byte allows
+ // walk forwards over encoded error bytes in `b` for any continuation bytes still needed
+ while (bufend - bufstart < length) {
+ if (bend - bstart < UTF8_ERR_LEN)
+ return 0; // `b` is too short to hold another encoded error
+ bcp = decode_utf8_error(bstart);
+ if (bcp == -1 || utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE)
+ return 0; // not a UTF-8 error or not a continuation byte, so the sequence cannot be completed
+ bstart += UTF8_ERR_LEN;
+ *bufend++ = bcp;
+ }
+ int codepoint;
+ // only accept the collected bytes if they decode as strict UTF-8 (flags 0: no extended error encodings)
+ jvp_utf8_extended_next((char*)bufstart, (char*)bufend, 0, &codepoint);
+ if (codepoint != -1) {
+ memcpy(out, bufstart, 4);
+ *alen_io = aend - astart;
+ *blen_io = bend - bstart;
+ *bstart_io = bstart;
+ return bufend - bufstart;
+ }
+ }
+ return 0;
+}
+
int jvp_utf8_is_valid(const char* in, const char* end) {
int codepoint;
while ((in = jvp_utf8_extended_next(in, end, 0, &codepoint))) {
diff --git a/src/jv_unicode.h b/src/jv_unicode.h
index 37c7fc0..ff2a437 100644
--- a/src/jv_unicode.h
+++ b/src/jv_unicode.h
@@ -1,6 +1,8 @@
#ifndef JV_UNICODE_H
#define JV_UNICODE_H
+#include <stdint.h>
+
enum jvp_utf8_flags {
/* Emit replacement character instead of -1 for errors */
JVP_UTF8_REPLACE = 1,
@@ -14,6 +16,7 @@ enum jvp_utf8_flags {
const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes);
const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint);
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
+int jvp_utf8_extended_join(const char* astart, uint32_t* alen, const char** bstart, uint32_t* blen, char* out);
int jvp_utf8_is_valid(const char* in, const char* end);
int jvp_utf8_decode_length(char startchar);
diff --git a/tests/jq.test b/tests/jq.test
index c882fd2..9e6c896 100644
--- a/tests/jq.test
+++ b/tests/jq.test
@@ -62,6 +62,11 @@ null
null
"∀\ud800∃\udc00∅\udfff"
+# Check that unpaired surrogates are paired when concatenated
+add
+["\ud83d","\ude43","\ud83e","\udd11","\ud83e","\udd17","\ud83e","\udd14","\ud83e","\udd10","\ud83d","\ude44","\ud83e","\udd12","\ud83e","\udd15","\ud83e","\udd13","\ud83e","\udd16","\ud83e","\udd18","\ud83c","\udffb","\ud83c","\udffc"]
+"🙃🤑🤗🤔🤐🙄🤒🤕🤓🤖🤘🏻🏼"
+
"inter\("pol" + "ation")"
null
"interpolation"
@@ -87,6 +92,16 @@ null
"Zm/Ds2Jhcgo="
"foóbar\n"
+# test correction of UTF-8 errors when concatenating as binary data (input is a random sequence of code points)
+. as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text
+"򍨼衍򙮬񪜁򻴠󖂡󔁰񗏷󛊭񢠃򍧝𭌞󹰞󙴋𿋓󧜹򳔎񦰓򅆹򽐟󂑛򶃯㾱ꕽ񂊛򉙲򅤎􃖣󻣸󁸦򴏜򽃿􄑏󠦱񄛲񄕵񡿚򮩒񡏂򨆯򶚒󎮆󉨗򡮟򆿴񬏪򻀅㫑񉒗󴍶󬪸񝶑񂾑򇔣򉩉􂞇𲡀𨫆򤵇𲺝\u001c񖂟񳐉󲔹𳨬􀮔𸒙񜶻㊬񓐊񽒬󑀧󗧚󞌶󦥥𗌽𘀍󴼹􌇺򫗛񂷶󏷕񜁍񥬟󼁁󓺉𗟒򷝊𩕃񞝏񧄀󁲩򐀄򳂸񲊷򃀋񃫫𝷏򏖝򷂍󢭣􋛨𞪒򁁅勸󯩥󵪭񚮚򻡍騎񾊯򪓚񗡈񎕫򡯬񋫠ᕴ𞨹󾄇񩠶𙯾񢥱𚯴񬥷󢶖񾹌񡈟򧓑񒾘𚸯񳗺񭟡𫸬񷤖񷆐𖋌񦰃椀𫎾󗚋𿋆󈝰񺥲򝕊𵯮򙧚󬱃󍗞󱆃󂟙󟆺񻢬󸮤󗗉񉛮𺵡𰣒􁋙񻍛􇡘ᮍ񕥸񨵂盕嗪𻸮򶆍򊈤񽓎󙴐𗬜󾱒󷹰􇡈񨦎􏥩񴲡𨑮򱏝𭢊󕁶򣙥󶡮󮰌󿙾氕񼻘􆔪񢕀񊿃󮨝񑛖󣴊󎎏򳞓㊁󒭀󇜳𯄌𻙩"
+true
+
+# test preservation of binary data when concatenating (input is a random sequence of UTF-16 surrogates encoded in WTF-8, should be treated as regular UTF-8 errors)
+@base64d | . as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text
+"7bKv7aiz7auX7aG37aO77aOe7auy7bmm7bqk7aG87bSH7a6m7bmc7bum7bqj7au+7bqf7aap7buC7byq7aS37aCp7aSl7a+a7bur7aGV7bGl7b6M7biB7aOe7ayR7amW7aOX7b637a+P7bu+7ayP7bOw7ba/7ayp7b6G7aqd7bG37bK57b6O7bq27a+u7a2N7ayu7bKK"
+true
+
@uri
"\u03bc"
"%CE%BC"