389 lines
16 KiB
Diff
389 lines
16 KiB
Diff
From 8829368f14943b8d2674c75805b27e56a569ad2c Mon Sep 17 00:00:00 2001
|
|
From: Max Zerzouri <maxdamantus@gmail.com>
|
|
Date: Tue, 25 May 2021 22:59:59 +1200
|
|
Subject: [PATCH] Correct UTF-8 and UTF-16 errors during concatenation
|
|
|
|
UTF-8 errors and UTF-16 errors that were previously encoded into the
|
|
ends of
|
|
strings will now potentially be used to form correct code points.
|
|
|
|
This is mostly a matter of making string equality behave expectedly, since
|
|
without this normalisation, it is possible to produce `jv` strings that are
|
|
converted to UTF-8 or UTF-16 the same way but are not equal due well-formed
|
|
code units that may or may not be encoded as errors.
|
|
---
|
|
src/jv.c | 13 ++-
|
|
src/jv_unicode.c | 248 ++++++++++++++++++++++++++++++++++++++---------
|
|
src/jv_unicode.h | 3 +
|
|
tests/jq.test | 15 +++
|
|
4 files changed, 230 insertions(+), 49 deletions(-)
|
|
|
|
diff --git a/src/jv.c b/src/jv.c
|
|
index e979cc6..67d86fb 100644
|
|
--- a/src/jv.c
|
|
+++ b/src/jv.c
|
|
@@ -522,20 +522,27 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) {
|
|
jvp_string* s = jvp_string_ptr(string);
|
|
uint32_t currlen = jvp_string_length(s);
|
|
|
|
+ char join_buf[4];
|
|
+ int join_len = jvp_utf8_extended_join(s->data, &currlen, &data, &len, join_buf);
|
|
+
|
|
if (jvp_refcnt_unshared(string.u.ptr) &&
|
|
- jvp_string_remaining_space(s) >= len) {
|
|
+ jvp_string_remaining_space(s) >= join_len + len) {
|
|
// the next string fits at the end of a
|
|
+ memcpy(s->data + currlen, join_buf, join_len);
|
|
+ currlen += join_len;
|
|
memcpy(s->data + currlen, data, len);
|
|
s->data[currlen + len] = 0;
|
|
s->length_hashed = (currlen + len) << 1;
|
|
return string;
|
|
} else {
|
|
// allocate a bigger buffer and copy
|
|
- uint32_t allocsz = (currlen + len) * 2;
|
|
+ uint32_t allocsz = (currlen + join_len + len) * 2;
|
|
if (allocsz < 32) allocsz = 32;
|
|
jvp_string* news = jvp_string_alloc(allocsz);
|
|
- news->length_hashed = (currlen + len) << 1;
|
|
+ news->length_hashed = (currlen + join_len + len) << 1;
|
|
memcpy(news->data, s->data, currlen);
|
|
+ memcpy(news->data + currlen, join_buf, join_len);
|
|
+ currlen += join_len;
|
|
memcpy(news->data + currlen, data, len);
|
|
news->data[currlen + len] = 0;
|
|
jvp_string_free(string);
|
|
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
|
|
index 8c47536..7d67300 100644
|
|
--- a/src/jv_unicode.c
|
|
+++ b/src/jv_unicode.c
|
|
@@ -1,8 +1,72 @@
|
|
#include <stdio.h>
|
|
+#include <string.h>
|
|
#include <assert.h>
|
|
#include "jv_unicode.h"
|
|
#include "jv_utf8_tables.h"
|
|
|
|
+// length of encoding of erroneous UTF-8 byte
|
|
+#define UTF8_ERR_LEN 2
|
|
+// length of encoding of erroneous UTF-16 surrogate
|
|
+#define UTF16_ERR_LEN 3
|
|
+
|
|
+#define U32(a, b, c, d) ( \
|
|
+ (uint32_t) (a) << 0 | \
|
|
+ (uint32_t) (b) << 8 | \
|
|
+ (uint32_t) (c) << 16 | \
|
|
+ (uint32_t) (d) << 24 \
|
|
+)
|
|
+
|
|
+#define BYTE(u32, n) ((uint32_t) (((u32) >> (n)*8) & 0xFF))
|
|
+
|
|
+#define B0 0x00 // 00000000
|
|
+#define B1 0x80 // 10000000
|
|
+#define B2 0xC0 // 11000000
|
|
+#define B3 0xE0 // 11100000
|
|
+#define B4 0xF0 // 11110000
|
|
+#define B5 0xF8 // 11111000
|
|
+
|
|
+// NOTE: these flags are likely to be optimised out as `decode` gets inlined
|
|
+enum decode_flags {
|
|
+ DECODE_1 = 1,
|
|
+ DECODE_2 = 2,
|
|
+ DECODE_3 = 8,
|
|
+ DECODE_4 = 16
|
|
+};
|
|
+
|
|
+// decode up to 4 bytes of "generalised UTF-8"; no checking for overlong
|
|
+// codings or out-of-range code points, works by testing all fixed bits in each
|
|
+// of the 4 coding patterns, then shifting the value bits according to the
|
|
+// pattern
|
|
+static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) {
|
|
+ if((flags & DECODE_1) && (data & U32(B1, B0, B0, B0)) == 0){
|
|
+ *codepoint_ret = BYTE(data, 0);
|
|
+ return 1;
|
|
+ }
|
|
+ if((flags & DECODE_2) && (data & U32(B3, B2, B0, B0)) == U32(B2, B1, B0, B0)){
|
|
+ *codepoint_ret =
|
|
+ (BYTE(data, 0) & ~B3) << 6 |
|
|
+ (BYTE(data, 1) & ~B2) << 0;
|
|
+ return 2;
|
|
+ }
|
|
+ if((flags & DECODE_3) && (data & U32(B4, B2, B2, B0)) == U32(B3, B1, B1, B0)){
|
|
+ *codepoint_ret =
|
|
+ (BYTE(data, 0) & ~B4) << 12 |
|
|
+ (BYTE(data, 1) & ~B2) << 6 |
|
|
+ (BYTE(data, 2) & ~B2) << 0;
|
|
+ return 3;
|
|
+ }
|
|
+ if((flags & DECODE_4) && (data & U32(B5, B2, B2, B2)) == U32(B4, B1, B1, B1)){
|
|
+ *codepoint_ret =
|
|
+ (BYTE(data, 0) & ~B5) << 18 |
|
|
+ (BYTE(data, 1) & ~B2) << 12 |
|
|
+ (BYTE(data, 2) & ~B2) << 6 |
|
|
+ (BYTE(data, 3) & ~B2) << 0;
|
|
+ return 4;
|
|
+ }
|
|
+ *codepoint_ret = -1;
|
|
+ return 1;
|
|
+}
|
|
+
|
|
// jvp_utf8_backtrack returns the beginning of the last codepoint in the
|
|
// string, assuming that start is the last byte in the string.
|
|
// If the last codepoint is incomplete, returns the number of missing bytes via
|
|
@@ -81,56 +145,42 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf
|
|
if (in == end) {
|
|
return 0;
|
|
}
|
|
- int codepoint = -1;
|
|
- unsigned char first = (unsigned char)in[0];
|
|
- int length = utf8_coding_length[first];
|
|
- if ((first & 0x80) == 0) {
|
|
+ uint32_t data = in[0] & 0xFF;
|
|
+ if ((data & B1) == 0) {
|
|
/* Fast-path for ASCII */
|
|
- codepoint = first;
|
|
- length = 1;
|
|
- } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
|
|
- /* Bad single byte - either an invalid byte or an out-of-place continuation byte */
|
|
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte");
|
|
- length = 1;
|
|
- } else if (in + length > end) {
|
|
- /* String ends before UTF8 sequence ends */
|
|
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun");
|
|
- length = end - in;
|
|
- } else {
|
|
- codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
|
|
- for (int i=1; i<length; i++) {
|
|
- unsigned ch = (unsigned char)in[i];
|
|
- if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){
|
|
- /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
|
|
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: wrong bytes");
|
|
- codepoint = -1;
|
|
- length = i;
|
|
- break;
|
|
- }
|
|
- codepoint = (codepoint << 6) | (ch & 0x3f);
|
|
- }
|
|
- if (codepoint < utf8_first_codepoint[length]) {
|
|
- /* Overlong UTF8 sequence */
|
|
- if ((flags & JVP_UTF8_ERRORS_UTF8) && 0x00 <= codepoint && codepoint <= 0x7F) {
|
|
- /* UTF-8 error is emitted as a negative codepoint */
|
|
- codepoint = -(codepoint + 0x80);
|
|
- } else {
|
|
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
|
|
- codepoint = -1;
|
|
- }
|
|
- }
|
|
- if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
|
|
- /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
|
|
- if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
|
|
- /* Surrogate codepoints can't be encoded in UTF8 */
|
|
- codepoint = -1;
|
|
- }
|
|
+ *codepoint_ret = data;
|
|
+ return in + 1;
|
|
+ }
|
|
+ switch (end - in) {
|
|
+ default: // fall through
|
|
+ case 4: data |= (uint32_t)(in[3] & 0xFF) << 24; // fall through
|
|
+ case 3: data |= (uint32_t)(in[2] & 0xFF) << 16; // fall through
|
|
+ case 2: data |= (uint32_t)(in[1] & 0xFF) << 8; // fall through
|
|
+ case 1: break;
|
|
+ }
|
|
+ int codepoint;
|
|
+ int length = decode(DECODE_2 | DECODE_3 | DECODE_4, data, &codepoint);
|
|
+ if (codepoint == -1) {
|
|
+ if (flags & JVP_UTF8_ERRORS_UTF8) assert(0 && "Invalid WTF-8b sequence: no match");
|
|
+ } else if (codepoint < utf8_first_codepoint[length]) {
|
|
+ /* Overlong UTF-8 sequence */
|
|
+ if ((flags & JVP_UTF8_ERRORS_UTF8) && length == UTF8_ERR_LEN && 0x00 <= codepoint && codepoint <= 0x7F) {
|
|
+ /* UTF-8 error is emitted as a negative codepoint */
|
|
+ codepoint = -(codepoint + 0x80);
|
|
+ } else {
|
|
+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
|
|
+ codepoint = -1;
|
|
}
|
|
- if (codepoint > 0x10FFFF) {
|
|
- /* Outside Unicode range */
|
|
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
|
|
+ } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
|
|
+ /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
|
|
+ if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
|
|
+ /* Surrogate codepoints can't be encoded in UTF8 */
|
|
codepoint = -1;
|
|
}
|
|
+ } else if (codepoint > 0x10FFFF) {
|
|
+ /* Outside Unicode range */
|
|
+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
|
|
+ codepoint = -1;
|
|
}
|
|
if (codepoint == -1 && (flags & JVP_UTF8_REPLACE))
|
|
codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
|
|
@@ -139,6 +189,112 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf
|
|
return in + length;
|
|
}
|
|
|
|
+// assumes two bytes are readable from `in`
|
|
+static int decode_utf8_error(const char* in) {
|
|
+ uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, 0, 0);
|
|
+ int codepoint;
|
|
+ if (decode(DECODE_2, data, &codepoint) == UTF8_ERR_LEN && codepoint < 0x80)
|
|
+ return codepoint + 0x80;
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+// assumes three bytes are readable from `in`
|
|
+static int decode_utf16_error(const char* in) {
|
|
+ uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, in[2] & 0xFF, 0);
|
|
+ int codepoint;
|
|
+ if (decode(DECODE_3, data, &codepoint) == UTF16_ERR_LEN && codepoint >= 0xD800 && codepoint < 0xDFFF)
|
|
+ return codepoint;
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+// jvp_utf8_extended_join attempts to turn errors at the end of `a` and the
|
|
+// beginning of `b` into a valid code point. if a correction is possible,
|
|
+// `*alen_io`, `*bstart_io` and `*blen_io` are updated to exclude the existing
|
|
+// errors, and the UTF-8 encoding of the code point to insert is stored in
|
|
+// `out`. the number of bytes that should be inserted from `out` into the
|
|
+// middle of the strings is returned (up to 4). this will be 0 if there are no
|
|
+// bytes to insert.
|
|
+int jvp_utf8_extended_join(const char* astart, uint32_t* alen_io, const char** bstart_io, uint32_t* blen_io, char* out) {
|
|
+ const char* aend = astart + *alen_io;
|
|
+ const char* bstart = *bstart_io;
|
|
+ const char* bend = bstart + *blen_io;
|
|
+ int bcp;
|
|
+ bstart = jvp_utf8_extended_next(bstart, bend, JVP_UTF8_ERRORS_ALL, &bcp);
|
|
+ if (!bstart) {
|
|
+ // end of string
|
|
+ return 0;
|
|
+ }
|
|
+ if (bcp >= 0xDC00 && bcp <= 0xDFFF) {
|
|
+ // UTF-16 tail surrogate, look for lead surrogate at the end of `a`
|
|
+ assert(bstart == *bstart_io + UTF16_ERR_LEN);
|
|
+ if (aend - astart < UTF16_ERR_LEN)
|
|
+ return 0;
|
|
+ int acp = decode_utf16_error(aend - UTF16_ERR_LEN);
|
|
+ if (acp >= 0xD800 && acp <= 0xDBFF) {
|
|
+ // UTF-16 lead surrogate, decode matching UTF-16 pair
|
|
+ *alen_io -= UTF16_ERR_LEN;
|
|
+ *blen_io -= UTF16_ERR_LEN;
|
|
+ *bstart_io += UTF16_ERR_LEN;
|
|
+ int codepoint = 0x10000 + (((acp - 0xD800) << 10) | (bcp - 0xDC00));
|
|
+ return jvp_utf8_encode(codepoint, out);
|
|
+ }
|
|
+ return 0;
|
|
+ }
|
|
+ if (bcp >= -0xFF && bcp <= -0x80) {
|
|
+ // UTF-8 error, if it's a continuation byte, search backwards in `a` for the leading byte
|
|
+ bcp = -bcp;
|
|
+ assert(bstart == *bstart_io + UTF8_ERR_LEN);
|
|
+ if (utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE)
|
|
+ return 0;
|
|
+ // if there's a correctable error, we will consume up to 4 encoded error bytes total, with up to 3 bytes from each of `a` and `b`
|
|
+ unsigned char buf[6];
|
|
+ unsigned char* bufstart = buf + 3;
|
|
+ unsigned char* bufend = bufstart;
|
|
+ *bufend++ = bcp;
|
|
+ int length;
|
|
+ // search backwards in `a` for a leading byte
|
|
+ for (;;) {
|
|
+ if (aend - astart < UTF8_ERR_LEN)
|
|
+ return 0; // `a` is too short
|
|
+ int acp = decode_utf8_error(aend - UTF8_ERR_LEN);
|
|
+ if (acp == -1)
|
|
+ return 0; // not a UTF-8 error
|
|
+ aend -= UTF8_ERR_LEN;
|
|
+ length = utf8_coding_length[acp];
|
|
+ if (length == 0)
|
|
+ return 0; // not a possible UTF-8 byte
|
|
+ *--bufstart = acp;
|
|
+ if (length != UTF8_CONTINUATION_BYTE)
|
|
+ break; // found leading byte
|
|
+ if (bufstart == buf)
|
|
+ return 0; // too many continuation bytes
|
|
+ }
|
|
+ if (bufend - bufstart > length)
|
|
+ return 0; // too many continuation bytes
|
|
+ // search forwards in `b` for any more needed continuation bytes
|
|
+ while (bufend - bufstart < length) {
|
|
+ if (bend - bstart < UTF8_ERR_LEN)
|
|
+ return 0; // `b` is too short
|
|
+ bcp = decode_utf8_error(bstart);
|
|
+ if (bcp == -1 || utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE)
|
|
+ return 0; // not a UTF-8 error, didn't find enough continuation bytes
|
|
+ bstart += UTF8_ERR_LEN;
|
|
+ *bufend++ = bcp;
|
|
+ }
|
|
+ int codepoint;
|
|
+ // check that the bytes are strict UTF-8
|
|
+ jvp_utf8_extended_next((char*)bufstart, (char*)bufend, 0, &codepoint);
|
|
+ if (codepoint != -1) {
|
|
+ memcpy(out, bufstart, 4);
|
|
+ *alen_io = aend - astart;
|
|
+ *blen_io = bend - bstart;
|
|
+ *bstart_io = bstart;
|
|
+ return bufend - bufstart;
|
|
+ }
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
int jvp_utf8_is_valid(const char* in, const char* end) {
|
|
int codepoint;
|
|
while ((in = jvp_utf8_extended_next(in, end, 0, &codepoint))) {
|
|
diff --git a/src/jv_unicode.h b/src/jv_unicode.h
|
|
index 37c7fc0..ff2a437 100644
|
|
--- a/src/jv_unicode.h
|
|
+++ b/src/jv_unicode.h
|
|
@@ -1,6 +1,8 @@
|
|
#ifndef JV_UNICODE_H
|
|
#define JV_UNICODE_H
|
|
|
|
+#include <stdint.h>
|
|
+
|
|
enum jvp_utf8_flags {
|
|
/* Emit replacement character instead of -1 for errors */
|
|
JVP_UTF8_REPLACE = 1,
|
|
@@ -14,6 +16,7 @@ enum jvp_utf8_flags {
|
|
const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes);
|
|
const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint);
|
|
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
|
|
+int jvp_utf8_extended_join(const char* astart, uint32_t* alen, const char** bstart, uint32_t* blen, char* out);
|
|
int jvp_utf8_is_valid(const char* in, const char* end);
|
|
|
|
int jvp_utf8_decode_length(char startchar);
|
|
diff --git a/tests/jq.test b/tests/jq.test
|
|
index c882fd2..9e6c896 100644
|
|
--- a/tests/jq.test
|
|
+++ b/tests/jq.test
|
|
@@ -62,6 +62,11 @@ null
|
|
null
|
|
"∀\ud800∃\udc00∅\udfff"
|
|
|
|
+# Check that unpaired surrogates are paired when concatenated
|
|
+add
|
|
+["\ud83d","\ude43","\ud83e","\udd11","\ud83e","\udd17","\ud83e","\udd14","\ud83e","\udd10","\ud83d","\ude44","\ud83e","\udd12","\ud83e","\udd15","\ud83e","\udd13","\ud83e","\udd16","\ud83e","\udd18","\ud83c","\udffb","\ud83c","\udffc"]
|
|
+"🙃🤑🤗🤔🤐🙄🤒🤕🤓🤖🤘🏻🏼"
|
|
+
|
|
"inter\("pol" + "ation")"
|
|
null
|
|
"interpolation"
|
|
@@ -87,6 +92,16 @@ null
|
|
"Zm/Ds2Jhcgo="
|
|
"foóbar\n"
|
|
|
|
+# test correction of UTF-8 errors when concatenating as binary data (input is a random sequence of code points)
|
|
+. as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text
|
|
+"衍𭌞㾱ꕽ㫑𨫆\u001c㊬𗌽𘀍𗟒𩕃勸騎ᕴ𫸬椀𫎾𰣒ᮍ盕嗪𗬜𨑮𭢊氕㊁"
|
|
+true
|
|
+
|
|
+# test preservation of binary data when concatenating (input is a random sequence of UTF-16 surrogates encoded in WTF-8, should be treated as regular UTF-8 errors)
|
|
+@base64d | . as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text
|
|
+"7bKv7aiz7auX7aG37aO77aOe7auy7bmm7bqk7aG87bSH7a6m7bmc7bum7bqj7au+7bqf7aap7buC7byq7aS37aCp7aSl7a+a7bur7aGV7bGl7b6M7biB7aOe7ayR7amW7aOX7b637a+P7bu+7ayP7bOw7ba/7ayp7b6G7aqd7bG37bK57b6O7bq27a+u7a2N7ayu7bKK"
|
|
+true
|
|
+
|
|
@uri
|
|
"\u03bc"
|
|
"%CE%BC"
|