jq/Update-base64-utf8bytelength-and-fromjson-to-handlebinary-strings.patch

211 lines
7.0 KiB
Diff
Raw Permalink Normal View History

From a6ccbaad05bea30c5700b10bd51e46d390496a9b Mon Sep 17 00:00:00 2001
From: Max Zerzouri <maxdamantus@gmail.com>
Date: Sun, 16 May 2021 09:18:51 +0000
Subject: [PATCH] Update `@base64`, `utf8bytelength` and `fromjson` to handle
binary strings
---
docs/content/3.manual/manual.yml | 1 -
src/builtin.c | 107 ++++++++++++++++++++++++++-----
tests/base64.test | 10 +++
tests/shtest | 19 ++++--
4 files changed, 116 insertions(+), 21 deletions(-)
diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml
index bfb17f4..1258dbf 100644
--- a/docs/content/3.manual/manual.yml
+++ b/docs/content/3.manual/manual.yml
@@ -1843,7 +1843,6 @@ sections:
* `@base64d`:
The inverse of `@base64`, input is decoded as specified by RFC 4648.
- Note\: If the decoded string is not UTF-8, the results are undefined.
This syntax can be combined with string interpolation in a
useful way. You can follow a `@foo` token with a string
diff --git a/src/builtin.c b/src/builtin.c
index c6c8c2e..975bf49 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -409,10 +409,55 @@ static jv f_dump(jq_state *jq, jv input) {
static jv f_json_parse(jq_state *jq, jv input) {
if (jv_get_kind(input) != JV_KIND_STRING)
return type_error(input, "only strings can be parsed");
- jv res = jv_parse_sized(jv_string_value(input),
- jv_string_length_bytes(jv_copy(input)));
+
+ const char* i = jv_string_value(input);
+ const char* end = i + jv_string_length_bytes(jv_copy(input));
+
+ struct jv_parser* parser = jv_parser_new(0);
+ int count = 0;
+ jv value = jv_invalid();
+ while (i != NULL) {
+ const int max_utf8_len = 4;
+ unsigned char buf[100 + max_utf8_len];
+ int buflen = 0;
+ int c;
+ while ((buflen + max_utf8_len < sizeof(buf)) && (i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
+ if (c >= -0xFF && c <= -0x80) {
+ // Invalid UTF-8 byte, pass through
+ buf[buflen++] = -c;
+ } else
+ buflen += jvp_utf8_encode(c, buf + buflen);
+ }
+ jv_parser_set_buf(parser, buf, buflen, i != NULL);
+ for (;;) {
+ jv next = jv_parser_next(parser);
+ if (!jv_is_valid(next)) {
+ if (jv_invalid_has_msg(jv_copy(next))) {
+ count++;
+ jv_free(value);
+ value = next;
+ i = NULL;
+ }
+ break;
+ }
+ jv_free(value);
+ if (count++ == 0)
+ value = next;
+ else {
+ jv_free(next);
+ value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
+ i = NULL;
+ break;
+ }
+ }
+ }
+ jv_parser_free(parser);
jv_free(input);
- return res;
+ if (count == 0) {
+ jv_free(value);
+ value = jv_invalid_with_msg(jv_string("Expected JSON value"));
+ }
+ return value;
}
static jv f_tonumber(jq_state *jq, jv input) {
@@ -457,7 +502,19 @@ static jv f_tostring(jq_state *jq, jv input) {
static jv f_utf8bytelength(jq_state *jq, jv input) {
if (jv_get_kind(input) != JV_KIND_STRING)
return type_error(input, "only strings have UTF-8 byte length");
- return jv_number(jv_string_length_bytes(input));
+ const char* i = jv_string_value(input);
+ const char* end = i + jv_string_length_bytes(jv_copy(input));
+ int len = 0;
+ int c;
+ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
+ if (c >= -0xFF && c <= -0x80) {
+ // Invalid UTF-8 byte, will be passed through
+ len++;
+ } else
+ len += jvp_utf8_encode_length(c);
+ }
+ jv_free(input);
+ return jv_number(len);
}
#define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
@@ -632,21 +689,41 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
jv_free(fmt);
input = f_tostring(jq, input);
jv line = jv_string("");
- const unsigned char* data = (const unsigned char*)jv_string_value(input);
- int len = jv_string_length_bytes(jv_copy(input));
- for (int i=0; i<len; i+=3) {
- uint32_t code = 0;
- int n = len - i >= 3 ? 3 : len-i;
- for (int j=0; j<3; j++) {
+ const char* i = jv_string_value(input);
+ const char* end = i + jv_string_length_bytes(jv_copy(input));
+ uint32_t code = 0;
+ int n = 0;
+ int c;
+ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
+ unsigned char ubuf[4];
+ int len = 0;
+ if (c >= -0xFF && c <= -0x80) {
+ // Invalid UTF-8 byte, pass through
+ ubuf[len++] = -c;
+ } else
+ len += jvp_utf8_encode(c, ubuf);
+ for (int x = 0; x < len; x++) {
code <<= 8;
- code |= j < n ? (unsigned)data[i+j] : 0;
+ code |= ubuf[x];
+ if (++n == 3) {
+ char buf[4];
+ for (int j = 0; j < 4; j++)
+ buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
+ line = jv_string_append_buf(line, buf, sizeof(buf));
+ n = 0;
+ code = 0;
+ }
}
+ }
+ if (n > 0) {
+ assert(n < 3);
+ code <<= 8*(3 - n);
char buf[4];
- for (int j=0; j<4; j++) {
+ for (int j = 0; j < 4; j++)
buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
- }
- if (n < 3) buf[3] = '=';
- if (n < 2) buf[2] = '=';
+ buf[3] = '=';
+ if (n < 2)
+ buf[2] = '=';
line = jv_string_append_buf(line, buf, sizeof(buf));
}
jv_free(input);
diff --git a/tests/base64.test b/tests/base64.test
index 0f82b0b..6507bb8 100644
--- a/tests/base64.test
+++ b/tests/base64.test
@@ -33,3 +33,13 @@
. | try @base64d catch .
"QUJDa"
"string (\"QUJDa\") trailing base64 byte found"
+
+# random binary data
+(. | @base64d | @base64) == .
+"zns0Su1i4JjDfGiR95WOcU8iiPMOrfJTUBm9P1ot2qIMiyk04b0WSIFNTMD7w9ziMV8nSbwpPqNl3JKF1eWZrRRg24rbvh66O1e7Z1xIGPNqTqm+jdzRCkWSryR+67wXRVgD6Q=="
+true
+
+# replace lone surrogates
+@base64
+"foo\udca9\ud83dbar"
+"Zm9v77+977+9YmFy"
diff --git a/tests/shtest b/tests/shtest
index 4c8b57e..7de61e4 100755
--- a/tests/shtest
+++ b/tests/shtest
@@ -131,11 +131,20 @@ cmp $d/out $d/expected
clean=false
-# Invalid UTF-8 bytes are preserved when encoding/decoding JSON
-dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null
-$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
-$VALGRIND $Q $JQ -j . $d/out.json >$d/out
-cmp $d/out $d/rand
+# Invalid UTF-8 bytes are preserved when encoding/decoding JSON and base64 and concatenating binary strings
+if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then
+ $VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
+ $VALGRIND $Q $JQ -j . $d/out.json >$d/out
+ cmp $d/out $d/rand
+ $VALGRIND $Q $JQ -jR fromjson $d/out.json >$d/out
+ cmp $d/out $d/rand
+ $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out
+ cmp $d/out $d/rand
+ base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out
+ cmp $d/out $d/rand
+ $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out
+ cmp $d/out $d/rand
+fi
clean=true