211 lines
7.0 KiB
Diff
211 lines
7.0 KiB
Diff
From a6ccbaad05bea30c5700b10bd51e46d390496a9b Mon Sep 17 00:00:00 2001
|
|
From: Max Zerzouri <maxdamantus@gmail.com>
|
|
Date: Sun, 16 May 2021 09:18:51 +0000
|
|
Subject: [PATCH] Update `@base64`, `utf8bytelength` and `fromjson` to handle
|
|
binary strings
|
|
|
|
---
|
|
docs/content/3.manual/manual.yml | 1 -
|
|
src/builtin.c | 107 ++++++++++++++++++++++++++-----
|
|
tests/base64.test | 10 +++
|
|
tests/shtest | 19 ++++--
|
|
4 files changed, 116 insertions(+), 21 deletions(-)
|
|
|
|
diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml
|
|
index bfb17f4..1258dbf 100644
|
|
--- a/docs/content/3.manual/manual.yml
|
|
+++ b/docs/content/3.manual/manual.yml
|
|
@@ -1843,7 +1843,6 @@ sections:
|
|
* `@base64d`:
|
|
|
|
The inverse of `@base64`, input is decoded as specified by RFC 4648.
|
|
- Note\: If the decoded string is not UTF-8, the results are undefined.
|
|
|
|
This syntax can be combined with string interpolation in a
|
|
useful way. You can follow a `@foo` token with a string
|
|
diff --git a/src/builtin.c b/src/builtin.c
|
|
index c6c8c2e..975bf49 100644
|
|
--- a/src/builtin.c
|
|
+++ b/src/builtin.c
|
|
@@ -409,10 +409,55 @@ static jv f_dump(jq_state *jq, jv input) {
|
|
static jv f_json_parse(jq_state *jq, jv input) {
|
|
if (jv_get_kind(input) != JV_KIND_STRING)
|
|
return type_error(input, "only strings can be parsed");
|
|
- jv res = jv_parse_sized(jv_string_value(input),
|
|
- jv_string_length_bytes(jv_copy(input)));
|
|
+
|
|
+ const char* i = jv_string_value(input);
|
|
+ const char* end = i + jv_string_length_bytes(jv_copy(input));
|
|
+
|
|
+ struct jv_parser* parser = jv_parser_new(0);
|
|
+ int count = 0;
|
|
+ jv value = jv_invalid();
|
|
+ while (i != NULL) {
|
|
+ const int max_utf8_len = 4;
|
|
+ unsigned char buf[100 + max_utf8_len];
|
|
+ int buflen = 0;
|
|
+ int c;
|
|
+ while ((buflen + max_utf8_len < sizeof(buf)) && (i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
|
|
+ if (c >= -0xFF && c <= -0x80) {
|
|
+ // Invalid UTF-8 byte, pass through
|
|
+ buf[buflen++] = -c;
|
|
+ } else
|
|
+ buflen += jvp_utf8_encode(c, buf + buflen);
|
|
+ }
|
|
+ jv_parser_set_buf(parser, buf, buflen, i != NULL);
|
|
+ for (;;) {
|
|
+ jv next = jv_parser_next(parser);
|
|
+ if (!jv_is_valid(next)) {
|
|
+ if (jv_invalid_has_msg(jv_copy(next))) {
|
|
+ count++;
|
|
+ jv_free(value);
|
|
+ value = next;
|
|
+ i = NULL;
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ jv_free(value);
|
|
+ if (count++ == 0)
|
|
+ value = next;
|
|
+ else {
|
|
+ jv_free(next);
|
|
+ value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
|
|
+ i = NULL;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ jv_parser_free(parser);
|
|
jv_free(input);
|
|
- return res;
|
|
+ if (count == 0) {
|
|
+ jv_free(value);
|
|
+ value = jv_invalid_with_msg(jv_string("Expected JSON value"));
|
|
+ }
|
|
+ return value;
|
|
}
|
|
|
|
static jv f_tonumber(jq_state *jq, jv input) {
|
|
@@ -457,7 +502,19 @@ static jv f_tostring(jq_state *jq, jv input) {
|
|
static jv f_utf8bytelength(jq_state *jq, jv input) {
|
|
if (jv_get_kind(input) != JV_KIND_STRING)
|
|
return type_error(input, "only strings have UTF-8 byte length");
|
|
- return jv_number(jv_string_length_bytes(input));
|
|
+ const char* i = jv_string_value(input);
|
|
+ const char* end = i + jv_string_length_bytes(jv_copy(input));
|
|
+ int len = 0;
|
|
+ int c;
|
|
+ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
|
|
+ if (c >= -0xFF && c <= -0x80) {
|
|
+ // Invalid UTF-8 byte, will be passed through
|
|
+ len++;
|
|
+ } else
|
|
+ len += jvp_utf8_encode_length(c);
|
|
+ }
|
|
+ jv_free(input);
|
|
+ return jv_number(len);
|
|
}
|
|
|
|
#define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
|
|
@@ -632,21 +689,41 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
|
|
jv_free(fmt);
|
|
input = f_tostring(jq, input);
|
|
jv line = jv_string("");
|
|
- const unsigned char* data = (const unsigned char*)jv_string_value(input);
|
|
- int len = jv_string_length_bytes(jv_copy(input));
|
|
- for (int i=0; i<len; i+=3) {
|
|
- uint32_t code = 0;
|
|
- int n = len - i >= 3 ? 3 : len-i;
|
|
- for (int j=0; j<3; j++) {
|
|
+ const char* i = jv_string_value(input);
|
|
+ const char* end = i + jv_string_length_bytes(jv_copy(input));
|
|
+ uint32_t code = 0;
|
|
+ int n = 0;
|
|
+ int c;
|
|
+ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
|
|
+ unsigned char ubuf[4];
|
|
+ int len = 0;
|
|
+ if (c >= -0xFF && c <= -0x80) {
|
|
+ // Invalid UTF-8 byte, pass through
|
|
+ ubuf[len++] = -c;
|
|
+ } else
|
|
+ len += jvp_utf8_encode(c, ubuf);
|
|
+ for (int x = 0; x < len; x++) {
|
|
code <<= 8;
|
|
- code |= j < n ? (unsigned)data[i+j] : 0;
|
|
+ code |= ubuf[x];
|
|
+ if (++n == 3) {
|
|
+ char buf[4];
|
|
+ for (int j = 0; j < 4; j++)
|
|
+ buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
|
|
+ line = jv_string_append_buf(line, buf, sizeof(buf));
|
|
+ n = 0;
|
|
+ code = 0;
|
|
+ }
|
|
}
|
|
+ }
|
|
+ if (n > 0) {
|
|
+ assert(n < 3);
|
|
+ code <<= 8*(3 - n);
|
|
char buf[4];
|
|
- for (int j=0; j<4; j++) {
|
|
+ for (int j = 0; j < 4; j++)
|
|
buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
|
|
- }
|
|
- if (n < 3) buf[3] = '=';
|
|
- if (n < 2) buf[2] = '=';
|
|
+ buf[3] = '=';
|
|
+ if (n < 2)
|
|
+ buf[2] = '=';
|
|
line = jv_string_append_buf(line, buf, sizeof(buf));
|
|
}
|
|
jv_free(input);
|
|
diff --git a/tests/base64.test b/tests/base64.test
|
|
index 0f82b0b..6507bb8 100644
|
|
--- a/tests/base64.test
|
|
+++ b/tests/base64.test
|
|
@@ -33,3 +33,13 @@
|
|
. | try @base64d catch .
|
|
"QUJDa"
|
|
"string (\"QUJDa\") trailing base64 byte found"
|
|
+
|
|
+# random binary data survives a @base64d | @base64 round-trip
|
|
+(. | @base64d | @base64) == .
|
|
+"zns0Su1i4JjDfGiR95WOcU8iiPMOrfJTUBm9P1ot2qIMiyk04b0WSIFNTMD7w9ziMV8nSbwpPqNl3JKF1eWZrRRg24rbvh66O1e7Z1xIGPNqTqm+jdzRCkWSryR+67wXRVgD6Q=="
|
|
+true
|
|
+
|
|
+# lone surrogates are replaced with U+FFFD before base64 encoding
|
|
+@base64
|
|
+"foo\udca9\ud83dbar"
|
|
+"Zm9v77+977+9YmFy"
|
|
diff --git a/tests/shtest b/tests/shtest
|
|
index 4c8b57e..7de61e4 100755
|
|
--- a/tests/shtest
|
|
+++ b/tests/shtest
|
|
@@ -131,11 +131,20 @@ cmp $d/out $d/expected
|
|
|
|
|
|
clean=false
|
|
-# Invalid UTF-8 bytes are preserved when encoding/decoding JSON
|
|
-dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null
|
|
-$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
|
|
-$VALGRIND $Q $JQ -j . $d/out.json >$d/out
|
|
-cmp $d/out $d/rand
|
|
+# Invalid UTF-8 bytes are preserved when encoding/decoding JSON and base64, and when concatenating binary strings
|
|
+if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then
|
|
+ $VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
|
|
+ $VALGRIND $Q $JQ -j . $d/out.json >$d/out
|
|
+ cmp $d/out $d/rand
|
|
+ $VALGRIND $Q $JQ -jR fromjson $d/out.json >$d/out
|
|
+ cmp $d/out $d/rand
|
|
+ $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out
|
|
+ cmp $d/out $d/rand
|
|
+ base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out
|
|
+ cmp $d/out $d/rand
|
|
+ $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out
|
|
+ cmp $d/out $d/rand
|
|
+fi
|
|
clean=true
|
|
|
|
|