jq/Update-base64-utf8bytelength-and-fromjson-to-handlebinary-strings.patch

From a6ccbaad05bea30c5700b10bd51e46d390496a9b Mon Sep 17 00:00:00 2001
From: Max Zerzouri <maxdamantus@gmail.com>
Date: Sun, 16 May 2021 09:18:51 +0000
Subject: [PATCH] Update `@base64`, `utf8bytelength` and `fromjson` to handle
 binary strings

---
 docs/content/3.manual/manual.yml |   1 -
 src/builtin.c                    | 107 ++++++++++++++++++++++++++-----
 tests/base64.test                |  10 +++
 tests/shtest                     |  19 ++++--
 4 files changed, 116 insertions(+), 21 deletions(-)

diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml
index bfb17f4..1258dbf 100644
--- a/docs/content/3.manual/manual.yml
+++ b/docs/content/3.manual/manual.yml
@@ -1843,7 +1843,6 @@ sections:
           * `@base64d`:
 
             The inverse of `@base64`, input is decoded as specified by RFC 4648.
-            Note\: If the decoded string is not UTF-8, the results are undefined.
 
           This syntax can be combined with string interpolation in a
           useful way. You can follow a `@foo` token with a string
diff --git a/src/builtin.c b/src/builtin.c
index c6c8c2e..975bf49 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -409,10 +409,55 @@ static jv f_dump(jq_state *jq, jv input) {
 static jv f_json_parse(jq_state *jq, jv input) {
   if (jv_get_kind(input) != JV_KIND_STRING)
     return type_error(input, "only strings can be parsed");
-  jv res = jv_parse_sized(jv_string_value(input),
-                          jv_string_length_bytes(jv_copy(input)));
+
+  const char* i = jv_string_value(input);
+  const char* end = i + jv_string_length_bytes(jv_copy(input));
+
+  struct jv_parser* parser = jv_parser_new(0);
+  int count = 0;
+  jv value = jv_invalid();
+  while (i != NULL) {
+    const int max_utf8_len = 4;
+    unsigned char buf[100 + max_utf8_len];
+    int buflen = 0;
+    int c;
+    while ((buflen + max_utf8_len < sizeof(buf)) && (i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
+      if (c >= -0xFF && c <= -0x80) {
+        // Invalid UTF-8 byte, pass through
+        buf[buflen++] = -c;
+      } else
+        buflen += jvp_utf8_encode(c, buf + buflen);
+    }
+    jv_parser_set_buf(parser, buf, buflen, i != NULL);
+    for (;;) {
+      jv next = jv_parser_next(parser);
+      if (!jv_is_valid(next)) {
+        if (jv_invalid_has_msg(jv_copy(next))) {
+          count++;
+          jv_free(value);
+          value = next;
+          i = NULL;
+        }
+        break;
+      }
+      jv_free(value);
+      if (count++ == 0)
+        value = next;
+      else {
+        jv_free(next);
+        value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
+        i = NULL;
+        break;
+      }
+    }
+  }
+  jv_parser_free(parser);
   jv_free(input);
-  return res;
+  if (count == 0) {
+    jv_free(value);
+    value = jv_invalid_with_msg(jv_string("Expected JSON value"));
+  }
+  return value;
 }
 
 static jv f_tonumber(jq_state *jq, jv input) {
@@ -457,7 +502,19 @@ static jv f_tostring(jq_state *jq, jv input) {
 static jv f_utf8bytelength(jq_state *jq, jv input) {
   if (jv_get_kind(input) != JV_KIND_STRING)
     return type_error(input, "only strings have UTF-8 byte length");
-  return jv_number(jv_string_length_bytes(input));
+  const char* i = jv_string_value(input);
+  const char* end = i + jv_string_length_bytes(jv_copy(input));
+  int len = 0;
+  int c;
+  while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
+    if (c >= -0xFF && c <= -0x80) {
+      // Invalid UTF-8 byte, will be passed through
+      len++;
+    } else
+      len += jvp_utf8_encode_length(c);
+  }
+  jv_free(input);
+  return jv_number(len);
 }
 
 #define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
@@ -632,21 +689,41 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
     jv_free(fmt);
     input = f_tostring(jq, input);
     jv line = jv_string("");
-    const unsigned char* data = (const unsigned char*)jv_string_value(input);
-    int len = jv_string_length_bytes(jv_copy(input));
-    for (int i=0; i<len; i+=3) {
-      uint32_t code = 0;
-      int n = len - i >= 3 ? 3 : len-i;
-      for (int j=0; j<3; j++) {
+    const char* i = jv_string_value(input);
+    const char* end = i + jv_string_length_bytes(jv_copy(input));
+    uint32_t code = 0;
+    int n = 0;
+    int c;
+    while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
+      unsigned char ubuf[4];
+      int len = 0;
+      if (c >= -0xFF && c <= -0x80) {
+        // Invalid UTF-8 byte, pass through
+        ubuf[len++] = -c;
+      } else
+        len += jvp_utf8_encode(c, ubuf);
+      for (int x = 0; x < len; x++) {
         code <<= 8;
-        code |= j < n ? (unsigned)data[i+j] : 0;
+        code |= ubuf[x];
+        if (++n == 3) {
+          char buf[4];
+          for (int j = 0; j < 4; j++)
+            buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
+          line = jv_string_append_buf(line, buf, sizeof(buf));
+          n = 0;
+          code = 0;
+        }
       }
+    }
+    if (n > 0) {
+      assert(n < 3);
+      code <<= 8*(3 - n);
       char buf[4];
-      for (int j=0; j<4; j++) {
+      for (int j = 0; j < 4; j++)
         buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
-      }
-      if (n < 3) buf[3] = '=';
-      if (n < 2) buf[2] = '=';
+      buf[3] = '=';
+      if (n < 2)
+        buf[2] = '=';
       line = jv_string_append_buf(line, buf, sizeof(buf));
     }
     jv_free(input);
diff --git a/tests/base64.test b/tests/base64.test
index 0f82b0b..6507bb8 100644
--- a/tests/base64.test
+++ b/tests/base64.test
@@ -33,3 +33,13 @@
 . | try @base64d catch .
 "QUJDa"
 "string (\"QUJDa\") trailing base64 byte found"
+
+# random binary data
+(. | @base64d | @base64) == .
+"zns0Su1i4JjDfGiR95WOcU8iiPMOrfJTUBm9P1ot2qIMiyk04b0WSIFNTMD7w9ziMV8nSbwpPqNl3JKF1eWZrRRg24rbvh66O1e7Z1xIGPNqTqm+jdzRCkWSryR+67wXRVgD6Q=="
+true
+
+# replace lone surrogates
+@base64
+"foo\udca9\ud83dbar"
+"Zm9v77+977+9YmFy"
diff --git a/tests/shtest b/tests/shtest
index 4c8b57e..7de61e4 100755
--- a/tests/shtest
+++ b/tests/shtest
@@ -131,11 +131,20 @@ cmp $d/out $d/expected
 
 
 clean=false
-# Invalid UTF-8 bytes are preserved when encoding/decoding JSON
-dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null
-$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
-$VALGRIND $Q $JQ -j . $d/out.json >$d/out
-cmp $d/out $d/rand
+# Invalid UTF-8 bytes are preserved when encoding/decoding JSON and base64 and concatenating binary strings
+if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then
+    $VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
+    $VALGRIND $Q $JQ -j . $d/out.json >$d/out
+    cmp $d/out $d/rand
+    $VALGRIND $Q $JQ -jR fromjson $d/out.json >$d/out
+    cmp $d/out $d/rand
+    $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out
+    cmp $d/out $d/rand
+    base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out
+    cmp $d/out $d/rand
+    $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out
+    cmp $d/out $d/rand
+fi
 clean=true
Support binary strings preserve UTF-8 and UTF-16 errors 2021-08-30 16:05:10 +08:00

			`From a6ccbaad05bea30c5700b10bd51e46d390496a9b Mon Sep 17 00:00:00 2001`
			`From: Max Zerzouri <maxdamantus@gmail.com>`
			`Date: Sun, 16 May 2021 09:18:51 +0000`
			``Subject: [PATCH] Update `@base64`, `utf8bytelength` and `fromjson` to handle``
			`binary strings`

			`---`
			`docs/content/3.manual/manual.yml \| 1 -`
			`src/builtin.c \| 107 ++++++++++++++++++++++++++-----`
			`tests/base64.test \| 10 +++`
			`tests/shtest \| 19 ++++--`
			`4 files changed, 116 insertions(+), 21 deletions(-)`

			`diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml`
			`index bfb17f4..1258dbf 100644`
			`--- a/docs/content/3.manual/manual.yml`
			`+++ b/docs/content/3.manual/manual.yml`
			`@@ -1843,7 +1843,6 @@ sections:`
			``* `@base64d`:``

			``The inverse of `@base64`, input is decoded as specified by RFC 4648.``
			`- Note\: If the decoded string is not UTF-8, the results are undefined.`

			`This syntax can be combined with string interpolation in a`
			``useful way. You can follow a `@foo` token with a string``
			`diff --git a/src/builtin.c b/src/builtin.c`
			`index c6c8c2e..975bf49 100644`
			`--- a/src/builtin.c`
			`+++ b/src/builtin.c`
			`@@ -409,10 +409,55 @@ static jv f_dump(jq_state *jq, jv input) {`
			`static jv f_json_parse(jq_state *jq, jv input) {`
			`if (jv_get_kind(input) != JV_KIND_STRING)`
			`return type_error(input, "only strings can be parsed");`
			`- jv res = jv_parse_sized(jv_string_value(input),`
			`- jv_string_length_bytes(jv_copy(input)));`
			`+`
			`+ const char* i = jv_string_value(input);`
			`+ const char* end = i + jv_string_length_bytes(jv_copy(input));`
			`+`
			`+ struct jv_parser* parser = jv_parser_new(0);`
			`+ int count = 0;`
			`+ jv value = jv_invalid();`
			`+ while (i != NULL) {`
			`+ const int max_utf8_len = 4;`
			`+ unsigned char buf[100 + max_utf8_len];`
			`+ int buflen = 0;`
			`+ int c;`
			`+ while ((buflen + max_utf8_len < sizeof(buf)) && (i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE \| JVP_UTF8_ERRORS_UTF8, &c))) {`
			`+ if (c >= -0xFF && c <= -0x80) {`
			`+ // Invalid UTF-8 byte, pass through`
			`+ buf[buflen++] = -c;`
			`+ } else`
			`+ buflen += jvp_utf8_encode(c, buf + buflen);`
			`+ }`
			`+ jv_parser_set_buf(parser, buf, buflen, i != NULL);`
			`+ for (;;) {`
			`+ jv next = jv_parser_next(parser);`
			`+ if (!jv_is_valid(next)) {`
			`+ if (jv_invalid_has_msg(jv_copy(next))) {`
			`+ count++;`
			`+ jv_free(value);`
			`+ value = next;`
			`+ i = NULL;`
			`+ }`
			`+ break;`
			`+ }`
			`+ jv_free(value);`
			`+ if (count++ == 0)`
			`+ value = next;`
			`+ else {`
			`+ jv_free(next);`
			`+ value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));`
			`+ i = NULL;`
			`+ break;`
			`+ }`
			`+ }`
			`+ }`
			`+ jv_parser_free(parser);`
			`jv_free(input);`
			`- return res;`
			`+ if (count == 0) {`
			`+ jv_free(value);`
			`+ value = jv_invalid_with_msg(jv_string("Expected JSON value"));`
			`+ }`
			`+ return value;`
			`}`

			`static jv f_tonumber(jq_state *jq, jv input) {`
			`@@ -457,7 +502,19 @@ static jv f_tostring(jq_state *jq, jv input) {`
			`static jv f_utf8bytelength(jq_state *jq, jv input) {`
			`if (jv_get_kind(input) != JV_KIND_STRING)`
			`return type_error(input, "only strings have UTF-8 byte length");`
			`- return jv_number(jv_string_length_bytes(input));`
			`+ const char* i = jv_string_value(input);`
			`+ const char* end = i + jv_string_length_bytes(jv_copy(input));`
			`+ int len = 0;`
			`+ int c;`
			`+ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE \| JVP_UTF8_ERRORS_UTF8, &c))) {`
			`+ if (c >= -0xFF && c <= -0x80) {`
			`+ // Invalid UTF-8 byte, will be passed through`
			`+ len++;`
			`+ } else`
			`+ len += jvp_utf8_encode_length(c);`
			`+ }`
			`+ jv_free(input);`
			`+ return jv_number(len);`
			`}`

			`#define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"`
			`@@ -632,21 +689,41 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {`
			`jv_free(fmt);`
			`input = f_tostring(jq, input);`
			`jv line = jv_string("");`
			`- const unsigned char* data = (const unsigned char*)jv_string_value(input);`
			`- int len = jv_string_length_bytes(jv_copy(input));`
			`- for (int i=0; i<len; i+=3) {`
			`- uint32_t code = 0;`
			`- int n = len - i >= 3 ? 3 : len-i;`
			`- for (int j=0; j<3; j++) {`
			`+ const char* i = jv_string_value(input);`
			`+ const char* end = i + jv_string_length_bytes(jv_copy(input));`
			`+ uint32_t code = 0;`
			`+ int n = 0;`
			`+ int c;`
			`+ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE \| JVP_UTF8_ERRORS_UTF8, &c))) {`
			`+ unsigned char ubuf[4];`
			`+ int len = 0;`
			`+ if (c >= -0xFF && c <= -0x80) {`
			`+ // Invalid UTF-8 byte, pass through`
			`+ ubuf[len++] = -c;`
			`+ } else`
			`+ len += jvp_utf8_encode(c, ubuf);`
			`+ for (int x = 0; x < len; x++) {`
			`code <<= 8;`
			`- code \|= j < n ? (unsigned)data[i+j] : 0;`
			`+ code \|= ubuf[x];`
			`+ if (++n == 3) {`
			`+ char buf[4];`
			`+ for (int j = 0; j < 4; j++)`
			`+ buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];`
			`+ line = jv_string_append_buf(line, buf, sizeof(buf));`
			`+ n = 0;`
			`+ code = 0;`
			`+ }`
			`}`
			`+ }`
			`+ if (n > 0) {`
			`+ assert(n < 3);`
			`+ code <<= 8*(3 - n);`
			`char buf[4];`
			`- for (int j=0; j<4; j++) {`
			`+ for (int j = 0; j < 4; j++)`
			`buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];`
			`- }`
			`- if (n < 3) buf[3] = '=';`
			`- if (n < 2) buf[2] = '=';`
			`+ buf[3] = '=';`
			`+ if (n < 2)`
			`+ buf[2] = '=';`
			`line = jv_string_append_buf(line, buf, sizeof(buf));`
			`}`
			`jv_free(input);`
			`diff --git a/tests/base64.test b/tests/base64.test`
			`index 0f82b0b..6507bb8 100644`
			`--- a/tests/base64.test`
			`+++ b/tests/base64.test`
			`@@ -33,3 +33,13 @@`
			`. \| try @base64d catch .`
			`"QUJDa"`
			`"string (\"QUJDa\") trailing base64 byte found"`
			`+`
			`+# random binary data`
			`+(. \| @base64d \| @base64) == .`
			`+"zns0Su1i4JjDfGiR95WOcU8iiPMOrfJTUBm9P1ot2qIMiyk04b0WSIFNTMD7w9ziMV8nSbwpPqNl3JKF1eWZrRRg24rbvh66O1e7Z1xIGPNqTqm+jdzRCkWSryR+67wXRVgD6Q=="`
			`+true`
			`+`
			`+# replace lone surrogates`
			`+@base64`
			`+"foo\udca9\ud83dbar"`
			`+"Zm9v77+977+9YmFy"`
			`diff --git a/tests/shtest b/tests/shtest`
			`index 4c8b57e..7de61e4 100755`
			`--- a/tests/shtest`
			`+++ b/tests/shtest`
			`@@ -131,11 +131,20 @@ cmp $d/out $d/expected`


			`clean=false`
			`-# Invalid UTF-8 bytes are preserved when encoding/decoding JSON`
			`-dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null`
			`-$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json`
			`-$VALGRIND $Q $JQ -j . $d/out.json >$d/out`
			`-cmp $d/out $d/rand`
			`+# Invalid UTF-8 bytes are preserved when encoding/decoding JSON and base64 and concatenating binary strings`
			`+if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then`
			`+ $VALGRIND $Q $JQ -sR . $d/rand >$d/out.json`
			`+ $VALGRIND $Q $JQ -j . $d/out.json >$d/out`
			`+ cmp $d/out $d/rand`
			`+ $VALGRIND $Q $JQ -jR fromjson $d/out.json >$d/out`
			`+ cmp $d/out $d/rand`
			`+ $VALGRIND $Q $JQ -j '@base64 \| @base64d' $d/out.json >$d/out`
			`+ cmp $d/out $d/rand`
			`+ base64 $d/rand \| $VALGRIND $Q $JQ -R '@base64d' \| $VALGRIND $Q $JQ -sj 'add' >$d/out`
			`+ cmp $d/out $d/rand`
			`+ $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out`
			`+ cmp $d/out $d/rand`
			`+fi`
			`clean=true`