From a91e75dbf16f3f3e038dbd828efedd9916d59bf9 Mon Sep 17 00:00:00 2001 From: Mark McDuff <mcduff@fb.com> Date: Mon, 16 Dec 2013 19:07:42 -0800 Subject: [PATCH] folly::json: allow skipping invalid UTF8 Summary: folly::json::serialize by default doesn't check for valid UTF8, and as a result can generate invalid JSON. There is an option to check for valid UTF8, which throws on an error. This diff introduces a new option, `skip_invalid_utf8`, which replaces invalid chars with U+FFFD. http://en.wikipedia.org/wiki/Specials_(Unicode_block) seems to suggest that this is the correct replacement. @override-unit-failures Test Plan: g-unittest Reviewed By: delong.j@fb.com FB internal diff: D1102923 --- folly/json.cpp | 28 ++++++++++++++++++++++------ folly/json.h | 4 ++++ folly/test/JsonTest.cpp | 17 +++++++++++++++++ 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/folly/json.cpp b/folly/json.cpp index ab45d594..3b954ef4 100644 --- a/folly/json.cpp +++ b/folly/json.cpp @@ -30,7 +30,10 @@ namespace folly { namespace json { namespace { -char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) { +char32_t decodeUtf8( + const unsigned char*& p, + const unsigned char* const e, + bool skipOnError) { /* The following encodings are valid, except for the 5 and 6 byte * combinations: * 0xxxxxxx @@ -41,7 +44,10 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) { * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ + auto skip = [&] { ++p; return U'\ufffd'; }; + if (p >= e) { + if (skipOnError) return skip(); throw std::runtime_error("folly::decodeUtf8 empty/invalid string"); } @@ -62,8 +68,8 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) { uint32_t d = fst; if ((fst & 0xC0) != 0xC0) { - throw std::runtime_error( - to<std::string>("folly::decodeUtf8 i=0 d=", d)); + if (skipOnError) return skip(); + throw std::runtime_error(to<std::string>("folly::decodeUtf8 i=0 d=", d)); } fst <<= 1; @@ -72,6 
+78,7 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) { unsigned char tmp = p[i]; if ((tmp & 0xC0) != 0x80) { + if (skipOnError) return skip(); throw std::runtime_error( to<std::string>("folly::decodeUtf8 i=", i, " tmp=", (uint32_t)tmp)); } @@ -84,6 +91,7 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) { // overlong, could have been encoded with i bytes if ((d & ~bitMask[i - 1]) == 0) { + if (skipOnError) return skip(); throw std::runtime_error( to<std::string>("folly::decodeUtf8 i=", i, " d=", d)); } @@ -91,6 +99,7 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) { // check for surrogates only needed for 3 bytes if (i == 2) { if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) { + if (skipOnError) return skip(); throw std::runtime_error( to<std::string>("folly::decodeUtf8 i=", i, " d=", d)); } @@ -101,6 +110,7 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) { } } + if (skipOnError) return skip(); throw std::runtime_error("folly::decodeUtf8 encoding length maxed out"); } @@ -642,7 +652,8 @@ void escapeString(StringPiece input, while (p < e) { // Since non-ascii encoding inherently does utf8 validation // we explicitly validate utf8 only if non-ascii encoding is disabled. 
- if (opts.validate_utf8 && !opts.encode_non_ascii) { + if ((opts.validate_utf8 || opts.skip_invalid_utf8) + && !opts.encode_non_ascii) { // to achieve better spatial and temporal coherence // we do utf8 validation progressively along with the // string-escaping instead of two separate passes @@ -654,13 +665,18 @@ void escapeString(StringPiece input, if (q == p) { // calling utf8_decode has the side effect of // checking that utf8 encodings are valid - decodeUtf8(q, e); + char32_t v = decodeUtf8(q, e, opts.skip_invalid_utf8); + if (opts.skip_invalid_utf8 && v == U'\ufffd') { + out.append("\ufffd"); + p = q; + continue; + } } } if (opts.encode_non_ascii && (*p & 0x80)) { // note that this if condition captures utf8 chars // with value > 127, so size > 1 byte - char32_t v = decodeUtf8(p, e); + char32_t v = decodeUtf8(p, e, opts.skip_invalid_utf8); out.append("\\u"); out.push_back(hexDigit(v >> 12)); out.push_back(hexDigit((v >> 8) & 0x0f)); diff --git a/folly/json.h b/folly/json.h index 149ca8ff..694cc4ea 100644 --- a/folly/json.h +++ b/folly/json.h @@ -60,6 +60,7 @@ namespace json { , validate_utf8(false) , allow_trailing_comma(false) , sort_keys(false) + , skip_invalid_utf8(false) {} // If true, keys in an object can be non-strings. 
(In strict @@ -89,6 +90,9 @@ namespace json { // Sort keys of all objects before printing out (potentially slow) bool sort_keys; + + // Replace invalid utf8 characters with U+FFFD and continue + bool skip_invalid_utf8; }; /* diff --git a/folly/test/JsonTest.cpp b/folly/test/JsonTest.cpp index 6dfdcb48..6085c858 100644 --- a/folly/test/JsonTest.cpp +++ b/folly/test/JsonTest.cpp @@ -302,6 +302,23 @@ TEST(Json, UTF8Validation) { // test validate_utf8 with invalid utf8 EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts)); EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts)); + + opts.skip_invalid_utf8 = true; + EXPECT_EQ(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts), + "\"a\xe0\xa0\x80z\ufffd\ufffd\""); + EXPECT_EQ(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80\x80", opts), + "\"a\xe0\xa0\x80z\ufffd\ufffd\ufffd\""); + EXPECT_EQ(folly::json::serialize("z\xc0\x80z\xe0\xa0\x80", opts), + "\"z\ufffd\ufffdz\xe0\xa0\x80\""); + + opts.encode_non_ascii = true; + EXPECT_EQ(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts), + "\"a\\u0800z\\ufffd\\ufffd\""); + EXPECT_EQ(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80\x80", opts), + "\"a\\u0800z\\ufffd\\ufffd\\ufffd\""); + EXPECT_EQ(folly::json::serialize("z\xc0\x80z\xe0\xa0\x80", opts), + "\"z\\ufffd\\ufffdz\\u0800\""); + } -- 2.34.1