From: Hari Manikarnika Date: Fri, 26 Oct 2012 23:05:01 +0000 (-0700) Subject: make folly::toJson retain non-ascii chars if encode_non_ascii is disabled X-Git-Tag: v0.22.0~1162 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=e18efdc462d800421fc1e8af0ed37a4b438dcaf9;p=folly.git make folly::toJson retain non-ascii chars if encode_non_ascii is disabled Summary: folly::toJson as demonstrated by the test cases was wrongly encoding utf8 strings. specifically, a utf8 char made up of x bytes was encodeded into x unicode chars. for example, the char: \u2665 which is made of 3 bytes: \xe2\x99\xa5 was encoded correctly when using encode_non_ascii = true: "\u2665" but when encode_non_ascii = false, the json value was wrongly set as: "\u00e2\u0099\u00a5" because we use an signed char that wrongly detects non-readable chars with ascii value > 127 as control chars with ascii value < 32 (\t, \n, etc.) Test Plan: run the test Reviewed By: delong.j@fb.com FB internal diff: D612782 --- diff --git a/folly/json.cpp b/folly/json.cpp index 989cccc7..0d9996ac 100644 --- a/folly/json.cpp +++ b/folly/json.cpp @@ -30,7 +30,7 @@ namespace folly { namespace json { namespace { -char32_t decodeUtf8(const char*& p, const char* const e) { +char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) { /* The following encodings are valid, except for the 5 and 6 byte * combinations: * 0xxxxxxx @@ -115,9 +115,9 @@ void escapeString(StringPiece input, out.reserve(out.size() + input.size() + 2); out.push_back('\"'); - const char* p = input.begin(); - const char* q = input.begin(); - const char* const e = input.end(); + auto* p = reinterpret_cast(input.begin()); + auto* q = reinterpret_cast(input.begin()); + auto* e = reinterpret_cast(input.end()); while (p < e) { // Since non-ascii encoding inherently does utf8 validation @@ -139,6 +139,8 @@ void escapeString(StringPiece input, } if (opts.encode_non_ascii && (*p & 0x80)) { + // note that this if condition captures utf8 chars + // with value > 127, so size > 1 byte char32_t v = decodeUtf8(p, e); out.append("\\u"); out.push_back(hexDigit(v >> 12)); @@ -156,8 +158,8 @@ void escapeString(StringPiece input, case '\r': out.append("\\r"); p++; break; case '\t': out.append("\\t"); p++; break; default: - // note that this if condition captures both control characters - // and extended ascii characters + // note that this if condition captures non readable chars + // with value < 32, so size = 1 byte (e.g control chars). out.append("\\u00"); out.push_back(hexDigit((*p & 0xf0) >> 4)); out.push_back(hexDigit(*p & 0xf)); diff --git a/folly/test/JsonTest.cpp b/folly/test/JsonTest.cpp index 4d47522f..2397f217 100644 --- a/folly/test/JsonTest.cpp +++ b/folly/test/JsonTest.cpp @@ -246,20 +246,70 @@ TEST(Json, JsonNonAsciiEncoding) { EXPECT_ANY_THROW(folly::json::serialize("\xed\xaf\xbf\xed\xbf\xbf", opts)); } +TEST(Json, UTF8Retention) { + + // test retention with valid utf8 strings + folly::fbstring input = "\u2665"; + folly::fbstring jsonInput = folly::toJson(input); + folly::fbstring output = folly::parseJson(jsonInput).asString(); + folly::fbstring jsonOutput = folly::toJson(output); + + LOG(INFO) << "input: " << input + <<" => json: " << jsonInput; + LOG(INFO) << "output: " << output + <<" => json: " << jsonOutput; + + EXPECT_EQ(input, output); + EXPECT_EQ(jsonInput, jsonOutput); + + // test retention with invalid utf8 - note that non-ascii chars are retained + // as is, and no unicode encoding is attempted so no exception is thrown. + EXPECT_EQ( + folly::toJson("a\xe0\xa0\x80z\xc0\x80"), + "\"a\xe0\xa0\x80z\xc0\x80\"" + ); +} + +TEST(Json, UTF8EncodeNonAsciiRetention) { + + folly::json::serialization_opts opts; + opts.encode_non_ascii = true; + + // test encode_non_ascii valid utf8 strings + folly::fbstring input = "\u2665"; + folly::fbstring jsonInput = folly::json::serialize(input, opts); + folly::fbstring output = folly::parseJson(jsonInput).asString(); + folly::fbstring jsonOutput = folly::json::serialize(output, opts); + + LOG(INFO) << "input: " << input + <<" => json: " << jsonInput; + LOG(INFO) << "output: " << output + <<" => json: " << jsonOutput; + + EXPECT_EQ(input, output); + EXPECT_EQ(jsonInput, jsonOutput); + + // test encode_non_ascii with invalid utf8 - note that an attempt to encode + // non-ascii to unicode will result is a utf8 validation and throw exceptions. + EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts)); + EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts)); +} + TEST(Json, UTF8Validation) { folly::json::serialization_opts opts; opts.validate_utf8 = true; - // valid utf8 strings - EXPECT_EQ(folly::json::serialize("a\xc2\x80z", opts), R"("a\u00c2\u0080z")"); + // test validate_utf8 valid utf8 strings - note that we only validate the + // for utf8 but don't encode non-ascii to unicode so they are retained as is. + EXPECT_EQ(folly::json::serialize("a\xc2\x80z", opts), "\"a\xc2\x80z\""); EXPECT_EQ( folly::json::serialize("a\xe0\xa0\x80z", opts), - R"("a\u00e0\u00a0\u0080z")"); + "\"a\xe0\xa0\x80z\""); EXPECT_EQ( folly::json::serialize("a\xe0\xa0\x80m\xc2\x80z", opts), - R"("a\u00e0\u00a0\u0080m\u00c2\u0080z")"); + "\"a\xe0\xa0\x80m\xc2\x80z\""); - // test with invalid utf8 + // test validate_utf8 with invalid utf8 EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts)); EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts)); }