From b0193e80a8b1a9a74d5e90a51d2490b68cbdf69a Mon Sep 17 00:00:00 2001 From: Tom Jackson Date: Tue, 17 Nov 2015 11:01:28 -0800 Subject: [PATCH] UTF8StringPiece, wrapping boost::u8_to_u32 Summary: For handling UTF8 strings better. Reviewed By: yfeldblum Differential Revision: D1956771 fb-gh-sync-id: e074f9f2c9b472f5e619fef25d8e17296847773c --- folly/Range.h | 1 - folly/String.h | 14 ++++++++++++++ folly/test/StringTest.cpp | 23 +++++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/folly/Range.h b/folly/Range.h index ce235bef..2e994732 100644 --- a/folly/Range.h +++ b/folly/Range.h @@ -357,7 +357,6 @@ public: return e_ - b_; } size_type walk_size() const { - assert(b_ <= e_); return std::distance(b_, e_); } bool empty() const { return b_ == e_; } diff --git a/folly/String.h b/folly/String.h index 04004b56..f9ec5fd5 100644 --- a/folly/String.h +++ b/folly/String.h @@ -21,6 +21,7 @@ #include #include #include +#include #ifdef FOLLY_HAVE_DEPRECATED_ASSOC #ifdef _GLIBCXX_SYMVER @@ -592,6 +593,19 @@ inline void toLowerAscii(MutableStringPiece str) { toLowerAscii(str.begin(), str.size()); } +template >> +class UTF8Range : public Base { + public: + /* implicit */ UTF8Range(const folly::Range baseRange) + : Base(boost::u8_to_u32_iterator( + baseRange.begin(), baseRange.begin(), baseRange.end()), + boost::u8_to_u32_iterator( + baseRange.end(), baseRange.begin(), baseRange.end())) {} +}; + +using UTF8StringPiece = UTF8Range; + } // namespace folly // Hook into boost's type traits diff --git a/folly/test/StringTest.cpp b/folly/test/StringTest.cpp index 63f9491e..526afeec 100644 --- a/folly/test/StringTest.cpp +++ b/folly/test/StringTest.cpp @@ -1337,6 +1337,29 @@ TEST(String, whitespace) { EXPECT_EQ("", rtrimWhitespace("\r ")); } +const folly::StringPiece kTestUTF8 = "This is \U0001F602 stuff!"; + +TEST(UTF8StringPiece, valid_utf8) { + folly::StringPiece sp = kTestUTF8; + UTF8StringPiece utf8 = sp; + // utf8.size() not available since it's not a random-access range + EXPECT_EQ(16, utf8.walk_size()); +} + +TEST(UTF8StringPiece, valid_suffix) { + UTF8StringPiece utf8 = kTestUTF8.subpiece(8); + EXPECT_EQ(8, utf8.walk_size()); +} + +TEST(UTF8StringPiece, empty_mid_codepoint) { + UTF8StringPiece utf8 = kTestUTF8.subpiece(9, 0); // okay since it's empty + EXPECT_EQ(0, utf8.walk_size()); +} + +TEST(UTF8StringPiece, invalid_mid_codepoint) { + EXPECT_THROW(UTF8StringPiece(kTestUTF8.subpiece(9, 1)), std::out_of_range); +} + int main(int argc, char *argv[]) { testing::InitGoogleTest(&argc, argv); gflags::ParseCommandLineFlags(&argc, &argv, true); -- 2.34.1