UTF8StringPiece, wrapping boost::u8_to_u32
author Tom Jackson <tjackson@fb.com>
Tue, 17 Nov 2015 19:01:28 +0000 (11:01 -0800)
committer facebook-github-bot-1 <folly-bot@fb.com>
Tue, 17 Nov 2015 19:20:23 +0000 (11:20 -0800)
Summary: For handling UTF8 strings better.

Reviewed By: yfeldblum

Differential Revision: D1956771

fb-gh-sync-id: e074f9f2c9b472f5e619fef25d8e17296847773c

folly/Range.h
folly/String.h
folly/test/StringTest.cpp

index ce235bef18ac0d33721bdfbc722956640cee1ae0..2e994732f80f6b9c9aabf5b53cc07cfa515f35fe 100644 (file)
@@ -357,7 +357,6 @@ public:
     return e_ - b_;
   }
   size_type walk_size() const {
-    assert(b_ <= e_);
     return std::distance(b_, e_);
   }
   bool empty() const { return b_ == e_; }
index 04004b56f510042c62ff7edbf094d5f0692c3abc..f9ec5fd58eb82b76209f90357a4e963fde6bc1d1 100644 (file)
@@ -21,6 +21,7 @@
 #include <stdarg.h>
 #include <string>
 #include <boost/type_traits.hpp>
+#include <boost/regex/pending/unicode_iterator.hpp>
 
 #ifdef FOLLY_HAVE_DEPRECATED_ASSOC
 #ifdef _GLIBCXX_SYMVER
@@ -592,6 +593,19 @@ inline void toLowerAscii(MutableStringPiece str) {
   toLowerAscii(str.begin(), str.size());
 }
 
+template <class Iterator = const char*,  // iterator over the underlying UTF-8 bytes
+          class Base = folly::Range<boost::u8_to_u32_iterator<Iterator>>>
+class UTF8Range : public Base {  // a folly::Range of boost::u8_to_u32_iterator adaptors built from a Range<Iterator>
+ public:
+  /* implicit */ UTF8Range(const folly::Range<Iterator> baseRange)  // converting ctor: a Range<Iterator> can be used where a UTF8Range is expected
+      : Base(boost::u8_to_u32_iterator<Iterator>(  // begin adaptor, positioned at baseRange.begin()
+                 baseRange.begin(), baseRange.begin(), baseRange.end()),  // arguments: (position, range start, range end)
+             boost::u8_to_u32_iterator<Iterator>(  // end adaptor, positioned at baseRange.end()
+                 baseRange.end(), baseRange.begin(), baseRange.end())) {}  // same bounds as the begin adaptor
+};
+
+using UTF8StringPiece = UTF8Range<const char*>;  // UTF8Range over const char*; the tests build it from a folly::StringPiece
+
 } // namespace folly
 
 // Hook into boost's type traits
index 63f9491ebe81cb2956cc2fa2c14555b9ff216817..526afeecbe129bb3dd0f161733cf1aaf59ace56d 100644 (file)
@@ -1337,6 +1337,29 @@ TEST(String, whitespace) {
   EXPECT_EQ("", rtrimWhitespace("\r   "));
 }
 
+const folly::StringPiece kTestUTF8 = "This is \U0001F602 stuff!";  // shared fixture: 8 ASCII chars, U+1F602 (4 bytes in UTF-8), then 7 ASCII chars
+
+TEST(UTF8StringPiece, valid_utf8) {  // the whole fixture is expected to decode to 16 code points
+  folly::StringPiece sp = kTestUTF8;
+  UTF8StringPiece utf8 = sp;  // exercises the implicit StringPiece -> UTF8StringPiece conversion
+  // utf8.size() not available since it's not a random-access range
+  EXPECT_EQ(16, utf8.walk_size());  // 8 + 1 + 7 code points, counted by walking the range
+}
+
+TEST(UTF8StringPiece, valid_suffix) {  // a suffix that begins on a code point boundary is accepted
+  UTF8StringPiece utf8 = kTestUTF8.subpiece(8);  // byte offset 8 is the first byte of U+1F602
+  EXPECT_EQ(8, utf8.walk_size());  // U+1F602 plus the 7 characters of " stuff!"
+}
+
+TEST(UTF8StringPiece, empty_mid_codepoint) {  // an empty range is accepted even when it sits inside a code point
+  UTF8StringPiece utf8 = kTestUTF8.subpiece(9, 0); // okay since it's empty
+  EXPECT_EQ(0, utf8.walk_size());  // nothing to decode
+}
+
+TEST(UTF8StringPiece, invalid_mid_codepoint) {  // a non-empty range that starts inside a code point is rejected
+  EXPECT_THROW(UTF8StringPiece(kTestUTF8.subpiece(9, 1)), std::out_of_range);  // byte 9 is a continuation byte of U+1F602; construction is expected to throw
+}
+
 int main(int argc, char *argv[]) {
   testing::InitGoogleTest(&argc, argv);
   gflags::ParseCommandLineFlags(&argc, &argv, true);