Port fbstring_core to big-endian architectures.

author Christopher Cole <colec@amazon.com>

Mon, 14 Sep 2015 17:47:12 +0000 (10:47 -0700)

committer facebook-github-bot-4 <folly-bot@fb.com>

Mon, 14 Sep 2015 18:20:25 +0000 (11:20 -0700)
author Christopher Cole <colec@amazon.com>
Mon, 14 Sep 2015 17:47:12 +0000 (10:47 -0700)
committer facebook-github-bot-4 <folly-bot@fb.com>
Mon, 14 Sep 2015 18:20:25 +0000 (11:20 -0700)
diff --git a/folly/FBString.h b/folly/FBString.h

index 4afce748d8bc9d433dc45439f7443da0fd6a435a..85a0a6928449218e9370b5011c1bc72bab879ef4 100644 (file)
--- a/folly/FBString.h
+++ b/folly/FBString.h
@@ -266,8 +266,8 @@ private:
  
  /**
   * This is the core of the string. The code should work on 32- and
  
  /**
   * This is the core of the string. The code should work on 32- and
- * 64-bit architectures and with any Char size. Porting to big endian
- * architectures would require some changes.
+ * 64-bit and both big- and little-endian architectures with any
+ * Char size.
   *
   * The storage is selected as follows (assuming we store one-byte
   * characters on a 64-bit machine): (a) "small" strings between 0 and
   *
   * The storage is selected as follows (assuming we store one-byte
   * characters on a 64-bit machine): (a) "small" strings between 0 and
@@ -279,19 +279,29 @@ private:
   * reference-counted and copied lazily. the reference count is
   * allocated right before the character array.
   *
   * reference-counted and copied lazily. the reference count is
   * allocated right before the character array.
   *
- * The discriminator between these three strategies sits in the two
- * most significant bits of the rightmost char of the storage. If
- * neither is set, then the string is small (and its length sits in
- * the lower-order bits of that rightmost character). If the MSb is
- * set, the string is medium width. If the second MSb is set, then the
- * string is large.
+ * The discriminator between these three strategies sits in two
+ * bits of the rightmost char of the storage. If neither is set, then the
+ * string is small (and its length sits in the lower-order bits on
+ * little-endian or the high-order bits on big-endian of that
+ * rightmost character). If the higher of the two bits is set, the string
+ * is medium; if the lower one is set, the string is large. On
+ * little-endian, these 2 bits are the 2 MSbs of MediumLarge::capacity_,
+ * while on big-endian they are its 2 LSbs. This keeps the little-endian
+ * and big-endian fbstring_core equivalent, with merely different ops used
+ * to extract capacity/category.
   */
  template <class Char> class fbstring_core {
  public:
    fbstring_core() noexcept {
      // Only initialize the tag, will set the MSBs (i.e. the small
      // string size) to zero too
   */
  template <class Char> class fbstring_core {
  public:
    fbstring_core() noexcept {
      // Only initialize the tag, will set the MSBs (i.e. the small
      // string size) to zero too
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      ml_.capacity_ = maxSmallSize << (8 * (sizeof(size_t) - sizeof(Char)));
      ml_.capacity_ = maxSmallSize << (8 * (sizeof(size_t) - sizeof(Char)));
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    ml_.capacity_ = maxSmallSize << 2;
+#else
+#error Unable to identify target endianness
+#endif
      // or: setSmallSize(0);
      writeTerminator();
      assert(category() == Category::isSmall && size() == 0);
      // or: setSmallSize(0);
      writeTerminator();
      assert(category() == Category::isSmall && size() == 0);
@@ -338,8 +348,7 @@ public:
        // No need for writeTerminator() here, we copied one extra
        // element just above.
        ml_.size_ = rhs.ml_.size_;
        // No need for writeTerminator() here, we copied one extra
        // element just above.
        ml_.size_ = rhs.ml_.size_;
-      ml_.capacity_ = (allocSize / sizeof(Char) - 1)
-                      | static_cast<category_type>(Category::isMedium);
+      ml_.setCapacity(allocSize / sizeof(Char) - 1, Category::isMedium);
        assert(category() == Category::isMedium);
      }
      assert(size() == rhs.size());
        assert(category() == Category::isMedium);
      }
      assert(size() == rhs.size());
@@ -414,16 +423,14 @@ public:
        ml_.data_ = static_cast<Char*>(checkedMalloc(allocSize));
        fbstring_detail::pod_copy(data, data + size, ml_.data_);
        ml_.size_ = size;
        ml_.data_ = static_cast<Char*>(checkedMalloc(allocSize));
        fbstring_detail::pod_copy(data, data + size, ml_.data_);
        ml_.size_ = size;
-      ml_.capacity_ = (allocSize / sizeof(Char) - 1)
-                      | static_cast<category_type>(Category::isMedium);
+      ml_.setCapacity(allocSize / sizeof(Char) - 1, Category::isMedium);
      } else {
        // Large strings are allocated differently
        size_t effectiveCapacity = size;
        auto const newRC = RefCounted::create(data, & effectiveCapacity);
        ml_.data_ = newRC->data_;
        ml_.size_ = size;
      } else {
        // Large strings are allocated differently
        size_t effectiveCapacity = size;
        auto const newRC = RefCounted::create(data, & effectiveCapacity);
        ml_.data_ = newRC->data_;
        ml_.size_ = size;
-      ml_.capacity_ = effectiveCapacity
-                      | static_cast<category_type>(Category::isLarge);
+      ml_.setCapacity(effectiveCapacity, Category::isLarge);
      }
      writeTerminator();
    }
      }
      writeTerminator();
    }
@@ -458,8 +465,7 @@ public:
        ml_.data_ = data;
        ml_.size_ = size;
        // Don't forget about null terminator
        ml_.data_ = data;
        ml_.size_ = size;
        // Don't forget about null terminator
-      ml_.capacity_ = (allocatedSize - 1)
-                      | static_cast<category_type>(Category::isMedium);
+      ml_.setCapacity(allocatedSize - 1, Category::isMedium);
      } else {
        // No need for the memory
        free(data);
      } else {
        // No need for the memory
        free(data);
@@ -556,8 +562,7 @@ public:
          // we have + 1 above.
          RefCounted::decrementRefs(ml_.data_);
          ml_.data_ = newRC->data_;
          // we have + 1 above.
          RefCounted::decrementRefs(ml_.data_);
          ml_.data_ = newRC->data_;
-        ml_.capacity_ = minCapacity
-                        | static_cast<category_type>(Category::isLarge);
+        ml_.setCapacity(minCapacity, Category::isLarge);
          // size remains unchanged
        } else {
          // String is not shared, so let's try to realloc (if needed)
          // size remains unchanged
        } else {
          // String is not shared, so let's try to realloc (if needed)
@@ -567,8 +572,7 @@ public:
                 RefCounted::reallocate(ml_.data_, ml_.size_,
                                        ml_.capacity(), minCapacity);
            ml_.data_ = newRC->data_;
                 RefCounted::reallocate(ml_.data_, ml_.size_,
                                        ml_.capacity(), minCapacity);
            ml_.data_ = newRC->data_;
-          ml_.capacity_ = minCapacity
-                          | static_cast<category_type>(Category::isLarge);
+          ml_.setCapacity(minCapacity, Category::isLarge);
            writeTerminator();
          }
          assert(capacity() >= minCapacity);
            writeTerminator();
          }
          assert(capacity() >= minCapacity);
@@ -589,8 +593,7 @@ public:
              (ml_.capacity() + 1) * sizeof(Char),
              capacityBytes));
          writeTerminator();
              (ml_.capacity() + 1) * sizeof(Char),
              capacityBytes));
          writeTerminator();
-        ml_.capacity_ = (capacityBytes / sizeof(Char) - 1)
-                        | static_cast<category_type>(Category::isMedium);
+        ml_.setCapacity(capacityBytes / sizeof(Char) - 1, Category::isMedium);
        } else {
          // Conversion from medium to large string
          fbstring_core nascent;
        } else {
          // Conversion from medium to large string
          fbstring_core nascent;
@@ -613,8 +616,7 @@ public:
          // No need for writeTerminator(), we wrote it above with + 1.
          ml_.data_ = newRC->data_;
          ml_.size_ = size;
          // No need for writeTerminator(), we wrote it above with + 1.
          ml_.data_ = newRC->data_;
          ml_.size_ = size;
-        ml_.capacity_ = minCapacity
-                        | static_cast<category_type>(Category::isLarge);
+        ml_.setCapacity(minCapacity, Category::isLarge);
          assert(capacity() >= minCapacity);
        } else if (minCapacity > maxSmallSize) {
          // medium
          assert(capacity() >= minCapacity);
        } else if (minCapacity > maxSmallSize) {
          // medium
@@ -627,8 +629,7 @@ public:
          // No need for writeTerminator(), we wrote it above with + 1.
          ml_.data_ = data;
          ml_.size_ = size;
          // No need for writeTerminator(), we wrote it above with + 1.
          ml_.data_ = data;
          ml_.size_ = size;
-        ml_.capacity_ = (allocSizeBytes / sizeof(Char) - 1)
-                        | static_cast<category_type>(Category::isMedium);
+        ml_.setCapacity(allocSizeBytes / sizeof(Char) - 1, Category::isMedium);
        } else {
          // small
          // Nothing to do, everything stays put
        } else {
          // small
          // Nothing to do, everything stays put
@@ -728,16 +729,6 @@ private:
    // Disabled
    fbstring_core & operator=(const fbstring_core & rhs);
  
    // Disabled
    fbstring_core & operator=(const fbstring_core & rhs);
  
-  struct MediumLarge {
-    Char * data_;
-    size_t size_;
-    size_t capacity_;
-
-    size_t capacity() const {
-      return capacity_ & capacityExtractMask;
-    }
-  };
-
    struct RefCounted {
      std::atomic<size_t> refCount_;
      Char data_[1];
    struct RefCounted {
      std::atomic<size_t> refCount_;
      Char data_[1];
@@ -805,6 +796,53 @@ private:
      }
    };
  
      }
    };
  
+  typedef std::conditional<sizeof(size_t) == 4, uint32_t, uint64_t>::type
+          category_type;
+
+  enum class Category : category_type {
+    isSmall = 0,
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    isMedium = sizeof(size_t) == 4 ? 0x80000000 : 0x8000000000000000,
+    isLarge =  sizeof(size_t) == 4 ? 0x40000000 : 0x4000000000000000,
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    isMedium = 0x2,
+    isLarge =  0x1,
+#else
+#error Unable to identify target endianness
+#endif
+  };
+
+  Category category() const {
+    // works for both big-endian and little-endian
+    return static_cast<Category>(ml_.capacity_ & categoryExtractMask);
+  }
+
+  struct MediumLarge {
+    Char * data_;
+    size_t size_;
+    size_t capacity_;
+
+    size_t capacity() const {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+      return capacity_ & capacityExtractMask;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+      return capacity_ >> 2;
+#else
+#error Unable to identify target endianness
+#endif
+    }
+
+    void setCapacity(size_t cap, Category cat) {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+        capacity_ = cap | static_cast<category_type>(cat);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        capacity_ = (cap << 2) | static_cast<category_type>(cat);
+#else
+#error Unable to identify target endianness
+#endif
+    }
+  };
+
    union {
      Char small_[sizeof(MediumLarge) / sizeof(Char)];
      MediumLarge ml_;
    union {
      Char small_[sizeof(MediumLarge) / sizeof(Char)];
      MediumLarge ml_;
@@ -815,32 +853,34 @@ private:
      maxSmallSize = lastChar / sizeof(Char),
      maxMediumSize = 254 / sizeof(Char),            // coincides with the small
                                                     // bin size in dlmalloc
      maxSmallSize = lastChar / sizeof(Char),
      maxMediumSize = 254 / sizeof(Char),            // coincides with the small
                                                     // bin size in dlmalloc
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      categoryExtractMask = sizeof(size_t) == 4 ? 0xC0000000 : 0xC000000000000000,
      capacityExtractMask = ~categoryExtractMask,
      categoryExtractMask = sizeof(size_t) == 4 ? 0xC0000000 : 0xC000000000000000,
      capacityExtractMask = ~categoryExtractMask,
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    categoryExtractMask = 0x3,
+#else
+#error Unable to identify target endianness
+#endif
    };
    static_assert(!(sizeof(MediumLarge) % sizeof(Char)),
                  "Corrupt memory layout for fbstring.");
  
    };
    static_assert(!(sizeof(MediumLarge) % sizeof(Char)),
                  "Corrupt memory layout for fbstring.");
  
-  typedef std::conditional<sizeof(size_t) == 4, uint32_t, uint64_t>::type
-          category_type;
-
-  enum class Category : category_type {
-    isSmall = 0,
-    isMedium = sizeof(size_t) == 4 ? 0x80000000 : 0x8000000000000000,
-    isLarge =  sizeof(size_t) == 4 ? 0x40000000 : 0x4000000000000000,
-  };
-
-  Category category() const {
-    // Assumes little endian
-    return static_cast<Category>(ml_.capacity_ & categoryExtractMask);
-  }
-
    size_t smallSize() const {
    size_t smallSize() const {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      assert(category() == Category::isSmall &&
             static_cast<size_t>(small_[maxSmallSize])
             <= static_cast<size_t>(maxSmallSize));
      return static_cast<size_t>(maxSmallSize)
        - static_cast<size_t>(small_[maxSmallSize]);
      assert(category() == Category::isSmall &&
             static_cast<size_t>(small_[maxSmallSize])
             <= static_cast<size_t>(maxSmallSize));
      return static_cast<size_t>(maxSmallSize)
        - static_cast<size_t>(small_[maxSmallSize]);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    assert(category() == Category::isSmall &&
+           (static_cast<size_t>(small_[maxSmallSize]) >> 2)
+           <= static_cast<size_t>(maxSmallSize));
+    return static_cast<size_t>(maxSmallSize)
+      - (static_cast<size_t>(small_[maxSmallSize]) >> 2);
+#else
+#error Unable to identify target endianness
+#endif
    }
  
    void setSmallSize(size_t s) {
    }
  
    void setSmallSize(size_t s) {
@@ -848,7 +888,13 @@ private:
      // so don't assume anything about the previous value of
      // small_[maxSmallSize].
      assert(s <= maxSmallSize);
      // so don't assume anything about the previous value of
      // small_[maxSmallSize].
      assert(s <= maxSmallSize);
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      small_[maxSmallSize] = maxSmallSize - s;
      small_[maxSmallSize] = maxSmallSize - s;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    small_[maxSmallSize] = (maxSmallSize - s) << 2;
+#else
+#error Unable to identify target endianness
+#endif
      writeTerminator();
    }
  };
      writeTerminator();
    }
  };
diff --git a/folly/docs/FBString.md b/folly/docs/FBString.md

index bfd69f1a43995079c87c5ec1affdf1053902c981..b41a63ed24f15e2cd1b653fa93c82865f63d4181 100644 (file)
--- a/folly/docs/FBString.md
+++ b/folly/docs/FBString.md
@@ -9,8 +9,8 @@ allocator. In particular, `fbstring` is designed to detect use of
  jemalloc and cooperate with it to achieve significant improvements in
  speed and memory usage.
  
  jemalloc and cooperate with it to achieve significant improvements in
  speed and memory usage.
  
-`fbstring` supports x32 and x64 architectures. Porting it to big endian
-architectures would require some changes.
+`fbstring` supports 32- and 64-bit and little- and big-endian
+architectures.
  
  ### Storage strategies
  ***
  
  ### Storage strategies
  ***
@@ -43,4 +43,4 @@ architectures would require some changes.
    `string::find()` for successful searches and a 1.5x speed
    improvement for failed searches.
  
    `string::find()` for successful searches and a 1.5x speed
    improvement for failed searches.
  
-* Offers conversions to and from `std::string`.
-\ No newline at end of file
+* Offers conversions to and from `std::string`.
author	Christopher Cole <colec@amazon.com>
	Mon, 14 Sep 2015 17:47:12 +0000 (10:47 -0700)
committer	facebook-github-bot-4 <folly-bot@fb.com>
	Mon, 14 Sep 2015 18:20:25 +0000 (11:20 -0700)
folly/FBString.h		patch \| blob \| history
folly/docs/FBString.md		patch \| blob \| history