#include <nmmintrin.h>
namespace folly {
namespace detail {
-extern const __m128i groupVarintSSEMasks[];
+alignas(16) extern const uint64_t groupVarintSSEMasks[];
} // namespace detail
} // namespace folly
#endif
static const char* decode(const char* p, uint32_t* dest) {
uint8_t key = p[0];
__m128i val = _mm_loadu_si128((const __m128i*)(p+1));
- __m128i mask = detail::groupVarintSSEMasks[key];
+ __m128i mask =
+ _mm_load_si128((const __m128i*)&detail::groupVarintSSEMasks[key * 2]);
__m128i r = _mm_shuffle_epi8(val, mask);
_mm_storeu_si128((__m128i*)dest, r);
return p + detail::groupVarintLengths[key];
uint32_t* c, uint32_t* d) {
uint8_t key = p[0];
__m128i val = _mm_loadu_si128((const __m128i*)(p+1));
- __m128i mask = detail::groupVarintSSEMasks[key];
+ __m128i mask =
+ _mm_load_si128((const __m128i*)&detail::groupVarintSSEMasks[key * 2]);
__m128i r = _mm_shuffle_epi8(val, mask);
// Extracting 32 bits at a time out of an XMM register is an SSE4 feature
#include <stdint.h>
-#if (FOLLY_X64 || defined(__i386__)) && (FOLLY_SSE >= 2)
-#include <x86intrin.h>
-#endif
-
namespace folly {
namespace detail {
#if (FOLLY_X64 || defined(__i386__)) && (FOLLY_SSE >= 2)
-extern const __m128i groupVarintSSEMasks[] = {
+alignas(16) extern const uint64_t groupVarintSSEMasks[512] = {
""")
# Compute SSE masks
# 0xff: set corresponding byte in result to 0
for k in range(d, 4):
vals[j] |= 0xff << (8 * k)
- f.write(" {{static_cast<int64_t>(0x{1:08x}{0:08x}), "
- "static_cast<int64_t>(0x{3:08x}{2:08x})}},\n".format(*vals))
+ f.write(" 0x{1:08x}{0:08x}ULL, "
+ "0x{3:08x}{2:08x}ULL,\n".format(*vals))
f.write("};\n"
"#endif /*#if (FOLLY_X64 || defined(__i386__)) && (FOLLY_SSE >= 2)*/\n"