--- /dev/null
+/*
+ * Copyright 2015 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * memcpy: An optimized memcpy implementation for x86_64. It uses AVX when
+ * __AVX__ is defined, and uses SSE2 otherwise.
+ *
+ * @author Bin Liu <binliu@fb.com>
+ */
+
+#if defined(__x86_64__) && defined(__linux__) && !defined(__CYGWIN__)
+
+ .file "memcpy.S"
+ .text
+
+/*
+ * _memcpy_short is a local helper used when length < 8. It cannot be called
+ * from outside, because it expects a non-standard calling convention:
+ *
+ * %rax: destination buffer address.
+ * %rsi: source buffer address.
+ * %edx: length, in the range of [0, 7]
+ *
+ * The code below writes %ecx and %edx, plus %edi for lengths 4..7. %rax is
+ * only used as a store address, so it still holds the destination address
+ * when the helper returns.
+ *
+ * Lengths 4..7 are copied with two 4-byte moves (one anchored at the start
+ * of the buffer, one at its end) that may overlap; lengths 2..3 use a
+ * 1-byte move at the start plus a 2-byte move at the end.
+ */
+ .type _memcpy_short, @function
+_memcpy_short:
+.LSHORT: // label memcpy jumps to when length < 8
+ .cfi_startproc
+ // if (length == 0) return;
+ test %edx, %edx
+ jz .LEND
+
+ movzbl (%rsi), %ecx // %ecx = first source byte (length >= 1 here)
+ // if (length - 4 < 0) goto LS4;
+ sub $4, %edx // %edx = length - 4; borrow (CF) is set when length < 4
+ jb .LS4
+
+ mov (%rsi), %ecx // first 4 source bytes
+ mov (%rsi, %rdx), %edi // last 4 source bytes, at offset length - 4
+ mov %ecx, (%rax)
+ mov %edi, (%rax, %rdx)
+.LEND:
+ rep // NOTE(review): presumably the two-byte `rep ret' idiom for older AMD branch predictors -- confirm
+ ret
+ nop
+
+.LS4:
+ // At this point, length can be 1 or 2 or 3, and %cl contains
+ // the first byte.
+ mov %cl, (%rax)
+ // if (length - 4 + 2 < 0) return;
+ add $2, %edx // wraps to length - 2 (0 or 1) with carry for length 2 or 3; no carry for length 1
+ jnc .LEND
+
+ // length is 2 or 3 here. In either case, just copy the last
+ // two bytes.
+ movzwl (%rsi, %rdx), %ecx
+ mov %cx, (%rax, %rdx)
+ ret
+
+ .cfi_endproc
+ .size _memcpy_short, .-_memcpy_short
+
+
+/*
+ * void* memcpy(void* dst, const void* src, size_t length);
+ *
+ * Arguments arrive in %rdi (dst), %rsi (src) and %rdx (length); the whole
+ * 64-bit %rdx is used as the length. dst is returned in %rax.
+ *
+ * Strategy:
+ *  - length < 8: jump to .LSHORT (_memcpy_short).
+ *  - Otherwise the last 8 bytes are copied first. That store covers the
+ *    trailing (length % 8) bytes and may overlap later stores.
+ *  - The leading (length & 24) bytes (0, 8, 16 or 24) are copied with one
+ *    8-byte move plus, for 16 and 24, one 16-byte move.
+ *  - The remaining (length & ~31) bytes are copied as at most one 32-byte
+ *    step followed by 64-byte loop iterations.
+ *
+ * The tail is written before most of the source has been read, so
+ * overlapping source and destination buffers are not supported.
+ */
+ .align 16
+ .globl memcpy
+ .type memcpy, @function
+memcpy:
+ .cfi_startproc
+
+ mov %rdx, %rcx
+ mov %rdi, %rax // return value = dst
+ cmp $8, %rdx
+ jb .LSHORT
+
+ mov -8(%rsi, %rdx), %r8 // last 8 source bytes
+ mov (%rsi), %r9 // first 8 source bytes
+ mov %r8, -8(%rdi, %rdx)
+ and $24, %rcx // %rcx = head size: 0, 8, 16 or 24
+ // No head: length >= 8 with bits 3 and 4 clear means length >= 32,
+ // so go straight to the 32-byte groups.
+ jz .L32
+
+ mov %r9, (%rdi)
+ mov %rcx, %r8 // %r8 = head size, used below to advance src/dst
+ sub $16, %rcx
+ jb .LT32 // head was 8: the 8-byte store above already covers it
+ // %rcx is 0 or 8 here; a 16-byte move at that offset completes a
+ // 16- or 24-byte head.
+#ifndef __AVX__
+ movdqu (%rsi, %rcx), %xmm1
+ movdqu %xmm1, (%rdi, %rcx)
+#else
+ vmovdqu (%rsi, %rcx), %xmm1
+ vmovdqu %xmm1, (%rdi, %rcx)
+#endif
+ // Test if there are 32-byte groups
+.LT32:
+ add %r8, %rsi // skip the head in src
+ and $-32, %rdx // %rdx = number of bytes in whole 32-byte groups
+ jnz .L32_adjDI
+ ret
+
+ .align 16
+.L32_adjDI:
+ add %r8, %rdi // skip the head in dst
+.L32:
+#ifndef __AVX__
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+#else
+ vmovdqu (%rsi), %ymm0
+#endif
+ // %rdx = number of 64-byte iterations; CF = bit 5 of the byte count,
+ // i.e. whether an odd 32-byte group exists.
+ shr $6, %rdx
+ // No odd group: reuse the 32 bytes just loaded as the first half of
+ // a 64-byte iteration.
+ jnc .L64_32read
+#ifndef __AVX__
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+#else
+ vmovdqu %ymm0, (%rdi)
+#endif
+ // lea (unlike add) leaves the flags alone, so ZF from the shr above
+ // is still valid for the jnz below.
+ lea 32(%rsi), %rsi
+ jnz .L64_adjDI
+#ifdef __AVX__
+ vzeroupper
+#endif
+ ret
+
+.L64_adjDI:
+ add $32, %rdi
+
+.L64:
+#ifndef __AVX__
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+#else
+ vmovdqu (%rsi), %ymm0
+#endif
+
+.L64_32read:
+#ifndef __AVX__
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+ add $64, %rsi
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, 32(%rdi)
+ movdqu %xmm3, 48(%rdi)
+#else
+ vmovdqu 32(%rsi), %ymm1
+ add $64, %rsi
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 32(%rdi)
+#endif
+ add $64, %rdi
+ dec %rdx // one fewer 64-byte iteration left
+ jnz .L64
+#ifdef __AVX__
+ vzeroupper
+#endif
+ ret
+
+ .cfi_endproc
+ .size memcpy, .-memcpy
+
+ // Mark this object as not needing an executable stack. Without the
+ // note, GNU ld assumes the object requires one and makes the stack
+ // of the whole linked binary executable.
+ .section .note.GNU-stack,"",@progbits
+
+#endif
--- /dev/null
+/*
+ * Copyright 2015 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+namespace {
+
+// Length in bytes of each scratch buffer used by the tests below.
+constexpr size_t SIZE = 4096 * 4;
+char src[SIZE];
+char dst[SIZE];
+
+// Resets both buffers to their known patterns: src[i] holds the low byte of
+// i and dst[i] holds the low byte of (255 - i), so the two buffers differ at
+// every index before a copy is made.
+void init() {
+  size_t i = 0;
+  while (i != SIZE) {
+    const char ascending = static_cast<char>(i);
+    const char descending = static_cast<char>(255 - i);
+    src[i] = ascending;
+    dst[i] = descending;
+    ++i;
+  }
+}
+} // namespace
+
+TEST(memcpy, zero_len) {
+  // A zero-length copy is expected to leave memory untouched, so handing
+  // it two null buffers should complete without crashing.
+  char* const dstNull = nullptr;
+  char* const srcNull = nullptr;
+  memcpy(dstNull, srcNull, 0);
+}
+
+// Copies `len' bytes from `src' to `dst' on freshly initialized buffers, then
+// checks that the copied prefix matches and that the byte right after it in
+// each buffer still holds its initial value. Lengths above SIZE are skipped.
+void testLen(size_t len) {
+  if (len <= SIZE) {
+    init();
+    memcpy(dst, src, len);
+    size_t i = 0;
+    while (i < len) {
+      EXPECT_EQ(src[i], static_cast<char>(i));
+      EXPECT_EQ(src[i], dst[i]);
+      ++i;
+    }
+    // Only when a byte exists past the copied range can it be inspected.
+    if (len != SIZE) {
+      EXPECT_EQ(src[len], static_cast<char>(len));
+      EXPECT_EQ(dst[len], static_cast<char>(255 - len));
+    }
+  }
+}
+
+TEST(memcpy, small) {
+  // Exercise every length from 1 through 7.
+  size_t len = 1;
+  while (len < 8) {
+    testLen(len);
+    ++len;
+  }
+}
+
+TEST(memcpy, main) {
+  // Every length from 8 through 127.
+  size_t len = 8;
+  while (len < 128) {
+    testLen(len);
+    ++len;
+  }
+
+  // Multiples of 128 that are below SIZE.
+  len = 128;
+  while (len < SIZE) {
+    testLen(len);
+    len += 128;
+  }
+
+  // Lengths 128, 201, 274, ... (step 73) below SIZE.
+  len = 128;
+  while (len < SIZE) {
+    testLen(len);
+    len += 73;
+  }
+}