/*
 * Copyright 2016 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * memcpy: An optimized memcpy implementation for x86_64. It uses AVX when
 * __AVX__ is defined, and uses SSE2 otherwise.
 *
 * @author Bin Liu <binliu@fb.com>
 */
24 #if defined(__x86_64__) && defined(__linux__) && !defined(__CYGWIN__)
// NOTE(review): this extract is garbled — each line carries a stray leading
// number fused in from the original file, and those numbers jump (30, 31,
// 33, 37, 41, ...), so most instructions of this routine (its entry label,
// the length tests, branches, and the 1/2/3-byte tail stores) are elided
// here. Do not modify the instructions below without the complete file.
30 * _memcpy_short is a local helper used when length < 8. It cannot be called
31 * from outside, because it expects a non-standard calling convention:
33 * %rax: destination buffer address.
34 * %rsi: source buffer address.
35 * %edx: length, in the range of [0, 7]
37 .type _memcpy_short, @function
41 // if (length == 0) return;
46 // if (length - 4 < 0) goto LS4;
// NOTE(review): a length adjustment (presumably "sub $4, %edx") appears to
// be elided above, so (%rsi,%rdx) would address the last 4 source bytes of
// the [4,7]-byte case — TODO confirm against the full source.
51 mov (%rsi, %rdx), %edi
// Store those same 4 bytes to the destination tail (dst is %rax per the
// non-standard convention documented in the header above).
53 mov %edi, (%rax, %rdx)
60 // At this point, length can be 1 or 2 or 3, and $cl contains
63 // if (length - 4 + 2 < 0) return;
67 // length is 2 or 3 here. In either case, just copy the last
// Zero-extending 2-byte load of the source tail; the matching 2-byte store
// (and the single-byte path) is presumably in the elided lines — verify.
69 movzwl (%rsi, %rdx), %ecx
74 .size _memcpy_short, .-_memcpy_short
78 * void* memcpy(void* dst, void* src, uint32_t length);
83 .type memcpy, @function
// NOTE(review): large gaps in this extract (the embedded original line
// numbers jump from 83 to 92, 94 to 103, etc.). The entry label, the
// small-length dispatch to _memcpy_short, the loop labels/branches, the
// #ifdef __AVX__ guards that select between the movdqu/vmovdqu pairs
// below, and any trailing vzeroupper/ret are all elided. Comments below
// are hedged; confirm every claim against the complete file.
// Overlapping 8-byte tail copy: load/store the qword at offset length-8
// (here dst = %rdi, src = %rsi, length in %rdx per SysV argument order).
92 mov -8(%rsi, %rdx), %r8
94 mov %r8, -8(%rdi, %rdx)
// SSE2 variant: unaligned 16-byte copy at offset %rcx (loop index,
// presumably maintained by elided instructions — TODO confirm).
103 movdqu (%rsi, %rcx), %xmm1
104 movdqu %xmm1, (%rdi, %rcx)
// AVX (VEX-encoded) variant of the same 16-byte copy; presumably the
// other arm of an elided #ifdef __AVX__ — confirm.
106 vmovdqu (%rsi, %rcx), %xmm1
107 vmovdqu %xmm1, (%rdi, %rcx)
109 // Test if there are 32-byte groups
// 32-byte group: SSE2 path uses two 16-byte moves, AVX path one 32-byte
// ymm move; again presumably alternate preprocessor arms — confirm.
122 movdqu 16(%rsi), %xmm1
124 vmovdqu (%rsi), %ymm0
130 movdqu %xmm1, 16(%rdi)
132 vmovdqu %ymm0, (%rdi)
// 64-byte group: SSE2 path copies four xmm words, AVX path two ymm words.
// NOTE(review): the first 16/32 bytes of each group (xmm0 load/store, and
// the pointer advancement / remaining-count bookkeeping) are elided here.
147 movdqu 16(%rsi), %xmm1
149 vmovdqu (%rsi), %ymm0
154 movdqu 32(%rsi), %xmm2
155 movdqu 48(%rsi), %xmm3
158 movdqu %xmm1, 16(%rdi)
159 movdqu %xmm2, 32(%rdi)
160 movdqu %xmm3, 48(%rdi)
162 vmovdqu 32(%rsi), %ymm1
164 vmovdqu %ymm0, (%rdi)
165 vmovdqu %ymm1, 32(%rdi)
// NOTE(review): on the AVX path, SysV convention expects vzeroupper before
// returning to SSE/compiled code after %ymm use — presumably present in
// the elided epilogue; verify.
176 .size memcpy, .-memcpy