1 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; Declarations of the two memcpy intrinsic overloads called by the tests in
; this file: one taking addrspace(3) i8 pointers with an i32 length, and one
; taking addrspace(1) i8 pointers with an i64 length. Every call below passes
; 32 as the third operand, the alignN value from the test name as the fourth
; operand, and i1 false as the last operand.
4 declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind
5 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
; Copies 32 bytes between two noalias addrspace(3) pointers (LDS, per the test
; name) through the i32-length memcpy intrinsic with an alignment operand of 1.
8 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
82 define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
; The intrinsic is declared on i8 pointers, so both i64 pointers are bitcast first.
83 %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
84 %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
; Operands: destination, source, length 32, alignment 1, i1 false.
85 call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind
; Copies 32 bytes between two noalias addrspace(3) pointers (LDS, per the test
; name) through the i32-length memcpy intrinsic with an alignment operand of 2.
89 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
127 define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
; The intrinsic is declared on i8 pointers, so both i64 pointers are bitcast first.
128 %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
129 %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
; Operands: destination, source, length 32, alignment 2, i1 false.
130 call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind
; Copies 32 bytes between two noalias addrspace(3) pointers (LDS, per the test
; name) through the i32-length memcpy intrinsic with an alignment operand of 4.
;
; The checks below expect the copy to be done with 32-bit DS reads and writes.
; 32 bytes / 4 bytes per access = 8 accesses, so exactly eight read/write pairs
; are listed; a ninth pair could only pass if two DAG lines were allowed to
; match the same instruction.
134 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4:
135 ; SI-DAG: ds_read_b32
136 ; SI-DAG: ds_write_b32
138 ; SI-DAG: ds_read_b32
139 ; SI-DAG: ds_write_b32
141 ; SI-DAG: ds_read_b32
142 ; SI-DAG: ds_write_b32
144 ; SI-DAG: ds_read_b32
145 ; SI-DAG: ds_write_b32
147 ; SI-DAG: ds_read_b32
148 ; SI-DAG: ds_write_b32
150 ; SI-DAG: ds_read_b32
151 ; SI-DAG: ds_write_b32
153 ; SI-DAG: ds_read_b32
154 ; SI-DAG: ds_write_b32
156 ; SI-DAG: ds_read_b32
157 ; SI-DAG: ds_write_b32
163 define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
; The intrinsic is declared on i8 pointers, so both i64 pointers are bitcast first.
164 %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
165 %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
; Operands: destination, source, length 32, alignment 4, i1 false.
166 call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind
; Copies 32 bytes between two noalias addrspace(3) pointers (LDS, per the test
; name) through the i32-length memcpy intrinsic with an alignment operand of 8.
;
; The checks below still expect 32-bit DS reads and writes (see the FIXME).
; 32 bytes / 4 bytes per access = 8 accesses, so exactly eight read/write pairs
; are listed; a ninth pair could only pass if two DAG lines were allowed to
; match the same instruction.
170 ; FIXME: Use 64-bit ops
171 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:
173 ; SI-DAG: ds_read_b32
174 ; SI-DAG: ds_write_b32
176 ; SI-DAG: ds_read_b32
177 ; SI-DAG: ds_write_b32
179 ; SI-DAG: ds_read_b32
180 ; SI-DAG: ds_write_b32
182 ; SI-DAG: ds_read_b32
183 ; SI-DAG: ds_write_b32
185 ; SI-DAG: ds_read_b32
186 ; SI-DAG: ds_write_b32
188 ; SI-DAG: ds_read_b32
189 ; SI-DAG: ds_write_b32
191 ; SI-DAG: ds_read_b32
192 ; SI-DAG: ds_write_b32
194 ; SI-DAG: ds_read_b32
195 ; SI-DAG: ds_write_b32
201 define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
; The intrinsic is declared on i8 pointers, so both i64 pointers are bitcast first.
202 %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
203 %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
; Operands: destination, source, length 32, alignment 8, i1 false.
204 call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind
; Copies 32 bytes between two noalias addrspace(1) pointers (global memory, per
; the test name) through the i64-length memcpy intrinsic with an alignment
; operand of 1.
;
; The checks expect a byte-wise copy: 32 unsigned-byte buffer loads and 32 byte
; buffer stores, one pair per byte, matched in any order.
208 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1:
209 ; SI-DAG: buffer_load_ubyte
210 ; SI-DAG: buffer_store_byte
211 ; SI-DAG: buffer_load_ubyte
212 ; SI-DAG: buffer_store_byte
213 ; SI-DAG: buffer_load_ubyte
214 ; SI-DAG: buffer_store_byte
215 ; SI-DAG: buffer_load_ubyte
216 ; SI-DAG: buffer_store_byte
217 ; SI-DAG: buffer_load_ubyte
218 ; SI-DAG: buffer_store_byte
219 ; SI-DAG: buffer_load_ubyte
220 ; SI-DAG: buffer_store_byte
221 ; SI-DAG: buffer_load_ubyte
222 ; SI-DAG: buffer_store_byte
223 ; SI-DAG: buffer_load_ubyte
224 ; SI-DAG: buffer_store_byte
226 ; SI-DAG: buffer_load_ubyte
227 ; SI-DAG: buffer_store_byte
228 ; SI-DAG: buffer_load_ubyte
229 ; SI-DAG: buffer_store_byte
230 ; SI-DAG: buffer_load_ubyte
231 ; SI-DAG: buffer_store_byte
232 ; SI-DAG: buffer_load_ubyte
233 ; SI-DAG: buffer_store_byte
234 ; SI-DAG: buffer_load_ubyte
235 ; SI-DAG: buffer_store_byte
236 ; SI-DAG: buffer_load_ubyte
237 ; SI-DAG: buffer_store_byte
238 ; SI-DAG: buffer_load_ubyte
239 ; SI-DAG: buffer_store_byte
240 ; SI-DAG: buffer_load_ubyte
241 ; SI-DAG: buffer_store_byte
243 ; SI-DAG: buffer_load_ubyte
244 ; SI-DAG: buffer_store_byte
245 ; SI-DAG: buffer_load_ubyte
246 ; SI-DAG: buffer_store_byte
247 ; SI-DAG: buffer_load_ubyte
248 ; SI-DAG: buffer_store_byte
249 ; SI-DAG: buffer_load_ubyte
250 ; SI-DAG: buffer_store_byte
251 ; SI-DAG: buffer_load_ubyte
252 ; SI-DAG: buffer_store_byte
253 ; SI-DAG: buffer_load_ubyte
254 ; SI-DAG: buffer_store_byte
255 ; SI-DAG: buffer_load_ubyte
256 ; SI-DAG: buffer_store_byte
257 ; SI-DAG: buffer_load_ubyte
258 ; SI-DAG: buffer_store_byte
260 ; SI-DAG: buffer_load_ubyte
261 ; SI-DAG: buffer_store_byte
262 ; SI-DAG: buffer_load_ubyte
263 ; SI-DAG: buffer_store_byte
264 ; SI-DAG: buffer_load_ubyte
265 ; SI-DAG: buffer_store_byte
266 ; SI-DAG: buffer_load_ubyte
267 ; SI-DAG: buffer_store_byte
268 ; SI-DAG: buffer_load_ubyte
269 ; SI-DAG: buffer_store_byte
270 ; SI-DAG: buffer_load_ubyte
271 ; SI-DAG: buffer_store_byte
272 ; SI-DAG: buffer_load_ubyte
273 ; SI-DAG: buffer_store_byte
274 ; SI-DAG: buffer_load_ubyte
275 ; SI-DAG: buffer_store_byte
278 define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; The intrinsic is declared on i8 pointers, so both i64 pointers are bitcast first.
279 %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
280 %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
; Operands: destination, source, length 32, alignment 1, i1 false.
281 call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind
; Copies 32 bytes between two noalias addrspace(1) pointers (global memory, per
; the test name) through the i64-length memcpy intrinsic with an alignment
; operand of 2.
;
; The checks expect a 16-bit-wise copy: 16 unsigned-short buffer loads and 16
; short buffer stores, matched in any order.
285 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2:
286 ; SI-DAG: buffer_load_ushort
287 ; SI-DAG: buffer_load_ushort
288 ; SI-DAG: buffer_load_ushort
289 ; SI-DAG: buffer_load_ushort
290 ; SI-DAG: buffer_load_ushort
291 ; SI-DAG: buffer_load_ushort
292 ; SI-DAG: buffer_load_ushort
293 ; SI-DAG: buffer_load_ushort
294 ; SI-DAG: buffer_load_ushort
295 ; SI-DAG: buffer_load_ushort
296 ; SI-DAG: buffer_load_ushort
297 ; SI-DAG: buffer_load_ushort
298 ; SI-DAG: buffer_load_ushort
299 ; SI-DAG: buffer_load_ushort
300 ; SI-DAG: buffer_load_ushort
301 ; SI-DAG: buffer_load_ushort
303 ; SI-DAG: buffer_store_short
304 ; SI-DAG: buffer_store_short
305 ; SI-DAG: buffer_store_short
306 ; SI-DAG: buffer_store_short
307 ; SI-DAG: buffer_store_short
308 ; SI-DAG: buffer_store_short
309 ; SI-DAG: buffer_store_short
310 ; SI-DAG: buffer_store_short
311 ; SI-DAG: buffer_store_short
312 ; SI-DAG: buffer_store_short
313 ; SI-DAG: buffer_store_short
314 ; SI-DAG: buffer_store_short
315 ; SI-DAG: buffer_store_short
316 ; SI-DAG: buffer_store_short
317 ; SI-DAG: buffer_store_short
318 ; SI-DAG: buffer_store_short
321 define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; The intrinsic is declared on i8 pointers, so both i64 pointers are bitcast first.
322 %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
323 %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
; Operands: destination, source, length 32, alignment 2, i1 false.
324 call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind
; Copies 32 bytes between two noalias addrspace(1) pointers (global memory, per
; the test name) through the i64-length memcpy intrinsic with an alignment
; operand of 4.
;
; The checks are ordered: two dwordx4 buffer loads followed by two dwordx4
; buffer stores.
328 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4:
329 ; SI: buffer_load_dwordx4
330 ; SI: buffer_load_dwordx4
331 ; SI: buffer_store_dwordx4
332 ; SI: buffer_store_dwordx4
334 define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; The intrinsic is declared on i8 pointers, so both i64 pointers are bitcast first.
335 %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
336 %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
; Operands: destination, source, length 32, alignment 4, i1 false.
337 call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind
; Copies 32 bytes between two noalias addrspace(1) pointers (global memory, per
; the test name) through the i64-length memcpy intrinsic with an alignment
; operand of 8.
;
; The checks are ordered: two dwordx4 buffer loads followed by two dwordx4
; buffer stores.
341 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8:
342 ; SI: buffer_load_dwordx4
343 ; SI: buffer_load_dwordx4
344 ; SI: buffer_store_dwordx4
345 ; SI: buffer_store_dwordx4
347 define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; The intrinsic is declared on i8 pointers, so both i64 pointers are bitcast first.
348 %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
349 %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
; Operands: destination, source, length 32, alignment 8, i1 false.
350 call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind
; Copies 32 bytes between two noalias addrspace(1) pointers (global memory, per
; the test name) through the i64-length memcpy intrinsic with an alignment
; operand of 16.
;
; The checks are ordered: two dwordx4 buffer loads followed by two dwordx4
; buffer stores.
354 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16:
355 ; SI: buffer_load_dwordx4
356 ; SI: buffer_load_dwordx4
357 ; SI: buffer_store_dwordx4
358 ; SI: buffer_store_dwordx4
360 define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; The intrinsic is declared on i8 pointers, so both i64 pointers are bitcast first.
361 %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
362 %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
; Operands: destination, source, length 32, alignment 16, i1 false.
363 call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind