From 3a5e9cb146a444148b1f40265acadb6a27ee7776 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 28 Jul 2014 17:49:26 +0000 Subject: [PATCH] R600/SI: Implement getOptimalMemOpType The default guess uses i32. This needs an address space argument to really do the right thing in all cases. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214104 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/SIISelLowering.cpp | 20 ++ lib/Target/R600/SIISelLowering.h | 6 + test/CodeGen/R600/llvm.memcpy.ll | 358 +++++++++++++++++++++++++++++ 3 files changed, 384 insertions(+) create mode 100644 test/CodeGen/R600/llvm.memcpy.ll diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index a4d4195edde..c3405e10453 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -274,6 +274,26 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return VT.bitsGT(MVT::i32); } +EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // FIXME: Should account for address space here. + + // The default fallback uses the private pointer size as a guess for a type to + // use. Make sure we switch these to 64-bit (v2i32) or 128-bit (v4i32) ops. + + if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global + return MVT::v4i32; + + if (Size >= 8 && DstAlign >= 4) + return MVT::v2i32; + + // Neither case applies: return MVT::Other to use the default. 
+ return MVT::Other; +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index d03bc864148..93e97c32357 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -63,6 +63,12 @@ public: unsigned Align, bool *IsFast) const override; + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const override; + TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; diff --git a/test/CodeGen/R600/llvm.memcpy.ll b/test/CodeGen/R600/llvm.memcpy.ll new file mode 100644 index 00000000000..6e9a51ef4dd --- /dev/null +++ b/test/CodeGen/R600/llvm.memcpy.ll @@ -0,0 +1,358 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind +declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind + + +; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align1 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 + +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 + +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: 
DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 + +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 +; SI: DS_READ_U8 +; SI: DS_WRITE_B8 + +; SI: S_ENDPGM +define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind + ret void +} + +; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align2 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 + +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 +; SI: DS_READ_U16 +; SI: DS_WRITE_B16 + +; SI: S_ENDPGM +define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 
32, i32 2, i1 false) nounwind + ret void +} + +; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align4 +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI: S_ENDPGM +define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind + ret void +} + +; FIXME: Use 64-bit ops +; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align8 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: DS_READ_B32 +; SI-DAG: DS_WRITE_B32 + +; SI-DAG: S_ENDPGM +define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind + ret void +} + +; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align1 +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE 
+; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE + +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE + +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE + +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_LOAD_UBYTE +; SI-DAG: BUFFER_STORE_BYTE + +; SI: S_ENDPGM +define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to 
i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind + ret void +} + +; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align2 +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT + +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT +; SI-DAG: BUFFER_LOAD_USHORT +; SI-DAG: BUFFER_STORE_SHORT + +; SI: S_ENDPGM +define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind + ret void +} + +; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align4 +; SI: BUFFER_LOAD_DWORDX4 +; SI: BUFFER_STORE_DWORDX4 +; SI: BUFFER_LOAD_DWORDX4 +; SI: BUFFER_STORE_DWORDX4 +; SI: S_ENDPGM +define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 
addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind + ret void +} + +; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align8 +; SI: BUFFER_LOAD_DWORDX4 +; SI: BUFFER_STORE_DWORDX4 +; SI: BUFFER_LOAD_DWORDX4 +; SI: BUFFER_STORE_DWORDX4 +; SI: S_ENDPGM +define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind + ret void +} + +; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align16 +; SI: BUFFER_LOAD_DWORDX4 +; SI: BUFFER_STORE_DWORDX4 +; SI: BUFFER_LOAD_DWORDX4 +; SI: BUFFER_STORE_DWORDX4 +; SI: S_ENDPGM +define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind + ret void +} -- 2.34.1