From b5659367ca98a51bcce017d042ea681f01d14e37 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 5 Jan 2016 03:40:16 +0000 Subject: [PATCH] AMDGPU/SI: Select non-uniform constant addrspace loads to flat instructions for HSA Summary: This fixes a regression caused by r256282. Reviewers: arsenm, cfang Subscribers: arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D15736 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256810 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIInstrInfo.td | 3 +- test/CodeGen/AMDGPU/load.ll | 140 ++++++++++----- test/CodeGen/AMDGPU/salu-to-valu.ll | 269 ++++++++++++++++------------ 3 files changed, 246 insertions(+), 166 deletions(-) diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 84a386d8872..8735277149a 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -141,7 +141,8 @@ def SIconstdata_ptr : SDNode< class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ return isFlatLoad(dyn_cast<LoadSDNode>(N)) || - isGlobalLoad(dyn_cast<LoadSDNode>(N)); + isGlobalLoad(dyn_cast<LoadSDNode>(N)) || + isConstantLoad(cast<LoadSDNode>(N), -1); }]>; def flat_load : flat_ld <load>; diff --git a/test/CodeGen/AMDGPU/load.ll b/test/CodeGen/AMDGPU/load.ll index 6a04261fe47..6486c6ab2ff 100644 --- a/test/CodeGen/AMDGPU/load.ll +++ b/test/CodeGen/AMDGPU/load.ll @@ -1,7 +1,8 @@ ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | 
FileCheck --check-prefix=FUNC --check-prefix=CI-HSA --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s ;===------------------------------------------------------------------------===; ; GLOBAL ADDRESS SPACE @@ -11,7 +12,8 @@ ; FUNC-LABEL: {{^}}load_i8: ; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ubyte v{{[0-9]+}}, +; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}}, +; CI-HSA: flat_load_ubyte define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { %1 = load i8, i8 addrspace(1)* %in %2 = zext i8 %1 to i32 @@ -23,7 +25,8 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { ; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] ; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; R600: 8 -; SI: buffer_load_sbyte +; SI-NOHSA: buffer_load_sbyte +; CI-HSA: flat_load_sbyte define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %0 = load i8, i8 addrspace(1)* %in @@ -35,8 +38,10 @@ entry: ; FUNC-LABEL: {{^}}load_v2i8: ; R600: VTX_READ_8 ; R600: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte +; SI-NOHSA: buffer_load_ubyte +; SI-NOHSA: buffer_load_ubyte +; CI-HSA: flat_load_ubyte +; CI-HSA: flat_load_ubyte define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { entry: %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in @@ -53,8 +58,10 @@ entry: ; R600-DAG: 8 ; R600-DAG: 8 -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte +; SI-NOHSA: buffer_load_sbyte +; SI-NOHSA: buffer_load_sbyte +; CI-HSA: flat_load_sbyte +; CI-HSA: flat_load_sbyte define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { entry: %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in @@ -68,10 +75,14 @@ entry: ; R600: VTX_READ_8 ; R600: VTX_READ_8 ; R600: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte +; SI-NOHSA: 
buffer_load_ubyte +; SI-NOHSA: buffer_load_ubyte +; SI-NOHSA: buffer_load_ubyte +; SI-NOHSA: buffer_load_ubyte +; CI-HSA: flat_load_ubyte +; CI-HSA: flat_load_ubyte +; CI-HSA: flat_load_ubyte +; CI-HSA: flat_load_ubyte define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { entry: %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in @@ -93,10 +104,14 @@ entry: ; R600-DAG: 8 ; R600-DAG: 8 ; R600-DAG: 8 -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte +; SI-NOHSA: buffer_load_sbyte +; SI-NOHSA: buffer_load_sbyte +; SI-NOHSA: buffer_load_sbyte +; SI-NOHSA: buffer_load_sbyte +; CI-HSA: flat_load_sbyte +; CI-HSA: flat_load_sbyte +; CI-HSA: flat_load_sbyte +; CI-HSA: flat_load_sbyte define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { entry: %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in @@ -108,7 +123,8 @@ entry: ; Load an i16 value from the global address space. ; FUNC-LABEL: {{^}}load_i16: ; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort +; SI-NOHSA: buffer_load_ushort +; CI-HSA: flat_load_ushort define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { entry: %0 = load i16 , i16 addrspace(1)* %in @@ -121,7 +137,8 @@ entry: ; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] ; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; R600: 16 -; SI: buffer_load_sshort +; SI-NOHSA: buffer_load_sshort +; CI-HSA: flat_load_sshort define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { entry: %0 = load i16, i16 addrspace(1)* %in @@ -133,8 +150,10 @@ entry: ; FUNC-LABEL: {{^}}load_v2i16: ; R600: VTX_READ_16 ; R600: VTX_READ_16 -; SI: buffer_load_ushort -; SI: buffer_load_ushort +; SI-NOHSA: buffer_load_ushort +; SI-NOHSA: buffer_load_ushort +; CI-HSA: flat_load_ushort +; CI-HSA: flat_load_ushort define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: %0 = load <2 x 
i16>, <2 x i16> addrspace(1)* %in @@ -150,8 +169,10 @@ entry: ; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal ; R600-DAG: 16 ; R600-DAG: 16 -; SI: buffer_load_sshort -; SI: buffer_load_sshort +; SI-NOHSA: buffer_load_sshort +; SI-NOHSA: buffer_load_sshort +; CI-HSA: flat_load_sshort +; CI-HSA: flat_load_sshort define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in @@ -165,10 +186,14 @@ entry: ; R600: VTX_READ_16 ; R600: VTX_READ_16 ; R600: VTX_READ_16 -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_load_ushort +; SI-NOHSA: buffer_load_ushort +; SI-NOHSA: buffer_load_ushort +; SI-NOHSA: buffer_load_ushort +; SI-NOHSA: buffer_load_ushort +; CI-HSA: flat_load_ushort +; CI-HSA: flat_load_ushort +; CI-HSA: flat_load_ushort +; CI-HSA: flat_load_ushort define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { entry: %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in @@ -190,10 +215,14 @@ entry: ; R600-DAG: 16 ; R600-DAG: 16 ; R600-DAG: 16 -; SI: buffer_load_sshort -; SI: buffer_load_sshort -; SI: buffer_load_sshort -; SI: buffer_load_sshort +; SI-NOHSA: buffer_load_sshort +; SI-NOHSA: buffer_load_sshort +; SI-NOHSA: buffer_load_sshort +; SI-NOHSA: buffer_load_sshort +; CI-HSA: flat_load_sshort +; CI-HSA: flat_load_sshort +; CI-HSA: flat_load_sshort +; CI-HSA: flat_load_sshort define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { entry: %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in @@ -206,7 +235,8 @@ entry: ; FUNC-LABEL: {{^}}load_i32: ; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -; SI: buffer_load_dword v{{[0-9]+}} +; SI-NOHSA: buffer_load_dword v{{[0-9]+}} +; CI-HSA: flat_load_dword define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = load i32, i32 addrspace(1)* %in @@ -218,7 +248,8 @@ entry: ; FUNC-LABEL: 
{{^}}load_f32: ; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -; SI: buffer_load_dword v{{[0-9]+}} +; SI-NOHSA: buffer_load_dword v{{[0-9]+}} +; CI-HSA: flat_load_dword define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) { entry: %0 = load float, float addrspace(1)* %in @@ -230,7 +261,8 @@ entry: ; FUNC-LABEL: {{^}}load_v2f32: ; R600: MEM_RAT ; R600: VTX_READ_64 -; SI: buffer_load_dwordx2 +; SI-NOHSA: buffer_load_dwordx2 +; CI-HSA: flat_load_dwordx2 define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { entry: %0 = load <2 x float>, <2 x float> addrspace(1)* %in @@ -240,7 +272,8 @@ entry: ; FUNC-LABEL: {{^}}load_i64: ; R600: VTX_READ_64 -; SI: buffer_load_dwordx2 +; SI-NOHSA: buffer_load_dwordx2 +; CI-HSA: flat_load_dwordx2 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { entry: %0 = load i64, i64 addrspace(1)* %in @@ -253,7 +286,8 @@ entry: ; R600: MEM_RAT ; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x ; R600: 31 -; SI: buffer_load_dword +; SI-NOHSA: buffer_load_dword +; CI-HSA: flat_load_dword define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: @@ -278,8 +312,10 @@ entry: ; R600: VTX_READ_128 ; R600: VTX_READ_128 -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; CI-HSA: flat_load_dwordx4 +; CI-HSA: flat_load_dwordx4 define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) { entry: %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in @@ -293,10 +329,14 @@ entry: ; R600: VTX_READ_128 ; R600: VTX_READ_128 -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; CI-HSA: flat_load_dwordx4 +; CI-HSA: flat_load_dwordx4 +; CI-HSA: flat_load_dwordx4 +; CI-HSA: 
flat_load_dwordx4 define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) { entry: %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in @@ -313,7 +353,8 @@ entry: ; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] ; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; R600: 8 -; SI: buffer_load_sbyte v{{[0-9]+}}, +; SI-NOHSA: buffer_load_sbyte v{{[0-9]+}}, +; CI-HSA: flat_load_sbyte v{{[0-9]+}}, define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { entry: %0 = load i8, i8 addrspace(2)* %in @@ -325,7 +366,8 @@ entry: ; Load an aligned i8 value ; FUNC-LABEL: {{^}}load_const_i8_aligned: ; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ubyte v{{[0-9]+}}, +; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}}, +; CI-HSA: flat_load_ubyte v{{[0-9]+}}, define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { entry: %0 = load i8, i8 addrspace(2)* %in @@ -337,7 +379,8 @@ entry: ; Load an un-aligned i8 value ; FUNC-LABEL: {{^}}load_const_i8_unaligned: ; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ubyte v{{[0-9]+}}, +; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}}, +; CI-HSA: flat_load_ubyte v{{[0-9]+}}, define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { entry: %0 = getelementptr i8, i8 addrspace(2)* %in, i32 1 @@ -352,7 +395,8 @@ entry: ; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] ; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; R600: 16 -; SI: buffer_load_sshort +; SI-NOHSA: buffer_load_sshort +; CI-HSA: flat_load_sshort define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { entry: %0 = load i16, i16 addrspace(2)* %in @@ -364,7 +408,8 @@ entry: ; Load an aligned i16 value ; FUNC-LABEL: {{^}}load_const_i16_aligned: ; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort +; SI-NOHSA: buffer_load_ushort +; CI-HSA: flat_load_ushort define void 
@load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { entry: %0 = load i16, i16 addrspace(2)* %in @@ -376,7 +421,8 @@ entry: ; Load an un-aligned i16 value ; FUNC-LABEL: {{^}}load_const_i16_unaligned: ; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort +; SI-NOHSA: buffer_load_ushort +; CI-HSA: flat_load_ushort define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { entry: %0 = getelementptr i16, i16 addrspace(2)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll index a30c25e700a..551f34339a1 100644 --- a/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s declare i32 @llvm.r600.read.tidig.x() #0 declare i32 @llvm.r600.read.tidig.y() #0 @@ -18,8 +19,10 @@ declare i32 @llvm.r600.read.tidig.y() #0 ; Make sure we aren't using VGPR's for the srsrc operand of BUFFER_LOAD_* ; instructions -; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 -; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 +; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 +; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 
+; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}} +; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}} define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { entry: @@ -50,8 +53,10 @@ done: ; preds = %loop ; Test moving an SMRD instruction to the VALU ; GCN-LABEL: {{^}}smrd_valu: +; FIXME: We should be using flat load for HSA. ; GCN: buffer_load_dword [[OUT:v[0-9]+]] -; GCN: buffer_store_dword [[OUT]] +; GCN-NOHSA: buffer_store_dword [[OUT]] +; GCN-HSA: flat_store_dword [[OUT]] define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 { entry: %tmp = icmp ne i32 %a, 0 @@ -77,8 +82,9 @@ endif: ; preds = %else, %if ; Test moving an SMRD with an immediate offset to the VALU ; GCN-LABEL: {{^}}smrd_valu2: -; GCN-NOT: v_add -; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}} +; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() #0 @@ -91,12 +97,14 @@ entry: ; Use a big offset that will use the SMRD literal offset on CI ; GCN-LABEL: {{^}}smrd_valu_ci_offset: -; GCN-NOT: v_add -; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}} -; GCN-NOT: v_add -; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} -; GCN: v_add_i32_e32 -; GCN: buffer_store_dword +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: buffer_store_dword +; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN-HSA: flat_store_dword 
v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() #0 @@ -109,13 +117,14 @@ entry: } ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x2: -; GCN-NOT: v_add -; GCN: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}} -; GCN-NOT: v_add -; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: buffer_store_dwordx2 +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: buffer_store_dwordx2 +; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() #0 @@ -128,15 +137,16 @@ entry: } ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x4: -; GCN-NOT: v_add -; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}} -; GCN-NOT: v_add -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: buffer_store_dwordx4 +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] 
addr64{{$}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: buffer_store_dwordx4 +; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() #0 @@ -152,25 +162,27 @@ entry: ; CI. ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8: -; GCN-NOT: v_add -; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} -; GCN-NOT: v_add -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} -; GCN-NOT: v_add -; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} -; GCN-NOT: v_add -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} - -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx4 +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 
s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} + +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: buffer_store_dwordx4 +; GCN-NOHSA: buffer_store_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() #0 @@ -184,35 +196,40 @@ entry: ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: -; GCN-NOT: v_add -; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} -; GCN-NOT: v_add -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} -; GCN-NOT: v_add -; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}} -; GCN-NOT: v_add -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} -; GCN-NOT: v_add -; GCN: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}} -; GCN-NOT: v_add -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}} -; GCN-NOT: v_add -; GCN: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}} -; GCN-NOT: v_add -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}} - -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, 
{{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx4 +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}} + +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN-NOHSA: buffer_store_dwordx4 +; GCN-NOHSA: buffer_store_dwordx4 +; GCN-NOHSA: buffer_store_dwordx4 +; GCN-NOHSA: buffer_store_dwordx4 + +; GCN-HSA: 
flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 ; GCN: s_endpgm define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 { @@ -227,9 +244,11 @@ entry: } ; GCN-LABEL: {{^}}smrd_valu2_salu_user: -; GCN: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; GCN-NOHSA: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; GCN-HSA: flat_load_dword [[MOVED:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]] -; GCN: buffer_store_dword [[ADD]] +; GCN-NOHSA: buffer_store_dword [[ADD]] +; GCN-HSA: flat_store_dword [[ADD]] define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() #0 @@ -242,7 +261,8 @@ entry: } ; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset: -; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}} +; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}} +; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() #0 @@ -254,8 +274,9 @@ entry: } ; GCN-LABEL: {{^}}smrd_valu2_mubuf_offset: -; GCN-NOT: v_add -; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}} +; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}} +; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 
@llvm.r600.read.tidig.x() #0 @@ -267,8 +288,10 @@ entry: } ; GCN-LABEL: {{^}}s_load_imm_v8i32: -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.r600.read.tidig.x() @@ -280,16 +303,18 @@ entry: } ; GCN-LABEL: {{^}}s_load_imm_v8i32_salu_user: -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: buffer_store_dword +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: buffer_store_dword +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.r600.read.tidig.x() @@ -319,10 +344,14 @@ entry: } ; GCN-LABEL: {{^}}s_load_imm_v16i32: -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 @@ -334,26 +363,30 @@ entry: } ; GCN-LABEL: {{^}}s_load_imm_v16i32_salu_user: -; GCN: buffer_load_dwordx4 -; GCN: 
buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: v_add_i32_e32 -; GCN: buffer_store_dword +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: v_add_i32_e32 +; GCN-NOHSA: buffer_store_dword +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 -- 2.34.1