From: Matt Arsenault Date: Mon, 14 Dec 2015 17:25:38 +0000 (+0000) Subject: AMDGPU: Use generic bitreverse intrinsic X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=1451e94ee001b2719b7967d52675c9edbf923f86;p=oota-llvm.git AMDGPU: Use generic bitreverse intrinsic Also fix bug in vector legalization for bitreverse. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255512 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index eddf666c9c3..4bd0b6b518a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -105,6 +105,7 @@ class VectorLegalizer { SDValue ExpandLoad(SDValue Op); SDValue ExpandStore(SDValue Op); SDValue ExpandFNEG(SDValue Op); + SDValue ExpandBITREVERSE(SDValue Op); /// \brief Implements vector promotion. /// @@ -280,6 +281,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::ROTL: case ISD::ROTR: case ISD::BSWAP: + case ISD::BITREVERSE: case ISD::CTLZ: case ISD::CTTZ: case ISD::CTLZ_ZERO_UNDEF: @@ -417,7 +419,7 @@ SDValue VectorLegalizer::Promote(SDValue Op) { else Operands[j] = Op.getOperand(j); } - + Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands, Op.getNode()->getFlags()); if ((VT.isFloatingPoint() && NVT.isFloatingPoint()) || (VT.isVector() && VT.getVectorElementType().isFloatingPoint() && @@ -715,6 +717,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) { return ExpandFNEG(Op); case ISD::SETCC: return UnrollVSETCC(Op); + case ISD::BITREVERSE: + return ExpandBITREVERSE(Op); default: return DAG.UnrollVectorOp(Op.getNode()); } @@ -900,6 +904,25 @@ SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) { return DAG.getNode(ISD::BITCAST, DL, VT, Op); } +SDValue VectorLegalizer::ExpandBITREVERSE(SDValue Op) { + EVT VT = Op.getValueType(); + + // If we have the scalar operation, it's probably cheaper to unroll it. + if (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, VT.getScalarType())) + return DAG.UnrollVectorOp(Op.getNode()); + + // If we have the appropriate vector bit operations, it is better to use them + // than unrolling and expanding each component. + if (!TLI.isOperationLegalOrCustom(ISD::SHL, VT) || + !TLI.isOperationLegalOrCustom(ISD::SRL, VT) || + !TLI.isOperationLegalOrCustom(ISD::AND, VT) || + !TLI.isOperationLegalOrCustom(ISD::OR, VT)) + return DAG.UnrollVectorOp(Op.getNode()); + + // Let LegalizeDAG handle this later. + return Op; +} + SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) { // Implement VSELECT in terms of XOR, AND, OR // on platforms which do not support blend natively. diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 25aa2bb0988..222f63161be 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1036,9 +1036,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_brev: - return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); - case Intrinsic::AMDGPU_class: return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -1050,6 +1047,8 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name + return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1)); } } @@ -2700,7 +2699,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) - NODE_NAME_CASE(BREV) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 1e060c4d708..7314cc050ba 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -263,7 +263,6 @@ enum NodeType : unsigned { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. - BREV, // Reverse bits. MUL_U24, MUL_I24, MAD_U24, diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b413897d9d2..70e589c2842 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -191,8 +191,6 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; -def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; - // Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when // performing the mulitply. The result is a 32-bit value. def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 8dc3934b8cb..e31552c5554 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -107,6 +107,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::v4i1, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index e874db1fe0a..f247cbf41c9 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -127,7 +127,7 @@ let Defs = [SCC] in { defm S_BREV_B32 : SOP1_32 , "s_brev_b32", - [(set i32:$dst, (AMDGPUbrev i32:$src0))] + [(set i32:$dst, (bitreverse i32:$src0))] >; defm S_BREV_B64 : SOP1_64 , "s_brev_b64", []>; diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll new file mode 100644 index 00000000000..0ef7d5184c1 --- /dev/null +++ b/test/CodeGen/AMDGPU/bitreverse.ll @@ -0,0 +1,115 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i16 @llvm.bitreverse.i16(i16) #1 +declare i32 @llvm.bitreverse.i32(i32) #1 +declare i64 @llvm.bitreverse.i64(i64) #1 + +declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1 +declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1 + +declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1 +declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 + +declare i32 @llvm.AMDGPU.brev(i32) #1 + +; FUNC-LABEL: {{^}}s_brev_i16: +; SI: s_brev_b32 +define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { + %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 + store i16 %brev, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_i16: +; SI: v_bfrev_b32_e32 +define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { + %val = load i16, i16 addrspace(1)* %valptr + %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 + store i16 %brev, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { + %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 + store i32 %brev, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { + %val = load i32, i32 addrspace(1)* %valptr + %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 + store i32 %brev, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_v2i32: +; SI: s_brev_b32 +; SI: s_brev_b32 +define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { + %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 + store <2 x i32> %brev, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_v2i32: +; SI: v_bfrev_b32_e32 +; SI: v_bfrev_b32_e32 +define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr + %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 + store <2 x i32> %brev, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_i64: +define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { + %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 + store i64 %brev, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_i64: +define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { + %val = load i64, i64 addrspace(1)* %valptr + %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 + store i64 %brev, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_v2i64: +define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 { + %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 + store <2 x i64> %brev, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_v2i64: +define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { + %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr + %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 + store <2 x i64> %brev, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}legacy_s_brev_i32: +; SI: s_brev_b32 +define void @legacy_s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %brev = call i32 @llvm.AMDGPU.brev(i32 %val) #1 + store i32 %brev, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll deleted file mode 100644 index 301de4b1c82..00000000000 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_brev_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_brev_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -}