From e67a4afb5da59c02338622eea68e096ba143113f Mon Sep 17 00:00:00 2001 From: Vincent Lejeune Date: Tue, 4 Jun 2013 23:17:15 +0000 Subject: [PATCH] R600: Const/Neg/Abs can be folded to dot4 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@183278 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDILISelDAGToDAG.cpp | 186 +++++++++++++++----- lib/Target/R600/R600EmitClauseMarkers.cpp | 6 +- lib/Target/R600/R600ExpandSpecialInstrs.cpp | 4 +- lib/Target/R600/R600InstrInfo.cpp | 35 ++++ lib/Target/R600/R600InstrInfo.h | 2 + test/CodeGen/R600/dot4-folding.ll | 27 +++ 6 files changed, 213 insertions(+), 47 deletions(-) create mode 100644 test/CodeGen/R600/dot4-folding.ll diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index 959d6621fba..82f1c7e305a 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -49,7 +49,10 @@ public: private: inline SDValue getSmallIPtrImm(unsigned Imm); + bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, + const R600InstrInfo *TII, std::vector Cst); bool FoldOperands(unsigned, const R600InstrInfo *, std::vector &); + bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector &); // Complex pattern selectors bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); @@ -318,6 +321,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { const R600InstrInfo *TII = static_cast(TM.getInstrInfo()); + if (Result && Result->isMachineOpcode() && Result->getMachineOpcode() == AMDGPU::DOT_4) { + bool IsModified = false; + do { + std::vector Ops; + for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end(); + I != E; ++I) + Ops.push_back(*I); + IsModified = FoldDotOperands(Result->getMachineOpcode(), TII, Ops); + if (IsModified) { + Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size()); + } + } while (IsModified); + + } if (Result && Result->isMachineOpcode() && !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR) && TII->isALUInstr(Result->getMachineOpcode())) { @@ -360,6 +377,43 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return Result; } +bool AMDGPUDAGToDAGISel::FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, + SDValue &Abs, const R600InstrInfo *TII, + std::vector Consts) { + switch (Src.getOpcode()) { + case AMDGPUISD::CONST_ADDRESS: { + SDValue CstOffset; + if (Src.getValueType().isVector() || + !SelectGlobalValueConstantOffset(Src.getOperand(0), CstOffset)) + return false; + + ConstantSDNode *Cst = dyn_cast(CstOffset); + Consts.push_back(Cst->getZExtValue()); + if (!TII->fitsConstReadLimitations(Consts)) + return false; + + Src = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32); + Sel = CstOffset; + return true; + } + case ISD::FNEG: + Src = Src.getOperand(0); + Neg = CurDAG->getTargetConstant(1, MVT::i32); + return true; + case ISD::FABS: + if (!Abs.getNode()) + return false; + Src = Src.getOperand(0); + Abs = CurDAG->getTargetConstant(1, MVT::i32); + return true; + case ISD::BITCAST: + Src = Src.getOperand(0); + return true; + default: + return false; + } +} + bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, const R600InstrInfo *TII, std::vector &Ops) { int OperandIdx[] = { @@ -383,59 +437,101 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, -1 }; + // Gather constants values + std::vector Consts; + for (unsigned j = 0; j < 3; j++) { + int SrcIdx = OperandIdx[j]; + if (SrcIdx < 0) + break; + if (RegisterSDNode *Reg = dyn_cast(Ops[SrcIdx - 1])) { + if (Reg->getReg() == AMDGPU::ALU_CONST) { + ConstantSDNode *Cst = dyn_cast(Ops[SelIdx[j] - 1]); + Consts.push_back(Cst->getZExtValue()); + } + } + } + for (unsigned i = 0; i < 3; i++) { if (OperandIdx[i] < 0) return false; - SDValue Operand = Ops[OperandIdx[i] - 1]; - switch (Operand.getOpcode()) { - case AMDGPUISD::CONST_ADDRESS: { - SDValue CstOffset; - if (Operand.getValueType().isVector() || - !SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) - break; - - // Gather others constants values - std::vector Consts; - for (unsigned j = 0; j < 3; j++) { - int SrcIdx = OperandIdx[j]; - if (SrcIdx < 0) - break; - if (RegisterSDNode *Reg = dyn_cast(Ops[SrcIdx - 1])) { - if (Reg->getReg() == AMDGPU::ALU_CONST) { - ConstantSDNode *Cst = dyn_cast(Ops[SelIdx[j] - 1]); - Consts.push_back(Cst->getZExtValue()); - } - } - } + SDValue &Src = Ops[OperandIdx[i] - 1]; + SDValue &Sel = Ops[SelIdx[i] - 1]; + SDValue &Neg = Ops[NegIdx[i] - 1]; + SDValue FakeAbs; + SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; + if (FoldOperand(Src, Sel, Neg, Abs, TII, Consts)) + return true; + } + return false; +} - ConstantSDNode *Cst = dyn_cast(CstOffset); - Consts.push_back(Cst->getZExtValue()); - if (!TII->fitsConstReadLimitations(Consts)) - break; +bool AMDGPUDAGToDAGISel::FoldDotOperands(unsigned Opcode, + const R600InstrInfo *TII, std::vector &Ops) { + int OperandIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_X), + TII->getOperandIdx(Opcode, R600Operands::SRC0_Y), + TII->getOperandIdx(Opcode, R600Operands::SRC0_Z), + TII->getOperandIdx(Opcode, R600Operands::SRC0_W), + TII->getOperandIdx(Opcode, R600Operands::SRC1_X), + TII->getOperandIdx(Opcode, R600Operands::SRC1_Y), + TII->getOperandIdx(Opcode, R600Operands::SRC1_Z), + TII->getOperandIdx(Opcode, R600Operands::SRC1_W) + }; + int SelIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_X), + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_Y), + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_Z), + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_W), + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_X), + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_Y), + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_Z), + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_W) + }; + int NegIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_X), + TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_Y), + TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_Z), + TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_W), + TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_X), + TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_Y), + TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_Z), + TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_W) + }; + int AbsIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_X), + TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_Y), + TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_Z), + TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_W), + TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_X), + TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_Y), + TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_Z), + TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_W) + }; - Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32); - Ops[SelIdx[i] - 1] = CstOffset; - return true; - } - case ISD::FNEG: - if (NegIdx[i] < 0) - break; - Ops[OperandIdx[i] - 1] = Operand.getOperand(0); - Ops[NegIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32); - return true; - case ISD::FABS: - if (AbsIdx[i] < 0) - break; - Ops[OperandIdx[i] - 1] = Operand.getOperand(0); - Ops[AbsIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32); - return true; - case ISD::BITCAST: - Ops[OperandIdx[i] - 1] = Operand.getOperand(0); - return true; - default: + // Gather constants values + std::vector Consts; + for (unsigned j = 0; j < 8; j++) { + int SrcIdx = OperandIdx[j]; + if (SrcIdx < 0) break; + if (RegisterSDNode *Reg = dyn_cast(Ops[SrcIdx - 1])) { + if (Reg->getReg() == AMDGPU::ALU_CONST) { + ConstantSDNode *Cst = dyn_cast(Ops[SelIdx[j] - 1]); + Consts.push_back(Cst->getZExtValue()); + } } } + + for (unsigned i = 0; i < 8; i++) { + if (OperandIdx[i] < 0) + return false; + SDValue &Src = Ops[OperandIdx[i] - 1]; + SDValue &Sel = Ops[SelIdx[i] - 1]; + SDValue &Neg = Ops[NegIdx[i] - 1]; + SDValue &Abs = Ops[AbsIdx[i] - 1]; + if (FoldOperand(Src, Sel, Neg, Abs, TII, Consts)) + return true; + } return false; } diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index ecfcfeb4e8f..c9d8ed1d00d 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -108,7 +108,8 @@ private: std::vector > UsedKCache; const SmallVector, 3> &Consts = TII->getSrcs(MI); - assert(TII->isALUInstr(MI->getOpcode()) && "Can't assign Const"); + assert((TII->isALUInstr(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); for (unsigned i = 0, n = Consts.size(); i < n; ++i) { if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) continue; @@ -183,6 +184,9 @@ private: if (TII->isALUInstr(I->getOpcode()) && !SubstituteKCacheBank(I, KCacheBanks)) break; + if (I->getOpcode() == AMDGPU::DOT_4 && + !SubstituteKCacheBank(I, KCacheBanks)) + break; AluInstCount += OccupiedDwords(I); } unsigned Opcode = PushBeforeModifier ? diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index b9d5303ce1a..072ae3a5dcc 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -214,7 +214,9 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { .getReg(); (void) Src0; (void) Src1; - assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); + if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && + (TRI.getEncodingValue(Src1) & 0xff) < 127) + assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); } MI.eraseFromParent(); continue; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 5f8486d1519..2a4a2459015 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -169,6 +169,31 @@ SmallVector, 3> R600InstrInfo::getSrcs(MachineInstr *MI) const { SmallVector, 3> Result; + if (MI->getOpcode() == AMDGPU::DOT_4) { + static const R600Operands::VecOps OpTable[8][2] = { + {R600Operands::SRC0_X, R600Operands::SRC0_SEL_X}, + {R600Operands::SRC0_Y, R600Operands::SRC0_SEL_Y}, + {R600Operands::SRC0_Z, R600Operands::SRC0_SEL_Z}, + {R600Operands::SRC0_W, R600Operands::SRC0_SEL_W}, + {R600Operands::SRC1_X, R600Operands::SRC1_SEL_X}, + {R600Operands::SRC1_Y, R600Operands::SRC1_SEL_Y}, + {R600Operands::SRC1_Z, R600Operands::SRC1_SEL_Z}, + {R600Operands::SRC1_W, R600Operands::SRC1_SEL_W}, + }; + + for (unsigned j = 0; j < 8; j++) { + MachineOperand &MO = MI->getOperand(OpTable[j][0] + 1); + unsigned Reg = MO.getReg(); + if (Reg == AMDGPU::ALU_CONST) { + unsigned Sel = MI->getOperand(OpTable[j][1] + 1).getImm(); + Result.push_back(std::pair(&MO, Sel)); + continue; + } + + } + return Result; + } + static const R600Operands::Ops OpTable[3][2] = { {R600Operands::SRC0, R600Operands::SRC0_SEL}, {R600Operands::SRC1, R600Operands::SRC1_SEL}, @@ -967,6 +992,11 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI, return getOperandIdx(MI.getOpcode(), Op); } +int R600InstrInfo::getOperandIdx(const MachineInstr &MI, + R600Operands::VecOps Op) const { + return getOperandIdx(MI.getOpcode(), Op); +} + int R600InstrInfo::getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const { unsigned TargetFlags = get(Opcode).TSFlags; @@ -997,6 +1027,11 @@ int R600InstrInfo::getOperandIdx(unsigned Opcode, return R600Operands::ALUOpTable[OpTableIdx][Op]; } +int R600InstrInfo::getOperandIdx(unsigned Opcode, + R600Operands::VecOps Op) const { + return Op + 1; +} + void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const { int Idx = getOperandIdx(*MI, Op); diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index f9ccf4fbb9a..afc24e2d09f 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -212,11 +212,13 @@ namespace llvm { /// /// \returns -1 if the Instruction does not contain the specified \p Op. int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const; + int getOperandIdx(const MachineInstr &MI, R600Operands::VecOps Op) const; /// \brief Get the index of \p Op for the given Opcode. /// /// \returns -1 if the Instruction does not contain the specified \p Op. int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const; + int getOperandIdx(unsigned Opcode, R600Operands::VecOps Op) const; /// \brief Helper function for setting instruction flag values. void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const; diff --git a/test/CodeGen/R600/dot4-folding.ll b/test/CodeGen/R600/dot4-folding.ll new file mode 100644 index 00000000000..3e8330f9b3e --- /dev/null +++ b/test/CodeGen/R600/dot4-folding.ll @@ -0,0 +1,27 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Exactly one constant vector can be folded into dot4, which means exactly +; 4 MOV instructions +; CHECK: @main +; CHECK: MOV +; CHECK: MOV +; CHECK: MOV +; CHECK: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV + +define void @main(float addrspace(1)* %out) { +main_body: + %0 = load <4 x float> addrspace(8)* null + %1 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1) + %3 = insertelement <4 x float> undef, float %2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) +attributes #1 = { readnone } -- 2.34.1