From 68db37b952be497c94c7aa98cf26f3baadb5afd3 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 14 Aug 2013 23:24:45 +0000 Subject: [PATCH] R600/SI: Convert v16i8 resource descriptors to i128 Now that compute support is better on SI, we can't continue using v16i8 for descriptors since this is also a legal type in OpenCL. This patch fixes numerous hangs with the piglit OpenCL test and since we now use a target specific DAG node for LOAD_CONSTANT with the correct MemOperandFlags, this should also fix: https://bugs.freedesktop.org/show_bug.cgi?id=66805 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188429 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPU.h | 1 + lib/Target/R600/AMDGPUISelLowering.cpp | 6 + lib/Target/R600/AMDGPUISelLowering.h | 7 ++ lib/Target/R600/AMDGPUTargetMachine.cpp | 1 + lib/Target/R600/CMakeLists.txt | 1 + lib/Target/R600/SIISelLowering.cpp | 55 ++++++++- lib/Target/R600/SIISelLowering.h | 3 + lib/Target/R600/SIInstrInfo.td | 20 ++++ lib/Target/R600/SIInstructions.td | 76 ++++++------ lib/Target/R600/SIIntrinsics.td | 6 +- lib/Target/R600/SIRegisterInfo.td | 2 +- lib/Target/R600/SITypeRewriter.cpp | 146 ++++++++++++++++++++++++ test/CodeGen/R600/llvm.SI.sample.ll | 34 +++--- test/CodeGen/R600/llvm.SI.sampled.ll | 34 +++--- 14 files changed, 314 insertions(+), 78 deletions(-) create mode 100644 lib/Target/R600/SITypeRewriter.cpp diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 6b374cbfff2..e2d1caf8676 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -34,6 +34,7 @@ FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); FunctionPass *createAMDGPUCFGStructurizerPass(TargetMachine &tm); // SI Passes +FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index efd27567fe2..9bb487e550c 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -507,5 +507,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) NODE_NAME_CASE(REGISTER_STORE) + NODE_NAME_CASE(LOAD_CONSTANT) + NODE_NAME_CASE(LOAD_INPUT) + NODE_NAME_CASE(SAMPLE) + NODE_NAME_CASE(SAMPLEB) + NODE_NAME_CASE(SAMPLED) + NODE_NAME_CASE(SAMPLEL) } } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index f614e23e2ce..5419e71e7f2 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -139,6 +139,13 @@ enum { CONST_ADDRESS, REGISTER_LOAD, REGISTER_STORE, + LOAD_INPUT, + SAMPLE, + SAMPLEB, + SAMPLED, + SAMPLEL, + FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, + LOAD_CONSTANT, LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 5ebc5f27dc4..d77cdddf8b5 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -121,6 +121,7 @@ AMDGPUPassConfig::addPreISel() { const AMDGPUSubtarget &ST = TM->getSubtarget(); addPass(createFlattenCFGPass()); if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { + addPass(createSITypeRewriter()); addPass(createStructurizeCFGPass()); addPass(createSIAnnotateControlFlowPass()); } else { diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index fde187b0d3e..658eeea8148 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -47,6 +47,7 @@ add_llvm_target(R600CodeGen SILowerControlFlow.cpp SIMachineFunctionInfo.cpp SIRegisterInfo.cpp + SITypeRewriter.cpp ) add_dependencies(LLVMR600CodeGen AMDGPUCommonTableGen intrinsics_gen) diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 4631f8a3a8b..40f082723af 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -37,7 +37,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v2i1, &AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v4i1, &AMDGPU::VReg_128RegClass); - addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); @@ -70,6 +69,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::ADD, MVT::i64, Legal); setOperationAction(ISD::ADD, MVT::i32, Legal); + setOperationAction(ISD::BITCAST, MVT::i128, Legal); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -82,6 +83,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); @@ -415,7 +419,31 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, AMDGPU::VGPR2, VT); - + case AMDGPUIntrinsic::SI_load_const: { + SDValue Ops [] = { + ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(2) + }; + + MachineMemOperand *MMO = new MachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, + VT.getSizeInBits() / 8, 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, + Op->getVTList(), Ops, 2, VT, MMO); + } + case AMDGPUIntrinsic::SI_sample: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); + case AMDGPUIntrinsic::SI_sampleb: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); + case AMDGPUIntrinsic::SI_sampled: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); + case AMDGPUIntrinsic::SI_samplel: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); + case AMDGPUIntrinsic::SI_vs_load_input: + return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, + ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(2), + Op.getOperand(3)); } } } @@ -516,6 +544,29 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } +SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op, + SelectionDAG &DAG) const { + + if (Op.getValueType() == MVT::i128) { + return Op; + } + + assert(Op.getOpcode() == ISD::UNDEF); + + return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128, + DAG.getConstant(0, MVT::i64), + DAG.getConstant(0, MVT::i64)); +} + +SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, + const SDValue &Op, + SelectionDAG &DAG) const { + return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), + ResourceDescriptorToi128(Op.getOperand(3), DAG), + Op.getOperand(4)); +} + SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index effbf1f85de..321e58c153b 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -23,11 +23,14 @@ namespace llvm { class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerParameter(SelectionDAG &DAG, EVT VT, SDLoc DL, SDValue Chain, unsigned Offset) const; + SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, + SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue ResourceDescriptorToi128(SDValue Op, SelectionDAG &DAG) const; bool foldImm(SDValue &Operand, int32_t &Immediate, bool &ScalarSlotUsed) const; const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG, diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index df6d99410d9..b7419782d34 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -16,6 +16,26 @@ def SIadd64bit32bit : SDNode<"ISD::ADD", SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]> >; +def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", + SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, i128>, SDTCisVT<2, i32>]>, + [SDNPMayLoad, SDNPMemOperand] +>; + +def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, i128>, SDTCisVT<2, i16>, + SDTCisVT<3, i32>]> +>; + +class SDSample : SDNode , SDTCisVec<1>, SDTCisVT<2, v32i8>, + SDTCisVT<3, i128>, SDTCisVT<4, i32>]> +>; + +def SIsample : SDSample<"AMDGPUISD::SAMPLE">; +def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; +def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; +def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; + // Transformation function, extract the lower 32bit of a 64bit immediate def LO32 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32); diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index d4e0b033702..47042174891 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -1303,7 +1303,7 @@ def : Pat < /* int_SI_vs_load_input */ def : Pat< - (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr), + (SIload_input i128:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr), (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset) >; @@ -1324,63 +1324,63 @@ def : Pat < /********** Image sampling patterns **********/ /********** ======================= **********/ -/* int_SI_sample for simple 1D texture lookup */ +/* SIsample for simple 1D texture lookup */ def : Pat < - (int_SI_sample v1i32:$addr, v32i8:$rsrc, v16i8:$sampler, imm), + (SIsample v1i32:$addr, v32i8:$rsrc, i128:$sampler, imm), (IMAGE_SAMPLE_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; -class SamplePattern : Pat < - (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, imm), +class SamplePattern : Pat < + (name vt:$addr, v32i8:$rsrc, i128:$sampler, imm), (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; -class SampleRectPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_RECT), +class SampleRectPattern : Pat < + (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_RECT), (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; -class SampleArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_ARRAY), +class SampleArrayPattern : Pat < + (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_ARRAY), (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; -class SampleShadowPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW), + (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW), (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; -class SampleShadowArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW_ARRAY), + (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW_ARRAY), (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; -/* int_SI_sample* for texture lookups consuming more address parameters */ +/* SIsample* for texture lookups consuming more address parameters */ multiclass SamplePatterns { - def : SamplePattern ; - def : SampleRectPattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; + def : SamplePattern ; + def : SampleRectPattern ; + def : SampleArrayPattern ; + def : SampleShadowPattern ; + def : SampleShadowArrayPattern ; + + def : SamplePattern ; + def : SampleArrayPattern ; + def : SampleShadowPattern ; + def : SampleShadowArrayPattern ; + + def : SamplePattern ; + def : SampleArrayPattern ; + def : SampleShadowPattern ; + def : SampleShadowArrayPattern ; + + def : SamplePattern ; + def : SampleArrayPattern ; + def : SampleShadowPattern ; + def : SampleShadowArrayPattern ; } defm : SamplePatterns; // 2. Offset loaded in an 32bit SGPR def : Pat < - (int_SI_load_const v16i8:$sbase, imm:$offset), + (SIload_constant i128:$sbase, imm:$offset), (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) >; // 3. Offset in an 32Bit VGPR def : Pat < - (int_SI_load_const v16i8:$sbase, i32:$voff), + (SIload_constant i128:$sbase, i32:$voff), (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff) >; @@ -1777,7 +1777,7 @@ defm : SMRD_Pattern ; defm : SMRD_Pattern ; defm : SMRD_Pattern ; defm : SMRD_Pattern ; -defm : SMRD_Pattern ; +defm : SMRD_Pattern ; defm : SMRD_Pattern ; //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index 2fa073ea099..d6e26adf65b 100644 --- a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -17,10 +17,10 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; + def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; - class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>; + class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; def int_SI_sample : Sample; def int_SI_sampleb : Sample; diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index 82d1e719257..0b90772f288 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -157,7 +157,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64, (add SGPR_64, VCCReg, EXECReg) >; -def SReg_128 : RegisterClass<"AMDGPU", [v16i8, i128], 128, (add SGPR_128)>; +def SReg_128 : RegisterClass<"AMDGPU", [i128], 128, (add SGPR_128)>; def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp new file mode 100644 index 00000000000..9da11e88eb8 --- /dev/null +++ b/lib/Target/R600/SITypeRewriter.cpp @@ -0,0 +1,146 @@ +//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass removes performs the following type substitution on all +/// non-compute shaders: +/// +/// v16i8 => i128 +/// - v16i8 is used for constant memory resource descriptors. This type is +/// legal for some compute APIs, and we don't want to declare it as legal +/// in the backend, because we want the legalizer to expand all v16i8 +/// operations. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" + +#include "llvm/IR/IRBuilder.h" +#include "llvm/InstVisitor.h" + +using namespace llvm; + +namespace { + +class SITypeRewriter : public FunctionPass, + public InstVisitor { + + static char ID; + Module *Mod; + Type *v16i8; + Type *i128; + +public: + SITypeRewriter() : FunctionPass(ID) { } + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + virtual const char *getPassName() const { + return "SI Type Rewriter"; + } + void visitLoadInst(LoadInst &I); + void visitCallInst(CallInst &I); + void visitBitCast(BitCastInst &I); +}; + +} // End anonymous namespace + +char SITypeRewriter::ID = 0; + +bool SITypeRewriter::doInitialization(Module &M) { + Mod = &M; + v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); + i128 = Type::getIntNTy(M.getContext(), 128); + return false; +} + +bool SITypeRewriter::runOnFunction(Function &F) { + AttributeSet Set = F.getAttributes(); + Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, "ShaderType"); + + unsigned ShaderType = ShaderType::COMPUTE; + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + Str.getAsInteger(0, ShaderType); + } + if (ShaderType != ShaderType::COMPUTE) { + visit(F); + } + + visit(F); + + return false; +} + +void SITypeRewriter::visitLoadInst(LoadInst &I) { + Value *Ptr = I.getPointerOperand(); + Type *PtrTy = Ptr->getType(); + Type *ElemTy = PtrTy->getPointerElementType(); + IRBuilder<> Builder(&I); + if (ElemTy == v16i8) { + Value *BitCast = Builder.CreateBitCast(Ptr, Type::getIntNPtrTy(I.getContext(), 128, 2)); + LoadInst *Load = Builder.CreateLoad(BitCast); + SmallVector , 8> MD; + I.getAllMetadataOtherThanDebugLoc(MD); + for (unsigned i = 0, e = MD.size(); i != e; ++i) { + Load->setMetadata(MD[i].first, MD[i].second); + } + Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); + I.replaceAllUsesWith(BitCastLoad); + I.eraseFromParent(); + } +} + +void SITypeRewriter::visitCallInst(CallInst &I) { + IRBuilder<> Builder(&I); + SmallVector Args; + SmallVector Types; + bool NeedToReplace = false; + Function *F = I.getCalledFunction(); + std::string Name = F->getName().str(); + for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { + Value *Arg = I.getArgOperand(i); + if (Arg->getType() == v16i8) { + Args.push_back(Builder.CreateBitCast(Arg, i128)); + Types.push_back(i128); + NeedToReplace = true; + Name = Name + ".i128"; + } else { + Args.push_back(Arg); + Types.push_back(Arg->getType()); + } + } + + if (!NeedToReplace) { + return; + } + Function *NewF = Mod->getFunction(Name); + if (!NewF) { + NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); + NewF->setAttributes(F->getAttributes()); + } + I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); + I.eraseFromParent(); +} + +void SITypeRewriter::visitBitCast(BitCastInst &I) { + IRBuilder<> Builder(&I); + if (I.getDestTy() != i128) { + return; + } + + if (BitCastInst *Op = dyn_cast(I.getOperand(0))) { + if (Op->getSrcTy() == i128) { + I.replaceAllUsesWith(Op->getOperand(0)); + I.eraseFromParent(); + } + } +} + +FunctionPass *llvm::createSITypeRewriter() { + return new SITypeRewriter(); +} diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll index dc2948a78f2..1c830a9919c 100644 --- a/test/CodeGen/R600/llvm.SI.sample.ll +++ b/test/CodeGen/R600/llvm.SI.sample.ll @@ -35,37 +35,37 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1, - <32 x i8> undef, <4 x i32> undef, i32 1) + <32 x i8> undef, <16 x i8> undef, i32 1) %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2, - <32 x i8> undef, <4 x i32> undef, i32 2) + <32 x i8> undef, <16 x i8> undef, i32 2) %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3, - <32 x i8> undef, <4 x i32> undef, i32 3) + <32 x i8> undef, <16 x i8> undef, i32 3) %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4, - <32 x i8> undef, <4 x i32> undef, i32 4) + <32 x i8> undef, <16 x i8> undef, i32 4) %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5, - <32 x i8> undef, <4 x i32> undef, i32 5) + <32 x i8> undef, <16 x i8> undef, i32 5) %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6, - <32 x i8> undef, <4 x i32> undef, i32 6) + <32 x i8> undef, <16 x i8> undef, i32 6) %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7, - <32 x i8> undef, <4 x i32> undef, i32 7) + <32 x i8> undef, <16 x i8> undef, i32 7) %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8, - <32 x i8> undef, <4 x i32> undef, i32 8) + <32 x i8> undef, <16 x i8> undef, i32 8) %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9, - <32 x i8> undef, <4 x i32> undef, i32 9) + <32 x i8> undef, <16 x i8> undef, i32 9) %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10, - <32 x i8> undef, <4 x i32> undef, i32 10) + <32 x i8> undef, <16 x i8> undef, i32 10) %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11, - <32 x i8> undef, <4 x i32> undef, i32 11) + <32 x i8> undef, <16 x i8> undef, i32 11) %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12, - <32 x i8> undef, <4 x i32> undef, i32 12) + <32 x i8> undef, <16 x i8> undef, i32 12) %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13, - <32 x i8> undef, <4 x i32> undef, i32 13) + <32 x i8> undef, <16 x i8> undef, i32 13) %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14, - <32 x i8> undef, <4 x i32> undef, i32 14) + <32 x i8> undef, <16 x i8> undef, i32 14) %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15, - <32 x i8> undef, <4 x i32> undef, i32 15) + <32 x i8> undef, <16 x i8> undef, i32 15) %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16, - <32 x i8> undef, <4 x i32> undef, i32 16) + <32 x i8> undef, <16 x i8> undef, i32 16) %e1 = extractelement <4 x float> %res1, i32 0 %e2 = extractelement <4 x float> %res2, i32 1 %e3 = extractelement <4 x float> %res3, i32 2 @@ -135,6 +135,6 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { ret void } -declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <4 x i32>, i32) readnone +declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll index 56645deaa2a..d43b3789f8b 100644 --- a/test/CodeGen/R600/llvm.SI.sampled.ll +++ b/test/CodeGen/R600/llvm.SI.sampled.ll @@ -35,37 +35,37 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1, - <32 x i8> undef, <4 x i32> undef, i32 1) + <32 x i8> undef, <16 x i8> undef, i32 1) %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2, - <32 x i8> undef, <4 x i32> undef, i32 2) + <32 x i8> undef, <16 x i8> undef, i32 2) %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3, - <32 x i8> undef, <4 x i32> undef, i32 3) + <32 x i8> undef, <16 x i8> undef, i32 3) %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4, - <32 x i8> undef, <4 x i32> undef, i32 4) + <32 x i8> undef, <16 x i8> undef, i32 4) %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5, - <32 x i8> undef, <4 x i32> undef, i32 5) + <32 x i8> undef, <16 x i8> undef, i32 5) %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6, - <32 x i8> undef, <4 x i32> undef, i32 6) + <32 x i8> undef, <16 x i8> undef, i32 6) %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7, - <32 x i8> undef, <4 x i32> undef, i32 7) + <32 x i8> undef, <16 x i8> undef, i32 7) %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8, - <32 x i8> undef, <4 x i32> undef, i32 8) + <32 x i8> undef, <16 x i8> undef, i32 8) %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9, - <32 x i8> undef, <4 x i32> undef, i32 9) + <32 x i8> undef, <16 x i8> undef, i32 9) %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10, - <32 x i8> undef, <4 x i32> undef, i32 10) + <32 x i8> undef, <16 x i8> undef, i32 10) %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11, - <32 x i8> undef, <4 x i32> undef, i32 11) + <32 x i8> undef, <16 x i8> undef, i32 11) %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12, - <32 x i8> undef, <4 x i32> undef, i32 12) + <32 x i8> undef, <16 x i8> undef, i32 12) %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13, - <32 x i8> undef, <4 x i32> undef, i32 13) + <32 x i8> undef, <16 x i8> undef, i32 13) %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14, - <32 x i8> undef, <4 x i32> undef, i32 14) + <32 x i8> undef, <16 x i8> undef, i32 14) %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15, - <32 x i8> undef, <4 x i32> undef, i32 15) + <32 x i8> undef, <16 x i8> undef, i32 15) %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16, - <32 x i8> undef, <4 x i32> undef, i32 16) + <32 x i8> undef, <16 x i8> undef, i32 16) %e1 = extractelement <4 x float> %res1, i32 0 %e2 = extractelement <4 x float> %res2, i32 1 %e3 = extractelement <4 x float> %res3, i32 2 @@ -135,6 +135,6 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { ret void } -declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <4 x i32>, i32) readnone +declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -- 2.34.1