From: Bill Schmidt
Date: Fri, 30 Aug 2013 22:18:55 +0000 (+0000)
Subject: [PowerPC] Call support for fast-isel.
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=11addd2a2f584571ffcfd51711972b27aef0133f;p=oota-llvm.git

[PowerPC] Call support for fast-isel.

This patch adds fast-isel support for calls (but not intrinsic calls
or varargs calls).  It also removes a badly-formed assert.

There are some new tests just for calls, and also for folding loads
into arguments on calls to avoid extra extends.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189701 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index 9c937edee4c..e8e7f4c2d22 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -42,6 +42,17 @@ def RetCC_PPC : CallingConv<[
 // logic.  FIXME: See if the logic can be simplified with use of CCs.
 // This may require some extensions to current table generation.
 
+// Simple calling convention for 64-bit ELF PowerPC fast isel.
+// Only handle ints and floats.  All ints are promoted to i64.
+// Vector types and quadword ints are not handled.
+def CC_PPC64_ELF_FIS : CallingConv<[
+  CCIfType<[i8],  CCPromoteToType<i64>>,
+  CCIfType<[i16], CCPromoteToType<i64>>,
+  CCIfType<[i32], CCPromoteToType<i64>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>
+]>;
+
 // Simple return-value convention for 64-bit ELF PowerPC fast isel.
 // All small ints are promoted to i64.  Vector types, quadword ints,
 // and multiple register returns are "supported" to avoid compile
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 3bceed418a1..8a88e76e90d 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -114,6 +114,7 @@ class PPCFastISel : public FastISel {
     bool SelectIToFP(const Instruction *I, bool IsSigned);
     bool SelectFPToI(const Instruction *I, bool IsSigned);
     bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectCall(const Instruction *I);
     bool SelectRet(const Instruction *I);
     bool SelectIntExt(const Instruction *I);
 
@@ -145,6 +146,17 @@ class PPCFastISel : public FastISel {
 
   // Call handling routines.
   private:
+    bool processCallArgs(SmallVectorImpl<Value*> &Args,
+                         SmallVectorImpl<unsigned> &ArgRegs,
+                         SmallVectorImpl<MVT> &ArgVTs,
+                         SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                         SmallVectorImpl<unsigned> &RegArgs,
+                         CallingConv::ID CC,
+                         unsigned &NumBytes,
+                         bool IsVarArg);
+    void finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                    const Instruction *I, CallingConv::ID CC,
+                    unsigned &NumBytes, bool IsVarArg);
     CCAssignFn *usePPC32CCs(unsigned Flag);
 
   private:
@@ -1150,6 +1162,316 @@ bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
   return true;
 }
 
+// Handle arguments to a call that we're attempting to fast-select.
+// Return false if the arguments are too complex for us at the moment.
+bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool IsVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
+
+  // Bail out if we can't handle any of the arguments.
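+  // Specifically, vectors, integers wider than 64 bits, stack-resident
+  // arguments, custom-lowered locations, and bit-converted values are
+  // all left to SelectionDAG isel for now.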
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Skip vector arguments for now, as well as long double and
+    // uint128_t, and anything that isn't passed in a register.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 ||
+        !VA.isRegLoc() || VA.needsCustom())
+      return false;
+
+    // Skip bit-converted arguments for now.
+    if (VA.getLocInfo() == CCValAssign::BCvt)
+      return false;
+  }
+
+  // Get a count of how many bytes are to be pushed onto the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+          TII.get(TII.getCallFrameSetupOpcode()))
+    .addImm(NumBytes);
+
+  // Prepare to assign register arguments.  Every argument uses up a
+  // GPR protocol register even if it's passed in a floating-point
+  // register.
+  unsigned NextGPR = PPC::X3;
+  unsigned NextFPR = PPC::F1;
+
+  // Process arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Handle argument promotion and bitcasts.
+    switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::SExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false))
+          llvm_unreachable("Failed to emit a sext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::AExt:
+      case CCValAssign::ZExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true))
+          llvm_unreachable("Failed to emit a zext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::BCvt: {
+        // FIXME: Not yet handled.
+        llvm_unreachable("Should have bailed before getting here!");
+        break;
+      }
+    }
+
+    // Copy this argument to the appropriate register.
+    unsigned ArgReg;
+    if (ArgVT == MVT::f32 || ArgVT == MVT::f64) {
+      ArgReg = NextFPR++;
+      ++NextGPR;
+    } else
+      ArgReg = NextGPR++;
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ArgReg).addReg(Arg);
+    RegArgs.push_back(ArgReg);
+  }
+
+  return true;
+}
+
+// For a call that we've determined we can fast-select, finish the
+// call sequence and generate a copy to obtain the return value (if any).
+void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                             const Instruction *I, CallingConv::ID CC,
+                             unsigned &NumBytes, bool IsVarArg) {
+  // Issue CALLSEQ_END.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+          TII.get(TII.getCallFrameDestroyOpcode()))
+    .addImm(NumBytes).addImm(0);
+
+  // Next, generate a copy to obtain the return value.
+  // FIXME: No multi-register return values yet, though I don't foresee
+  // any real difficulties there.
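+  // Under this ABI a single return value arrives in X3 (integer) or F1
+  // (floating-point), as encoded by RetCC_PPC64_ELF_FIS; the copies below
+  // move it into a virtual register for later use.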
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
+    CCValAssign &VA = RVLocs[0];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    MVT DestVT = VA.getValVT();
+    MVT CopyVT = DestVT;
+
+    // Ints smaller than a register still arrive in a full 64-bit
+    // register, so make sure we recognize this.
+    if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32)
+      CopyVT = MVT::i64;
+
+    unsigned SourcePhysReg = VA.getLocReg();
+    unsigned ResultReg;
+
+    if (RetVT == CopyVT) {
+      const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT);
+      ResultReg = createResultReg(CpyRC);
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+
+    // If necessary, round the floating result to single precision.
+    } else if (CopyVT == MVT::f64) {
+      ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::FRSP),
+              ResultReg).addReg(SourcePhysReg);
+
+    // If only the low half of a general register is needed, generate
+    // a GPRC copy instead of a G8RC copy.  (EXTRACT_SUBREG can't be
+    // used along the fast-isel path (not lowered), and downstream logic
+    // also doesn't like a direct subreg copy on a physical reg.)
+    } else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) {
+      ResultReg = createResultReg(&PPC::GPRCRegClass);
+      // Convert physical register from G8RC to GPRC.
+      SourcePhysReg -= PPC::X0 - PPC::R0;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+    }
+
+    UsedRegs.push_back(SourcePhysReg);
+    UpdateValueMap(I, ResultReg);
+  }
+}
+
+// Attempt to fast-select a call instruction.
+bool PPCFastISel::SelectCall(const Instruction *I) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Can't handle inline asm.
+  if (isa<InlineAsm>(Callee))
+    return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (CI->isTailCall())
+    return false;
+
+  // Obtain calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  bool IsVarArg = FTy->isVarArg();
+
+  // Not ready for varargs yet.
+  if (IsVarArg)
+    return false;
+
+  // Handle simple calls for now, with legal return types and
+  // those that can be extended.
+  Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
+           RetVT != MVT::i8)
+    return false;
+
+  // FIXME: No multi-register return values yet.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i8 && RetVT != MVT::i16 &&
+      RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 &&
+      RetVT != MVT::f64) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    if (RVLocs.size() > 1)
+      return false;
+  }
+
+  // Bail early if more than 8 arguments, as we only currently
+  // handle arguments passed in registers.
+  unsigned NumArgs = CS.arg_size();
+  if (NumArgs > 8)
+    return false;
+
+  // Set up the argument vectors.
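+  // The four vectors below are kept in lockstep: element N holds the IR
+  // value, virtual register, value type, and ABI flags of argument N.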
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+
+  Args.reserve(NumArgs);
+  ArgRegs.reserve(NumArgs);
+  ArgVTs.reserve(NumArgs);
+  ArgFlags.reserve(NumArgs);
+
+  for (ImmutableCallSite::arg_iterator II = CS.arg_begin(), IE = CS.arg_end();
+       II != IE; ++II) {
+    // FIXME: ARM does something for intrinsic calls here, check into that.
+
+    unsigned AttrIdx = II - CS.arg_begin() + 1;
+
+    // Only handle easy calls for now.  It would be reasonably easy
+    // to handle <= 8-byte structures passed ByVal in registers, but we
+    // have to ensure they are right-justified in the register.
+    if (CS.paramHasAttr(AttrIdx, Attribute::InReg) ||
+        CS.paramHasAttr(AttrIdx, Attribute::StructRet) ||
+        CS.paramHasAttr(AttrIdx, Attribute::Nest) ||
+        CS.paramHasAttr(AttrIdx, Attribute::ByVal))
+      return false;
+
+    ISD::ArgFlagsTy Flags;
+    if (CS.paramHasAttr(AttrIdx, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrIdx, Attribute::ZExt))
+      Flags.setZExt();
+
+    Type *ArgTy = (*II)->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8)
+      return false;
+
+    if (ArgVT.isVector())
+      return false;
+
+    unsigned Arg = getRegForValue(*II);
+    if (Arg == 0)
+      return false;
+
+    unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(*II);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Process the arguments.
+  SmallVector<unsigned, 8> RegArgs;
+  unsigned NumBytes;
+
+  if (!processCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, IsVarArg))
+    return false;
+
+  // FIXME: No handling for function pointers yet.  This requires
+  // implementing the function descriptor (OPD) setup.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV)
+    return false;
+
+  // Build direct call with NOP for TOC restore.
+  // FIXME: We can and should optimize away the NOP for local calls.
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(PPC::BL8_NOP));
+  // Add callee.
+  MIB.addGlobalAddress(GV);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II)
+    MIB.addReg(RegArgs[II], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.  Proper
+  // defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  finishCall(RetVT, UsedRegs, I, CC, NumBytes, IsVarArg);
+
+  // Set all unused physregs defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
 // Attempt to fast-select a return instruction.
 bool PPCFastISel::SelectRet(const Instruction *I) {
 
@@ -1414,6 +1736,10 @@ bool PPCFastISel::TargetSelectInstruction(const Instruction *I) {
       return SelectBinaryIntOp(I, ISD::OR);
     case Instruction::Sub:
       return SelectBinaryIntOp(I, ISD::SUB);
+    case Instruction::Call:
+      if (dyn_cast<IntrinsicInst>(I))
+        return false;
+      return SelectCall(I);
     case Instruction::Ret:
      return SelectRet(I);
    case Instruction::ZExt:
@@ -1490,7 +1816,6 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
     // If GV is an alias, use the aliasee for determining thread-locality.
     if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
       GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
-    assert((GVar || isa<Function>(GV)) && "Unexpected GV subclass!");
   }
 
   // FIXME: We don't yet handle the complexity of TLS.
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 29c2270d4ab..244e00d4380 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1813,8 +1813,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
 // Function whose sole purpose is to kill compiler warnings
 // stemming from unused functions included from PPCGenCallingConv.inc.
 CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
-  /* One of these will be CC_PPC64_ELF_FIS in a future patch. */
-  return Flag ? RetCC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
+  return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
 }
 
 bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
diff --git a/test/CodeGen/PowerPC/fast-isel-call.ll b/test/CodeGen/PowerPC/fast-isel-call.ll
new file mode 100644
index 00000000000..33a8ba903e3
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-call.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define i32 @t1(i8 signext %a) nounwind {
+  %1 = sext i8 %a to i32
+  ret i32 %1
+}
+
+define i32 @t2(i8 zeroext %a) nounwind {
+  %1 = zext i8 %a to i32
+  ret i32 %1
+}
+
+define i32 @t3(i16 signext %a) nounwind {
+  %1 = sext i16 %a to i32
+  ret i32 %1
+}
+
+define i32 @t4(i16 zeroext %a) nounwind {
+  %1 = zext i16 %a to i32
+  ret i32 %1
+}
+
+define void @foo(i8 %a, i16 %b) nounwind {
+; ELF64: foo
+  %1 = call i32 @t1(i8 signext %a)
+; ELF64: extsb
+  %2 = call i32 @t2(i8 zeroext %a)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+  %3 = call i32 @t3(i16 signext %b)
+; ELF64: extsh
+  %4 = call i32 @t4(i16 zeroext %b)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+
+;; A few tests to check materialization
+  %5 = call i32 @t2(i8 zeroext 255)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+  %6 = call i32 @t4(i16 zeroext 65535)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+  ret void
+}
+
+define void @foo2() nounwind {
+  %1 = call signext i16 @t5()
+  %2 = call zeroext i16 @t6()
+  %3 = call signext i8 @t7()
+  %4 = call zeroext i8 @t8()
+  ret void
+}
+
+declare signext i16 @t5()
+declare zeroext i16 @t6()
+declare signext i8 @t7()
+declare zeroext i8 @t8()
+
+define i32 @t10(i32 %argc, i8** nocapture %argv) {
+entry:
+; ELF64: t10
+  %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70)
+; ELF64: li 3, 0
+; ELF64: li 4, 248
+; ELF64: li 5, 187
+; ELF64: li 6, 28
+; ELF64: li 7, 40
+; ELF64: li 8, 186
+; ELF64: rldicl 3, 3, 0, 56
+; ELF64: rldicl 4, 4, 0, 56
+; ELF64: rldicl 5, 5, 0, 56
+; ELF64: rldicl 6, 6, 0, 56
+; ELF64: rldicl 7, 7, 0, 56
+; ELF64: rldicl 8, 8, 0, 56
+  ret i32 0
+}
+
+declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext)
+
+define i32 @bar0(i32 %i) nounwind {
+  ret i32 0
+}
+
+; Function pointers are not yet implemented.
+;define void @foo3() uwtable {
+;  %fptr = alloca i32 (i32)*, align 8
+;  store i32 (i32)* @bar0, i32 (i32)** %fptr, align 8
+;  %1 = load i32 (i32)** %fptr, align 8
+;  %call = call i32 %1(i32 0)
+;  ret void
+;}
+
+; Intrinsic calls not yet implemented, and udiv isn't one for PPC anyway.
+;define i32 @LibCall(i32 %a, i32 %b) {
+;entry:
+;  %tmp1 = udiv i32 %a, %b        ; <i32> [#uses=1]
+;  ret i32 %tmp1
+;}
+
+declare void @float_foo(float %f) ssp
+
+define void @float_const() ssp {
+entry:
+; ELF64: float_const
+  call void @float_foo(float 0x401C666660000000)
+; ELF64: addis [[REG:[0-9]+]], 2, .LCPI[[SUF:[0-9_]+]]@toc@ha
+; ELF64: lfs 1, .LCPI[[SUF]]@toc@l([[REG]])
+  ret void
+}
+
+define void @float_reg(float %dummy, float %f) ssp {
+entry:
+; ELF64: float_reg
+  call void @float_foo(float %f)
+; ELF64: fmr 1, 2
+  ret void
+}
+
+declare void @double_foo(double %d) ssp
+
+define void @double_const() ssp {
+entry:
+; ELF64: double_const
+  call void @double_foo(double 0x1397723CCABD0000)
+; ELF64: addis [[REG2:[0-9]+]], 2, .LCPI[[SUF2:[0-9_]+]]@toc@ha
+; ELF64: lfd 1, .LCPI[[SUF2]]@toc@l([[REG2]])
+  ret void
+}
+
+define void @double_reg(double %dummy, double %d) ssp {
+entry:
+; ELF64: double_reg
+  call void @double_foo(double %d)
+; ELF64: fmr 1, 2
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-fold.ll b/test/CodeGen/PowerPC/fast-isel-fold.ll
index 21e691224df..4de345f309a 100644
--- a/test/CodeGen/PowerPC/fast-isel-fold.ll
+++ b/test/CodeGen/PowerPC/fast-isel-fold.ll
@@ -4,6 +4,40 @@
 @b = global i16 2, align 2
 @c = global i32 4, align 4
 
+define void @t1() nounwind uwtable ssp {
+; ELF64: t1
+  %1 = load i8* @a, align 1
+  call void @foo1(i8 zeroext %1)
+; ELF64: lbz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+  ret void
+}
+
+define void @t2() nounwind uwtable ssp {
+; ELF64: t2
+  %1 = load i16* @b, align 2
+  call void @foo2(i16 zeroext %1)
+; ELF64: lhz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+  ret void
+}
+
+define void @t2a() nounwind uwtable ssp {
+; ELF64: t2a
+  %1 = load i32* @c, align 4
+  call void @foo3(i32 zeroext %1)
+; ELF64: lwz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+  ret void
+}
+
+declare void @foo1(i8 zeroext)
+declare void @foo2(i16 zeroext)
+declare void @foo3(i32 zeroext)
+
 define i32 @t3() nounwind uwtable ssp {
 ; ELF64: t3
   %1 = load i8* @a, align 1