X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FPowerPC%2FPPCISelLowering.cpp;h=630385cc7c0c564ee46fb1c668f504852233031d;hb=4a816471f527b90464806892feeecc242491a459;hp=3bda37b43e0d60f96a3003928f15c9c767e2ba58;hpb=827307b95fa909e35a3ddef612f9f50ffcf0963a;p=oota-llvm.git diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 3bda37b43e0..630385cc7c0 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -16,6 +16,7 @@ #include "PPCMachineFunctionInfo.h" #include "PPCPerfectShuffle.h" #include "PPCTargetMachine.h" +#include "PPCTargetObjectFile.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -29,6 +30,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -36,21 +38,6 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; -static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); -static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); -static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); - static cl::opt DisablePPCPreinc("disable-ppc-preinc", cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden); @@ -64,13 +51,15 @@ static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) { if (TM.getSubtargetImpl()->isDarwin()) return new TargetLoweringObjectFileMachO(); + if (TM.getSubtargetImpl()->isSVR4ABI()) + return new PPC64LinuxTargetObjectFile(); + return new TargetLoweringObjectFileELF(); } PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) { const PPCSubtarget *Subtarget = &TM.getSubtarget(); - PPCRegInfo = TM.getRegisterInfo(); setPow2DivIsCheap(); @@ -116,6 +105,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand); setOperationAction(ISD::FRINT, MVT::ppcf128, Expand); setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand); + setOperationAction(ISD::FREM, MVT::ppcf128, Expand); // PowerPC has no SREM/UREM instructions setOperationAction(ISD::SREM, MVT::i32, Expand); @@ -160,28 +150,24 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) Subtarget->hasFRSQRTES() && Subtarget->hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + if (Subtarget->hasFCPSGN()) { + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); + } else { + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + } if (Subtarget->hasFPRND()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + setOperationAction(ISD::FROUND, MVT::f64, Legal); setOperationAction(ISD::FFLOOR, MVT::f32, 
Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - - // frin does not implement "ties to even." Thus, this is safe only in - // fast-math mode. - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); - - // These need to set FE_INEXACT, and use a custom inserter. - setOperationAction(ISD::FRINT, MVT::f64, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - } + setOperationAction(ISD::FROUND, MVT::f32, Legal); } // PowerPC does not have BSWAP, CTPOP or CTTZ @@ -239,11 +225,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // We cannot sextinreg(i1). Expand to shifts. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); - setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); - setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); - setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); - // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. As a result, no @@ -296,8 +277,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } else setOperationAction(ISD::VAARG, MVT::Other, Expand); + if (Subtarget->isSVR4ABI() && !isPPC64) + // VACOPY is custom lowered with the 32-bit SVR4 ABI. + setOperationAction(ISD::VACOPY , MVT::Other, Custom); + else + setOperationAction(ISD::VACOPY , MVT::Other, Expand); + // Use the default implementation. - setOperationAction(ISD::VACOPY , MVT::Other, Expand); setOperationAction(ISD::VAEND , MVT::Other, Expand); setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); @@ -307,6 +293,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + // To handle counter-based loop conditions. + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); + // Comparisons that require checking two conditions. setCondCodeAction(ISD::SETULT, MVT::f32, Expand); setCondCodeAction(ISD::SETULT, MVT::f64, Expand); @@ -405,6 +394,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); @@ -499,6 +489,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand); setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand); setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand); + + setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); } if (Subtarget->has64BitSupport()) { @@ -512,7 +505,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); setBooleanContents(ZeroOrOneBooleanContent); - setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? + // Altivec instructions set fields to all zeros or all ones. 
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); if (isPPC64) { setStackPointerRegisterToSaveRestore(PPC::X1); @@ -526,9 +520,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::BR_CC); setTargetDAGCombine(ISD::BSWAP); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); // Use reciprocal estimates. if (TM.Options.UnsafeFPMath) { @@ -561,7 +557,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setInsertFencesForAtomic(true); - setSchedulingPreference(Sched::Hybrid); + if (Subtarget->enableMachineScheduler()) + setSchedulingPreference(Sched::Source); + else + setSchedulingPreference(Sched::Hybrid); computeRegisterProperties(); @@ -580,24 +579,47 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } } +/// getMaxByValAlign - Helper for getByValTypeAlignment to determine +/// the desired ByVal argument alignment. +static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, + unsigned MaxMaxAlign) { + if (MaxAlign == MaxMaxAlign) + return; + if (VectorType *VTy = dyn_cast(Ty)) { + if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256) + MaxAlign = 32; + else if (VTy->getBitWidth() >= 128 && MaxAlign < 16) + MaxAlign = 16; + } else if (ArrayType *ATy = dyn_cast(Ty)) { + unsigned EltAlign = 0; + getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + } else if (StructType *STy = dyn_cast(Ty)) { + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + unsigned EltAlign = 0; + getMaxByValAlign(STy->getElementType(i), EltAlign, MaxMaxAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + if (MaxAlign == MaxMaxAlign) + break; + } + } +} + /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const { - const TargetMachine &TM = getTargetMachine(); // Darwin passes everything on 4 byte boundary. - if (TM.getSubtarget().isDarwin()) + if (PPCSubTarget.isDarwin()) return 4; // 16byte and wider vectors are passed on 16byte boundary. - if (VectorType *VTy = dyn_cast(Ty)) - if (VTy->getBitWidth() >= 128) - return 16; - // The rest is 8 on PPC64 and 4 on PPC32 boundary. - if (PPCSubTarget.isPPC64()) - return 8; - - return 4; + unsigned Align = PPCSubTarget.isPPC64() ? 8 : 4; + if (PPCSubTarget.hasAltivec() || PPCSubTarget.hasQPX()) + getMaxByValAlign(Ty, Align, PPCSubTarget.hasQPX() ? 
32 : 16); + return Align; } const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -631,7 +653,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG"; case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP"; - case PPCISD::MFCR: return "PPCISD::MFCR"; + case PPCISD::MFOCRF: return "PPCISD::MFOCRF"; case PPCISD::VCMP: return "PPCISD::VCMP"; case PPCISD::VCMPo: return "PPCISD::VCMPo"; case PPCISD::LBRX: return "PPCISD::LBRX"; @@ -639,6 +661,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::LARX: return "PPCISD::LARX"; case PPCISD::STCX: return "PPCISD::STCX"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; + case PPCISD::BDNZ: return "PPCISD::BDNZ"; + case PPCISD::BDZ: return "PPCISD::BDZ"; case PPCISD::MFFS: return "PPCISD::MFFS"; case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; @@ -647,6 +671,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::ADDIS_TOC_HA: return "PPCISD::ADDIS_TOC_HA"; case PPCISD::LD_TOC_L: return "PPCISD::LD_TOC_L"; case PPCISD::ADDI_TOC_L: return "PPCISD::ADDI_TOC_L"; + case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT"; case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; @@ -659,10 +684,11 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; + case PPCISD::SC: return "PPCISD::SC"; } } -EVT PPCTargetLowering::getSetCCResultType(EVT VT) const { +EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); @@ -1033,24 +1059,68 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, return false; } +// If we happen to be doing an i64 load or store into a stack slot that has +// less than a 4-byte alignment, then the frame-index elimination may need to +// use an indexed load or store instruction (because the offset may not be a +// multiple of 4). The extra register needed to hold the offset comes from the +// register scavenger, and it is possible that the scavenger will need to use +// an emergency spill slot. As a result, we need to make sure that a spill slot +// is allocated when doing an i64 load/store into a less-than-4-byte-aligned +// stack slot. +static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { + // FIXME: This does not handle the LWA case. + if (VT != MVT::i64) + return; + + // NOTE: We'll exclude negative FIs here, which come from argument + // lowering, because there are no known test cases triggering this problem + // using packed structures (or similar). We can remove this exclusion if + // we find such a test case. The reason why this is so test-case driven is + // because this entire 'fixup' is only to prevent crashes (from the + // register scavenger) on not-really-valid inputs. For example, if we have: + // %a = alloca i1 + // %b = bitcast i1* %a to i64* + // store i64* a, i64 b + // then the store should really be marked as 'align 1', but is not. 
If it + // were marked as 'align 1' then the indexed form would have been + // instruction-selected initially, and the problem this 'fixup' is preventing + // won't happen regardless. + if (FrameIdx < 0) + return; + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + unsigned Align = MFI->getObjectAlignment(FrameIdx); + if (Align >= 4) + return; + + PPCFunctionInfo *FuncInfo = MF.getInfo(); + FuncInfo->setHasNonRISpills(); +} + /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better -/// represented as reg+reg. +/// represented as reg+reg. If Aligned is true, only accept displacements +/// suitable for STD and friends, i.e. multiples of 4. bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, - SelectionDAG &DAG) const { + SelectionDAG &DAG, + bool Aligned) const { // FIXME dl should come from parent load or store, not from address - DebugLoc dl = N.getDebugLoc(); + SDLoc dl(N); // If this can be more profitably realized as r+r, fail. if (SelectAddressRegReg(N, Disp, Base, DAG)) return false; if (N.getOpcode() == ISD::ADD) { short imm = 0; - if (isIntS16Immediate(N.getOperand(1), imm)) { - Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32); + if (isIntS16Immediate(N.getOperand(1), imm) && + (!Aligned || (imm & 3) == 0)) { + Disp = DAG.getTargetConstant(imm, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); } else { Base = N.getOperand(0); } @@ -1069,7 +1139,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } } else if (N.getOpcode() == ISD::OR) { short imm = 0; - if (isIntS16Immediate(N.getOperand(1), imm)) { + if (isIntS16Immediate(N.getOperand(1), imm) && + (!Aligned || (imm & 3) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. @@ -1080,7 +1151,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If all of the bits are known zero on the LHS or RHS, the add won't // carry. Base = N.getOperand(0); - Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32); + Disp = DAG.getTargetConstant(imm, N.getValueType()); return true; } } @@ -1090,7 +1161,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" short Imm; - if (isIntS16Immediate(CN, Imm)) { + if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { Disp = DAG.getTargetConstant(Imm, CN->getValueType(0)); Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); @@ -1098,8 +1169,9 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } // Handle 32-bit sext immediates with LIS + addr mode. - if (CN->getValueType(0) == MVT::i32 || - (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) { + if ((CN->getValueType(0) == MVT::i32 || + (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && + (!Aligned || (CN->getZExtValue() & 3) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. 
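
The Aligned flag threaded through SelectAddressRegImm above replaces the old SelectAddressRegImmShift path: DS-form memory instructions such as LD, STD and LWA encode a 14-bit displacement whose low two bits are implied zero, so only 16-bit signed offsets that are multiples of 4 may be folded into their [r+imm] addressing mode. A minimal standalone sketch of that acceptance test (the helper name is illustrative, not part of the patch):

    #include <cstdint>

    // True if Offset can be folded into a [r+imm] address. When Aligned is
    // set (the LD/STD/LWA case), the offset must also be a multiple of 4,
    // because the DS displacement field drops the low two bits.
    static bool fitsRegImmDisplacement(int64_t Offset, bool Aligned) {
      bool FitsSigned16 = Offset >= INT16_MIN && Offset <= INT16_MAX;
      return FitsSigned16 && (!Aligned || (Offset & 3) == 0);
    }

This mirrors the isIntS16Immediate(...) && (!Aligned || (imm & 3) == 0) checks added in the hunks above.
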
@@ -1113,9 +1185,10 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } Disp = DAG.getTargetConstant(0, getPointerTy()); - if (FrameIndexSDNode *FI = dyn_cast(N)) + if (FrameIndexSDNode *FI = dyn_cast(N)) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); - else + fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); + } else Base = N; return true; // [r+0] } @@ -1147,92 +1220,6 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, return true; } -/// SelectAddressRegImmShift - Returns true if the address N can be -/// represented by a base register plus a signed 14-bit displacement -/// [r+imm*4]. Suitable for use by STD and friends. -bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp, - SDValue &Base, - SelectionDAG &DAG) const { - // FIXME dl should come from the parent load or store, not the address - DebugLoc dl = N.getDebugLoc(); - // If this can be more profitably realized as r+r, fail. - if (SelectAddressRegReg(N, Disp, Base, DAG)) - return false; - - if (N.getOpcode() == ISD::ADD) { - short imm = 0; - if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) { - Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32); - if (FrameIndexSDNode *FI = dyn_cast(N.getOperand(0))) { - Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); - } else { - Base = N.getOperand(0); - } - return true; // [r+i] - } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { - // Match LOAD (ADD (X, Lo(G))). - assert(!cast(N.getOperand(1).getOperand(1))->getZExtValue() - && "Cannot handle constant offsets yet!"); - Disp = N.getOperand(1).getOperand(0); // The global address. - assert(Disp.getOpcode() == ISD::TargetGlobalAddress || - Disp.getOpcode() == ISD::TargetConstantPool || - Disp.getOpcode() == ISD::TargetJumpTable); - Base = N.getOperand(0); - return true; // [&g+r] - } - } else if (N.getOpcode() == ISD::OR) { - short imm = 0; - if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) { - // If this is an or of disjoint bitfields, we can codegen this as an add - // (for better address arithmetic) if the LHS and RHS of the OR are - // provably disjoint. - APInt LHSKnownZero, LHSKnownOne; - DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne); - if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { - // If all of the bits are known zero on the LHS or RHS, the add won't - // carry. - Base = N.getOperand(0); - Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32); - return true; - } - } - } else if (ConstantSDNode *CN = dyn_cast(N)) { - // Loading from a constant address. Verify low two bits are clear. - if ((CN->getZExtValue() & 3) == 0) { - // If this address fits entirely in a 14-bit sext immediate field, codegen - // this as "d, 0" - short Imm; - if (isIntS16Immediate(CN, Imm)) { - Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy()); - Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, - CN->getValueType(0)); - return true; - } - - // Fold the low-part of 32-bit absolute addresses into addr mode. - if (CN->getValueType(0) == MVT::i32 || - (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) { - int Addr = (int)CN->getZExtValue(); - - // Otherwise, break this down into an LIS + disp. - Disp = DAG.getTargetConstant((short)Addr >> 2, MVT::i32); - Base = DAG.getTargetConstant((Addr-(signed short)Addr) >> 16, MVT::i32); - unsigned Opc = CN->getValueType(0) == MVT::i32 ? 
PPC::LIS : PPC::LIS8; - Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base),0); - return true; - } - } - } - - Disp = DAG.getTargetConstant(0, getPointerTy()); - if (FrameIndexSDNode *FI = dyn_cast(N)) - Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); - else - Base = N; - return true; // [r+0] -} - - /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. @@ -1285,18 +1272,16 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, return true; } - // LDU/STU use reg+imm*4, others use reg+imm. + // LDU/STU can only handle immediates that are a multiple of 4. if (VT != MVT::i64) { - // reg + imm - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) return false; } else { // LDU/STU need an address with at least 4-byte alignment. if (Alignment < 4) return false; - // reg + imm * 4. - if (!SelectAddressRegImmShift(Ptr, Offset, Base, DAG)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) return false; } @@ -1321,8 +1306,8 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, /// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags. static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags, unsigned &LoOpFlags, const GlobalValue *GV = 0) { - HiOpFlags = PPCII::MO_HA16; - LoOpFlags = PPCII::MO_LO16; + HiOpFlags = PPCII::MO_HA; + LoOpFlags = PPCII::MO_LO; // Don't use the pic base if not in PIC relocation model. Or if we are on a // non-darwin platform. We don't support PIC on other platforms yet. @@ -1352,7 +1337,7 @@ static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC, SelectionDAG &DAG) { EVT PtrVT = HiPart.getValueType(); SDValue Zero = DAG.getConstant(0, PtrVT); - DebugLoc DL = HiPart.getDebugLoc(); + SDLoc DL(HiPart); SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero); SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero); @@ -1377,7 +1362,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, // The actual address of the GlobalValue is stored in the TOC. if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); - return DAG.getNode(PPCISD::TOC_ENTRY, CP->getDebugLoc(), MVT::i64, GA, + return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); } @@ -1398,7 +1383,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // The actual address of the GlobalValue is stored in the TOC. if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); - return DAG.getNode(PPCISD::TOC_ENTRY, JT->getDebugLoc(), MVT::i64, GA, + return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); } @@ -1425,8 +1410,12 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + // FIXME: TLS addresses currently use medium model code sequences, + // which is the most useful form. Eventually support for small and + // large models could be added if users need it, at the cost of + // additional complexity. 
GlobalAddressSDNode *GA = cast(Op); - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(); bool is64bit = PPCSubTarget.isPPC64(); @@ -1435,26 +1424,29 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, if (Model == TLSModel::LocalExec) { SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - PPCII::MO_TPREL16_HA); + PPCII::MO_TPREL_HA); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - PPCII::MO_TPREL16_LO); + PPCII::MO_TPREL_LO); SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, is64bit ? MVT::i64 : MVT::i32); SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); } - if (!is64bit) - llvm_unreachable("only local-exec is currently supported for ppc32"); - if (Model == TLSModel::InitialExec) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); - SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); - SDValue TPOffsetHi = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, - PtrVT, GOTReg, TGA); + SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TLS); + SDValue GOTPtr; + if (is64bit) { + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, + PtrVT, GOTReg, TGA); + } else + GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, - PtrVT, TGA, TPOffsetHi); - return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGA); + PtrVT, TGA, GOTPtr); + return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); } if (Model == TLSModel::GeneralDynamic) { @@ -1512,7 +1504,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); GlobalAddressSDNode *GSDN = cast(Op); - DebugLoc DL = GSDN->getDebugLoc(); + SDLoc DL(GSDN); const GlobalValue *GV = GSDN->getGlobal(); // 64-bit SVR4 ABI code is always position-independent. 
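
For reference, the local-exec and initial-exec lowerings introduced in LowerGlobalTLSAddress above correspond, on 64-bit SVR4/ELF, roughly to the usual code sequences; on 32-bit SVR4 the initial-exec GOT pointer now comes from the new PPC32_GOT node instead of X2. A hedged sketch of what an access to a thread-local integer is expected to compile to (the assembly in the comments is the conventional ABI sequence, not output captured from this patch):

    // Exercises the TLS lowerings above.
    __thread int tls_counter;

    int readCounter() {
      // Local-exec (TLSModel::LocalExec), r13 is the thread pointer:
      //   addis r3, r13, tls_counter@tprel@ha
      //   lwz   r3, tls_counter@tprel@l(r3)
      // Initial-exec (TLSModel::InitialExec), offset loaded from the TOC/GOT:
      //   addis r3, r2, tls_counter@got@tprel@ha
      //   ld    r3, tls_counter@got@tprel@l(r3)
      //   add   r3, r3, tls_counter@tls   # @tls resolves to the thread pointer
      //   lwz   r3, 0(r3)
      return tls_counter;
    }
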
@@ -1543,7 +1535,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(2))->get(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // If we're comparing for equality to zero, expose the fact that this is // implented as a ctlz/srl pair on ppc, so that the dag combiner can @@ -1592,7 +1584,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, SDValue InChain = Node->getOperand(0); SDValue VAListPtr = Node->getOperand(1); const Value *SV = cast(Node->getOperand(2))->getValue(); - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only"); @@ -1692,6 +1684,18 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, false, false, false, 0); } +SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const { + assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only"); + + // We have to copy the entire va_list struct: + // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte + return DAG.getMemcpy(Op.getOperand(0), Op, + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(12, MVT::i32), 8, false, true, + MachinePointerInfo(), MachinePointerInfo()); +} + SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { return Op.getOperand(0); @@ -1703,7 +1707,7 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); bool isPPC64 = (PtrVT == MVT::i64); @@ -1745,7 +1749,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, MachineFunction &MF = DAG.getMachineFunction(); PPCFunctionInfo *FuncInfo = MF.getInfo(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) { // vastart just stores the address of the VarArgsFrameIndex slot into the @@ -1839,18 +1843,24 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, #include "PPCGenCallingConv.inc" -static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { +// Function whose sole purpose is to kill compiler warnings +// stemming from unused functions included from PPCGenCallingConv.inc. +CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const { + return Flag ? 
CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS; +} + +bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { return true; } -static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { +bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { static const uint16_t ArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, @@ -1873,11 +1883,11 @@ static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, return false; } -static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { +bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { static const uint16_t ArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 @@ -1928,7 +1938,7 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { if (PPCSubTarget.isSVR4ABI()) { @@ -1950,7 +1960,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // 32-bit SVR4 ABI Stack Frame Layout: @@ -2167,14 +2177,14 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG, SDValue ArgVal, - DebugLoc dl) const { + SDLoc dl) const { if (Flags.isSExt()) ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, DAG.getValueType(ObjectVT)); else if (Flags.isZExt()) ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, DAG.getValueType(ObjectVT)); - + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); } @@ -2210,7 +2220,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // @@ -2301,6 +2311,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( InVals.push_back(FIN); continue; } + + unsigned BVAlign = Flags.getByValAlign(); + if (BVAlign > 8) { + ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign; + CurArgOffset = ArgOffset; + } + // All aggregates smaller than 8 bytes must be passed right-justified. if (ObjSize < PtrByteSize) CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize); @@ -2499,7 +2516,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // @@ -2597,17 +2614,17 @@ PPCTargetLowering::LowerFormalArguments_Darwin( SmallVector MemOps; unsigned nAltivecParamsAtEnd = 0; - // FIXME: FuncArg and Ins[ArgNo] must reference the same argument. 
- // When passing anonymous aggregates, this is currently not true. - // See LowerFormalArguments_64SVR4 for a fix. Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); - for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) { + unsigned CurArgIdx = 0; + for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; unsigned ObjSize = ObjectVT.getSizeInBits()/8; unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; + std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx); + CurArgIdx = Ins[ArgNo].OrigArgIndex; unsigned CurArgOffset = ArgOffset; @@ -2954,7 +2971,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (Flags.isByVal()) return false; } - // Non PIC/GOT tail calls are supported. + // Non-PIC/GOT tail calls are supported. if (getTargetMachine().getRelocationModel() != Reloc::PIC_) return true; @@ -2999,9 +3016,9 @@ struct TailCallArgumentInfo { static void StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG, SDValue Chain, - const SmallVector &TailCallArgs, - SmallVector &MemOpChains, - DebugLoc dl) { + const SmallVectorImpl &TailCallArgs, + SmallVectorImpl &MemOpChains, + SDLoc dl) { for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) { SDValue Arg = TailCallArgs[i].Arg; SDValue FIN = TailCallArgs[i].FrameIdxOp; @@ -3023,7 +3040,7 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, int SPDiff, bool isPPC64, bool isDarwinABI, - DebugLoc dl) { + SDLoc dl) { if (SPDiff) { // Calculate the new stack slot for the return address. int SlotSize = isPPC64 ? 8 : 4; @@ -3058,7 +3075,7 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, static void CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64, SDValue Arg, int SPDiff, unsigned ArgOffset, - SmallVector& TailCallArguments) { + SmallVectorImpl& TailCallArguments) { int Offset = ArgOffset + SPDiff; uint32_t OpSize = (Arg.getValueType().getSizeInBits()+7)/8; int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); @@ -3080,7 +3097,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG, SDValue &LROpOut, SDValue &FPOpOut, bool isDarwinABI, - DebugLoc dl) const { + SDLoc dl) const { if (SPDiff) { // Load the LR and FP stack slot for later adjusting. EVT VT = PPCSubTarget.isPPC64() ? 
MVT::i64 : MVT::i32; @@ -3110,7 +3127,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG, static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, - DebugLoc dl) { + SDLoc dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), false, false, MachinePointerInfo(0), @@ -3123,9 +3140,9 @@ static void LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg, SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64, bool isTailCall, - bool isVector, SmallVector &MemOpChains, - SmallVector &TailCallArguments, - DebugLoc dl) { + bool isVector, SmallVectorImpl &MemOpChains, + SmallVectorImpl &TailCallArguments, + SDLoc dl) { EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); if (!isTailCall) { if (isVector) { @@ -3146,9 +3163,9 @@ LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, static void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, - DebugLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes, + SDLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes, SDValue LROp, SDValue FPOp, bool isDarwinABI, - SmallVector &TailCallArguments) { + SmallVectorImpl &TailCallArguments) { MachineFunction &MF = DAG.getMachineFunction(); // Emit a sequence of copyto/copyfrom virtual registers for arguments that @@ -3168,15 +3185,15 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, // Emit callseq_end just before tailcall node. Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag); + DAG.getIntPtrConstant(0, true), InFlag, dl); InFlag = Chain.getValue(1); } static unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, - SDValue &Chain, DebugLoc dl, int SPDiff, bool isTailCall, - SmallVector, 8> &RegsToPass, - SmallVector &Ops, std::vector &NodeTys, + SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall, + SmallVectorImpl > &RegsToPass, + SmallVectorImpl &Ops, std::vector &NodeTys, const PPCSubtarget &PPCSubTarget) { bool isPPC64 = PPCSubTarget.isPPC64(); @@ -3360,7 +3377,7 @@ SDValue PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { SmallVector RVLocs; @@ -3403,7 +3420,7 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, } SDValue -PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, +PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall, bool isVarArg, SelectionDAG &DAG, SmallVector, 8> @@ -3473,7 +3490,9 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, // from allocating it), resulting in an additional register being // allocated and an unnecessary move instruction being generated. needsTOCRestore = true; - } else if ((CallOpc == PPCISD::CALL) && !isLocalCall(Callee)) { + } else if ((CallOpc == PPCISD::CALL) && + (!isLocalCall(Callee) || + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { // Otherwise insert NOP for non-local calls. 
CallOpc = PPCISD::CALL_NOP; } @@ -3490,7 +3509,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), DAG.getIntPtrConstant(BytesCalleePops, true), - InFlag); + InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); @@ -3502,10 +3521,10 @@ SDValue PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; - DebugLoc &dl = CLI.DL; - SmallVector &Outs = CLI.Outs; - SmallVector &OutVals = CLI.OutVals; - SmallVector &Ins = CLI.Ins; + SDLoc &dl = CLI.DL; + SmallVectorImpl &Outs = CLI.Outs; + SmallVectorImpl &OutVals = CLI.OutVals; + SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; @@ -3539,7 +3558,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description // of the 32-bit SVR4 ABI stack frame layout. @@ -3625,7 +3644,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), + dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be moved somewhere else @@ -3676,7 +3696,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, // This must go outside the CALLSEQ_START..END. SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1)); + CallSeqStart.getNode()->getOperand(1), + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); Chain = CallSeqStart = NewCallSeqStart; @@ -3752,13 +3773,14 @@ PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, - DebugLoc dl) const { + SDLoc dl) const { SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // The MEMCPY must go outside the CALLSEQ_START..END. SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1)); + CallSeqStart.getNode()->getOperand(1), + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); return NewCallSeqStart; @@ -3771,7 +3793,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { unsigned NumOps = Outs.size(); @@ -3812,7 +3834,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), + dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else @@ -3886,6 +3909,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, if (Size == 0) continue; + unsigned BVAlign = Flags.getByValAlign(); + if (BVAlign > 8) { + if (BVAlign % PtrByteSize != 0) + llvm_unreachable( + "ByVal alignment is not a multiple of the pointer size"); + + ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign; + } + // All aggregates smaller than 8 bytes must be passed right-justified. if (Size==1 || Size==2 || Size==4) { EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); @@ -3937,7 +3969,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // register. // FIXME: The memcpy seems to produce pretty awful code for // small aggregates, particularly for packed ones. - // FIXME: It would be preferable to use the slot in the + // FIXME: It would be preferable to use the slot in the // parameter save area instead of a new local variable. SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType()); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); @@ -3977,7 +4009,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, continue; } - switch (Arg.getValueType().getSimpleVT().SimpleTy) { + switch (Arg.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unexpected ValueType for argument!"); case MVT::i32: case MVT::i64: @@ -4000,7 +4032,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // must be passed right-justified in the stack doubleword, and // in the GPR, if one is available. SDValue StoreOff; - if (Arg.getValueType().getSimpleVT().SimpleTy == MVT::f32) { + if (Arg.getSimpleValueType().SimpleTy == MVT::f32) { SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } else @@ -4142,7 +4174,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { unsigned NumOps = Outs.size(); @@ -4183,7 +4215,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), + dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else @@ -4307,7 +4340,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, continue; } - switch (Arg.getValueType().getSimpleVT().SimpleTy) { + switch (Arg.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unexpected ValueType for argument!"); case MVT::i32: case MVT::i64: @@ -4499,7 +4532,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, - DebugLoc dl, SelectionDAG &DAG) const { + SDLoc dl, SelectionDAG &DAG) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), @@ -4548,7 +4581,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain, SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { // When we pop the dynamic allocation we need to restore the SP link. - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Get the corect type for pointers. EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); @@ -4633,7 +4666,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Get the corect type for pointers. EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); @@ -4650,7 +4683,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); @@ -4658,7 +4691,7 @@ SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } @@ -4671,22 +4704,37 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { !Op.getOperand(2).getValueType().isFloatingPoint()) return Op; - ISD::CondCode CC = cast(Op.getOperand(4))->get(); + // We might be able to do better than this under some circumstances, but in + // general, fsel-based lowering of select is a finite-math-only optimization. + // For more information, see section F.3 of the 2.06 ISA specification. + if (!DAG.getTarget().Options.NoInfsFPMath || + !DAG.getTarget().Options.NoNaNsFPMath) + return Op; - // Cannot handle SETEQ/SETNE. - if (CC == ISD::SETEQ || CC == ISD::SETNE) return Op; + ISD::CondCode CC = cast(Op.getOperand(4))->get(); EVT ResVT = Op.getValueType(); EVT CmpVT = Op.getOperand(0).getValueType(); SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // If the RHS of the comparison is a 0.0, we don't need to do the // subtraction at all. + SDValue Sel1; if (isFloatingPointZero(RHS)) switch (CC) { default: break; // SETUO etc aren't handled by fsel. 
+ case ISD::SETNE: + std::swap(TV, FV); + case ISD::SETEQ: + if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); + Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); + if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits + Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, + DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV); case ISD::SETULT: case ISD::SETLT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt @@ -4709,44 +4757,55 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp; switch (CC) { default: break; // SETUO etc aren't handled by fsel. + case ISD::SETNE: + std::swap(TV, FV); + case ISD::SETEQ: + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits + Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); + Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); + if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits + Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, + DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); case ISD::SETULT: case ISD::SETLT: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); - return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOGE: case ISD::SETGE: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); - return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); case ISD::SETUGT: case ISD::SETGT: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); - return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOLE: case ISD::SETLE: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); - return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); } return Op; } // FIXME: Split this code up when LegalizeDAGTypes lands. SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, - DebugLoc dl) const { + SDLoc dl) const { assert(Op.getOperand(0).getValueType().isFloatingPoint()); SDValue Src = Op.getOperand(0); if (Src.getValueType() == MVT::f32) Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); SDValue Tmp; - switch (Op.getValueType().getSimpleVT().SimpleTy) { + switch (Op.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ : @@ -4798,7 +4857,7 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Don't handle ppc_fp128 here; let it be lowered to a libcall. 
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); @@ -4932,7 +4991,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); /* The rounding mode is in bits 30:31 of FPSR, and has the following settings: @@ -4998,7 +5057,7 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SHL!"); @@ -5026,7 +5085,7 @@ SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned BitWidth = VT.getSizeInBits(); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && @@ -5054,7 +5113,7 @@ SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { } SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); assert(Op.getNumOperands() == 3 && @@ -5089,7 +5148,7 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { /// BuildSplatI - Build a canonical splati of Val with an element size of /// SplatSize. Cast the result to VT. static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, - SelectionDAG &DAG, DebugLoc dl) { + SelectionDAG &DAG, SDLoc dl) { assert(Val >= -16 && Val <= 15 && "vsplti is out of range!"); static const EVT VTys[] = { // canonical VT to use for each size. @@ -5113,10 +5172,20 @@ static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res); } +/// BuildIntrinsicOp - Return a unary operator intrinsic node with the +/// specified intrinsic ID. +static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, + SelectionDAG &DAG, SDLoc dl, + EVT DestVT = MVT::Other) { + if (DestVT == MVT::Other) DestVT = Op.getValueType(); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, + DAG.getConstant(IID, MVT::i32), Op); +} + /// BuildIntrinsicOp - Return a binary operator intrinsic node with the /// specified intrinsic ID. static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS, - SelectionDAG &DAG, DebugLoc dl, + SelectionDAG &DAG, SDLoc dl, EVT DestVT = MVT::Other) { if (DestVT == MVT::Other) DestVT = LHS.getValueType(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, @@ -5127,7 +5196,7 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS, /// specified intrinsic ID. static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, SDValue Op2, SelectionDAG &DAG, - DebugLoc dl, EVT DestVT = MVT::Other) { + SDLoc dl, EVT DestVT = MVT::Other) { if (DestVT == MVT::Other) DestVT = Op0.getValueType(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2); @@ -5137,7 +5206,7 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, /// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified /// amount. The result has the specified value type. 
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, - EVT VT, SelectionDAG &DAG, DebugLoc dl) { + EVT VT, SelectionDAG &DAG, SDLoc dl) { // Force LHS/RHS to be the right type. LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS); RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS); @@ -5156,7 +5225,7 @@ static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, // sequence of ops that should be used. SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); @@ -5312,7 +5381,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, /// the specified operations to build the shuffle. static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, - DebugLoc dl) { + SDLoc dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); @@ -5391,7 +5460,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, /// lowered into a vperm. SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); ShuffleVectorSDNode *SVOp = cast(Op); @@ -5558,7 +5627,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { // If this is a lowered altivec predicate compare, CompareOpc is set to the // opcode number of the comparison. - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); int CompareOpc; bool isDot; if (!getAltivecCompareInfo(Op, CompareOpc, isDot)) @@ -5583,7 +5652,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Now that we have the comparison, emit a copy from the CR to a GPR. // This is flagged to the above dot comparison. - SDValue Flags = DAG.getNode(PPCISD::MFCR, dl, MVT::i32, + SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32, DAG.getRegister(PPC::CR6, MVT::i32), CompNode.getValue(1)); @@ -5622,7 +5691,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Create a stack slot that is 16-byte aligned. 
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); @@ -5639,7 +5708,7 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, } SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (Op.getValueType() == MVT::v4i32) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); @@ -5716,6 +5785,9 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VAARG: return LowerVAARG(Op, DAG, PPCSubTarget); + case ISD::VACOPY: + return LowerVACOPY(Op, DAG, PPCSubTarget); + case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, PPCSubTarget); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget); @@ -5726,7 +5798,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, - Op.getDebugLoc()); + SDLoc(Op)); case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); @@ -5743,6 +5815,9 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); + // For counter-based loop handling. + case ISD::INTRINSIC_W_CHAIN: return SDValue(); + // Frame & Return address. case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); @@ -5753,10 +5828,26 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const { const TargetMachine &TM = getTargetMachine(); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + case ISD::INTRINSIC_W_CHAIN: { + if (cast(N->getOperand(1))->getZExtValue() != + Intrinsic::ppc_is_decremented_ctr_nonzero) + break; + + assert(N->getValueType(0) == MVT::i1 && + "Unexpected result type for CTR decrement intrinsic"); + EVT SVT = getSetCCResultType(*DAG.getContext(), N->getValueType(0)); + SDVTList VTs = DAG.getVTList(SVT, MVT::Other); + SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), + N->getOperand(1)); + + Results.push_back(NewInt); + Results.push_back(NewInt.getValue(1)); + break; + } case ISD::VAARG: { if (!TM.getSubtarget().isSVR4ABI() || TM.getSubtarget().isPPC64()) @@ -5792,6 +5883,9 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::FP_TO_SINT: + // LowerFP_TO_INT() can only handle f32 and f64. + if (N->getOperand(0).getValueType() == MVT::ppcf128) + return; Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); return; } @@ -6063,6 +6157,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // thisMBB: const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t TOCOffset = 3 * PVT.getStoreSize(); + const int64_t BPOffset = 4 * PVT.getStoreSize(); // Prepare IP either in reg. 
const TargetRegisterClass *PtrRC = getRegClassFor(PVT); @@ -6072,15 +6167,32 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, if (PPCSubTarget.isPPC64() && PPCSubTarget.isSVR4ABI()) { MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) .addReg(PPC::X2) - .addImm(TOCOffset / 4) + .addImm(TOCOffset) .addReg(BufReg); - MIB.setMemRefs(MMOBegin, MMOEnd); } + // Naked functions never have a base pointer, and so we use r1. For all + // other functions, this decision must be delayed until during PEI. + unsigned BaseReg; + if (MF->getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::Naked)) + BaseReg = PPCSubTarget.isPPC64() ? PPC::X1 : PPC::R1; + else + BaseReg = PPCSubTarget.isPPC64() ? PPC::BP8 : PPC::BP; + + MIB = BuildMI(*thisMBB, MI, DL, + TII->get(PPCSubTarget.isPPC64() ? PPC::STD : PPC::STW)) + .addReg(BaseReg) + .addImm(BPOffset) + .addReg(BufReg); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Setup - MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCL)).addMBB(mainMBB); - MIB.addRegMask(PPCRegInfo->getNoPreservedMask()); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); + const PPCRegisterInfo *TRI = + static_cast(getTargetMachine().getRegisterInfo()); + MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); @@ -6100,7 +6212,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, if (PPCSubTarget.isPPC64()) { MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) .addReg(LabelReg) - .addImm(LabelOffset / 4) + .addImm(LabelOffset) .addReg(BufReg); } else { MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW)) @@ -6147,12 +6259,14 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Since FP is only updated here but NOT referenced, it's treated as GPR. unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; + unsigned BP = (PVT == MVT::i64) ? PPC::X30 : PPC::R30; MachineInstrBuilder MIB; const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t SPOffset = 2 * PVT.getStoreSize(); const int64_t TOCOffset = 3 * PVT.getStoreSize(); + const int64_t BPOffset = 4 * PVT.getStoreSize(); unsigned BufReg = MI->getOperand(0).getReg(); @@ -6173,7 +6287,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Reload IP if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp) - .addImm(LabelOffset / 4) + .addImm(LabelOffset) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp) @@ -6185,7 +6299,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Reload SP if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP) - .addImm(SPOffset / 4) + .addImm(SPOffset) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP) @@ -6194,13 +6308,22 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, } MIB.setMemRefs(MMOBegin, MMOEnd); - // FIXME: When we also support base pointers, that register must also be - // restored here. 
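// --- Editor's sketch (not part of the patch) --------------------------------
// The setjmp/longjmp buffer written by emitEHSjLjSetJmp and read back by
// emitEHSjLjLongJmp is indexed in units of PVT.getStoreSize(): LabelOffset = 1,
// SPOffset = 2, TOCOffset = 3, and the newly added BPOffset = 4 slots from the
// buffer base.  A rough 64-bit model of that layout; slot 0 holding the frame
// pointer is an assumption from the surrounding code (not shown in this hunk),
// and the struct name is hypothetical.
#include <cstdint>
struct PPC64SjLjBuffer {
  uint64_t FramePointer;  // slot 0: saved frame pointer (assumed)
  uint64_t ResumeAddress; // slot 1: LabelOffset, target the longjmp branches to
  uint64_t StackPointer;  // slot 2: SPOffset, saved r1
  uint64_t TOCPointer;    // slot 3: TOCOffset, saved r2 (64-bit SVR4 only)
  uint64_t BasePointer;   // slot 4: BPOffset, saved BP (or r1 in naked functions)
};
// -----------------------------------------------------------------------------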
+ // Reload BP + if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP) + .addImm(BPOffset) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP) + .addImm(BPOffset) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); // Reload TOC if (PVT == MVT::i64 && PPCSubTarget.isSVR4ABI()) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) - .addImm(TOCOffset / 4) + .addImm(TOCOffset) .addReg(BufReg); MIB.setMemRefs(MMOBegin, MMOEnd); @@ -6238,29 +6361,15 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, if (PPCSubTarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || MI->getOpcode() == PPC::SELECT_CC_I8)) { - unsigned OpCode = MI->getOpcode() == PPC::SELECT_CC_I8 ? - PPC::ISEL8 : PPC::ISEL; - unsigned SelectPred = MI->getOperand(4).getImm(); - DebugLoc dl = MI->getDebugLoc(); + SmallVector Cond; + Cond.push_back(MI->getOperand(4)); + Cond.push_back(MI->getOperand(1)); - unsigned SubIdx; - bool SwapOps; - switch (SelectPred) { - default: llvm_unreachable("invalid predicate for isel"); - case PPC::PRED_EQ: SubIdx = PPC::sub_eq; SwapOps = false; break; - case PPC::PRED_NE: SubIdx = PPC::sub_eq; SwapOps = true; break; - case PPC::PRED_LT: SubIdx = PPC::sub_lt; SwapOps = false; break; - case PPC::PRED_GE: SubIdx = PPC::sub_lt; SwapOps = true; break; - case PPC::PRED_GT: SubIdx = PPC::sub_gt; SwapOps = false; break; - case PPC::PRED_LE: SubIdx = PPC::sub_gt; SwapOps = true; break; - case PPC::PRED_UN: SubIdx = PPC::sub_un; SwapOps = false; break; - case PPC::PRED_NU: SubIdx = PPC::sub_un; SwapOps = true; break; - } - - BuildMI(*BB, MI, dl, TII->get(OpCode), MI->getOperand(0).getReg()) - .addReg(MI->getOperand(SwapOps? 3 : 2).getReg()) - .addReg(MI->getOperand(SwapOps? 2 : 3).getReg()) - .addReg(MI->getOperand(1).getReg(), 0, SubIdx); + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(), + Cond, MI->getOperand(2).getReg(), + MI->getOperand(3).getReg()); } else if (MI->getOpcode() == PPC::SELECT_CC_I4 || MI->getOpcode() == PPC::SELECT_CC_I8 || MI->getOpcode() == PPC::SELECT_CC_F4 || @@ -6620,51 +6729,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Restore FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg); - } else if (MI->getOpcode() == PPC::FRINDrint || - MI->getOpcode() == PPC::FRINSrint) { - bool isf32 = MI->getOpcode() == PPC::FRINSrint; - unsigned Dest = MI->getOperand(0).getReg(); - unsigned Src = MI->getOperand(1).getReg(); - DebugLoc dl = MI->getDebugLoc(); - - MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); - - // Perform the rounding. - BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FRINS : PPC::FRIND), Dest) - .addReg(Src); - - // Compare the results. - BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FCMPUS : PPC::FCMPUD), CRReg) - .addReg(Dest).addReg(Src); - - // If the results were not equal, then set the FPSCR XX bit. 
- MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(It, midMBB); - F->insert(It, exitMBB); - exitMBB->splice(exitMBB->begin(), BB, - llvm::next(MachineBasicBlock::iterator(MI)), - BB->end()); - exitMBB->transferSuccessorsAndUpdatePHIs(BB); - - BuildMI(*BB, MI, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_EQ).addReg(CRReg).addMBB(exitMBB); - - BB->addSuccessor(midMBB); - BB->addSuccessor(exitMBB); - - BB = midMBB; - - // Set the FPSCR XX bit (FE_INEXACT). Note that we cannot just set - // the FI bit here because that will not automatically set XX also, - // and XX is what libm interprets as the FE_INEXACT flag. - BuildMI(BB, dl, TII->get(PPC::MTFSB1)).addImm(/* 38 - 32 = */ 6); - BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); - - BB->addSuccessor(exitMBB); - - BB = exitMBB; } else { llvm_unreachable("Unexpected instr type to insert"); } @@ -6677,15 +6741,16 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Target Optimization Hooks //===----------------------------------------------------------------------===// -SDValue PPCTargetLowering::DAGCombineFastRecip(SDNode *N, - DAGCombinerInfo &DCI, - bool UseOperand) const { +SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op, + DAGCombinerInfo &DCI) const { if (DCI.isAfterLegalizeVectorOps()) return SDValue(); - if ((N->getValueType(0) == MVT::f32 && PPCSubTarget.hasFRES()) || - (N->getValueType(0) == MVT::f64 && PPCSubTarget.hasFRE()) || - (N->getValueType(0) == MVT::v4f32 && PPCSubTarget.hasAltivec())) { + EVT VT = Op.getValueType(); + + if ((VT == MVT::f32 && PPCSubTarget.hasFRES()) || + (VT == MVT::f64 && PPCSubTarget.hasFRE()) || + (VT == MVT::v4f32 && PPCSubTarget.hasAltivec())) { // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) // For the reciprocal, we need to find the zero of the function: @@ -6699,46 +6764,36 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDNode *N, // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has // 23 digits and double has 52 digits. int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3; - if (N->getValueType(0).getScalarType() == MVT::f64) + if (VT.getScalarType() == MVT::f64) ++Iterations; SelectionDAG &DAG = DCI.DAG; - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(Op); SDValue FPOne = - DAG.getConstantFP(1.0, N->getValueType(0).getScalarType()); - if (N->getValueType(0).isVector()) { - assert(N->getValueType(0).getVectorNumElements() == 4 && + DAG.getConstantFP(1.0, VT.getScalarType()); + if (VT.isVector()) { + assert(VT.getVectorNumElements() == 4 && "Unknown vector type"); - FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), + FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, FPOne, FPOne, FPOne, FPOne); } - SDValue Est = DAG.getNode(PPCISD::FRE, dl, - N->getValueType(0), - UseOperand ? N->getOperand(1) : - SDValue(N, 0)); + SDValue Est = DAG.getNode(PPCISD::FRE, dl, VT, Op); DCI.AddToWorklist(Est.getNode()); // Newton iterations: Est = Est + Est (1 - Arg * Est) for (int i = 0; i < Iterations; ++i) { - SDValue NewEst = DAG.getNode(ISD::FMUL, dl, - N->getValueType(0), - UseOperand ? 
N->getOperand(1) : - SDValue(N, 0), - Est); + SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Op, Est); DCI.AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FSUB, dl, - N->getValueType(0), FPOne, NewEst); + NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPOne, NewEst); DCI.AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FMUL, dl, - N->getValueType(0), Est, NewEst); + NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst); DCI.AddToWorklist(NewEst.getNode()); - Est = DAG.getNode(ISD::FADD, dl, - N->getValueType(0), Est, NewEst); + Est = DAG.getNode(ISD::FADD, dl, VT, Est, NewEst); DCI.AddToWorklist(Est.getNode()); } @@ -6748,14 +6803,16 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDNode *N, return SDValue(); } -SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDNode *N, +SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op, DAGCombinerInfo &DCI) const { if (DCI.isAfterLegalizeVectorOps()) return SDValue(); - if ((N->getValueType(0) == MVT::f32 && PPCSubTarget.hasFRSQRTES()) || - (N->getValueType(0) == MVT::f64 && PPCSubTarget.hasFRSQRTE()) || - (N->getValueType(0) == MVT::v4f32 && PPCSubTarget.hasAltivec())) { + EVT VT = Op.getValueType(); + + if ((VT == MVT::f32 && PPCSubTarget.hasFRSQRTES()) || + (VT == MVT::f64 && PPCSubTarget.hasFRSQRTE()) || + (VT == MVT::v4f32 && PPCSubTarget.hasAltivec())) { // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) // For the reciprocal sqrt, we need to find the zero of the function: @@ -6769,52 +6826,45 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDNode *N, // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has // 23 digits and double has 52 digits. int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3; - if (N->getValueType(0).getScalarType() == MVT::f64) + if (VT.getScalarType() == MVT::f64) ++Iterations; SelectionDAG &DAG = DCI.DAG; - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(Op); - SDValue FPThreeHalfs = - DAG.getConstantFP(1.5, N->getValueType(0).getScalarType()); - if (N->getValueType(0).isVector()) { - assert(N->getValueType(0).getVectorNumElements() == 4 && + SDValue FPThreeHalves = + DAG.getConstantFP(1.5, VT.getScalarType()); + if (VT.isVector()) { + assert(VT.getVectorNumElements() == 4 && "Unknown vector type"); - FPThreeHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), - FPThreeHalfs, FPThreeHalfs, - FPThreeHalfs, FPThreeHalfs); + FPThreeHalves = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, + FPThreeHalves, FPThreeHalves, + FPThreeHalves, FPThreeHalves); } - SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, - N->getValueType(0), N->getOperand(0)); + SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, VT, Op); DCI.AddToWorklist(Est.getNode()); // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that // this entire sequence requires only one FP constant. 
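// --- Editor's sketch (not part of the patch) --------------------------------
// A scalar model of the Newton refinement performed by DAGCombineFastRecip
// above, with a deliberately crude starting value standing in for the fre/fres
// hardware estimate.  Each step roughly squares the relative error, which is
// why the code uses three iterations (four for f64) from a ~2^-5 estimate but
// only one (two for f64) when hasRecipPrec() provides ~2^-14.
#include <cstdio>
static double refineRecip(double Arg, double Est, int Iterations) {
  for (int i = 0; i < Iterations; ++i)
    Est = Est + Est * (1.0 - Arg * Est); // same recurrence as the FRE DAG nodes
  return Est;
}
int main() {
  std::printf("%.17g\n", refineRecip(3.0, 0.34, 3)); // prints a value ~= 1.0/3.0
  return 0;
}
// -----------------------------------------------------------------------------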
- SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, N->getValueType(0), - FPThreeHalfs, N->getOperand(0)); + SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, VT, FPThreeHalves, Op); DCI.AddToWorklist(HalfArg.getNode()); - HalfArg = DAG.getNode(ISD::FSUB, dl, N->getValueType(0), - HalfArg, N->getOperand(0)); + HalfArg = DAG.getNode(ISD::FSUB, dl, VT, HalfArg, Op); DCI.AddToWorklist(HalfArg.getNode()); // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) for (int i = 0; i < Iterations; ++i) { - SDValue NewEst = DAG.getNode(ISD::FMUL, dl, - N->getValueType(0), Est, Est); + SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, Est); DCI.AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FMUL, dl, - N->getValueType(0), HalfArg, NewEst); + NewEst = DAG.getNode(ISD::FMUL, dl, VT, HalfArg, NewEst); DCI.AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FSUB, dl, - N->getValueType(0), FPThreeHalfs, NewEst); + NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPThreeHalves, NewEst); DCI.AddToWorklist(NewEst.getNode()); - Est = DAG.getNode(ISD::FMUL, dl, - N->getValueType(0), Est, NewEst); + Est = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst); DCI.AddToWorklist(Est.getNode()); } @@ -6824,11 +6874,120 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDNode *N, return SDValue(); } +// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does +// not enforce equality of the chain operands. +static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base, + unsigned Bytes, int Dist, + SelectionDAG &DAG) { + EVT VT = LS->getMemoryVT(); + if (VT.getSizeInBits() / 8 != Bytes) + return false; + + SDValue Loc = LS->getBasePtr(); + SDValue BaseLoc = Base->getBasePtr(); + if (Loc.getOpcode() == ISD::FrameIndex) { + if (BaseLoc.getOpcode() != ISD::FrameIndex) + return false; + const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + int FI = cast(Loc)->getIndex(); + int BFI = cast(BaseLoc)->getIndex(); + int FS = MFI->getObjectSize(FI); + int BFS = MFI->getObjectSize(BFI); + if (FS != BFS || FS != (int)Bytes) return false; + return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); + } + + // Handle X+C + if (DAG.isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc && + cast(Loc.getOperand(1))->getSExtValue() == Dist*Bytes) + return true; + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const GlobalValue *GV1 = NULL; + const GlobalValue *GV2 = NULL; + int64_t Offset1 = 0; + int64_t Offset2 = 0; + bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); + bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); + if (isGA1 && isGA2 && GV1 == GV2) + return Offset1 == (Offset2 + Dist*Bytes); + return false; +} + +// Return true is there is a nearyby consecutive load to the one provided +// (regardless of alignment). We search up and down the chain, looking though +// token factors and other loads (but nothing else). As a result, a true +// results indicates that it is safe to create a new consecutive load adjacent +// to the load provided. +static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { + SDValue Chain = LD->getChain(); + EVT VT = LD->getMemoryVT(); + + SmallSet LoadRoots; + SmallVector Queue(1, Chain.getNode()); + SmallSet Visited; + + // First, search up the chain, branching to follow all token-factor operands. + // If we find a consecutive load, then we're done, otherwise, record all + // nodes just above the top-level loads and token factors. 
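// --- Editor's sketch (not part of the patch) --------------------------------
// What "consecutive" means for isConsecutiveLS() above, reduced to plain
// offsets (hypothetical helper, for illustration only): the candidate access
// must start exactly Dist * Bytes after the base access, e.g. a 16-byte load
// at base+16 is at Dist = 1 from a 16-byte load at base.
#include <cstdint>
static bool isConsecutiveOffset(int64_t Off, int64_t BaseOff,
                                unsigned Bytes, int Dist) {
  return Off == BaseOff + int64_t(Dist) * Bytes; // e.g. 16 == 0 + 1*16
}
// -----------------------------------------------------------------------------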
+ while (!Queue.empty()) { + SDNode *ChainNext = Queue.pop_back_val(); + if (!Visited.insert(ChainNext)) + continue; + + if (LoadSDNode *ChainLD = dyn_cast(ChainNext)) { + if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) + return true; + + if (!Visited.count(ChainLD->getChain().getNode())) + Queue.push_back(ChainLD->getChain().getNode()); + } else if (ChainNext->getOpcode() == ISD::TokenFactor) { + for (SDNode::op_iterator O = ChainNext->op_begin(), + OE = ChainNext->op_end(); O != OE; ++O) + if (!Visited.count(O->getNode())) + Queue.push_back(O->getNode()); + } else + LoadRoots.insert(ChainNext); + } + + // Second, search down the chain, starting from the top-level nodes recorded + // in the first phase. These top-level nodes are the nodes just above all + // loads and token factors. Starting with their uses, recursively look though + // all loads (just the chain uses) and token factors to find a consecutive + // load. + Visited.clear(); + Queue.clear(); + + for (SmallSet::iterator I = LoadRoots.begin(), + IE = LoadRoots.end(); I != IE; ++I) { + Queue.push_back(*I); + + while (!Queue.empty()) { + SDNode *LoadRoot = Queue.pop_back_val(); + if (!Visited.insert(LoadRoot)) + continue; + + if (LoadSDNode *ChainLD = dyn_cast(LoadRoot)) + if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) + return true; + + for (SDNode::use_iterator UI = LoadRoot->use_begin(), + UE = LoadRoot->use_end(); UI != UE; ++UI) + if (((isa(*UI) && + cast(*UI)->getChain().getNode() == LoadRoot) || + UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI)) + Queue.push_back(*UI); + } + } + + return false; +} + SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { const TargetMachine &TM = getTargetMachine(); SelectionDAG &DAG = DCI.DAG; - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); switch (N->getOpcode()) { default: break; case PPCISD::SHL: @@ -6855,15 +7014,43 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, "Reciprocal estimates require UnsafeFPMath"); if (N->getOperand(1).getOpcode() == ISD::FSQRT) { - SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(1).getNode(), DCI); + SDValue RV = + DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0), DCI); if (RV.getNode() != 0) { DCI.AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), N->getOperand(0), RV); } + } else if (N->getOperand(1).getOpcode() == ISD::FP_EXTEND && + N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) { + SDValue RV = + DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0), + DCI); + if (RV.getNode() != 0) { + DCI.AddToWorklist(RV.getNode()); + RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)), + N->getValueType(0), RV); + DCI.AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), + N->getOperand(0), RV); + } + } else if (N->getOperand(1).getOpcode() == ISD::FP_ROUND && + N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) { + SDValue RV = + DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0), + DCI); + if (RV.getNode() != 0) { + DCI.AddToWorklist(RV.getNode()); + RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)), + N->getValueType(0), RV, + N->getOperand(1).getOperand(1)); + DCI.AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), + N->getOperand(0), RV); + } } - SDValue RV = DAGCombineFastRecip(N, DCI); + SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI); if (RV.getNode() != 0) { DCI.AddToWorklist(RV.getNode()); return 
DAG.getNode(ISD::FMUL, dl, N->getValueType(0), @@ -6878,12 +7065,32 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the // reciprocal sqrt. - SDValue RV = DAGCombineFastRecipFSQRT(N, DCI); + SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(0), DCI); if (RV.getNode() != 0) { DCI.AddToWorklist(RV.getNode()); - RV = DAGCombineFastRecip(RV.getNode(), DCI, false); - if (RV.getNode() != 0) + RV = DAGCombineFastRecip(RV, DCI); + if (RV.getNode() != 0) { + // Unfortunately, RV is now NaN if the input was exactly 0. Select out + // this case and force the answer to 0. + + EVT VT = RV.getValueType(); + + SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType()); + if (VT.isVector()) { + assert(VT.getVectorNumElements() == 4 && "Unknown vector type"); + Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero); + } + + SDValue ZeroCmp = + DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT), + N->getOperand(0), Zero, ISD::SETEQ); + DCI.AddToWorklist(ZeroCmp.getNode()); + DCI.AddToWorklist(RV.getNode()); + + RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT, + ZeroCmp, Zero, RV); return RV; + } } } @@ -6971,6 +7178,160 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, cast(N)->getMemoryVT(), cast(N)->getMemOperand()); } + break; + case ISD::LOAD: { + LoadSDNode *LD = cast(N); + EVT VT = LD->getValueType(0); + Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); + if (ISD::isNON_EXTLoad(N) && VT.isVector() && + TM.getSubtarget().hasAltivec() && + (VT == MVT::v16i8 || VT == MVT::v8i16 || + VT == MVT::v4i32 || VT == MVT::v4f32) && + LD->getAlignment() < ABIAlignment) { + // This is a type-legal unaligned Altivec load. + SDValue Chain = LD->getChain(); + SDValue Ptr = LD->getBasePtr(); + + // This implements the loading of unaligned vectors as described in + // the venerable Apple Velocity Engine overview. Specifically: + // https://developer.apple.com/hardwaredrivers/ve/alignment.html + // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html + // + // The general idea is to expand a sequence of one or more unaligned + // loads into a alignment-based permutation-control instruction (lvsl), + // a series of regular vector loads (which always truncate their + // input address to an aligned address), and a series of permutations. + // The results of these permutations are the requested loaded values. + // The trick is that the last "extra" load is not taken from the address + // you might suspect (sizeof(vector) bytes after the last requested + // load), but rather sizeof(vector) - 1 bytes after the last + // requested vector. The point of this is to avoid a page fault if the + // base address happend to be aligned. This works because if the base + // address is aligned, then adding less than a full vector length will + // cause the last vector in the sequence to be (re)loaded. Otherwise, + // the next vector will be fetched as you might suspect was necessary. + + // We might be able to reuse the permutation generation from + // a different base address offset from this one by an aligned amount. + // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this + // optimization later. 
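// --- Editor's sketch (not part of the patch) --------------------------------
// Source-level form of the misaligned-load idiom this combine rebuilds at the
// DAG level, written with the AltiVec C intrinsics.  Note the second load at
// +15 rather than +16: vec_ld truncates its address to a 16-byte boundary, so
// when p happens to be aligned this simply reloads the same vector instead of
// touching (and possibly faulting on) the next page, as described above.
#include <altivec.h>
static vector unsigned char LoadUnaligned(const unsigned char *p) {
  vector unsigned char Cntl = vec_lvsl(0, p); // permute control from p's low 4 bits
  vector unsigned char Lo   = vec_ld(0, p);   // aligned load covering p & ~15
  vector unsigned char Hi   = vec_ld(15, p);  // aligned load covering the tail bytes
  return vec_perm(Lo, Hi, Cntl);              // splice out the requested 16 bytes
}
// -----------------------------------------------------------------------------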
+ SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr, + DAG, dl, MVT::v16i8); + + // Refine the alignment of the original load (a "new" load created here + // which was identical to the first except for the alignment would be + // merged with the existing node regardless). + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(LD->getPointerInfo(), + LD->getMemOperand()->getFlags(), + LD->getMemoryVT().getStoreSize(), + ABIAlignment); + LD->refineAlignment(MMO); + SDValue BaseLoad = SDValue(LD, 0); + + // Note that the value of IncOffset (which is provided to the next + // load's pointer info offset value, and thus used to calculate the + // alignment), and the value of IncValue (which is actually used to + // increment the pointer value) are different! This is because we + // require the next load to appear to be aligned, even though it + // is actually offset from the base pointer by a lesser amount. + int IncOffset = VT.getSizeInBits() / 8; + int IncValue = IncOffset; + + // Walk (both up and down) the chain looking for another load at the real + // (aligned) offset (the alignment of the other load does not matter in + // this case). If found, then do not use the offset reduction trick, as + // that will prevent the loads from being later combined (as they would + // otherwise be duplicates). + if (!findConsecutiveLoad(LD, DAG)) + --IncValue; + + SDValue Increment = DAG.getConstant(IncValue, getPointerTy()); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + + SDValue ExtraLoad = + DAG.getLoad(VT, dl, Chain, Ptr, + LD->getPointerInfo().getWithOffset(IncOffset), + LD->isVolatile(), LD->isNonTemporal(), + LD->isInvariant(), ABIAlignment); + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + BaseLoad.getValue(1), ExtraLoad.getValue(1)); + + if (BaseLoad.getValueType() != MVT::v4i32) + BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad); + + if (ExtraLoad.getValueType() != MVT::v4i32) + ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad); + + SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, + BaseLoad, ExtraLoad, PermCntl, DAG, dl); + + if (VT != MVT::v4i32) + Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm); + + // Now we need to be really careful about how we update the users of the + // original load. We cannot just call DCI.CombineTo (or + // DAG.ReplaceAllUsesWith for that matter), because the load still has + // uses created here (the permutation for example) that need to stay. + SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); + while (UI != UE) { + SDUse &Use = UI.getUse(); + SDNode *User = *UI; + // Note: BaseLoad is checked here because it might not be N, but a + // bitcast of N. + if (User == Perm.getNode() || User == BaseLoad.getNode() || + User == TF.getNode() || Use.getResNo() > 1) { + ++UI; + continue; + } + + SDValue To = Use.getResNo() ? 
TF : Perm; + ++UI; + + SmallVector Ops; + for (SDNode::op_iterator O = User->op_begin(), + OE = User->op_end(); O != OE; ++O) { + if (*O == Use) + Ops.push_back(To); + else + Ops.push_back(*O); + } + + DAG.UpdateNodeOperands(User, Ops.data(), Ops.size()); + } + + return SDValue(N, 0); + } + } + break; + case ISD::INTRINSIC_WO_CHAIN: + if (cast(N->getOperand(0))->getZExtValue() == + Intrinsic::ppc_altivec_lvsl && + N->getOperand(1)->getOpcode() == ISD::ADD) { + SDValue Add = N->getOperand(1); + + if (DAG.MaskedValueIsZero(Add->getOperand(1), + APInt::getAllOnesValue(4 /* 16 byte alignment */).zext( + Add.getValueType().getScalarType().getSizeInBits()))) { + SDNode *BasePtr = Add->getOperand(0).getNode(); + for (SDNode::use_iterator UI = BasePtr->use_begin(), + UE = BasePtr->use_end(); UI != UE; ++UI) { + if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && + cast(UI->getOperand(0))->getZExtValue() == + Intrinsic::ppc_altivec_lvsl) { + // We've found another LVSL, and this address if an aligned + // multiple of that one. The results will be the same, so use the + // one we've just found instead. + + return SDValue(*UI, 0); + } + } + } + } + break; case ISD::BSWAP: // Turn BSWAP (LOAD) -> lhbrx/lwbrx. @@ -7056,20 +7417,53 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } } - // If the user is a MFCR instruction, we know this is safe. Otherwise we - // give up for right now. - if (FlagUser->getOpcode() == PPCISD::MFCR) + // If the user is a MFOCRF instruction, we know this is safe. + // Otherwise we give up for right now. + if (FlagUser->getOpcode() == PPCISD::MFOCRF) return SDValue(VCMPoNode, 0); } break; } case ISD::BR_CC: { // If this is a branch on an altivec predicate comparison, lower this so - // that we don't have to do a MFCR: instead, branch directly on CR6. This + // that we don't have to do a MFOCRF: instead, branch directly on CR6. This // lowering is done pre-legalize, because the legalizer lowers the predicate // compare down to code that is difficult to reassemble. ISD::CondCode CC = cast(N->getOperand(1))->get(); SDValue LHS = N->getOperand(2), RHS = N->getOperand(3); + + // Sometimes the promoted value of the intrinsic is ANDed by some non-zero + // value. If so, pass-through the AND to get to the intrinsic. + if (LHS.getOpcode() == ISD::AND && + LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && + cast(LHS.getOperand(0).getOperand(1))->getZExtValue() == + Intrinsic::ppc_is_decremented_ctr_nonzero && + isa(LHS.getOperand(1)) && + !cast(LHS.getOperand(1))->getConstantIntValue()-> + isZero()) + LHS = LHS.getOperand(0); + + if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && + cast(LHS.getOperand(1))->getZExtValue() == + Intrinsic::ppc_is_decremented_ctr_nonzero && + isa(RHS)) { + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && + "Counter decrement comparison is not EQ or NE"); + + unsigned Val = cast(RHS)->getZExtValue(); + bool isBDNZ = (CC == ISD::SETEQ && Val) || + (CC == ISD::SETNE && !Val); + + // We now need to make the intrinsic dead (it cannot be instruction + // selected). + DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0)); + assert(LHS.getNode()->hasOneUse() && + "Counter decrement has more than one use"); + + return DAG.getNode(isBDNZ ? 
PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other, + N->getOperand(0), N->getOperand(4)); + } + int CompareOpc; bool isDot; @@ -7244,7 +7638,7 @@ PPCTargetLowering::getSingleConstraintMatchWeight( std::pair PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const { + MVT VT) const { if (Constraint.size() == 1) { // GCC RS6000 Constraint Letters switch (Constraint[0]) { @@ -7269,7 +7663,24 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } } - return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + std::pair R = + TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + + // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers + // (which we call X[0-9]+). If a 64-bit value has been requested, and a + // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent + // register. + // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use + // the AsmName field from *RegisterInfo.td, then this would not be necessary. + if (R.first && VT == MVT::i64 && PPCSubTarget.isPPC64() && + PPC::GPRCRegClass.contains(R.first)) { + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + return std::make_pair(TRI->getMatchingSuperReg(R.first, + PPC::sub_32, &PPC::G8RCRegClass), + &PPC::G8RCRegClass); + } + + return R; } @@ -7379,25 +7790,19 @@ bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM, return true; } -/// isLegalAddressImmediate - Return true if the integer value can be used -/// as the offset of the target addressing mode for load / store of the -/// given type. -bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,Type *Ty) const{ - // PPC allows a sign-extended 16-bit immediate field. - return (V > -(1 << 16) && V < (1 << 16)-1); -} - -bool PPCTargetLowering::isLegalAddressImmediate(GlobalValue* GV) const { - return false; -} - SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setReturnAddressIsTaken(true); - DebugLoc dl = Op.getDebugLoc(); + if (!isa(Op.getOperand(0))) { + DAG.getContext()->emitError("argument to '__builtin_return_address' must " + "be a constant integer"); + return SDValue(); + } + + SDLoc dl(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); // Make sure the function does not optimize away the store of the RA to @@ -7427,7 +7832,7 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); @@ -7510,18 +7915,15 @@ bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, return true; } -/// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than -/// a pair of mul and add instructions. fmuladd intrinsics will be expanded to -/// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd -/// is expanded to mul + add. 
-bool PPCTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const {
+bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
   if (!VT.isSimple())
     return false;
 
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::f32:
   case MVT::f64:
-  case MVT::v4f32:
     return true;
   default:
     break;
@@ -7531,9 +7933,15 @@ bool PPCTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const {
 }
 
 Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
-  if (DisableILPPref)
+  if (DisableILPPref || PPCSubTarget.enableMachineScheduler())
     return TargetLowering::getSchedulingPreference(N);
 
   return Sched::ILP;
 }
+// Create a fast isel object.
+FastISel *
+PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                  const TargetLibraryInfo *LibInfo) const {
+  return PPC::createFastISel(FuncInfo, LibInfo);
+}
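// --- Editor's note (not part of the patch) -----------------------------------
// The BR_CC combine earlier in this file exists so that branches guarded by
// Intrinsic::ppc_is_decremented_ctr_nonzero can be selected directly as
// count-register branches (bdnz/bdz) rather than via a compare.  A trivial C++
// model of the bdnz behaviour the combine targets, assuming TripCount >= 1:
static void countedLoop(unsigned long TripCount) {
  unsigned long CTR = TripCount; // mtctr: load the trip count into CTR
  do {
    /* loop body */
  } while (--CTR != 0);          // bdnz: decrement CTR, branch while non-zero
}
// -----------------------------------------------------------------------------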