X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FPowerPC%2FPPCISelLowering.cpp;h=61003cf1b340798020fbee93cf3a9fb73f60732e;hb=875710a2fd6b3c4f814961582594bd5c1cdb493a;hp=e2826f726dda57223227982d2eafd75c36c1ed2d;hpb=d644d17dd44e89ffd2dcb6896bc23b4c478c37cf;p=oota-llvm.git diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index e2826f726dd..61003cf1b34 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -453,6 +453,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::BUILD_VECTOR, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); @@ -526,11 +528,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // Altivec does not contain unordered floating-point compare instructions setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETUGT, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); @@ -561,11 +558,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // Share the Altivec comparison restrictions. setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETUGT, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETUGE, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETULT, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETULE, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand); @@ -626,6 +618,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // Altivec instructions set fields to all zeros or all ones. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + if (!isPPC64) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + } + if (isPPC64) { setStackPointerRegisterToSaveRestore(PPC::X1); setExceptionPointerRegister(PPC::X3); @@ -685,11 +684,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) if (Subtarget.isDarwin()) setPrefFunctionAlignment(4); - if (isPPC64 && Subtarget.isJITCodeModel()) - // Temporary workaround for the inability of PPC64 JIT to handle jump - // tables. - setSupportJumpTables(false); - setInsertFencesForAtomic(true); if (Subtarget.enableMachineScheduler()) @@ -853,14 +847,27 @@ static bool isConstantOrUndef(int Op, int Val) { /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUHUM instruction. -bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, +/// The ShuffleKind distinguishes between big-endian operations with +/// two different inputs (0), either-endian operations with two identical +/// inputs (1), and little-endian operantion with two different inputs (2). +/// For the latter, the input operands are swapped (see PPCInstrAltivec.td). +bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 1; - if (!isUnary) { + bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian(); + if (ShuffleKind == 0) { + if (IsLE) + return false; for (unsigned i = 0; i != 16; ++i) - if (!isConstantOrUndef(N->getMaskElt(i), i*2+j)) + if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) return false; - } else { + } else if (ShuffleKind == 2) { + if (!IsLE) + return false; + for (unsigned i = 0; i != 16; ++i) + if (!isConstantOrUndef(N->getMaskElt(i), i*2)) + return false; + } else if (ShuffleKind == 1) { + unsigned j = IsLE ? 0 : 1; for (unsigned i = 0; i != 8; ++i) if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)) @@ -871,27 +878,34 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUWUM instruction. -bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, +/// The ShuffleKind distinguishes between big-endian operations with +/// two different inputs (0), either-endian operations with two identical +/// inputs (1), and little-endian operantion with two different inputs (2). +/// For the latter, the input operands are swapped (see PPCInstrAltivec.td). +bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - unsigned j, k; - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { - j = 0; - k = 1; - } else { - j = 2; - k = 3; - } - if (!isUnary) { + bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian(); + if (ShuffleKind == 0) { + if (IsLE) + return false; for (unsigned i = 0; i != 16; i += 2) - if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || - !isConstantOrUndef(N->getMaskElt(i+1), i*2+k)) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) return false; - } else { + } else if (ShuffleKind == 2) { + if (!IsLE) + return false; + for (unsigned i = 0; i != 16; i += 2) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+1)) + return false; + } else if (ShuffleKind == 1) { + unsigned j = IsLE ? 0 : 2; for (unsigned i = 0; i != 8; i += 2) - if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || - !isConstantOrUndef(N->getMaskElt(i+1), i*2+k) || - !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || - !isConstantOrUndef(N->getMaskElt(i+9), i*2+k)) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || + !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1)) return false; } return true; @@ -919,38 +933,63 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). +/// The ShuffleKind distinguishes between big-endian merges with two +/// different inputs (0), either-endian merges with two identical inputs (1), +/// and little-endian merges with two different inputs (2). For the latter, +/// the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary, SelectionDAG &DAG) { - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { - if (!isUnary) + unsigned ShuffleKind, SelectionDAG &DAG) { + if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 0, 0); + else if (ShuffleKind == 2) // swapped return isVMerge(N, UnitSize, 0, 16); - return isVMerge(N, UnitSize, 0, 0); + else + return false; } else { - if (!isUnary) + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 8, 8); + else if (ShuffleKind == 0) // normal return isVMerge(N, UnitSize, 8, 24); - return isVMerge(N, UnitSize, 8, 8); + else + return false; } } /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). +/// The ShuffleKind distinguishes between big-endian merges with two +/// different inputs (0), either-endian merges with two identical inputs (1), +/// and little-endian merges with two different inputs (2). For the latter, +/// the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary, SelectionDAG &DAG) { - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { - if (!isUnary) + unsigned ShuffleKind, SelectionDAG &DAG) { + if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 8, 8); + else if (ShuffleKind == 2) // swapped return isVMerge(N, UnitSize, 8, 24); - return isVMerge(N, UnitSize, 8, 8); + else + return false; } else { - if (!isUnary) + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 0, 0); + else if (ShuffleKind == 0) // normal return isVMerge(N, UnitSize, 0, 16); - return isVMerge(N, UnitSize, 0, 0); + else + return false; } } /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. -int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) { +/// The ShuffleKind distinguishes between big-endian operations with two +/// different inputs (0), either-endian operations with two identical inputs +/// (1), and little-endian operations with two different inputs (2). For the +/// latter, the input operands are swapped (see PPCInstrAltivec.td). +int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG) { if (N->getValueType(0) != MVT::v16i8) return -1; @@ -968,38 +1007,26 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) { unsigned ShiftAmt = SVOp->getMaskElt(i); if (ShiftAmt < i) return -1; - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { - - ShiftAmt += i; - - if (!isUnary) { - // Check the rest of the elements to see if they are consecutive. - for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt - i)) - return -1; - } else { - // Check the rest of the elements to see if they are consecutive. - for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt - i) & 15)) - return -1; - } - - } else { // Big Endian + ShiftAmt -= i; + bool isLE = DAG.getTarget().getSubtargetImpl()->getDataLayout()-> + isLittleEndian(); + + if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) + return -1; + } else if (ShuffleKind == 1) { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) + return -1; + } else + return -1; - ShiftAmt -= i; + if (ShuffleKind == 2 && isLE) + ShiftAmt = 16 - ShiftAmt; - if (!isUnary) { - // Check the rest of the elements to see if they are consecutive. - for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) - return -1; - } else { - // Check the rest of the elements to see if they are consecutive. - for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) - return -1; - } - } return ShiftAmt; } @@ -1055,7 +1082,7 @@ unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(N); assert(isSplatShuffleMask(SVOp, EltSize)); - if (DAG.getTarget().getDataLayout()->isLittleEndian()) + if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); else return SVOp->getMaskElt(0) / EltSize; @@ -1331,7 +1358,13 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't // carry. - Base = N.getOperand(0); + if (FrameIndexSDNode *FI = + dyn_cast(N.getOperand(0))) { + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); + } else { + Base = N.getOperand(0); + } Disp = DAG.getTargetConstant(imm, N.getValueType()); return true; } @@ -1650,47 +1683,61 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, if (Model == TLSModel::GeneralDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); - SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); - SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, - GOTReg, TGA); + SDValue GOTPtr; + if (is64bit) { + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, + GOTReg, TGA); + } else { + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT, - GOTEntryHi, TGA); + GOTPtr, TGA); // We need a chain node, and don't have one handy. The underlying // call has no side effects, so using the function entry node // suffices. SDValue Chain = DAG.getEntryNode(); - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry); - SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64); + Chain = DAG.getCopyToReg(Chain, dl, + is64bit ? PPC::X3 : PPC::R3, GOTEntry); + SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3, + is64bit ? MVT::i64 : MVT::i32); SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl, PtrVT, ParmReg, TGA); // The return value from GET_TLS_ADDR really is in X3 already, but // some hacks are needed here to tie everything together. The extra // copies dissolve during subsequent transforms. - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr); - return DAG.getCopyFromReg(Chain, dl, PPC::X3, PtrVT); + Chain = DAG.getCopyToReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, TLSAddr); + return DAG.getCopyFromReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, PtrVT); } if (Model == TLSModel::LocalDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); - SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); - SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, - GOTReg, TGA); + SDValue GOTPtr; + if (is64bit) { + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, + GOTReg, TGA); + } else { + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT, - GOTEntryHi, TGA); + GOTPtr, TGA); // We need a chain node, and don't have one handy. The underlying // call has no side effects, so using the function entry node // suffices. SDValue Chain = DAG.getEntryNode(); - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry); - SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64); + Chain = DAG.getCopyToReg(Chain, dl, + is64bit ? PPC::X3 : PPC::R3, GOTEntry); + SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3, + is64bit ? MVT::i64 : MVT::i32); SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl, PtrVT, ParmReg, TGA); // The return value from GET_TLSLD_ADDR really is in X3 already, but // some hacks are needed here to tie everything together. The extra // copies dissolve during subsequent transforms. - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr); + Chain = DAG.getCopyToReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, TLSAddr); SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT, Chain, ParmReg, TGA); return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); @@ -1819,7 +1866,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, // gpr_index SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, VAListPtr, MachinePointerInfo(SV), MVT::i8, - false, false, 0); + false, false, false, 0); InChain = GprIndex.getValue(1); if (VT == MVT::i64) { @@ -1842,7 +1889,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, // fpr SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, FprPtr, MachinePointerInfo(SV), MVT::i8, - false, false, 0); + false, false, false, 0); InChain = FprIndex.getValue(1); SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, @@ -2152,14 +2199,19 @@ static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, unsigned ArgSize = ArgVT.getStoreSize(); if (Flags.isByVal()) ArgSize = Flags.getByValSize(); - ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + + // Round up to multiples of the pointer size, except for array members, + // which are always packed. + if (!Flags.isInConsecutiveRegs()) + ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; return ArgSize; } /// CalculateStackSlotAlignment - Calculates the alignment of this argument /// on the stack. -static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags, +static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, unsigned PtrByteSize) { unsigned Align = PtrByteSize; @@ -2181,14 +2233,78 @@ static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags, } } + // Array members are always packed to their original alignment. + if (Flags.isInConsecutiveRegs()) { + // If the array member was split into multiple registers, the first + // needs to be aligned to the size of the full type. (Except for + // ppcf128, which is only aligned as its f64 components.) + if (Flags.isSplit() && OrigVT != MVT::ppcf128) + Align = OrigVT.getStoreSize(); + else + Align = ArgVT.getStoreSize(); + } + return Align; } +/// CalculateStackSlotUsed - Return whether this argument will use its +/// stack slot (instead of being passed in registers). ArgOffset, +/// AvailableFPRs, and AvailableVRs must hold the current argument +/// position, and will be updated to account for this argument. +static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, + unsigned PtrByteSize, + unsigned LinkageSize, + unsigned ParamAreaSize, + unsigned &ArgOffset, + unsigned &AvailableFPRs, + unsigned &AvailableVRs) { + bool UseMemory = false; + + // Respect alignment of argument on the stack. + unsigned Align = + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; + // If there's no space left in the argument save area, we must + // use memory (this check also catches zero-sized arguments). + if (ArgOffset >= LinkageSize + ParamAreaSize) + UseMemory = true; + + // Allocate argument on the stack. + ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + // If we overran the argument save area, we must use memory + // (this check catches arguments passed partially in memory) + if (ArgOffset > LinkageSize + ParamAreaSize) + UseMemory = true; + + // However, if the argument is actually passed in an FPR or a VR, + // we don't use memory after all. + if (!Flags.isByVal()) { + if (ArgVT == MVT::f32 || ArgVT == MVT::f64) + if (AvailableFPRs > 0) { + --AvailableFPRs; + return false; + } + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) + if (AvailableVRs > 0) { + --AvailableVRs; + return false; + } + } + + return UseMemory; +} + /// EnsureStackAlignment - Round stack frame size up from NumBytes to /// ensure minimum alignment required for target. static unsigned EnsureStackAlignment(const TargetMachine &Target, unsigned NumBytes) { - unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment(); + unsigned TargetAlign = + Target.getSubtargetImpl()->getFrameLowering()->getStackAlignment(); unsigned AlignMask = TargetAlign - 1; NumBytes = (NumBytes + AlignMask) & ~AlignMask; return NumBytes; @@ -2265,11 +2381,11 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // Assign locations to all of the incoming arguments. SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // Reserve space for the linkage area on the stack. - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false); CCInfo.AllocateStack(LinkageSize, PtrByteSize); CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); @@ -2340,7 +2456,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // caller's stack frame, right above the parameter list area. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ByValArgLocs, *DAG.getContext()); + ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -2462,6 +2578,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // + bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2473,8 +2590,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( (CallConv == CallingConv::Fast)); unsigned PtrByteSize = 8; - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false); - unsigned ArgOffset = LinkageSize; + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, + isELFv2ABI); static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -2496,12 +2613,29 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( const unsigned Num_FPR_Regs = 13; const unsigned Num_VR_Regs = array_lengthof(VR); - unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; + // Do a first pass over the arguments to determine whether the ABI + // guarantees that our caller has allocated the parameter save area + // on its stack frame. In the ELFv1 ABI, this is always the case; + // in the ELFv2 ABI, it is true if this is a vararg function or if + // any parameter is located in a stack slot. + + bool HasParameterArea = !isELFv2ABI || isVarArg; + unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; + unsigned NumBytes = LinkageSize; + unsigned AvailableFPRs = Num_FPR_Regs; + unsigned AvailableVRs = Num_VR_Regs; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) + if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytes, AvailableFPRs, AvailableVRs)) + HasParameterArea = true; // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, // although the first ones are often in registers. + unsigned ArgOffset = LinkageSize; + unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; SmallVector MemOps; Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; @@ -2509,6 +2643,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; + EVT OrigVT = Ins[ArgNo].ArgVT; unsigned ObjSize = ObjectVT.getStoreSize(); unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; @@ -2517,7 +2652,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( /* Respect alignment of argument on the stack. */ unsigned Align = - CalculateStackSlotAlignment(ObjectVT, Flags, PtrByteSize); + CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; unsigned CurArgOffset = ArgOffset; @@ -2545,15 +2680,31 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( continue; } - // All aggregates smaller than 8 bytes must be passed right-justified. - if (ObjSize < PtrByteSize && !isLittleEndian) - CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize); - // The value of the object is its address. - int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true); + // Create a stack object covering all stack doublewords occupied + // by the argument. If the argument is (fully or partially) on + // the stack, or if the argument is fully in registers but the + // caller has allocated the parameter save anyway, we can refer + // directly to the caller's stack frame. Otherwise, create a + // local copy in our own frame. + int FI; + if (HasParameterArea || + ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) + FI = MFI->CreateFixedObject(ArgSize, ArgOffset, false); + else + FI = MFI->CreateStackObject(ArgSize, Align, false); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - InVals.push_back(FIN); - if (ObjSize < 8) { + // Handle aggregates smaller than 8 bytes. + if (ObjSize < PtrByteSize) { + // The value of the object is its address, which differs from the + // address of the enclosing doubleword on big-endian systems. + SDValue Arg = FIN; + if (!isLittleEndian) { + SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, PtrVT); + Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); + } + InVals.push_back(Arg); + if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); @@ -2562,18 +2713,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( if (ObjSize==1 || ObjSize==2 || ObjSize==4) { EVT ObjType = (ObjSize == 1 ? MVT::i8 : (ObjSize == 2 ? MVT::i16 : MVT::i32)); - Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, + Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, MachinePointerInfo(FuncArg), ObjType, false, false, 0); } else { // For sizes that don't fit a truncating store (3, 5, 6, 7), // store the whole register as-is to the parameter save area - // slot. The address of the parameter was already calculated - // above (InVals.push_back(FIN)) to be the right-justified - // offset within the slot. For this store, we need a new - // frame index that points at the beginning of the slot. - int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + // slot. Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(FuncArg), false, false, 0); @@ -2587,27 +2733,29 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( continue; } + // The value of the object is its address, which is the address of + // its first stack doubleword. + InVals.push_back(FIN); + + // Store whatever pieces of the object are in registers to memory. for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { - // Store whatever pieces of the object are in registers - // to memory. ArgOffset will be the address of the beginning - // of the object. - if (GPR_idx != Num_GPR_Regs) { - unsigned VReg; - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); - int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg, j), - false, false, 0); - MemOps.push_back(Store); - ++GPR_idx; - ArgOffset += PtrByteSize; - } else { - ArgOffset += ArgSize - j; + if (GPR_idx == Num_GPR_Regs) break; + + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Addr = FIN; + if (j) { + SDValue Off = DAG.getConstant(j, PtrVT); + Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); } + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, + MachinePointerInfo(FuncArg, j), + false, false, 0); + MemOps.push_back(Store); + ++GPR_idx; } + ArgOffset += ArgSize; continue; } @@ -2616,6 +2764,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::i1: case MVT::i32: case MVT::i64: + // These can be scalar arguments or elements of an integer array type + // passed directly. Clang may use those instead of "byval" aggregate + // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); @@ -2633,6 +2784,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::f32: case MVT::f64: + // These can be scalar arguments or elements of a float array type + // passed directly. The latter are used to implement ELFv2 homogenous + // float aggregates. if (FPR_idx != Num_FPR_Regs) { unsigned VReg; @@ -2645,12 +2799,32 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++FPR_idx; + } else if (GPR_idx != Num_GPR_Regs) { + // This can only ever happen in the presence of f32 array types, + // since otherwise we never run out of FPRs before running out + // of GPRs. + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::f32) { + if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) + ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, + DAG.getConstant(32, MVT::i32)); + ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); + } + + ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); } else { needsLoad = true; - ArgSize = PtrByteSize; } - ArgOffset += 8; + // When passing an array of floats, the array occupies consecutive + // space in the argument area; only round up to the next doubleword + // at the end of the array. Otherwise, each float takes 8 bytes. + ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; + ArgOffset += ArgSize; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; break; case MVT::v4f32: case MVT::v4i32: @@ -2658,6 +2832,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. if (VR_idx != Num_VR_Regs) { unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ? MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) : @@ -2687,7 +2864,10 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // Area that is at least reserved in the caller of this function. unsigned MinReservedArea; - MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); + if (HasParameterArea) + MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); + else + MinReservedArea = LinkageSize; // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that @@ -2748,7 +2928,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin( (CallConv == CallingConv::Fast)); unsigned PtrByteSize = isPPC64 ? 8 : 4; - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true, + false); unsigned ArgOffset = LinkageSize; // Area that is at least reserved in caller of this function. unsigned MinReservedArea = ArgOffset; @@ -3361,6 +3542,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isELFv2ABI = Subtarget.isELFv2ABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); NodeTys.push_back(MVT::Other); // Returns a chain @@ -3377,33 +3559,27 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, } if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201 - // Use indirect calls for ALL functions calls in JIT mode, since the - // far-call stubs may be outside relocation limits for a BL instruction. - if (!DAG.getTarget().getSubtarget().isJITCodeModel()) { - unsigned OpFlags = 0; - if ((DAG.getTarget().getRelocationModel() != Reloc::Static && - (Subtarget.getTargetTriple().isMacOSX() && - Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && - (G->getGlobal()->isDeclaration() || - G->getGlobal()->isWeakForLinker())) || - (Subtarget.isTargetELF() && !isPPC64 && - !G->getGlobal()->hasLocalLinkage() && - DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { - // PC-relative references to external symbols should go through $stub, - // unless we're building with the leopard linker or later, which - // automatically synthesizes these stubs. - OpFlags = PPCII::MO_PLT_OR_STUB; - } - - // If the callee is a GlobalAddress/ExternalSymbol node (quite common, - // every direct call is) turn it into a TargetGlobalAddress / - // TargetExternalSymbol node so that legalize doesn't hack it. - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, - Callee.getValueType(), - 0, OpFlags); - needIndirectCall = false; + unsigned OpFlags = 0; + if ((DAG.getTarget().getRelocationModel() != Reloc::Static && + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && + (G->getGlobal()->isDeclaration() || + G->getGlobal()->isWeakForLinker())) || + (Subtarget.isTargetELF() && !isPPC64 && + !G->getGlobal()->hasLocalLinkage() && + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = PPCII::MO_PLT_OR_STUB; } + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, + // every direct call is) turn it into a TargetGlobalAddress / + // TargetExternalSymbol node so that legalize doesn't hack it. + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, + Callee.getValueType(), 0, OpFlags); + needIndirectCall = false; } if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { @@ -3430,7 +3606,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // to do the call, we can't use PPCISD::CALL. SDValue MTCTROps[] = {Chain, Callee, InFlag}; - if (isSVR4ABI && isPPC64) { + if (isSVR4ABI && isPPC64 && !isELFv2ABI) { // Function pointers in the 64-bit SVR4 ABI do not point to the function // entry point, but to the function descriptor (the function entry point // address is part of the function descriptor though). @@ -3510,7 +3686,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, CallOpc = PPCISD::BCTRL; Callee.setNode(nullptr); // Add use of X11 (holding environment pointer) - if (isSVR4ABI && isPPC64) + if (isSVR4ABI && isPPC64 && !isELFv2ABI) Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) @@ -3532,6 +3708,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); + // Direct calls in the ELFv2 ABI need the TOC register live into the call. + if (Callee.getNode() && isELFv2ABI) + Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); + return CallOpc; } @@ -3552,8 +3732,8 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVectorImpl &InVals) const { SmallVector RVLocs; - CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); // Copy all of the result registers out of their specified physreg. @@ -3601,6 +3781,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, int SPDiff, unsigned NumBytes, const SmallVectorImpl &Ins, SmallVectorImpl &InVals) const { + + bool isELFv2ABI = Subtarget.isELFv2ABI(); std::vector NodeTys; SmallVector Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff, @@ -3619,7 +3801,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; // Add a register mask operand representing the call-preserved registers. - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3676,7 +3859,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); - unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI); SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset); SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag); @@ -3765,11 +3948,12 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing arguments. SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // Reserve space for the linkage area on the stack. - CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); + CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false), + PtrByteSize); if (isVarArg) { // Handle fixed and variable vector arguments differently. @@ -3806,7 +3990,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing aggregate by value arguments. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ByValArgLocs, *DAG.getContext()); + ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -3978,6 +4162,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { + bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); @@ -3996,21 +4181,27 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, MF.getInfo()->setHasFastCall(); // Count how many bytes are to be pushed on the stack, including the linkage - // area, and parameter passing area. We start with at least 48 bytes, which - // is reserved space for [SP][CR][LR][3 x unused]. - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false); + // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes + // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage + // area is 32 bytes reserved space for [SP][CR][LR][TOC]. + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, + isELFv2ABI); unsigned NumBytes = LinkageSize; // Add up all the space actually used. for (unsigned i = 0; i != NumOps; ++i) { ISD::ArgFlagsTy Flags = Outs[i].Flags; EVT ArgVT = Outs[i].VT; + EVT OrigVT = Outs[i].ArgVT; /* Respect alignment of argument on the stack. */ - unsigned Align = CalculateStackSlotAlignment(ArgVT, Flags, PtrByteSize); + unsigned Align = + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); NumBytes = ((NumBytes + Align - 1) / Align) * Align; NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + if (Flags.isInConsecutiveRegsLast()) + NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } unsigned NumBytesActuallyUsed = NumBytes; @@ -4020,6 +4211,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // Because we cannot tell if this is needed on the caller side, we have to // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. + // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area. NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); // Tail call needs the stack to be aligned. @@ -4086,10 +4278,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; + EVT ArgVT = Outs[i].VT; + EVT OrigVT = Outs[i].ArgVT; /* Respect alignment of argument on the stack. */ unsigned Align = - CalculateStackSlotAlignment(Outs[i].VT, Flags, PtrByteSize); + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; /* Compute GPR index associated with argument offset. */ @@ -4133,7 +4327,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT, - false, false, 0); + false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); @@ -4229,6 +4423,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, case MVT::i1: case MVT::i32: case MVT::i64: + // These can be scalar arguments or elements of an integer array type + // passed directly. Clang may use those instead of "byval" aggregate + // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != NumGPRs) { RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg)); } else { @@ -4239,39 +4436,70 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, ArgOffset += PtrByteSize; break; case MVT::f32: - case MVT::f64: - if (FPR_idx != NumFPRs) { + case MVT::f64: { + // These can be scalar arguments or elements of a float array type + // passed directly. The latter are used to implement ELFv2 homogenous + // float aggregates. + + // Named arguments go into FPRs first, and once they overflow, the + // remaining arguments go into GPRs and then the parameter save area. + // Unnamed arguments for vararg functions always go to GPRs and + // then the parameter save area. For now, put all arguments to vararg + // routines always in both locations (FPR *and* GPR or stack slot). + bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs; + + // First load the argument into the next available FPR. + if (FPR_idx != NumFPRs) RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); - if (isVarArg) { - // A single float or an aggregate containing only a single float - // must be passed right-justified in the stack doubleword, and - // in the GPR, if one is available. - SDValue StoreOff; - if (Arg.getSimpleValueType().SimpleTy == MVT::f32 && - !isLittleEndian) { - SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); - StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); - } else - StoreOff = PtrOff; - - SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff, - MachinePointerInfo(), false, false, 0); - MemOpChains.push_back(Store); - - // Float varargs are always shadowed in available integer registers - if (GPR_idx != NumGPRs) { - SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff, - MachinePointerInfo(), false, false, - false, 0); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); - } - } + // Next, load the argument into GPR or stack slot if needed. + if (!NeedGPROrStack) + ; + else if (GPR_idx != NumGPRs) { + // In the non-vararg case, this can only ever happen in the + // presence of f32 array types, since otherwise we never run + // out of FPRs before running out of GPRs. + SDValue ArgVal; + + // Double values are always passed in a single GPR. + if (Arg.getValueType() != MVT::f32) { + ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); + + // Non-array float values are extended and passed in a GPR. + } else if (!Flags.isInConsecutiveRegs()) { + ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); + ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); + + // If we have an array of floats, we collect every odd element + // together with its predecessor into one GPR. + } else if (ArgOffset % PtrByteSize != 0) { + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]); + Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); + if (!isLittleEndian) + std::swap(Lo, Hi); + ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); + + // The final element, if even, goes into the first half of a GPR. + } else if (Flags.isInConsecutiveRegsLast()) { + ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); + ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); + if (!isLittleEndian) + ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal, + DAG.getConstant(32, MVT::i32)); + + // Non-final even elements are skipped; they will be handled + // together the with subsequent argument on the next go-around. + } else + ArgVal = SDValue(); + + if (ArgVal.getNode()) + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal)); } else { // Single-precision floating-point values are mapped to the // second (rightmost) word of the stack doubleword. - if (Arg.getValueType() == MVT::f32 && !isLittleEndian) { + if (Arg.getValueType() == MVT::f32 && + !isLittleEndian && !Flags.isInConsecutiveRegs()) { SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } @@ -4280,14 +4508,25 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, true, isTailCall, false, MemOpChains, TailCallArguments, dl); } - ArgOffset += 8; + // When passing an array of floats, the array occupies consecutive + // space in the argument area; only round up to the next doubleword + // at the end of the array. Otherwise, each float takes 8 bytes. + ArgOffset += (Arg.getValueType() == MVT::f32 && + Flags.isInConsecutiveRegs()) ? 4 : 8; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; break; + } case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. + // For a varargs call, named arguments go into VRs or on the stack as // usual; unnamed arguments always go to the stack or the corresponding // GPRs when within range. For now, we always put the value in both @@ -4358,11 +4597,16 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // Load r2 into a virtual register and store it to the TOC save area. SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); // TOC save area offset. - unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI); SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(), false, false, 0); + // In the ELFv2 ABI, R12 must contain the address of an indirect callee. + // This does not mean the MTCTR instruction must use R12; it's easier + // to model this as an extra parameter, so do that. + if (isELFv2ABI) + RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); } // Build a sequence of copy-to-reg nodes chained together with token chain @@ -4413,7 +4657,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. We start with 24/48 bytes, which is // prereserved space for [SP][CR][LR][3 x unused]. - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true, + false); unsigned NumBytes = LinkageSize; // Add up all the space actually used. @@ -4552,7 +4797,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT, - false, false, 0); + false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); @@ -4781,8 +5026,7 @@ PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), - RVLocs, Context); + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_PPC); } @@ -4794,8 +5038,8 @@ PPCTargetLowering::LowerReturn(SDValue Chain, SDLoc dl, SelectionDAG &DAG) const { SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_PPC); SDValue Flag; @@ -5803,15 +6047,15 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (PPC::isSplatShuffleMask(SVOp, 1) || PPC::isSplatShuffleMask(SVOp, 2) || PPC::isSplatShuffleMask(SVOp, 4) || - PPC::isVPKUWUMShuffleMask(SVOp, true, DAG) || - PPC::isVPKUHUMShuffleMask(SVOp, true, DAG) || - PPC::isVSLDOIShuffleMask(SVOp, true, DAG) != -1 || - PPC::isVMRGLShuffleMask(SVOp, 1, true, DAG) || - PPC::isVMRGLShuffleMask(SVOp, 2, true, DAG) || - PPC::isVMRGLShuffleMask(SVOp, 4, true, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 1, true, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 2, true, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 4, true, DAG)) { + PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG)) { return Op; } } @@ -5819,15 +6063,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // Altivec has a variety of "shuffle immediates" that take two vector inputs // and produce a fixed permutation. If any of these match, do not lower to // VPERM. - if (PPC::isVPKUWUMShuffleMask(SVOp, false, DAG) || - PPC::isVPKUHUMShuffleMask(SVOp, false, DAG) || - PPC::isVSLDOIShuffleMask(SVOp, false, DAG) != -1 || - PPC::isVMRGLShuffleMask(SVOp, 1, false, DAG) || - PPC::isVMRGLShuffleMask(SVOp, 2, false, DAG) || - PPC::isVMRGLShuffleMask(SVOp, 4, false, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 1, false, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 2, false, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 4, false, DAG)) + unsigned int ShuffleKind = isLittleEndian ? 2 : 0; + if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG)) return Op; // Check to see if this is a shuffle of 4-byte values. If so, we can use our @@ -6286,7 +6531,8 @@ MachineBasicBlock * PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, bool is64bit, unsigned BinOpcode) const { // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); @@ -6348,7 +6594,8 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, bool is8bit, // operation unsigned BinOpcode) const { // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); // In 64 bit mode we have to use 64 bits for addresses, even though the // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address // registers without caring whether they're 32 or 64, but here we're @@ -6476,7 +6723,8 @@ llvm::MachineBasicBlock* PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -6575,7 +6823,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); const PPCRegisterInfo *TRI = - static_cast(getTargetMachine().getRegisterInfo()); + getTargetMachine().getSubtarget().getRegisterInfo(); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); @@ -6624,7 +6872,8 @@ MachineBasicBlock * PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -6736,7 +6985,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return emitEHSjLjLongJmp(MI, BB); } - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); // To "insert" these instructions we actually have to insert their // control-flow patterns. @@ -6759,7 +7009,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, Cond.push_back(MI->getOperand(1)); DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(), Cond, MI->getOperand(2).getReg(), MI->getOperand(3).getReg()); @@ -7299,16 +7550,12 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op, return SDValue(); } -// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does -// not enforce equality of the chain operands. -static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base, +static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, unsigned Bytes, int Dist, SelectionDAG &DAG) { - EVT VT = LS->getMemoryVT(); if (VT.getSizeInBits() / 8 != Bytes) return false; - SDValue Loc = LS->getBasePtr(); SDValue BaseLoc = Base->getBasePtr(); if (Loc.getOpcode() == ISD::FrameIndex) { if (BaseLoc.getOpcode() != ISD::FrameIndex) @@ -7339,11 +7586,69 @@ static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base, return false; } +// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does +// not enforce equality of the chain operands. +static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, + unsigned Bytes, int Dist, + SelectionDAG &DAG) { + if (LSBaseSDNode *LS = dyn_cast(N)) { + EVT VT = LS->getMemoryVT(); + SDValue Loc = LS->getBasePtr(); + return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG); + } + + if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { + EVT VT; + switch (cast(N->getOperand(1))->getZExtValue()) { + default: return false; + case Intrinsic::ppc_altivec_lvx: + case Intrinsic::ppc_altivec_lvxl: + VT = MVT::v4i32; + break; + case Intrinsic::ppc_altivec_lvebx: + VT = MVT::i8; + break; + case Intrinsic::ppc_altivec_lvehx: + VT = MVT::i16; + break; + case Intrinsic::ppc_altivec_lvewx: + VT = MVT::i32; + break; + } + + return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG); + } + + if (N->getOpcode() == ISD::INTRINSIC_VOID) { + EVT VT; + switch (cast(N->getOperand(1))->getZExtValue()) { + default: return false; + case Intrinsic::ppc_altivec_stvx: + case Intrinsic::ppc_altivec_stvxl: + VT = MVT::v4i32; + break; + case Intrinsic::ppc_altivec_stvebx: + VT = MVT::i8; + break; + case Intrinsic::ppc_altivec_stvehx: + VT = MVT::i16; + break; + case Intrinsic::ppc_altivec_stvewx: + VT = MVT::i32; + break; + } + + return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG); + } + + return false; +} + // Return true is there is a nearyby consecutive load to the one provided // (regardless of alignment). We search up and down the chain, looking though -// token factors and other loads (but nothing else). As a result, a true -// results indicates that it is safe to create a new consecutive load adjacent -// to the load provided. +// token factors and other loads (but nothing else). As a result, a true result +// indicates that it is safe to create a new consecutive load adjacent to the +// load provided. static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { SDValue Chain = LD->getChain(); EVT VT = LD->getMemoryVT(); @@ -7360,7 +7665,7 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { if (!Visited.insert(ChainNext)) continue; - if (LoadSDNode *ChainLD = dyn_cast(ChainNext)) { + if (MemSDNode *ChainLD = dyn_cast(ChainNext)) { if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) return true; @@ -7391,14 +7696,14 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { if (!Visited.insert(LoadRoot)) continue; - if (LoadSDNode *ChainLD = dyn_cast(LoadRoot)) + if (MemSDNode *ChainLD = dyn_cast(LoadRoot)) if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) return true; for (SDNode::use_iterator UI = LoadRoot->use_begin(), UE = LoadRoot->use_end(); UI != UE; ++UI) - if (((isa(*UI) && - cast(*UI)->getChain().getNode() == LoadRoot) || + if (((isa(*UI) && + cast(*UI)->getChain().getNode() == LoadRoot) || UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI)) Queue.push_back(*UI); } @@ -8182,17 +8487,25 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, Intrinsic::ppc_altivec_lvsl); SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8); - // Refine the alignment of the original load (a "new" load created here - // which was identical to the first except for the alignment would be - // merged with the existing node regardless). + // Create the new MMO for the new base load. It is like the original MMO, + // but represents an area in memory almost twice the vector size centered + // on the original address. If the address is unaligned, we might start + // reading up to (sizeof(vector)-1) bytes below the address of the + // original unaligned load. MachineFunction &MF = DAG.getMachineFunction(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(LD->getPointerInfo(), - LD->getMemOperand()->getFlags(), - LD->getMemoryVT().getStoreSize(), - ABIAlignment); - LD->refineAlignment(MMO); - SDValue BaseLoad = SDValue(LD, 0); + MachineMemOperand *BaseMMO = + MF.getMachineMemOperand(LD->getMemOperand(), + -LD->getMemoryVT().getStoreSize()+1, + 2*LD->getMemoryVT().getStoreSize()-1); + + // Create the new base load. + SDValue LDXIntID = DAG.getTargetConstant(Intrinsic::ppc_altivec_lvx, + getPointerTy()); + SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr }; + SDValue BaseLoad = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, + DAG.getVTList(MVT::v4i32, MVT::Other), + BaseLoadOps, MVT::v4i32, BaseMMO); // Note that the value of IncOffset (which is provided to the next // load's pointer info offset value, and thus used to calculate the @@ -8214,21 +8527,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDValue Increment = DAG.getConstant(IncValue, getPointerTy()); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + MachineMemOperand *ExtraMMO = + MF.getMachineMemOperand(LD->getMemOperand(), + 1, 2*LD->getMemoryVT().getStoreSize()-1); + SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr }; SDValue ExtraLoad = - DAG.getLoad(VT, dl, Chain, Ptr, - LD->getPointerInfo().getWithOffset(IncOffset), - LD->isVolatile(), LD->isNonTemporal(), - LD->isInvariant(), ABIAlignment); + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, + DAG.getVTList(MVT::v4i32, MVT::Other), + ExtraLoadOps, MVT::v4i32, ExtraMMO); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, BaseLoad.getValue(1), ExtraLoad.getValue(1)); - if (BaseLoad.getValueType() != MVT::v4i32) - BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad); - - if (ExtraLoad.getValueType() != MVT::v4i32) - ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad); - // Because vperm has a big-endian bias, we must reverse the order // of the input vectors and complement the permute control vector // when generating little endian code. We have already handled the @@ -8245,36 +8555,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (VT != MVT::v4i32) Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm); - // Now we need to be really careful about how we update the users of the - // original load. We cannot just call DCI.CombineTo (or - // DAG.ReplaceAllUsesWith for that matter), because the load still has - // uses created here (the permutation for example) that need to stay. - SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); - while (UI != UE) { - SDUse &Use = UI.getUse(); - SDNode *User = *UI; - // Note: BaseLoad is checked here because it might not be N, but a - // bitcast of N. - if (User == Perm.getNode() || User == BaseLoad.getNode() || - User == TF.getNode() || Use.getResNo() > 1) { - ++UI; - continue; - } - - SDValue To = Use.getResNo() ? TF : Perm; - ++UI; - - SmallVector Ops; - for (const SDUse &O : User->ops()) { - if (O == Use) - Ops.push_back(To); - else - Ops.push_back(O); - } - - DAG.UpdateNodeOperands(User, Ops); - } - + // The output of the permutation is our loaded result, the TokenFactor is + // our new chain. + DCI.CombineTo(N, Perm, TF); return SDValue(N, 0); } } @@ -8692,7 +8975,8 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // the AsmName field from *RegisterInfo.td, then this would not be necessary. if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && PPC::GPRCRegClass.contains(R.first)) { - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); return std::make_pair(TRI->getMatchingSuperReg(R.first, PPC::sub_32, &PPC::G8RCRegClass), &PPC::G8RCRegClass); @@ -8964,9 +9248,10 @@ bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { return isInt<16>(Imm) || isUInt<16>(Imm); } -bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, - unsigned, - bool *Fast) const { +bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned, + unsigned, + bool *Fast) const { if (DisablePPCUnaligned) return false;