X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=2b8c5408e6c6163076cf91c6c23ed44f63086245;hb=64b7bf71e84094193b40ab81aa7dacad921ecbea;hp=7ca6f70d1392768b956c8503b94f660f2aa69cf1;hpb=0ef701e6ae816b0360e0a66e8b815ff875fe2a32;p=oota-llvm.git diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 7ca6f70d139..2b8c5408e6c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16,7 +16,6 @@ #include "X86.h" #include "X86InstrBuilder.h" #include "X86ISelLowering.h" -#include "X86MCTargetExpr.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/CallingConv.h" @@ -37,6 +36,7 @@ #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" @@ -45,10 +45,12 @@ #include "llvm/ADT/VectorExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +using namespace dwarf; STATISTIC(NumTailCalls, "Number of tail calls"); @@ -62,6 +64,9 @@ DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX")); static cl::opt Disable16Bit("disable-16bit", cl::Hidden, cl::desc("Disable use of 16-bit instructions")); +static cl::opt +Promote16Bit("promote-16bit", cl::Hidden, + cl::desc("Promote 16-bit instructions")); // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, @@ -800,6 +805,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (!VT.is128BitVector()) { continue; } + setOperationAction(ISD::AND, SVT, Promote); AddPromotedToType (ISD::AND, SVT, MVT::v2i64); setOperationAction(ISD::OR, SVT, Promote); @@ -988,9 +994,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); @@ -1006,7 +1012,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: These should be based on subtarget info. Plus, the values should // be smaller when we are in optimizing for size mode. maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores - maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores + maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores setPrefLoopAlignment(16); benefitFromCodePlacementOpt = true; @@ -1065,22 +1071,45 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { /// getOptimalMemOpType - Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove -/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for -/// determining it. +/// lowering. If DstAlign is zero that means it's safe to destination +/// alignment can satisfy any constraint. 
Similarly if SrcAlign is zero it +/// means there isn't a need to check it against alignment requirement, +/// probably because the source does not need to be loaded. If +/// 'NonScalarIntSafe' is true, that means it's safe to return a +/// non-scalar-integer type, e.g. empty string source, constant, or loaded +/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is +/// constant so it does not need to be loaded. +/// It returns EVT::Other if SelectionDAG should be responsible for +/// determining the type. EVT -X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, +X86TargetLowering::getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool NonScalarIntSafe, + bool MemcpyStrSrc, SelectionDAG &DAG) const { // FIXME: This turns off use of xmm stores for memset/memcpy on targets like // linux. This is because the stack realignment code can't handle certain // cases like PR2962. This should be removed when PR2962 is fixed. const Function *F = DAG.getMachineFunction().getFunction(); - bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); - if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) { - if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16) - return MVT::v4i32; - if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16) - return MVT::v4f32; + if (NonScalarIntSafe && + !F->hasFnAttr(Attribute::NoImplicitFloat)) { + if (Size >= 16 && + (Subtarget->isUnalignedMemAccessFast() || + ((DstAlign == 0 || DstAlign >= 16) && + (SrcAlign == 0 || SrcAlign >= 16))) && + Subtarget->getStackAlignment() >= 16) { + if (Subtarget->hasSSE2()) + return MVT::v4i32; + if (Subtarget->hasSSE1()) + return MVT::v4f32; + } else if (!MemcpyStrSrc && Size >= 8 && + !Subtarget->is64Bit() && + Subtarget->getStackAlignment() >= 8 && + Subtarget->hasSSE2()) { + // Do not use f64 to lower memcpy if source is string constant. It's + // better to use i32 to avoid the loads. + return MVT::f64; + } } if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; @@ -1119,8 +1148,8 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, Subtarget->isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. - return X86MCTargetExpr::Create(MBB->getSymbol(Ctx), - X86MCTargetExpr::GOTOFF, Ctx); + return MCSymbolRefExpr::Create(MBB->getSymbol(), + MCSymbolRefExpr::VK_GOTOFF, Ctx); } /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC @@ -1130,8 +1159,7 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, if (!Subtarget->is64Bit()) // This doesn't have DebugLoc associated with it, but is not really the // same as a Register. 
- return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(), - getPointerTy()); + return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); return Table; } @@ -1286,7 +1314,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { - llvm_report_error("SSE register return with SSE disabled"); + report_fatal_error("SSE register return with SSE disabled"); } // If this is a call to a function that returns an fp value on the floating @@ -1379,6 +1407,8 @@ bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){ return !Subtarget->is64Bit(); case CallingConv::Fast: return GuaranteedTailCallOpt; + case CallingConv::GHC: + return GuaranteedTailCallOpt; } } @@ -1386,7 +1416,9 @@ bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){ /// given CallingConvention value. CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { if (Subtarget->is64Bit()) { - if (Subtarget->isTargetWin64()) + if (CC == CallingConv::GHC) + return CC_X86_64_GHC; + else if (Subtarget->isTargetWin64()) return CC_X86_Win64_C; else return CC_X86_64_C; @@ -1396,6 +1428,8 @@ CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { return CC_X86_32_FastCall; else if (CC == CallingConv::Fast) return CC_X86_32_FastCC; + else if (CC == CallingConv::GHC) + return CC_X86_32_GHC; else return CC_X86_32_C; } @@ -1410,13 +1444,20 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, DebugLoc dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), - /*AlwaysInline=*/true, NULL, 0, NULL, 0); + /*isVolatile*/false, /*AlwaysInline=*/true, + NULL, 0, NULL, 0); +} + +/// IsTailCallConvention - Return true if the calling convention is one that +/// supports tail call optimization. +static bool IsTailCallConvention(CallingConv::ID CC) { + return (CC == CallingConv::Fast || CC == CallingConv::GHC); } /// FuncIsMadeTailCallSafe - Return true if the function is being made into /// a tailcall target by changing its ABI. static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { - return GuaranteedTailCallOpt && CC == CallingConv::Fast; + return GuaranteedTailCallOpt && IsTailCallConvention(CC); } SDValue @@ -1466,7 +1507,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) { - MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); @@ -1480,8 +1520,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isTargetWin64(); - assert(!(isVarArg && CallConv == CallingConv::Fast) && - "Var args not supported with calling convention fastcc"); + assert(!(isVarArg && IsTailCallConvention(CallConv)) && + "Var args not supported with calling convention fastcc or ghc"); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; @@ -1684,7 +1724,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } else { BytesToPopOnReturn = 0; // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. 
- if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins)) + if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) BytesToPopOnReturn = 4; } @@ -1768,7 +1808,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (isTailCall) { // Check if it's really possible to do a tail call. - isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), Outs, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require @@ -1780,8 +1821,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, ++NumTailCalls; } - assert(!(isVarArg && CallConv == CallingConv::Fast) && - "Var args not supported with calling convention fastcc"); + assert(!(isVarArg && IsTailCallConvention(CallConv)) && + "Var args not supported with calling convention fastcc or ghc"); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; @@ -1795,7 +1836,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; - else if (GuaranteedTailCallOpt && CallConv == CallingConv::Fast) + else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -1900,8 +1941,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (!isTailCall) { Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc::getUnknownLoc(), - getPointerTy()), + DebugLoc(), getPointerTy()), InFlag); InFlag = Chain.getValue(1); } else { @@ -2030,7 +2070,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // We should use extra load for direct calls to dllimported functions in // non-JIT mode. - GlobalValue *GV = G->getGlobal(); + const GlobalValue *GV = G->getGlobal(); if (!GV->hasDLLImportLinkage()) { unsigned char OpFlags = 0; @@ -2075,18 +2115,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, OpFlags); } - if (isTailCall && !WasGlobalOrExternal) { - // Force the address into a (call preserved) caller-saved register since - // tailcall must happen after callee-saved registers are poped. - // FIXME: Give it a special register class that contains caller-saved - // register instead? - unsigned TCReg = Is64Bit ? X86::R11 : X86::EAX; - Chain = DAG.getCopyToReg(Chain, dl, - DAG.getRegister(TCReg, getPointerTy()), - Callee,InFlag); - Callee = DAG.getRegister(TCReg, getPointerTy()); - } - // Returns a chain & a flag for retval copy to use. 
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); SmallVector Ops; @@ -2132,14 +2160,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (RVLocs[i].isRegLoc()) MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg()); } - - assert(((Callee.getOpcode() == ISD::Register && - (cast(Callee)->getReg() == X86::EAX || - cast(Callee)->getReg() == X86::R11)) || - Callee.getOpcode() == ISD::TargetExternalSymbol || - Callee.getOpcode() == ISD::TargetGlobalAddress) && - "Expecting a global address, external symbol, or scratch register"); - return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); } @@ -2151,7 +2171,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, unsigned NumBytesForCalleeToPush; if (IsCalleePop(isVarArg, CallConv)) NumBytesForCalleeToPush = NumBytes; // Callee pops everything - else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet) + else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. @@ -2236,7 +2256,8 @@ static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII) { - int FI; + unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) @@ -2252,25 +2273,30 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); - if (MFI->getObjectSize(FI) != Flags.getByValSize()) - return false; + Bytes = Flags.getByValSize(); } else return false; } - } else { - LoadSDNode *Ld = dyn_cast(Arg); - if (!Ld) + } else if (LoadSDNode *Ld = dyn_cast(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g. + // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast(Ptr); if (!FINode) return false; FI = FINode->getIndex(); - } + } else + return false; + assert(FI != INT_MAX); if (!MFI->isFixedObjectIndex(FI)) return false; - return Offset == MFI->getObjectOffset(FI); + return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); } /// IsEligibleForTailCallOptimization - Check whether the call is eligible @@ -2280,17 +2306,20 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, const SmallVectorImpl &Outs, const SmallVectorImpl &Ins, SelectionDAG& DAG) const { - if (CalleeCC != CallingConv::Fast && + if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. 
+ const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = DAG.getMachineFunction().getFunction(); if (GuaranteedTailCallOpt) { - if (CalleeCC == CallingConv::Fast && + if (IsTailCallConvention(CalleeCC) && CallerF->getCallingConv() == CalleeCC) return true; return false; @@ -2299,10 +2328,43 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Look for obvious safe cases to perform tail call optimization that does not // requite ABI changes. This is what gcc calls sibcall. - // Do not tail call optimize vararg calls for now. - if (isVarArg) + // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to + // emit a special epilogue. + if (RegInfo->needsStackRealignment(MF)) + return false; + + // Do not sibcall optimize vararg calls unless the call site is not passing any + // arguments. + if (isVarArg && !Outs.empty()) + return false; + + // Also avoid sibcall optimization if either caller or callee uses struct + // return semantics. + if (isCalleeStructRet || isCallerStructRet) return false; + // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. + // Therefore if it's not used by the call it is not safe to optimize this into + // a sibcall. + bool Unused = false; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + if (!Ins[i].Used) { + Unused = true; + break; + } + } + if (Unused) { + SmallVector RVLocs; + CCState CCInfo(CalleeCC, false, getTargetMachine(), + RVLocs, *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + return false; + } + } + // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { @@ -2346,16 +2408,15 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } FastISel * -X86TargetLowering::createFastISel(MachineFunction &mf, MachineModuleInfo *mmo, - DwarfWriter *dw, +X86TargetLowering::createFastISel(MachineFunction &mf, DenseMap &vm, DenseMap &bm, DenseMap &am #ifndef NDEBUG - , SmallSet &cil + , SmallSet &cil #endif ) { - return X86::createFastISel(mf, mmo, dw, vm, bm, am + return X86::createFastISel(mf, vm, bm, am #ifndef NDEBUG , cil #endif @@ -2388,7 +2449,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. - if (!isInt32(Offset)) + if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra @@ -3389,7 +3450,7 @@ unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, /// FIXME: split into pslldqi, psrldqi, palignr variants. static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { - int NumElems = SVOp->getValueType(0).getVectorNumElements(); + unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); isLeft = true; unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); @@ -3401,11 +3462,12 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, } bool SeenV1 = false; bool SeenV2 = false; - for (int i = NumZeros; i < NumElems; ++i) { - int Val = isLeft ? (i - NumZeros) : i; - int Idx = SVOp->getMaskElt(isLeft ? 
i : (i - NumZeros)); - if (Idx < 0) + for (unsigned i = NumZeros; i < NumElems; ++i) { + unsigned Val = isLeft ? (i - NumZeros) : i; + int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); + if (Idx_ < 0) continue; + unsigned Idx = (unsigned) Idx_; if (Idx < NumElems) SeenV1 = true; else { @@ -3584,6 +3646,69 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, return SDValue(); } +/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a +/// vector of type 'VT', see if the elements can be replaced by a single large +/// load which has the same value as a build_vector whose operands are 'elts'. +/// +/// Example: -> zextload a +/// +/// FIXME: we'd also like to handle the case where the last elements are zero +/// rather than undef via VZEXT_LOAD, but we do not detect that case today. +/// There's even a handy isZeroNode for that purpose. +static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, + DebugLoc &dl, SelectionDAG &DAG) { + EVT EltVT = VT.getVectorElementType(); + unsigned NumElems = Elts.size(); + + LoadSDNode *LDBase = NULL; + unsigned LastLoadedElt = -1U; + + // For each element in the initializer, see if we've found a load or an undef. + // If we don't find an initial load element, or later load elements are + // non-consecutive, bail out. + for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Elts[i]; + + if (!Elt.getNode() || + (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) + return SDValue(); + if (!LDBase) { + if (Elt.getNode()->getOpcode() == ISD::UNDEF) + return SDValue(); + LDBase = cast(Elt.getNode()); + LastLoadedElt = i; + continue; + } + if (Elt.getOpcode() == ISD::UNDEF) + continue; + + LoadSDNode *LD = cast(Elt); + if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) + return SDValue(); + LastLoadedElt = i; + } + + // If we have found an entire vector of loads and undefs, then return a large + // load of the entire vector width starting at the base pointer. If we found + // consecutive loads for the low half, generate a vzext_load node. + if (LastLoadedElt == NumElems - 1) { + if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) + return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + LDBase->isVolatile(), LDBase->isNonTemporal(), 0); + return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->getAlignment()); + } else if (NumElems == 4 && LastLoadedElt == 1) { + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; + SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); + } + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); @@ -3812,14 +3937,18 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } - if (Values.size() > 2) { - // If we have SSE 4.1, Expand into a number of inserts unless the number of - // values to be inserted is equal to the number of elements, in which case - // use the unpack code below in the hopes of matching the consecutive elts - // load merge pattern for shuffles. - // FIXME: We could probably just check that here directly. 
- if (Values.size() < NumElems && VT.getSizeInBits() == 128 && - getSubtarget()->hasSSE41()) { + if (Values.size() > 1 && VT.getSizeInBits() == 128) { + // Check for a build vector of consecutive loads. + for (unsigned i = 0; i < NumElems; ++i) + V[i] = Op.getOperand(i); + + // Check for elements which are consecutive loads. + SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); + if (LD.getNode()) + return LD; + + // For SSE 4.1, use inserts into undef. + if (getSubtarget()->hasSSE41()) { V[0] = DAG.getUNDEF(VT); for (unsigned i = 0; i < NumElems; ++i) if (Op.getOperand(i).getOpcode() != ISD::UNDEF) @@ -3827,7 +3956,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { Op.getOperand(i), DAG.getIntPtrConstant(i)); return V[0]; } - // Expand into a number of unpckl*. + + // Otherwise, expand into a number of unpckl* // e.g. for v4f32 // Step 1: unpcklps 0, 2 ==> X: // : unpcklps 1, 3 ==> Y: @@ -3842,7 +3972,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { } return V[0]; } - return SDValue(); } @@ -4941,7 +5070,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc::getUnknownLoc(), getPointerTy()), + DebugLoc(), getPointerTy()), Result); } @@ -4974,7 +5103,7 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc::getUnknownLoc(), getPointerTy()), + DebugLoc(), getPointerTy()), Result); } @@ -5010,8 +5139,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { !Subtarget->is64Bit()) { Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc::getUnknownLoc(), - getPointerTy()), + DebugLoc(), getPointerTy()), Result); } @@ -5024,7 +5152,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) { unsigned char OpFlags = Subtarget->ClassifyBlockAddressReference(); CodeModel::Model M = getTargetMachine().getCodeModel(); - BlockAddress *BA = cast(Op)->getBlockAddress(); + const BlockAddress *BA = cast(Op)->getBlockAddress(); DebugLoc dl = Op.getDebugLoc(); SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), /*isTarget=*/true, OpFlags); @@ -5133,8 +5261,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc::getUnknownLoc(), - PtrVT), InFlag); + DebugLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); @@ -5156,7 +5283,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, DebugLoc dl = GA->getDebugLoc(); // Get the Thread Pointer SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, - DebugLoc::getUnknownLoc(), PtrVT, + DebugLoc(), PtrVT, DAG.getRegister(is64Bit? 
X86::FS : X86::GS, MVT::i32)); @@ -6153,7 +6280,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { N2C && N2C->isNullValue() && RHSC && RHSC->isNullValue()) { SDValue CmpOp0 = Cmp.getOperand(0); - Cmp = DAG.getNode(X86ISD::CMP, dl, CmpOp0.getValueType(), + Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), DAG.getConstant(X86::COND_B, MVT::i8), Cmp); @@ -6413,24 +6540,13 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, EVT IntPtr = getPointerTy(); EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); - Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); Flag = Chain.getValue(1); - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue Ops[] = { Chain, - DAG.getTargetExternalSymbol("_alloca", IntPtr), - DAG.getRegister(X86::EAX, IntPtr), - DAG.getRegister(X86StackPtr, SPTy), - Flag }; - Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5); - Flag = Chain.getValue(1); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); - Chain = DAG.getCALLSEQ_END(Chain, - DAG.getIntPtrConstant(0, true), - DAG.getIntPtrConstant(0, true), - Flag); + Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag); + Flag = Chain.getValue(1); Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); @@ -6443,6 +6559,7 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, + bool isVolatile, const Value *DstSV, uint64_t DstSVOff) { ConstantSDNode *ConstantSize = dyn_cast(Size); @@ -6571,7 +6688,7 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, DAG.getConstant(Offset, AddrVT)), Src, DAG.getConstant(BytesLeft, SizeVT), - Align, DstSV, DstSVOff + Offset); + Align, isVolatile, DstSV, DstSVOff + Offset); } // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. @@ -6582,7 +6699,7 @@ SDValue X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, - bool AlwaysInline, + bool isVolatile, bool AlwaysInline, const Value *DstSV, uint64_t DstSVOff, const Value *SrcSV, uint64_t SrcSVOff) { // This requires the copy size to be a constant, preferrably @@ -6614,7 +6731,7 @@ X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, Count, InFlag); InFlag = Chain.getValue(1); Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : - X86::EDI, + X86::EDI, Dst, InFlag); InFlag = Chain.getValue(1); Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? 
X86::RSI : @@ -6641,7 +6758,7 @@ X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, SrcVT)), DAG.getConstant(BytesLeft, SizeVT), - Align, AlwaysInline, + Align, isVolatile, AlwaysInline, DstSV, DstSVOff + Offset, SrcSV, SrcSVOff + Offset)); } @@ -6709,7 +6826,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { SDValue SrcPtr = Op.getOperand(1); SDValue SrcSV = Op.getOperand(2); - llvm_report_error("VAArgInst is not yet implemented for x86-64!"); + report_fatal_error("VAArgInst is not yet implemented for x86-64!"); return SDValue(); } @@ -6724,8 +6841,8 @@ SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, - DAG.getIntPtrConstant(24), 8, false, - DstSV, 0, SrcSV, 0); + DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, + false, DstSV, 0, SrcSV, 0); } SDValue @@ -7126,7 +7243,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; if (InRegCount > 2) { - llvm_report_error("Nest register in use - reduce number of inreg parameters!"); + report_fatal_error("Nest register in use - reduce number of inreg parameters!"); } } break; @@ -7736,6 +7853,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; + case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; } } @@ -7834,9 +7952,9 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { bool X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const { - // Only do shuffles on 128-bit vector types for now. + // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) - return false; + return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); // FIXME: pshufb, blends, shifts. return (VT.getVectorNumElements() == 2 || @@ -8405,6 +8523,29 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, return BB; } +MachineBasicBlock * +X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, + MachineBasicBlock *BB, + DenseMap *EM) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *F = BB->getParent(); + + // The lowering is pretty easy: we're just emitting the call to _alloca. The + // non-trivial part is impdef of ESP. + // FIXME: The code should be tweaked as soon as we'll try to do codegen for + // mingw-w64. + + BuildMI(BB, DL, TII->get(X86::CALLpcrel32)) + .addExternalSymbol("_alloca") + .addReg(X86::EAX, RegState::Implicit) + .addReg(X86::ESP, RegState::Implicit) + .addReg(X86::EAX, RegState::Define | RegState::Implicit) + .addReg(X86::ESP, RegState::Define | RegState::Implicit); + + F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 
+ return BB; +} MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, @@ -8412,6 +8553,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, DenseMap *EM) const { switch (MI->getOpcode()) { default: assert(false && "Unexpected instr type to insert"); + case X86::MINGW_ALLOCA: + return EmitLoweredMingwAlloca(MI, BB, EM); case X86::CMOV_GR8: case X86::CMOV_V1I64: case X86::CMOV_FR32: @@ -8419,6 +8562,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::CMOV_V4F32: case X86::CMOV_V2F64: case X86::CMOV_V2I64: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: return EmitLoweredSelect(MI, BB, EM); case X86::FP32_TO_INT16_IN_MEM: @@ -8501,6 +8649,21 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. return BB; } + // DBG_VALUE. Only the frame index case is done here. + case X86::DBG_VALUE: { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + X86AddressMode AM; + MachineFunction *F = BB->getParent(); + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = MI->getOperand(0).getImm(); + addFullAddress(BuildMI(BB, DL, TII->get(X86::DBG_VALUE)), AM). + addImm(MI->getOperand(1).getImm()). + addMetadata(MI->getOperand(2).getMetadata()); + F->DeleteMachineInstr(MI); // Remove pseudo. + return BB; + } + // String/text processing lowering. case X86::PCMPISTRM128REG: return EmitPCMP(MI, BB, 3, false /* in-mem */); @@ -8722,7 +8885,8 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the /// node is a GlobalAddress + offset. bool X86TargetLowering::isGAPlusOffset(SDNode *N, - GlobalValue* &GA, int64_t &Offset) const{ + const GlobalValue* &GA, + int64_t &Offset) const { if (N->getOpcode() == X86ISD::Wrapper) { if (isa(N->getOperand(0))) { GA = cast(N->getOperand(0))->getGlobal(); @@ -8733,82 +8897,104 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, return TargetLowering::isGAPlusOffset(N, GA, Offset); } -static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, - EVT EltVT, LoadSDNode *&LDBase, - unsigned &LastLoadedElt, - SelectionDAG &DAG, MachineFrameInfo *MFI, - const TargetLowering &TLI) { - LDBase = NULL; - LastLoadedElt = -1U; - for (unsigned i = 0; i < NumElems; ++i) { - if (N->getMaskElt(i) < 0) { - if (!LDBase) - return false; - continue; - } - - SDValue Elt = DAG.getShuffleScalarElt(N, i); - if (!Elt.getNode() || - (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) - return false; - if (!LDBase) { - if (Elt.getNode()->getOpcode() == ISD::UNDEF) - return false; - LDBase = cast(Elt.getNode()); - LastLoadedElt = i; - continue; - } - if (Elt.getOpcode() == ISD::UNDEF) - continue; - - LoadSDNode *LD = cast(Elt); - if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) - return false; - LastLoadedElt = i; - } - return true; -} - /// PerformShuffleCombine - Combine a vector_shuffle that is equal to /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load /// if the load addresses are consecutive, non-overlapping, and in the right -/// order. In the case of v2i64, it will see if it can rewrite the -/// shuffle to be an appropriate build vector so it can take advantage of -// performBuildVectorCombine. +/// order. 
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); - EVT EltVT = VT.getVectorElementType(); ShuffleVectorSDNode *SVN = cast(N); - unsigned NumElems = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128) return SDValue(); - // Try to combine a vector_shuffle into a 128-bit load. - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - LoadSDNode *LD = NULL; - unsigned LastLoadedElt; - if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG, - MFI, TLI)) + SmallVector Elts; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) + Elts.push_back(DAG.getShuffleScalarElt(SVN, i)); + + return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); +} + +/// PerformShuffleCombine - Detect vector gather/scatter index generation +/// and convert it from being a bunch of shuffles and extracts to a simple +/// store and scalar loads to extract the elements. +static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, + const TargetLowering &TLI) { + SDValue InputVector = N->getOperand(0); + + // Only operate on vectors of 4 elements, where the alternative shuffling + // gets to be more expensive. + if (InputVector.getValueType() != MVT::v4i32) return SDValue(); - if (LastLoadedElt == NumElems - 1) { - if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16) - return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), - LD->getSrcValue(), LD->getSrcValueOffset(), - LD->isVolatile(), LD->isNonTemporal(), 0); - return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), - LD->getSrcValue(), LD->getSrcValueOffset(), - LD->isVolatile(), LD->isNonTemporal(), - LD->getAlignment()); - } else if (NumElems == 4 && LastLoadedElt == 1) { - SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); - SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; - SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); + // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a + // single use which is a sign-extend or zero-extend, and all elements are + // used. + SmallVector Uses; + unsigned ExtractedElements = 0; + for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), + UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { + if (UI.getUse().getResNo() != InputVector.getResNo()) + return SDValue(); + + SDNode *Extract = *UI; + if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + if (Extract->getValueType(0) != MVT::i32) + return SDValue(); + if (!Extract->hasOneUse()) + return SDValue(); + if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && + Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + if (!isa(Extract->getOperand(1))) + return SDValue(); + + // Record which element was extracted. + ExtractedElements |= + 1 << cast(Extract->getOperand(1))->getZExtValue(); + + Uses.push_back(Extract); + } + + // If not all the elements were used, this may not be worthwhile. + if (ExtractedElements != 15) + return SDValue(); + + // Ok, we've now decided to do the transformation. + DebugLoc dl = InputVector.getDebugLoc(); + + // Store the value to a temporary stack slot. + SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); + SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 0, + false, false, 0); + + // Replace each use (extract) with a load of the appropriate element. 
+ for (SmallVectorImpl::iterator UI = Uses.begin(), + UE = Uses.end(); UI != UE; ++UI) { + SDNode *Extract = *UI; + + // Compute the element's address. + SDValue Idx = Extract->getOperand(1); + unsigned EltSize = + InputVector.getValueType().getVectorElementType().getSizeInBits()/8; + uint64_t Offset = EltSize * cast(Idx)->getZExtValue(); + SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); + + SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), OffsetVal, StackPtr); + + // Load the scalar. + SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr, + NULL, 0, false, false, 0); + + // Replace the exact with the load. + DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); } + + // The replacement was made in place; don't return anything. return SDValue(); } @@ -9174,58 +9360,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformANDCombine - Look for SSE and instructions of this form: -/// (and x, (build_vector signbit,signbit,signbit,signbit)). If there -/// exists a use of a build_vector that's the bitwise complement of the mask, -/// then transform the node to -/// (and (xor x, (build_vector -1,-1,-1,-1)), (build_vector ~sb,~sb,~sb,~sb)). -static SDValue PerformANDCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { - EVT VT = N->getValueType(0); - if (!VT.isVector() || !VT.isInteger()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - if (N0.getOpcode() == ISD::XOR || !N1.hasOneUse()) - return SDValue(); - - if (N1.getOpcode() == ISD::BUILD_VECTOR) { - unsigned NumElts = VT.getVectorNumElements(); - EVT EltVT = VT.getVectorElementType(); - SmallVector Mask; - Mask.reserve(NumElts); - for (unsigned i = 0; i != NumElts; ++i) { - SDValue Arg = N1.getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) { - Mask.push_back(Arg); - continue; - } - ConstantSDNode *C = dyn_cast(Arg); - if (!C) - return SDValue(); - if (!C->getAPIntValue().isSignBit() && - !C->getAPIntValue().isMaxSignedValue()) - return SDValue(); - Mask.push_back(DAG.getConstant(~C->getAPIntValue(), EltVT)); - } - N1 = DAG.getNode(ISD::BUILD_VECTOR, N1.getDebugLoc(), VT, - &Mask[0], NumElts); - if (!N1.use_empty()) { - unsigned Bits = EltVT.getSizeInBits(); - Mask.clear(); - for (unsigned i = 0; i != NumElts; ++i) - Mask.push_back(DAG.getConstant(APInt::getAllOnesValue(Bits), EltVT)); - SDValue NewMask = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), - VT, &Mask[0], NumElts); - return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, - DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, - N0, NewMask), N1); - } - } - - return SDValue(); -} /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. 
@@ -9753,9 +9887,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); + case ISD::EXTRACT_VECTOR_ELT: + return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); - case ISD::AND: return PerformANDCombine(N, DAG, DCI); case ISD::MUL: return PerformMulCombine(N, DAG, DCI); case ISD::SHL: case ISD::SRA: @@ -9774,6 +9909,44 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } +/// PerformDAGCombinePromotion - This method query the target whether it is +/// beneficial for dag combiner to promote the specified node. If true, it +/// should return the desired promotion type by reference. +bool X86TargetLowering::PerformDAGCombinePromotion(SDValue Op, EVT &PVT) const { + if (!Promote16Bit) + return false; + + EVT VT = Op.getValueType(); + if (VT != MVT::i16) + return false; + + bool Commute = true; + switch (Op.getOpcode()) { + default: return false; + case ISD::SUB: + Commute = false; + // fallthrough + case ISD::ADD: + case ISD::MUL: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + if (!Commute && isa(N1)) + return false; + // Avoid disabling potential load folding opportunities. + if ((isa(N0) && N0.hasOneUse()) && !isa(N1)) + return false; + if ((isa(N1) && N1.hasOneUse()) && !isa(N0)) + return false; + } + } + + PVT = MVT::i32; + return true; +} + //===----------------------------------------------------------------------===// // X86 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -9843,7 +10016,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces[2] == "${0:w}" && IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { AsmPieces.clear(); - SplitString(IA->getConstraintString().substr(5), AsmPieces, ","); + const std::string &Constraints = IA->getConstraintString(); + SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); std::sort(AsmPieces.begin(), AsmPieces.end()); if (AsmPieces.size() == 4 && AsmPieces[0] == "~{cc}" && @@ -10035,7 +10209,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; } - GlobalValue *GV = GA->getGlobal(); + const GlobalValue *GV = GA->getGlobal(); // If we require an extra load to get this address, as in PIC mode, we // can't accept it. if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, @@ -10298,41 +10472,3 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, return Res; } - -//===----------------------------------------------------------------------===// -// X86 Widen vector type -//===----------------------------------------------------------------------===// - -/// getWidenVectorType: given a vector type, returns the type to widen -/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself. -/// If there is no vector type that we want to widen to, returns MVT::Other -/// When and where to widen is target dependent based on the cost of -/// scalarizing vs using the wider vector type. - -EVT X86TargetLowering::getWidenVectorType(EVT VT) const { - assert(VT.isVector()); - if (isTypeLegal(VT)) - return VT; - - // TODO: In computeRegisterProperty, we can compute the list of legal vector - // type based on element type. 
This would speed up our search (though - // it may not be worth it since the size of the list is relatively - // small). - EVT EltVT = VT.getVectorElementType(); - unsigned NElts = VT.getVectorNumElements(); - - // On X86, it make sense to widen any vector wider than 1 - if (NElts <= 1) - return MVT::Other; - - for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE; - nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { - EVT SVT = (MVT::SimpleValueType)nVT; - - if (isTypeLegal(SVT) && - SVT.getVectorElementType() == EltVT && - SVT.getVectorNumElements() > NElts) - return SVT; - } - return MVT::Other; -}
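The reworked getOptimalMemOpType above reduces to a small policy: when non-scalar-integer types are safe (the real code also checks that the function is not marked noimplicitfloat), a copy of 16 bytes or more uses a 16-byte vector type if unaligned SSE access is fast or both sides are 16-byte aligned (v4i32 with SSE2, v4f32 with SSE1); otherwise an 8-byte chunk on a 32-bit SSE2 target uses f64, except when the source is a constant string, where i32 stores of immediates avoid the loads; everything else falls back to i64 or i32. The following is a minimal standalone sketch of that decision tree, not LLVM code: TargetInfo, ChunkKind and choose_chunk are hypothetical names standing in for the Subtarget queries and MVTs, and the noimplicitfloat check is folded into the non_scalar_int_safe flag.

#include <cstdint>
#include <cstdio>

// Illustrative stand-ins for the subtarget queries used by the patch.
struct TargetInfo {
  bool has_sse1, has_sse2, is_64bit, fast_unaligned;
  unsigned stack_align;
};

enum class ChunkKind { V4I32, V4F32, F64, I64, I32 };

// Same priority order as the revised getOptimalMemOpType: 16-byte vectors
// first, then f64 on 32-bit SSE2 (unless the source is a constant string),
// then the widest scalar integer.
ChunkKind choose_chunk(uint64_t size, unsigned dst_align, unsigned src_align,
                       bool non_scalar_int_safe, bool memcpy_str_src,
                       const TargetInfo &t) {
  if (non_scalar_int_safe) {
    bool aligned_ok = (dst_align == 0 || dst_align >= 16) &&
                      (src_align == 0 || src_align >= 16);
    if (size >= 16 && (t.fast_unaligned || aligned_ok) && t.stack_align >= 16) {
      if (t.has_sse2) return ChunkKind::V4I32;
      if (t.has_sse1) return ChunkKind::V4F32;
    } else if (!memcpy_str_src && size >= 8 && !t.is_64bit &&
               t.stack_align >= 8 && t.has_sse2) {
      // Constant-string sources are better copied as i32 immediates, so f64
      // is only used when the source actually has to be loaded.
      return ChunkKind::F64;
    }
  }
  return (t.is_64bit && size >= 8) ? ChunkKind::I64 : ChunkKind::I32;
}

int main() {
  TargetInfo sse2_32bit{true, true, false, false, 16};
  std::printf("%d\n", (int)choose_chunk(32, 16, 16, true, false, sse2_32bit)); // 0 = V4I32
  std::printf("%d\n", (int)choose_chunk(8, 4, 4, true, false, sse2_32bit));    // 2 = F64
  return 0;
}

The ordering matters: large, sufficiently aligned copies get SSE stores first, and the f64 case is deliberately skipped for constant-string sources, matching the comment in the patch.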
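EltsFromConsecutiveLoads, added above and reused by PerformShuffleCombine, accepts a build_vector whose defined elements are loads laid out back-to-back: element 0 must be a load, later elements may be undef, and each later load has to sit exactly i * element-size bytes past the base load. A full match is turned into one wide load; a match covering only elements 0 and 1 of a four-element vector becomes a VZEXT_LOAD of the low half. Here is a small sketch of the same classification over a hypothetical Elem model (a base-pointer id plus a byte offset) instead of real SDNodes; Elem, Merge and classify are illustrative names only.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical model of a build_vector element: either undef or a load
// described by a base-pointer id and a byte offset from that base.
struct Elem {
  bool is_undef;
  int base_id;
  uint64_t offset;
};

enum class Merge { None, WideLoad, ZextLowHalf };

// Mirrors the shape of EltsFromConsecutiveLoads: element 0 must be a load,
// later elements may be undef, and every later load must be exactly
// i * elt_size bytes past the base load.
Merge classify(const std::vector<Elem> &elts, uint64_t elt_size) {
  if (elts.empty() || elts[0].is_undef)
    return Merge::None;
  const int base = elts[0].base_id;
  const uint64_t base_off = elts[0].offset;
  std::size_t last_loaded = 0;
  for (std::size_t i = 1; i < elts.size(); ++i) {
    if (elts[i].is_undef)
      continue;
    if (elts[i].base_id != base || elts[i].offset != base_off + i * elt_size)
      return Merge::None;
    last_loaded = i;
  }
  if (last_loaded == elts.size() - 1)
    return Merge::WideLoad;     // one load of the full vector width
  if (elts.size() == 4 && last_loaded == 1)
    return Merge::ZextLowHalf;  // vzext_load of the low 64 bits
  return Merge::None;
}

int main() {
  std::vector<Elem> full = {{false, 0, 0}, {false, 0, 4}, {false, 0, 8}, {false, 0, 12}};
  std::vector<Elem> low  = {{false, 0, 0}, {false, 0, 4}, {true, 0, 0}, {true, 0, 0}};
  std::printf("%d %d\n", (int)classify(full, 4), (int)classify(low, 4)); // prints 1 2
  return 0;
}

As in the patch, trailing undefs are only exploited in the four-element, low-half case; anything else is left to generic lowering.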
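PerformDAGCombinePromotion, gated by the new -promote-16bit flag, asks whether an i16 ADD/SUB/MUL/AND/OR/XOR should be promoted to i32, and declines when promotion would cost an immediate form (SUB with a constant right-hand side) or a folded load (one operand is a single-use load and the other is not a constant). The sketch below restates that predicate over a hypothetical Operand model; should_promote_i16 is an illustrative name, and the isa<> checks whose template arguments were lost in the extracted diff are assumed to be the usual constant/load tests.

#include <cstdio>

// What the heuristic actually looks at for each operand.
struct Operand {
  bool is_constant;
  bool is_single_use_load;
};

enum class Op { Add, Sub, Mul, And, Or, Xor };

// Promote an i16 binary op to i32 unless that would lose a sub-with-immediate
// form or a load-folding opportunity.
bool should_promote_i16(Op op, const Operand &lhs, const Operand &rhs) {
  const bool commutative = (op != Op::Sub);
  if (!commutative && rhs.is_constant)
    return false;
  if (lhs.is_single_use_load && !rhs.is_constant)
    return false;
  if (rhs.is_single_use_load && !lhs.is_constant)
    return false;
  return true;
}

int main() {
  Operand load_1use{false, true}, constant{true, false}, plain{false, false};
  std::printf("%d %d %d\n",
              (int)should_promote_i16(Op::Add, plain, plain),      // 1: promote
              (int)should_promote_i16(Op::Add, load_1use, plain),  // 0: keep the load foldable
              (int)should_promote_i16(Op::Sub, plain, constant));  // 0: sub-with-immediate stays i16
  return 0;
}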