X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=42193c41cb4b07bd54900d26180adb0927c842fa;hb=6bcd2196e5febb97efe188d0d6de403c577d96fc;hp=eedb1ff71418600740e5827f2bf1b12f4da56e34;hpb=07b7f672a066317ffc8224c61dc3b72ce857f24d;p=oota-llvm.git diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index eedb1ff7141..42193c41cb4 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -51,6 +51,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" using namespace llvm; using namespace dwarf; @@ -168,8 +169,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) : TargetLowering(TM, createTLOF(TM)) { Subtarget = &TM.getSubtarget(); - X86ScalarSSEf64 = Subtarget->hasXMMInt() || Subtarget->hasAVX(); - X86ScalarSSEf32 = Subtarget->hasXMM() || Subtarget->hasAVX(); + X86ScalarSSEf64 = Subtarget->hasXMMInt(); + X86ScalarSSEf32 = Subtarget->hasXMM(); X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; RegInfo = TM.getRegisterInfo(); @@ -180,6 +181,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // X86 is weird, it always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); + // X86-SSE is even stranger. It uses -1 or 0 for vector masks. + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // For 64-bit since we have so many registers use the ILP scheduler, for // 32-bit code use the register pressure specific scheduling. @@ -312,7 +315,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); } else if (!UseSoftFloat) { - if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) + // Since AVX is a superset of SSE3, only check for SSE here. + if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) // Expand FP_TO_UINT into a select. // FIXME: We would like to use a Custom expander here eventually to do // the optimal thing for SSE vs. the default expansion in the legalizer. @@ -478,6 +482,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); } + if (Subtarget->hasCmpxchg16b()) { + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + } + // FIXME - use subtarget debug flags if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() && @@ -499,7 +507,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); - setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); + setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); + setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); @@ -516,11 +525,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, - (Subtarget->is64Bit() ? MVT::i64 : MVT::i32), - (Subtarget->isTargetCOFF() - && !Subtarget->isTargetEnvMacho() - ? 
Custom : Expand)); + + if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Custom); + else if (EnableSegmentedStacks) + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Custom); + else + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Expand); if (!UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. @@ -699,7 +713,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); @@ -714,6 +728,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand); for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) setTruncStoreAction((MVT::SimpleValueType)VT, @@ -776,7 +791,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); - setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); + setOperationAction(ISD::SETCC, MVT::v4f32, Custom); } if (!UseSoftFloat && Subtarget->hasXMMInt()) { @@ -806,10 +821,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); - setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); - setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); - setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); - setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); + setOperationAction(ISD::SETCC, MVT::v2f64, Custom); + setOperationAction(ISD::SETCC, MVT::v16i8, Custom); + setOperationAction(ISD::SETCC, MVT::v8i16, Custom); + setOperationAction(ISD::SETCC, MVT::v4i32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); @@ -904,6 +919,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SHL, MVT::v16i8, Custom); + setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); + setOperationAction(ISD::VSELECT, MVT::v2i64, Legal); + setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); + setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); + setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); + // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. 
f32 vectors are // custom since the immediate controlling the insert encodes additional @@ -924,7 +945,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } - if (Subtarget->hasSSE2() || Subtarget->hasAVX()) { + if (Subtarget->hasXMMInt()) { setOperationAction(ISD::SRL, MVT::v2i64, Custom); setOperationAction(ISD::SRL, MVT::v4i32, Custom); setOperationAction(ISD::SRL, MVT::v16i8, Custom); @@ -939,7 +960,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } if (Subtarget->hasSSE42() || Subtarget->hasAVX()) - setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); if (!UseSoftFloat && Subtarget->hasAVX()) { addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); @@ -991,15 +1012,20 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRA, MVT::v8i32, Custom); setOperationAction(ISD::SRA, MVT::v16i16, Custom); - setOperationAction(ISD::VSETCC, MVT::v32i8, Custom); - setOperationAction(ISD::VSETCC, MVT::v16i16, Custom); - setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); - setOperationAction(ISD::VSETCC, MVT::v4i64, Custom); + setOperationAction(ISD::SETCC, MVT::v32i8, Custom); + setOperationAction(ISD::SETCC, MVT::v16i16, Custom); + setOperationAction(ISD::SETCC, MVT::v8i32, Custom); + setOperationAction(ISD::SETCC, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); + setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); + setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); + setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); setOperationAction(ISD::ADD, MVT::v16i16, Custom); @@ -1103,6 +1129,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); @@ -1134,8 +1161,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } -MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const { - return MVT::i8; +EVT X86TargetLowering::getSetCCResultType(EVT VT) const { + if (!VT.isVector()) return MVT::i8; + return VT.changeVectorElementTypeToInteger(); } @@ -1213,9 +1241,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16))) && Subtarget->getStackAlignment() >= 16) { - if (Subtarget->hasSSE2()) + if (Subtarget->hasAVX() && + Subtarget->getStackAlignment() >= 32) + return MVT::v8f32; + if (Subtarget->hasXMMInt()) return MVT::v4i32; - if (Subtarget->hasSSE1()) + if (Subtarget->hasXMM()) return MVT::v4f32; } else if (!MemcpyStrSrc && Size >= 8 && !Subtarget->is64Bit() && @@ -1418,7 +1449,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. 
- if (!Subtarget->hasSSE2()) + if (!Subtarget->hasXMMInt()) ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); } } @@ -1927,6 +1958,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } + FuncInfo->setArgumentStackSize(StackSize); + return Chain; } @@ -3146,13 +3179,13 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PALIGNR. static bool isPALIGNRMask(const SmallVectorImpl &Mask, EVT VT, - bool hasSSSE3) { + bool hasSSSE3OrAVX) { int i, e = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64) return false; // Do not handle v2i64 / v2f64 shuffles with palignr. - if (e < 4 || !hasSSSE3) + if (e < 4 || !hasSSSE3OrAVX) return false; for (i = 0; i != e; ++i) @@ -3560,6 +3593,13 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl &Mask, EVT VT) { if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) return false; + // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern + // FIXME: Need a better way to get rid of this, there's no latency difference + // between UNPCKLPD and MOVDDUP, the later should always be checked first and + // the former later. We should also remove the "_undef" special mask. + if (NumElems == 4 && VT.getSizeInBits() == 256) + return false; + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate // independently on 128-bit lanes. unsigned NumLanes = VT.getSizeInBits() / 128; @@ -3913,11 +3953,38 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, return true; } +/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to 256-bit +/// version of MOVDDUP. +static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + int NumElts = VT.getVectorNumElements(); + bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF; + + if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 || + !V2IsUndef || NumElts != 4) + return false; + + for (int i = 0; i != NumElts/2; ++i) + if (!isUndefOrEqual(N->getMaskElt(i), 0)) + return false; + for (int i = NumElts/2; i != NumElts; ++i) + if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2)) + return false; + return true; +} + /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVDDUP. +/// specifies a shuffle of elements that is suitable for input to 128-bit +/// version of MOVDDUP. bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { - int e = N->getValueType(0).getVectorNumElements() / 2; + EVT VT = N->getValueType(0); + + if (VT.getSizeInBits() != 128) + return false; + int e = VT.getVectorNumElements() / 2; for (int i = 0; i < e; ++i) if (!isUndefOrEqual(N->getMaskElt(i), i)) return false; @@ -4220,7 +4287,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) { /// getZeroVector - Returns a vector of specified type with all zero elements. /// -static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, +static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); @@ -4228,7 +4295,7 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, // to their dest type. This ensures they get CSE'd. 
SDValue Vec; if (VT.getSizeInBits() == 128) { // SSE - if (HasSSE2) { // SSE2 + if (HasXMMInt) { // SSE2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 @@ -4424,11 +4491,11 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { /// element of V2 is swizzled into the zero/undef vector, landing at element /// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, - bool isZero, bool HasSSE2, - SelectionDAG &DAG) { + bool isZero, bool HasXMMInt, + SelectionDAG &DAG) { EVT VT = V2.getValueType(); SDValue V1 = isZero - ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); + ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); unsigned NumElems = VT.getVectorNumElements(); SmallVector MaskVec; for (unsigned i = 0; i != NumElems; ++i) @@ -4554,8 +4621,16 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, DecodeVPERM2F128Mask(VT, cast(ImmN)->getZExtValue(), ShuffleMask); break; + case X86ISD::MOVDDUP: + case X86ISD::MOVLHPD: + case X86ISD::MOVLPD: + case X86ISD::MOVLPS: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::PALIGN: + return SDValue(); // Not yet implemented. default: - assert("not implemented for target shuffle node"); + assert(0 && "unknown target shuffle node"); return SDValue(); } @@ -4707,6 +4782,11 @@ static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, /// logical left or right shift of a vector. static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { + // Although the logic below support any bitwidth size, there are no + // shift instructions which handle more than 128-bit vectors. + if (SVOp->getValueType(0).getSizeInBits() > 128) + return false; + if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) return true; @@ -4797,6 +4877,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, DebugLoc dl) { + assert(VT.getSizeInBits() == 128 && "Unknown type for VShift"); EVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? 
X86ISD::VSHL : X86ISD::VSRL; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); @@ -4944,9 +5025,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; - SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, - Ops, 2, MVT::i32, - LDBase->getMemOperand()); + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64, + LDBase->getPointerInfo(), + LDBase->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); } return SDValue(); @@ -4968,7 +5052,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Op.getValueType() == MVT::v8i32) return Op; - return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); + return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width @@ -5030,7 +5114,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, - Subtarget->hasSSE2(), DAG); + Subtarget->hasXMMInt(), DAG); // Now we have our 32-bit value zero extended in the low element of // a vector. If Idx != 0, swizzle it into place. @@ -5058,7 +5142,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { (ExtVT == MVT::i64 && Subtarget->is64Bit())) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. - return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), + return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(), DAG); } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); @@ -5066,7 +5150,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT MiddleVT = MVT::v4i32; Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, - Subtarget->hasSSE2(), DAG); + Subtarget->hasXMMInt(), DAG); return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } @@ -5095,7 +5179,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Turn it into a shuffle of zero and zero-extended scalar to vector. Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, - Subtarget->hasSSE2(), DAG); + Subtarget->hasXMMInt(), DAG); SmallVector MaskVec; for (unsigned i = 0; i < NumElems; i++) MaskVec.push_back(i == Idx ? 
0 : 1); @@ -5152,7 +5236,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, - Subtarget->hasSSE2(), DAG); + Subtarget->hasXMMInt(), DAG); } return SDValue(); } @@ -5177,7 +5261,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1 << i)); if (isZero) - V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); + V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); else V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } @@ -5221,7 +5305,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return LD; // For SSE 4.1, use insertps to put the high elements into the low element. - if (getSubtarget()->hasSSE41()) { + if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); @@ -5392,7 +5476,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // quads, disable the next transformation since it does not help SSSE3. bool V1Used = InputQuads[0] || InputQuads[1]; bool V2Used = InputQuads[2] || InputQuads[3]; - if (Subtarget->hasSSSE3()) { + if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { if (InputQuads.count() == 2 && V1Used && V2Used) { BestLoQuad = InputQuads.find_first(); BestHiQuad = InputQuads.find_next(BestLoQuad); @@ -5465,7 +5549,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. - if (Subtarget->hasSSSE3()) { + if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { SmallVector pshufbMask; // If we have elements from both input vectors, set the high bit of the @@ -5533,7 +5617,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && + (Subtarget->hasSSSE3() || Subtarget->hasAVX())) NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, NewV.getOperand(0), X86::getShufflePSHUFLWImmediate(NewV.getNode()), @@ -5561,7 +5646,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && + (Subtarget->hasSSSE3() || Subtarget->hasAVX())) NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, NewV.getOperand(0), X86::getShufflePSHUFHWImmediate(NewV.getNode()), @@ -5627,7 +5713,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 
- if (TLI.getSubtarget()->hasSSSE3()) { + if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) { SmallVector pshufbMask; // If all result elements are from one input vector, then only translate @@ -6184,18 +6270,21 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { static SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, - bool HasSSE2) { + bool HasXMMInt) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); assert(VT != MVT::v2i64 && "unsupported shuffle type"); - if (HasSSE2 && VT == MVT::v2f64) + if (HasXMMInt && VT == MVT::v2f64) return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); - // v4f32 or v4i32 - return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG); + // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1) + return DAG.getNode(ISD::BITCAST, dl, VT, + getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32, + DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1), + DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG)); } static @@ -6231,7 +6320,7 @@ static inline unsigned getSHUFPOpcode(EVT VT) { } static -SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { +SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); @@ -6260,7 +6349,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { CanFoldLoad = false; if (CanFoldLoad) { - if (HasSSE2 && NumElems == 2) + if (HasXMMInt && NumElems == 2) return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); if (NumElems == 4) @@ -6274,11 +6363,13 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { // this is horrible, but will stay like this until we move all shuffle // matching to x86 specific nodes. Note that for the 1st condition all // types are matched with movsd. - if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) - return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); - else if (HasSSE2) + if (HasXMMInt) { + // FIXME: isMOVLMask should be checked and matched before getMOVLP, + // as to remove this logic from here, as much as possible + if (NumElems == 2 || !X86::isMOVLMask(SVOp)) + return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); - + } assert(VT != MVT::v4i32 && "unsupported shuffle type"); @@ -6360,12 +6451,16 @@ static bool isVectorBroadcast(SDValue &Op) { if (Is256) V = V.getOperand(1); - if (V.hasOneUse() && V.getOpcode() != ISD::SCALAR_TO_VECTOR) + + if (!V.hasOneUse()) return false; // Check the source scalar_to_vector type. 256-bit broadcasts are // supported for 32/64-bit sizes, while 128-bit ones are only supported // for 32-bit scalars. 
+ if (V.getOpcode() != ISD::SCALAR_TO_VECTOR) + return false; + unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits(); if (ScalarSize != 32 && ScalarSize != 64) return false; @@ -6392,7 +6487,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, SDValue V2 = Op.getOperand(1); if (isZeroShuffle(SVOp)) - return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); + return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); // Handle splat operations if (SVOp->isSplat()) { @@ -6424,7 +6519,8 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); - } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { + } else if ((VT == MVT::v4i32 || + (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) { // FIXME: Figure out a cleaner way to do this. // Try to make use of movq to zero out the top part. if (ISD::isBuildVectorAllZeros(V2.getNode())) { @@ -6457,9 +6553,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; bool V2IsSplat = false; - bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); - bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); - bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX(); + bool HasXMMInt = Subtarget->hasXMMInt(); MachineFunction &MF = DAG.getMachineFunction(); bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); @@ -6495,15 +6589,16 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); - if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && - RelaxedMayFoldVectorLoad(V1)) + if (X86::isMOVDDUPMask(SVOp) && + (Subtarget->hasSSE3() || Subtarget->hasAVX()) && + V2IsUndef && RelaxedMayFoldVectorLoad(V1)) return getMOVDDup(Op, dl, V1, DAG); if (X86::isMOVHLPS_v_undef_Mask(SVOp)) return getMOVHighToLow(Op, dl, DAG); // Use to match splats - if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && + if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef && (VT == MVT::v2f64 || VT == MVT::v2i64)) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); @@ -6516,7 +6611,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); - if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) + if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32)) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1, @@ -6527,8 +6622,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool isLeft = false; unsigned ShAmt = 0; SDValue ShVal; - bool isShift = getSubtarget()->hasSSE2() && - isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); + bool isShift = getSubtarget()->hasXMMInt() && + isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. 
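
For context, the shuffles that isVectorShift/getVShift accept above are exactly the ones expressible as a whole-register byte shift with zero fill (PSLLDQ/PSRLDQ), which is also why the patch restricts the check to 128-bit vectors. A minimal standalone sketch of that equivalence, not part of the patch; the values and element order are illustrative only:

// Assumes SSE2.  Shuffling <X, zero> with mask <4,0,1,2> is the same as
// shifting the whole 128-bit register "left" by one 4-byte element, and mask
// <1,2,3,4> corresponds to the matching right shift.
#include <emmintrin.h>
#include <cstdio>

int main() {
  __m128i X = _mm_setr_epi32(1, 2, 3, 4);
  __m128i L = _mm_slli_si128(X, 4);   // PSLLDQ by 4 bytes: {0, 1, 2, 3}
  __m128i R = _mm_srli_si128(X, 4);   // PSRLDQ by 4 bytes: {2, 3, 4, 0}
  int l[4], r[4];
  _mm_storeu_si128((__m128i *)l, L);
  _mm_storeu_si128((__m128i *)r, R);
  std::printf("%d %d %d %d\n", l[0], l[1], l[2], l[3]);
  std::printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);
  return 0;
}
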
@@ -6543,7 +6638,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); if (!X86::isMOVLPMask(SVOp)) { - if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) + if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); if (VT == MVT::v4i32 || VT == MVT::v4f32) @@ -6553,7 +6648,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // FIXME: fold these into legal mask. if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) - return getMOVLowToHigh(Op, dl, DAG, HasSSE2); + return getMOVLowToHigh(Op, dl, DAG, HasXMMInt); if (X86::isMOVHLPSMask(SVOp)) return getMOVHighToLow(Op, dl, DAG); @@ -6565,7 +6660,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); if (X86::isMOVLPMask(SVOp)) - return getMOVLP(Op, dl, DAG, HasSSE2); + return getMOVLP(Op, dl, DAG, HasXMMInt); if (ShouldXformToMOVHLPS(SVOp) || ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) @@ -6649,7 +6744,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SmallVector M; SVOp->getMask(M); - if (isPALIGNRMask(M, VT, HasSSSE3)) + if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX())) return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, X86::getShufflePALIGNRImmediate(SVOp), DAG); @@ -6686,6 +6781,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // supported in the AVX instruction set. // + // Handle VMOVDDUPY permutations + if (isMOVDDUPYMask(SVOp, Subtarget)) + return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); + // Handle VPERMILPS* permutations if (isVPERMILPSMask(M, VT, Subtarget)) return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, @@ -7671,6 +7770,10 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(0)); + // Zero out the upper parts of the register. + Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(), + DAG); + Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), DAG.getIntPtrConstant(0)); @@ -8258,6 +8361,9 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, } SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); + assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -8313,7 +8419,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { static SDValue Lower256IntVETCC(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::VSETCC && + assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); int NumElems = VT.getVectorNumElements(); @@ -8359,6 +8465,15 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = EltVT == MVT::f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; bool Swap = false; + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD switch (SetCCOpcode) { default: break; case ISD::SETOEQ: @@ -8797,9 +8912,11 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { - assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && - "This should be used only on Windows targets"); - assert(!Subtarget->isTargetEnvMacho()); + assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || + EnableSegmentedStacks) && + "This should be used only on Windows targets or when segmented stacks " + "are being used"); + assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); DebugLoc dl = Op.getDebugLoc(); // Get the inputs. @@ -8807,23 +8924,49 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Size = Op.getOperand(1); // FIXME: Ensure alignment here - SDValue Flag; + bool Is64Bit = Subtarget->is64Bit(); + EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; - EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; - unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); + if (EnableSegmentedStacks) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); - Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); - Flag = Chain.getValue(1); + if (Is64Bit) { + // The 64 bit implementation of segmented stacks needs to clobber both r10 + // r11. This makes it impossible to use it along with nested parameters. + const Function *F = MF.getFunction(); + + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; I++) + if (I->hasNestAttr()) + report_fatal_error("Cannot use segmented stacks with functions that " + "have nested arguments."); + } + + const TargetRegisterClass *AddrRegClass = + getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32); + unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); + Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); + SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + DAG.getRegister(Vreg, SPTy)); + SDValue Ops1[2] = { Value, Chain }; + return DAG.getMergeValues(Ops1, 2, dl); + } else { + SDValue Flag; + unsigned Reg = (Subtarget->is64Bit() ? 
X86::RAX : X86::EAX); - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); + Flag = Chain.getValue(1); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); - Flag = Chain.getValue(1); + Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); + Flag = Chain.getValue(1); - Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); + Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); - SDValue Ops1[2] = { Chain.getValue(0), Chain }; - return DAG.getMergeValues(Ops1, 2, dl); + SDValue Ops1[2] = { Chain.getValue(0), Chain }; + return DAG.getMergeValues(Ops1, 2, dl); + } } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { @@ -9318,8 +9461,13 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); } -SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, + SelectionDAG &DAG) const { + return Op.getOperand(0); +} + +SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, + SelectionDAG &DAG) const { SDValue Root = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function @@ -9383,9 +9531,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, MachinePointerInfo(TrmpAddr, 22), false, false, 0); - SDValue Ops[] = - { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); } else { const Function *Func = cast(cast(Op.getOperand(5))->getValue()); @@ -9465,9 +9611,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, MachinePointerInfo(TrmpAddr, 6), false, false, 1); - SDValue Ops[] = - { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); } } @@ -9707,7 +9851,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { SDValue Amt = Op.getOperand(1); LLVMContext *Context = DAG.getContext(); - if (!(Subtarget->hasSSE2() || Subtarget->hasAVX())) + if (!Subtarget->hasXMMInt()) return SDValue(); // Decompose 256-bit shifts into smaller 128-bit shifts. 
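
The next hunk rewrites the variable v16i8 shift lowering to emit ISD::VSELECT instead of the X86ISD::PBLENDVB node this patch removes. A hedged standalone sketch of the underlying trick, written with intrinsics rather than the DAG nodes the patch actually builds; the helper names are made up for the example and shift amounts are assumed to be < 8:

// Assumes SSE4.1 (compile with -msse4.1).  Each round conditionally shifts by
// 4, 2 and 1 depending on one bit of the per-byte shift amount, which is moved
// into the byte's sign bit so PBLENDVB (_mm_blendv_epi8) can key off it.
#include <smmintrin.h>
#include <cstdio>

static inline __m128i shl_bytes_const(__m128i v, int n, unsigned char keep) {
  // Shift 16-bit lanes, then mask off bits that bled in from the lower byte,
  // giving a per-byte shift by the constant n.
  return _mm_and_si128(_mm_slli_epi16(v, n), _mm_set1_epi8((char)keep));
}

__m128i shl_v16i8(__m128i v, __m128i amt) {
  __m128i a = _mm_slli_epi16(amt, 5);                       // bit 2 of amt -> sign bit
  v = _mm_blendv_epi8(v, shl_bytes_const(v, 4, 0xF0), a);   // if bit 2: v <<= 4
  a = _mm_add_epi8(a, a);                                   // bit 1 of amt -> sign bit
  v = _mm_blendv_epi8(v, shl_bytes_const(v, 2, 0xFC), a);   // if bit 1: v <<= 2
  a = _mm_add_epi8(a, a);                                   // bit 0 of amt -> sign bit
  v = _mm_blendv_epi8(v, _mm_add_epi8(v, v), a);            // if bit 0: v <<= 1
  return v;
}

int main() {
  __m128i v   = _mm_set1_epi8(1);
  __m128i amt = _mm_setr_epi8(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
  unsigned char out[16];
  _mm_storeu_si128((__m128i *)out, shl_v16i8(v, amt));
  for (int i = 0; i < 16; ++i) std::printf("%d ", out[i]);   // 1 2 4 ... 128, twice
  std::printf("\n");
  return 0;
}
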
@@ -9841,7 +9985,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, DAG.getConstant(4, MVT::i32)); - R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); @@ -9856,13 +10000,13 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, DAG.getConstant(2, MVT::i32)); - R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); // return pblendv(r, r+r, a); - R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, - R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, + R, DAG.getNode(ISD::ADD, dl, VT, R, R)); return R; } return SDValue(); @@ -9948,8 +10092,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) SDNode* Node = Op.getNode(); EVT ExtraVT = cast(Node->getOperand(1))->getVT(); EVT VT = Node->getValueType(0); - - if (Subtarget->hasSSE2() && VT.isVector()) { + if (Subtarget->hasXMMInt() && VT.isVector()) { unsigned BitsDiff = VT.getScalarType().getSizeInBits() - ExtraVT.getScalarType().getSizeInBits(); SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); @@ -10000,7 +10143,7 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. // There isn't any reason to disable it if the target processor supports it. - if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { + if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) { SDValue Chain = Op.getOperand(0); SDValue Zero = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { @@ -10054,7 +10197,7 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for // no-sse2). There isn't any reason to disable it if the target processor // supports it. 
- if (Subtarget->hasSSE2() || Subtarget->is64Bit()) + if (Subtarget->hasXMMInt() || Subtarget->is64Bit()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); @@ -10134,7 +10277,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Op.getOperand(0).getValueType(); EVT DstVT = Op.getValueType(); - assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && + assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() && Subtarget->hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && @@ -10254,7 +10397,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); - case ISD::VSETCC: return LowerVSETCC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); @@ -10268,7 +10410,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); - case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); + case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); + case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: return LowerCTLZ(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); @@ -10303,9 +10446,8 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, // FIXME: On 32-bit, load -> fild or movq would be more efficient // (The only way to get a 16-byte load is cmpxchg16b) // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. - SDValue Zero = DAG.getConstant(0, cast(Node)->getMemoryVT()); - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, - cast(Node)->getMemoryVT(), + SDValue Zero = DAG.getConstant(0, VT); + SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, Node->getOperand(0), Node->getOperand(1), Zero, Zero, cast(Node)->getMemOperand(), @@ -10383,37 +10525,48 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::ATOMIC_CMP_SWAP: { EVT T = N->getValueType(0); - assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); + assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); + bool Regs64bit = T == MVT::i128; + EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; - cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), - DAG.getConstant(0, MVT::i32)); - cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), - DAG.getConstant(1, MVT::i32)); - cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); - cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, - cpInL.getValue(1)); + cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), + DAG.getConstant(0, HalfT)); + cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), + DAG.getConstant(1, HalfT)); + cpInL = DAG.getCopyToReg(N->getOperand(0), dl, + Regs64bit ? X86::RAX : X86::EAX, + cpInL, SDValue()); + cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, + Regs64bit ? 
X86::RDX : X86::EDX, + cpInH, cpInL.getValue(1)); SDValue swapInL, swapInH; - swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), - DAG.getConstant(0, MVT::i32)); - swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), - DAG.getConstant(1, MVT::i32)); - swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, - cpInH.getValue(1)); - swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, - swapInL.getValue(1)); + swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), + DAG.getConstant(0, HalfT)); + swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), + DAG.getConstant(1, HalfT)); + swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, + Regs64bit ? X86::RBX : X86::EBX, + swapInL, cpInH.getValue(1)); + swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, + Regs64bit ? X86::RCX : X86::ECX, + swapInH, swapInL.getValue(1)); SDValue Ops[] = { swapInH.getValue(0), N->getOperand(1), swapInH.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(N)->getMemOperand(); - SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, + unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : + X86ISD::LCMPXCHG8_DAG; + SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, 3, T, MMO); - SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, - MVT::i32, Result.getValue(1)); - SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, - MVT::i32, cpOutL.getValue(2)); + SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, + Regs64bit ? X86::RAX : X86::EAX, + HalfT, Result.getValue(1)); + SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, + Regs64bit ? X86::RDX : X86::EDX, + HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); Results.push_back(cpOutH.getValue(1)); return; } @@ -10489,7 +10642,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; case X86ISD::PSIGND: return "X86ISD::PSIGND"; - case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; @@ -10579,6 +10731,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; + case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; } } @@ -10681,7 +10834,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const { // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) - return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); + return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()); // FIXME: pshufb, blends, shifts. 
return (VT.getVectorNumElements() == 2 || @@ -10691,7 +10844,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, isPSHUFDMask(M, VT) || isPSHUFHWMask(M, VT) || isPSHUFLWMask(M, VT) || - isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || + isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) || isUNPCKLMask(M, VT) || isUNPCKHMask(M, VT) || isUNPCKL_v_undef_Mask(M, VT) || @@ -11124,7 +11277,9 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, if (!(Op.isReg() && Op.isImplicit())) MIB.addOperand(Op); } - BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) + BuildMI(*BB, MI, dl, + TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr), + MI->getOperand(0).getReg()) .addReg(X86::XMM0); MI->eraseFromParent(); @@ -11479,6 +11634,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MBB->addSuccessor(EndMBB); } + unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; // In the XMM save block, save all the XMM argument registers. for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; @@ -11487,7 +11643,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); - BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) + BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) .addFrameIndex(RegSaveFrameIndex) .addImm(/*Scale=*/1) .addReg(/*IndexReg=*/0) @@ -11531,17 +11687,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. - const MachineFunction *MF = BB->getParent(); - const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); - BitVector ReservedRegs = TRI->getReservedRegs(*MF); - - for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI->getOperand(I); - if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; - unsigned Reg = MO.getReg(); - if (Reg != X86::EFLAGS) continue; - copy0MBB->addLiveIn(Reg); - sinkMBB->addLiveIn(Reg); + if (!MI->killsRegister(X86::EFLAGS)) { + copy0MBB->addLiveIn(X86::EFLAGS); + sinkMBB->addLiveIn(X86::EFLAGS); } // Transfer the remainder of BB and its successor edges to sinkMBB. @@ -11576,6 +11724,119 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, return sinkMBB; } +MachineBasicBlock * +X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, + bool Is64Bit) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = BB->getParent(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + + assert(EnableSegmentedStacks); + + unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; + unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; + + // BB: + // ... [Till the alloca] + // If stacklet is not large enough, jump to mallocMBB + // + // bumpMBB: + // Allocate by subtracting from RSP + // Jump to continueMBB + // + // mallocMBB: + // Allocate by call to runtime + // + // continueMBB: + // ... + // [rest of original BB] + // + + MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterClass *AddrRegClass = + getRegClassFor(Is64Bit ? 
MVT::i64:MVT::i32); + + unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), + bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), + tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), + sizeVReg = MI->getOperand(1).getReg(), + physSPReg = Is64Bit ? X86::RSP : X86::ESP; + + MachineFunction::iterator MBBIter = BB; + ++MBBIter; + + MF->insert(MBBIter, bumpMBB); + MF->insert(MBBIter, mallocMBB); + MF->insert(MBBIter, continueMBB); + + continueMBB->splice(continueMBB->begin(), BB, llvm::next + (MachineBasicBlock::iterator(MI)), BB->end()); + continueMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Add code to the main basic block to check if the stack limit has been hit, + // and if so, jump to mallocMBB otherwise to bumpMBB. + BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); + BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg) + .addReg(tmpSPVReg).addReg(sizeVReg); + BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) + .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg) + .addReg(tmpSPVReg); + BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); + + // bumpMBB simply decreases the stack pointer, since we know the current + // stacklet has enough space. + BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) + .addReg(tmpSPVReg); + BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) + .addReg(tmpSPVReg); + BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + + // Calls into a routine in libgcc to allocate more space from the heap. + if (Is64Bit) { + BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) + .addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI); + } else { + BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) + .addImm(12); + BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space"); + } + + if (!Is64Bit) + BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) + .addImm(16); + + BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) + .addReg(Is64Bit ? X86::RAX : X86::EAX); + BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + + // Set up the CFG correctly. + BB->addSuccessor(bumpMBB); + BB->addSuccessor(mallocMBB); + mallocMBB->addSuccessor(continueMBB); + bumpMBB->addSuccessor(continueMBB); + + // Take care of the PHI nodes. + BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), + MI->getOperand(0).getReg()) + .addReg(mallocPtrVReg).addMBB(mallocMBB) + .addReg(bumpSPPtrVReg).addMBB(bumpMBB); + + // Delete the original pseudo instruction. + MI->eraseFromParent(); + + // And we're done. 
+ return continueMBB; +} + MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -11711,6 +11972,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return BB; case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); + case X86::SEG_ALLOCA_32: + return EmitLoweredSegAlloca(MI, BB, false); + case X86::SEG_ALLOCA_64: + return EmitLoweredSegAlloca(MI, BB, true); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); @@ -12143,7 +12408,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, // Emit a zeroed vector and insert the desired subvector on its // first half. - SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl); + SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), DAG.getConstant(0, MVT::i32), DAG, dl); return DCI.CombineTo(N, InsV); @@ -12287,7 +12552,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. +/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT +/// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { DebugLoc DL = N->getDebugLoc(); @@ -12300,9 +12566,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // instructions match the semantics of the common C idiom xhasSSE2() && - (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && - Cond.getOpcode() == ISD::SETCC) { + if (Subtarget->hasXMMInt() && Cond.getOpcode() == ISD::SETCC && + (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64 || + LHS.getValueType() == MVT::v4f32 || LHS.getValueType() == MVT::v2f64)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); unsigned Opcode = 0; @@ -12758,7 +13024,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, // all elements are shifted by the same amount. We can't do this in legalize // because the a constant vector is typically transformed to a constant pool // so we have no knowledge of the shift amount. - if (!(Subtarget->hasSSE2() || Subtarget->hasAVX())) + if (!Subtarget->hasXMMInt()) return SDValue(); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) @@ -12874,7 +13140,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but // we're requiring SSE2 for both. 
- if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CMP0 = N0->getOperand(1); @@ -13027,7 +13293,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); // look for psign/blend - if (Subtarget->hasSSSE3()) { + if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { if (VT == MVT::v2i64) { // Canonicalize pandn to RHS if (N0.getOpcode() == X86ISD::ANDNP) @@ -13100,13 +13366,13 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, } } // PBLENDVB only available on SSE 4.1 - if (!Subtarget->hasSSE41()) + if (!(Subtarget->hasSSE41() || Subtarget->hasAVX())) return SDValue(); X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); - Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask); + Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y); return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); } } @@ -13287,7 +13553,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const Function *F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps - && Subtarget->hasSSE2(); + && Subtarget->hasXMMInt(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && isa(St->getValue()) && @@ -13607,6 +13873,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, default: break; case ISD::EXTRACT_VECTOR_ELT: return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); + case ISD::VSELECT: case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG);
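
The VSELECT-based combines above lean on the SSE vector-boolean convention that the constructor change at the top of this patch records via setBooleanVectorContents(ZeroOrNegativeOneBooleanContent) and the vector case of getSetCCResultType: a vector compare yields all-ones (-1) or all-zero lanes of the corresponding integer type, and BLENDV-style selects key off those sign bits. A small standalone check of that convention, not taken from the patch; values are illustrative only:

// Assumes SSE4.1 (compile with -msse4.1).
#include <smmintrin.h>
#include <cstdio>

int main() {
  __m128 a = _mm_setr_ps(1.0f, 4.0f, 3.0f, 8.0f);
  __m128 b = _mm_setr_ps(2.0f, 2.0f, 6.0f, 6.0f);

  // CMPLTPS: each lane is all-ones or all-zero, i.e. -1 or 0 when viewed as
  // v4i32 -- the setcc result type for v4f32 after this patch.
  __m128  lt   = _mm_cmplt_ps(a, b);
  __m128i mask = _mm_castps_si128(lt);

  // BLENDVPS selects per lane from the mask's sign bit, which is the
  // ISD::VSELECT semantics the new 'Legal' table entries and combines rely on.
  __m128 minab = _mm_blendv_ps(b, a, lt);   // a where a < b, else b

  int m[4]; float r[4];
  _mm_storeu_si128((__m128i *)m, mask);
  _mm_storeu_ps(r, minab);
  std::printf("mask = %d %d %d %d\n", m[0], m[1], m[2], m[3]);   // -1 0 -1 0
  std::printf("min  = %g %g %g %g\n", r[0], r[1], r[2], r[3]);   // 1 2 3 6
  return 0;
}
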