X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;ds=sidebyside;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=d2b7690489380289230cdc4b6b1a6b6558971fa8;hb=726ebd6ff3178499e8455ce8433a4310badefe26;hp=d229592587d7a5fd02fe2989c0c2acfdf758751b;hpb=2eb4c2bcadfbef9d3c4e2fbb6478ed5dc3d65524;p=oota-llvm.git diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d229592587d..d2b76904893 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -168,8 +168,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) : TargetLowering(TM, createTLOF(TM)) { Subtarget = &TM.getSubtarget(); - X86ScalarSSEf64 = Subtarget->hasXMMInt(); - X86ScalarSSEf32 = Subtarget->hasXMM(); + X86ScalarSSEf64 = Subtarget->hasXMMInt() || Subtarget->hasAVX(); + X86ScalarSSEf32 = Subtarget->hasXMM() || Subtarget->hasAVX(); X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; RegInfo = TM.getRegisterInfo(); @@ -883,7 +883,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); } - if (Subtarget->hasSSE41()) { + if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); @@ -922,10 +922,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } - if (Subtarget->hasSSE2()) { + if (Subtarget->hasSSE2() || Subtarget->hasAVX()) { setOperationAction(ISD::SRL, MVT::v2i64, Custom); setOperationAction(ISD::SRL, MVT::v4i32, Custom); setOperationAction(ISD::SRL, MVT::v16i8, Custom); + setOperationAction(ISD::SRL, MVT::v8i16, Custom); setOperationAction(ISD::SHL, MVT::v2i64, Custom); setOperationAction(ISD::SHL, MVT::v4i32, Custom); @@ -935,7 +936,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRA, MVT::v8i16, Custom); } - if (Subtarget->hasSSE42()) + if (Subtarget->hasSSE42() || Subtarget->hasAVX()) setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); if (!UseSoftFloat && Subtarget->hasAVX()) { @@ -966,6 +967,36 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom); + + setOperationAction(ISD::SRL, MVT::v4i64, Custom); + setOperationAction(ISD::SRL, MVT::v8i32, Custom); + setOperationAction(ISD::SRL, MVT::v16i16, Custom); + setOperationAction(ISD::SRL, MVT::v32i8, Custom); + + setOperationAction(ISD::SHL, MVT::v4i64, Custom); + setOperationAction(ISD::SHL, MVT::v8i32, Custom); + setOperationAction(ISD::SHL, MVT::v16i16, Custom); + setOperationAction(ISD::SHL, MVT::v32i8, Custom); + + setOperationAction(ISD::SRA, MVT::v8i32, Custom); + setOperationAction(ISD::SRA, MVT::v16i16, Custom); + + setOperationAction(ISD::VSETCC, MVT::v32i8, Custom); + setOperationAction(ISD::VSETCC, MVT::v16i16, Custom); + setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); + setOperationAction(ISD::VSETCC, MVT::v4i64, Custom); + + setOperationAction(ISD::SELECT, MVT::v4f64, Custom); + setOperationAction(ISD::SELECT, MVT::v4i64, Custom); + setOperationAction(ISD::SELECT, MVT::v8f32, Custom); // Custom lower several nodes for 256-bit types. for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; @@ -2724,6 +2755,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VPERMILPSY: case X86ISD::VPERMILPD: case X86ISD::VPERMILPDY: + case X86ISD::VPERM2F128: return true; } return false; @@ -2766,6 +2798,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::PALIGN: case X86ISD::SHUFPD: case X86ISD::SHUFPS: + case X86ISD::VPERM2F128: return DAG.getNode(Opc, dl, VT, V1, V2, DAG.getConstant(TargetMask, MVT::i8)); } @@ -2996,6 +3029,17 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } +/// isUndefOrInRange - Return true if every element in Mask, begining +/// from position Pos and ending in Pos+Size, falls within the specified +/// range (L, L+Pos]. or is undef. +static bool isUndefOrInRange(const SmallVectorImpl &Mask, + int Pos, int Size, int Low, int Hi) { + for (int i = Pos, e = Pos+Size; i != e; ++i) + if (!isUndefOrInRange(Mask[i], Low, Hi)) + return false; + return true; +} + /// isUndefOrEqual - Val is either less than zero (undef) or equal to the /// specified value. static bool isUndefOrEqual(int Val, int CmpVal) { @@ -3004,6 +3048,17 @@ static bool isUndefOrEqual(int Val, int CmpVal) { return false; } +/// isSequentialOrUndefInRange - Return true if every element in Mask, begining +/// from position Pos and ending in Pos+Size, falls within the specified +/// sequential range (L, L+Pos]. or is undef. +static bool isSequentialOrUndefInRange(const SmallVectorImpl &Mask, + int Pos, int Size, int Low) { + for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low) + if (!isUndefOrEqual(Mask[i], Low)) + return false; + return true; +} + /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference /// the second operand. @@ -3159,7 +3214,13 @@ static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVHLPS. bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { - if (N->getValueType(0).getVectorNumElements() != 4) + EVT VT = N->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + if (VT.getSizeInBits() != 128) + return false; + + if (NumElems != 4) return false; // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 @@ -3173,15 +3234,19 @@ bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, /// <2, 3, 2, 3> bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { - unsigned NumElems = N->getValueType(0).getVectorNumElements(); + EVT VT = N->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + if (VT.getSizeInBits() != 128) + return false; if (NumElems != 4) return false; return isUndefOrEqual(N->getMaskElt(0), 2) && - isUndefOrEqual(N->getMaskElt(1), 3) && - isUndefOrEqual(N->getMaskElt(2), 2) && - isUndefOrEqual(N->getMaskElt(3), 3); + isUndefOrEqual(N->getMaskElt(1), 3) && + isUndefOrEqual(N->getMaskElt(2), 2) && + isUndefOrEqual(N->getMaskElt(3), 3); } /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand @@ -3405,6 +3470,67 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) { return ::isMOVLMask(M, N->getValueType(0)); } +/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered +/// as permutations between 128-bit chunks or halves. As an example: this +/// shuffle bellow: +/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> +/// The first half comes from the second half of V1 and the second half from the +/// the second half of V2. +static bool isVPERM2F128Mask(const SmallVectorImpl &Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) + return false; + + // The shuffle result is divided into half A and half B. In total the two + // sources have 4 halves, namely: C, D, E, F. The final values of A and + // B must come from C, D, E or F. + int HalfSize = VT.getVectorNumElements()/2; + bool MatchA = false, MatchB = false; + + // Check if A comes from one of C, D, E, F. + for (int Half = 0; Half < 4; ++Half) { + if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { + MatchA = true; + break; + } + } + + // Check if B comes from one of C, D, E, F. + for (int Half = 0; Half < 4; ++Half) { + if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { + MatchB = true; + break; + } + } + + return MatchA && MatchB; +} + +/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERM2F128 instructions. +static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { + ShuffleVectorSDNode *SVOp = cast(N); + EVT VT = SVOp->getValueType(0); + + int HalfSize = VT.getVectorNumElements()/2; + + int FstHalf = 0, SndHalf = 0; + for (int i = 0; i < HalfSize; ++i) { + if (SVOp->getMaskElt(i) > 0) { + FstHalf = SVOp->getMaskElt(i)/HalfSize; + break; + } + } + for (int i = HalfSize; i < HalfSize*2; ++i) { + if (SVOp->getMaskElt(i) > 0) { + SndHalf = SVOp->getMaskElt(i)/HalfSize; + break; + } + } + + return (FstHalf | (SndHalf << 4)); +} + /// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. /// Note that VPERMIL mask matching is different depending whether theunderlying @@ -3465,10 +3591,16 @@ static bool isVPERMILPSMask(const SmallVectorImpl &Mask, EVT VT, return false; // The mask on the high lane should be the same as the low. Actually, - // they can differ if any of the corresponding index in a lane is undef. + // they can differ if any of the corresponding index in a lane is undef + // and the other stays in range. int LaneSize = NumElts/NumLanes; for (int i = 0; i < LaneSize; ++i) { int HighElt = i+LaneSize; + bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts); + bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize); + + if (!HighValid || !LowValid) + return false; if (Mask[i] < 0 || Mask[HighElt] < 0) continue; if (Mask[HighElt]-Mask[i] != LaneSize) @@ -3486,13 +3618,22 @@ static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits()/128; + int LaneSize = NumElts/NumLanes; + // Although the mask is equal for both lanes do it twice to get the cases + // where a mask will match because the same mask element is undef on the + // first half but valid on the second. This would get pathological cases + // such as: shuffle , which is completely valid. unsigned Mask = 0; - for (int i = 0; i < NumElts/NumLanes /* lane size */; ++i) { - int MaskElt = SVOp->getMaskElt(i); - if (MaskElt < 0) - continue; - Mask |= MaskElt << (i*2); + for (int l = 0; l < NumLanes; ++l) { + for (int i = 0; i < LaneSize; ++i) { + int MaskElt = SVOp->getMaskElt(i+(l*LaneSize)); + if (MaskElt < 0) + continue; + if (MaskElt >= LaneSize) + MaskElt -= LaneSize; + Mask |= MaskElt << (i*2); + } } return Mask; @@ -3809,7 +3950,10 @@ static void CommuteVectorShuffleMask(SmallVectorImpl &Mask, EVT VT) { /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { - if (Op->getValueType(0).getVectorNumElements() != 4) + EVT VT = Op->getValueType(0); + if (VT.getSizeInBits() != 128) + return false; + if (VT.getVectorNumElements() != 4) return false; for (unsigned i = 0, e = 2; i != e; ++i) if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) @@ -3841,6 +3985,10 @@ static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { /// MOVLP, it must be either a vector load or a scalar load to vector. static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, ShuffleVectorSDNode *Op) { + EVT VT = Op->getValueType(0); + if (VT.getSizeInBits() != 128) + return false; + if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) return false; // Is V2 is a vector load, don't do this transformation. We will try to use @@ -3848,7 +3996,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, if (ISD::isNON_EXTLoad(V2)) return false; - unsigned NumElems = Op->getValueType(0).getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 2 && NumElems != 4) return false; @@ -4012,11 +4160,11 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -// PromoteSplatv8v16 - All i16 and i8 vector types can't be used directly by +// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by // a generic shuffle instruction because the target has no such instructions. // Generate shuffles which repeat i16 and i8 several times until they can be // represented by v4f32 and then be manipulated by target suported shuffles. -static SDValue PromoteSplatv8v16(SDValue V, SelectionDAG &DAG, int &EltNo) { +static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { EVT VT = V.getValueType(); int NumElems = VT.getVectorNumElements(); DebugLoc dl = V.getDebugLoc(); @@ -4059,36 +4207,7 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { return DAG.getNode(ISD::BITCAST, dl, VT, V); } -/// PromoteVectorToScalarSplat - Since there's no native support for -/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector + -/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the -/// shuffle before the insertion, this yields less instructions in the end. -static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV, - SelectionDAG &DAG) { - EVT SrcVT = SV->getValueType(0); - SDValue V1 = SV->getOperand(0); - DebugLoc dl = SV->getDebugLoc(); - int NumElems = SrcVT.getVectorNumElements(); - - assert(SrcVT.is256BitVector() && "unknown howto handle vector type"); - - SmallVector Mask; - for (int i = 0; i < NumElems/2; ++i) - Mask.push_back(SV->getMaskElt(i)); - - EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), - NumElems/2); - SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1), - DAG.getUNDEF(SVT), &Mask[0]); - SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1, - DAG.getConstant(0, MVT::i32), DAG, dl); - - return Insert128BitVector(InsV, SV1, - DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); -} - -/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and -/// v8i32, v16i16 or v32i8 to v8f32. +/// PromoteSplat - Splat is promoted to target supported vector shuffles. static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { EVT SrcVT = SV->getValueType(0); SDValue V1 = SV->getOperand(0); @@ -4107,9 +4226,14 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { EltNo -= NumElems/2; } - // Make this 128-bit vector duplicate i8 and i16 elements - if (NumElems > 4) - V1 = PromoteSplatv8v16(V1, DAG, EltNo); + // All i16 and i8 vector types can't be used directly by a generic shuffle + // instruction because the target has no such instruction. Generate shuffles + // which repeat i16 and i8 several times until they fit in i32, and then can + // be manipulated by target suported shuffles. After the insertion of the + // necessary shuffles, the result is bitcasted back to v4f32 or v8f32. + EVT EltVT = SrcVT.getVectorElementType(); + if (NumElems > 4 && (EltVT == MVT::i8 || EltVT == MVT::i16)) + V1 = PromoteSplati8i16(V1, DAG, EltNo); // Recreate the 256-bit vector and place the same 128-bit vector // into the low and high part. This is necessary because we want @@ -4255,6 +4379,11 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, DecodeVPERMILPDMask(4, cast(ImmN)->getZExtValue(), ShuffleMask); break; + case X86ISD::VPERM2F128: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERM2F128Mask(VT, cast(ImmN)->getZExtValue(), + ShuffleMask); + break; default: assert("not implemented for target shuffle node"); return SDValue(); @@ -4536,42 +4665,52 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, return SDValue(); } + // FIXME: 256-bit vector instructions don't require a strict alignment, + // improve this code to support it better. + unsigned RequiredAlign = VT.getSizeInBits()/8; SDValue Chain = LD->getChain(); - // Make sure the stack object alignment is at least 16. + // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - if (DAG.InferPtrAlignment(Ptr) < 16) { + if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { if (MFI->isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { - MFI->setObjectAlignment(FI, 16); + MFI->setObjectAlignment(FI, RequiredAlign); } } - // (Offset % 16) must be multiple of 4. Then address is then + // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). if (Offset < 0) return SDValue(); - if ((Offset % 16) & 3) + if ((Offset % RequiredAlign) & 3) return SDValue(); - int64_t StartOffset = Offset & ~15; + int64_t StartOffset = Offset & ~(RequiredAlign-1); if (StartOffset) Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); int EltNo = (Offset - StartOffset) >> 2; - int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; - EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; - SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, + int NumElems = VT.getVectorNumElements(); + + EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32; + EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); + SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset), false, false, 0); - // Canonicalize it to a v4i32 shuffle. - V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getVectorShuffle(MVT::v4i32, dl, V1, - DAG.getUNDEF(MVT::v4i32),&Mask[0])); + + // Canonicalize it to a v4i32 or v8i32 shuffle. + SmallVector Mask; + for (int i = 0; i < NumElems; ++i) + Mask.push_back(EltNo); + + V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1); + return DAG.getNode(ISD::BITCAST, dl, NVT, + DAG.getVectorShuffle(CanonVT, dl, V1, + DAG.getUNDEF(CanonVT),&Mask[0])); } return SDValue(); @@ -4651,24 +4790,26 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT ExtVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); - // All zero's: - // - pxor (SSE2), xorps (SSE1), vpxor (128 AVX), xorp[s|d] (256 AVX) - // All one's: - // - pcmpeqd (SSE2 and 128 AVX), fallback to constant pools (256 AVX) - if (ISD::isBuildVectorAllZeros(Op.getNode()) || - ISD::isBuildVectorAllOnes(Op.getNode())) { - // Canonicalize this to <4 x i32> or <8 x 32> (SSE) to - // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are - // eliminated on x86-32 hosts. + // Vectors containing all zeros can be matched by pxor and xorps later + if (ISD::isBuildVectorAllZeros(Op.getNode())) { + // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd + // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v8i32) return Op; - if (ISD::isBuildVectorAllOnes(Op.getNode())) - return getOnesVector(Op.getValueType(), DAG, dl); return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); } + // Vectors containing all ones can be matched by pcmpeqd on 128-bit width + // vectors or broken into v4i32 operations on 256-bit vectors. + if (ISD::isBuildVectorAllOnes(Op.getNode())) { + if (Op.getValueType() == MVT::v4i32) + return Op; + + return getOnesVector(Op.getValueType(), DAG, dl); + } + unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; @@ -4960,13 +5101,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue -X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { - // We support concatenate two MMX registers and place them in a MMX - // register. This is better than doing a stack convert. +// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place +// them in a MMX register. This is better than doing a stack convert. +static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); EVT ResVT = Op.getValueType(); - assert(Op.getNumOperands() == 2); + assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || ResVT == MVT::v8i16 || ResVT == MVT::v16i8); int Mask[2]; @@ -4987,6 +5127,42 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); } +// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction +// to create 256-bit vectors from two other 128-bit ones. +static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); + EVT ResVT = Op.getValueType(); + + assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide"); + + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + unsigned NumElems = ResVT.getVectorNumElements(); + + SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1, + DAG.getConstant(0, MVT::i32), DAG, dl); + return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); +} + +SDValue +X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { + EVT ResVT = Op.getValueType(); + + assert(Op.getNumOperands() == 2); + assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) && + "Unsupported CONCAT_VECTORS for value type"); + + // We support concatenate two MMX registers and place them in a MMX register. + // This is better than doing a stack convert. + if (ResVT.is128BitVector()) + return LowerMMXCONCAT_VECTORS(Op, DAG); + + // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors + // from two other 128-bit ones. + return LowerAVXCONCAT_VECTORS(Op, DAG); +} + // v8i16 shuffles - Prefer shuffles in the following order: // 1. [all] pshuflw, pshufhw, optional move // 2. [ssse3] 1 x pshufb @@ -5478,10 +5654,95 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, OpVT, SrcOp))); } +/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector +/// shuffle node referes to only one lane in the sources. +static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); + int HalfSize = NumElems/2; + SmallVector M; + SVOp->getMask(M); + bool MatchA = false, MatchB = false; + + for (int l = 0; l < NumElems*2; l += HalfSize) { + if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) { + MatchA = true; + break; + } + } + + for (int l = 0; l < NumElems*2; l += HalfSize) { + if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) { + MatchB = true; + break; + } + } + + return MatchA && MatchB; +} + /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles /// which could not be matched by any known target speficic shuffle static SDValue LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { + if (areShuffleHalvesWithinDisjointLanes(SVOp)) { + // If each half of a vector shuffle node referes to only one lane in the + // source vectors, extract each used 128-bit lane and shuffle them using + // 128-bit shuffles. Then, concatenate the results. Otherwise leave + // the work to the legalizer. + DebugLoc dl = SVOp->getDebugLoc(); + EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); + int HalfSize = NumElems/2; + + // Extract the reference for each half + int FstVecExtractIdx = 0, SndVecExtractIdx = 0; + int FstVecOpNum = 0, SndVecOpNum = 0; + for (int i = 0; i < HalfSize; ++i) { + int Elt = SVOp->getMaskElt(i); + if (SVOp->getMaskElt(i) < 0) + continue; + FstVecOpNum = Elt/NumElems; + FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; + break; + } + for (int i = HalfSize; i < NumElems; ++i) { + int Elt = SVOp->getMaskElt(i); + if (SVOp->getMaskElt(i) < 0) + continue; + SndVecOpNum = Elt/NumElems; + SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; + break; + } + + // Extract the subvectors + SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum), + DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl); + SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum), + DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl); + + // Generate 128-bit shuffles + SmallVector MaskV1, MaskV2; + for (int i = 0; i < HalfSize; ++i) { + int Elt = SVOp->getMaskElt(i); + MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize); + } + for (int i = HalfSize; i < NumElems; ++i) { + int Elt = SVOp->getMaskElt(i); + MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize); + } + + EVT NVT = V1.getValueType(); + V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]); + V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]); + + // Concatenate the result back + SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1, + DAG.getConstant(0, MVT::i32), DAG, dl); + return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); + } + return SDValue(); } @@ -5846,7 +6107,9 @@ static inline unsigned getUNPCKLOpcode(EVT VT) { case MVT::v2i64: return X86ISD::PUNPCKLQDQ; case MVT::v4f32: return X86ISD::UNPCKLPS; case MVT::v2f64: return X86ISD::UNPCKLPD; + case MVT::v8i32: // Use fp unit for int unpack. case MVT::v8f32: return X86ISD::VUNPCKLPSY; + case MVT::v4i64: // Use fp unit for int unpack. case MVT::v4f64: return X86ISD::VUNPCKLPDY; case MVT::v16i8: return X86ISD::PUNPCKLBW; case MVT::v8i16: return X86ISD::PUNPCKLWD; @@ -5862,7 +6125,9 @@ static inline unsigned getUNPCKHOpcode(EVT VT) { case MVT::v2i64: return X86ISD::PUNPCKHQDQ; case MVT::v4f32: return X86ISD::UNPCKHPS; case MVT::v2f64: return X86ISD::UNPCKHPD; + case MVT::v8i32: // Use fp unit for int unpack. case MVT::v8f32: return X86ISD::VUNPCKHPSY; + case MVT::v4i64: // Use fp unit for int unpack. case MVT::v4f64: return X86ISD::VUNPCKHPDY; case MVT::v16i8: return X86ISD::PUNPCKHBW; case MVT::v8i16: return X86ISD::PUNPCKHWD; @@ -5888,6 +6153,48 @@ static inline unsigned getVPERMILOpcode(EVT VT) { return 0; } +/// isVectorBroadcast - Check if the node chain is suitable to be xformed to +/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming +/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded. +static bool isVectorBroadcast(SDValue &Op) { + EVT VT = Op.getValueType(); + bool Is256 = VT.getSizeInBits() == 256; + + assert((VT.getSizeInBits() == 128 || Is256) && + "Unsupported type for vbroadcast node"); + + SDValue V = Op; + if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + if (Is256 && !(V.hasOneUse() && + V.getOpcode() == ISD::INSERT_SUBVECTOR && + V.getOperand(0).getOpcode() == ISD::UNDEF)) + return false; + + if (Is256) + V = V.getOperand(1); + if (V.hasOneUse() && V.getOpcode() != ISD::SCALAR_TO_VECTOR) + return false; + + // Check the source scalar_to_vector type. 256-bit broadcasts are + // supported for 32/64-bit sizes, while 128-bit ones are only supported + // for 32-bit scalars. + unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits(); + if (ScalarSize != 32 && ScalarSize != 64) + return false; + if (!Is256 && ScalarSize == 64) + return false; + + V = V.getOperand(0); + if (!MayFoldLoad(V)) + return false; + + // Return the load node + Op = V; + return true; +} + static SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI, @@ -5911,26 +6218,15 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) return Op; - // Since there's no native support for scalar_to_vector for 256-bit AVX, a - // 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this - // idiom and do the shuffle before the insertion, this yields less - // instructions in the end. - if (VT.is256BitVector() && - V1.getOpcode() == ISD::INSERT_SUBVECTOR && - V1.getOperand(0).getOpcode() == ISD::UNDEF && - V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR) - return PromoteVectorToScalarSplat(SVOp, DAG); + // Use vbroadcast whenever the splat comes from a foldable load + if (Subtarget->hasAVX() && isVectorBroadcast(V1)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1); // Handle splats by matching through known shuffle masks - if ((VT.is128BitVector() && NumElem <= 4) || - (VT.is256BitVector() && NumElem <= 8)) + if (VT.is128BitVector() && NumElem <= 4) return SDValue(); - // All i16 and i8 vector types can't be used directly by a generic shuffle - // instruction because the target has no such instruction. Generate shuffles - // which repeat i16 and i8 several times until they fit in i32, and then can - // be manipulated by target suported shuffles. After the insertion of the - // necessary shuffles, the result is bitcasted back to v4f32 or v8f32. + // All remaning splats are promoted to target supported vector shuffles. return PromoteSplat(SVOp, DAG); } @@ -6223,6 +6519,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, getShuffleVPERMILPDImmediate(SVOp), DAG); + // Handle VPERM2F128 permutations + if (isVPERM2F128Mask(M, VT, Subtarget)) + return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2, + getShuffleVPERM2F128Immediate(SVOp), DAG); + //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, // lower it into other known shuffles. FIXME: this isn't true yet, but @@ -6653,12 +6954,17 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { CodeModel::Model M = getTargetMachine().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && - (M == CodeModel::Small || M == CodeModel::Kernel)) + (M == CodeModel::Small || M == CodeModel::Kernel)) { + if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) + OpFlag = X86II::MO_GOTPCREL; WrapperKind = X86ISD::WrapperRIP; - else if (Subtarget->isPICStyleGOT()) - OpFlag = X86II::MO_GOTOFF; - else if (Subtarget->isPICStyleStubPIC()) - OpFlag = X86II::MO_PIC_BASE_OFFSET; + } else if (Subtarget->isPICStyleGOT()) { + OpFlag = X86II::MO_GOT; + } else if (Subtarget->isPICStyleStubPIC()) { + OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; + } else if (Subtarget->isPICStyleStubNoDynamic()) { + OpFlag = X86II::MO_DARWIN_NONLAZY; + } SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); @@ -6675,6 +6981,12 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { Result); } + // For symbols that require a load from a stub to get the address, emit the + // load. + if (isGlobalStubReference(OpFlag)) + Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(), false, false, 0); + return Result; } @@ -7170,9 +7482,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, // Load the 32-bit value into an XMM register. SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Op.getOperand(0), - DAG.getIntPtrConstant(0))); + Op.getOperand(0)); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), @@ -7811,6 +8121,39 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(X86CC, MVT::i8), EFLAGS); } +// Lower256IntVETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 +// ones, and then concatenate the result back. +static SDValue Lower256IntVETCC(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::VSETCC && + "Unsupported value type for operation"); + + int NumElems = VT.getVectorNumElements(); + DebugLoc dl = Op.getDebugLoc(); + SDValue CC = Op.getOperand(2); + SDValue Idx0 = DAG.getConstant(0, MVT::i32); + SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + + // Extract the RHS vectors + SDValue RHS = Op.getOperand(1); + SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); + + // Issue the operation on the smaller types and concatenate the result back + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); +} + + SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue Cond; SDValue Op0 = Op.getOperand(0); @@ -7823,9 +8166,10 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (isFP) { unsigned SSECC = 8; - EVT VT0 = Op0.getValueType(); - assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); - unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; + EVT EltVT = Op0.getValueType().getVectorElementType(); + assert(EltVT == MVT::f32 || EltVT == MVT::f64); + + unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD; bool Swap = false; switch (SetCCOpcode) { @@ -7872,6 +8216,10 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); } + // Break 256-bit integer vector compare into smaller ones. + if (!isFP && VT.getSizeInBits() == 256) + return Lower256IntVETCC(Op, DAG); + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. @@ -9120,11 +9468,51 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); - LLVMContext *Context = DAG.getContext(); - // Must have SSE2. - if (!Subtarget->hasSSE2()) return SDValue(); + if (!(Subtarget->hasSSE2() || Subtarget->hasAVX())) + return SDValue(); + + // Decompose 256-bit shifts into smaller 128-bit shifts. + if (VT.getSizeInBits() == 256) { + int NumElems = VT.getVectorNumElements(); + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + // Extract the two vectors + SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); + SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); + + // Recreate the shift amount vectors + SDValue Amt1, Amt2; + if (Amt.getOpcode() == ISD::BUILD_VECTOR) { + // Constant shift amount + SmallVector Amt1Csts; + SmallVector Amt2Csts; + for (int i = 0; i < NumElems/2; ++i) + Amt1Csts.push_back(Amt->getOperand(i)); + for (int i = NumElems/2; i < NumElems; ++i) + Amt2Csts.push_back(Amt->getOperand(i)); + + Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, + &Amt1Csts[0], NumElems/2); + Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, + &Amt2Csts[0], NumElems/2); + } else { + // Variable shift amount + Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl); + Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); + } + + // Issue new vector shifts for the smaller types + V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); + V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); + + // Concatenate the result back + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); + } // Optimize shl/srl/sra with constant shift amount. if (isSplatVector(Amt.getNode())) { @@ -9175,9 +9563,6 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { } // Lower SHL with variable shift amount. - // Cannot lower SHL without SSE2 or later. - if (!Subtarget->hasSSE2()) return SDValue(); - if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), @@ -9540,7 +9925,9 @@ SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { Node->getOperand(0), Node->getOperand(1), negOp, cast(Node)->getSrcValue(), - cast(Node)->getAlignment()); + cast(Node)->getAlignment(), + cast(Node)->getOrdering(), + cast(Node)->getSynchScope()); } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { @@ -9893,10 +10280,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; + case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; + case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -11042,6 +11431,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::CMOV_V4F32: case X86::CMOV_V2F64: case X86::CMOV_V2I64: + case X86::CMOV_V8F32: + case X86::CMOV_V4F64: + case X86::CMOV_V4I64: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: @@ -11390,6 +11782,38 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, return TargetLowering::isGAPlusOffset(N, GA, Offset); } +/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the +/// same as extracting the high 128-bit part of 256-bit vector and then +/// inserting the result into the low part of a new 256-bit vector +static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); + + // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j) + if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || + SVOp->getMaskElt(j) >= 0) + return false; + + return true; +} + +/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the +/// same as extracting the low 128-bit part of 256-bit vector and then +/// inserting the result into the high part of a new 256-bit vector +static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); + + // vector_shuffle or + for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j) + if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || + SVOp->getMaskElt(j) >= 0) + return false; + + return true; +} + /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { @@ -11398,6 +11822,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && V2.getOpcode() == ISD::CONCAT_VECTORS) { @@ -11422,7 +11847,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, // To match the shuffle mask, the first half of the mask should // be exactly the first vector, and all the rest a splat with the // first element of the second one. - int NumElems = VT.getVectorNumElements(); for (int i = 0; i < NumElems/2; ++i) if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) @@ -11436,12 +11860,34 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, InsV); } + //===--------------------------------------------------------------------===// + // Combine some shuffles into subvector extracts and inserts: + // + + // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + if (isShuffleHigh128VectorInsertLow(SVOp)) { + SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); + SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), + V, DAG.getConstant(0, MVT::i32), DAG, dl); + return DCI.CombineTo(N, InsV); + } + + // vector_shuffle or + if (isShuffleLow128VectorInsertHigh(SVOp)) { + SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl); + SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), + V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); + return DCI.CombineTo(N, InsV); + } + return SDValue(); } /// PerformShuffleCombine - Performs several different shuffle combines. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); @@ -11450,8 +11896,9 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) return SDValue(); - // Only handle pure VECTOR_SHUFFLE nodes. - if (VT.getSizeInBits() == 256 && N->getOpcode() == ISD::VECTOR_SHUFFLE) + // Combine 256-bit vector shuffles. This is only profitable when in AVX mode + if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 && + N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI); // Only handle 128 wide vector from here on. @@ -11609,7 +12056,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. if (!UnsafeFPMath && - !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(LHS)) + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMAX; break; @@ -12022,7 +12469,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, // all elements are shifted by the same amount. We can't do this in legalize // because the a constant vector is typically transformed to a constant pool // so we have no knowledge of the shift amount. - if (!Subtarget->hasSSE2()) + if (!(Subtarget->hasSSE2() || Subtarget->hasAVX())) return SDValue(); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) @@ -12221,16 +12668,17 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) { // Sometimes the operand may come from a insert_subvector building a 256-bit // allones vector - SDValue V1 = N->getOperand(0); - SDValue V2 = N->getOperand(1); - if (VT.getSizeInBits() == 256 && - N->getOpcode() == ISD::INSERT_SUBVECTOR && - V1.getOpcode() == ISD::INSERT_SUBVECTOR && - V1.getOperand(0).getOpcode() == ISD::UNDEF && - ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && - ISD::isBuildVectorAllOnes(V2.getNode())) - return true; + N->getOpcode() == ISD::INSERT_SUBVECTOR) { + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + + if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && + V1.getOperand(0).getOpcode() == ISD::UNDEF && + ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && + ISD::isBuildVectorAllOnes(V2.getNode())) + return true; + } return false; } @@ -12433,14 +12881,117 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { + StoreSDNode *St = cast(N); + EVT VT = St->getValue().getValueType(); + EVT StVT = St->getMemoryVT(); + DebugLoc dl = St->getDebugLoc(); + SDValue StoredVal = St->getOperand(1); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // If we are saving a concatination of two XMM registers, perform two stores. + // This is better in Sandy Bridge cause one 256-bit mem op is done via two + // 128-bit ones. If in the future the cost becomes only one memory access the + // first version would be better. + if (VT.getSizeInBits() == 256 && + StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && + StoredVal.getNumOperands() == 2) { + + SDValue Value0 = StoredVal.getOperand(0); + SDValue Value1 = StoredVal.getOperand(1); + + SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); + SDValue Ptr0 = St->getBasePtr(); + SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); + + SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); + } + + // Optimize trunc store (of multiple scalars) to shuffle and store. + // First, pack all of the elements in one place. Next, store to memory + // in fewer chunks. + if (St->isTruncatingStore() && VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); + // We are going to use the original vector elt for storing. + // accumulated smaller vector elements must be a multiple of bigger size. + if (0 != (NumElems * ToSz) % FromSz) return SDValue(); + unsigned SizeRatio = FromSz / ToSz; + + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type + if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVec.getValueType()), + ShuffleVec.data()); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. + + // Find the largest store unit + MVT StoreType = MVT::i8; + for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; + tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { + MVT Tp = (MVT::SimpleValueType)tp; + if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz) + StoreType = Tp; + } + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); + SmallVector Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, + TLI.getPointerTy()); + SDValue Ptr = St->getBasePtr(); + + // Perform one or more big stores into memory. + for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + StoreType, ShuffWide, + DAG.getIntPtrConstant(i)); + SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + Chains.push_back(Ch); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], + Chains.size()); + } + + // Turn load->store of MMX types into GPR load/stores. This avoids clobbering // the FP state in cases where an emms may be missing. // A preferable solution to the general problem is to figure out the right // places to insert EMMS. This qualifies as a quick hack. // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. - StoreSDNode *St = cast(N); - EVT VT = St->getValue().getValueType(); if (VT.getSizeInBits() != 64) return SDValue(); @@ -12742,20 +13293,18 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) { // X86 can't encode an immediate LHS of a sub. See if we can push the // negation into a preceding instruction. if (ConstantSDNode *C = dyn_cast(Op0)) { - uint64_t Op0C = C->getSExtValue(); - // If the RHS of the sub is a XOR with one use and a constant, invert the // immediate. Then add one to the LHS of the sub so we can turn // X-Y -> X+~Y+1, saving one register. if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && isa(Op1.getOperand(1))) { - uint64_t XorC = cast(Op1.getOperand(1))->getSExtValue(); + APInt XorC = cast(Op1.getOperand(1))->getAPIntValue(); EVT VT = Op0.getValueType(); SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT, Op1.getOperand(0), DAG.getConstant(~XorC, VT)); return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor, - DAG.getConstant(Op0C+1, VT)); + DAG.getConstant(C->getAPIntValue()+1, VT)); } } @@ -12819,7 +13368,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERMILPSY: case X86ISD::VPERMILPD: case X86ISD::VPERMILPDY: - case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI); + case X86ISD::VPERM2F128: + case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); } return SDValue();