From d9d09600ee7ffa5e8fcaf13fa5b37c144831e6c6 Mon Sep 17 00:00:00 2001
From: Michael Liao
Date: Tue, 23 Oct 2012 17:34:00 +0000
Subject: [PATCH] Enable lowering ZERO_EXTEND/ANY_EXTEND to PMOVZX from SSE4.1

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166486 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            | 95 +++++++++++++++++++
 lib/Target/X86/X86ISelLowering.h              |  8 ++
 lib/Target/X86/X86InstrFragmentsSIMD.td       |  8 ++
 lib/Target/X86/X86InstrSSE.td                 | 75 +++++++++++++++
 test/CodeGen/X86/2012-01-18-vbitcast.ll       |  4 +-
 .../CodeGen/X86/2012-03-15-build_vector_wl.ll |  2 +-
 test/CodeGen/X86/2012-07-10-extload64.ll      |  4 +-
 test/CodeGen/X86/pointer-vector.ll            |  5 +-
 test/CodeGen/X86/promote.ll                   |  2 +-
 test/CodeGen/X86/trunc-ext-ld-st.ll           | 15 ++-
 test/CodeGen/X86/vec_compare-2.ll             |  3 +-
 test/CodeGen/X86/widen_load-2.ll              |  2 +-
 12 files changed, 202 insertions(+), 21 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5034cc0330a..3667ff91be3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6562,6 +6562,78 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
                              getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+// Reduce a vector shuffle to zext.
+SDValue
+X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+  // PMOVZX is only available from SSE4.1.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+
+  // Only AVX2 supports 256-bit vector integer extension.
+  if (!Subtarget->hasAVX2() && VT.is256BitVector())
+    return SDValue();
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Extension is a unary operation, and the element type of the source
+  // vector must be smaller than i64.
+  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
+      VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
+  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
+  while ((1 << Shift) < NumElems) {
+    if (SVOp->getMaskElt(1 << Shift) == 1)
+      break;
+    Shift += 1;
+    // The maximal ratio is 8, i.e. from i8 to i64.
+    if (Shift > 3)
+      return SDValue();
+  }
+
+  // Check the shuffle mask.
+  unsigned Mask = (1U << Shift) - 1;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int EltIdx = SVOp->getMaskElt(i);
+    if ((i & Mask) != 0 && EltIdx != -1)
+      return SDValue();
+    if ((i & Mask) == 0 && EltIdx != (i >> Shift))
+      return SDValue();
+  }
+
+  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
+  EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits);
+  EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift);
+
+  if (!isTypeLegal(NVT))
+    return SDValue();
+
+  // Simplify the operand before it is fed into the shuffle.
+  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
+  if (V1.getOpcode() == ISD::BITCAST &&
+      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      V1.getOperand(0)
+        .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
+    // If it's foldable, i.e. a normal load with a single use, let instruction
+    // selection fold it. Otherwise, shorten the conversion sequence.
+    if (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())
+      V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+  }
+
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
+}
+
 SDValue
 X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
@@ -6592,6 +6664,11 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
     return PromoteSplat(SVOp, DAG);
   }
 
+  // Check integer expanding shuffles.
+  SDValue NewOp = lowerVectorIntExtend(Op, DAG);
+  if (NewOp.getNode())
+    return NewOp;
+
   // If the shuffle can be profitably rewritten as a narrower shuffle, then
   // do it!
   if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
@@ -11825,6 +11902,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
+  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
@@ -16529,6 +16608,21 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
   return OptimizeConditionalInDecrement(N, DAG);
 }
 
+/// performVZEXTCombine - Performs VZEXT combines.
+static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  // (vzext (bitcast (vzext x))) -> (vzext x)
+  SDValue In = N->getOperand(0);
+  while (In.getOpcode() == ISD::BITCAST)
+    In = In.getOperand(0);
+
+  if (In.getOpcode() != X86ISD::VZEXT)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), In.getOperand(0));
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -16569,6 +16663,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
   case X86ISD::PALIGN:
   case X86ISD::UNPCKH:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 228fab1689e..af23be6c3c0 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -236,6 +236,12 @@ namespace llvm {
       // VSEXT_MOVL - Vector move low and sign extend.
      VSEXT_MOVL,
 
+      // VZEXT - Vector integer zero-extend.
+      VZEXT,
+
+      // VSEXT - Vector integer sign-extend.
+      VSEXT,
+
       // VFPEXT - Vector FP extend.
       VFPEXT,
 
@@ -832,6 +838,8 @@ namespace llvm {
 
     SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
 
+    SDValue lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const;
+
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 46281efa571..73ba0011df1 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -90,6 +90,14 @@ def X86vsmovl  : SDNode<"X86ISD::VSEXT_MOVL",
 def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 
+def X86vzext   : SDNode<"X86ISD::VZEXT",
+                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                             SDTCisInt<0>, SDTCisInt<1>]>>;
+
+def X86vsext   : SDNode<"X86ISD::VSEXT",
+                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                             SDTCisInt<0>, SDTCisInt<1>]>>;
+
 def X86vfpext  : SDNode<"X86ISD::VFPEXT",
                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                              SDTCisFP<0>, SDTCisFP<1>]>>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index cc1291a8a0f..26f78d7cca4 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5841,6 +5841,81 @@ let Predicates = [UseSSE41] in {
             (PMOVZXBQrm addr:$src)>;
 }
 
+let Predicates = [HasAVX2] in {
+  def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>;
+  def : Pat<(v8i32  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>;
+
+  def : Pat<(v8i32  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>;
+
+  def : Pat<(v4i64  (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>;
+
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVZXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+            (VPMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVZXWQrm addr:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXDQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>;
+
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+            (PMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXWQrm addr:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXDQrm addr:$src)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Extract Instructions
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll
index 8a3ccc8dfda..3ce7db6e413 100644
--- a/test/CodeGen/X86/2012-01-18-vbitcast.ll
+++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll
@@ -2,8 +2,8 @@
 
 ;CHECK: vcast
 define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
-;CHECK: pshufd
-;CHECK: pshufd
+;CHECK: pmovzxdq
+;CHECK: pmovzxdq
   %af = bitcast <2 x float> %a to <2 x i32>
   %bf = bitcast <2 x float> %b to <2 x i32>
   %x = sub <2 x i32> %af, %bf
diff --git a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
index fec17e9f4ac..c4b307e5a5d 100644
--- a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
+++ b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
@@ -4,7 +4,7 @@
 define <4 x i8> @build_vector_again(<16 x i8> %in) nounwind readnone {
 entry:
   %out = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK: shufb
+; CHECK: pmovzxbd
   ret <4 x i8> %out
 ; CHECK: ret
 }
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
index 906b748fa42..4abdded38d8 100644
--- a/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -3,7 +3,7 @@
 ; CHECK: load_store
 define void @load_store(<4 x i16>* %in) {
 entry:
-; CHECK: movsd
+; CHECK: pmovzxwd
   %A27 = load <4 x i16>* %in, align 4
   %A28 = add <4 x i16> %A27, %A27
 ; CHECK: movlpd
@@ -27,6 +27,6 @@ define <2 x i32> @load_64(<2 x i32>* %ptr) {
 BB:
   %t = load <2 x i32>* %ptr
   ret <2 x i32> %t
-;CHECK: movsd
+;CHECK: pmovzxdq
 ;CHECK: ret
 }
diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll
index 800fbedb4f9..58423d19596 100644
--- a/test/CodeGen/X86/pointer-vector.ll
+++ b/test/CodeGen/X86/pointer-vector.ll
@@ -81,8 +81,7 @@ define <4 x i32*> @INT2PTR1(<4 x i8>* %p) nounwind {
 entry:
   %G = load <4 x i8>* %p
 ;CHECK: movl
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxbd
 ;CHECK: pand
   %K = inttoptr <4 x i8> %G to <4 x i32*>
 ;CHECK: ret
@@ -105,7 +104,7 @@ define <2 x i32*> @BITCAST1(<2 x i8*>* %p) nounwind {
 entry:
   %G = load <2 x i8*>* %p
 ;CHECK: movl
-;CHECK: movsd
+;CHECK: pmovzxdq
   %T = bitcast <2 x i8*> %G to <2 x i32*>
 ;CHECK: ret
   ret <2 x i32*> %T
diff --git a/test/CodeGen/X86/promote.ll b/test/CodeGen/X86/promote.ll
index 8b30dc718b0..283f48cd37b 100644
--- a/test/CodeGen/X86/promote.ll
+++ b/test/CodeGen/X86/promote.ll
@@ -20,7 +20,7 @@ entry:
 ; CHECK: shuff_f
 define i32 @shuff_f(<4 x i8>* %A) {
 entry:
-; CHECK: pshufb
+; CHECK: pmovzxbd
 ; CHECK: paddd
 ; CHECK: pshufb
   %0 = load <4 x i8>* %A, align 8
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index 9877d7be169..1d22a185def 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -2,8 +2,7 @@
 
 ;CHECK: load_2_i8
 ; A single 16-bit load
-;CHECK: movzwl
-;CHECK: pshufb
+;CHECK: pmovzxbq
 ;CHECK: paddq
 ;CHECK: pshufb
 ; A single 16-bit store
@@ -19,8 +18,7 @@ define void @load_2_i8(<2 x i8>* %A) {
 
 ;CHECK: load_2_i16
 ; Read 32-bits
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxwq
 ;CHECK: paddq
 ;CHECK: pshufb
 ;CHECK: movd
@@ -33,7 +31,7 @@
 }
 
 ;CHECK: load_2_i32
-;CHECK: pshufd
+;CHECK: pmovzxdq
 ;CHECK: paddq
 ;CHECK: pshufd
 ;CHECK: ret
@@ -45,8 +43,7 @@
 }
 
 ;CHECK: load_4_i8
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxbd
 ;CHECK: paddd
 ;CHECK: pshufb
 ;CHECK: ret
@@ -58,7 +55,7 @@
 }
 
 ;CHECK: load_4_i16
-;CHECK: punpcklwd
+;CHECK: pmovzxwd
 ;CHECK: paddd
 ;CHECK: pshufb
 ;CHECK: ret
@@ -70,7 +67,7 @@
 }
 
 ;CHECK: load_8_i8
-;CHECK: punpcklbw
+;CHECK: pmovzxbw
 ;CHECK: paddw
 ;CHECK: pshufb
 ;CHECK: ret
diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll
index 46d6a23554f..4da79538dbf 100644
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ b/test/CodeGen/X86/vec_compare-2.ll
@@ -10,8 +10,7 @@ define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
 entry:
 ; CHECK: cfi_def_cfa_offset
 ; CHECK-NOT: set
-; CHECK: punpcklwd
-; CHECK: pshufd
+; CHECK: pmovzxwq
 ; CHECK: pshufb
   %shr.i = ashr <4 x i32> zeroinitializer, ; <<4 x i32>> [#uses=1]
   %cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 79aa0005025..224898c1a3e 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -170,7 +170,7 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
 ; CHECK: rot
 %i8vec3pack = type { <3 x i8>, i8 }
 define %i8vec3pack @rot() nounwind {
-; CHECK: movd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
+; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
 entry:
   %X = alloca %i8vec3pack, align 4
   %rot = alloca %i8vec3pack, align 4
-- 
2.34.1
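
For a concrete illustration of what this lowering buys, here is a function in
the same style as the testcases above. It is a minimal sketch, not part of the
commit: the function name and RUN line are assumed (any SSE4.1-capable CPU
such as corei7 works), mirroring the load_4_i16 case from trunc-ext-ld-st.ll.

    ; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
    ; The illegal <4 x i16> add is promoted to <4 x i32>; the element
    ; extension becomes a shuffle that now matches X86ISD::VZEXT, so a
    ; single pmovzxwd is selected where the old lowering went through a
    ; punpcklwd-based sequence.
    ; CHECK: pmovzxwd
    define void @zext_demo(<4 x i16>* %A) {
      %T = load <4 x i16>* %A
      %S = add <4 x i16> %T, %T
      store <4 x i16> %S, <4 x i16>* %A
      ret void
    }

The same shuffle-to-VZEXT match covers the other width combinations handled by
the patterns above (i8 to i16/i32/i64, i16 to i32/i64, i32 to i64), with the
VPMOVZX forms selected under the HasAVX/HasAVX2 predicates.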