From: Bruno Cardoso Lopes
Date: Mon, 25 Jul 2011 23:05:25 +0000 (+0000)
Subject: - Handle special scalar_to_vector case: splats. Using a native 128-bit
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=6a32adc4e5e9aae2c6d5d2192ae0c0a01858d8c2;p=oota-llvm.git

- Handle a special scalar_to_vector case: splats. Use a native 128-bit
  shuffle before inserting into a 256-bit vector.

- Add AVX versions of the movd/movq instructions.

- Introduce a few COPY patterns to match insert_subvector instructions.
  This turns a trivial insert_subvector instruction into a register copy,
  coalescing the xmm into a ymm and avoiding the emission of one more
  instruction.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136002 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0b5ac6a2519..2f74c0fdd46 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3955,6 +3955,34 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
   return DAG.getNode(ISD::BITCAST, dl, VT, V);
 }
 
+/// PromoteVectorToScalarSplat - Since there's no native support for
+/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector +
+/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the
+/// shuffle before the insertion; this yields fewer instructions in the end.
+static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV,
+                                          SelectionDAG &DAG) {
+  EVT SrcVT = SV->getValueType(0);
+  SDValue V1 = SV->getOperand(0);
+  DebugLoc dl = SV->getDebugLoc();
+  int NumElems = SrcVT.getVectorNumElements();
+
+  assert(SrcVT.is256BitVector() && "unknown how to handle vector type");
+
+  SmallVector<int, 8> Mask;
+  for (int i = 0; i < NumElems/2; ++i)
+    Mask.push_back(SV->getMaskElt(i));
+
+  EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
+                             NumElems/2);
+  SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1),
+                                     DAG.getUNDEF(SVT), &Mask[0]);
+  SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1,
+                                    DAG.getConstant(0, MVT::i32), DAG, dl);
+
+  return Insert128BitVector(InsV, SV1,
+                            DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+}
+
 /// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and
 /// v8i32, v16i16 or v32i8 to v8f32.
 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
@@ -5742,7 +5770,17 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
   if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
     return Op;
 
-  // Handle splats by matching through known masks
+  // Since there's no native support for scalar_to_vector for 256-bit AVX, a
+  // 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this
+  // idiom and do the shuffle before the insertion; this yields fewer
+  // instructions in the end.
+  if (VT.is256BitVector() &&
+      V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+      V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return PromoteVectorToScalarSplat(SVOp, DAG);
+
+  // Handle splats by matching through known shuffle masks
   if ((VT.is128BitVector() && NumElem <= 4) ||
       (VT.is256BitVector() && NumElem <= 8))
     return SDValue();
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 6d89bcc29e7..34d26761b79 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -501,6 +501,9 @@ class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
 class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern>
       : PDI<o, F, outs, ins, asm, pattern>, REX_W;
+class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern>
+      : VPDI<o, F, outs, ins, asm, pattern>, VEX_W;
 
 // MMX Instruction templates
 //
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index f792f53b7ba..4dd2d41446e 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -467,3 +467,4 @@ def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
                                       node:$index), [{
   return X86::isVINSERTF128Index(N);
 }], INSERT_get_vinsertf128_imm>;
+
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 24c734ab965..82e0d2a2ff6 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2858,6 +2858,14 @@ def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       [(set VR128:$dst,
                         (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                       VEX;
+def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                      "mov{d|q}\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v2i64 (scalar_to_vector GR64:$src)))]>, VEX;
+def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                      "mov{d|q}\t{$src, $dst|$dst, $src}",
+                      [(set FR64:$dst, (bitconvert GR64:$src))]>, VEX;
+
 def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
@@ -5358,6 +5366,20 @@ def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 
+// Special COPY patterns
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+
 //===----------------------------------------------------------------------===//
 // VEXTRACTF128 - Extract packed floating-point values
 //
diff --git a/test/CodeGen/X86/avx-256-splat.ll b/test/CodeGen/X86/avx-256-splat.ll
index 39230fe4fbf..edc17b72917 100644
--- a/test/CodeGen/X86/avx-256-splat.ll
+++ b/test/CodeGen/X86/avx-256-splat.ll
@@ -5,7 +5,6 @@
 ; CHECK: vextractf128 $0
 ; CHECK-NEXT: punpcklbw
 ; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: vinsertf128 $0
 ; CHECK-NEXT: vinsertf128 $1
 ; CHECK-NEXT: vpermilps $85
 define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
@@ -16,7 +15,6 @@ entry:
 
 ; CHECK: vextractf128 $0
 ; CHECK-NEXT: punpckhwd
-; CHECK-NEXT: vinsertf128 $0
 ; CHECK-NEXT: vinsertf128 $1
 ; CHECK-NEXT: vpermilps $85
 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
@@ -25,3 +23,25 @@ entry:
   ret <16 x i16> %shuffle
 }
+
+; CHECK: vmovd
+; CHECK-NEXT: movlhps
+; CHECK-NEXT: vinsertf128 $1
+define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
+entry:
+  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
+  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
+  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
+  ret <4 x i64> %vecinit6.i
+}
+
+; CHECK: vshufpd
+; CHECK-NEXT: vinsertf128 $1
+define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
+entry:
+  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
+  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
+  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
+  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
+  ret <4 x double> %vecinit6.i
+}
diff --git a/test/CodeGen/X86/avx-256.ll b/test/CodeGen/X86/avx-256.ll
index 20d31e73885..a6d1450c9c1 100644
--- a/test/CodeGen/X86/avx-256.ll
+++ b/test/CodeGen/X86/avx-256.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7 -mattr=avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
 @x = common global <8 x float> zeroinitializer, align 32
 @y = common global <4 x double> zeroinitializer, align 32
@@ -12,4 +12,3 @@ entry:
   store <4 x double> zeroinitializer, <4 x double>* @y, align 32
   ret void
 }
-
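For illustration only (not part of the commit), here is a minimal hand-written IR function in the style of the avx-256-splat.ll tests above that should exercise the new path: the insertelement into a 256-bit undef vector becomes a 128-bit scalar_to_vector wrapped in an INSERT_SUBVECTOR during lowering, and the zero-mask shufflevector is the splat that PromoteVectorToScalarSplat now rewrites. Per the commit message, this should lower to a single 128-bit shuffle followed by a vinsertf128, rather than a shuffle of a 256-bit vector built from two inserts. The function name and element type here are arbitrary; the v4i64/v4f64 cases are the ones pinned down by the funcC/funcD CHECK lines.

; A sketch, assuming -mcpu=corei7-avx -mattr=+avx as in the RUN lines above.
define <8 x float> @splat8f32(float %q) nounwind uwtable readnone ssp {
entry:
  ; scalar_to_vector of a 256-bit type: no native AVX support, so it is
  ; legalized to a 128-bit scalar_to_vector + INSERT_SUBVECTOR.
  %v = insertelement <8 x float> undef, float %q, i32 0
  ; Zero-mask shuffle: the splat idiom recognized by the new code.
  %splat = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %splat
}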