From 3d1161e9ae2aedc45f40cc1b7be1db3df2094903 Mon Sep 17 00:00:00 2001
From: Chad Rosier
Date: Tue, 3 Jan 2012 21:05:52 +0000
Subject: [PATCH] Enhance DAGCombine for transforming 128->256 casts into a
 vmovaps, rather than a vxorps + vinsertf128 pair if the original vector came
 from a load.

rdar://10594409

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@147481 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 14 ++++++++++++++
 lib/Target/X86/X86InstrSSE.td      |  5 +++++
 test/CodeGen/X86/avx-shuffle.ll    | 23 +++++++++++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 08c09bd5cb1..47b80d036e5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12731,6 +12731,20 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
         !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
       return SDValue();
 
+  // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
+    SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+    SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+    SDValue ResNode =
+      DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+                              Ld->getMemoryVT(),
+                              Ld->getPointerInfo(),
+                              Ld->getAlignment(),
+                              false/*isVolatile*/, true/*ReadMem*/,
+                              false/*WriteMem*/);
+    return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+  }
+
   // Emit a zeroed vector and insert the desired subvector on its
   // first half.
   SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 4becf99bfa2..49776c6bed6 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -4719,6 +4719,11 @@ let Predicates = [HasAVX], AddedComplexity = 20 in {
             (VMOVZQI2PQIrm addr:$src)>;
 }
 
+let Predicates = [HasAVX] in {
+def : Pat<(v4i64 (X86vzload addr:$src)),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
+}
+
 //===---------------------------------------------------------------------===//
 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
 // IA32 document. movq xmm1, xmm2 does clear the high bits.
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 8532b40613b..ad611fc1b53 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -31,4 +31,27 @@ define <8 x float> @test4(float %a) nounwind {
   ret <8 x float> %b
 ; CHECK: test4:
 ; CHECK: vinsertf128
+}
+
+; rdar://10594409
+define <8 x float> @test5(float* nocapture %f) nounwind uwtable readonly ssp {
+entry:
+  %0 = bitcast float* %f to <4 x float>*
+  %1 = load <4 x float>* %0, align 16
+; CHECK: vmovaps
+; CHECK-NOT: vxorps
+; CHECK-NOT: vinsertf128
+  %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x float> %shuffle.i
+}
+
+define <4 x double> @test6(double* nocapture %d) nounwind uwtable readonly ssp {
+entry:
+  %0 = bitcast double* %d to <2 x double>*
+  %1 = load <2 x double>* %0, align 16
+; CHECK: vmovaps
+; CHECK-NOT: vxorps
+; CHECK-NOT: vinsertf128
+  %shuffle.i = shufflevector <2 x double> %1, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  ret <4 x double> %shuffle.i
 }
\ No newline at end of file
-- 
2.34.1
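
For context, a rough sketch of the codegen difference the new combine is aiming
for on a case like test5 above. The register choices and exact operand order
are illustrative assumptions, not output from an actual build:

    ; before: zero a ymm register, then insert the 128-bit load into its low half
    vxorps      %xmm0, %xmm0, %xmm0
    vmovaps     (%rdi), %xmm1
    vinsertf128 $0, %xmm1, %ymm0, %ymm0

    ; after: a single 128-bit vmovaps; a VEX-encoded write to an xmm register
    ; already zeroes bits 255:128 of the corresponding ymm register
    vmovaps     (%rdi), %xmm0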