From 01b3858e66ad4ce29bc0f762afd039ad0fb128be Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Fri, 3 Oct 2014 11:25:58 +0000
Subject: [PATCH] [x86] Significantly improve the ability of the new vector
 shuffle lowering to match VZEXT_MOVL patterns.

I hadn't realized that these had sufficient pattern smarts in the
backend to lower zext-ing from the low element of a vector without it
being a scalar_to_vector node. They do, and this is how to match a
bunch of patterns for movq, movss, etc.

There is a weird propensity to end up using pshufd to place the element
afterward even though it means domain crossing (or rather, to use
xorps+movss to zext the element rather than movq), but that's an
orthogonal problem with VZEXT_MOVL that someone should probably look at.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218977 91177308-0d34-0410-b5e6-96231b3b80d8
---
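Notes: a minimal sketch of the lowering this enables, mirroring the updated
shuffle_v2i64_0z test below; the function name and the llc invocation are
assumptions for illustration, not taken from the test suite. Compiled with
llc -mtriple=x86_64-unknown-unknown -mattr=+sse2, this shuffle should now
lower to a single `movq %xmm0, %xmm0` (which zeroes the upper 64 bits as a
side effect) instead of the previous xorpd+shufpd sequence:

  define <2 x i64> @zero_high_lane(<2 x i64> %a) {
    ; Mask index 0 keeps the low element of %a; index 3 selects an element
    ; of the zero vector, so only the high 64 bits are cleared, matching
    ; the semantics of movq between XMM registers.
    %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
    ret <2 x i64> %shuffle
  }
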
 lib/Target/X86/X86ISelLowering.cpp        |  56 ++++---
 test/CodeGen/X86/vector-shuffle-128-v2.ll |  81 ++------
 test/CodeGen/X86/vector-shuffle-128-v4.ll | 192 +++++-----
 test/CodeGen/X86/vector-shuffle-256-v4.ll |  15 +-
 test/CodeGen/X86/vector-shuffle-sse1.ll   |   4 +-
 5 files changed, 100 insertions(+), 248 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index bbb06aea233..fdced3555fb 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7801,28 +7801,32 @@ static SDValue lowerVectorShuffleAsElementInsertion(
     return SDValue(); // Not inserting into a zero vector.
   }
 
+  MVT ExtVT = VT;
+  MVT EltVT = VT.getVectorElementType();
+
   // Check for a single input from a SCALAR_TO_VECTOR node.
   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
   // all the smarts here sunk into that routine. However, the current
   // lowering of BUILD_VECTOR makes that nearly impossible until the old
   // vector shuffle lowering is dead.
-  SDValue V2S =
-      getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG);
-  if (!V2S)
+  if (SDValue V2S = getScalarValueForVectorElement(
+          V2, Mask[V2Index] - Mask.size(), DAG)) {
+    // We need to zext the scalar if it is smaller than an i32.
+    V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
+    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+      // Zero-extend directly to i32.
+      ExtVT = MVT::v4i32;
+      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+    }
+    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
+  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
+             EltVT == MVT::i16) {
+    // Either not inserting from the low element of the input or the input
+    // element size is too small to use VZEXT_MOVL to clear the high bits.
     return SDValue();
-
-  // First, we need to zext the scalar if it is smaller than an i32.
-  MVT ExtVT = VT;
-  MVT EltVT = VT.getVectorElementType();
-  V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
-  if (EltVT == MVT::i8 || EltVT == MVT::i16) {
-    // Zero-extend directly to i32.
-    ExtVT = MVT::v4i32;
-    V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
   }
 
-  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT,
-                   DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S));
+  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
   if (ExtVT != VT)
     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
 
@@ -7998,12 +8002,6 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                        getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
   }
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
-
   // If we have a single input from V2 insert that into V1 if we can do so
   // cheaply.
   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1)
@@ -8011,6 +8009,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
             MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
       return Insertion;
 
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 2))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
+  if (isShuffleEquivalent(Mask, 1, 3))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+
   if (Subtarget->hasSSE41())
     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2,
                                                   Mask, Subtarget, DAG))
@@ -8275,18 +8279,18 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
   }
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
-  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
-
   // There are special ways we can lower some single-element blends.
   if (NumV2Elements == 1)
     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
                                                          Mask, Subtarget, DAG))
       return V;
 
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
+  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
+
   if (Subtarget->hasSSE41())
     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2,
                                                   Mask, Subtarget, DAG))
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 1fc1b287ef7..e15773b067a 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -652,41 +652,15 @@ define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
 }
 
 define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) {
-; SSE2-LABEL: shuffle_v2i64_0z:
-; SSE2: # BB#0:
-; SSE2-NEXT: xorpd %xmm1, %xmm1
-; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: shuffle_v2i64_0z:
-; SSE3: # BB#0:
-; SSE3-NEXT: xorpd %xmm1, %xmm1
-; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v2i64_0z:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: xorpd %xmm1, %xmm1
-; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v2i64_0z:
-; SSE41: # BB#0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuffle_v2i64_0z:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_0z:
+; SSE: # BB#0:
+; SSE-NEXT: movq %xmm0, %xmm0
+; SSE-NEXT: retq
 ;
-; AVX2-LABEL: shuffle_v2i64_0z:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v2i64_0z:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %xmm0, %xmm0
+; AVX-NEXT: retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %shuffle
 }
@@ -710,15 +684,14 @@ define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) {
 
 define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) {
 ; SSE-LABEL: shuffle_v2i64_z0:
 ; SSE: # BB#0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movq %xmm0, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v2i64_z0:
 ; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vmovq %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX-NEXT: retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 0>
   ret <2 x i64> %shuffle
@@ -769,34 +742,14 @@ define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
 }
 
 define <2 x double> @shuffle_v2f64_0z(<2 x double> %a) {
-; SSE2-LABEL: shuffle_v2f64_0z:
-; SSE2: # BB#0:
-; SSE2-NEXT: xorpd %xmm1, %xmm1
-; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: shuffle_v2f64_0z:
-; SSE3: # BB#0:
-; SSE3-NEXT: xorpd %xmm1, %xmm1
-; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v2f64_0z:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: xorpd %xmm1, %xmm1
-; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v2f64_0z:
-; SSE41: # BB#0:
-; SSE41-NEXT: xorpd %xmm1, %xmm1
-; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v2f64_0z:
+; SSE: # BB#0:
+; SSE-NEXT: movq %xmm0, %xmm0
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v2f64_0z:
 ; AVX: # BB#0:
-; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, %xmm0
 ; AVX-NEXT: retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
   ret <2 x double> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 3e278d23f8a..f6ba5db85f3 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -438,38 +438,17 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
-; SSE2-LABEL: shuffle_v4f32_4zzz:
-; SSE2: # BB#0:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: shuffle_v4f32_4zzz:
-; SSE3: # BB#0:
-; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0]
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v4f32_4zzz:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v4f32_4zzz:
-; SSE41: # BB#0:
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v4f32_4zzz:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4f32_4zzz:
 ; AVX: # BB#0:
 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
   %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x float> %shuffle
@@ -660,152 +639,71 @@ define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
 }
 
 define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
-; SSE2-LABEL: shuffle_v4i32_4zzz:
-; SSE2: # BB#0:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: shuffle_v4i32_4zzz:
-; SSE3: # BB#0:
-; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0]
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v4i32_4zzz:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v4i32_4zzz:
-; SSE41: # BB#0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuffle_v4i32_4zzz:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_4zzz:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
 ;
-; AVX2-LABEL: shuffle_v4i32_4zzz:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v4i32_4zzz:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x i32> %shuffle
 }
 
 define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
-; SSE2-LABEL: shuffle_v4i32_z4zz:
-; SSE2: # BB#0:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: shuffle_v4i32_z4zz:
-; SSE3: # BB#0:
-; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v4i32_z4zz:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v4i32_z4zz:
-; SSE41: # BB#0:
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_z4zz:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movss %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4i32_z4zz:
 ; AVX: # BB#0:
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX-NEXT: retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
   ret <4 x i32> %shuffle
 }
 
 define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
-; SSE2-LABEL: shuffle_v4i32_zz4z:
-; SSE2: # BB#0:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: shuffle_v4i32_zz4z:
-; SSE3: # BB#0:
-; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
-; SSE3-NEXT: movaps %xmm1, %xmm0
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v4i32_zz4z:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
-; SSSE3-NEXT: movaps %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v4i32_zz4z:
-; SSE41: # BB#0:
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_zz4z:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movss %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4i32_zz4z:
 ; AVX: # BB#0:
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; AVX-NEXT: retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
   ret <4 x i32> %shuffle
 }
 
 define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
-; SSE2-LABEL: shuffle_v4i32_zuu4:
-; SSE2: # BB#0:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: shuffle_v4i32_zuu4:
-; SSE3: # BB#0:
-; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
-; SSE3-NEXT: movaps %xmm1, %xmm0
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v4i32_zuu4:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
-; SSSE3-NEXT: movaps %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v4i32_zuu4:
-; SSE41: # BB#0:
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_zuu4:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movss %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4i32_zuu4:
 ; AVX: # BB#0:
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,0]
 ; AVX-NEXT: retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
   ret <4 x i32> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 5c5bc21a48f..10a27f44320 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -678,8 +678,8 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
 ; AVX1-LABEL: insert_reg_and_zero_v4i64:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vmovq %rdi, %xmm0
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: insert_reg_and_zero_v4i64:
@@ -697,8 +697,8 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 ; AVX1-LABEL: insert_mem_and_zero_v4i64:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vmovq (%rdi), %xmm0
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: insert_mem_and_zero_v4i64:
@@ -716,9 +716,8 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL: # BB#0:
-; ALL-NEXT: # kill: XMM0<def> XMM0<kill> YMM0<def>
-; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm0
 ; ALL-NEXT: retq
   %v = insertelement <4 x double> undef, double %a, i32 0
   %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -729,8 +728,6 @@ define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
 ; ALL-LABEL: insert_mem_and_zero_v4f64:
 ; ALL: # BB#0:
 ; ALL-NEXT: vmovsd (%rdi), %xmm0
-; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; ALL-NEXT: retq
   %a = load double* %ptr
   %v = insertelement <4 x double> undef, double %a, i32 0
diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll
index 43daf85b875..226deb0a3f2 100644
--- a/test/CodeGen/X86/vector-shuffle-sse1.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse1.ll
@@ -95,8 +95,8 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
 ; SSE1-LABEL: shuffle_v4f32_4zzz:
 ; SSE1: # BB#0:
 ; SSE1-NEXT: xorps %xmm1, %xmm1
-; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0]
-; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE1-NEXT: movss %xmm0, %xmm1
+; SSE1-NEXT: movaps %xmm1, %xmm0
 ; SSE1-NEXT: retq
   %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x float> %shuffle
-- 
2.34.1