From 7024c7e949891e7ce778302ad96da11ab7187832 Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Tue, 23 Sep 2014 18:16:12 +0000
Subject: [PATCH] [x86] Teach the new shuffle lowering's blend functionality to
 use AVX2's VPBLENDD where appropriate even on 128-bit vectors.

According to Agner's tables, this instruction has significantly higher
throughput (can execute on any port) on Haswell chips, so we should
aggressively try to form it when available.

Sadly, this loses our delightful shuffle comments. I'll add those back
for VPBLENDD next.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218322 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 51 +++++++++++++++-------
 test/CodeGen/X86/vector-shuffle-256-v16.ll | 12 ++---
 test/CodeGen/X86/vector-shuffle-256-v32.ll | 12 ++---
 3 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a3fa78a701a..a15ad4e936c 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7243,6 +7243,7 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
 /// that the shuffle mask is in fact a blend.
 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
+                                         const X86Subtarget *Subtarget,
                                          SelectionDAG &DAG) {
   unsigned BlendMask = 0;
 
@@ -7264,9 +7265,27 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                        DAG.getConstant(BlendMask, MVT::i8));
 
-  case MVT::v8i16:
+  case MVT::v2i64:
   case MVT::v4i32:
-  case MVT::v2i64: {
+    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+    // that instruction.
+    if (Subtarget->hasAVX2()) {
+      int Scale = 8 / VT.getVectorNumElements();
+      BlendMask = 0;
+      for (int i = 0, Size = Mask.size(); i < Size; ++i)
+        if (Mask[i] >= Size)
+          for (int j = 0; j < Scale; ++j)
+            BlendMask |= 1u << (i * Scale + j);
+
+      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+      V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
+      V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
+      return DAG.getNode(ISD::BITCAST, DL, VT,
+                         DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+                                     DAG.getConstant(BlendMask, MVT::i8)));
+    }
+    // FALLTHROUGH
+  case MVT::v8i16: {
     // For integer shuffles we need to expand the mask and cast the inputs to
     // v8i16s prior to blending.
     int Scale = 8 / VT.getVectorNumElements();
@@ -7715,8 +7734,8 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Insertion;
 
   if (Subtarget->hasSSE41())
-    if (SDValue Blend =
-            lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, DAG))
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
+                                                  Subtarget, DAG))
       return Blend;
 
   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
@@ -7769,8 +7788,8 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Insertion;
 
   if (Subtarget->hasSSE41())
-    if (SDValue Blend =
-            lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, DAG))
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+                                                  Subtarget, DAG))
       return Blend;
 
   // Try to use rotation instructions if available.
@@ -7921,8 +7940,8 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return V;
 
   if (Subtarget->hasSSE41())
-    if (SDValue Blend =
-            lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+                                                  Subtarget, DAG))
       return Blend;
 
   // Check for whether we can use INSERTPS to perform the blend. We only use
@@ -8026,8 +8045,8 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return V;
 
   if (Subtarget->hasSSE41())
-    if (SDValue Blend =
-            lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, DAG))
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+                                                  Subtarget, DAG))
       return Blend;
 
   // Try to use rotation instructions if available.
@@ -8707,8 +8726,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return V;
 
   if (Subtarget->hasSSE41())
-    if (SDValue Blend =
-            lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+                                                  Subtarget, DAG))
       return Blend;
 
   // Try to use rotation instructions if available.
@@ -9291,8 +9310,8 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
     return Insertion;
 
-  if (SDValue Blend =
-          lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, DAG))
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
+                                                Subtarget, DAG))
     return Blend;
 
   // Check if the blend happens to exactly fit that of SHUFPD.
@@ -9372,8 +9391,8 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
     return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
 
-  if (SDValue Blend =
-          lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, DAG))
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
+                                                Subtarget, DAG))
     return Blend;
 
   // If the shuffle mask is repeated in each 128-bit lane, we have many more
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index c567ac1dc62..ea9289e3849 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -716,9 +716,9 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX2-NEXT:    vpshuflw {{.*}} # xmm3 = xmm3[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -745,10 +745,10 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1
 ; AVX2-NEXT:    vpshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX2-NEXT:    vpshuflw {{.*}} # xmm3 = xmm3[3,2,1,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT:    vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -778,11 +778,11 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX2-NEXT:    vpshufd {{.*}} # xmm3 = xmm3[0,1,0,1]
 ; AVX2-NEXT:    vpshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
 ; AVX2-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX2-NEXT:    vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 9851e4d786d..5dec202813f 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1138,11 +1138,11 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_08_10_10_12_12_14_14_
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
 ; AVX2-NEXT:    vpunpcklbw {{.*}} # xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX2-NEXT:    vpshuflw {{.*}} # xmm4 = xmm4[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm2, %xmm4, %xmm2
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklbw {{.*}} # xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -1173,10 +1173,10 @@ define <32 x i8> @shuffle_v32i8_38_38_36_36_34_34_32_32_14_14_12_12_10_10_08_08_
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
 ; AVX2-NEXT:    vmovdqa .LCPI50_1(%rip), %xmm5
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm2, %xmm4, %xmm2
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -1207,10 +1207,10 @@ define <32 x i8> @shuffle_v32i8_38_38_36_36_34_34_32_32_06_06_04_04_02_02_00_00_
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
 ; AVX2-NEXT:    vmovdqa .LCPI51_1(%rip), %xmm5
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm2, %xmm4, %xmm2
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX2-NEXT:    vpblendd $-16, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
-- 
2.34.1
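
For reference, the blend immediate built by the new AVX2 path can be reproduced with a small standalone sketch (plain C++, not LLVM code; the function name and the example mask are illustrative only). It mirrors the loop added to lowerVectorShuffleAsBlend, which keeps the Scale = 8 / NumElements convention of the surrounding integer-blend code: every shuffle-mask element that selects from the second input sets Scale consecutive bits in the immediate.

#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors the BlendMask loop in the patch: a mask entry < Size selects from
// the first input, an entry >= Size selects from the second input, and each
// second-input element sets Scale consecutive bits of the immediate.
static uint8_t blendImmediate(const std::vector<int> &Mask, int Scale) {
  uint8_t BlendMask = 0;
  for (int i = 0, Size = static_cast<int>(Mask.size()); i < Size; ++i)
    if (Mask[i] >= Size)
      for (int j = 0; j < Scale; ++j)
        BlendMask |= 1u << (i * Scale + j);
  return BlendMask;
}

int main() {
  // Hypothetical v4i32 blend mask: keep elements 0-1 of the first input and
  // take elements 2-3 from the second input.
  std::vector<int> Mask = {0, 1, 6, 7};
  uint8_t Imm = blendImmediate(Mask, /*Scale=*/8 / 4);
  std::printf("imm = 0x%02x (%d as a signed byte)\n", (unsigned)Imm,
              (int)(int8_t)Imm);
  return 0;
}

With this mask and Scale = 2 the sketch prints 0xf0, which lines up with the signed $-16 immediates in the updated vpblendd CHECK lines above.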