From: Chandler Carruth
Date: Wed, 1 Oct 2014 00:41:21 +0000 (+0000)
Subject: [x86] Teach the new vector shuffle lowering about VBROADCAST and
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=afe75172b1080515874ca920a0f50e740fce10dd;p=oota-llvm.git

[x86] Teach the new vector shuffle lowering about VBROADCAST and
VPBROADCAST.

This has the somewhat expected pervasive impact. I don't know why I
forgot about this. Everything seems good with lots of significant
improvements in the tests.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218724 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e50ca6d08f1..394a6221056 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7824,6 +7824,57 @@ static SDValue lowerVectorShuffleAsElementInsertion(
   return V2;
 }
 
+/// \brief Try to lower broadcast of a single element.
+///
+/// For convenience, this code also bundles all of the subtarget feature set
+/// filtering. While a little annoying to re-dispatch on type here, there isn't
+/// a convenient way to factor it out.
+static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+                                             ArrayRef<int> Mask,
+                                             const X86Subtarget *Subtarget,
+                                             SelectionDAG &DAG) {
+  if (!Subtarget->hasAVX())
+    return SDValue();
+  if (VT.isInteger() && !Subtarget->hasAVX2())
+    return SDValue();
+
+  // Check that the mask is a broadcast.
+  int BroadcastIdx = -1;
+  for (int M : Mask)
+    if (M >= 0 && BroadcastIdx == -1)
+      BroadcastIdx = M;
+    else if (M >= 0 && M != BroadcastIdx)
+      return SDValue();
+
+  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
+                                            "a sorted mask where the broadcast "
+                                            "comes from V1.");
+
+  // Check if this is a broadcast of a scalar load -- those are more widely
+  // supported than broadcasting in-register values.
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+    SDValue BroadcastV = V.getOperand(BroadcastIdx);
+    if (ISD::isNON_EXTLoad(BroadcastV.getNode())) {
+      // We can directly broadcast from memory.
+      return DAG.getNode(X86ISD::VBROADCAST, DL, VT, BroadcastV);
+    }
+  }
+
+  // We can't broadcast from a register w/o AVX2.
+  if (!Subtarget->hasAVX2())
+    return SDValue();
+
+  // Check if this is a broadcast of a BUILD_VECTOR which we can always handle,
+  // or is a broadcast of the zero element.
+  if (V.getOpcode() == ISD::BUILD_VECTOR)
+    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, V.getOperand(BroadcastIdx));
+  else if (BroadcastIdx != 0)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -7900,6 +7951,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
 
   if (isSingleInputShuffleMask(Mask)) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
     // We have to map the mask as it is actually a v4i32 shuffle instruction.
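For orientation, the shuffles the new lowerVectorShuffleAsBroadcast path accepts are single-input splats: masks where every non-negative element selects the same element of V1. The IR below is an illustrative sketch rather than one of the committed tests (the function names are made up for the example). The first function is an in-register splat, which only becomes a broadcast with AVX2; the second splats a loaded scalar, where the memory form of VBROADCAST is usable with plain AVX.

define <4 x float> @splat_reg_v4f32(<4 x float> %a) {
  ; Every mask element selects element 0, so BroadcastIdx == 0 and, with AVX2,
  ; the whole shuffle becomes a single X86ISD::VBROADCAST (vbroadcastss).
  %s = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x float> %s
}

define <4 x double> @splat_load_v4f64(double* %ptr) {
  ; The broadcast element is a scalar load feeding the splatted build vector,
  ; so the broadcast can be done straight from memory (vbroadcastsd (%rdi))
  ; even without AVX2.
  %x = load double* %ptr
  %v = insertelement <4 x double> undef, double %x, i32 0
  %s = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x double> %s
}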
@@ -8057,6 +8113,11 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
 
   if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     if (Subtarget->hasAVX()) {
       // If we have AVX, we can use VPERMILPS which will allow folding a load
       // into the shuffle.
@@ -8157,6 +8218,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
 
   if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
     // We coerce the shuffle pattern to be compatible with UNPCK instructions
@@ -8253,6 +8319,11 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
@@ -9036,6 +9107,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // For single-input shuffles, there are some nicer lowering tricks we can use.
   if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
     // Notably, this handles splat and partial-splat shuffles more efficiently.
     // However, it only makes sense if the pre-duplication shuffle simplifies
@@ -9455,6 +9531,11 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
 
   if (isSingleInputShuffleMask(Mask)) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
       // Non-half-crossing single input shuffles can be lowerid with an
       // interleaved permutation.
@@ -9538,6 +9619,11 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Subtarget, DAG))
     return Blend;
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
   // use lower latency instructions that will operate on both 128-bit lanes.
   SmallVector<int, 2> RepeatedMask;
@@ -9592,6 +9678,11 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Subtarget, DAG))
     return Blend;
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   // options to efficiently lower the shuffle.
   SmallVector<int, 4> RepeatedMask;
@@ -9665,6 +9756,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Subtarget, DAG))
     return Blend;
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // If the shuffle mask is repeated in each 128-bit lane we can use more
   // efficient instructions that mirror the shuffles across the two 128-bit
   // lanes.
@@ -9714,6 +9810,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // There are no generalized cross-lane shuffle operations available on i16
   // element types.
   if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
@@ -9779,6 +9880,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // There are no generalized cross-lane shuffle operations available on i8
   // element types.
   if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 3fd576ea25a..486beddb4a9 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -8711,6 +8711,27 @@ let Predicates = [HasAVX2] in {
   def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
             (VBROADCASTSDYrr VR128:$src)>;
 
+  // Provide aliases for broadcast from the same register class that
+  // automatically does the extract.
+  def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
+            (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src),
+                                                     sub_xmm)))>;
+  def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))),
+            (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src),
+                                                     sub_xmm)))>;
+  def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))),
+            (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src),
+                                                     sub_xmm)))>;
+  def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))),
+            (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src),
+                                                     sub_xmm)))>;
+  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
+            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
+                                                     sub_xmm)))>;
+  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
+            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
+                                                     sub_xmm)))>;
+
   // Provide fallback in case the load node that is used in the patterns above
   // is used by additional users, which prevents the pattern selection.
let AddedComplexity = 20 in { diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index 306ef8a220f..d7373602431 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -36,11 +36,16 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00( ; SSE41-NEXT: pshufb %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -200,10 +205,15 @@ define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) { ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_0101010101010101: -; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_0101010101010101: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_0101010101010101: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -231,12 +241,18 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07( ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: -; AVX: # BB#0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; AVX1: # BB#0: +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll index 6f891c12505..67723dd07b2 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -14,10 +14,15 @@ define <2 x i64> @shuffle_v2i64_00(<2 x i64> %a, <2 x i64> %b) { ; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_00: -; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -53,10 +58,15 @@ define <2 x i64> @shuffle_v2i64_22(<2 x i64> %a, <2 x i64> %b) { ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_22: -; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_22: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_22: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index d0c4a3f9d36..7d6e2bb1124 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -77,10 +77,15 @@ define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_00000000: -; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_00000000: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i16_00000000: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1169,12 +1174,19 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_XXXXcde3: -; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_XXXXcde3: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i16_XXXXcde3: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15] +; AVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1205,12 +1217,19 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_cde3XXXX: -; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_cde3XXXX: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i16_cde3XXXX: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; AVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1254,16 +1273,27 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_012dcde3: -; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_012dcde3: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i16_012dcde3: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index 0cb73521503..cc449899110 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -12,8 +12,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -31,10 
+30,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0 ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4] +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -51,7 +48,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0 ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -69,7 +66,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0 ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -87,7 +84,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0 ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -105,7 +102,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0 ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -123,7 +120,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0 ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -141,7 +138,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0 ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -165,7 +162,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0 ; AVX2: # BB#0: ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 
= ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -189,7 +186,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0 ; AVX2: # BB#0: ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,u,u,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -214,7 +211,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0 ; AVX2: # BB#0: ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,4,5,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,u,u,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -239,7 +236,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0 ; AVX2: # BB#0: ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,6,7,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,u,u,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -264,7 +261,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX2: # BB#0: ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,u,u,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -289,7 +286,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX2: # BB#0: ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,10,11,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,u,u,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm2 = [0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -315,7 +312,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX2: # BB#0: ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -341,7 +338,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX2: # BB#0: ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -735,8 +732,8 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_1 ; ; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,4,5,0,1,0,1,0,1,12,13,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index fab36b2b273..687928f0a7c 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -13,9 +13,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -32,8 +30,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -52,8 +49,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; 
AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -72,8 +68,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -92,8 +87,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -112,8 +106,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -132,8 +125,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -152,8 +144,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -172,8 +163,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -192,8 +182,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; 
AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -212,8 +201,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -232,8 +220,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -252,8 +239,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -272,8 +258,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -292,8 +277,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -317,8 +301,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX2-NEXT: movl $15, %eax ; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -339,12 +322,12 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, 
%ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -364,12 +347,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -389,12 +371,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,2,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -414,12 +395,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,3,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,3,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -439,12 +419,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,4,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,4,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -464,12 +443,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,5,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,5,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -489,12 +467,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,6,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,6,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, 
%ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -514,12 +491,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,7,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -539,12 +515,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,8,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,8,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -564,12 +539,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,9,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,9,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -589,12 +563,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: 
shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,10,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,10,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -614,12 +587,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,11,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,11,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -639,12 +611,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,12,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,12,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -664,12 +635,11 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,13,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,13,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
 ret <32 x i8> %shuffle
@@ -689,12 +659,11 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
 ret <32 x i8> %shuffle
@@ -722,7 +691,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
 ; AVX2-NEXT: vinserti128 $0, %xmm2, %ymm3, %ymm2
 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
@@ -1037,9 +1006,10 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 74662ac82e4..5c5bc21a48f 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -12,7 +12,7 @@ define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
 ;
 ; AVX2-LABEL: shuffle_v4f64_0000:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
 ; AVX2-NEXT: retq
 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
 ret <4 x double> %shuffle
@@ -184,11 +184,17 @@ define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
 }
 define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
-; ALL-LABEL: shuffle_v4f64_0423:
-; ALL: # BB#0:
-; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
-; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v4f64_0423:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_0423:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX2-NEXT: retq
 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
 ret <4 x double> %shuffle
 }
@@ -317,7 +323,7 @@ define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX2-LABEL: shuffle_v4i64_0000:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
 ; AVX2-NEXT: retq
 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32>
 ret <4 x i64> %shuffle
@@ -460,7 +466,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX2-LABEL: shuffle_v4i64_0124:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
+; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT: retq
 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32>
@@ -477,8 +483,8 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX2-LABEL: shuffle_v4i64_0142:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT: retq
 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32>
@@ -497,8 +503,8 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX2-LABEL: shuffle_v4i64_0412:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT: retq
 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32>
@@ -731,3 +737,31 @@ define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
 %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32>
 ret <4 x double> %shuffle
 }
+
+define <4 x double> @splat_mem_v4f64(double* %ptr) {
+; ALL-LABEL: splat_mem_v4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT: retq
+ %a = load double* %ptr
+ %v = insertelement <4 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32>
+ ret <4 x double> %shuffle
+}
+
+define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
+; AVX1-LABEL: splat_mem_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup (%rdi), %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat_mem_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT: retq
+ %a = load i64* %ptr
+ %v = insertelement <4 x i64> undef, i64 %a, i64 0
+ %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32>
+ ret <4 x i64> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index b9a2de0298b..f11554cc2fc 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -12,8 +12,7 @@ define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
 ;
 ; AVX2-LABEL: shuffle_v8f32_00000000:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
 ; AVX2-NEXT: retq
 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32>
 ret <8 x float> %shuffle
@@ -210,9 +209,8 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
 ;
 ; AVX2-LABEL: shuffle_v8f32_08080808:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vxorps %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vbroadcastss %xmm1, %ymm1
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT: retq
 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32>
@@ -702,8 +700,7 @@ define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
 ;
 ; AVX2-LABEL: shuffle_v8i32_00000000:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
 ; AVX2-NEXT: retq
 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>
 ret <8 x i32> %shuffle
@@ -910,9 +907,8 @@ define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
 ;
 ; AVX2-LABEL: shuffle_v8i32_08080808:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT: retq
 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 09dde3132dd..2f02f2fc08f 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-unknown"
 define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00000000:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
@@ -16,7 +16,7 @@ define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00000010:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
 ; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
@@ -27,7 +27,7 @@ define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00000200:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
 ; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
@@ -38,7 +38,7 @@ define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00003000:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
 ; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
@@ -50,10 +50,9 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00040000:
 ; ALL: # BB#0:
 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,0,3]
-; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3]
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
@@ -65,9 +64,8 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
 ; ALL: # BB#0:
 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
 ; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,0]
-; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3]
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
@@ -79,9 +77,8 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
 ; ALL: # BB#0:
 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
 ; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,0,0]
-; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3]
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
@@ -93,7 +90,7 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
 ; ALL: # BB#0:
 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
 ; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
 ; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
@@ -127,7 +124,7 @@ define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00001111:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
 ; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
@@ -151,8 +148,8 @@ define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_08080808:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,0]
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
 ; ALL-NEXT: retq
@@ -164,12 +161,12 @@ define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_08084c4c:
 ; ALL: # BB#0:
 ; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,0]
+; ALL-NEXT: vbroadcastsd %xmm2, %ymm2
 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
 ; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
-; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,0]
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
 ; ALL-NEXT: retq
@@ -182,9 +179,9 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
 ; ALL: # BB#0:
 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
 ; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
 ; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
-; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
 ; ALL-NEXT: retq
@@ -513,7 +510,7 @@ define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
 ; ALL: # BB#0:
 ; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
@@ -638,7 +635,7 @@ define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00uu66uu:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
 ; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
@@ -694,16 +691,16 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_c348cda0:
 ; ALL: # BB#0:
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,3,2,3]
-; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,3,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3]
 ; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,2,0]
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm4
 ; ALL-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3]
 ; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3]
 ; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3]
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
 ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
 ; ALL-NEXT: retq
@@ -734,7 +731,7 @@ define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
 define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_00000000:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
@@ -744,7 +741,7 @@ define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_00000010:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
 ; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
@@ -755,7 +752,7 @@ define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_00000200:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
 ; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
@@ -766,7 +763,7 @@ define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_00003000:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
 ; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
@@ -778,10 +775,9 @@ define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_00040000:
 ; ALL: # BB#0:
 ; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,3]
-; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
@@ -793,9 +789,8 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
 ; ALL: # BB#0:
 ; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,0]
-; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
-; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
@@ -807,9 +802,8 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
 ; ALL: # BB#0:
 ; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,0]
-; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
@@ -821,7 +815,7 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
 ; ALL: # BB#0:
 ; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3]
-; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
 ; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
@@ -855,7 +849,7 @@ define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_00001111:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
 ; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
@@ -879,8 +873,8 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_08080808:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,0]
-; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
 ; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; ALL-NEXT: retq
@@ -892,12 +886,12 @@ define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_08084c4c:
 ; ALL: # BB#0:
 ; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,0]
+; ALL-NEXT: vpbroadcastq %xmm2, %ymm2
 ; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
 ; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
-; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,0]
-; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
 ; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; ALL-NEXT: retq
@@ -910,9 +904,9 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
 ; ALL: # BB#0:
 ; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
 ; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
 ; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
 ; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; ALL-NEXT: retq
@@ -1241,7 +1235,7 @@ define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
 ; ALL: # BB#0:
 ; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
 ; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
@@ -1366,7 +1360,7 @@ define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_00uu66uu:
 ; ALL: # BB#0:
-; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
 ; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
 ; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1427,8 +1421,8 @@ define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
 ; ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,3,2,1]
 ; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
-; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
 ; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vpbroadcastq %xmm2, %ymm2
 ; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
 ; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
 ; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]