From 04402a6c139cafaf01eeb20bae5f4f9ec6d0fd49 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Sep 2014 10:35:41 +0000 Subject: [PATCH] [x86] Undo a flawed transform I added to form UNPCK instructions when AVX is available, and generally tidy up things surrounding UNPCK formation. Originally, I was thinking that the only advantage of PSHUFD over UNPCK instruction variants was its free copy, and otherwise we should use the shorter encoding UNPCK instructions. This isn't right though, there is a larger advantage of being able to fold a load into the operand of a PSHUFD. For UNPCK, the operand *must* be in a register so it can be the second input. This removes the UNPCK formation in the target-specific DAG combine for v4i32 shuffles. It also lifts the v8 and v16 cases out of the AVX-specific check as they are potentially replacing multiple instructions with a single instruction and so should always be valuable. The floating point checks are simplified accordingly. This also adjusts the formation of PSHUFD instructions to attempt to match the shuffle mask to one which would fit an UNPCK instruction variant. This was originally motivated to allow it to match the UNPCK instructions in the combiner, but clearly won't now. Eventually, we should add a MachineCombiner pass that can form UNPCK instructions post-RA when the operand is known to be in a register and thus there is no loss. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217755 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 158 +++++++++++----------- test/CodeGen/X86/avx-basic.ll | 6 +- test/CodeGen/X86/avx-sext.ll | 2 +- test/CodeGen/X86/avx-splat.ll | 2 +- test/CodeGen/X86/exedepsfix-broadcast.ll | 4 +- test/CodeGen/X86/vector-shuffle-128-v8.ll | 2 +- test/CodeGen/X86/vector-shuffle-256-v4.ll | 29 ++-- 7 files changed, 101 insertions(+), 102 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index cbaca286421..a4e94551020 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7598,11 +7598,22 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); - if (NumV2Elements == 0) + if (NumV2Elements == 0) { // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. + // We coerce the shuffle pattern to be compatible with UNPCK instructions + // but we aren't actually going to use the UNPCK instruction because doing + // so prevents folding a load into this instruction or making a copy. + const int UnpackLoMask[] = {0, 0, 1, 1}; + const int UnpackHiMask[] = {2, 2, 3, 3}; + if (isShuffleEquivalent(Mask, 0, 0, 1, 1)) + Mask = UnpackLoMask; + else if (isShuffleEquivalent(Mask, 2, 2, 3, 3)) + Mask = UnpackHiMask; + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); + } // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) @@ -19347,86 +19358,75 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, bool FloatDomain = VT.isFloatingPoint(); // For floating point shuffles, we don't have free copies in the shuffle - // instructions, so this always makes sense to canonicalize. 
+ // instructions or the ability to load as part of the instruction, so + // canonicalize their shuffles to UNPCK or MOV variants. // - // For integer shuffles, if we don't have access to VEX encodings, the generic - // PSHUF instructions are preferable to some of the specialized forms despite - // requiring one more byte to encode because they can implicitly copy. - // - // IF we *do* have VEX encodings, then we can use shorter, more specific - // shuffle instructions freely as they can copy due to the extra register - // operand. - if (FloatDomain || Subtarget->hasAVX()) { - // We have both floating point and integer variants of shuffles that dup - // either the low or high half of the vector. - if (Mask.equals(0, 0) || Mask.equals(1, 1)) { - bool Lo = Mask.equals(0, 0); - unsigned Shuffle; - MVT ShuffleVT; - // If the input is a floating point, check if we have SSE3 which will let - // us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the - // option to fold the input operand into even an unaligned memory load. - if (FloatDomain && Lo && Subtarget->hasSSE3()) { - Shuffle = X86ISD::MOVDDUP; - ShuffleVT = MVT::v2f64; - } else if (FloatDomain) { - // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller - // than the UNPCK variants. - Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; - ShuffleVT = MVT::v4f32; - } else if (Subtarget->hasSSE2()) { - // We model everything else using UNPCK instructions. While MOVLHPS and - // MOVHLPS are shorter encodings they cannot accept a memory operand - // which overly constrains subsequent lowering. - Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; - ShuffleVT = MVT::v2i64; - } else { - // No available instructions here. - return false; - } - if (Depth == 1 && Root->getOpcode() == Shuffle) - return false; // Nothing to do! - Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); - DCI.AddToWorklist(Op.getNode()); - if (Shuffle == X86ISD::MOVDDUP) - Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); - else - Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); - DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), - /*AddTo*/ true); - return true; - } - - // FIXME: We should match UNPCKLPS and UNPCKHPS here. - - // For the integer domain we have specialized instructions for duplicating - // any element size from the low or high half. - if (!FloatDomain && - (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3) || - Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) || - Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) || - Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) || - Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, - 15))) { - bool Lo = Mask[0] == 0; - unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; - if (Depth == 1 && Root->getOpcode() == Shuffle) - return false; // Nothing to do! - MVT ShuffleVT; - switch (Mask.size()) { - case 4: ShuffleVT = MVT::v4i32; break; - case 8: ShuffleVT = MVT::v8i16; break; - case 16: ShuffleVT = MVT::v16i8; break; - }; - Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); - DCI.AddToWorklist(Op.getNode()); + // Note that even with AVX we prefer the PSHUFD form of shuffle for integer + // vectors because it can have a load folded into it that UNPCK cannot. This + // doesn't preclude something switching to the shorter encoding post-RA. + if (FloatDomain && (Mask.equals(0, 0) || Mask.equals(1, 1))) { + bool Lo = Mask.equals(0, 0); + unsigned Shuffle; + MVT ShuffleVT; + // Check if we have SSE3 which will let us use MOVDDUP. 
That instruction + // is no slower than UNPCKLPD but has the option to fold the input operand + // into even an unaligned memory load. + if (Lo && Subtarget->hasSSE3()) { + Shuffle = X86ISD::MOVDDUP; + ShuffleVT = MVT::v2f64; + } else { + // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller + // than the UNPCK variants. + Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; + ShuffleVT = MVT::v4f32; + } + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + if (Shuffle == X86ISD::MOVDDUP) + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + else Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); - DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), - /*AddTo*/ true); - return true; - } + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + + // FIXME: We should match UNPCKLPS and UNPCKHPS here. + + // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK + // variants as none of these have single-instruction variants that are + // superior to the UNPCK formulation. + if (!FloatDomain && + (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) || + Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) || + Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) || + Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, + 15))) { + bool Lo = Mask[0] == 0; + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + MVT ShuffleVT; + switch (Mask.size()) { + case 8: + ShuffleVT = MVT::v8i16; + break; + case 16: + ShuffleVT = MVT::v16i8; + break; + default: + llvm_unreachable("Impossible mask size!"); + }; + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; } // Don't try to re-form single instruction chains under any circumstances now diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll index ca540226ee7..a8dae82a8be 100644 --- a/test/CodeGen/X86/avx-basic.ll +++ b/test/CodeGen/X86/avx-basic.ll @@ -72,9 +72,9 @@ entry: ret <4 x i64> %shuffle } -; CHECK: vpunpcklqdq +; CHECK: vmovlhps ; CHECK-NEXT: vextractf128 $1 -; CHECK-NEXT: vpunpcklqdq +; CHECK-NEXT: vmovlhps ; CHECK-NEXT: vinsertf128 $1 define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { entry: @@ -83,7 +83,7 @@ entry: } ; CHECK: vpshufd $-96 -; CHECK: vpunpckhdq +; CHECK: vpshufd $-6 ; CHECK: vinsertf128 $1 define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { entry: diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll index 9bcf06f7b32..fb2287f5289 100644 --- a/test/CodeGen/X86/avx-sext.ll +++ b/test/CodeGen/X86/avx-sext.ll @@ -156,7 +156,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; AVX-LABEL: sext_16i8_to_16i16 ; AVX: vpmovsxbw -; AVX: vpunpckhqdq +; AVX: vmovhlps ; AVX: vpmovsxbw ; AVX: ret define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) { diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll index a2537ce5c04..058db314d28 100644 --- a/test/CodeGen/X86/avx-splat.ll +++ b/test/CodeGen/X86/avx-splat.ll @@ 
-19,7 +19,7 @@ entry: } ; CHECK: vmovq -; CHECK-NEXT: vpunpcklqdq %xmm +; CHECK-NEXT: vmovlhps %xmm ; CHECK-NEXT: vinsertf128 $1 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp { entry: diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll index f4539c8969c..ab794959550 100644 --- a/test/CodeGen/X86/exedepsfix-broadcast.ll +++ b/test/CodeGen/X86/exedepsfix-broadcast.ll @@ -95,8 +95,8 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> % ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg ; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with ; vpand and there is nothing more you can do to match vmaxpd. -; CHECK: vpunpcklqdq -; CHECK: vpand +; CHECK: vmovlhps +; CHECK: vandps ; CHECK: vmaxpd ; CHECK: ret define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) { diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 8faa3f032fe..f959fea472f 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -639,7 +639,7 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) { ; ALL-LABEL: @shuffle_v8i16_XXXdXXXX ; ALL: # BB#0: -; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm1[0,2,2,3] +; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,2,3,3] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index cd79a38ca4a..affa933768b 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -6,7 +6,7 @@ target triple = "x86_64-unknown-unknown" define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: @shuffle_v4i64_0001 ; AVX1: # BB#0: -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -18,7 +18,7 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0],xmm0[0] -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -41,7 +41,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1] -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -52,7 +52,7 @@ define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: @shuffle_v4i64_1000 ; AVX1: # BB#0: ; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -63,8 +63,8 @@ define <4 x i64> 
@shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: @shuffle_v4i64_2200 ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0] -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1] +; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -76,7 +76,7 @@ define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0] -; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm1 = xmm1[1,1] +; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -281,7 +281,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: @shuffle_v4i64_0124 ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0] +; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -292,7 +292,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: @shuffle_v4i64_0142 ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm2 = xmm2[0,0] +; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -304,7 +304,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0] -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0] +; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -316,7 +316,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0] -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -336,7 +336,7 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1] ; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1] -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0] +; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -356,7 +356,7 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1] ; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1] -; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -368,8 +368,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: @stress_test1 ; AVX1: # 
BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm0 = xmm0[1,1] -; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1] ; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -- 2.34.1
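
As a rough illustration of the load-folding point made in the commit message (a minimal sketch, not part of the patch): PSHUFD can take its source operand straight from memory and write a fresh destination register, while the single-input UNPCK splat needs the value in a register first, because that register supplies both inputs and is clobbered by the result. The snippet below assumes the {0,0,1,1} dword-splat mask discussed in lowerV4I32VectorShuffle, AT&T syntax, and a 16-byte-aligned pointer in %rdi; the mask comments follow the notation used in the FileCheck lines above.

    # PSHUFD folds the load and leaves every other register untouched.
    # imm8 0x50 selects elements {0,0,1,1} from the memory operand.
    pshufd    $0x50, (%rdi), %xmm0      # xmm0 = mem[0,0,1,1]

    # The equivalent UNPCK form of the same splat: the input must first be
    # loaded into a register, which then serves as both sources and the
    # destination of the unpack.
    movdqa    (%rdi), %xmm0
    punpckldq %xmm0, %xmm0              # xmm0 = xmm0[0,0,1,1]

This is the trade-off the patch restores: the PSHUFD form costs one byte more to encode but keeps the option of folding the load, which is why the proposed post-RA MachineCombiner is the right place to pick the shorter UNPCK encoding once the operand is known to already be in a register.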