From 8f637786d825f631ecdd58e3c773f06505310048 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Tue, 23 Sep 2014 10:08:29 +0000 Subject: [PATCH] [x86] Teach the AVX1 path of the new vector shuffle lowering one more trick that I missed. VPERMILPS has a non-immediate memory operand mode that allows it to do asymmetric shuffles in the two 128-bit lanes. Use this rather than two shuffles and a blend. However, it turns out the variable shuffle path to VPERMILPS (and VPERMILPD, although that one offers no functional difference from the immediate operand other than variability) wasn't even plumbed through codegen. Do such plumbing so that we can reasonably emit a variable-masked VPERMILP instruction. Also plumb basic comment parsing and printing through so that the tests are reasonable. There are still a few tests which don't show the shuffle pattern. These are tests with undef lanes. I'll teach the shuffle decoding and printing to handle undef mask entries in a follow-up. I've looked at the masks and they seem reasonable. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218300 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/Utils/X86ShuffleDecode.cpp | 23 ++++ lib/Target/X86/Utils/X86ShuffleDecode.h | 4 + lib/Target/X86/X86ISelLowering.cpp | 27 ++--- lib/Target/X86/X86ISelLowering.h | 1 + lib/Target/X86/X86InstrFragmentsSIMD.td | 3 + lib/Target/X86/X86InstrSSE.td | 18 +++ lib/Target/X86/X86MCInstLower.cpp | 27 ++++- test/CodeGen/X86/vector-shuffle-256-v8.ll | 134 ++++++---------------- 8 files changed, 116 insertions(+), 121 deletions(-) diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 6d42a101b0e..9aca2da4902 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -287,4 +287,27 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl &ShuffleMask) { } } +void DecodeVPERMILPMask(const ConstantDataSequential *C, + SmallVectorImpl &ShuffleMask) { + Type *MaskTy = C->getType(); + assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); + assert(MaskTy->getVectorElementType()->isIntegerTy() && + "Expected integer constant mask elements!"); + int ElementBits = MaskTy->getScalarSizeInBits(); + int NumElements = MaskTy->getVectorNumElements(); + assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + "Unexpected number of vector elements."); + assert((unsigned)NumElements == C->getNumElements() && + "Constant mask has a different number of elements!"); + + ShuffleMask.reserve(NumElements); + for (int i = 0; i < NumElements; ++i) { + int Base = (i * ElementBits / 128) * (128 / ElementBits); + uint64_t Element = C->getElementAsInteger(i); + // Only the least significant 2 bits of the integer are used. 
+ int Index = Base + (Element & 0x3); + ShuffleMask.push_back(Index); + } +} + } // llvm namespace diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 03a843e7b8d..8034d209ac3 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -84,6 +84,10 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl &ShuffleMask); +/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. +void DecodeVPERMILPMask(const ConstantDataSequential *C, + SmallVectorImpl &ShuffleMask); + } // llvm namespace #endif diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2e195080f8b..40ab77aaaa0 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -9395,26 +9395,15 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } // If we have a single input shuffle with different shuffle patterns in the - // two 128-bit lanes, just do two shuffles and blend them together. This will - // be faster than extracting the high 128-bit lane, shuffling it, and - // re-inserting it. Especially on newer processors where blending is *the* - // fastest operation. + // two 128-bit lanes use the variable mask to VPERMILPS. 
if (isSingleInputShuffleMask(Mask)) { - int LoMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; - int HiMask[4] = {Mask[4], Mask[5], Mask[6], Mask[7]}; - for (int &M : HiMask) - if (M >= 0) - M -= 4; - SDValue Lo = V1, Hi = V1; - if (!isNoopShuffleMask(LoMask)) - Lo = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Lo, - getV4X86ShuffleImm8ForMask(LoMask, DAG)); - if (!isNoopShuffleMask(HiMask)) - Hi = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Hi, - getV4X86ShuffleImm8ForMask(HiMask, DAG)); - unsigned BlendMask = 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7; - return DAG.getNode(X86ISD::BLENDI, DL, MVT::v8f32, Lo, Hi, - DAG.getConstant(BlendMask, MVT::i8)); + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], MVT::i32); + return DAG.getNode( + X86ISD::VPERMILPV, DL, MVT::v8f32, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); } // Shuffle the input elements into the desired positions in V1 and V2 and diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index a624fa25dab..a16cf4a0b64 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -343,6 +343,7 @@ namespace llvm { MOVSS, UNPCKL, UNPCKH, + VPERMILPV, VPERMILPI, VPERMV, VPERMV3, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 455991e4681..2badbb7d76b 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -188,6 +188,8 @@ def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>; +def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>; def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisInt<2>]>; def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, @@ -232,6 
+234,7 @@ def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; +def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 7a7ca8548a1..a186899d231 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -8418,6 +8418,15 @@ let ExeDomain = SSEPackedDouble in { } let Predicates = [HasAVX] in { +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), + (VPERMILPSYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VPERMILPSYrm VR256:$src1, addr:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))), + (VPERMILPDYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))), + (VPERMILPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), @@ -8428,6 +8437,15 @@ def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)), def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDYmi addr:$src1, imm:$imm)>; +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))), + (VPERMILPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPERMILPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))), + (VPERMILPDrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))), + (VPERMILPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 
(X86VPermilpi VR128:$src1, (i8 imm:$imm))), (VPERMILPDri VR128:$src1, imm:$imm)>; def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index ded84fc28f1..5665a012606 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -1022,15 +1022,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::PSHUFBrm: case X86::VPSHUFBrm: - // Lower PSHUFB normally but add a comment if we can find a constant - // shuffle mask. We won't be able to do this at the MC layer because the - // mask isn't an immediate. + case X86::VPERMILPSrm: + case X86::VPERMILPDrm: + case X86::VPERMILPSYrm: + case X86::VPERMILPDYrm: + // Lower PSHUFB and VPERMILP normally but add a comment if we can find + // a constant shuffle mask. We won't be able to do this at the MC layer + // because the mask isn't an immediate. std::string Comment; raw_string_ostream CS(Comment); SmallVector Mask; - assert(MI->getNumOperands() >= 6 && - "Wrong number of operands for PSHUFBrm or VPSHUFBrm"); + // All of these instructions accept a constant pool operand as their fifth. 
+ assert(MI->getNumOperands() > 5 && "We should always have at least 5 operands!"); const MachineOperand &DstOp = MI->getOperand(0); const MachineOperand &SrcOp = MI->getOperand(1); const MachineOperand &MaskOp = MI->getOperand(5); @@ -1061,7 +1065,18 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { assert(MaskTy == C->getType() && "Expected a constant of the same type!"); - DecodePSHUFBMask(C, Mask); + switch (MI->getOpcode()) { + case X86::PSHUFBrm: + case X86::VPSHUFBrm: + DecodePSHUFBMask(C, Mask); + break; + case X86::VPERMILPSrm: + case X86::VPERMILPDrm: + case X86::VPERMILPSYrm: + case X86::VPERMILPDYrm: + DecodeVPERMILPMask(C, Mask); + } + assert(Mask.size() == MaskTy->getVectorNumElements() && "Shuffle mask has a different size than its type!"); } diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index e2f731b2af5..df40df2a325 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -381,9 +381,7 @@ define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_00015444 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,0,0,5,4,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,4,4,4,5] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,5,4,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -392,9 +390,7 @@ define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_00204644 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,2,0,0,4,6,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,4,6,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,6,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -403,9 +399,7 @@ define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_03004474 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,0,3,0,4,4,7,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,7,4,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,4,7,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -414,9 +408,7 @@ define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_10004444 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,0,0,0,4,4,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,5,4,4,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,4,4,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -425,9 +417,7 @@ define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_22006446 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,0,0,2,6,4,4,6] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,6,4,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,4,4,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -436,9 +426,7 @@ define <8 x float> 
@shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_33307474 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[3,0,3,0,7,4,7,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,7,7,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,4,7,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -447,8 +435,7 @@ define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_32104567 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,2,1,0,4,5,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -457,9 +444,7 @@ define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_00236744 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,3,0,0,6,7,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,6,7,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -468,9 +453,7 @@ define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_00226644 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,2,0,0,6,6,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,4,4,6,6] 
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,6,6,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -479,8 +462,7 @@ define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_10324567 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,3,2,4,5,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -489,8 +471,7 @@ define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_11334567 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,1,3,3,4,5,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -499,8 +480,7 @@ define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_01235467 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -509,8 +489,7 @@ define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: 
@shuffle_v8f32_01235466 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,2,5,4,6,6] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -519,9 +498,7 @@ define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_002u6u44 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,1,0,0,6,5,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -530,9 +507,7 @@ define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_00uu66uu ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,2,2,3,6,6,6,7] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -541,8 +516,7 @@ define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_103245uu ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -551,8 +525,7 @@ 
define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_1133uu67 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -561,8 +534,7 @@ define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_0uu354uu ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -571,8 +543,7 @@ define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: @shuffle_v8f32_uuu3uu66 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,1,2,2,4,5,6,6] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -956,9 +927,7 @@ define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_00015444 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,0,0,5,4,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,4,4,4,5] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,5,4,4,4] ; ALL-NEXT: retq %shuffle = 
shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -967,9 +936,7 @@ define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_00204644 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,2,0,0,4,6,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,4,6,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,6,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -978,9 +945,7 @@ define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_03004474 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,0,3,0,4,4,7,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,7,4,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,4,7,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -989,9 +954,7 @@ define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_10004444 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,0,0,0,4,4,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,5,4,4,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,4,4,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1000,9 +963,7 @@ define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_22006446 ; ALL: # BB#0: -; 
ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,0,0,2,6,4,4,6] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,6,4,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,4,4,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1011,9 +972,7 @@ define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_33307474 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[3,0,3,0,7,4,7,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,7,7,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,4,7,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1022,8 +981,7 @@ define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_32104567 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,2,1,0,4,5,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1032,9 +990,7 @@ define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_00236744 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,3,0,0,6,7,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,6,7,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1043,9 
+999,7 @@ define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_00226644 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,2,0,0,6,6,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,4,4,6,6] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,6,6,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1054,8 +1008,7 @@ define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_10324567 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,3,2,4,5,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1064,8 +1017,7 @@ define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_11334567 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,1,3,3,4,5,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1074,8 +1026,7 @@ define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_01235467 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,7] ; ALL-NEXT: retq %shuffle 
= shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1084,8 +1035,7 @@ define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_01235466 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,2,5,4,6,6] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1094,9 +1044,7 @@ define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_002u6u44 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,1,0,0,6,5,4,4] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1105,9 +1053,7 @@ define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_00uu66uu ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,2,2,3,6,6,6,7] -; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1116,8 +1062,7 @@ define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_103245uu ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6] -; ALL-NEXT: vblendps {{.*}} # ymm0 
= ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1126,8 +1071,7 @@ define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_1133uu67 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1136,8 +1080,7 @@ define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_0uu354uu ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1146,8 +1089,7 @@ define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: @shuffle_v8i32_uuu3uu66 ; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,1,2,2,4,5,6,6] -; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle -- 2.34.1