From 8ac2f142a82f5f37e23bb6115c91ed9f6ac65f1e Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Mon, 29 Sep 2014 09:57:07 +0000
Subject: [PATCH] [x86] Make the new vector shuffle lowering lower blends as
 VSELECT nodes, and rely exclusively on its logic. This removes a ton of
 duplication from the blend lowering and centralizes it in one place.

One downside is that it requires a bunch of hacks to make this work with
the current legalization framework. We have to manually speculate one
aspect of legalizing VSELECT nodes to get everything to work nicely
because the existing legalization framework isn't *actually* bottom-up.

The other grossness is that we somewhat duplicate the analysis of constant
blends. I'm on the fence here. If reviewers think this would look better
with constant-operand VSELECTs deferring to VECTOR_SHUFFLE, we could go
that way. But it would be a substantial change because currently all of
the actual blend instructions are matched via patterns in the TD files
based around VSELECT nodes (despite them not being perfect fits for that).
Suggestions welcome, but at least this removes the rampant duplication in
the backend.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218600 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 345 +++++++++------------
 test/CodeGen/X86/avx.ll                    |   2 +-
 test/CodeGen/X86/vector-shuffle-256-v16.ll |  24 +-
 3 files changed, 162 insertions(+), 209 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8ea1790e52b..a2482f26730 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7143,6 +7143,87 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
 }
 
+/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
+/// instruction.
+static SDValue lowerVSELECTtoBLENDI(SDLoc DL, SDValue Cond, SDValue LHS,
+                                    SDValue RHS, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+  MVT VT = LHS.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  // No blend instruction before SSE4.1.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+  // There is no byte-blend immediate controlled instruction.
+  if (EltVT == MVT::i8)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  auto *CondBV = cast<BuildVectorSDNode>(Cond);
+
+  unsigned BlendMask = 0;
+  MVT BlendVT = VT;
+  if (VT == MVT::v16i16) {
+    // v16i16 blends are completely special. We can only do them when we have
+    // a repeated blend across the two 128-bit halves and we have AVX2.
+    if (!Subtarget->hasAVX2())
+      return SDValue();
+
+    for (int i = 0; i < 8; ++i) {
+      SDValue Lo = CondBV->getOperand(i);
+      SDValue Hi = CondBV->getOperand(i + 8);
+      bool IsLoZero = X86::isZeroNode(Lo);
+      bool IsHiZero = X86::isZeroNode(Hi);
+      if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF &&
+          IsLoZero != IsHiZero)
+        // Asymmetric blends, bail.
+        return SDValue();
+      BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i;
+    }
+  } else {
+    // Everything else uses a generic blend mask computation with a custom type.
+    if (VT.isInteger()) {
+      if (VT.is256BitVector())
+        // We cast to floating point types if integer blends aren't available,
+        // and we coerce integer blends when available to occur on the v8i32
+        // type.
+        BlendVT = Subtarget->hasAVX2()
+                      ? MVT::v8i32
+                      : MVT::getVectorVT(
+                            MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                            VT.getVectorNumElements());
+      else
+        // For 128-bit vectors we do the blend on v8i16 types.
+        BlendVT = MVT::v8i16;
+    }
+    assert(BlendVT.getVectorNumElements() <= 8 &&
+           "Cannot blend more than 8 elements with an immediate!");
+    // Scale the blend mask based on the number of elements in the selected
+    // blend type.
+    int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements();
+    for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) {
+      SDValue CondElement = CondBV->getOperand(i);
+      if (CondElement->getOpcode() != ISD::UNDEF &&
+          X86::isZeroNode(CondElement))
+        for (int j = 0; j < Scale; ++j)
+          BlendMask |= 1u << (i * Scale + j);
+    }
+  }
+
+  LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS);
+  RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS);
+
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS,
+                                 DAG.getConstant(BlendMask, MVT::i8)));
+}
+
 //===----------------------------------------------------------------------===//
 // Vector shuffle lowering
 //
@@ -7300,119 +7381,48 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          const X86Subtarget *Subtarget,
                                          SelectionDAG &DAG) {
-
-  unsigned BlendMask = 0;
+  // Compute the VSELECT mask. Note that VSELECT is really confusing in the
+  // mix of LLVM's code generator and the x86 backend. We tell the code
+  // generator that boolean values in the elements of an x86 vector register
+  // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
+  // mapping a select to operand #1, and 'false' mapping to operand #2. The
+  // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
+  // of the element (the remaining are ignored) and 0 in that high bit would
+  // mean operand #1 while 1 in the high bit would mean operand #2. So while
+  // the LLVM model for boolean values in vector elements gets the relevant
+  // bit set, it is set backwards and over constrained relative to x86's
+  // actual model.
+  SmallVector<SDValue, 32> VSELECTMask;
+  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+  MVT MaskVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
+  SDValue TrueVal = DAG.getConstant(-1, MaskEltVT);
+  SDValue FalseVal = DAG.getConstant(0, MaskEltVT);
   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
-    if (Mask[i] >= Size) {
+    if (Mask[i] < 0) {
+      VSELECTMask.push_back(DAG.getUNDEF(MaskEltVT));
+    } else if (Mask[i] < Size) {
+      if (Mask[i] != i)
+        return SDValue(); // Shuffled V1 input!
+      VSELECTMask.push_back(TrueVal);
+    } else {
       if (Mask[i] != i + Size)
-        return SDValue(); // Shuffled V2 input!
-      BlendMask |= 1u << i;
-      continue;
+        return SDValue(); // Shuffled V2 input!
+      VSELECTMask.push_back(FalseVal);
     }
-    if (Mask[i] >= 0 && Mask[i] != i)
-      return SDValue(); // Shuffled V1 input!
   }
 
-  switch (VT.SimpleTy) {
-  case MVT::v2f64:
-  case MVT::v4f32:
-  case MVT::v4f64:
-  case MVT::v8f32:
-    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
-                       DAG.getConstant(BlendMask, MVT::i8));
-  case MVT::v4i64:
-  case MVT::v8i32:
-    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
-    // FALLTHROUGH
-  case MVT::v2i64:
-  case MVT::v4i32:
-    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
-    // that instruction.
-    if (Subtarget->hasAVX2()) {
-      // Scale the blend by the number of 32-bit dwords per element.
-      int Scale = VT.getScalarSizeInBits() / 32;
-      BlendMask = 0;
-      for (int i = 0, Size = Mask.size(); i < Size; ++i)
-        if (Mask[i] >= Size)
-          for (int j = 0; j < Scale; ++j)
-            BlendMask |= 1u << (i * Scale + j);
-
-      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
-      V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
-      V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
-      return DAG.getNode(ISD::BITCAST, DL, VT,
-                         DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
-                                     DAG.getConstant(BlendMask, MVT::i8)));
-    }
-    // FALLTHROUGH
-  case MVT::v8i16: {
-    // For integer shuffles we need to expand the mask and cast the inputs to
-    // v8i16s prior to blending.
-    int Scale = 8 / VT.getVectorNumElements();
-    BlendMask = 0;
-    for (int i = 0, Size = Mask.size(); i < Size; ++i)
-      if (Mask[i] >= Size)
-        for (int j = 0; j < Scale; ++j)
-          BlendMask |= 1u << (i * Scale + j);
-
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
-                                   DAG.getConstant(BlendMask, MVT::i8)));
-  }
-
-  case MVT::v16i16: {
-    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
-    SmallVector<int, 8> RepeatedMask;
-    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
-      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
-      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
-      BlendMask = 0;
-      for (int i = 0; i < 8; ++i)
-        if (RepeatedMask[i] >= 16)
-          BlendMask |= 1u << i;
-      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
-                         DAG.getConstant(BlendMask, MVT::i8));
-    }
-  }
-  // FALLTHROUGH
-  case MVT::v32i8: {
-    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
-    // Scale the blend by the number of bytes per element.
-    int Scale = VT.getScalarSizeInBits() / 8;
-    assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
-
-    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
-    // mix of LLVM's code generator and the x86 backend. We tell the code
-    // generator that boolean values in the elements of an x86 vector register
-    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
-    // mapping a select to operand #1, and 'false' mapping to operand #2. The
-    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
-    // of the element (the remaining are ignored) and 0 in that high bit would
-    // mean operand #1 while 1 in the high bit would mean operand #2. So while
-    // the LLVM model for boolean values in vector elements gets the relevant
-    // bit set, it is set backwards and over constrained relative to x86's
-    // actual model.
-    SDValue VSELECTMask[32];
-    for (int i = 0, Size = Mask.size(); i < Size; ++i)
-      for (int j = 0; j < Scale; ++j)
-        VSELECTMask[Scale * i + j] =
-            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
-                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
-
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
-    return DAG.getNode(
-        ISD::BITCAST, DL, VT,
-        DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
-                    DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
-                    V1, V2));
-  }
+  // We have to manually attempt to lower this via BLENDI because at this phase
+  // of legalization we may end up legalizing the BUILD_VECTOR past where it can
+  // be analyzed prior to legalizing the VSELECT.
+  // FIXME: At some point, the legalizer should work more like the DAG combiner
+  // where it evaluates replacement nodes eagerly rather than risking proceeding
+  // to their (now shared) operands.
+  SDValue Cond = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, VSELECTMask);
+  if (SDValue BlendI = lowerVSELECTtoBLENDI(DL, Cond, V1, V2, Subtarget, DAG))
+    return BlendI;
 
-  default:
-    llvm_unreachable("Not a supported integer vector type!");
-  }
+  // Otherwise fall back on the generic VSELECT lowering.
+  return DAG.getNode(ISD::VSELECT, DL, VT, Cond, V1, V2);
 }
 
 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
@@ -11797,91 +11807,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
-/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
-/// instruction.
-static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
-                                    SelectionDAG &DAG) {
-  SDValue Cond = Op.getOperand(0);
-  SDValue LHS = Op.getOperand(1);
-  SDValue RHS = Op.getOperand(2);
-  SDLoc DL(Op);
-  MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT.getVectorElementType();
-
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
-
-  // No blend instruction before SSE4.1.
-  if (!Subtarget->hasSSE41())
-    return SDValue();
-  // There is no byte-blend immediate controlled instruction.
-  if (EltVT == MVT::i8)
-    return SDValue();
-
-  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
-    return SDValue();
-
-  auto *CondBV = cast<BuildVectorSDNode>(Cond);
-
-  unsigned BlendMask = 0;
-  MVT BlendVT = VT;
-  if (VT == MVT::v16i16) {
-    // v16i16 blends are completely special. We can only do them when we have
-    // a repeated blend across the two 128-bit halves and we have AVX2.
-    if (!Subtarget->hasAVX2())
-      return SDValue();
-
-    for (int i = 0; i < 8; ++i) {
-      SDValue Lo = CondBV->getOperand(i);
-      SDValue Hi = CondBV->getOperand(i + 8);
-      bool IsLoZero = X86::isZeroNode(Lo);
-      bool IsHiZero = X86::isZeroNode(Hi);
-      if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF &&
-          IsLoZero != IsHiZero)
-        // Asymmetric blends, bail.
-        return SDValue();
-      BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i;
-    }
-  } else {
-    // Everything else uses a generic blend mask computation with a custom type.
-    if (VT.isInteger()) {
-      if (VT.is256BitVector())
-        // We cast to floating point types if integer blends aren't available,
-        // and we coerce integer blends when available to occur on the v8i32
-        // type.
-        BlendVT = Subtarget->hasAVX2()
-                      ? MVT::v8i32
-                      : MVT::getVectorVT(
-                            MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
-                            VT.getVectorNumElements());
-      else
-        // For 128-bit vectors we do the blend on v8i16 types.
-        BlendVT = MVT::v8i16;
-    }
-    assert(BlendVT.getVectorNumElements() <= 8 &&
-           "Cannot blend more than 8 elements with an immediate!");
-    // Scale the blend mask based on the number of elements in the selected
-    // blend type.
-    int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements();
-    for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) {
-      SDValue CondElement = CondBV->getOperand(i);
-      if (CondElement->getOpcode() != ISD::UNDEF &&
-          X86::isZeroNode(CondElement))
-        for (int j = 0; j < Scale; ++j)
-          BlendMask |= 1u << (i * Scale + j);
-    }
-  }
-
-  LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS);
-  RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS);
-
-  return DAG.getNode(ISD::BITCAST, DL, VT,
-                     DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS,
-                                 DAG.getConstant(BlendMask, MVT::i8)));
-}
-
 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
   // A vselect where all conditions and data are constants can be optimized into
   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
@@ -11889,22 +11816,48 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
     return SDValue();
 
-  SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDValue BlendOp = lowerVSELECTtoBLENDI(DL, Cond, LHS, RHS, Subtarget, DAG);
   if (BlendOp.getNode())
    return BlendOp;
 
-  // Some types for vselect were previously set to Expand, not Legal or
-  // Custom. Return an empty SDValue so we fall-through to Expand, after
-  // the Custom lowering phase.
+  // If the condition vector type is different from the input vector types, bail
+  // to the TD patterns. This should only happen with vNi1 conditions.
+  if (Op.getSimpleValueType() != Op->getOperand(0).getSimpleValueType())
+    return Op;
+
+  // Check for types that need to be mapped in order to lower.
   MVT VT = Op.getSimpleValueType();
   switch (VT.SimpleTy) {
   default:
     break;
+  case MVT::v4i64:
+  case MVT::v8i32:
+    // If we don't have AVX2 we don't want to drop to a v32i8 which will require
+    // splitting the vector. Instead, let the patterns for v4f64 and v8f32 lower
+    // these blends.
+    if (!Subtarget->hasAVX2())
+      break;
+    // FALL THROUGH
+
+  case MVT::v2i64:
+  case MVT::v4i32:
   case MVT::v8i16:
   case MVT::v16i16:
    if (Subtarget->hasBWI() && Subtarget->hasVLX())
      break;
-    return SDValue();
+
+    // We need to phrase these as i8 blends. Bitcasting the condition is fine
+    // because true is defined as -1 which will set *all* of the bits to one.
+    MVT BlendVT = MVT::getVectorVT(MVT::i8, (VT.getScalarSizeInBits() / 8) *
+                                                VT.getVectorNumElements());
+    Cond = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(0));
+    LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(1));
+    RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(2));
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(ISD::VSELECT, DL, BlendVT, Cond, LHS, RHS));
   }
 
   // We couldn't create a "Blend with immediate" node.
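[Reviewer note, not part of the patch: the immediate computation in
lowerVSELECTtoBLENDI above can be puzzling on first read. Below is a
minimal standalone C++ sketch of the mask scaling, with hypothetical names
(scaleBlendMask is not an LLVM function): when the blend is emitted on a
type with more, narrower elements than the original VT, each original lane
expands to `Scale` lanes and contributes `Scale` consecutive bits to the
immediate.

#include <cassert>
#include <cstdio>
#include <vector>

// Mirrors the inner loop of lowerVSELECTtoBLENDI: set Scale consecutive
// immediate bits for every original lane that takes operand #2.
static unsigned scaleBlendMask(const std::vector<bool> &LaneTakesOp2,
                               int Scale) {
  unsigned BlendMask = 0;
  for (int i = 0, e = (int)LaneTakesOp2.size(); i < e; ++i)
    if (LaneTakesOp2[i])
      for (int j = 0; j < Scale; ++j)
        BlendMask |= 1u << (i * Scale + j);
  return BlendMask;
}

int main() {
  // A v4i32 blend taking lanes 1 and 3 from the second operand, phrased as
  // a v8i16 PBLENDW: Scale = 8 / 4 = 2, so the immediate is 0b11001100.
  unsigned Imm = scaleBlendMask({false, true, false, true}, /*Scale=*/2);
  assert(Imm == 0xCC);
  printf("blend immediate: 0x%X\n", Imm);
  return 0;
}

End of reviewer note.]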
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
index cba6d98f5a8..f66b1b47cc2 100644
--- a/test/CodeGen/X86/avx.ll
+++ b/test/CodeGen/X86/avx.ll
@@ -3,7 +3,7 @@
 define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @blendvb_fallback_v4i32
-; CHECK: vblendvps
+; CHECK: vpblendvb
 ; CHECK: ret
   %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
   ret <4 x i32> %ret
 }
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index ad6eca3b926..3488904c8b0 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -166,7 +166,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -190,7 +190,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,u,u,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,65535,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -215,7 +215,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,4,5,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,u,u,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,65535,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -240,7 +240,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,u,u,u,u,6,7,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,u,u,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,65535,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -265,7 +265,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,u,u,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -290,7 +290,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,10,11,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,u,u,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -316,7 +316,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,65535,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -342,7 +342,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[14,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[u,u,0,1,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [65535,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -648,7 +648,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_3
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
@@ -665,7 +665,7 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -684,7 +684,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
@@ -703,7 +703,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_3
 ;
 ; AVX2-LABEL: @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
-- 
2.34.1
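[Reviewer note, not part of the patch: the long comment in
lowerVectorShuffleAsBlend about booleans being "set backwards" relative to
x86, and the vpblendvb lines in the tests above, can be modeled in a few
lines of standalone C++. This is an illustrative sketch only; it assumes
the usual operand-swapped vselect-to-BLENDV patterns in the .td files, and
none of these names are LLVM functions.

#include <cassert>
#include <cstdint>

// LLVM model of one VSELECT lane: an all-ones (true) condition lane
// selects operand #1.
static uint8_t vselectLane(uint8_t CondLane, uint8_t Op1, uint8_t Op2) {
  return CondLane == 0xFF ? Op1 : Op2;
}

// x86 model of one PBLENDVB lane: only the sign bit of the mask is
// tested, and a *set* bit selects the second source.
static uint8_t pblendvbLane(uint8_t Src1, uint8_t Src2, uint8_t MaskLane) {
  return (MaskLane & 0x80) ? Src2 : Src1;
}

int main() {
  for (int Op1 = 0; Op1 < 4; ++Op1)
    for (int Op2 = 0; Op2 < 4; ++Op2) {
      // The two models agree once the instruction's sources are given in
      // swapped order, which is what the blend patterns compensate for.
      assert(vselectLane(0xFF, Op1, Op2) == pblendvbLane(Op2, Op1, 0xFF));
      assert(vselectLane(0x00, Op1, Op2) == pblendvbLane(Op2, Op1, 0x00));
    }
  return 0;
}

This is why bitcasting the condition in LowerVSELECT is safe: LLVM's -1
'true' sets every bit of the lane, so in particular the sign bit of every
i8 sub-lane is set. End of reviewer note.]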