From 8ac2f142a82f5f37e23bb6115c91ed9f6ac65f1e Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Mon, 29 Sep 2014 09:57:07 +0000
Subject: [PATCH] [x86] Make the new vector shuffle lowering lower blends as
 VSELECT nodes, and rely exclusively on its logic. This removes a ton of
 duplication from the blend lowering and centralizes it in one place.

One downside is that it requires a bunch of hacks to make this work with
the current legalization framework. We have to manually speculate one
aspect of legalizing VSELECT nodes to get everything to work nicely
because the existing legalization framework isn't *actually* bottom-up.

The other grossness is that we somewhat duplicate the analysis of constant
blends. I'm on the fence here. If reviewers think this would look better
with constant-operand VSELECTs deferring to VECTOR_SHUFFLE, we could go
that way. But it would be a substantial change because currently all of
the actual blend instructions are matched via patterns in the TD files
based around VSELECT nodes (despite them not being perfect fits for that).
Suggestions welcome, but at least this removes the rampant duplication in
the backend.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218600 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 345 +++++++++------------
 test/CodeGen/X86/avx.ll                    |   2 +-
 test/CodeGen/X86/vector-shuffle-256-v16.ll |  24 +-
 3 files changed, 162 insertions(+), 209 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8ea1790e52b..a2482f26730 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7143,6 +7143,87 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
 }
 
+/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
+/// instruction.
+static SDValue lowerVSELECTtoBLENDI(SDLoc DL, SDValue Cond, SDValue LHS,
+                                    SDValue RHS, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+  MVT VT = LHS.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  // No blend instruction before SSE4.1.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+  // There is no byte-blend immediate controlled instruction.
+  if (EltVT == MVT::i8)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  auto *CondBV = cast<BuildVectorSDNode>(Cond);
+
+  unsigned BlendMask = 0;
+  MVT BlendVT = VT;
+  if (VT == MVT::v16i16) {
+    // v16i16 blends are completely special. We can only do them when we have
+    // a repeated blend across the two 128-bit halves and we have AVX2.
+    if (!Subtarget->hasAVX2())
+      return SDValue();
+
+    for (int i = 0; i < 8; ++i) {
+      SDValue Lo = CondBV->getOperand(i);
+      SDValue Hi = CondBV->getOperand(i + 8);
+      bool IsLoZero = X86::isZeroNode(Lo);
+      bool IsHiZero = X86::isZeroNode(Hi);
+      if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF &&
+          IsLoZero != IsHiZero)
+        // Asymmetric blends, bail.
+        return SDValue();
+      BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i;
+    }
+  } else {
+    // Everything else uses a generic blend mask computation with a custom type.
+    if (VT.isInteger()) {
+      if (VT.is256BitVector())
+        // We cast to floating point types if integer blends aren't available,
+        // and we coerce integer blends when available to occur on the v8i32
+        // type.
+        BlendVT = Subtarget->hasAVX2()
+                      ? MVT::v8i32
+                      : MVT::getVectorVT(
+                            MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                            VT.getVectorNumElements());
+      else
+        // For 128-bit vectors we do the blend on v8i16 types.
+        BlendVT = MVT::v8i16;
+    }
+    assert(BlendVT.getVectorNumElements() <= 8 &&
+           "Cannot blend more than 8 elements with an immediate!");
+    // Scale the blend mask based on the number of elements in the selected
+    // blend type.
+    int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements();
+    for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) {
+      SDValue CondElement = CondBV->getOperand(i);
+      if (CondElement->getOpcode() != ISD::UNDEF &&
+          X86::isZeroNode(CondElement))
+        for (int j = 0; j < Scale; ++j)
+          BlendMask |= 1u << (i * Scale + j);
+    }
+  }
+
+  LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS);
+  RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS);
+
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS,
+                                 DAG.getConstant(BlendMask, MVT::i8)));
+}
+
 //===----------------------------------------------------------------------===//
 // Vector shuffle lowering
 //
@@ -7300,119 +7381,48 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          const X86Subtarget *Subtarget,
                                          SelectionDAG &DAG) {
-
-  unsigned BlendMask = 0;
+  // Compute the VSELECT mask. Note that VSELECT is really confusing in the
+  // mix of LLVM's code generator and the x86 backend. We tell the code
+  // generator that boolean values in the elements of an x86 vector register
+  // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
+  // mapping a select to operand #1, and 'false' mapping to operand #2. The
+  // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
+  // of the element (the remaining are ignored) and 0 in that high bit would
+  // mean operand #1 while 1 in the high bit would mean operand #2. So while
+  // the LLVM model for boolean values in vector elements gets the relevant
+  // bit set, it is set backwards and over constrained relative to x86's
+  // actual model.
+  SmallVector<SDValue, 32> VSELECTMask;
+  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+  MVT MaskVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
+  SDValue TrueVal = DAG.getConstant(-1, MaskEltVT);
+  SDValue FalseVal = DAG.getConstant(0, MaskEltVT);
   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
-    if (Mask[i] >= Size) {
+    if (Mask[i] < 0) {
+      VSELECTMask.push_back(DAG.getUNDEF(MaskEltVT));
+    } else if (Mask[i] < Size) {
+      if (Mask[i] != i)
+        return SDValue(); // Shuffled V1 input!
+      VSELECTMask.push_back(TrueVal);
+    } else {
       if (Mask[i] != i + Size)
-        return SDValue(); // Shuffled V2 input!
-      BlendMask |= 1u << i;
-      continue;
+        return SDValue(); // Shuffled V2 input!
+      VSELECTMask.push_back(FalseVal);
     }
-    if (Mask[i] >= 0 && Mask[i] != i)
-      return SDValue(); // Shuffled V1 input!
   }
 
-  switch (VT.SimpleTy) {
-  case MVT::v2f64:
-  case MVT::v4f32:
-  case MVT::v4f64:
-  case MVT::v8f32:
-    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
-                       DAG.getConstant(BlendMask, MVT::i8));
-  case MVT::v4i64:
-  case MVT::v8i32:
-    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
-    // FALLTHROUGH
-  case MVT::v2i64:
-  case MVT::v4i32:
-    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
-    // that instruction.
-    if (Subtarget->hasAVX2()) {
-      // Scale the blend by the number of 32-bit dwords per element.
-      int Scale = VT.getScalarSizeInBits() / 32;
-      BlendMask = 0;
-      for (int i = 0, Size = Mask.size(); i < Size; ++i)
-        if (Mask[i] >= Size)
-          for (int j = 0; j < Scale; ++j)
-            BlendMask |= 1u << (i * Scale + j);
-
-      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
-      V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
-      V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
-      return DAG.getNode(ISD::BITCAST, DL, VT,
-                         DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
-                                     DAG.getConstant(BlendMask, MVT::i8)));
-    }
-    // FALLTHROUGH
-  case MVT::v8i16: {
-    // For integer shuffles we need to expand the mask and cast the inputs to
-    // v8i16s prior to blending.
-    int Scale = 8 / VT.getVectorNumElements();
-    BlendMask = 0;
-    for (int i = 0, Size = Mask.size(); i < Size; ++i)
-      if (Mask[i] >= Size)
-        for (int j = 0; j < Scale; ++j)
-          BlendMask |= 1u << (i * Scale + j);
-
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
-                                   DAG.getConstant(BlendMask, MVT::i8)));
-  }
-
-  case MVT::v16i16: {
-    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
-    SmallVector<int, 8> RepeatedMask;
-    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
-      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
-      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
-      BlendMask = 0;
-      for (int i = 0; i < 8; ++i)
-        if (RepeatedMask[i] >= 16)
-          BlendMask |= 1u << i;
-      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
-                         DAG.getConstant(BlendMask, MVT::i8));
-    }
-  }
-  // FALLTHROUGH
-  case MVT::v32i8: {
-    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
-    // Scale the blend by the number of bytes per element.
-    int Scale = VT.getScalarSizeInBits() / 8;
-    assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
-
-    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
-    // mix of LLVM's code generator and the x86 backend. We tell the code
-    // generator that boolean values in the elements of an x86 vector register
-    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
-    // mapping a select to operand #1, and 'false' mapping to operand #2. The
-    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
-    // of the element (the remaining are ignored) and 0 in that high bit would
-    // mean operand #1 while 1 in the high bit would mean operand #2. So while
-    // the LLVM model for boolean values in vector elements gets the relevant
-    // bit set, it is set backwards and over constrained relative to x86's
-    // actual model.
-    SDValue VSELECTMask[32];
-    for (int i = 0, Size = Mask.size(); i < Size; ++i)
-      for (int j = 0; j < Scale; ++j)
-        VSELECTMask[Scale * i + j] =
-            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
-                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
-
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
-    return DAG.getNode(
-        ISD::BITCAST, DL, VT,
-        DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
-                    DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
-                    V1, V2));
-  }
+  // We have to manually attempt to lower this via BLENDI because at this phase
+  // of legalization we may end up legalizing the BUILD_VECTOR past where it can
+  // be analyzed prior to legalizing the VSELECT.
+  // FIXME: At some point, the legalizer should work more like the DAG combiner
+  // where it evaluates replacement nodes eagerly rather than risking proceeding
+  // to their (now shared) operands.
+  SDValue Cond = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, VSELECTMask);
+  if (SDValue BlendI = lowerVSELECTtoBLENDI(DL, Cond, V1, V2, Subtarget, DAG))
+    return BlendI;
 
-  default:
-    llvm_unreachable("Not a supported integer vector type!");
-  }
+  // Otherwise fall back on the generic VSELECT lowering.
+  return DAG.getNode(ISD::VSELECT, DL, VT, Cond, V1, V2);
 }
 
 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
@@ -11797,91 +11807,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
-/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
-/// instruction.
-static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
-                                    SelectionDAG &DAG) {
-  SDValue Cond = Op.getOperand(0);
-  SDValue LHS = Op.getOperand(1);
-  SDValue RHS = Op.getOperand(2);
-  SDLoc DL(Op);
-  MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT.getVectorElementType();
-
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
-
-  // No blend instruction before SSE4.1.
-  if (!Subtarget->hasSSE41())
-    return SDValue();
-  // There is no byte-blend immediate controlled instruction.
-  if (EltVT == MVT::i8)
-    return SDValue();
-
-  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
-    return SDValue();
-
-  auto *CondBV = cast<BuildVectorSDNode>(Cond);
-
-  unsigned BlendMask = 0;
-  MVT BlendVT = VT;
-  if (VT == MVT::v16i16) {
-    // v16i16 blends are completely special. We can only do them when we have
-    // a repeated blend across the two 128-bit halves and we have AVX2.
-    if (!Subtarget->hasAVX2())
-      return SDValue();
-
-    for (int i = 0; i < 8; ++i) {
-      SDValue Lo = CondBV->getOperand(i);
-      SDValue Hi = CondBV->getOperand(i + 8);
-      bool IsLoZero = X86::isZeroNode(Lo);
-      bool IsHiZero = X86::isZeroNode(Hi);
-      if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF &&
-          IsLoZero != IsHiZero)
-        // Asymmetric blends, bail.
-        return SDValue();
-      BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i;
-    }
-  } else {
-    // Everything else uses a generic blend mask computation with a custom type.
-    if (VT.isInteger()) {
-      if (VT.is256BitVector())
-        // We cast to floating point types if integer blends aren't available,
-        // and we coerce integer blends when available to occur on the v8i32
-        // type.
-        BlendVT = Subtarget->hasAVX2()
-                      ? MVT::v8i32
-                      : MVT::getVectorVT(
-                            MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
-                            VT.getVectorNumElements());
-      else
-        // For 128-bit vectors we do the blend on v8i16 types.
-        BlendVT = MVT::v8i16;
-    }
-    assert(BlendVT.getVectorNumElements() <= 8 &&
-           "Cannot blend more than 8 elements with an immediate!");
-    // Scale the blend mask based on the number of elements in the selected
-    // blend type.
-    int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements();
-    for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) {
-      SDValue CondElement = CondBV->getOperand(i);
-      if (CondElement->getOpcode() != ISD::UNDEF &&
-          X86::isZeroNode(CondElement))
-        for (int j = 0; j < Scale; ++j)
-          BlendMask |= 1u << (i * Scale + j);
-    }
-  }
-
-  LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS);
-  RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS);
-
-  return DAG.getNode(ISD::BITCAST, DL, VT,
-                     DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS,
-                                 DAG.getConstant(BlendMask, MVT::i8)));
-}
-
 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
   // A vselect where all conditions and data are constants can be optimized into
   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
@@ -11889,22 +11816,48 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
     return SDValue();
 
-  SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDValue BlendOp = lowerVSELECTtoBLENDI(DL, Cond, LHS, RHS, Subtarget, DAG);
   if (BlendOp.getNode())
    return BlendOp;
 
-  // Some types for vselect were previously set to Expand, not Legal or
-  // Custom. Return an empty SDValue so we fall-through to Expand, after
-  // the Custom lowering phase.
+  // If the condition vector type is different from the input vector types, bail
+  // to the TD patterns. This should only happen with vNi1 conditions.
+  if (Op.getSimpleValueType() != Op->getOperand(0).getSimpleValueType())
+    return Op;
+
+  // Check for types that need to be mapped in order to lower.
   MVT VT = Op.getSimpleValueType();
   switch (VT.SimpleTy) {
   default:
     break;
+  case MVT::v4i64:
+  case MVT::v8i32:
+    // If we don't have AVX2 we don't want to drop to a v32i8 which will require
+    // splitting the vector. Instead, let the patterns for v4f64 and v8f32 lower
+    // these blends.
+    if (!Subtarget->hasAVX2())
+      break;
+    // FALL THROUGH
+
+  case MVT::v2i64:
+  case MVT::v4i32:
   case MVT::v8i16:
   case MVT::v16i16:
    if (Subtarget->hasBWI() && Subtarget->hasVLX())
      break;
-    return SDValue();
+
+    // We need to phrase these as i8 blends. Bitcasting the condition is fine
+    // because true is defined as -1 which will set *all* of the bits to one.
+    MVT BlendVT = MVT::getVectorVT(MVT::i8, (VT.getScalarSizeInBits() / 8) *
+                                                VT.getVectorNumElements());
+    Cond = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(0));
+    LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(1));
+    RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(2));
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(ISD::VSELECT, DL, BlendVT, Cond, LHS, RHS));
   }
 
   // We couldn't create a "Blend with immediate" node.
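[Reviewer note, not part of the patch: the immediate computation in
lowerVSELECTtoBLENDI above can be puzzling on first read. Below is a
minimal standalone C++ sketch of the mask scaling, with hypothetical names
(scaleBlendMask is not an LLVM function): when the blend is emitted on a
type with more, narrower elements than the original VT, each original lane
expands to `Scale` lanes and contributes `Scale` consecutive bits to the
immediate.

#include <cassert>
#include <cstdio>
#include <vector>

// Mirrors the inner loop of lowerVSELECTtoBLENDI: set Scale consecutive
// immediate bits for every original lane that takes operand #2.
static unsigned scaleBlendMask(const std::vector<bool> &LaneTakesOp2,
                               int Scale) {
  unsigned BlendMask = 0;
  for (int i = 0, e = (int)LaneTakesOp2.size(); i < e; ++i)
    if (LaneTakesOp2[i])
      for (int j = 0; j < Scale; ++j)
        BlendMask |= 1u << (i * Scale + j);
  return BlendMask;
}

int main() {
  // A v4i32 blend taking lanes 1 and 3 from the second operand, phrased as
  // a v8i16 PBLENDW: Scale = 8 / 4 = 2, so the immediate is 0b11001100.
  unsigned Imm = scaleBlendMask({false, true, false, true}, /*Scale=*/2);
  assert(Imm == 0xCC);
  printf("blend immediate: 0x%X\n", Imm);
  return 0;
}

End of reviewer note.]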
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
index cba6d98f5a8..f66b1b47cc2 100644
--- a/test/CodeGen/X86/avx.ll
+++ b/test/CodeGen/X86/avx.ll
@@ -3,7 +3,7 @@
 define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @blendvb_fallback_v4i32
-; CHECK: vblendvps
+; CHECK: vpblendvb
 ; CHECK: ret
   %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
   ret <4 x i32> %ret
 }
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index ad6eca3b926..3488904c8b0 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -166,7 +166,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -190,7 +190,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,u,u,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,65535,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -215,7 +215,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,4,5,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,u,u,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,65535,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -240,7 +240,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,u,u,u,u,6,7,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,u,u,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,65535,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -265,7 +265,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,u,u,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -290,7 +290,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,u,u,10,11,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,u,u,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -316,7 +316,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,65535,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -342,7 +342,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-NEXT:    vperm2i128 {{.*}} # ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[14,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[u,u,0,1,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [65535,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -648,7 +648,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_3
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
@@ -665,7 +665,7 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -684,7 +684,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
@@ -703,7 +703,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_3
 ;
 ; AVX2-LABEL: @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
-- 
2.34.1
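[Reviewer note, not part of the patch: the long comment in
lowerVectorShuffleAsBlend about booleans being "set backwards" relative to
x86, and the vpblendvb lines in the tests above, can be modeled in a few
lines of standalone C++. This is an illustrative sketch only; it assumes
the usual operand-swapped vselect-to-BLENDV patterns in the .td files, and
none of these names are LLVM functions.

#include <cassert>
#include <cstdint>

// LLVM model of one VSELECT lane: an all-ones (true) condition lane
// selects operand #1.
static uint8_t vselectLane(uint8_t CondLane, uint8_t Op1, uint8_t Op2) {
  return CondLane == 0xFF ? Op1 : Op2;
}

// x86 model of one PBLENDVB lane: only the sign bit of the mask is
// tested, and a *set* bit selects the second source.
static uint8_t pblendvbLane(uint8_t Src1, uint8_t Src2, uint8_t MaskLane) {
  return (MaskLane & 0x80) ? Src2 : Src1;
}

int main() {
  for (int Op1 = 0; Op1 < 4; ++Op1)
    for (int Op2 = 0; Op2 < 4; ++Op2) {
      // The two models agree once the instruction's sources are given in
      // swapped order, which is what the blend patterns compensate for.
      assert(vselectLane(0xFF, Op1, Op2) == pblendvbLane(Op2, Op1, 0xFF));
      assert(vselectLane(0x00, Op1, Op2) == pblendvbLane(Op2, Op1, 0x00));
    }
  return 0;
}

This is why bitcasting the condition in LowerVSELECT is safe: LLVM's -1
'true' sets every bit of the lane, so in particular the sign bit of every
i8 sub-lane is set. End of reviewer note.]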