static bool isTargetShuffle(unsigned Opcode) {
switch(Opcode) {
default: return false;
+ case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
+ case X86ISD::PSHUFB: {
+ IsUnary = true;
+ SDValue MaskNode = N->getOperand(1);
+ while (MaskNode->getOpcode() == ISD::BITCAST)
+ MaskNode = MaskNode->getOperand(0);
+
+ if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+ // If we have a build-vector, then things are easy.
+ EVT VT = MaskNode.getValueType();
+ assert(VT.isVector() &&
+ "Can't produce a non-vector with a build_vector!");
+ if (!VT.isInteger())
+ return false;
+
+ int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
+
+ SmallVector<uint64_t, 32> RawMask;
+ for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
+ auto *CN = dyn_cast<ConstantSDNode>(MaskNode->getOperand(i));
+ if (!CN)
+ return false;
+ APInt MaskElement = CN->getAPIntValue();
+
+ // We now have to decode the element which could be any integer size and
+ // extract each byte of it.
+ for (int j = 0; j < NumBytesPerElement; ++j) {
+ // Note that this is x86 and so always little endian: the low byte is
+ // the first byte of the mask.
+ RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
+ MaskElement = MaskElement.lshr(8);
+ }
+ }
+ DecodePSHUFBMask(RawMask, Mask);
+ break;
+ }
+
+ auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+ if (!MaskLoad)
+ return false;
+
+ SDValue Ptr = MaskLoad->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper)
+ Ptr = Ptr->getOperand(0);
+
+ auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+ return false;
+
+ if (auto *C = dyn_cast<ConstantDataSequential>(MaskCP->getConstVal())) {
+ // FIXME: Support AVX-512 here.
+ if (!C->getType()->isVectorTy() ||
+ (C->getNumElements() != 16 && C->getNumElements() != 32))
+ return false;
+
+ assert(C->getType()->isVectorTy() && "Expected a vector constant.");
+ DecodePSHUFBMask(C, Mask);
+ break;
+ }
+
+ return false;
+ }
case X86ISD::VPERMI:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
}
+ // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
+ // with PSHUFB. It is important to do this before we attempt to generate any
+ // blends but after all of the single-input lowerings. If the single input
+ // lowerings can find an instruction sequence that is faster than a PSHUFB, we
+ // want to preserve that and we can DAG combine any longer sequences into
+ // a PSHUFB in the end. But once we start blending from multiple inputs,
+ // the complexity of DAG combining bad patterns back into PSHUFB is too high,
+ // and there are *very* few patterns that would actually be faster than the
+ // PSHUFB approach because of its ability to zero lanes.
+ //
+ // FIXME: The only exceptions to the above are blends which are exact
+ // interleavings with direct instructions supporting them. We currently don't
+ // handle those well here.
+ if (Subtarget->hasSSSE3()) {
+ SDValue V1Mask[16];
+ SDValue V2Mask[16];
+ for (int i = 0; i < 16; ++i)
+ if (Mask[i] == -1) {
+ V1Mask[i] = V2Mask[i] = DAG.getConstant(0x80, MVT::i8);
+ } else {
+ V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
+ V2Mask[i] =
+ DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8);
+ }
+ V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+ V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+ return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+ }
+
int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
- int Depth, SelectionDAG &DAG,
+ int Depth, bool HasPSHUFB, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
}
}
- // Bail if we have fewer than 3 shuffle instructions in the chain.
- if (Depth < 3)
+ // Don't try to re-form single instruction chains under any circumstances now
+ // that we've done encoding canonicalization for them.
+ if (Depth < 2)
return false;
- // If we have 3 or more shuffle instructions, we can replace them with
- // a single PSHUFB instruction profitably. Intel's manuals suggest only using
- // PSHUFB if doing so replacing 5 instructions, but in practice PSHUFB tends
- // to be *very* fast so we're more aggressive.
- if (Subtarget->hasSSSE3()) {
+ // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
+ // can replace them with a single PSHUFB instruction profitably. Intel's
+ // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
+ // in practice PSHUFB tends to be *very* fast so we're more aggressive.
+ if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
SmallVector<SDValue, 16> PSHUFBMask;
assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
int Ratio = 16 / Mask.size();
/// combining in this recursive walk.
static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
ArrayRef<int> IncomingMask, int Depth,
- SelectionDAG &DAG,
+ bool HasPSHUFB, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
// Bound the depth of our recursive combine because this is ultimately
// See if we can recurse into the operand to combine more things.
switch (Op.getOpcode()) {
+ case X86ISD::PSHUFB:
+ HasPSHUFB = true;
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
if (Op.getOperand(0).hasOneUse() &&
combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- DAG, DCI, Subtarget))
+ HasPSHUFB, DAG, DCI, Subtarget))
return true;
break;
// We can't check for single use, we have to check that this shuffle is the only user.
if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- DAG, DCI, Subtarget))
+ HasPSHUFB, DAG, DCI, Subtarget))
return true;
break;
}
Mask.swap(NewMask);
}
- return combineX86ShuffleChain(Op, Root, Mask, Depth, DAG, DCI, Subtarget);
+ return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
+ Subtarget);
}
/// \brief Get the PSHUF-style mask from PSHUF node.
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
- /*Depth*/ 1, DAG, DCI, Subtarget))
+ /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
+ DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
}
case X86ISD::UNPCKL:
case X86ISD::MOVHLPS:
case X86ISD::MOVLHPS:
+ case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSSE3
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; FIXME: # BB#0:
+; FIXME-NEXT: punpcklbw %xmm0, %xmm0
+; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
+; FIXME-NEXT: retq
+; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+;
+; SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSSE3-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7]
+; SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; SSSE3-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: punpcklwd %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
+; ALL-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03
+; ALL: # BB#0:
+; ALL-NEXT: punpcklbw %xmm0, %xmm0
+; ALL-NEXT: punpcklwd %xmm0, %xmm0
+; ALL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
+; ALL-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07
+; ALL: # BB#0:
+; ALL-NEXT: punpcklbw %xmm0, %xmm0
+; ALL-NEXT: punpckhwd %xmm0, %xmm0
+; ALL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
+; SSE2: # BB#0:
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; SSSE3-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
+; ALL-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07
+; ALL: # BB#0:
+; ALL-NEXT: punpcklbw %xmm0, %xmm0
+; ALL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_0101010101010101
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; FIXME-LABEL: @shuffle_v16i8_0101010101010101
+; FIXME: # BB#0:
+; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
+; FIXME-NEXT: retq
+;
+; SSE2-LABEL: @shuffle_v16i8_0101010101010101
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_0101010101010101
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23
-; CHECK-SSE2: punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; ALL-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23
+; ALL: punpcklbw %xmm1, %xmm0
+; ALL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw %xmm1, %xmm1
+; SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: punpcklbw %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
+; SSSE3: # BB#0:
+; SSSE3-NEXT: punpcklbw %xmm1, %xmm1
+; SSSE3-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSSE3-NEXT: punpcklbw %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12
-; CHECK-SSE2: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: punpckhbw %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: packuswb %xmm2, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw %xmm1, %xmm2
+; SSE2-NEXT: pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw %xmm1, %xmm0
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20
-; CHECK-SSE2: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw %xmm2, %xmm1
+; SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw %xmm2, %xmm0
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20
-; CHECK-SSE2: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1]
-; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpcklbw %xmm2, %xmm3
+; SSE2-NEXT: pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhbw %xmm2, %xmm4
+; SSE2-NEXT: pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1]
+; SSE2-NEXT: punpckhbw %xmm2, %xmm1
+; SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw %xmm2, %xmm0
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
ret <16 x i8> %shuffle
}
define <16 x i8> @zext_to_v8i16_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @zext_to_v8i16_shuffle
-; CHECK-SSE2: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
+; ALL-LABEL: @zext_to_v8i16_shuffle
+; ALL: pxor %xmm1, %xmm1
+; ALL-NEXT: punpcklbw %xmm1, %xmm0
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
ret <16 x i8> %shuffle
}
define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @zext_to_v4i32_shuffle
-; CHECK-SSE2: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
+; ALL-LABEL: @zext_to_v4i32_shuffle
+; ALL: pxor %xmm1, %xmm1
+; ALL-NEXT: punpcklbw %xmm1, %xmm0
+; ALL-NEXT: punpcklbw %xmm1, %xmm0
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
ret <16 x i8> %shuffle
}
define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @trunc_v4i32_shuffle
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pand
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
+; FIXME-LABEL: @trunc_v4i32_shuffle
+; FIXME: # BB#0:
+; FIXME-NEXT: pand
+; FIXME-NEXT: packuswb %xmm0, %xmm0
+; FIXME-NEXT: packuswb %xmm0, %xmm0
+; FIXME-NEXT: retq
+;
+; SSE2-LABEL: @trunc_v4i32_shuffle
+; SSE2: # BB#0:
+; SSE2-NEXT: pand
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @trunc_v4i32_shuffle
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i8> %shuffle
}