return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}
+/// Lower a 256-bit shuffle whose mask has an entirely UNDEF lower or upper
+/// half, using a half-width extract/shuffle plus INSERT_SUBVECTOR into UNDEF.
+/// Returns an empty SDValue() when none of the cases below applies.
+static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+
+ bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
+ bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
+ if (!UndefLower && !UndefUpper)
+ return SDValue();
+
+ // Upper half is undef and lower half is V1's whole upper subvector.
+ // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ if (UndefUpper &&
+ isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Lower half is undef; upper half is V1's whole lower subvector (named 'Hi').
+ // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (UndefLower &&
+ isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ }
+
+ // AVX2 has immediate 64-bit cross-lane shuffles: bail out for 4 x 64-bit.
+ if (UndefLower && Subtarget->hasAVX2() &&
+ (VT == MVT::v4f64 || VT == MVT::v4i64))
+ return SDValue();
+
+ // If the defined half only reads the lower halves of the input operands,
+ // extract them, shuffle at half width, and insert the result at 'Offset'.
+ // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
+ int HalfIdx1 = -1, HalfIdx2 = -1;
+ SmallVector<int, 8> HalfMask;
+ unsigned Offset = UndefLower ? HalfNumElts : 0;
+ for (unsigned i = 0; i != HalfNumElts; ++i) {
+ int M = Mask[i + Offset];
+ if (M < 0) {
+ HalfMask.push_back(M);
+ continue;
+ }
+
+ // Determine which of the 4 half vectors this element is from.
+ // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
+ int HalfIdx = M / HalfNumElts;
+
+ // Only shuffle using the lower halves of the inputs.
+ // TODO: Investigate usefulness of shuffling with upper halves.
+ if (HalfIdx != 0 && HalfIdx != 2)
+ return SDValue();
+
+ // Determine the element index into its half vector source.
+ int HalfElt = M % HalfNumElts;
+
+ // Up to 2 distinct half vectors are supported: the first seen feeds the
+ // half shuffle's first operand, the second (index + HalfNumElts) its second.
+ if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) {
+ HalfMask.push_back(HalfElt);
+ HalfIdx1 = HalfIdx;
+ continue;
+ }
+ if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) {
+ HalfMask.push_back(HalfElt + HalfNumElts);
+ HalfIdx2 = HalfIdx;
+ continue;
+ }
+
+ // Too many half vectors referenced (unreachable while only 0/2 pass above).
+ return SDValue();
+ }
+ assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+ auto GetHalfVector = [&](int HalfIdx) {
+ if (HalfIdx < 0)
+ return DAG.getUNDEF(HalfVT);
+ SDValue V = (HalfIdx < 2 ? V1 : V2);
+ HalfIdx = (HalfIdx % 2) * HalfNumElts;
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
+ DAG.getIntPtrConstant(HalfIdx, DL));
+ };
+
+ SDValue Half1 = GetHalfVector(HalfIdx1);
+ SDValue Half2 = GetHalfVector(HalfIdx2);
+ SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
+ DAG.getIntPtrConstant(Offset, DL));
+}
+
/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
DL, VT, V1, V2, Mask, Subtarget, DAG))
return Insertion;
+ // Handle special cases where the lower or upper half is UNDEF.
+ if (SDValue V =
+ lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// There is a really nice hard cut-over between AVX1 and AVX2 that means we
// can check for those subtargets here and avoid much of the subtarget
// querying in the per-vector-type lowering routines. With AVX1 we have
assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
// Use the original mask here, do not modify the mask twice
- Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
+ Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
// The value that should be stored
MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
-/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
-/// same as extracting the high 128-bit part of 256-bit vector and then
-/// inserting the result into the low part of a new 256-bit vector
-static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
-
- // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
- if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
- SVOp->getMaskElt(j) >= 0)
- return false;
-
- return true;
-}
-
-/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
-/// same as extracting the low 128-bit part of 256-bit vector and then
-/// inserting the result into the high part of a new 256-bit vector
-static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
-
- // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
- if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
- SVOp->getMaskElt(j) >= 0)
- return false;
-
- return true;
-}
-
/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// FIXME: This could be expanded to support 512-bit vectors as well.
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget* Subtarget) {
return DCI.CombineTo(N, InsV);
}
- //===--------------------------------------------------------------------===//
- // Combine some shuffles into subvector extracts and inserts:
- //
-
- // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- if (isShuffleHigh128VectorInsertLow(SVOp)) {
- SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
- return DCI.CombineTo(N, InsV);
- }
-
- // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- if (isShuffleLow128VectorInsertHigh(SVOp)) {
- SDValue V = Extract128BitVector(V1, 0, DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
- return DCI.CombineTo(N, InsV);
- }
-
return SDValue();
}
// If we're negating a FMUL node on a target with FMA, then we can avoid the
// use of a constant by performing (-0 - A*B) instead.
- // FIXME: Check rounding control flags as well once it becomes available.
+ // FIXME: Check rounding control flags as well once it becomes available.
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
; CHECK-LABEL: funcF:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovd %edi, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,0]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
ret <16 x i16> %shuffle
}
+define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19>
+ ret <16 x i16> %shuffle
+}
+
define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX1: # BB#0:
ret <16 x i16> %shuffle
}
+define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
+; ALL-LABEL: shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u:
+; ALL: # BB#0:
+; ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
+; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i16> %shuffle
+}
+
define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX1: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
ret <32 x i8> %shuffle
}
+define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
+; ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <32 x i8> %shuffle
+}
+
define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX1: # BB#0:
}
define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_0423:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4f64_0423:
-; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_0423:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1
-; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v4f64_0423:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
ret <4 x double> %shuffle
}
ret <4 x double> %shuffle
}
+define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_15uu:
+; ALL: # BB#0:
+; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+ ret <4 x double> %shuffle
+}
+
define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_11uu:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; ALL-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
ret <4 x double> %shuffle
define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0142:
; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0142:
; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0142:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX512VL-NEXT: retq
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0412:
; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
-; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0412:
; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
ret <4 x i64> %shuffle
}
+define <4 x i64> @shuffle_v4i64_15uu(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_15uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+ ret <4 x i64> %shuffle
+}
+
define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_11uu:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4i64_11uu:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_11uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v4i64_11uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; ALL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
ret <4 x i64> %shuffle
}
define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00040000:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_00040000:
}
define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_uuuu1111:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_uuuu1111:
-; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_uuuu1111:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
ret <8 x float> %shuffle
}
ret <8 x float> %shuffle
}
+define <8 x float> @shuffle_v8f32_1188uuuu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1188uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_uuuu3210:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_uuuu1188(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_uuuu1188:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 8, i32 8>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1111uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x float> %shuffle
+}
+
define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_5555uuuu:
; AVX1: # BB#0:
define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00040000:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_00040000:
define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_uuuu1111:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_uuuu1111:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shuffle
}
+define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) {
+; ALL-LABEL: shuffle_v8i32_2222uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_2A3Buuuu(<8 x i32> %a, <8 x i32> %b) {
+; ALL-LABEL: shuffle_v8i32_2A3Buuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i32> %shuffle
+}
+
define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_44444444:
; AVX1: # BB#0:
;
; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
entry:
;
; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,2,3,5,6,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: retq
entry:
;
; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
entry:
;
; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
entry:
;
; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
entry: