}
}
+/// \brief Test whether a shuffle mask moves any element across the 128-bit
+/// halves of the vector.
+///
+/// Indices in [Size, 2*Size) refer to the second input operand; 'M % Size'
+/// folds them into [0, Size) so both inputs are tested against the same half
+/// boundary. Negative (undef) mask entries are ignored.
+static bool isHalfCrossingShuffleMask(ArrayRef<int> Mask) {
+ int Size = Mask.size();
+ // A low-half result slot sourcing from the high half of an input crosses.
+ for (int M : Mask.slice(0, Size / 2))
+ if (M >= 0 && (M % Size) >= Size / 2)
+ return true;
+ // A high-half result slot sourcing from the low half of an input crosses.
+ for (int M : Mask.slice(Size / 2, Size / 2))
+ if (M >= 0 && (M % Size) < Size / 2)
+ return true;
+ return false;
+}
+
/// \brief Generic routine to split a 256-bit vector shuffle into 128-bit
/// shuffles.
///
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
+/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
+/// isn't available.
+///
+/// Note that Op's own value type is deliberately not asserted here: the v4i64
+/// lowering reuses this routine with bitcast V1/V2 operands while Op itself
+/// retains integer type, so only the operand types are checked.
+static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ // FIXME: If we have AVX2, we should delegate to generic code as crossing
+ // shuffles aren't a problem and FP and int have the same patterns.
+
+ // FIXME: We can handle these more cleverly than splitting for v4f64.
+ if (isHalfCrossingShuffleMask(Mask))
+ return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+ if (isSingleInputShuffleMask(Mask)) {
+ // Non-half-crossing single input shuffles can be lowered with an
+ // interleaved permutation. Bit i of the VPERMILPD immediate is set when
+ // result lane i wants the odd element of its 128-bit half (element 1 for
+ // the low half, element 3 for the high half); undef lanes leave the bit
+ // clear.
+ unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+ ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
+ return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f64, V1,
+ DAG.getConstant(VPERMILPMask, MVT::i8));
+ }
+
+ // Check if the blend happens to exactly fit that of SHUFPD: even result
+ // lanes from V1 and odd result lanes from V2 (undef lanes are tolerated).
+ // Bit i of the immediate selects the odd element of the source half.
+ if (Mask[0] < 4 && (Mask[1] == -1 || Mask[1] >= 4) &&
+ Mask[2] < 4 && (Mask[3] == -1 || Mask[3] >= 4)) {
+ unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
+ ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
+ DAG.getConstant(SHUFPDMask, MVT::i8));
+ }
+ // The commuted form: even result lanes from V2 and odd lanes from V1, so
+ // hand the operands to SHUFP swapped.
+ if ((Mask[0] == -1 || Mask[0] >= 4) && Mask[1] < 4 &&
+ (Mask[2] == -1 || Mask[2] >= 4) && Mask[3] < 4) {
+ unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
+ ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
+ DAG.getConstant(SHUFPDMask, MVT::i8));
+ }
+
+ // Shuffle the input elements into the desired positions in V1 and V2 and
+ // blend them together. Each per-input mask keeps only the lanes that input
+ // supplies (rebased to [0, 4) for V2) and leaves the rest undef.
+ int V1Mask[] = {-1, -1, -1, -1};
+ int V2Mask[] = {-1, -1, -1, -1};
+ for (int i = 0; i < 4; ++i)
+ if (Mask[i] >= 0 && Mask[i] < 4)
+ V1Mask[i] = Mask[i];
+ else if (Mask[i] >= 4)
+ V2Mask[i] = Mask[i] - 4;
+
+ V1 = DAG.getVectorShuffle(MVT::v4f64, DL, V1, DAG.getUNDEF(MVT::v4f64), V1Mask);
+ V2 = DAG.getVectorShuffle(MVT::v4f64, DL, V2, DAG.getUNDEF(MVT::v4f64), V2Mask);
+
+ // Bit i of the BLENDI immediate picks result lane i from V2.
+ unsigned BlendMask = 0;
+ for (int i = 0; i < 4; ++i)
+ if (Mask[i] >= 4)
+ BlendMask |= 1 << i;
+
+ return DAG.getNode(X86ISD::BLENDI, DL, MVT::v4f64, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8));
+}
+
+/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
+///
+/// Largely delegates to common code when we have AVX2 and to the floating-point
+/// code when we only have AVX.
+static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v4i64 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ // FIXME: If we have AVX2, we should delegate to generic code as crossing
+ // shuffles aren't a problem and FP and int have the same patterns.
+
+ if (isHalfCrossingShuffleMask(Mask))
+ return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+ // AVX1 doesn't provide any facilities for v4i64 shuffles, bitcast and
+ // delegate to floating point code. Note that Op is passed through with its
+ // v4i64 type intact (the FP routine only asserts on its operand types), and
+ // the v4f64 result is bitcast back to the integer type callers expect.
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V2);
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v4i64,
+ lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG));
+}
+
/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
MVT VT, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- // FIXME: We should detect symmetric patterns and re-use the 128-bit shuffle
- // lowering logic with wider types in that case.
-
- // FIXME: We should detect when we can use AVX2 cross-half shuffles to either
- // implement the shuffle completely, more effectively build symmetry, or
- // minimize half-blends.
+ // Dispatch on the concrete 256-bit vector type; only the 4 x 64-bit
+ // element types have targeted lowering routines so far.
+ switch (VT.SimpleTy) {
+ case MVT::v4f64:
+ return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v4i64:
+ return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v8i32:
+ case MVT::v8f32:
+ case MVT::v16i16:
+ case MVT::v32i8:
+ // Fall back to the basic pattern of extracting the high half and forming
+ // a 4-way blend.
+ // FIXME: Add targeted lowering for each type that can document rationale
+ // for delegating to this when necessary.
+ return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
- // Fall back to the basic pattern of extracting the high half and forming
- // a 4-way blend.
- return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+ default:
+ llvm_unreachable("Not a valid 256-bit x86 vector type!");
+ }
}
/// \brief Tiny helper function to test whether a shuffle mask could be
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x double> %shuffle
}
+define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) {
+; Single-input, non-lane-crossing mask: lowers to one in-lane vpermilpd.
+; AVX1-LABEL: @shuffle_v4f64_0023
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[0,0,2,3]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+ ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
+; Duplicates the even element of each 128-bit half; still one vpermilpd.
+; AVX1-LABEL: @shuffle_v4f64_0022
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
+; Swaps the element pair within each 128-bit half via a single vpermilpd.
+; AVX1-LABEL: @shuffle_v4f64_1032
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) {
+; Duplicates the odd element of each 128-bit half; single vpermilpd.
+; AVX1-LABEL: @shuffle_v4f64_1133
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[1,1,3,3]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) {
+; Swaps only the low-half pair; the high half is an identity. One vpermilpd.
+; AVX1-LABEL: @shuffle_v4f64_1023
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[1,0,2,3]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
+; Swaps the low-half pair and duplicates element 2 in the high half; one
+; vpermilpd.
+; AVX1-LABEL: @shuffle_v4f64_1022
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[1,0,2,2]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
+ ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
+; Two-input case: %b is permuted in-lane, then vblendpd inserts its element
+; into result lane 1. Don't-care lanes in the permute are regex-matched.
+; AVX1-LABEL: @shuffle_v4f64_0423
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*}} # ymm1 = ymm1[{{[0-9]}},0,{{[0-9],[0-9]}}]
+; AVX1-NEXT: vblendpd {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+ ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
+; Two-input case needing in-lane permutes of both inputs, then a vblendpd
+; taking result lanes 1-2 from the permuted %b.
+; AVX1-LABEL: @shuffle_v4f64_0462
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*}} # ymm1 = ymm1[{{[0-9]}},0,2,{{[0-9]}}]
+; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[0,{{[0-9],[0-9]}},2]
+; AVX1-NEXT: vblendpd {{.*}} # ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
+ ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) {
+; Even result lanes from %a, odd from %b, all even source elements: matches
+; vshufpd exactly, so a single instruction suffices.
+; AVX1-LABEL: @shuffle_v4f64_0426
+; AVX1: # BB#0:
+; AVX1-NEXT: vshufpd {{.*}} # ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) {
+; Even result lanes from %b and odd lanes from %a: matches the
+; operand-commuted vshufpd form (%b as the first operand).
+; AVX1-LABEL: @shuffle_v4f64_5163
+; AVX1: # BB#0:
+; AVX1-NEXT: vshufpd {{.*}} # ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
+ ret <4 x double> %shuffle
+}
define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: @shuffle_v4i64_0124