From: Chandler Carruth
Date: Fri, 15 Aug 2014 11:01:40 +0000 (+0000)
Subject: [x86] Add the initial skeleton of type-based dispatch for AVX vectors in
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=12e69a0267138476775f34ba115a669a4dd37e9e;p=oota-llvm.git

[x86] Add the initial skeleton of type-based dispatch for AVX vectors in
the new shuffle lowering, and an implementation for v4 shuffles.

This allows us to handle non-half-crossing shuffles directly for v4
shuffles, both integer and floating point. It currently misses places
where we could perform the blend via UNPCK instructions, but otherwise
generates equally good or better code than the existing vector shuffle
lowering for the included test cases. There are a few cases that are
entertainingly better. ;]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@215702 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4e3e23c0988..327cc296fe6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -8334,6 +8334,17 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   }
 }
 
+static bool isHalfCrossingShuffleMask(ArrayRef<int> Mask) {
+  int Size = Mask.size();
+  for (int M : Mask.slice(0, Size / 2))
+    if (M >= 0 && (M % Size) >= Size / 2)
+      return true;
+  for (int M : Mask.slice(Size / 2, Size / 2))
+    if (M >= 0 && (M % Size) < Size / 2)
+      return true;
+  return false;
+}
+
 /// \brief Generic routine to split a 256-bit vector shuffle into 128-bit
 /// shuffles.
 ///
@@ -8399,6 +8410,103 @@ static SDValue splitAndLower256BitVectorShuffle(SDValue Op, SDValue V1,
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
 }
 
+/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  // FIXME: If we have AVX2, we should delegate to generic code as crossing
+  // shuffles aren't a problem and FP and int have the same patterns.
+
+  // FIXME: We can handle these more cleverly than splitting for v4f64.
+  if (isHalfCrossingShuffleMask(Mask))
+    return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+  if (isSingleInputShuffleMask(Mask)) {
+    // Non-half-crossing single input shuffles can be lowered with an
+    // interleaved permutation.
+    unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+                            ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
+    return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f64, V1,
+                       DAG.getConstant(VPERMILPMask, MVT::i8));
+  }
+
+  // Check if the blend happens to exactly fit the pattern of SHUFPD.
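+  // (VSHUFPD's immediate has one bit per result element: even result
+  // elements come from the first source and odd ones from the second, and a
+  // set bit selects the high rather than the low double within that
+  // source's 128-bit lane.)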
+  if (Mask[0] < 4 && (Mask[1] == -1 || Mask[1] >= 4) &&
+      Mask[2] < 4 && (Mask[3] == -1 || Mask[3] >= 4)) {
+    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
+                          ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
+    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
+                       DAG.getConstant(SHUFPDMask, MVT::i8));
+  }
+  if ((Mask[0] == -1 || Mask[0] >= 4) && Mask[1] < 4 &&
+      (Mask[2] == -1 || Mask[2] >= 4) && Mask[3] < 4) {
+    unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
+                          ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
+    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
+                       DAG.getConstant(SHUFPDMask, MVT::i8));
+  }
+
+  // Shuffle the input elements into the desired positions in V1 and V2 and
+  // blend them together.
+  int V1Mask[] = {-1, -1, -1, -1};
+  int V2Mask[] = {-1, -1, -1, -1};
+  for (int i = 0; i < 4; ++i)
+    if (Mask[i] >= 0 && Mask[i] < 4)
+      V1Mask[i] = Mask[i];
+    else if (Mask[i] >= 4)
+      V2Mask[i] = Mask[i] - 4;
+
+  V1 = DAG.getVectorShuffle(MVT::v4f64, DL, V1, DAG.getUNDEF(MVT::v4f64),
+                            V1Mask);
+  V2 = DAG.getVectorShuffle(MVT::v4f64, DL, V2, DAG.getUNDEF(MVT::v4f64),
+                            V2Mask);
+
+  unsigned BlendMask = 0;
+  for (int i = 0; i < 4; ++i)
+    if (Mask[i] >= 4)
+      BlendMask |= 1 << i;
+
+  return DAG.getNode(X86ISD::BLENDI, DL, MVT::v4f64, V1, V2,
+                     DAG.getConstant(BlendMask, MVT::i8));
+}
+
+/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
+///
+/// Largely delegates to common code when we have AVX2 and to the
+/// floating-point code when we only have AVX.
+static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v4i64 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  // FIXME: If we have AVX2, we should delegate to generic code as crossing
+  // shuffles aren't a problem and FP and int have the same patterns.
+
+  if (isHalfCrossingShuffleMask(Mask))
+    return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+  // AVX1 doesn't provide any facilities for v4i64 shuffles, so bitcast and
+  // delegate to the floating-point code.
+  V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V1);
+  V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V2);
+  return DAG.getNode(ISD::BITCAST, DL, MVT::v4i64,
+                     lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG));
+}
+
 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
 ///
 /// This routine either breaks down the specific type of a 256-bit x86 vector
@@ -8407,16 +8515,24 @@ static SDValue splitAndLower256BitVectorShuffle(SDValue Op, SDValue V1,
 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                         MVT VT, const X86Subtarget *Subtarget,
                                         SelectionDAG &DAG) {
-  // FIXME: We should detect symmetric patterns and re-use the 128-bit shuffle
-  // lowering logic with wider types in that case.
-
-  // FIXME: We should detect when we can use AVX2 cross-half shuffles to either
-  // implement the shuffle completely, more effectively build symmetry, or
-  // minimize half-blends.
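+  // Dispatch on the specific 256-bit vector type to a dedicated lowering
+  // routine where one exists; the remaining types fall back to splitting
+  // the shuffle into two 128-bit halves for now.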
+  switch (VT.SimpleTy) {
+  case MVT::v4f64:
+    return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v4i64:
+    return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v8i32:
+  case MVT::v8f32:
+  case MVT::v16i16:
+  case MVT::v32i8:
+    // Fall back to the basic pattern of extracting the high half and forming
+    // a 4-way blend.
+    // FIXME: Add targeted lowering for each type that can document rationale
+    // for delegating to this when necessary.
+    return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
 
-  // Fall back to the basic pattern of extracting the high half and forming
-  // a 4-way blend.
-  return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+  default:
+    llvm_unreachable("Not a valid 256-bit x86 vector type!");
+  }
 }
 
 /// \brief Tiny helper function to test whether a shuffle mask could be
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index ac441e9b77f..b7047724e40 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -169,6 +169,89 @@ define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) {
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x double> %shuffle
 }
+define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: @shuffle_v4f64_0023
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*}} # ymm0 = ymm0[0,0,2,3]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+  ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: @shuffle_v4f64_0022
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*}} # ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: @shuffle_v4f64_1032
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*}} # ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: @shuffle_v4f64_1133
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*}} # ymm0 = ymm0[1,1,3,3]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: @shuffle_v4f64_1023
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*}} # ymm0 = ymm0[1,0,2,3]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+  ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: @shuffle_v4f64_1022
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*}} # ymm0 = ymm0[1,0,2,2]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: @shuffle_v4f64_0423
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*}} # ymm1 = ymm1[{{[0-9]}},0,{{[0-9],[0-9]}}]
+; AVX1-NEXT:    vblendpd {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+  ret <4 x double> %shuffle
+}
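+; Two-input masks that stay within the 128-bit halves are lowered by shuffling
+; each input into place with vpermilpd and merging them with a single
+; vblendpd, unless they happen to fit vshufpd's pattern exactly, as in
+; shuffle_v4f64_0426 and shuffle_v4f64_5163 below.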
+define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: @shuffle_v4f64_0462
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*}} # ymm1 = ymm1[{{[0-9]}},0,2,{{[0-9]}}]
+; AVX1-NEXT:    vpermilpd {{.*}} # ymm0 = ymm0[0,{{[0-9],[0-9]}},2]
+; AVX1-NEXT:    vblendpd {{.*}} # ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
+  ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: @shuffle_v4f64_0426
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufpd {{.*}} # ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x double> %shuffle
+}
+define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: @shuffle_v4f64_5163
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufpd {{.*}} # ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
+  ret <4 x double> %shuffle
+}
 define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0124