From: Chandler Carruth <chandlerc@gmail.com>
Date: Fri, 27 Feb 2015 11:45:13 +0000 (+0000)
Subject: [x86] Teach a bunch of the x86-specific shuffle combining to work with
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=2d58cc5f1bfa05f29f6b696fe4bd834c78504e83;p=oota-llvm.git

[x86] Teach a bunch of the x86-specific shuffle combining to work with
256-bit vectors as well as 128-bit vectors. Fixes some of the redundant
shuffles for v16i16.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@230752 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index bafb661ff05..52e2c4cbf18 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19435,12 +19435,26 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
 /// PSHUF-style masks that can be reused with such instructions.
 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+  MVT VT = N.getSimpleValueType();
   SmallVector<int, 4> Mask;
   bool IsUnary;
-  bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+  bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
   (void)HaveMask;
   assert(HaveMask);
 
+  // If we have more than 128-bits, only the low 128-bits of shuffle mask
+  // matter. Check that the upper masks are repeats and remove them.
+  if (VT.getSizeInBits() > 128) {
+    int LaneElts = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
+      for (int j = 0; j < LaneElts; ++j)
+        assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts &&
+               "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+    Mask.resize(LaneElts);
+  }
+
   switch (N.getOpcode()) {
   case X86ISD::PSHUFD:
     return Mask;
@@ -19513,7 +19527,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
     case X86ISD::UNPCKH:
       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
       // shuffle into a preceding word shuffle.
-      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+      if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
+          V.getSimpleValueType().getScalarType() != MVT::i16)
         return SDValue();
 
       // Search for a half-shuffle which we can combine with.
@@ -19687,8 +19702,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
     break;
   case X86ISD::PSHUFLW:
   case X86ISD::PSHUFHW:
-    assert(VT == MVT::v8i16);
-    (void)VT;
+    assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
 
     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
       return SDValue(); // We combined away this shuffle, so we're done.
@@ -19701,12 +19715,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
       DMask[DOffset + 0] = DOffset + 1;
       DMask[DOffset + 1] = DOffset + 0;
-      V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+      V = DAG.getNode(ISD::BITCAST, DL, DVT, V);
       DCI.AddToWorklist(V.getNode());
-      V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                       getV4X86ShuffleImm8ForMask(DMask, DAG));
       DCI.AddToWorklist(V.getNode());
-      return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+      return DAG.getNode(ISD::BITCAST, DL, VT, V);
     }
 
     // Look for shuffle patterns which can be implemented as a single unpack.
@@ -19741,11 +19756,11 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
             std::equal(std::begin(MappedMask), std::end(MappedMask),
                        std::begin(UnpackHiMask))) {
           // We can replace all three shuffles with an unpack.
-          V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+          V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0));
           DCI.AddToWorklist(V.getNode());
           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                 : X86ISD::UNPCKH,
-                             DL, MVT::v8i16, V, V);
+                             DL, VT, V, V);
         }
       }
     }
@@ -19893,10 +19908,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // Only handle 128 wide vector from here on.
-  if (!VT.is128BitVector())
-    return SDValue();
-
   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
   // consecutive, non-overlapping, and in the right order.
@@ -19914,6 +19925,10 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   if (Shuffle.getNode())
     return Shuffle;
 
+  // Only handle 128 wide vector from here on.
+  if (!VT.is128BitVector())
+    return SDValue();
+
   // Try recursively combining arbitrary sequences of x86 shuffle
   // instructions into higher-order shuffles. We do this after combining
   // specific PSHUF instruction sequences into their minimal form so that we
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index f7f8044b06f..2fcef44a5cb 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -2380,8 +2380,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,0,7,5,6,4]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,3,4,5,6,7,8,11,10,11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,1,4,5,6,7,10,11,8,9,12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <16 x i16> %shuffle
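
The crux of the patch is the mask canonicalization added to getPSHUFShuffleMask(): VPSHUFD, VPSHUFLW, and VPSHUFHW on 256-bit vectors apply the same imm8 control to each 128-bit lane, so any mask these nodes can legally carry repeats lane by lane and only the low lane's entries are informative. What follows is a minimal standalone sketch of that collapse, not part of the patch: collapseTo128BitLane is a hypothetical name, and the example mask is lifted from the vpshuflw line in the updated v16i16 test.

#include <cassert>
#include <cstdio>
#include <vector>

// Hypothetical stand-alone rendering of the collapse the patch adds to
// getPSHUFShuffleMask(). Given a shuffle mask for a vector whose elements
// are ScalarBits wide, keep only the low 128-bit lane's worth of entries.
// As in the patch, the high lanes are asserted (debug builds only) to
// repeat the low lane shifted by the lane width.
static std::vector<int> collapseTo128BitLane(const std::vector<int> &Mask,
                                             int ScalarBits) {
  int VectorBits = ScalarBits * static_cast<int>(Mask.size());
  if (VectorBits <= 128)
    return Mask; // Single lane already; nothing to strip.

  int LaneElts = 128 / ScalarBits;
  int NumLanes = VectorBits / 128;
  for (int i = 1; i < NumLanes; ++i)
    for (int j = 0; j < LaneElts; ++j)
      assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts &&
             "Mask doesn't repeat in high 128-bit lanes!");

  return std::vector<int>(Mask.begin(), Mask.begin() + LaneElts);
}

int main() {
  // The vpshuflw mask from the updated v16i16 test: the high lane
  // (elements 8..15) repeats the low lane offset by 8.
  std::vector<int> Mask = {0, 3,  2,  3,  4,  5,  6,  7,
                           8, 11, 10, 11, 12, 13, 14, 15};
  for (int M : collapseTo128BitLane(Mask, /*ScalarBits=*/16))
    std::printf("%d ", M); // Prints "0 3 2 3 4 5 6 7".
  std::printf("\n");
  return 0;
}

With masks collapsed this way, the pre-existing 128-bit combining logic (combineRedundantDWordShuffle and friends) runs unchanged on 256-bit shuffle nodes, which is why the AVX2 test above now folds a vpshufd/vpshuflw pair into a single vpshufd.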