[x86] Teach a bunch of the x86-specific shuffle combining to work with

author Chandler Carruth <chandlerc@gmail.com>

Fri, 27 Feb 2015 11:45:13 +0000 (11:45 +0000)

committer Chandler Carruth <chandlerc@gmail.com>

Fri, 27 Feb 2015 11:45:13 +0000 (11:45 +0000)
author Chandler Carruth <chandlerc@gmail.com>
Fri, 27 Feb 2015 11:45:13 +0000 (11:45 +0000)
committer Chandler Carruth <chandlerc@gmail.com>
Fri, 27 Feb 2015 11:45:13 +0000 (11:45 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index bafb661ff0528159815cc141ad4bc9a37be3acc6..52e2c4cbf18d55760b1d66b18a52f3865e0e3b51 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19435,12 +19435,26 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
  /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
  /// PSHUF-style masks that can be reused with such instructions.
  static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+  MVT VT = N.getSimpleValueType();
    SmallVector<int, 4> Mask;
    bool IsUnary;
-  bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+  bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
    (void)HaveMask;
    assert(HaveMask);
  
+  // If we have more than 128-bits, only the low 128-bits of shuffle mask
+  // matter. Check that the upper masks are repeats and remove them.
+  if (VT.getSizeInBits() > 128) {
+    int LaneElts = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
+      for (int j = 0; j < LaneElts; ++j)
+        assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts &&
+               "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+    Mask.resize(LaneElts);
+  }
+
    switch (N.getOpcode()) {
    case X86ISD::PSHUFD:
      return Mask;
@@ -19513,7 +19527,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
      case X86ISD::UNPCKH:
        // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
        // shuffle into a preceding word shuffle.
-      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+      if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
+          V.getSimpleValueType().getScalarType() != MVT::i16)
          return SDValue();
  
        // Search for a half-shuffle which we can combine with.
@@ -19687,8 +19702,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
      break;
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
-    assert(VT == MVT::v8i16);
-    (void)VT;
+    assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
  
      if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
        return SDValue(); // We combined away this shuffle, so we're done.
@@ -19701,12 +19715,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
        int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
        DMask[DOffset + 0] = DOffset + 1;
        DMask[DOffset + 1] = DOffset + 0;
-      V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+      V = DAG.getNode(ISD::BITCAST, DL, DVT, V);
        DCI.AddToWorklist(V.getNode());
-      V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                        getV4X86ShuffleImm8ForMask(DMask, DAG));
        DCI.AddToWorklist(V.getNode());
-      return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+      return DAG.getNode(ISD::BITCAST, DL, VT, V);
      }
  
      // Look for shuffle patterns which can be implemented as a single unpack.
@@ -19741,11 +19756,11 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
              std::equal(std::begin(MappedMask), std::end(MappedMask),
                         std::begin(UnpackHiMask))) {
            // We can replace all three shuffles with an unpack.
-          V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+          V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0));
            DCI.AddToWorklist(V.getNode());
            return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                  : X86ISD::UNPCKH,
-                             DL, MVT::v8i16, V, V);
+                             DL, VT, V, V);
          }
        }
      }
@@ -19893,10 +19908,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
      }
    }
  
-  // Only handle 128 wide vector from here on.
-  if (!VT.is128BitVector())
-    return SDValue();
-
    // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
    // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
    // consecutive, non-overlapping, and in the right order.
@@ -19914,6 +19925,10 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
      if (Shuffle.getNode())
        return Shuffle;
  
+    // Only handle 128 wide vector from here on.
+    if (!VT.is128BitVector())
+      return SDValue();
+
      // Try recursively combining arbitrary sequences of x86 shuffle
      // instructions into higher-order shuffles. We do this after combining
      // specific PSHUF instruction sequences into their minimal form so that we
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll

index f7f8044b06f300974adf8fbfb84d0b40273d05ba..2fcef44a5cbf8030578b1e733e0a18df065b7e9b 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -2380,8 +2380,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
  ; AVX2:       # BB#0:
  ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,0,7,5,6,4]
  ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,3,4,5,6,7,8,11,10,11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,1,4,5,6,7,10,11,8,9,12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
  ; AVX2-NEXT:    retq
    %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
    ret <16 x i16> %shuffle
author	Chandler Carruth <chandlerc@gmail.com>
	Fri, 27 Feb 2015 11:45:13 +0000 (11:45 +0000)
committer	Chandler Carruth <chandlerc@gmail.com>
	Fri, 27 Feb 2015 11:45:13 +0000 (11:45 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-256-v16.ll		patch \| blob \| history