From 974542d7d8d38c9b759f6945ce719bdbc0d78e91 Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Sun, 21 Sep 2014 13:35:14 +0000
Subject: [PATCH] [x86] Teach the new vector shuffle lowering to re-use the
 SHUFPS lowering when it can use a symmetric SHUFPS across both 128-bit
 lanes.

This required making the SHUFPS lowering tolerant of other vector types,
and adjusting our canonicalization to canonicalize harder.

This is the last of the clever uses of symmetry I've thought of for
v8f32. The rest of the tricks I'm aware of here are to work around
asymmetry in the mask.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218216 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        | 27 +++++++++++++++++++++++----
 test/CodeGen/X86/vector-shuffle-256-v8.ll | 16 +++++-----------
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 14613d3c6e1..2a020b51001 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7800,7 +7800,7 @@ static SDValue lowerVectorShuffleWithSHUPFS(SDLoc DL, MVT VT,
       // To make this work, blend them together as the first step.
       int V1Index = V2AdjIndex;
       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
-      V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
+      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
 
       // Now proceed to reconstruct the final blend as we have the necessary
@@ -7831,7 +7831,7 @@ static SDValue lowerVectorShuffleWithSHUPFS(SDLoc DL, MVT VT,
                           Mask[2] < 4 ? Mask[2] : Mask[3],
                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
-      V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
+      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
 
       // Now we do a normal shuffle of V1 by giving V1 as both operands to
@@ -7843,7 +7843,7 @@ static SDValue lowerVectorShuffleWithSHUPFS(SDLoc DL, MVT VT,
       NewMask[3] = Mask[2] < 4 ? 3 : 1;
     }
   }
-  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
+  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
 }
 
@@ -9385,6 +9385,14 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
     if (isShuffleEquivalent(LoMask, 2, 10, 3, 11))
       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
+
+    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
+    // have already handled any direct blends.
+    int SHUFPSMask[] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+    for (int &M : SHUFPSMask)
+      if (M >= 8)
+        M -= 4;
+    return lowerVectorShuffleWithSHUPFS(DL, MVT::v8f32, SHUFPSMask, V1, V2, DAG);
   }
 
   if (isSingleInputShuffleMask(Mask))
@@ -9531,7 +9539,9 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
     return DAG.getCommutedVectorShuffle(*SVOp);
 
   // When the number of V1 and V2 elements are the same, try to minimize the
-  // number of uses of V2 in the low half of the vector.
+  // number of uses of V2 in the low half of the vector. When that is tied,
+  // ensure that the sum of indices for V1 is equal to or lower than the sum
+  // of indices for V2.
   if (NumV1Elements == NumV2Elements) {
     int LowV1Elements = 0, LowV2Elements = 0;
     for (int M : SVOp->getMask().slice(0, NumElements / 2))
@@ -9541,6 +9551,15 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
         ++LowV1Elements;
     if (LowV2Elements > LowV1Elements)
       return DAG.getCommutedVectorShuffle(*SVOp);
+
+    int SumV1Indices = 0, SumV2Indices = 0;
+    for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+      if (SVOp->getMask()[i] >= NumElements)
+        SumV2Indices += i;
+      else if (SVOp->getMask()[i] >= 0)
+        SumV1Indices += i;
+    if (SumV2Indices < SumV1Indices)
+      return DAG.getCommutedVectorShuffle(*SVOp);
   }
 
   // For each vector width, delegate to a specialized lowering routine.
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 2d4f15769bb..311fe7e4734 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -153,9 +153,8 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_08084c4c
 ; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[0,0,2,0,4,4,6,4]
-; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,0,3,4,5,4,7]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
   ret <8 x float> %shuffle
@@ -164,8 +163,7 @@ define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_8823cc67
 ; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[0,0,2,3,4,4,6,7]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
   ret <8 x float> %shuffle
@@ -174,9 +172,7 @@ define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_9832dc76
 ; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[1,0,2,3,5,4,6,7]
-; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,3,2,4,5,7,6]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
   ret <8 x float> %shuffle
@@ -185,9 +181,7 @@ define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_9810dc54
 ; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[1,0,2,3,5,4,6,7]
-; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,1,0,4,5,5,4]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
   ret <8 x float> %shuffle
-- 
2.34.1
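
A note for readers working through the lowering change: the heart of the new v8f32 fallback is the mask fold right before the call to lowerVectorShuffleWithSHUPFS. When a mask repeats the same 4-element pattern in both 128-bit lanes, only the first lane's pattern is kept, and indices that select from V2 (8 through 15) are rebased by subtracting 4 so they land in the 4..7 range that a 4-element two-input shuffle mask uses for its second operand. Below is a minimal standalone sketch of that fold in plain C++; it is not LLVM code, foldToShufpsMask is a name invented for this illustration, and undef (-1) mask elements are ignored for brevity.

// fold_sketch.cpp: standalone illustration, not LLVM code.
// foldToShufpsMask is a hypothetical helper name for this sketch.
#include <array>
#include <cassert>
#include <cstdio>

// Fold a lane-symmetric v8f32 shuffle mask down to the 4-element form that
// a SHUFPS-style lowering consumes. Elements 8..15 select from V2; rebasing
// them by -4 puts them in the 4..7 range that a 4-element two-input mask
// uses for its second operand.
std::array<int, 4> foldToShufpsMask(const std::array<int, 8> &Mask) {
  // Lane symmetry: element i+4 must pick the same slot as element i, one
  // 128-bit lane higher. (Undef elements are not handled in this sketch.)
  for (int i = 0; i < 4; ++i)
    assert(Mask[i + 4] == Mask[i] + 4 && "mask is not lane-symmetric");

  std::array<int, 4> SHUFPSMask = {Mask[0], Mask[1], Mask[2], Mask[3]};
  for (int &M : SHUFPSMask)
    if (M >= 8) // V2 element: rebase into the 4..7 range.
      M -= 4;
  return SHUFPSMask;
}

int main() {
  // The mask from the shuffle_v8f32_9832dc76 test: <9,8,3,2,13,12,7,6>.
  std::array<int, 8> Mask = {9, 8, 3, 2, 13, 12, 7, 6};
  for (int M : foldToShufpsMask(Mask))
    std::printf("%d ", M); // prints: 5 4 3 2
  std::printf("\n");
  return 0;
}

The folded mask {5, 4, 3, 2} is a single two-input 4-element pattern, which is why the updated test expects one vshufps (with the inputs commuted by the canonicalization sketched next) instead of the old vpermilps/vblendps sequence.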
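The canonicalization tweak is also easier to see in isolation. The sketch below mirrors the two commute checks in lowerVectorShuffle on plain arrays (again not LLVM code; shouldCommuteOperands is a hypothetical name): commute when the low half of the mask uses more V2 than V1 elements, and otherwise commute when V2's elements land at lower destination positions overall, leaving V1 with the equal-or-lower sum of indices that the new comment describes.

// commute_sketch.cpp: standalone illustration, not LLVM code.
// shouldCommuteOperands is a hypothetical helper name for this sketch.
#include <cstdio>
#include <vector>

// Decide whether to swap V1/V2 for a shuffle with the given mask, assuming
// both inputs contribute the same number of elements. Entries in
// [0, NumElements) pick from V1, entries in [NumElements, 2*NumElements)
// pick from V2, and -1 means undef.
bool shouldCommuteOperands(const std::vector<int> &Mask, int NumElements) {
  // First heuristic: minimize uses of V2 in the low half of the vector.
  int LowV1Elements = 0, LowV2Elements = 0;
  for (int i = 0; i < NumElements / 2; ++i) {
    if (Mask[i] >= NumElements)
      ++LowV2Elements;
    else if (Mask[i] >= 0)
      ++LowV1Elements;
  }
  if (LowV2Elements > LowV1Elements)
    return true;

  // Second heuristic: commute when V2's elements occupy lower destination
  // positions overall, so V1 keeps the smaller sum of indices.
  int SumV1Indices = 0, SumV2Indices = 0;
  for (int i = 0, Size = static_cast<int>(Mask.size()); i < Size; ++i) {
    if (Mask[i] >= NumElements)
      SumV2Indices += i;
    else if (Mask[i] >= 0)
      SumV1Indices += i;
  }
  return SumV2Indices < SumV1Indices;
}

int main() {
  // shuffle_v8f32_8823cc67: <8,8,2,3,12,12,6,7>. The low half is tied
  // (two V2 elements at positions 0,1 versus two V1 elements at 2,3), but
  // V2's destinations {0,1,4,5} sum to 10 against V1's {2,3,6,7} = 18.
  std::vector<int> Mask = {8, 8, 2, 3, 12, 12, 6, 7};
  std::printf("commute: %s\n",
              shouldCommuteOperands(Mask, 8) ? "yes" : "no"); // commute: yes
  return 0;
}

On the 8823cc67 mask this returns true, which is consistent with the updated CHECK line reading its first shuffle operand from ymm1.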