From 982f60be4432d66d819b6018ebbdb19379f29e72 Mon Sep 17 00:00:00 2001
From: Bill Schmidt <wschmidt@linux.vnet.ibm.com>
Date: Wed, 6 May 2015 15:40:46 +0000
Subject: [PATCH] [PPC64LE] Adjust vector splats during VSX swap optimization

The initial code drop for VSX swap optimization permitted the
optimization only when all operations in a web of related computation
are lane-insensitive.  For some lane-sensitive operations, we can
still permit the optimization provided that we make adjustments to
those operations.  This patch adds special handling for vector splats
so that their presence doesn't kill the optimization.

Vector splats are lane-sensitive since they identify by number a
vector element to be used as the source of a splat.  When swap
optimizations take place, the desired vector element will move to the
opposite doubleword of the quadword vector.  We thus replace the index
I by (I + N/2) % N, where N is the number of elements in the vector.

A new test case is added to test that swap optimization succeeds when
vector splats are present, and that the proper input element is used
as the source of the splat.

An ancillary change removes SH_BUILDVEC as one of the kinds of special
handling that may be required by VSX swap optimization.  From
experience with GCC, I had expected to need some modifications for
vector build operations, but I did not find that to be the case.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236606 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCVSXSwapRemoval.cpp | 47 ++++++++++--
 test/CodeGen/PowerPC/swaps-le-2.ll       | 91 ++++++++++++++++++++++++
 2 files changed, 131 insertions(+), 7 deletions(-)
 create mode 100644 test/CodeGen/PowerPC/swaps-le-2.ll

diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index a1dc7f7a5c3..6aa25ff6f8e 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -88,7 +88,6 @@ struct PPCVSXSwapEntry {
 
 enum SHValues {
   SH_NONE = 0,
-  SH_BUILDVEC,
   SH_EXTRACT,
   SH_INSERT,
   SH_NOSWAP_LD,
@@ -329,7 +328,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
         // Splats are lane-sensitive, but we can use special handling
         // to adjust the source lane for the splat.  This is not yet
         // implemented.  When it is, we need to uncomment the following:
-        //        SwapVector[VecIdx].IsSwappable = 1;
+        SwapVector[VecIdx].IsSwappable = 1;
         SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT;
         break;
       // The presence of the following lane-sensitive operations in a
@@ -662,8 +661,12 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
       }
 
     } else if (SwapVector[EntryIdx].IsSwappable &&
-               SwapVector[EntryIdx].SpecialHandling != 0)
-      handleSpecialSwappables(EntryIdx);
+               SwapVector[EntryIdx].SpecialHandling != 0) {
+      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+      if (!SwapVector[Repr].WebRejected)
+        handleSpecialSwappables(EntryIdx);
+    }
   }
 }
 
@@ -672,6 +675,39 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
 // here.
 // FIXME: This code is to be phased in with subsequent patches.
 void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
+  switch (SwapVector[EntryIdx].SpecialHandling) {
+
+  default:
+    assert(false && "Unexpected special handling type");
+    break;
+
+  // A splat names its source element by index in operand 1.  Rotate that
+  // index by N/2 modulo N, where N is the element count for the opcode.
+  case SHValues::SH_SPLAT: {
+    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+    unsigned NElts;
+
+    DEBUG(dbgs() << "Changing splat: ");
+    DEBUG(MI->dump());
+
+    switch (MI->getOpcode()) {
+    default:
+      assert(false && "Unexpected splat opcode"); // NDEBUG: falls through
+    case PPC::VSPLTB: NElts = 16; break;
+    case PPC::VSPLTH: NElts = 8;  break;
+    case PPC::VSPLTW: NElts = 4;  break;
+    }
+
+    unsigned EltNo = MI->getOperand(1).getImm();
+    EltNo = (EltNo + NElts / 2) % NElts;
+    MI->getOperand(1).setImm(EltNo);
+
+    DEBUG(dbgs() << "  Into: ");
+    DEBUG(MI->dump());
+    break;
+  }
+
+  }
 }
 
 // Walk the swap vector and replace each entry marked for removal with
@@ -734,9 +770,6 @@ void PPCVSXSwapRemoval::dumpSwapVector() {
         break;
       case SH_NONE:
         break;
-      case SH_BUILDVEC:
-        DEBUG(dbgs() << "special:buildvec ");
-        break;
       case SH_EXTRACT:
         DEBUG(dbgs() << "special:extract ");
         break;
diff --git a/test/CodeGen/PowerPC/swaps-le-2.ll b/test/CodeGen/PowerPC/swaps-le-2.ll
new file mode 100644
index 00000000000..08096ed20dd
--- /dev/null
+++ b/test/CodeGen/PowerPC/swaps-le-2.ll
@@ -0,0 +1,91 @@
+; RUN: llc -O3 -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck --implicit-check-not=xxpermdi --implicit-check-not=xxswapd %s
+
+; Test swap removal when a vector splat must be adjusted to make it legal.
+;
+; Test generated from following C code:
+;
+; vector char vc = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+; vector char vcr;
+; vector short vs = {0, 1, 2, 3, 4, 5, 6, 7};
+; vector short vsr;
+; vector int vi = {0, 1, 2, 3};
+; vector int vir;
+;
+; void cfoo ()
+; {
+;   vcr = (vector char){vc[5], vc[5], vc[5], vc[5], vc[5], vc[5], vc[5], vc[5],
+;                       vc[5], vc[5], vc[5], vc[5], vc[5], vc[5], vc[5], vc[5]};
+; }
+;
+; void sfoo ()
+; {
+;   vsr = (vector short){vs[6], vs[6], vs[6], vs[6],
+;                        vs[6], vs[6], vs[6], vs[6]};
+; }
+;
+; void ifoo ()
+; {
+;   vir = (vector int){vi[1], vi[1], vi[1], vi[1]};
+; }
+
+@vc = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@vs = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@vi = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@vcr = common global <16 x i8> zeroinitializer, align 16
+@vsr = common global <8 x i16> zeroinitializer, align 16
+@vir = common global <4 x i32> zeroinitializer, align 16
+
+; Splat byte element 5 of @vc into all 16 lanes and store the result in @vcr.
+define void @cfoo() {
+entry:
+  %0 = load <16 x i8>, <16 x i8>* @vc, align 16
+  %vecinit30 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  store <16 x i8> %vecinit30, <16 x i8>* @vcr, align 16
+  ret void
+}
+
+; Splat halfword element 6 of @vs into all 8 lanes and store the result in @vsr.
+define void @sfoo() {
+entry:
+  %0 = load <8 x i16>, <8 x i16>* @vs, align 16
+  %vecinit14 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+  store <8 x i16> %vecinit14, <8 x i16>* @vsr, align 16
+  ret void
+}
+
+; Splat word element 1 of @vi into all 4 lanes and store the result in @vir.
+define void @ifoo() {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* @vi, align 16
+  %vecinit6 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %vecinit6, <4 x i32>* @vir, align 16
+  ret void
+}
+
+; Justification:
+;  Byte splat of element 5 (BE) becomes element 15-5 = 10 (LE)
+;  which becomes (10+8)%16 = 2 (LE swapped).
+;
+;  Halfword splat of element 6 (BE) becomes element 7-6 = 1 (LE)
+;  which becomes (1+4)%8 = 5 (LE swapped).
+;
+;  Word splat of element 1 (BE) becomes element 3-1 = 2 (LE)
+;  which becomes (2+2)%4 = 0 (LE swapped).
+
+; CHECK-NOT: xxpermdi
+; CHECK-NOT: xxswapd
+
+; CHECK-LABEL: @cfoo
+; CHECK: lxvd2x
+; CHECK: vspltb {{[0-9]+}}, {{[0-9]+}}, 2
+; CHECK: stxvd2x
+
+; CHECK-LABEL: @sfoo
+; CHECK: lxvd2x
+; CHECK: vsplth {{[0-9]+}}, {{[0-9]+}}, 5
+; CHECK: stxvd2x
+
+; CHECK-LABEL: @ifoo
+; CHECK: lxvd2x
+; CHECK: vspltw {{[0-9]+}}, {{[0-9]+}}, 0
+; CHECK: stxvd2x
-- 
2.34.1