[PPC64LE] More vector swap optimization TLC

author Bill Schmidt <wschmidt@linux.vnet.ibm.com>

Tue, 21 Jul 2015 21:40:17 +0000 (21:40 +0000)

committer Bill Schmidt <wschmidt@linux.vnet.ibm.com>

Tue, 21 Jul 2015 21:40:17 +0000 (21:40 +0000)
author Bill Schmidt <wschmidt@linux.vnet.ibm.com>
Tue, 21 Jul 2015 21:40:17 +0000 (21:40 +0000)
committer Bill Schmidt <wschmidt@linux.vnet.ibm.com>
Tue, 21 Jul 2015 21:40:17 +0000 (21:40 +0000)
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp

index 3fb1dcc3d4aff9d40085db517622abf6f29c9db8..d9504710b3f2533a5dbc31ad30718a096fe3890d 100644 (file)
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -94,7 +94,7 @@ enum SHValues {
    SH_NOSWAP_ST,
    SH_SPLAT,
    SH_XXPERMDI,
-  SH_COPYSCALAR
+  SH_COPYWIDEN
  };
  
  struct PPCVSXSwapRemoval : public MachineFunctionPass {
@@ -149,6 +149,11 @@ private:
    // handling.  Return true iff any changes are made.
    bool removeSwaps();
  
+  // Insert a swap instruction from SrcReg to DstReg at the given
+  // InsertPoint.
+  void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint,
+                  unsigned DstReg, unsigned SrcReg);
+
    // Update instructions requiring special handling.
    void handleSpecialSwappables(int EntryIdx);
  
@@ -340,6 +345,15 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
          SwapVector[VecIdx].IsLoad = 1;
          SwapVector[VecIdx].IsSwap = 1;
          break;
+      case PPC::LXSDX:
+      case PPC::LXSSPX:
+        // A load of a floating-point value into the high-order half of
+        // a vector register is safe, provided that we introduce a swap
+        // following the load, which will be done by the SUBREG_TO_REG
+        // support.  So just mark these as safe.
+        SwapVector[VecIdx].IsLoad = 1;
+        SwapVector[VecIdx].IsSwappable = 1;
+        break;
        case PPC::STVX:
          // Non-permuting stores are currently unsafe.  We can use special
          // handling for this in the future.  By not marking these as
@@ -382,7 +396,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
          else if (isVecReg(MI.getOperand(0).getReg()) &&
                   isScalarVecReg(MI.getOperand(2).getReg())) {
            SwapVector[VecIdx].IsSwappable = 1;
-          SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYSCALAR;
+          SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN;
          }
          break;
        }
@@ -417,7 +431,14 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
        case PPC::STVEHX:
        case PPC::STVEWX:
        case PPC::STVXL:
+        // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX,
+        // by adding special handling for narrowing copies as well as
+        // widening ones.  However, I've experimented with this, and in
+        // practice we currently do not appear to use STXSDX fed by 
+        // a narrowing copy from a full vector register.  Since I can't
+        // generate any useful test cases, I've left this alone for now.
        case PPC::STXSDX:
+      case PPC::STXSSPX:
        case PPC::VCIPHER:
        case PPC::VCIPHERLAST:
        case PPC::VMRGHB:
@@ -540,7 +561,8 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
    }
  
    if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) {
-    SwapVector[VecIdx].MentionsPhysVR = 1;
+    if (!isScalarVecReg(CopySrcReg))
+      SwapVector[VecIdx].MentionsPhysVR = 1;
      return CopySrcReg;
    }
  
@@ -626,8 +648,8 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
        SwapVector[Repr].WebRejected = 1;
  
        DEBUG(dbgs() <<
-            format("Web %d rejected for physreg, partial reg, or not swap[pable]\n",
-                   Repr));
+            format("Web %d rejected for physreg, partial reg, or not "
+                   "swap[pable]\n", Repr));
        DEBUG(dbgs() << "  in " << EntryIdx << ": ");
        DEBUG(SwapVector[EntryIdx].VSEMI->dump());
        DEBUG(dbgs() << "\n");
@@ -740,6 +762,21 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
    }
  }
  
+// Create an xxswapd instruction and insert it prior to the given point.
+// MI is used to determine basic block and debug loc information.
+// FIXME: When inserting a swap, we should check whether SrcReg is
+// defined by another swap:  SrcReg = XXPERMDI Reg, Reg, 2;  If so,
+// then instead we should generate a copy from Reg to DstReg.
+void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI,
+                                   MachineBasicBlock::iterator InsertPoint,
+                                   unsigned DstReg, unsigned SrcReg) {
+  BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+          TII->get(PPC::XXPERMDI), DstReg)
+    .addReg(SrcReg)
+    .addReg(SrcReg)
+    .addImm(2);
+}
+
  // The identified swap entry requires special handling to allow its
  // containing computation to be optimized.  Perform that handling
  // here.
@@ -808,7 +845,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
    // For a copy from a scalar floating-point register to a vector
    // register, removing swaps will leave the copied value in the
    // wrong lane.  Insert a swap following the copy to fix this.
-  case SHValues::SH_COPYSCALAR: {
+  case SHValues::SH_COPYWIDEN: {
      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
  
      DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
@@ -829,7 +866,6 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
      // assignment problem.  In this case we must copy from VRRC to VSRC
      // prior to the swap, and from VSRC to VRRC following the swap.
      // Coalescing will usually remove all this mess.
-
      if (DstRC == &PPC::VRRCRegClass) {
        unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
        unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
@@ -839,11 +875,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
          .addReg(NewVReg);
        DEBUG(MI->getNextNode()->dump());
  
-      BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
-              TII->get(PPC::XXPERMDI), VSRCTmp2)
-        .addReg(VSRCTmp1)
-        .addReg(VSRCTmp1)
-        .addImm(2);
+      insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1);
        DEBUG(MI->getNextNode()->getNextNode()->dump());
  
        BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
@@ -852,13 +884,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
        DEBUG(MI->getNextNode()->getNextNode()->getNextNode()->dump());
  
      } else {
-
-      BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
-              TII->get(PPC::XXPERMDI), DstReg)
-        .addReg(NewVReg)
-        .addReg(NewVReg)
-        .addImm(2);
-
+      insertSwap(MI, InsertPoint, DstReg, NewVReg);
        DEBUG(MI->getNextNode()->dump());
      }
      break;
@@ -944,8 +970,8 @@ void PPCVSXSwapRemoval::dumpSwapVector() {
        case SH_XXPERMDI:
          DEBUG(dbgs() << "special:xxpermdi ");
          break;
-      case SH_COPYSCALAR:
-        DEBUG(dbgs() << "special:copyscalar ");
+      case SH_COPYWIDEN:
+        DEBUG(dbgs() << "special:copywiden ");
          break;
        }
      }
diff --git a/test/CodeGen/PowerPC/swaps-le-6.ll b/test/CodeGen/PowerPC/swaps-le-6.ll

new file mode 100644 (file)

index 0000000..365aeee
--- /dev/null
+++ b/test/CodeGen/PowerPC/swaps-le-6.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu -O3 < %s | FileCheck %s
+
+; These tests verify that VSX swap optimization works when loading a scalar
+; into a vector register.
+
+
+@x = global <2 x double> <double 9.970000e+01, double -1.032220e+02>, align 16
+@z = global <2 x double> <double 2.332000e+01, double 3.111111e+01>, align 16
+@y = global double 1.780000e+00, align 8
+
+define void @bar0() {
+entry:
+  %0 = load <2 x double>, <2 x double>* @x, align 16
+  %1 = load double, double* @y, align 8
+  %vecins = insertelement <2 x double> %0, double %1, i32 0
+  store <2 x double> %vecins, <2 x double>* @z, align 16
+  ret void
+}
+
+; CHECK-LABEL: @bar0
+; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
+; CHECK-DAG: lxsdx [[REG2:[0-9]+]]
+; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]]
+; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1
+; CHECK: xxpermdi [[REG5:[0-9]+]], [[REG4]], [[REG1]], 1
+; CHECK: stxvd2x [[REG5]]
+
+define void @bar1() {
+entry:
+  %0 = load <2 x double>, <2 x double>* @x, align 16
+  %1 = load double, double* @y, align 8
+  %vecins = insertelement <2 x double> %0, double %1, i32 1
+  store <2 x double> %vecins, <2 x double>* @z, align 16
+  ret void
+}
+
+; CHECK-LABEL: @bar1
+; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
+; CHECK-DAG: lxsdx [[REG2:[0-9]+]]
+; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]]
+; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1
+; CHECK: xxmrghd [[REG5:[0-9]+]], [[REG1]], [[REG4]]
+; CHECK: stxvd2x [[REG5]]
+
author	Bill Schmidt <wschmidt@linux.vnet.ibm.com>
	Tue, 21 Jul 2015 21:40:17 +0000 (21:40 +0000)
committer	Bill Schmidt <wschmidt@linux.vnet.ibm.com>
	Tue, 21 Jul 2015 21:40:17 +0000 (21:40 +0000)
lib/Target/PowerPC/PPCVSXSwapRemoval.cpp		patch \| blob \| history
test/CodeGen/PowerPC/swaps-le-6.ll	[new file with mode: 0644]	patch \| blob