From c4a32e6596f3974a6c00322db1f5f31ea448bd58 Mon Sep 17 00:00:00 2001
From: Tim Northover
Date: Thu, 30 Aug 2012 10:17:45 +0000
Subject: [PATCH] Add support for moving pure S-register to NEON pipeline if
 desired

On Cortex-A9, VMOVS (an S-register to S-register copy) executes in the VFP
pipeline, and mixing VFP and NEON instructions incurs a domain-crossing
penalty. Mark unpredicated VMOVS as convertible in getExecutionDomain, and
teach setExecutionDomain to rewrite it in NEON: as a single VDUPLN32d when
source and destination live in the same D-register, or as a pair of VEXTd32
instructions when they do not.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@162898 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMBaseInstrInfo.cpp   | 73 ++++++++++++++++++++++++++-
 test/CodeGen/ARM/domain-conv-vmovs.ll | 64 +++++++++++++++++++++++
 2 files changed, 135 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/ARM/domain-conv-vmovs.ll

diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 41d0c570bf4..7b7b6e3395e 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -3377,7 +3377,8 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
   // converted.
   if (Subtarget.isCortexA9() && !isPredicated(MI) &&
       (MI->getOpcode() == ARM::VMOVRS ||
-       MI->getOpcode() == ARM::VMOVSR))
+       MI->getOpcode() == ARM::VMOVSR ||
+       MI->getOpcode() == ARM::VMOVS))
     return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
 
   // No other instructions can be swizzled, so just determine their domain.
@@ -3431,4 +3432,72 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
       MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
       break;
+    case ARM::VMOVS: {
+      if (Domain != ExeNEON)
+        break;
+
+      // Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits)
+      DstReg = MI->getOperand(0).getReg();
+      SrcReg = MI->getOperand(1).getReg();
+
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      unsigned DstLane = 0, SrcLane = 0, DDst, DSrc;
+      DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
+      DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane);
+
+      if (DSrc == DDst) {
+        // Destination can be:
+        //     %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits)
+        MI->setDesc(get(ARM::VDUPLN32d));
+        AddDefaultPred(MIB.addReg(DDst, RegState::Define)
+                          .addReg(DDst, RegState::Undef)
+                          .addImm(SrcLane));
+
+        // Neither the source nor the destination is naturally represented
+        // any more, so add them in manually.
+        MIB.addReg(DstReg, RegState::Implicit | RegState::Define);
+        MIB.addReg(SrcReg, RegState::Implicit);
+        break;
+      }
+
+      // In general there's no single instruction that can perform an S <-> S
+      // move in NEON space, but a pair of VEXT instructions *can* do the
+      // job. It turns out that the VEXTs needed will only use DSrc once, with
+      // the position based purely on the combination of lane-0 and lane-1
+      // involved. For example:
+      //     vmov s0, s2 -> vext.32 d0, d0, d1, #1  vext.32 d0, d0, d0, #1
+      //     vmov s1, s3 -> vext.32 d0, d1, d0, #1  vext.32 d0, d0, d0, #1
+      //     vmov s0, s3 -> vext.32 d0, d0, d0, #1  vext.32 d0, d1, d0, #1
+      //     vmov s1, s2 -> vext.32 d0, d0, d0, #1  vext.32 d0, d0, d1, #1
+      //
+      // Pattern of the MachineInstrs is:
+      //     %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (; implicits)
+      MachineInstrBuilder NewMIB;
+      NewMIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                       get(ARM::VEXTd32), DDst);
+      NewMIB.addReg(SrcLane == 1 && DstLane == 1 ? DSrc : DDst, RegState::Undef);
+      NewMIB.addReg(SrcLane == 0 && DstLane == 0 ? DSrc : DDst, RegState::Undef);
+      NewMIB.addImm(1);
+      AddDefaultPred(NewMIB);
+
+      if (SrcLane == DstLane)
+        NewMIB.addReg(SrcReg, RegState::Implicit);
+
+      MI->setDesc(get(ARM::VEXTd32));
+      MIB.addReg(DDst, RegState::Define);
+      MIB.addReg(SrcLane == 1 && DstLane == 0 ? DSrc : DDst, RegState::Undef);
+      MIB.addReg(SrcLane == 0 && DstLane == 1 ? DSrc : DDst, RegState::Undef);
+      MIB.addImm(1);
+      AddDefaultPred(MIB);
+
+      if (SrcLane != DstLane)
+        MIB.addReg(SrcReg, RegState::Implicit);
+
+      // As before, the original destination is no longer represented, so add
+      // it back implicitly.
+      MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+      break;
+    }
   }
 }
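The VEXT selection logic above can be checked in isolation. The following
standalone C++ program is an illustration only, not part of the patch: the
string names "d0"/"d1" stand in for LLVM's register enums, with the
destination S-register assumed to live in d0 and the source in d1, exactly as
in the comment's examples. It reproduces the operand selection for the two
VEXTd32 instructions:

// Illustration (not part of the patch): reproduce the VEXT operand-selection
// table from the VMOVS comment above. "d0"/"d1" stand in for DDst/DSrc.
#include <cstdio>

int main() {
  const char *DDst = "d0", *DSrc = "d1";
  for (unsigned DstLane = 0; DstLane != 2; ++DstLane) {
    for (unsigned SrcLane = 0; SrcLane != 2; ++SrcLane) {
      // Operands of the first VEXTd32, inserted before the original MI.
      const char *A1 = (SrcLane == 1 && DstLane == 1) ? DSrc : DDst;
      const char *B1 = (SrcLane == 0 && DstLane == 0) ? DSrc : DDst;
      // Operands of the second VEXTd32, the rewritten MI itself.
      const char *A2 = (SrcLane == 1 && DstLane == 0) ? DSrc : DDst;
      const char *B2 = (SrcLane == 0 && DstLane == 1) ? DSrc : DDst;
      // sDst is lane DstLane of d0; sSrc is lane SrcLane of d1 (s2 or s3).
      std::printf("vmov s%u, s%u -> vext.32 %s, %s, %s, #1  vext.32 %s, %s, %s, #1\n",
                  DstLane, SrcLane + 2, DDst, A1, B1, DDst, A2, B2);
    }
  }
  return 0;
}

Its output lists, for each of the four lane combinations, the same VEXT pair
as the comment in the VMOVS case, showing that DSrc is read by exactly one
operand across the pair.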
diff --git a/test/CodeGen/ARM/domain-conv-vmovs.ll b/test/CodeGen/ARM/domain-conv-vmovs.ll
new file mode 100644
index 00000000000..f1cd9f58404
--- /dev/null
+++ b/test/CodeGen/ARM/domain-conv-vmovs.ll
@@ -0,0 +1,64 @@
+; RUN: llc -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a9 -mattr=+neon,+neonfp -float-abi=hard < %s | FileCheck %s
+
+define <2 x float> @test_vmovs_via_vext_lane0to0(float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane0to0:
+  %vec = insertelement <2 x float> %in, float %arg, i32 0
+  %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d0, #1
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+  ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane0to1(float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane0to1:
+  %vec = insertelement <2 x float> %in, float %arg, i32 1
+  %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vext.32 d1, d1, d0, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+  ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane1to0(float, float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane1to0:
+  %vec = insertelement <2 x float> %in, float %arg, i32 0
+  %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vext.32 d1, d0, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+  ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane1to1(float, float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane1to1:
+  %vec = insertelement <2 x float> %in, float %arg, i32 1
+  %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d0, d1, #1
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+  ret <2 x float> %res
+}
+
+
+define float @test_vmovs_via_vdup(float, float %ret, float %lhs, float %rhs) {
+; CHECK: test_vmovs_via_vdup:
+
+  ; Do an operation (which will end up NEON because of +neonfp) to convince the
+  ; execution-domain pass that NEON is a good thing to use.
+  %res = fadd float %ret, %ret
+  ; It makes sense for LLVM to do the addition in d0 here, because it's going
+  ; to be returned. This means it will want a "vmov s0, s1":
+; CHECK: vdup.32 d0, d0[1]
+
+  ret float %res
+}
+
-- 
2.34.1
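A closing note on the VDUP path exercised by test_vmovs_via_vdup: vdup.32
broadcasts the selected source lane to both lanes of the destination
D-register. That is a correct implementation of an S-register move only when
source and destination live in the same D-register, because then the
destination lane receives the source value while the source lane is merely
rewritten with its own value; this is precisely what the DSrc == DDst test
guards. A standalone sketch of that decision, again with plain integer
indices standing in for the register enums and relying on the architectural
fact that sN is lane (N % 2) of d(N / 2):

// Illustrative sketch (not from the patch): choose between the VDUP and
// VEXT rewrites for "vmov sDst, sSrc" the same way the DSrc == DDst test does.
#include <cstdio>

static void rewriteVMOVS(unsigned SDst, unsigned SSrc) {
  unsigned DDst = SDst / 2, DstLane = SDst % 2;
  unsigned DSrc = SSrc / 2, SrcLane = SSrc % 2;
  if (DDst == DSrc) {
    // Same D-register: one lane-broadcast suffices.
    std::printf("vmov s%u, s%u -> vdup.32 d%u, d%u[%u]\n",
                SDst, SSrc, DDst, DSrc, SrcLane);
  } else {
    // Different D-registers: fall back to the two-VEXT sequence.
    std::printf("vmov s%u, s%u -> two vext.32 (lane %u <- lane %u)\n",
                SDst, SSrc, DstLane, SrcLane);
  }
}

int main() {
  rewriteVMOVS(0, 1); // prints vdup.32 d0, d0[1], as checked by the test
  rewriteVMOVS(2, 0); // VEXT path, as checked by test_vmovs_via_vext_lane0to0
  return 0;
}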