#define DEBUG_TYPE "mlx-expansion"
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
+ bool isA9;
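+ // Circular buffer of the four most recently visited instructions; MIIdx
+ // is the next slot to fill. Since the pass scans each block bottom-up,
+ // these are the instructions that follow the current one in program order.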
unsigned MIIdx;
MachineInstr* LastMIs[4];
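+ // MLx instructions we leave intact because the MLx they feed is being
+ // expanded instead; breaking one of a back-to-back pair is enough.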
+ SmallPtrSet<MachineInstr*, 4> IgnoreStall;
void clearStack();
void pushStack(MachineInstr *MI);
MachineInstr *getAccDefMI(MachineInstr *MI) const;
unsigned getDefReg(MachineInstr *MI) const;
bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
- bool FindMLxHazard(MachineInstr *MI) const;
+ bool FindMLxHazard(MachineInstr *MI);
void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
unsigned MulOpc, unsigned AddSubOpc,
bool NegAcc, bool HasLane);
}
-bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const {
+bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
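+ // Debugging knobs: ExpandLimit caps how many MLx ops get broken up,
+ // and ForceExapnd expands every candidate unconditionally.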
if (NumExpand >= ExpandLimit)
return false;

if (ForceExapnd)
return true;
MachineInstr *DefMI = getAccDefMI(MI);
- if (TII->isFpMLxInstruction(DefMI->getOpcode()))
+ if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
// r0 = vmla
// r3 = vmla r0, r1, r2
// takes 16 - 17 cycles
//
// r4 = vmul r1, r2
// r3 = vadd r0, r4
// takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
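+ // Remember the defining MLx so the check below leaves it intact when the
+ // reverse scan reaches it; expanding this one already breaks the chain.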
+ IgnoreStall.insert(DefMI);
return true;
+ }
+
+ if (IgnoreStall.count(MI))
+ return false;
// If a VMLA.F is followed by a VADD.F or VMUL.F with no RAW hazard, the
// VADD.F or VMUL.F will stall 4 cycles before issue. The 4-cycle stall
// preserves the in-order retirement of the instructions.
// Look at the next few instructions; if *most* of them can cause hazards,
// then the scheduler can't *fix* this, and we'd better break up the VMLA.
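+ // On Cortex-A9 only a hazard in the very next instruction forces an
+ // expansion; on other cores any of the next four instructions does.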
+ unsigned Limit1 = isA9 ? 1 : 4;
+ unsigned Limit2 = isA9 ? 1 : 4;
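+ // Walk the circular history: stepping back i slots from MIIdx yields the
+ // i-th most recently visited instruction, i.e. the i-th one after MI.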
for (unsigned i = 1; i <= 4; ++i) {
int Idx = ((int)MIIdx - i + 4) % 4;
MachineInstr *NextMI = LastMIs[Idx];
if (!NextMI)
continue;
- if (TII->canCauseFpMLxStall(NextMI->getOpcode()))
- return true;
+ if (TII->canCauseFpMLxStall(NextMI->getOpcode())) {
+ if (i <= Limit1)
+ return true;
+ }
// Look for VMLx RAW hazard.
- if (hasRAWHazard(getDefReg(MI), NextMI))
+ if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))
return true;
}
bool Changed = false;
clearStack();
+ IgnoreStall.clear();
unsigned Skip = 0;
MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
TRI = Fn.getTarget().getRegisterInfo();
MRI = &Fn.getRegInfo();
+ const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
+ isA9 = STI->isCortexA9();
bool Modified = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
++MFI) {
; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NEON
; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=A9
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s -check-prefix=HARD
define float @t1(float %acc, float %a, float %b) {
entry:
%0 = fmul float %a, %b
%1 = fadd float %0, %acc
ret float %1
}
+
+; It's possible to make use of fp vmla / vmls on Cortex-A9.
+; rdar://8659675
+define void @t4(float %acc1, float %a, float %b, float %acc2, float %c, float* %P1, float* %P2) {
+entry:
+; A8: t4:
+; A8: vmul.f32
+; A8: vmul.f32
+; A8: vadd.f32
+; A8: vadd.f32
+
+; Two vmla with no RAW hazard
+; A9: t4:
+; A9: vmla.f32
+; A9: vmla.f32
+
+; HARD: t4:
+; HARD: vmla.f32 s0, s1, s2
+; HARD: vmla.f32 s3, s1, s4
+ %0 = fmul float %a, %b
+ %1 = fadd float %acc1, %0
+ %2 = fmul float %a, %c
+ %3 = fadd float %acc2, %2
+ store float %1, float* %P1
+ store float %3, float* %P2
+ ret void
+}
+
+define float @t5(float %a, float %b, float %c, float %d, float %e) {
+entry:
+; A8: t5:
+; A8: vmul.f32
+; A8: vmul.f32
+; A8: vadd.f32
+; A8: vadd.f32
+
+; A9: t5:
+; A9: vmla.f32
+; A9: vmul.f32
+; A9: vadd.f32
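+
+; The accumulator of the second fadd is the first vmla's result, so the
+; back-to-back MLx is broken up into a vmul + vadd on A9.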
+
+; HARD: t5:
+; HARD: vmla.f32 s4, s0, s1
+; HARD: vmul.f32 s0, s2, s3
+; HARD: vadd.f32 s0, s4, s0
+ %0 = fmul float %a, %b
+ %1 = fadd float %e, %0
+ %2 = fmul float %c, %d
+ %3 = fadd float %1, %2
+ ret float %3
+}