[ARM64] Adds Cortex-A53 scheduling support for vector load/store post.

author Chad Rosier <mcrosier@codeaurora.org>

Mon, 19 May 2014 22:59:51 +0000 (22:59 +0000)

committer Chad Rosier <mcrosier@codeaurora.org>

Mon, 19 May 2014 22:59:51 +0000 (22:59 +0000)
author Chad Rosier <mcrosier@codeaurora.org>
Mon, 19 May 2014 22:59:51 +0000 (22:59 +0000)
committer Chad Rosier <mcrosier@codeaurora.org>
Mon, 19 May 2014 22:59:51 +0000 (22:59 +0000)
diff --git a/lib/Target/ARM64/ARM64InstrInfo.cpp b/lib/Target/ARM64/ARM64InstrInfo.cpp

index 97194b1d479600caab70411c3e833b1b978b2f32..5643fb0ce2c88d846b6d61829d02be1ab1a2e457 100644 (file)
--- a/lib/Target/ARM64/ARM64InstrInfo.cpp
+++ b/lib/Target/ARM64/ARM64InstrInfo.cpp
@@ -841,10 +841,73 @@ bool ARM64InstrInfo::optimizeCompareInstr(
  }
  
  /// Return true if this is this instruction has a non-zero immediate
-bool ARM64InstrInfo::hasNonZeroImm(const MachineInstr *MI) const {
-  if (MI->getOperand(3).isImm()) {
-    unsigned val = MI->getOperand(3).getImm();
-    return (val != 0);
+bool ARM64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
+  // Only the opcodes listed below are examined; every other opcode falls
+  // through to the final "return false".
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case ARM64::ADDSWrs:
+  case ARM64::ADDSXrs:
+  case ARM64::ADDWrs:
+  case ARM64::ADDXrs:
+  case ARM64::ANDSWrs:
+  case ARM64::ANDSXrs:
+  case ARM64::ANDWrs:
+  case ARM64::ANDXrs:
+  case ARM64::BICSWrs:
+  case ARM64::BICSXrs:
+  case ARM64::BICWrs:
+  case ARM64::BICXrs:
+  case ARM64::CRC32Brr:
+  case ARM64::CRC32CBrr:
+  case ARM64::CRC32CHrr:
+  case ARM64::CRC32CWrr:
+  case ARM64::CRC32CXrr:
+  case ARM64::CRC32Hrr:
+  case ARM64::CRC32Wrr:
+  case ARM64::CRC32Xrr:
+  case ARM64::EONWrs:
+  case ARM64::EONXrs:
+  case ARM64::EORWrs:
+  case ARM64::EORXrs:
+  case ARM64::ORNWrs:
+  case ARM64::ORNXrs:
+  case ARM64::ORRWrs:
+  case ARM64::ORRXrs:
+  case ARM64::SUBSWrs:
+  case ARM64::SUBSXrs:
+  case ARM64::SUBWrs:
+  case ARM64::SUBXrs:
+    // Operand 3 is read as the shift value; the register counts as shifted
+    // only when that operand is a non-zero immediate. The operand-count guard
+    // keeps any listed opcode with fewer than four operands from indexing past
+    // the operand list.
+    // NOTE(review): the CRC32*rr forms presumably carry only three operands --
+    // confirm, and consider dropping them from this list.
+    if (MI->getNumOperands() > 3 && MI->getOperand(3).isImm()) {
+      unsigned val = MI->getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
+  }
+  return false;
+}
+
+/// Return true if this instruction has a non-zero extend immediate.
+bool ARM64InstrInfo::hasExtendedReg(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case ARM64::ADDSWrx:
+  case ARM64::ADDSXrx:
+  case ARM64::ADDSXrx64:
+  case ARM64::ADDWrx:
+  case ARM64::ADDXrx:
+  case ARM64::ADDXrx64:
+  case ARM64::SUBSWrx:
+  case ARM64::SUBSXrx:
+  case ARM64::SUBSXrx64:
+  case ARM64::SUBWrx:
+  case ARM64::SUBXrx:
+  case ARM64::SUBXrx64:
+    if (MI->getOperand(3).isImm()) {
+      unsigned val = MI->getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
    }
  
    return false;
diff --git a/lib/Target/ARM64/ARM64InstrInfo.h b/lib/Target/ARM64/ARM64InstrInfo.h

index 4bfdc882a2fdb44b61b584dd7b40cd277fa59496..ce195e763b2b3c014970d8eca435047dcc21ff62 100644 (file)
--- a/lib/Target/ARM64/ARM64InstrInfo.h
+++ b/lib/Target/ARM64/ARM64InstrInfo.h
@@ -56,8 +56,13 @@ public:
    unsigned isStoreToStackSlot(const MachineInstr *MI,
                                int &FrameIndex) const override;
  
-  /// \brief Is there a non-zero immediate?
-  bool hasNonZeroImm(const MachineInstr *MI) const;
+  /// Returns true if there is a shiftable register and the shift value is
+  /// non-zero.
+  bool hasShiftedReg(const MachineInstr *MI) const;
+
+  /// Returns true if there is an extendable register and the extend value is
+  /// non-zero.
+  bool hasExtendedReg(const MachineInstr *MI) const;
  
    /// \brief Does this instruction set its full destination register to zero?
    bool isGPRZero(const MachineInstr *MI) const;
diff --git a/lib/Target/ARM64/ARM64SchedA53.td b/lib/Target/ARM64/ARM64SchedA53.td

index e07e93d12ef0ccba4e3a01e3bc44b164efcf3220..cf1a820276426684ca6279e17e409fae0c5fee69 100644 (file)
--- a/lib/Target/ARM64/ARM64SchedA53.td
+++ b/lib/Target/ARM64/ARM64SchedA53.td
@@ -148,7 +148,9 @@ def : ReadAdvance<ReadVLD, 0>;
  
  // ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
  //       operands are needed one cycle later if and only if they are to be
-//       shifted. Otherwise, they too are needed two cycle later.
+//       shifted. Otherwise, they too are needed two cycles later. This same
+//       ReadAdvance applies to Extended registers as well, even though there is
+//       a separate SchedPredicate for them.
  def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
                               WriteISReg, WriteIEReg,WriteIS,
                               WriteID32,WriteID64,
@@ -167,7 +169,7 @@ def A53ReadISReg : SchedReadVariant<[
  def : SchedAlias<ReadISReg, A53ReadISReg>;
  
  def A53ReadIEReg : SchedReadVariant<[
-       SchedVar<RegShiftedPred, [A53ReadShifted]>,
+       SchedVar<RegExtendedPred, [A53ReadShifted]>,
         SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
  def : SchedAlias<ReadIEReg, A53ReadIEReg>;
  
@@ -196,64 +198,83 @@ def : ReadAdvance<ReadID, 1, [WriteImm,WriteI,
  //---
  def : InstRW<[WriteI], (instrs COPY)>;
  
-//---
-// Vector Mul with Accumulate
-//---
-//def : InstRW<[WriteIM32, A53ReadIMA], (instregex "^M(ADD|SUB)W.*")>;
-//def : InstRW<[WriteIM64, A53ReadIMA], (instregex "^M(ADD|SUB)X.*")>;
-
  //---
  // Vector Loads
  //---
-def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-
-def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
-def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
-
-def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2dq)(_POST)?$")>;
-def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
-def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)(_POST)?$")>;
-
-def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
-def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)(_POST)?$")>;
-
-def : InstRW<[A53WriteVLD1, A53WriteVLD1], (instregex "LDN?PS.*$")>;
-def : InstRW<[A53WriteVLD2, A53WriteVLD2], (instregex "LDN?PD.*$")>;
-def : InstRW<[A53WriteVLD4, A53WriteVLD4], (instregex "LDN?PQ.*$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// LD2: each plain opcode and each _POST opcode is matched by exactly one entry.
+// The _POST entries additionally list WriteAdr, and their patterns require the
+// _POST suffix so they do not overlap the plain LD2 entries.
+def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
  
  //---
  // Vector Stores
  //---
-def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-
-def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)(_POST)?$")>;
-def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)(_POST)?$")>;
-
-def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
-def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)(_POST)?$")>;
-
-def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
-def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)(_POST)?$")>;
-
-def : InstRW<[A53WriteVST1], (instregex "STN?P(S|D).*$")>;
-def : InstRW<[A53WriteVST2], (instregex "STN?PQ.*$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
  
  //---
  // Floating Point MAC, DIV, SQRT
diff --git a/lib/Target/ARM64/ARM64Schedule.td b/lib/Target/ARM64/ARM64Schedule.td

index 26a484fa0ad55c4355b300ecffdabbd188bb6598..3a4194173a8e4fc668efcae799738b16c19bbb6f 100644 (file)
--- a/lib/Target/ARM64/ARM64Schedule.td
+++ b/lib/Target/ARM64/ARM64Schedule.td
@@ -51,7 +51,10 @@ def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
  def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST.
  
  // Predicate for determining when a shiftable register is shifted.
-def RegShiftedPred : SchedPredicate<[{TII->hasNonZeroImm(MI)}]>;
+def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>;
+
+// Predicate for determining when an extendable register is extended.
+def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>;
  
  // ScaledIdxPred is true if a WriteLDIdx operand will be
  // scaled. Subtargets can use this to dynamically select resources and
diff --git a/test/CodeGen/ARM64/misched-basic-A53.ll b/test/CodeGen/ARM64/misched-basic-A53.ll

index 608e5b65b63d7e2314dd72970f7461b6bd01bf12..b87a523a30be0f798e2449e1ef739a44ceb1562d 100644 (file)
--- a/test/CodeGen/ARM64/misched-basic-A53.ll
+++ b/test/CodeGen/ARM64/misched-basic-A53.ll
@@ -108,3 +108,18 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i
  
  attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
  attributes #1 = { nounwind }
+
+
+; Regression Test for Bug 19761
+; - [ARM64] Cortex-a53 schedule mode can't handle NEON post-increment load
+; - http://llvm.org/bugs/show_bug.cgi?id=19761
+;
+; Nothing explicit to check other than llc not crashing.
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) {
+  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A)
+  %tmp = getelementptr i8* %A, i32 32
+  store i8* %tmp, i8** %ptr
+  ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8*)
author	Chad Rosier <mcrosier@codeaurora.org>
	Mon, 19 May 2014 22:59:51 +0000 (22:59 +0000)
committer	Chad Rosier <mcrosier@codeaurora.org>
	Mon, 19 May 2014 22:59:51 +0000 (22:59 +0000)
lib/Target/ARM64/ARM64InstrInfo.cpp		patch \| blob \| history
lib/Target/ARM64/ARM64InstrInfo.h		patch \| blob \| history
lib/Target/ARM64/ARM64SchedA53.td		patch \| blob \| history
lib/Target/ARM64/ARM64Schedule.td		patch \| blob \| history
test/CodeGen/ARM64/misched-basic-A53.ll		patch \| blob \| history