WebAssembly: improve readme, add placeholder for tests.

[oota-llvm.git] / lib / Target / AArch64 / AArch64LoadStoreOptimizer.cpp
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

index 32933930ac97960564031c6f52fcde445bb39018..27d569d7043228e0b7f960650e3e4e1543bf33b3 100644 (file)
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -41,7 +41,8 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded");
  STATISTIC(NumPreFolded, "Number of pre-index updates folded");
  STATISTIC(NumUnscaledPairCreated,
            "Number of load/store from unscaled generated");
-STATISTIC(NumSmallTypeMerged, "Number of small type loads merged");
+STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
+STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
  
  static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                     cl::init(20), cl::Hidden);
@@ -152,6 +153,8 @@ static bool isUnscaledLdSt(unsigned Opc) {
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
+  case AArch64::STURBBi:
+  case AArch64::STURHHi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
@@ -189,7 +192,23 @@ static unsigned getBitExtrOpcode(MachineInstr *MI) {
    }
  }
  
-static bool isSmallTypeLdMerge(unsigned Opc) {
+static bool isNarrowStore(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+    return true;
+  }
+}
+
+static bool isNarrowStore(MachineInstr *MI) {
+  return isNarrowStore(MI->getOpcode());
+}
+
+static bool isNarrowLoad(unsigned Opc) {
    switch (Opc) {
    default:
      return false;
@@ -205,8 +224,8 @@ static bool isSmallTypeLdMerge(unsigned Opc) {
    }
  }
  
-static bool isSmallTypeLdMerge(MachineInstr *MI) {
-  return isSmallTypeLdMerge(MI->getOpcode());
+static bool isNarrowLoad(MachineInstr *MI) {
+  return isNarrowLoad(MI->getOpcode());
  }
  
  // Scaling factor for unscaled load or store.
@@ -219,12 +238,14 @@ static int getMemScale(MachineInstr *MI) {
    case AArch64::LDRSBWui:
    case AArch64::LDURSBWi:
    case AArch64::STRBBui:
+  case AArch64::STURBBi:
      return 1;
    case AArch64::LDRHHui:
    case AArch64::LDURHHi:
    case AArch64::LDRSHWui:
    case AArch64::LDURSHWi:
    case AArch64::STRHHui:
+  case AArch64::STURHHi:
      return 2;
    case AArch64::LDRSui:
    case AArch64::LDURSi:
@@ -278,6 +299,10 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
    case AArch64::STURDi:
    case AArch64::STRQui:
    case AArch64::STURQi:
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
    case AArch64::STRWui:
    case AArch64::STURWi:
    case AArch64::STRXui:
@@ -327,6 +352,14 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
    case AArch64::STRQui:
    case AArch64::STURQi:
      return AArch64::STPQi;
+  case AArch64::STRBBui:
+    return AArch64::STRHHui;
+  case AArch64::STRHHui:
+    return AArch64::STRWui;
+  case AArch64::STURBBi:
+    return AArch64::STURHHi;
+  case AArch64::STURHHi:
+    return AArch64::STURWi;
    case AArch64::STRWui:
    case AArch64::STURWi:
      return AArch64::STPWi;
@@ -582,7 +615,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
  
    int OffsetImm = getLdStOffsetOp(RtMI).getImm();
  
-  if (isSmallTypeLdMerge(Opc)) {
+  if (isNarrowLoad(Opc)) {
      // Change the scaled offset from small to large type.
      if (!IsUnscaled) {
        assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
@@ -681,17 +714,33 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
      return NextI;
    }
  
-  // Handle Unscaled
-  if (IsUnscaled)
-    OffsetImm /= OffsetStride;
-
    // Construct the new instruction.
-  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
-                                    I->getDebugLoc(), TII->get(NewOpc))
-                                .addOperand(getLdStRegOp(RtMI))
-                                .addOperand(getLdStRegOp(Rt2MI))
-                                .addOperand(BaseRegOp)
-                                .addImm(OffsetImm);
+  MachineInstrBuilder MIB;
+  if (isNarrowStore(Opc)) {
+    // Change the scaled offset from small to large type.
+    if (!IsUnscaled) {
+      assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
+      OffsetImm /= 2;
+    }
+    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                  TII->get(NewOpc))
+              .addOperand(getLdStRegOp(I))
+              .addOperand(BaseRegOp)
+              .addImm(OffsetImm);
+    // Copy MachineMemOperands from the original stores.
+    concatenateMemOperands(MIB, I, Paired);
+  } else {
+    // Handle Unscaled
+    if (IsUnscaled)
+      OffsetImm /= OffsetStride;
+    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                  TII->get(NewOpc))
+              .addOperand(getLdStRegOp(RtMI))
+              .addOperand(getLdStRegOp(Rt2MI))
+              .addOperand(BaseRegOp)
+              .addImm(OffsetImm);
+  }
+
    (void)MIB;
  
    // FIXME: Do we need/want to copy the mem operands from the source
@@ -830,6 +879,11 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
    unsigned Reg = getLdStRegOp(FirstMI).getReg();
    unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
    int Offset = getLdStOffsetOp(FirstMI).getImm();
+  bool IsNarrowStore = isNarrowStore(Opc);
+
+  // For narrow stores, find only the case where the stored value is WZR.
+  if (IsNarrowStore && Reg != AArch64::WZR)
+    return E;
  
    // Early exit if the first instruction modifies the base register.
    // e.g., ldr x0, [x0]
@@ -840,7 +894,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
    // range, plus allow an extra one in case we find a later insn that matches
    // with Offset-1)
    int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
-  if (!isSmallTypeLdMerge(Opc) &&
+  if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
        !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
      return E;
  
@@ -900,17 +954,17 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
          // If the resultant immediate offset of merging these instructions
          // is out of range for a pairwise instruction, bail and keep looking.
          bool MIIsUnscaled = isUnscaledLdSt(MI);
-        bool IsSmallTypeLd = isSmallTypeLdMerge(MI->getOpcode());
-        if (!IsSmallTypeLd &&
+        bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
+        if (!IsNarrowLoad &&
              !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
  
-        if (IsSmallTypeLd) {
-          // If the alignment requirements of the larger type scaled load
-          // instruction can't express the scaled offset of the smaller type
+        if (IsNarrowLoad || IsNarrowStore) {
+          // If the alignment requirements of the scaled wide load/store
+          // instruction can't express the offset of the scaled narrow
            // input, bail and keep looking.
            if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
              trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
@@ -930,7 +984,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
          // If the destination register of the loads is the same register, bail
          // and keep looking. A load-pair instruction with both destination
          // registers the same is UNPREDICTABLE and will result in an exception.
-        if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
+        // For narrow stores, allow only when the stored value is the same
+        // (i.e., WZR).
+        if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
+            (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
@@ -1227,8 +1284,10 @@ bool AArch64LoadStoreOpt::tryToMergeLdStInst(
    LdStPairFlags Flags;
    MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
    if (Paired != E) {
-    if (isSmallTypeLdMerge(MI)) {
-      ++NumSmallTypeMerged;
+    if (isNarrowLoad(MI)) {
+      ++NumNarrowLoadsPromoted;
+    } else if (isNarrowStore(MI)) {
+      ++NumZeroStoresPromoted;
      } else {
        ++NumPairCreated;
        if (isUnscaledLdSt(MI))
@@ -1285,11 +1344,15 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
      case AArch64::LDRHHui:
      case AArch64::LDRSBWui:
      case AArch64::LDRSHWui:
+    case AArch64::STRBBui:
+    case AArch64::STRHHui:
      // Unscaled instructions.
      case AArch64::LDURBBi:
      case AArch64::LDURHHi:
      case AArch64::LDURSBWi:
-    case AArch64::LDURSHWi: {
+    case AArch64::LDURSHWi:
+    case AArch64::STURBBi:
+    case AArch64::STURHHi: {
        if (tryToMergeLdStInst(MBBI)) {
          Modified = true;
          break;
@@ -1463,14 +1526,12 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
  }
  
  bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
-  const AArch64Subtarget *SubTarget =
-      &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
-  bool ProfitableArch = SubTarget->isCortexA57();
+  bool ProfitableArch = Subtarget->isCortexA57();
    // FIXME: The benefit from converting narrow loads into a wider load could be
    // microarchitectural as it assumes that a single load with two bitfield
    // extracts is cheaper than two narrow loads. Currently, this conversion is
    // enabled only in cortex-a57 on which performance benefits were verified.
-  return ProfitableArch & (!SubTarget->requiresStrictAlign());
+  return ProfitableArch && !Subtarget->requiresStrictAlign();
  }
  
  bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {