From 361054b1fa1c823f227bf58ad0c22f9c96e0822c Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Tue, 21 Jul 2015 00:18:59 +0000 Subject: [PATCH] ARMLoadStoreOptimizer: Create LDRD/STRD on thumb2 Re-apply r241926 with an additional check that r13 and r15 are not used for LDRD/STRD. See http://llvm.org/PR24190. This also already includes the fix from r241951. Differential Revision: http://reviews.llvm.org/D10623 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242742 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 133 ++++++++++++++---- ...013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll | 3 +- test/CodeGen/ARM/byval-align.ll | 3 +- test/CodeGen/ARM/ldrd.ll | 21 +++ test/CodeGen/ARM/memset-inline.ll | 3 +- test/CodeGen/ARM/wrong-t2stmia-size-opt.ll | 16 ++- test/CodeGen/Thumb2/aapcs.ll | 6 +- 7 files changed, 138 insertions(+), 47 deletions(-) diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index bef191fc11e..b6f8c320ba5 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -111,8 +111,12 @@ namespace { /// Index into the basic block where the merged instruction will be /// inserted. (See MemOpQueueEntry.Position) unsigned InsertPos; + /// Whether the instructions can be merged into a ldm/stm instruction. + bool CanMergeToLSMulti; + /// Whether the instructions can be merged into a ldrd/strd instruction. + bool CanMergeToLSDouble; }; - BumpPtrAllocator Allocator; + SpecificBumpPtrAllocator Allocator; SmallVector Candidates; void moveLiveRegsBefore(const MachineBasicBlock &MBB, @@ -122,11 +126,14 @@ namespace { MachineBasicBlock::iterator MBBI, DebugLoc DL, unsigned Base, unsigned WordOffset, ARMCC::CondCodes Pred, unsigned PredReg); - MachineInstr *MergeOps(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, int Offset, - unsigned Base, bool BaseKill, unsigned Opcode, - ARMCC::CondCodes Pred, unsigned PredReg, DebugLoc DL, - ArrayRef> Regs); + MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef> Regs); + MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef> Regs) const; void FormCandidates(const MemOpQueue &MemOps); MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand); bool FixInvalidRegPairOp(MachineBasicBlock &MBB, @@ -555,12 +562,10 @@ static bool ContainsReg(const ArrayRef> &Regs, /// Create and insert a LDM or STM with Base as base register and registers in /// Regs as the register operands that would be loaded / stored. It returns /// true if the transformation is done. -MachineInstr * -ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, int Offset, - unsigned Base, bool BaseKill, unsigned Opcode, - ARMCC::CondCodes Pred, unsigned PredReg, DebugLoc DL, - ArrayRef> Regs) { +MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef> Regs) { unsigned NumRegs = Regs.size(); assert(NumRegs > 1); @@ -749,6 +754,28 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, return MIB.getInstr(); } +MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef> Regs) const { + bool IsLoad = isi32Load(Opcode); + assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store"); + unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8; + + assert(Regs.size() == 2); + MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL, + TII->get(LoadStoreOpcode)); + if (IsLoad) { + MIB.addReg(Regs[0].first, RegState::Define) + .addReg(Regs[1].first, RegState::Define); + } else { + MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second)) + .addReg(Regs[1].first, getKillRegState(Regs[1].second)); + } + MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + return MIB.getInstr(); +} + /// Call MergeOps and update MemOps and merges accordingly on success. MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { const MachineInstr *First = Cand.Instrs.front(); @@ -799,7 +826,12 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg); DebugLoc DL = First->getDebugLoc(); - MachineInstr *Merged = MergeOps(MBB, InsertBefore, Offset, Base, BaseKill, + MachineInstr *Merged = nullptr; + if (Cand.CanMergeToLSDouble) + Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill, + Opcode, Pred, PredReg, DL, Regs); + if (!Merged && Cand.CanMergeToLSMulti) + Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill, Opcode, Pred, PredReg, DL, Regs); if (!Merged) return nullptr; @@ -861,6 +893,13 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { return Merged; } +static bool isValidLSDoubleOffset(int Offset) { + unsigned Value = abs(Offset); + // t2LDRDi8/t2STRDi8 supports an 8 bit immediate which is internally + // multiplied by 4. + return (Value % 4) == 0 && Value < 1024; +} + /// Find candidates for load/store multiple merge in list of MemOpQueueEntries. void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { const MachineInstr *FirstMI = MemOps[0].MI; @@ -880,29 +919,55 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { unsigned Latest = SIndex; unsigned Earliest = SIndex; unsigned Count = 1; - - // Merge additional instructions fulfilling LDM/STM constraints. + bool CanMergeToLSDouble = + STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset); + // ARM errata 602117: LDRD with base in list may result in incorrect base + // register when interrupted or faulted. + if (STI->isCortexM3() && isi32Load(Opcode) && + PReg == getLoadStoreBaseOp(*MI).getReg()) + CanMergeToLSDouble = false; + + bool CanMergeToLSMulti = true; + // On swift vldm/vstm starting with an odd register number as that needs + // more uops than single vldrs. + if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1) + CanMergeToLSMulti = false; + + // LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it + // deprecated; LDM to PC is fine but cannot happen here. + if (PReg == ARM::SP || PReg == ARM::PC) + CanMergeToLSMulti = CanMergeToLSDouble = false; + + // Merge following instructions where possible. for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) { int NewOffset = MemOps[I].Offset; if (NewOffset != Offset + (int)Size) break; const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI); unsigned Reg = MO.getReg(); - if (Reg == ARM::SP) + if (Reg == ARM::SP || Reg == ARM::PC) break; + + // See if the current load/store may be part of a multi load/store. unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg); - // Register numbers must be in ascending order. - if (RegNum <= PRegNum) - break; - // For VFP / NEON load/store multiples, the registers must be consecutive - // and within the limit on the number of registers per instruction. - if (!isNotVFP && RegNum != PRegNum+1) - break; - // On Swift we don't want vldm/vstm to start with a odd register num - // because Q register unaligned vldm/vstm need more uops. - if (!isNotVFP && STI->isSwift() && Count == 1 && (PRegNum % 2) == 1) - break; + bool PartOfLSMulti = CanMergeToLSMulti; + if (PartOfLSMulti) { + // Register numbers must be in ascending order. + if (RegNum <= PRegNum) + PartOfLSMulti = false; + // For VFP / NEON load/store multiples, the registers must be + // consecutive and within the limit on the number of registers per + // instruction. + else if (!isNotVFP && RegNum != PRegNum+1) + PartOfLSMulti = false; + } + // See if the current load/store may be part of a double load/store. + bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1; + if (!PartOfLSMulti && !PartOfLSDouble) + break; + CanMergeToLSMulti &= PartOfLSMulti; + CanMergeToLSDouble &= PartOfLSDouble; // Track MemOp with latest and earliest position (Positions are // counted in reverse). unsigned Position = MemOps[I].Position; @@ -916,12 +981,16 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { } // Form a candidate from the Ops collected so far. - MergeCandidate *Candidate = new(Allocator) MergeCandidate; + MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate; for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C) Candidate->Instrs.push_back(MemOps[C].MI); Candidate->LatestMIIdx = Latest - SIndex; Candidate->EarliestMIIdx = Earliest - SIndex; Candidate->InsertPos = MemOps[Latest].Position; + if (Count == 1) + CanMergeToLSMulti = CanMergeToLSDouble = false; + Candidate->CanMergeToLSMulti = CanMergeToLSMulti; + Candidate->CanMergeToLSDouble = CanMergeToLSDouble; Candidates.push_back(Candidate); // Continue after the chain. SIndex += Count; @@ -1653,12 +1722,14 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // Go through list of candidates and merge. bool Changed = false; for (const MergeCandidate *Candidate : Candidates) { - if (Candidate->Instrs.size() > 1) { + if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) { MachineInstr *Merged = MergeOpsUpdate(*Candidate); // Merge preceding/trailing base inc/dec into the merged op. if (Merged) { - MergeBaseUpdateLSMultiple(Merged); Changed = true; + unsigned Opcode = Merged->getOpcode(); + if (Opcode != ARM::t2STRDi8 && Opcode != ARM::t2LDRDi8) + MergeBaseUpdateLSMultiple(Merged); } else { for (MachineInstr *MI : Candidate->Instrs) { if (MergeBaseUpdateLoadStore(MI)) @@ -1738,7 +1809,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { Modified |= MergeReturnIntoLDM(MBB); } - Allocator.Reset(); + Allocator.DestroyAll(); return Modified; } diff --git a/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll b/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll index c93d2a2d34f..ac5b6f9c970 100644 --- a/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll +++ b/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll @@ -25,8 +25,7 @@ entry: ;CHECK: push {r7, lr} ;CHECK: sub sp, #4 ;CHECK: add r0, sp, #12 - ;CHECK: str r2, [sp, #16] - ;CHECK: str r1, [sp, #12] + ;CHECK: strd r1, r2, [sp, #12] ;CHECK: bl fooUseStruct call void @fooUseStruct(%st_t* %p1) ret void diff --git a/test/CodeGen/ARM/byval-align.ll b/test/CodeGen/ARM/byval-align.ll index a26b5a79575..8a506280dd5 100644 --- a/test/CodeGen/ARM/byval-align.ll +++ b/test/CodeGen/ARM/byval-align.ll @@ -28,8 +28,7 @@ define i32 @test_align8(i8*, [4 x i32]* byval align 8 %b) { ; CHECK: push {r4, r7, lr} ; CHECK: add r7, sp, #4 -; CHECK-DAG: str r2, [r7, #8] -; CHECK-DAG: str r3, [r7, #12] +; CHECK: strd r2, r3, [r7, #8] ; CHECK: ldr r0, [r7, #8] diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll index f3e13671ac3..5411618ed86 100644 --- a/test/CodeGen/ARM/ldrd.ll +++ b/test/CodeGen/ARM/ldrd.ll @@ -3,6 +3,7 @@ ; rdar://6949835 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK +; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK ; Magic ARM pair hints works best with linearscan / fast. @@ -110,5 +111,25 @@ entry: ret void } +; CHECK-LABEL: strd_spill_ldrd_reload: +; A8: strd r1, r0, [sp] +; M3: strd r1, r0, [sp] +; BASIC: strd r1, r0, [sp] +; GREEDY: strd r0, r1, [sp] +; CHECK: @ InlineAsm Start +; CHECK: @ InlineAsm End +; A8: ldrd r2, r1, [sp] +; M3: ldrd r2, r1, [sp] +; BASIC: ldrd r2, r1, [sp] +; GREEDY: ldrd r1, r2, [sp] +; CHECK: bl{{x?}} _extfunc +define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) { + ; force %v0 and %v1 to be spilled + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{lr}"() + ; force the reloaded %v0, %v1 into different registers + call void @extfunc(i32 0, i32 %v0, i32 %v1, i32 7) + ret void +} + declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll index 191db1e20a2..f6f8d562350 100644 --- a/test/CodeGen/ARM/memset-inline.ll +++ b/test/CodeGen/ARM/memset-inline.ll @@ -4,8 +4,7 @@ define void @t1(i8* nocapture %c) nounwind optsize { entry: ; CHECK-LABEL: t1: ; CHECK: movs r1, #0 -; CHECK: str r1, [r0] -; CHECK: str r1, [r0, #4] +; CHECK: strd r1, r1, [r0] ; CHECK: str r1, [r0, #8] call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false) ret void diff --git a/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll index 96c5fb8961e..fe335df7a1a 100644 --- a/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll +++ b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll @@ -5,16 +5,20 @@ target triple = "thumbv7--linux-gnueabi" declare i8* @llvm.returnaddress(i32) -define i32* @wrong-t2stmia-size-reduction(i32* %addr, i32 %val0) minsize { +define i32* @wrong-t2stmia-size-reduction(i32* %addr, i32 %val0, i32 %val1) minsize { store i32 %val0, i32* %addr %addr1 = getelementptr i32, i32* %addr, i32 1 + %addr2 = getelementptr i32, i32* %addr, i32 2 %lr = call i8* @llvm.returnaddress(i32 0) %lr32 = ptrtoint i8* %lr to i32 - store i32 %lr32, i32* %addr1 - %addr2 = getelementptr i32, i32* %addr1, i32 1 - ret i32* %addr2 + store i32 %val1, i32* %addr1 + store i32 %lr32, i32* %addr2 + + %addr3 = getelementptr i32, i32* %addr, i32 3 + ret i32* %addr3 } -; Check that stm writes two registers. The bug caused one of registers (LR, +; Check that stm writes three registers. The bug caused one of registers (LR, ; which invalid for Thumb1 form of STMIA instruction) to be dropped. -; CHECK: stm{{[^,]*}}, {{{.*,.*}}} +; CHECK-LABEL: wrong-t2stmia-size-reduction: +; CHECK: stm{{[^,]*}}, {{{.*,.*,.*}}} diff --git a/test/CodeGen/Thumb2/aapcs.ll b/test/CodeGen/Thumb2/aapcs.ll index 21af8c119b0..299562fe4c5 100644 --- a/test/CodeGen/Thumb2/aapcs.ll +++ b/test/CodeGen/Thumb2/aapcs.ll @@ -33,8 +33,7 @@ define float @float_on_stack(double %a, double %b, double %c, double %d, double define double @double_on_stack(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) { ; CHECK-LABEL: double_on_stack: -; SOFT: ldr r0, [sp, #48] -; SOFT: ldr r1, [sp, #52] +; SOFT: ldrd r0, r1, [sp, #48] ; HARD: vldr d0, [sp] ; CHECK-NEXT: bx lr ret double %i @@ -42,8 +41,7 @@ define double @double_on_stack(double %a, double %b, double %c, double %d, doubl define double @double_not_split(double %a, double %b, double %c, double %d, double %e, double %f, double %g, float %h, double %i) { ; CHECK-LABEL: double_not_split: -; SOFT: ldr r0, [sp, #48] -; SOFT: ldr r1, [sp, #52] +; SOFT: ldrd r0, r1, [sp, #48] ; HARD: vldr d0, [sp] ; CHECK-NEXT: bx lr ret double %i -- 2.34.1