From 89becbb97423fb608a4dd85ec10c3fde4398d956 Mon Sep 17 00:00:00 2001
From: Stepan Dyatkovskiy
Date: Fri, 5 Apr 2013 05:52:14 +0000
Subject: [PATCH] Fix for PR14824: "Optimization arm_ldst_opt inserts newly
 generated instruction vldmia at incorrect position".

The patch introduces memory-operand tracking in
ARMLoadStoreOpt::LoadStoreMultipleOpti: for each register it keeps the
order of load operations as it was before the optimization pass. It is a
deeper development of the fix proposed by Hao
(http://llvm.org/bugs/show_bug.cgi?id=14824#c4), but it also tracks
conflicts between different register classes (e.g. D2 and S5).

For more details see:

Bug description:
http://llvm.org/bugs/show_bug.cgi?id=14824

LLVM Commits discussion:
http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20130311/167936.html
http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20130318/168688.html
http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20130325/169376.html
http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20130401/170238.html

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178851 91177308-0d34-0410-b5e6-96231b3b80d8
---
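Note on the approach, with a self-contained sketch (an illustration, not
code from the patch: MemOp, unitRegs(), Queue and UsedUnits below are
simplified stand-ins for MemOpQueueEntry, UnitRegsMap, MemOps and
UsedRegUnits, and S/D/Q aliasing is modelled with plain integers instead
of the real ARM register enums). The key observation is that a D register
covers two S "units" (D2 = S4 + S5), so a later vldr into d2 overrides the
value written by an earlier queued vldr into s5; the earlier load has to
be evicted from the merge queue, otherwise folding both into one vldmia
would reorder the writes:

  // Simplified model: a "unit" is one S register. D<n> covers S(2n) and
  // S(2n+1); Q<n> covers S(4n)..S(4n+3).
  #include <algorithm>
  #include <cstdio>
  #include <set>
  #include <vector>

  struct MemOp { char Cls; unsigned Idx; }; // destination reg of one load

  static std::vector<unsigned> unitRegs(const MemOp &Op) {
    if (Op.Cls == 'S')
      return std::vector<unsigned>(1, Op.Idx);
    std::vector<unsigned> Res;
    unsigned N = (Op.Cls == 'D') ? 2 : 4;   // D: 2 units, Q: 4 units
    for (unsigned i = 0; i != N; ++i)
      Res.push_back(N * Op.Idx + i);
    return Res;
  }

  int main() {
    std::vector<MemOp> Queue;     // loads queued for one future VLDM
    std::set<unsigned> UsedUnits; // S units written by queued loads

    // vldr s5; vldr s7; vldr d2 -- D2 aliases S4 and S5.
    const MemOp Ops[] = { {'S', 5}, {'S', 7}, {'D', 2} };
    for (unsigned i = 0; i != 3; ++i) {
      const MemOp &New = Ops[i];
      std::vector<unsigned> NewUnits = unitRegs(New);

      // Does the new load overwrite a unit some queued load already wrote?
      bool Overrides = false;
      for (unsigned u = 0; u != NewUnits.size(); ++u)
        Overrides |= UsedUnits.count(NewUnits[u]) != 0;

      if (Overrides) {
        // Evict overlapping predecessors: merging them into the same
        // VLDM as the new load would reorder the register writes.
        std::vector<MemOp> Updated;
        for (unsigned q = 0; q != Queue.size(); ++q) {
          std::vector<unsigned> QUnits = unitRegs(Queue[q]);
          bool Overlap = false;
          for (unsigned u = 0; u != QUnits.size(); ++u)
            Overlap |= std::find(NewUnits.begin(), NewUnits.end(),
                                 QUnits[u]) != NewUnits.end();
          if (Overlap) {
            for (unsigned u = 0; u != QUnits.size(); ++u)
              UsedUnits.erase(QUnits[u]);
            std::printf("evict %c%u (overridden by %c%u)\n",
                        Queue[q].Cls, Queue[q].Idx, New.Cls, New.Idx);
          } else
            Updated.push_back(Queue[q]);
        }
        Queue.swap(Updated);
      }

      Queue.push_back(New);
      UsedUnits.insert(NewUnits.begin(), NewUnits.end());
    }
    std::printf("%u load(s) remain mergeable\n", (unsigned)Queue.size());
    return 0;
  }

For the s5/s7/d2 sequence above this prints "evict S5 (overridden by D2)"
and leaves S7 and D2 mergeable, which is the D2-vs-S5 conflict described
in the message; AddMemOp below implements the same eviction over
MemOpQueue using the real unit registers from TargetRegisterInfo.
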
 lib/Target/ARM/ARMLoadStoreOptimizer.cpp       | 171 +++++++++++++++++-
 .../2013-04-05-overridden-loads-PR14824.ll     | 110 +++++++++++
 2 files changed, 271 insertions(+), 10 deletions(-)
 create mode 100644 test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll

diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index e4e683c2a02..0682459e1cd 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -87,6 +87,53 @@ namespace {
                       MachineBasicBlock::iterator i)
       : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
     };
+    class UnitRegsMap {
+    public:
+      UnitRegsMap(const TargetRegisterInfo* _TRI) : TRI(_TRI) {}
+      const SmallVector<unsigned, 4>& operator[](unsigned Reg) {
+        DenseMap<unsigned, SmallVector<unsigned, 4> >::iterator found =
+          Cache.find(Reg);
+        if (found != Cache.end())
+          return found->second;
+        else
+          return Cache.insert(std::make_pair(Reg, this->getUnitRegs(Reg)))
+                 .first->second;
+      }
+    private:
+      SmallVector<unsigned, 4> getUnitRegs(unsigned Reg) {
+        SmallVector<unsigned, 4> Res;
+
+        const TargetRegisterClass* TRC = TRI->getRegClass(Reg);
+        if (TRC == &ARM::QPRRegClass) {
+          if (Reg > ARM::Q7) {
+            Res.push_back(TRI->getSubReg(Reg, ARM::dsub_0));
+            Res.push_back(TRI->getSubReg(Reg, ARM::dsub_1));
+            return Res;
+          }
+
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_0));
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_1));
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_2));
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_3));
+
+          return Res;
+        }
+
+        if (TRC == &ARM::DPRRegClass && Reg < ARM::D15) {
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_0));
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_1));
+
+          return Res;
+        }
+
+        Res.push_back(Reg);
+
+        return Res;
+      }
+
+      const TargetRegisterInfo* TRI;
+      DenseMap<unsigned, SmallVector<unsigned, 4> > Cache;
+    };
 
     typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
     typedef MemOpQueue::iterator MemOpQueueIter;
@@ -128,6 +175,11 @@ namespace {
                                MachineBasicBlock::iterator MBBI,
                                bool &Advance,
                                MachineBasicBlock::iterator &I);
+    unsigned AddMemOp(MemOpQueue& MemOps,
+                      const MemOpQueueEntry NewEntry,
+                      UnitRegsMap& UnitRegsInfo,
+                      SmallSet<unsigned, 4>& UsedUnitRegs,
+                      unsigned At = -1U);
     bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
     bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
   };
@@ -1213,12 +1265,103 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
   return false;
 }
 
+/// AddMemOp - helper for ARMLoadStoreOpt::LoadStoreMultipleOpti.
+/// It adds store mem ops with a simple push_back/insert method,
+/// without any additional logic.
+/// For load operations it does the following:
+/// 1. Adds the new load operation into the MemOps collection at position
+///    "At".
+/// 2. Removes from MemOps any load operation prior to "At" that changes
+///    the contents of the register loaded by the new operation.
+/// UnitRegsInfo - map of type Map<Register, UnitRegisters-vector>.
+/// UsedUnitRegs - set of unit registers currently in use.
+/// At - position at which the new entry is added, and prior to which the
+///      clean-up is performed (for load operations).
+/// FIXME: The clean-up should also be performed for store operations, but
+/// then the memory address has to be analyzed instead of the unit
+/// registers.
+unsigned ARMLoadStoreOpt::AddMemOp(MemOpQueue& MemOps,
+                                   const MemOpQueueEntry NewEntry,
+                                   UnitRegsMap& UnitRegsInfo,
+                                   SmallSet<unsigned, 4>& UsedUnitRegs,
+                                   unsigned At) {
+  unsigned Cleaned = 0;
+
+  if (At == -1U) {
+    At = MemOps.size();
+    MemOps.push_back(NewEntry);
+  } else
+    MemOps.insert(&MemOps[At], NewEntry);
+
+  // FIXME: If the operation is not a load, leave it as-is for now,
+  // so zero overridden ops are cleaned in this case.
+  if (!NewEntry.MBBI->mayLoad())
+    return 0;
+
+  const SmallVector<unsigned, 4>& NewEntryUnitRegs =
+    UnitRegsInfo[NewEntry.Reg];
+
+  bool FoundOverriddenLoads = false;
+
+  for (unsigned i = 0, e = NewEntryUnitRegs.size(); i != e; ++i)
+    if (UsedUnitRegs.count(NewEntryUnitRegs[i])) {
+      FoundOverriddenLoads = true;
+      break;
+    }
+
+  // If this register is written by load operations that are predecessors
+  // of the new one, remove them from MemOps.
+  if (FoundOverriddenLoads) {
+    MemOpQueue UpdatedMemOps;
+
+    // Scan through the MemOps entries.
+    for (unsigned i = 0; i != At; ++i) {
+      MemOpQueueEntry& MemOpEntry = MemOps[i];
+
+      // FIXME: Skip non-load operations for now.
+      if (!MemOpEntry.MBBI->mayLoad())
+        continue;
+
+      const SmallVector<unsigned, 4>& MemOpUnitRegs =
+        UnitRegsInfo[MemOpEntry.Reg];
+
+      // Look for entries that load their contents into a register used by
+      // the new entry.
+      bool ReleaseThisEntry = false;
+      for (unsigned m = 0, em = MemOpUnitRegs.size(); m != em; ++m) {
+        if (std::find(NewEntryUnitRegs.begin(), NewEntryUnitRegs.end(),
+                      MemOpUnitRegs[m]) != NewEntryUnitRegs.end()) {
+          ReleaseThisEntry = true;
+          ++Cleaned;
+          break;
+        }
+      }
+
+      if (ReleaseThisEntry) {
+        const SmallVector<unsigned, 4>& ReleasedRegs =
+          UnitRegsInfo[MemOpEntry.Reg];
+        for (unsigned r = 0, er = ReleasedRegs.size(); r != er; ++r)
+          UsedUnitRegs.erase(ReleasedRegs[r]);
+      } else
+        UpdatedMemOps.push_back(MemOpEntry);
+    }
+
+    // Keep everything after the At position unchanged.
+    for (unsigned i = At, e = MemOps.size(); i != e; ++i)
+      UpdatedMemOps.push_back(MemOps[i]);
+
+    MemOps.swap(UpdatedMemOps);
+  }
+
+  UsedUnitRegs.insert(NewEntryUnitRegs.begin(), NewEntryUnitRegs.end());
+
+  return Cleaned;
+}
+
 /// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
 /// ops of the same base and incrementing offset into LDM / STM ops.
 bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
   unsigned NumMerges = 0;
   unsigned NumMemOps = 0;
   MemOpQueue MemOps;
+  UnitRegsMap UnitRegsInfo(TRI);
+  SmallSet<unsigned, 4> UsedRegUnits;
   unsigned CurrBase = 0;
   int CurrOpc = -1;
   unsigned CurrSize = 0;
@@ -1265,8 +1408,11 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
         CurrSize = Size;
         CurrPred = Pred;
         CurrPredReg = PredReg;
+
         MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI));
         ++NumMemOps;
+        const SmallVector<unsigned, 4>& EntryUnitRegs = UnitRegsInfo[Reg];
+        UsedRegUnits.insert(EntryUnitRegs.begin(), EntryUnitRegs.end());
         Advance = true;
       } else {
         if (Clobber) {
@@ -1278,20 +1424,24 @@
           // No need to match PredReg.
           // Continue adding to the queue.
           if (Offset > MemOps.back().Offset) {
-            MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill,
-                                             Position, MBBI));
-            ++NumMemOps;
+            unsigned OverridesCleaned =
+              AddMemOp(MemOps,
+                       MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI),
+                       UnitRegsInfo, UsedRegUnits) != 0;
+            NumMemOps += 1 - OverridesCleaned;
             Advance = true;
           } else {
-            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
-                 I != E; ++I) {
-              if (Offset < I->Offset) {
-                MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill,
-                                                 Position, MBBI));
-                ++NumMemOps;
+            for (unsigned I = 0; I != NumMemOps; ++I) {
+              if (Offset < MemOps[I].Offset) {
+                MemOpQueueEntry entry(Offset, Reg, isKill, Position, MBBI);
+                unsigned OverridesCleaned =
+                  AddMemOp(MemOps, entry, UnitRegsInfo,
+                           UsedRegUnits, I) != 0;
+                NumMemOps += 1 - OverridesCleaned;
+                Advance = true;
                 break;
-              } else if (Offset == I->Offset) {
+              } else if (Offset == MemOps[I].Offset) {
                 // Collision! This can't be merged!
                 break;
               }
@@ -1362,6 +1512,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
       CurrPredReg = 0;
       if (NumMemOps) {
         MemOps.clear();
+        UsedRegUnits.clear();
         NumMemOps = 0;
       }
 
diff --git a/test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll b/test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll
new file mode 100644
index 00000000000..2561686c1f8
--- /dev/null
+++ b/test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll
@@ -0,0 +1,110 @@
+; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabi -mcpu=cortex-a9 -mattr=+neon,+neonfp | FileCheck %s
+; This test was provided by Jiangning Liu.
+;CHECK-NOT: vldmia
+
+define void @sample_test(<8 x i64>* %secondSource, <8 x i64>* %source, <8 x i64>* %dest) nounwind {
+entry:
+  %s0 = load <8 x i64>* %source, align 64
+  %s1 = load <8 x i64>* %secondSource, align 64
+  %s2 = bitcast <8 x i64> %s0 to i512
+  %data.i.i.48.extract.shift = lshr i512 %s2, 384
+  %data.i.i.48.extract.trunc = trunc i512 %data.i.i.48.extract.shift to i64
+  %arrayidx64 = getelementptr inbounds <8 x i64>* %source, i32 6
+  %s120 = load <8 x i64>* %arrayidx64, align 64
+  %arrayidx67 = getelementptr inbounds <8 x i64>* %secondSource, i32 6
+  %s121 = load <8 x i64>* %arrayidx67, align 64
+  %s122 = bitcast <8 x i64> %s120 to i512
+  %data.i.i677.48.extract.shift = lshr i512 %s122, 384
+  %data.i.i677.48.extract.trunc = trunc i512 %data.i.i677.48.extract.shift to i64
+  %s123 = insertelement <8 x i64> undef, i64 %data.i.i677.48.extract.trunc, i32 0
+  %data.i.i677.32.extract.shift = lshr i512 %s122, 256
+  %data.i.i677.32.extract.trunc = trunc i512 %data.i.i677.32.extract.shift to i64
+  %s124 = insertelement <8 x i64> %s123, i64 %data.i.i677.32.extract.trunc, i32 1
+  %data.i.i677.16.extract.shift = lshr i512 %s122, 128
+  %data.i.i677.16.extract.trunc = trunc i512 %data.i.i677.16.extract.shift to i64
+  %s125 = insertelement <8 x i64> %s124, i64 %data.i.i677.16.extract.trunc, i32 2
+  %data.i.i677.56.extract.shift = lshr i512 %s122, 448
+  %data.i.i677.56.extract.trunc = trunc i512 %data.i.i677.56.extract.shift to i64
+  %s126 = insertelement <8 x i64> %s125, i64 %data.i.i677.56.extract.trunc, i32 3
+  %data.i.i677.24.extract.shift = lshr i512 %s122, 192
+  %data.i.i677.24.extract.trunc = trunc i512 %data.i.i677.24.extract.shift to i64
+  %s127 = insertelement <8 x i64> %s126, i64 %data.i.i677.24.extract.trunc, i32 4
+  %s128 = insertelement <8 x i64> %s127, i64 %data.i.i677.32.extract.trunc, i32 5
+  %s129 = insertelement <8 x i64> %s128, i64 %data.i.i677.16.extract.trunc, i32 6
+  %s130 = insertelement <8 x i64> %s129, i64 %data.i.i677.56.extract.trunc, i32 7
+  %s131 = bitcast <8 x i64> %s121 to i512
+  %data.i1.i676.48.extract.shift = lshr i512 %s131, 384
+  %data.i1.i676.48.extract.trunc = trunc i512 %data.i1.i676.48.extract.shift to i64
+  %s132 = insertelement <8 x i64> undef, i64 %data.i1.i676.48.extract.trunc, i32 0
+  %data.i1.i676.32.extract.shift = lshr i512 %s131, 256
+  %data.i1.i676.32.extract.trunc = trunc i512 %data.i1.i676.32.extract.shift to i64
+  %s133 = insertelement <8 x i64> %s132, i64 %data.i1.i676.32.extract.trunc, i32 1
+  %data.i1.i676.16.extract.shift = lshr i512 %s131, 128
+  %data.i1.i676.16.extract.trunc = trunc i512 %data.i1.i676.16.extract.shift to i64
+  %s134 = insertelement <8 x i64> %s133, i64 %data.i1.i676.16.extract.trunc, i32 2
+  %data.i1.i676.56.extract.shift = lshr i512 %s131, 448
+  %data.i1.i676.56.extract.trunc = trunc i512 %data.i1.i676.56.extract.shift to i64
+  %s135 = insertelement <8 x i64> %s134, i64 %data.i1.i676.56.extract.trunc, i32 3
+  %data.i1.i676.24.extract.shift = lshr i512 %s131, 192
+  %data.i1.i676.24.extract.trunc = trunc i512 %data.i1.i676.24.extract.shift to i64
+  %s136 = insertelement <8 x i64> %s135, i64 %data.i1.i676.24.extract.trunc, i32 4
+  %s137 = insertelement <8 x i64> %s136, i64 %data.i1.i676.32.extract.trunc, i32 5
+  %s138 = insertelement <8 x i64> %s137, i64 %data.i1.i676.16.extract.trunc, i32 6
+  %s139 = insertelement <8 x i64> %s138, i64 %data.i1.i676.56.extract.trunc, i32 7
+  %vecinit28.i.i699 = shufflevector <8 x i64> %s139, <8 x i64> %s130, <8 x i32>
+  %vecinit35.i.i700 = shufflevector <8 x i64> %vecinit28.i.i699, <8 x i64> %s139, <8 x i32>
+  %vecinit42.i.i701 = shufflevector <8 x i64> %vecinit35.i.i700, <8 x i64> %s139, <8 x i32>
+  %vecinit49.i.i702 = shufflevector <8 x i64> %vecinit42.i.i701, <8 x i64> %s130, <8 x i32>
+  %arrayidx72 = getelementptr inbounds <8 x i64>* %dest, i32 6
+  store <8 x i64> %vecinit49.i.i702, <8 x i64>* %arrayidx72, align 64
+  %arrayidx75 = getelementptr inbounds <8 x i64>* %source, i32 7
+  %s140 = load <8 x i64>* %arrayidx75, align 64
+  %arrayidx78 = getelementptr inbounds <8 x i64>* %secondSource, i32 7
+  %s141 = load <8 x i64>* %arrayidx78, align 64
+  %s142 = bitcast <8 x i64> %s140 to i512
+  %data.i.i650.32.extract.shift = lshr i512 %s142, 256
+  %data.i.i650.32.extract.trunc = trunc i512 %data.i.i650.32.extract.shift to i64
+  %s143 = insertelement <8 x i64> undef, i64 %data.i.i650.32.extract.trunc, i32 0
+  %s144 = insertelement <8 x i64> %s143, i64 %data.i.i650.32.extract.trunc, i32 1
+  %data.i.i650.16.extract.shift = lshr i512 %s142, 128
+  %data.i.i650.16.extract.trunc = trunc i512 %data.i.i650.16.extract.shift to i64
+  %s145 = insertelement <8 x i64> %s144, i64 %data.i.i650.16.extract.trunc, i32 2
+  %data.i.i650.8.extract.shift = lshr i512 %s142, 64
+  %data.i.i650.8.extract.trunc = trunc i512 %data.i.i650.8.extract.shift to i64
+  %s146 = insertelement <8 x i64> %s145, i64 %data.i.i650.8.extract.trunc, i32 3
+  %s147 = insertelement <8 x i64> %s146, i64 %data.i.i650.8.extract.trunc, i32 4
+  %data.i.i650.48.extract.shift = lshr i512 %s142, 384
+  %data.i.i650.48.extract.trunc = trunc i512 %data.i.i650.48.extract.shift to i64
+  %s148 = insertelement <8 x i64> %s147, i64 %data.i.i650.48.extract.trunc, i32 5
+  %s149 = insertelement <8 x i64> %s148, i64 %data.i.i650.16.extract.trunc, i32 6
+  %data.i.i650.0.extract.trunc = trunc i512 %s142 to i64
+  %s150 = insertelement <8 x i64> %s149, i64 %data.i.i650.0.extract.trunc, i32 7
+  %s151 = bitcast <8 x i64> %s141 to i512
+  %data.i1.i649.32.extract.shift = lshr i512 %s151, 256
+  %data.i1.i649.32.extract.trunc = trunc i512 %data.i1.i649.32.extract.shift to i64
+  %s152 = insertelement <8 x i64> undef, i64 %data.i1.i649.32.extract.trunc, i32 0
+  %s153 = insertelement <8 x i64> %s152, i64 %data.i1.i649.32.extract.trunc, i32 1
+  %data.i1.i649.16.extract.shift = lshr i512 %s151, 128
+  %data.i1.i649.16.extract.trunc = trunc i512 %data.i1.i649.16.extract.shift to i64
+  %s154 = insertelement <8 x i64> %s153, i64 %data.i1.i649.16.extract.trunc, i32 2
+  %data.i1.i649.8.extract.shift = lshr i512 %s151, 64
+  %data.i1.i649.8.extract.trunc = trunc i512 %data.i1.i649.8.extract.shift to i64
+  %s155 = insertelement <8 x i64> %s154, i64 %data.i1.i649.8.extract.trunc, i32 3
+  %s156 = insertelement <8 x i64> %s155, i64 %data.i1.i649.8.extract.trunc, i32 4
+  %data.i1.i649.48.extract.shift = lshr i512 %s151, 384
+  %data.i1.i649.48.extract.trunc = trunc i512 %data.i1.i649.48.extract.shift to i64
+  %s157 = insertelement <8 x i64> %s156, i64 %data.i1.i649.48.extract.trunc, i32 5
+  %s158 = insertelement <8 x i64> %s157, i64 %data.i1.i649.16.extract.trunc, i32 6
+  %data.i1.i649.0.extract.trunc = trunc i512 %s151 to i64
+  %s159 = insertelement <8 x i64> %s158, i64 %data.i1.i649.0.extract.trunc, i32 7
+  %vecinit7.i.i669 = shufflevector <8 x i64> %s159, <8 x i64> %s150, <8 x i32>
+  %vecinit14.i.i670 = shufflevector <8 x i64> %vecinit7.i.i669, <8 x i64> %s150, <8 x i32>
+  %vecinit21.i.i671 = shufflevector <8 x i64> %vecinit14.i.i670, <8 x i64> %s150, <8 x i32>
+  %vecinit28.i.i672 = shufflevector <8 x i64> %vecinit21.i.i671, <8 x i64> %s150, <8 x i32>
+  %vecinit35.i.i673 = shufflevector <8 x i64> %vecinit28.i.i672, <8 x i64> %s159, <8 x i32>
+  %vecinit42.i.i674 = shufflevector <8 x i64> %vecinit35.i.i673, <8 x i64> %s159, <8 x i32>
+  %vecinit49.i.i675 = shufflevector <8 x i64> %vecinit42.i.i674, <8 x i64> %s159, <8 x i32>
+  %arrayidx83 = getelementptr inbounds <8 x i64>* %dest, i32 7
+  store <8 x i64> %vecinit49.i.i675, <8 x i64>* %arrayidx83, align 64
+  ret void
+}
-- 
2.34.1