From 74b576041ae3416097d0fe87f6fc3c2ba17ea68a Mon Sep 17 00:00:00 2001 From: Brendon Cahoon Date: Fri, 8 May 2015 20:18:21 +0000 Subject: [PATCH] [Hexagon] Generate more hardware loops Refactored parts of the hardware loop pass to generate more. Also, added more tests. Differential Revision: http://reviews.llvm.org/D9568 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236896 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/HexagonHardwareLoops.cpp | 339 ++++++++++++-------- test/CodeGen/Hexagon/hwloop-lt.ll | 149 ++++----- test/CodeGen/Hexagon/hwloop-missed.ll | 49 +++ test/CodeGen/Hexagon/hwloop-preheader.ll | 40 +++ test/CodeGen/Hexagon/hwloop1.ll | 161 ++++++++++ test/CodeGen/Hexagon/hwloop2.ll | 37 +++ test/CodeGen/Hexagon/hwloop3.ll | 27 ++ test/CodeGen/Hexagon/hwloop4.ll | 76 +++++ 8 files changed, 656 insertions(+), 222 deletions(-) create mode 100644 test/CodeGen/Hexagon/hwloop-missed.ll create mode 100644 test/CodeGen/Hexagon/hwloop-preheader.ll create mode 100644 test/CodeGen/Hexagon/hwloop1.ll create mode 100644 test/CodeGen/Hexagon/hwloop2.ll create mode 100644 test/CodeGen/Hexagon/hwloop3.ll create mode 100644 test/CodeGen/Hexagon/hwloop4.ll diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 7a997c67b72..0cad3d04c14 100644 --- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -21,7 +21,6 @@ // - Countable loops (w/ ind. var for a trip count) // - Assumes loops are normalized by IndVarSimplify // - Try inner-most loops first -// - No nested hardware loops. // - No function calls in loops. 
// //===----------------------------------------------------------------------===// @@ -49,9 +48,18 @@ using namespace llvm; #define DEBUG_TYPE "hwloops" #ifndef NDEBUG -static cl::opt HWLoopLimit("max-hwloop", cl::Hidden, cl::init(-1)); +static cl::opt HWLoopLimit("hexagon-max-hwloop", cl::Hidden, cl::init(-1)); + +// Option to create preheader only for a specific function. +static cl::opt PHFn("hexagon-hwloop-phfn", cl::Hidden, + cl::init("")); #endif +// Option to create a preheader if one doesn't exist. +static cl::opt HWCreatePreheader("hexagon-hwloop-preheader", + cl::Hidden, cl::init(true), + cl::desc("Add a preheader to a hardware loop if one doesn't exist")); + STATISTIC(NumHWLoops, "Number of loops converted to hardware loops"); namespace llvm { @@ -87,14 +95,15 @@ namespace { } private: + /// Kinds of comparisons in the compare instructions. struct Comparison { enum Kind { EQ = 0x01, NE = 0x02, - L = 0x04, // Less-than property. - G = 0x08, // Greater-than property. - U = 0x40, // Unsigned property. + L = 0x04, + G = 0x08, + U = 0x40, LTs = L, LEs = L | EQ, GTs = G, @@ -111,6 +120,23 @@ namespace { return (Kind)(Cmp ^ (L|G)); return Cmp; } + + static Kind getNegatedComparison(Kind Cmp) { + if ((Cmp & L) || (Cmp & G)) + return (Kind)((Cmp ^ (L | G)) ^ EQ); + if ((Cmp & NE) || (Cmp & EQ)) + return (Kind)(Cmp ^ (EQ | NE)); + return (Kind)0; + } + + static bool isSigned(Kind Cmp) { + return (Cmp & (L | G) && !(Cmp & U)); + } + + static bool isUnsigned(Kind Cmp) { + return (Cmp & U); + } + }; /// \brief Find the register that contains the loop controlling @@ -128,6 +154,12 @@ namespace { bool findInductionRegister(MachineLoop *L, unsigned &Reg, int64_t &IVBump, MachineInstr *&IVOp) const; + /// \brief Return the comparison kind for the specified opcode. 
+ Comparison::Kind getComparisonKind(unsigned CondOpc, + MachineOperand *InitialValue, + const MachineOperand *Endvalue, + int64_t IVBump) const; + /// \brief Analyze the statements in a loop to determine if the loop /// has a computable trip count and, if so, return a value that represents /// the trip count expression. @@ -141,12 +173,9 @@ namespace { /// If the trip count is not directly available (as an immediate value, /// or a register), the function will attempt to insert computation of it /// to the loop's preheader. - CountValue *computeCount(MachineLoop *Loop, - const MachineOperand *Start, - const MachineOperand *End, - unsigned IVReg, - int64_t IVBump, - Comparison::Kind Cmp) const; + CountValue *computeCount(MachineLoop *Loop, const MachineOperand *Start, + const MachineOperand *End, unsigned IVReg, + int64_t IVBump, Comparison::Kind Cmp) const; /// \brief Return true if the instruction is not valid within a hardware /// loop. @@ -310,6 +339,18 @@ bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) { return Changed; } +/// \brief Return the latch block if it's one of the exiting blocks. Otherwise, +/// return the exiting block. Return 'null' when multiple exiting blocks are +/// present. 
+static MachineBasicBlock* getExitingBlock(MachineLoop *L) { + if (MachineBasicBlock *Latch = L->getLoopLatch()) { + if (L->isLoopExiting(Latch)) + return Latch; + else + return L->getExitingBlock(); + } + return nullptr; +} bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L, unsigned &Reg, @@ -319,7 +360,8 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L, MachineBasicBlock *Header = L->getHeader(); MachineBasicBlock *Preheader = L->getLoopPreheader(); MachineBasicBlock *Latch = L->getLoopLatch(); - if (!Header || !Preheader || !Latch) + MachineBasicBlock *ExitingBlock = getExitingBlock(L); + if (!Header || !Preheader || !Latch || !ExitingBlock) return false; // This pair represents an induction register together with an immediate @@ -366,10 +408,10 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L, SmallVector Cond; MachineBasicBlock *TB = nullptr, *FB = nullptr; - bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false); + bool NotAnalyzed = TII->AnalyzeBranch(*ExitingBlock, TB, FB, Cond, false); if (NotAnalyzed) return false; - + unsigned PredR, PredPos, PredRegFlags; if (!TII->getPredReg(Cond, PredR, PredPos, PredRegFlags)) return false; @@ -384,7 +426,7 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L, CmpMask, CmpImm); // Fail if the compare was not analyzed, or it's not comparing a register // with an immediate value. Not checking the mask here, since we handle - // the individual compare opcodes (including CMPb) later on. + // the individual compare opcodes (including A4_cmpb*) later on. if (!CmpAnalyzed) return false; @@ -414,6 +456,44 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L, return true; } +// Return the comparison kind for the specified opcode. 
+HexagonHardwareLoops::Comparison::Kind +HexagonHardwareLoops::getComparisonKind(unsigned CondOpc, + MachineOperand *InitialValue, + const MachineOperand *EndValue, + int64_t IVBump) const { + Comparison::Kind Cmp = (Comparison::Kind)0; + switch (CondOpc) { + case Hexagon::C2_cmpeqi: + case Hexagon::C2_cmpeq: + case Hexagon::C2_cmpeqp: + Cmp = Comparison::Kind::EQ; + break; + case Hexagon::C4_cmpneq: + case Hexagon::C4_cmpneqi: + Cmp = Comparison::Kind::NE; + break; + case Hexagon::C4_cmplte: + Cmp = Comparison::Kind::LEs; + break; + case Hexagon::C4_cmplteu: + Cmp = Comparison::Kind::LEu; + break; + case Hexagon::C2_cmpgtui: + case Hexagon::C2_cmpgtu: + case Hexagon::C2_cmpgtup: + Cmp = Comparison::Kind::GTu; + break; + case Hexagon::C2_cmpgti: + case Hexagon::C2_cmpgt: + case Hexagon::C2_cmpgtp: + Cmp = Comparison::Kind::GTs; + break; + default: + return (Comparison::Kind)0; + } + return Cmp; +} /// \brief Analyze the statements in a loop to determine if the loop has /// a computable trip count and, if so, return a value that represents @@ -423,7 +503,7 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L, /// induction variable patterns that are used in the calculation for /// the number of time the loop is executed. CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, - SmallVectorImpl &OldInsts) { + SmallVectorImpl &OldInsts) { MachineBasicBlock *TopMBB = L->getTopBlock(); MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin(); assert(PI != TopMBB->pred_end() && @@ -447,8 +527,8 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, // Look for the cmp instruction to determine if we can get a useful trip // count. The trip count can be either a register or an immediate. The // location of the value depends upon the type (reg or imm). 
- MachineBasicBlock *Latch = L->getLoopLatch(); - if (!Latch) + MachineBasicBlock *ExitingBlock = getExitingBlock(L); + if (!ExitingBlock) return nullptr; unsigned IVReg = 0; @@ -462,6 +542,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, MachineOperand *InitialValue = nullptr; MachineInstr *IV_Phi = MRI->getVRegDef(IVReg); + MachineBasicBlock *Latch = L->getLoopLatch(); for (unsigned i = 1, n = IV_Phi->getNumOperands(); i < n; i += 2) { MachineBasicBlock *MBB = IV_Phi->getOperand(i+1).getMBB(); if (MBB == Preheader) @@ -483,6 +564,17 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, // the header. Otherwise, branch to TB could be exiting the loop, and // the fall through can go to the header. assert (TB && "Latch block without a branch?"); + if (ExitingBlock != Latch && (TB == Latch || FB == Latch)) { + MachineBasicBlock *LTB = 0, *LFB = 0; + SmallVector LCond; + bool NotAnalyzed = TII->AnalyzeBranch(*Latch, LTB, LFB, LCond, false); + if (NotAnalyzed) + return nullptr; + if (TB == Latch) + (LTB == Header) ? TB = LTB: TB = LFB; + else // FB == Latch + (LTB == Header) ? FB = LTB: FB = LFB; + } assert ((!FB || TB == Header || FB == Header) && "Branches not to header?"); if (!TB || (FB && TB != Header && FB != Header)) return nullptr; @@ -533,57 +625,13 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, if (!EndValue) return nullptr; - switch (CondOpc) { - case Hexagon::C2_cmpeqi: - case Hexagon::C2_cmpeq: - Cmp = !Negated ? Comparison::EQ : Comparison::NE; - break; - case Hexagon::C2_cmpgtui: - case Hexagon::C2_cmpgtu: - Cmp = !Negated ? Comparison::GTu : Comparison::LEu; - break; - case Hexagon::C2_cmpgti: - case Hexagon::C2_cmpgt: - Cmp = !Negated ? Comparison::GTs : Comparison::LEs; - break; - // Very limited support for byte/halfword compares. 
- case Hexagon::A4_cmpbeqi: - case Hexagon::A4_cmpheqi: { - if (IVBump != 1) - return nullptr; - - int64_t InitV, EndV; - // Since the comparisons are "ri", the EndValue should be an - // immediate. Check it just in case. - assert(EndValue->isImm() && "Unrecognized latch comparison"); - EndV = EndValue->getImm(); - // Allow InitialValue to be a register defined with an immediate. - if (InitialValue->isReg()) { - if (!defWithImmediate(InitialValue->getReg())) - return nullptr; - InitV = getImmediate(*InitialValue); - } else { - assert(InitialValue->isImm()); - InitV = InitialValue->getImm(); - } - if (InitV >= EndV) - return nullptr; - if (CondOpc == Hexagon::A4_cmpbeqi) { - if (!isInt<8>(InitV) || !isInt<8>(EndV)) - return nullptr; - } else { // Hexagon::CMPhEQri_V4 - if (!isInt<16>(InitV) || !isInt<16>(EndV)) - return nullptr; - } - Cmp = !Negated ? Comparison::EQ : Comparison::NE; - break; - } - default: - return nullptr; - } - + Cmp = getComparisonKind(CondOpc, InitialValue, EndValue, IVBump); + if (!Cmp) + return nullptr; + if (Negated) + Cmp = Comparison::getNegatedComparison(Cmp); if (isSwapped) - Cmp = Comparison::getSwappedComparison(Cmp); + Cmp = Comparison::getSwappedComparison(Cmp); if (InitialValue->isReg()) { unsigned R = InitialValue->getReg(); @@ -637,13 +685,14 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, bool CmpHasEqual = Cmp & Comparison::EQ; // Avoid certain wrap-arounds. This doesn't detect all wrap-arounds. - // If loop executes while iv is "less" with the iv value going down, then - // the iv must wrap. if (CmpLess && IVBump < 0) + // Loop going while iv is "less" with the iv value going down. Must wrap. return nullptr; + // If loop executes while iv is "greater" with the iv value going up, then // the iv must wrap. if (CmpGreater && IVBump > 0) + // Loop going while iv is "greater" with the iv value going up. Must wrap. 
return nullptr; if (Start->isImm() && End->isImm()) { @@ -698,8 +747,9 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, MachineBasicBlock *PH = Loop->getLoopPreheader(); assert (PH && "Should have a preheader by now"); MachineBasicBlock::iterator InsertPos = PH->getFirstTerminator(); - DebugLoc DL = (InsertPos != PH->end()) ? InsertPos->getDebugLoc() - : DebugLoc(); + DebugLoc DL; + if (InsertPos != PH->end()) + DL = InsertPos->getDebugLoc(); // If Start is an immediate and End is a register, the trip count // will be "reg - imm". Hexagon's "subtract immediate" instruction @@ -778,21 +828,35 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, const MCInstrDesc &SubD = RegToReg ? TII->get(Hexagon::A2_sub) : (RegToImm ? TII->get(Hexagon::A2_subri) : TII->get(Hexagon::A2_addi)); - unsigned SubR = MRI->createVirtualRegister(IntRC); - MachineInstrBuilder SubIB = - BuildMI(*PH, InsertPos, DL, SubD, SubR); - - if (RegToReg) { - SubIB.addReg(End->getReg(), 0, End->getSubReg()) - .addReg(Start->getReg(), 0, Start->getSubReg()); - } else if (RegToImm) { - SubIB.addImm(EndV) - .addReg(Start->getReg(), 0, Start->getSubReg()); - } else { // ImmToReg - SubIB.addReg(End->getReg(), 0, End->getSubReg()) - .addImm(-StartV); + if (RegToReg || RegToImm) { + unsigned SubR = MRI->createVirtualRegister(IntRC); + MachineInstrBuilder SubIB = + BuildMI(*PH, InsertPos, DL, SubD, SubR); + + if (RegToReg) + SubIB.addReg(End->getReg(), 0, End->getSubReg()) + .addReg(Start->getReg(), 0, Start->getSubReg()); + else + SubIB.addImm(EndV) + .addReg(Start->getReg(), 0, Start->getSubReg()); + DistR = SubR; + } else { + // If the loop has been unrolled, we should use the original loop count + // instead of recalculating the value. This will avoid additional + // 'Add' instruction. 
+ const MachineInstr *EndValInstr = MRI->getVRegDef(End->getReg()); + if (EndValInstr->getOpcode() == Hexagon::A2_addi && + EndValInstr->getOperand(2).getImm() == StartV) { + DistR = EndValInstr->getOperand(1).getReg(); + } else { + unsigned SubR = MRI->createVirtualRegister(IntRC); + MachineInstrBuilder SubIB = + BuildMI(*PH, InsertPos, DL, SubD, SubR); + SubIB.addReg(End->getReg(), 0, End->getSubReg()) + .addImm(-StartV); + DistR = SubR; + } } - DistR = SubR; DistSR = 0; } @@ -843,8 +907,9 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, bool HexagonHardwareLoops::isInvalidLoopOperation( const MachineInstr *MI) const { - // call is not allowed because the callee may use a hardware loop - if (MI->getDesc().isCall()) + // Call is not allowed because the callee may use a hardware loop except for + // the case when the call never returns. + if (MI->getDesc().isCall() && MI->getOpcode() != Hexagon::CALLv3nr) return true; // do not allow nested hardware loops @@ -959,8 +1024,6 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) { continue; if (Use.isDebug()) UseMI->getOperand(0).setReg(0U); - // This may also be a "instr -> phi -> instr" case which can - // be removed too. } } @@ -1005,10 +1068,6 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { if (containsInvalidInstruction(L)) return false; - // Is the induction variable bump feeding the latch condition? - if (!fixupInductionVariable(L)) - return false; - MachineBasicBlock *LastMBB = L->getExitingBlock(); // Don't generate hw loop if the loop has more than one exit. if (!LastMBB) @@ -1018,16 +1077,19 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { if (LastI == LastMBB->end()) return false; + // Is the induction variable bump feeding the latch condition? + if (!fixupInductionVariable(L)) + return false; + // Ensure the loop has a preheader: the loop instruction will be // placed there. 
- bool NewPreheader = false; MachineBasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { Preheader = createPreheaderForLoop(L); if (!Preheader) return false; - NewPreheader = true; } + MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator(); SmallVector OldInsts; @@ -1042,31 +1104,30 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { // so make sure that the register is actually defined at that point. MachineInstr *TCDef = MRI->getVRegDef(TripCount->getReg()); MachineBasicBlock *BBDef = TCDef->getParent(); - if (!NewPreheader) { - if (!MDT->dominates(BBDef, Preheader)) - return false; - } else { - // If we have just created a preheader, the dominator tree won't be - // aware of it. Check if the definition of the register dominates - // the header, but is not the header itself. - if (!MDT->properlyDominates(BBDef, L->getHeader())) - return false; - } + if (!MDT->dominates(BBDef, Preheader)) + return false; } // Determine the loop start. - MachineBasicBlock *LoopStart = L->getTopBlock(); - if (L->getLoopLatch() != LastMBB) { - // When the exit and latch are not the same, use the latch block as the - // start. - // The loop start address is used only after the 1st iteration, and the - // loop latch may contains instrs. that need to be executed after the - // first iteration. - LoopStart = L->getLoopLatch(); - // Make sure the latch is a successor of the exit, otherwise it won't work. - if (!LastMBB->isSuccessor(LoopStart)) + MachineBasicBlock *TopBlock = L->getTopBlock(); + MachineBasicBlock *ExitingBlock = getExitingBlock(L); + MachineBasicBlock *LoopStart = 0; + if (ExitingBlock != L->getLoopLatch()) { + MachineBasicBlock *TB = 0, *FB = 0; + SmallVector Cond; + + if (TII->AnalyzeBranch(*ExitingBlock, TB, FB, Cond, false)) + return false; + + if (L->contains(TB)) + LoopStart = TB; + else if (L->contains(FB)) + LoopStart = FB; + else return false; } + else + LoopStart = TopBlock; // Convert the loop to a hardware loop. 
DEBUG(dbgs() << "Change to hardware loop at "; L->dump()); @@ -1220,13 +1281,7 @@ void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) { assert(MO.isReg()); unsigned R = MO.getReg(); - MachineInstr *DI = defWithImmediate(R); - if (MRI->hasOneNonDBGUse(R)) { - // If R has only one use, then just change its defining instruction to - // the new immediate value. - DI->getOperand(1).setImm(Val); - return; - } + MachineInstr *DI = MRI->getVRegDef(R); const TargetRegisterClass *RC = MRI->getRegClass(R); unsigned NewR = MRI->createVirtualRegister(RC); @@ -1240,10 +1295,10 @@ void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) { bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { MachineBasicBlock *Header = L->getHeader(); - MachineBasicBlock *Preheader = L->getLoopPreheader(); MachineBasicBlock *Latch = L->getLoopLatch(); + MachineBasicBlock *ExitingBlock = getExitingBlock(L); - if (!Header || !Preheader || !Latch) + if (!(Header && Latch && ExitingBlock)) return false; // These data structures follow the same concept as the corresponding @@ -1271,7 +1326,7 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { unsigned PhiReg = Phi->getOperand(i).getReg(); MachineInstr *DI = MRI->getVRegDef(PhiReg); unsigned UpdOpc = DI->getOpcode(); - bool isAdd = (UpdOpc == Hexagon::A2_addi); + bool isAdd = (UpdOpc == Hexagon::A2_addi || UpdOpc == Hexagon::A2_addp); if (isAdd) { // If the register operand to the add/sub is the PHI we are looking @@ -1412,12 +1467,21 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( if (MachineBasicBlock *TmpPH = L->getLoopPreheader()) return TmpPH; + if (!HWCreatePreheader) + return nullptr; + MachineBasicBlock *Header = L->getHeader(); MachineBasicBlock *Latch = L->getLoopLatch(); + MachineBasicBlock *ExitingBlock = getExitingBlock(L); MachineFunction *MF = Header->getParent(); DebugLoc DL; - if (!Latch || Header->hasAddressTaken()) +#ifndef NDEBUG + if ((PHFn != 
"") && (PHFn != MF->getName())) + return nullptr; +#endif + + if (!Latch || !ExitingBlock || Header->hasAddressTaken()) return nullptr; typedef MachineBasicBlock::instr_iterator instr_iterator; @@ -1429,16 +1493,14 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( SmallVector Tmp1; MachineBasicBlock *TB = nullptr, *FB = nullptr; - if (TII->AnalyzeBranch(*Latch, TB, FB, Tmp1, false)) + if (TII->AnalyzeBranch(*ExitingBlock, TB, FB, Tmp1, false)) return nullptr; for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) { MachineBasicBlock *PB = *I; - if (PB != Latch) { - bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp1, false); - if (NotAnalyzed) - return nullptr; - } + bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp1, false); + if (NotAnalyzed) + return nullptr; } MachineBasicBlock *NewPH = MF->CreateMachineBasicBlock(); @@ -1541,5 +1603,16 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( TII->InsertBranch(*NewPH, Header, nullptr, EmptyCond, DL); NewPH->addSuccessor(Header); + MachineLoop *ParentLoop = L->getParentLoop(); + if (ParentLoop) + ParentLoop->addBasicBlockToLoop(NewPH, MLI->getBase()); + + // Update the dominator information with the new preheader. 
+ if (MDT) { + MachineDomTreeNode *HDom = MDT->getNode(Header); + MDT->addNewBlock(NewPH, HDom->getIDom()->getBlock()); + MDT->changeImmediateDominator(Header, NewPH); + } + return NewPH; } diff --git a/test/CodeGen/Hexagon/hwloop-lt.ll b/test/CodeGen/Hexagon/hwloop-lt.ll index 804f76456e2..7e2ad2a4678 100644 --- a/test/CodeGen/Hexagon/hwloop-lt.ll +++ b/test/CodeGen/Hexagon/hwloop-lt.ll @@ -1,7 +1,6 @@ ; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 < %s | FileCheck %s - -; CHECK: test_pos1_ir_slt +; CHECK-LABEL: @test_pos1_ir_slt ; CHECK: loop0 ; a < b define void @test_pos1_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -9,10 +8,10 @@ entry: %cmp3 = icmp slt i32 8531, %b br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ 8531, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -24,13 +23,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, %b br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos2_ir_slt +; CHECK-LABEL: @test_pos2_ir_slt ; CHECK: loop0 ; a < b define void @test_pos2_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -38,10 +35,10 @@ entry: %cmp3 = icmp slt i32 9152, %b br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ 9152, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -53,13 +50,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, %b br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos4_ir_slt 
+; CHECK-LABEL: @test_pos4_ir_slt ; CHECK: loop0 ; a < b define void @test_pos4_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -67,10 +62,10 @@ entry: %cmp3 = icmp slt i32 18851, %b br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ 18851, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -82,13 +77,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, %b br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos8_ir_slt +; CHECK-LABEL: @test_pos8_ir_slt ; CHECK: loop0 ; a < b define void @test_pos8_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -96,10 +89,10 @@ entry: %cmp3 = icmp slt i32 25466, %b br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ 25466, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -111,13 +104,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, %b br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos16_ir_slt +; CHECK-LABEL: @test_pos16_ir_slt ; CHECK: loop0 ; a < b define void @test_pos16_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -125,10 +116,10 @@ entry: %cmp3 = icmp slt i32 9295, %b br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ 9295, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 
%0 = load i8, i8* %arrayidx, align 1 @@ -140,13 +131,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, %b br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos1_ri_slt +; CHECK-LABEL: @test_pos1_ri_slt ; CHECK: loop0 ; a < b define void @test_pos1_ri_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -154,10 +143,10 @@ entry: %cmp3 = icmp slt i32 %a, 31236 br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -169,13 +158,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, 31236 br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos2_ri_slt +; CHECK-LABEL: @test_pos2_ri_slt ; CHECK: loop0 ; a < b define void @test_pos2_ri_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -183,10 +170,10 @@ entry: %cmp3 = icmp slt i32 %a, 22653 br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -198,13 +185,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, 22653 br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos4_ri_slt +; CHECK-LABEL: @test_pos4_ri_slt ; CHECK: loop0 ; a < b define void @test_pos4_ri_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -212,10 +197,10 @@ entry: %cmp3 = icmp slt i32 %a, 1431 br i1 %cmp3, label 
%for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -227,13 +212,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, 1431 br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos8_ri_slt +; CHECK-LABEL: @test_pos8_ri_slt ; CHECK: loop0 ; a < b define void @test_pos8_ri_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -241,10 +224,10 @@ entry: %cmp3 = icmp slt i32 %a, 22403 br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -256,13 +239,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, 22403 br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos16_ri_slt +; CHECK-LABEL: @test_pos16_ri_slt ; CHECK: loop0 ; a < b define void @test_pos16_ri_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -270,10 +251,10 @@ entry: %cmp3 = icmp slt i32 %a, 21715 br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -285,13 +266,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, 21715 br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, 
%entry +for.end: ret void } - - -; CHECK: test_pos1_rr_slt +; CHECK-LABEL: @test_pos1_rr_slt ; CHECK: loop0 ; a < b define void @test_pos1_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -299,10 +278,10 @@ entry: %cmp3 = icmp slt i32 %a, %b br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -314,13 +293,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, %b br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos2_rr_slt +; CHECK-LABEL: @test_pos2_rr_slt ; CHECK: loop0 ; a < b define void @test_pos2_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -328,10 +305,10 @@ entry: %cmp3 = icmp slt i32 %a, %b br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -343,13 +320,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, %b br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos4_rr_slt +; CHECK-LABEL: @test_pos4_rr_slt ; CHECK: loop0 ; a < b define void @test_pos4_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -357,10 +332,10 @@ entry: %cmp3 = icmp slt i32 %a, %b br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx 
= getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -372,13 +347,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, %b br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos8_rr_slt +; CHECK-LABEL: @test_pos8_rr_slt ; CHECK: loop0 ; a < b define void @test_pos8_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -386,10 +359,10 @@ entry: %cmp3 = icmp slt i32 %a, %b br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -401,13 +374,11 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, %b br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } - - -; CHECK: test_pos16_rr_slt +; CHECK-LABEL: @test_pos16_rr_slt ; CHECK: loop0 ; a < b define void @test_pos16_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { @@ -415,10 +386,10 @@ entry: %cmp3 = icmp slt i32 %a, %b br i1 %cmp3, label %for.body.lr.ph, label %for.end -for.body.lr.ph: ; preds = %entry +for.body.lr.ph: br label %for.body -for.body: ; preds = %for.body.lr.ph, %for.body +for.body: %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04 %0 = load i8, i8* %arrayidx, align 1 @@ -430,7 +401,7 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i32 %inc, %b br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry +for.end: ret void } diff --git a/test/CodeGen/Hexagon/hwloop-missed.ll b/test/CodeGen/Hexagon/hwloop-missed.ll new file mode 100644 index 00000000000..bcc80065229 --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop-missed.ll @@ -0,0 +1,49 @@ 
+; RUN: llc -march=hexagon -hexagon-hwloop-preheader < %s | FileCheck %s + +; Generate hardware loops when we also need to add a new preheader. +; We should generate two hardware loops for this test case. + +; CHECK: loop0 +; CHECK: endloop0 +; CHECK: loop0 +; CHECK: endloop0 + +@g = external global i32 + +define void @test(i32* nocapture %a, i32* nocapture %b, i32 %n) nounwind { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %for.body4.preheader, label %for.body.preheader + +for.body.preheader: + br label %for.body + +for.body: + %arrayidx.phi = phi i32* [ %arrayidx.inc, %for.body ], [ %a, %for.body.preheader ] + %i.014 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %0 = load i32, i32* @g, align 4 + store i32 %0, i32* %arrayidx.phi, align 4 + %inc = add nsw i32 %i.014, 1 + %exitcond15 = icmp eq i32 %inc, 3 + %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1 + br i1 %exitcond15, label %for.body4.preheader.loopexit, label %for.body + +for.body4.preheader.loopexit: + br label %for.body4.preheader + +for.body4.preheader: + br label %for.body4 + +for.body4: + %arrayidx5.phi = phi i32* [ %arrayidx5.inc, %for.body4 ], [ %b, %for.body4.preheader ] + %i1.013 = phi i32 [ %inc7, %for.body4 ], [ 0, %for.body4.preheader ] + %1 = load i32, i32* @g, align 4 + store i32 %1, i32* %arrayidx5.phi, align 4 + %inc7 = add nsw i32 %i1.013, 1 + %exitcond = icmp eq i32 %inc7, 3 + %arrayidx5.inc = getelementptr i32, i32* %arrayidx5.phi, i32 1 + br i1 %exitcond, label %for.end8, label %for.body4 + +for.end8: + ret void +} diff --git a/test/CodeGen/Hexagon/hwloop-preheader.ll b/test/CodeGen/Hexagon/hwloop-preheader.ll new file mode 100644 index 00000000000..66efd2089fc --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop-preheader.ll @@ -0,0 +1,40 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv5 -hexagon-hwloop-preheader < %s +; REQUIRES: asserts + +; Test that the preheader is added to the parent loop, otherwise +; we generate an invalid hardware loop.
+ +; Function Attrs: nounwind readonly +define void @test(i16 signext %n) #0 { +entry: + br i1 undef, label %for.cond4.preheader.preheader.split.us, label %for.end22 + +for.cond4.preheader.preheader.split.us: + %0 = sext i16 %n to i32 + br label %for.body9.preheader.us + +for.body9.us: + %indvars.iv = phi i32 [ %indvars.iv.next.7, %for.body9.us ], [ 0, %for.body9.preheader.us ] + %indvars.iv.next.7 = add i32 %indvars.iv, 8 + %lftr.wideiv.7 = trunc i32 %indvars.iv.next.7 to i16 + %exitcond.7 = icmp slt i16 %lftr.wideiv.7, 0 + br i1 %exitcond.7, label %for.body9.us, label %for.body9.us.ur + +for.body9.preheader.us: + %i.030.us.pmt = phi i32 [ %inc21.us.pmt, %for.end.loopexit.us ], [ 0, %for.cond4.preheader.preheader.split.us ] + br i1 undef, label %for.body9.us, label %for.body9.us.ur + +for.body9.us.ur: + %exitcond.ur.old = icmp eq i16 undef, %n + br i1 %exitcond.ur.old, label %for.end.loopexit.us, label %for.body9.us.ur + +for.end.loopexit.us: + %inc21.us.pmt = add i32 %i.030.us.pmt, 1 + %exitcond33 = icmp eq i32 %inc21.us.pmt, %0 + br i1 %exitcond33, label %for.end22, label %for.body9.preheader.us + +for.end22: + ret void +} + +attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/hwloop1.ll b/test/CodeGen/Hexagon/hwloop1.ll new file mode 100644 index 00000000000..97b779cf962 --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop1.ll @@ -0,0 +1,161 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s +; Check that we generate hardware loop instructions. + +; Case 1 : Loop with a constant number of iterations. 
+; CHECK-LABEL: @hwloop1 +; CHECK: loop0(.LBB{{.}}_{{.}}, #10) +; CHECK: endloop0 + +@a = common global [10 x i32] zeroinitializer, align 4 +define i32 @hwloop1() nounwind { +entry: + br label %for.body +for.body: + %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* @a, i32 0, i32 %i.01 + store i32 %i.01, i32* %arrayidx, align 4 + %inc = add nsw i32 %i.01, 1 + %exitcond = icmp eq i32 %inc, 10 + br i1 %exitcond, label %for.end, label %for.body +for.end: + ret i32 0 +} + +; Case 2 : Loop with a run-time number of iterations. +; CHECK-LABEL: @hwloop2 +; CHECK: loop0(.LBB{{.}}_{{.}}, r{{[0-9]+}}) +; CHECK: endloop0 + +define i32 @hwloop2(i32 %n, i32* nocapture %b) nounwind { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %a.03 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %i.02 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.02 + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %a.03 + %inc = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + %a.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ] + ret i32 %a.0.lcssa +} + +; Case 3 : Induction variable increment more than 1. 
+; CHECK-LABEL: @hwloop3 +; CHECK: lsr(r{{[0-9]+}}, #2) +; CHECK: loop0(.LBB{{.}}_{{.}}, r{{[0-9]+}}) +; CHECK: endloop0 + +define i32 @hwloop3(i32 %n, i32* nocapture %b) nounwind { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %a.03 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %i.02 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.02 + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %a.03 + %inc = add nsw i32 %i.02, 4 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + %a.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ] + ret i32 %a.0.lcssa +} + +; Case 4 : Loop exit compare uses register instead of immediate value. +; CHECK-LABEL: @hwloop4 +; CHECK: loop0(.LBB{{.}}_{{.}}, r{{[0-9]+}}) +; CHECK: endloop0 + +define i32 @hwloop4(i32 %n, i32* nocapture %b) nounwind { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %i.02 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.02 + store i32 %i.02, i32* %arrayidx, align 4 + %inc = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret i32 0 +} + +; Case 5: After LSR, the initial value is 100 and the iv decrements to 0. 
+; CHECK-LABEL: @hwloop5 +; CHECK: loop0(.LBB{{.}}_{{.}}, #100) +; CHECK: endloop0 + +define void @hwloop5(i32* nocapture %a, i32* nocapture %res) nounwind { +entry: + br label %for.body + +for.body: + %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.03 + %0 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %0 + %arrayidx2 = getelementptr inbounds i32, i32* %res, i32 %i.03 + store i32 %mul, i32* %arrayidx2, align 4 + %inc = add nsw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +; Case 6: Large immediate offset +; CHECK-LABEL: @hwloop6 +; CHECK-NOT: loop0(.LBB{{.}}_{{.}}, #1024) +; CHECK: loop0(.LBB{{.}}_{{.}}, r{{[0-9]+}}) +; CHECK: endloop0 + +define void @hwloop6(i32* nocapture %a, i32* nocapture %res) nounwind { +entry: + br label %for.body + +for.body: + %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.02 + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %res, i32 %i.02 + store i32 %0, i32* %arrayidx1, align 4 + %inc = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/CodeGen/Hexagon/hwloop2.ll b/test/CodeGen/Hexagon/hwloop2.ll new file mode 100644 index 00000000000..d411d979904 --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop2.ll @@ -0,0 +1,37 @@ +; RUN: llc -disable-lsr -march=hexagon < %s | FileCheck %s + +; Test for multiple phis with induction variables. 
+ +; CHECK: loop0(.LBB{{.}}_{{.}}, r{{[0-9]+}}) +; CHECK: endloop0 + +define i32 @hwloop4(i32* nocapture %s, i32* nocapture %a, i32 %n) { +entry: + %cmp3 = icmp eq i32 %n, 0 + br i1 %cmp3, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: + %.pre = load i32, i32* %s, align 4 + br label %for.body + +for.body: + %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %add1, %for.body ] + %j.05 = phi i32 [ 0, %for.body.lr.ph ], [ %add2, %for.body ] + %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %n, %for.body.lr.ph ] + %lsr.iv1 = phi i32* [ %scevgep, %for.body ], [ %a, %for.body.lr.ph ] + %1 = load i32, i32* %lsr.iv1, align 4 + %add1 = add nsw i32 %0, %1 + store i32 %add1, i32* %s, align 4 + %add2 = add nsw i32 %j.05, 1 + %lsr.iv.next = add i32 %lsr.iv, -1 + %scevgep = getelementptr i32, i32* %lsr.iv1, i32 1 + %cmp = icmp eq i32 %lsr.iv.next, 0 + br i1 %cmp, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + %j.0.lcssa = phi i32 [ 0, %entry ], [ %add2, %for.end.loopexit ] + ret i32 %j.0.lcssa +} diff --git a/test/CodeGen/Hexagon/hwloop3.ll b/test/CodeGen/Hexagon/hwloop3.ll new file mode 100644 index 00000000000..1135e06a0c4 --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop3.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s +; +; Remove the unconditional jump to the following instruction.
+ +; CHECK: endloop0 +; CHECK-NOT: jump [[L1:.]] +; CHECK-NOT: [[L1]] + +define void @test(i32* nocapture %a, i32 %n) nounwind { +entry: + br label %for.body + +for.body: + %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ] + %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %0 = load i32, i32* %arrayidx.phi, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, i32* %arrayidx.phi, align 4 + %inc = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %inc, 100 + %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + diff --git a/test/CodeGen/Hexagon/hwloop4.ll b/test/CodeGen/Hexagon/hwloop4.ll new file mode 100644 index 00000000000..d159c45e3fb --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop4.ll @@ -0,0 +1,76 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s +; +; Remove the unnecessary 'add' instruction used for the hardware loop setup. + +; CHECK: [[OP0:r[0-9]+]] = add([[OP1:r[0-9]+]], #-[[OP2:[0-9]+]] +; CHECK-NOT: add([[OP0]], #[[OP2]]) +; CHECK: lsr([[OP1]], #{{[0-9]+}}) +; CHECK: loop0 + +define void @matrix_mul_matrix(i32 %N, i32* nocapture %C, i16* nocapture readnone %A, i16* nocapture readnone %B) #0 { +entry: + %cmp4 = icmp eq i32 %N, 0 + br i1 %cmp4, label %for.end, label %for.body.preheader + +for.body.preheader: + %maxval = add i32 %N, -7 + %0 = icmp sgt i32 %maxval, 0 + br i1 %0, label %for.body.preheader9, label %for.body.ur.preheader + +for.body.preheader9: + br label %for.body + +for.body: + %arrayidx.phi = phi i32* [ %arrayidx.inc.7, %for.body ], [ %C, %for.body.preheader9 ] + %i.05 = phi i32 [ %inc.7, %for.body ], [ 0, %for.body.preheader9 ] + store i32 %i.05, i32* %arrayidx.phi, align 4 + %inc = add i32 %i.05, 1 + %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1 + store i32 %inc, i32* %arrayidx.inc, align 4 + %inc.1 = add i32 %i.05, 2 + %arrayidx.inc.1 = getelementptr i32, i32* %arrayidx.phi, i32 2 + store i32 %inc.1, 
i32* %arrayidx.inc.1, align 4 + %inc.2 = add i32 %i.05, 3 + %arrayidx.inc.2 = getelementptr i32, i32* %arrayidx.phi, i32 3 + store i32 %inc.2, i32* %arrayidx.inc.2, align 4 + %inc.3 = add i32 %i.05, 4 + %arrayidx.inc.3 = getelementptr i32, i32* %arrayidx.phi, i32 4 + store i32 %inc.3, i32* %arrayidx.inc.3, align 4 + %inc.4 = add i32 %i.05, 5 + %arrayidx.inc.4 = getelementptr i32, i32* %arrayidx.phi, i32 5 + store i32 %inc.4, i32* %arrayidx.inc.4, align 4 + %inc.5 = add i32 %i.05, 6 + %arrayidx.inc.5 = getelementptr i32, i32* %arrayidx.phi, i32 6 + store i32 %inc.5, i32* %arrayidx.inc.5, align 4 + %inc.6 = add i32 %i.05, 7 + %arrayidx.inc.6 = getelementptr i32, i32* %arrayidx.phi, i32 7 + store i32 %inc.6, i32* %arrayidx.inc.6, align 4 + %inc.7 = add i32 %i.05, 8 + %exitcond.7 = icmp slt i32 %inc.7, %maxval + %arrayidx.inc.7 = getelementptr i32, i32* %arrayidx.phi, i32 8 + br i1 %exitcond.7, label %for.body, label %for.end.loopexit.ur-lcssa + +for.end.loopexit.ur-lcssa: + %1 = icmp eq i32 %inc.7, %N + br i1 %1, label %for.end, label %for.body.ur.preheader + +for.body.ur.preheader: + %arrayidx.phi.ur.ph = phi i32* [ %C, %for.body.preheader ], [ %arrayidx.inc.7, %for.end.loopexit.ur-lcssa ] + %i.05.ur.ph = phi i32 [ 0, %for.body.preheader ], [ %inc.7, %for.end.loopexit.ur-lcssa ] + br label %for.body.ur + +for.body.ur: + %arrayidx.phi.ur = phi i32* [ %arrayidx.inc.ur, %for.body.ur ], [ %arrayidx.phi.ur.ph, %for.body.ur.preheader ] + %i.05.ur = phi i32 [ %inc.ur, %for.body.ur ], [ %i.05.ur.ph, %for.body.ur.preheader ] + store i32 %i.05.ur, i32* %arrayidx.phi.ur, align 4 + %inc.ur = add i32 %i.05.ur, 1 + %exitcond.ur = icmp eq i32 %inc.ur, %N + %arrayidx.inc.ur = getelementptr i32, i32* %arrayidx.phi.ur, i32 1 + br i1 %exitcond.ur, label %for.end.loopexit, label %for.body.ur + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} -- 2.34.1