From: Evan Cheng
Date: Sat, 15 Aug 2009 07:59:10 +0000 (+0000)
Subject: Turn on if-conversion for thumb2.
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=bc9b754091ea281e769e487f396b40f6675b9edb;p=oota-llvm.git

Turn on if-conversion for thumb2.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@79084 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 4810d3baabc..f4d1ef373dc 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -272,6 +272,11 @@ const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB,
   return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead));
 }
 
+static inline
+const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) {
+  return MIB.addReg(0);
+}
+
 static inline
 bool isUncondBranchOpcode(int Opc) {
   return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index a4471000b0c..9de737be99a 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -391,26 +391,19 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setStackPointerRegisterToSaveRestore(ARM::SP);
   setSchedulingPreference(SchedulingForRegPressure);
-  setIfCvtBlockSizeLimit(Subtarget->isThumb() ? 0 : 10);
-  setIfCvtDupBlockSizeLimit(Subtarget->isThumb() ? 0 : 2);
-
-  if (!Subtarget->isThumb()) {
-    // Use branch latency information to determine if-conversion limits.
-    // FIXME: If-converter should use instruction latency of the branch being
-    // eliminated to compute the threshold. For ARMv6, the branch "latency"
-    // varies depending on whether it's dynamically or statically predicted
-    // and on whether the destination is in the prefetch buffer.
-    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-    const InstrItineraryData &InstrItins = Subtarget->getInstrItineraryData();
-    unsigned Latency= InstrItins.getLatency(TII->get(ARM::Bcc).getSchedClass());
-    if (Latency > 1) {
-      setIfCvtBlockSizeLimit(Latency-1);
-      if (Latency > 2)
-        setIfCvtDupBlockSizeLimit(Latency-2);
-    } else {
-      setIfCvtBlockSizeLimit(10);
-      setIfCvtDupBlockSizeLimit(2);
-    }
-  }
+
+  // FIXME: If-converter should use instruction latency to determine
+  // profitability rather than relying on fixed limits.
+  if (Subtarget->getCPUString() == "generic") {
+    // Generic (and overly aggressive) if-conversion limits.
+    setIfCvtBlockSizeLimit(10);
+    setIfCvtDupBlockSizeLimit(2);
+  } else if (Subtarget->hasV6Ops()) {
+    setIfCvtBlockSizeLimit(2);
+    setIfCvtDupBlockSizeLimit(1);
+  } else {
+    setIfCvtBlockSizeLimit(3);
+    setIfCvtDupBlockSizeLimit(2);
+  }
 
   maxStoresPerMemcpy = 1;   //// temporary - rewrite interface to use type
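For orientation before the target-specific changes below: setIfCvtBlockSizeLimit caps how many instructions the if-converter will predicate in a candidate block, and setIfCvtDupBlockSizeLimit caps how many it will duplicate into a predecessor. A minimal sketch of how such limits gate the decision follows; this is a simplified model for illustration only, not the actual target-independent IfConverter cost logic, and the NumInstrs/NumDups counts are assumed inputs.

    // Simplified model: predication trades a branch (and a possible
    // misprediction) for unconditionally issuing the block, so it only
    // pays off when the block -- and any duplicated tail -- is small.
    bool profitableToIfConvert(unsigned NumInstrs, unsigned NumDups,
                               unsigned BlockSizeLimit, unsigned DupSizeLimit) {
      if (NumInstrs == 0 || NumInstrs > BlockSizeLimit)
        return false;                   // too large to execute unconditionally
      return NumDups <= DupSizeLimit;   // duplicated instructions stay bounded
    }

Per the FIXME above, these fixed limits stand in for per-CPU branch latency data: the "generic" CPU keeps the old aggressive 10/2 limits, while known v6 cores get conservative ones.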
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 972c1f82f6b..d54e20f59a3 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -39,12 +39,14 @@ def GenericItineraries : ProcessorItineraries<[
   InstrItinData]>,
   InstrItinData]>,
   InstrItinData]>,
-  InstrItinData, InstrStage<1, [FU_LdSt0]>]>,
+  InstrItinData,
+                InstrStage<1, [FU_LdSt0]>]>,
   InstrItinData]>,
   InstrItinData]>,
   InstrItinData]>,
   InstrItinData]>,
-  InstrItinData, InstrStage<1, [FU_LdSt0]>]>,
+  InstrItinData,
+                InstrStage<1, [FU_LdSt0]>]>,
   InstrItinData]>
 ]>;
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td
index e1d6de9353c..e594f527868 100644
--- a/lib/Target/ARM/ARMScheduleV6.td
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -18,11 +18,13 @@ def V6Itineraries : ProcessorItineraries<[
   InstrItinData]>,
   InstrItinData]>,
   InstrItinData]>,
-  InstrItinData, InstrStage<1, [FU_LdSt0]>]>,
+  InstrItinData,
+                InstrStage<1, [FU_LdSt0]>]>,
   InstrItinData]>,
   InstrItinData]>,
   InstrItinData]>,
   InstrItinData]>,
-  InstrItinData, InstrStage<1, [FU_LdSt0]>]>,
+  InstrItinData,
+                InstrStage<1, [FU_LdSt0]>]>,
   InstrItinData]>
 ]>;
diff --git a/lib/Target/ARM/ARMScheduleV7.td b/lib/Target/ARM/ARMScheduleV7.td
index 78537a515ff..9c3b5e34c5c 100644
--- a/lib/Target/ARM/ARMScheduleV7.td
+++ b/lib/Target/ARM/ARMScheduleV7.td
@@ -55,11 +55,13 @@ def CortexA9Itineraries : ProcessorItineraries<[
   InstrItinData]>,
   InstrItinData]>,
   InstrItinData]>,
-  InstrItinData, InstrStage<1, [FU_LdSt0]>]>,
+  InstrItinData,
+                InstrStage<1, [FU_LdSt0]>]>,
   InstrItinData]>,
   InstrItinData]>,
   InstrItinData]>,
   InstrItinData]>,
-  InstrItinData, InstrStage<1, [FU_LdSt0]>]>,
+  InstrItinData,
+                InstrStage<1, [FU_LdSt0]>]>,
   InstrItinData]>
 ]>;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index fbc5f38efea..fdfa3a32e5a 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -75,17 +75,15 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT,
 ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT,
                                        const std::string &FS)
   : ARMBaseTargetMachine(T, TT, FS, true),
+    InstrInfo(Subtarget.hasThumb2()
+              ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
+              : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
     DataLayout(Subtarget.isAPCS_ABI() ?
                std::string("e-p:32:32-f64:32:32-i64:32:32-"
                            "i16:16:32-i8:8:32-i1:8:32-a:0:32") :
                std::string("e-p:32:32-f64:64:64-i64:64:64-"
                            "i16:16:32-i8:8:32-i1:8:32-a:0:32")),
     TLInfo(*this) {
-  // Create the approriate type of Thumb InstrInfo
-  if (Subtarget.hasThumb2())
-    InstrInfo = new Thumb2InstrInfo(Subtarget);
-  else
-    InstrInfo = new Thumb1InstrInfo(Subtarget);
 }
 
@@ -116,7 +114,7 @@ bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM,
     PM.add(createARMLoadStoreOptimizationPass());
 
   if (OptLevel != CodeGenOpt::None &&
-      !DisableIfConversion && !Subtarget.isThumb())
+      !DisableIfConversion && !Subtarget.isThumb1Only())
     PM.add(createIfConverterPass());
 
   if (Subtarget.isThumb2()) {
diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
index 178b7a20862..72fd7e4317c 100644
--- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
@@ -702,7 +702,7 @@ ARMAsmPrinter::printThumbITMask(const MachineInstr *MI, int Op) {
   unsigned NumTZ = CountTrailingZeros_32(Mask);
   assert(NumTZ <= 3 && "Invalid IT mask!");
   for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
-    bool T = (Mask & (1 << Pos)) != 0;
+    bool T = (Mask & (1 << Pos)) == 0;
     if (T)
       O << 't';
     else
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index da7228b3457..e74a526afae 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -66,23 +66,19 @@ bool Thumb2ITBlockPass::InsertITBlocks(MachineBasicBlock &MBB) {
       .addImm(CC);
     ++MBBI;
 
-    // Finalize IT mask. If the following instruction is not predicated or it's
-    // predicated on a condition that's not the same or the opposite of CC, then
-    // the mask is 0x8.
+    // Finalize IT mask.
     ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
-    unsigned Mask = 0x8;
-    while (MBBI != E && !(Mask & 1)) {
+    unsigned Mask = 0, Pos = 3;
+    while (MBBI != E && Pos) {
       ARMCC::CondCodes NCC = getPredicate(&*MBBI, TII);
-      if (NCC == CC) {
-        Mask >>= 1;
-        Mask |= 0x8;
-      } else if (NCC == OCC) {
-        Mask >>= 1;
-      } else {
+      if (NCC == OCC) {
+        Mask |= (1 << Pos);
+      } else if (NCC != CC)
         break;
-      }
+      --Pos;
       ++MBBI;
     }
+    Mask |= (1 << Pos);
     MIB.addImm(Mask);
     Modified = true;
     ++NumITs;
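Taken together, the printer and pass hunks above pin down the new IT mask convention: Thumb2ITBlockPass sets bit Pos when the instruction in that slot is predicated on the opposite condition, and the lowest set bit marks where the block ends; printThumbITMask then walks bits 3 down to that terminator, printing 't' for clear bits and 'e' for set bits. A standalone sketch of the decode side, with CountTrailingZeros_32 replaced by a plain loop (illustrative only, not the LLVM code itself):

    #include <cassert>
    #include <string>

    // Decode a 4-bit IT mask into its 't'/'e' suffix, mirroring
    // printThumbITMask: "" -> it, "t" -> itt, "e" -> ite, and so on.
    std::string itSuffix(unsigned Mask) {
      assert(Mask != 0 && Mask < 16 && "4-bit mask with a terminator bit");
      unsigned NumTZ = 0;
      while (!(Mask & (1u << NumTZ)))   // lowest set bit terminates the block
        ++NumTZ;
      std::string S;
      for (unsigned Pos = 3; Pos > NumTZ; --Pos)
        S += (Mask & (1u << Pos)) ? 'e' : 't';  // set bit = opposite condition
      return S;
    }

For example, a two-instruction block whose second instruction uses the opposite condition yields mask 0xC, which decodes to "e", i.e. an "ite" block.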
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 0a86a75ff7c..b3ed8e84f92 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -129,7 +129,7 @@ namespace {
     static char ID;
     Thumb2SizeReduce();
 
-    const TargetInstrInfo *TII;
+    const Thumb2InstrInfo *TII;
 
     virtual bool runOnMachineFunction(MachineFunction &MF);
 
@@ -454,8 +454,12 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
   DebugLoc dl = MI->getDebugLoc();
   MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
   MIB.addOperand(MI->getOperand(0));
-  if (HasCC && NewTID.hasOptionalDef())
-    AddDefaultT1CC(MIB, CCDead);
+  if (NewTID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
 
   // Transfer the rest of operands.
   unsigned NumOps = TID.getNumOperands();
@@ -534,8 +538,12 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
   DebugLoc dl = MI->getDebugLoc();
   MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
   MIB.addOperand(MI->getOperand(0));
-  if (HasCC && NewTID.hasOptionalDef())
-    AddDefaultT1CC(MIB, CCDead);
+  if (NewTID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
 
   // Transfer the rest of operands.
   unsigned NumOps = TID.getNumOperands();
@@ -659,7 +667,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
 
 bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
   const TargetMachine &TM = MF.getTarget();
-  TII = TM.getInstrInfo();
+  TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
   bool Modified = false;
 
   for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
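The two ReduceTo* hunks above apply the same fix: when the narrow Thumb1 opcode models an optional CPSR def, that operand slot must be filled even if the original 32-bit instruction sets no flags, since later code addresses operands by index. A condensed sketch of the shared pattern as a hypothetical helper (the helper itself is not in the patch; the variable names and calls are):

    #include "ARMBaseInstrInfo.h"
    #include "llvm/CodeGen/MachineInstrBuilder.h"
    using namespace llvm;

    // Build the narrow replacement and always populate the optional
    // CPSR-def slot: a real (possibly dead) CPSR def when flags are set,
    // otherwise the reg0 placeholder added by AddNoT1CC.
    static MachineInstrBuilder buildNarrow(MachineBasicBlock &MBB,
                                           MachineInstr *MI,
                                           const TargetInstrDesc &NewTID,
                                           bool HasCC, bool CCDead) {
      MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), NewTID);
      MIB.addOperand(MI->getOperand(0));  // destination operand
      if (NewTID.hasOptionalDef()) {
        if (HasCC)
          AddDefaultT1CC(MIB, CCDead);    // defines CPSR (maybe dead)
        else
          AddNoT1CC(MIB);                 // slot kept, no CPSR def
      }
      return MIB;  // caller transfers the remaining operands
    }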
diff --git a/test/CodeGen/Thumb2/frameless2.ll b/test/CodeGen/Thumb2/frameless2.ll
new file mode 100644
index 00000000000..0fd480ee1f7
--- /dev/null
+++ b/test/CodeGen/Thumb2/frameless2.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc -mtriple=thumbv7-apple-darwin -disable-fp-elim | not grep r7
+
+%struct.noise3 = type { [3 x [17 x i32]] }
+%struct.noiseguard = type { i32, i32, i32 }
+
+define arm_apcscc void @vorbis_encode_noisebias_setup(i8* nocapture %vi.0.7.val, double %s, i32 %block, i32* nocapture %suppress, %struct.noise3* nocapture %in, %struct.noiseguard* nocapture %guard, double %userbias) nounwind {
+entry:
+  %0 = getelementptr %struct.noiseguard* %guard, i32 %block, i32 2   ; [#uses=1]
+  %1 = load i32* %0, align 4   ; [#uses=1]
+  store i32 %1, i32* undef, align 4
+  unreachable
+}
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt1.ll b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
new file mode 100644
index 00000000000..e04ef29c8b3
--- /dev/null
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
@@ -0,0 +1,84 @@
+; RUN: llvm-as < %s | llc -march=thumb -mattr=+thumb2 | FileCheck %s
+
+define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK: t1:
+; CHECK: it ne
+; CHECK: cmpne
+  switch i32 %c, label %cond_next [
+    i32 1, label %cond_true
+    i32 7, label %cond_true
+  ]
+
+cond_true:
+  %tmp12 = add i32 %a, 1
+  %tmp1518 = add i32 %tmp12, %b
+  ret i32 %tmp1518
+
+cond_next:
+  %tmp15 = add i32 %b, %a
+  ret i32 %tmp15
+}
+
+; FIXME: Check for # of unconditional branch after adding branch folding post ifcvt.
+define i32 @t2(i32 %a, i32 %b) {
+entry:
+; CHECK: t2:
+; CHECK: ite le
+; CHECK: suble
+; CHECK: subgt
+  %tmp1434 = icmp eq i32 %a, %b   ; [#uses=1]
+  br i1 %tmp1434, label %bb17, label %bb.outer
+
+bb.outer:   ; preds = %cond_false, %entry
+  %b_addr.021.0.ph = phi i32 [ %b, %entry ], [ %tmp10, %cond_false ]   ; [#uses=5]
+  %a_addr.026.0.ph = phi i32 [ %a, %entry ], [ %a_addr.026.0, %cond_false ]   ; [#uses=1]
+  br label %bb
+
+bb:   ; preds = %cond_true, %bb.outer
+  %indvar = phi i32 [ 0, %bb.outer ], [ %indvar.next, %cond_true ]   ; [#uses=2]
+  %tmp. = sub i32 0, %b_addr.021.0.ph   ; [#uses=1]
+  %tmp.40 = mul i32 %indvar, %tmp.   ; [#uses=1]
+  %a_addr.026.0 = add i32 %tmp.40, %a_addr.026.0.ph   ; [#uses=6]
+  %tmp3 = icmp sgt i32 %a_addr.026.0, %b_addr.021.0.ph   ; [#uses=1]
+  br i1 %tmp3, label %cond_true, label %cond_false
+
+cond_true:   ; preds = %bb
+  %tmp7 = sub i32 %a_addr.026.0, %b_addr.021.0.ph   ; [#uses=2]
+  %tmp1437 = icmp eq i32 %tmp7, %b_addr.021.0.ph   ; [#uses=1]
+  %indvar.next = add i32 %indvar, 1   ; [#uses=1]
+  br i1 %tmp1437, label %bb17, label %bb
+
+cond_false:   ; preds = %bb
+  %tmp10 = sub i32 %b_addr.021.0.ph, %a_addr.026.0   ; [#uses=2]
+  %tmp14 = icmp eq i32 %a_addr.026.0, %tmp10   ; [#uses=1]
+  br i1 %tmp14, label %bb17, label %bb.outer
+
+bb17:   ; preds = %cond_false, %cond_true, %entry
+  %a_addr.026.1 = phi i32 [ %a, %entry ], [ %tmp7, %cond_true ], [ %a_addr.026.0, %cond_false ]   ; [#uses=1]
+  ret i32 %a_addr.026.1
+}
+
+@x = external global i32*   ; [#uses=1]
+
+define void @foo(i32 %a) {
+entry:
+  %tmp = load i32** @x   ; [#uses=1]
+  store i32 %a, i32* %tmp
+  ret void
+}
+
+define void @t3(i32 %a, i32 %b) {
+entry:
+; CHECK: t3:
+; CHECK: it lt
+; CHECK: poplt {r7, pc}
+  %tmp1 = icmp sgt i32 %a, 10   ; [#uses=1]
+  br i1 %tmp1, label %cond_true, label %UnifiedReturnBlock
+
+cond_true:   ; preds = %entry
+  tail call void @foo( i32 %b )
+  ret void
+
+UnifiedReturnBlock:   ; preds = %entry
+  ret void
+}
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt2.ll b/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
new file mode 100644
index 00000000000..9e26da0782e
--- /dev/null
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
@@ -0,0 +1,93 @@
+; RUN: llvm-as < %s | llc -march=thumb -mattr=+thumb2 | FileCheck %s
+
+define void @foo(i32 %X, i32 %Y) {
+entry:
+; CHECK: foo:
+; CHECK: it ne
+; CHECK: cmpne
+; CHECK: it hi
+; CHECK: pophi {r7, pc}
+  %tmp1 = icmp ult i32 %X, 4   ; [#uses=1]
+  %tmp4 = icmp eq i32 %Y, 0   ; [#uses=1]
+  %tmp7 = or i1 %tmp4, %tmp1   ; [#uses=1]
+  br i1 %tmp7, label %cond_true, label %UnifiedReturnBlock
+
+cond_true:   ; preds = %entry
+  %tmp10 = tail call i32 (...)* @bar( )   ; [#uses=0]
+  ret void
+
+UnifiedReturnBlock:   ; preds = %entry
+  ret void
+}
+
+declare i32 @bar(...)
+
+; FIXME: Need post-ifcvt branch folding to get rid of the extra br at end of BB1.
+
+  %struct.quad_struct = type { i32, i32, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct* }
+
+define fastcc i32 @CountTree(%struct.quad_struct* %tree) {
+entry:
+; CHECK: CountTree:
+; CHECK: it eq
+; CHECK: cmpeq
+; CHECK: beq.n
+; CHECK: itt eq
+; CHECK: moveq
+; CHECK: popeq
+  br label %tailrecurse
+
+tailrecurse:   ; preds = %bb, %entry
+  %tmp6 = load %struct.quad_struct** null   ; <%struct.quad_struct*> [#uses=1]
+  %tmp9 = load %struct.quad_struct** null   ; <%struct.quad_struct*> [#uses=2]
+  %tmp12 = load %struct.quad_struct** null   ; <%struct.quad_struct*> [#uses=1]
+  %tmp14 = icmp eq %struct.quad_struct* null, null   ; [#uses=1]
+  %tmp17 = icmp eq %struct.quad_struct* %tmp6, null   ; [#uses=1]
+  %tmp23 = icmp eq %struct.quad_struct* %tmp9, null   ; [#uses=1]
+  %tmp29 = icmp eq %struct.quad_struct* %tmp12, null   ; [#uses=1]
+  %bothcond = and i1 %tmp17, %tmp14   ; [#uses=1]
+  %bothcond1 = and i1 %bothcond, %tmp23   ; [#uses=1]
+  %bothcond2 = and i1 %bothcond1, %tmp29   ; [#uses=1]
+  br i1 %bothcond2, label %return, label %bb
+
+bb:   ; preds = %tailrecurse
+  %tmp41 = tail call fastcc i32 @CountTree( %struct.quad_struct* %tmp9 )   ; [#uses=0]
+  br label %tailrecurse
+
+return:   ; preds = %tailrecurse
+  ret i32 0
+}
+
+  %struct.SString = type { i8*, i32, i32 }
+
+declare void @abort()
+
+define fastcc void @t1(%struct.SString* %word, i8 signext %c) {
+entry:
+; CHECK: t1:
+; CHECK: it ne
+; CHECK: popne {r7, pc}
+  %tmp1 = icmp eq %struct.SString* %word, null   ; [#uses=1]
+  br i1 %tmp1, label %cond_true, label %cond_false
+
+cond_true:   ; preds = %entry
+  tail call void @abort( )
+  unreachable
+
+cond_false:   ; preds = %entry
+  ret void
+}
+
+define fastcc void @t2() nounwind {
+entry:
+; CHECK: t2:
+; CHECK: cmp r0, #0
+; CHECK: bne.n
+  br i1 undef, label %bb.i.i3, label %growMapping.exit
+
+bb.i.i3:   ; preds = %entry
+  unreachable
+
+growMapping.exit:   ; preds = %entry
+  unreachable
+}
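To connect the tests back to the encoding: the CHECK lines across these files ("it ne", "ite le", "itt eq") correspond to IT masks 0x8, 0xC, and 0x4 under the convention Thumb2ITBlockPass now emits. A self-contained sketch of that encoding walk, with a 't'/'e' string standing in for the same/opposite predicate comparison (illustrative only; not the ARMCC API):

    #include <cstdio>

    // Build a 4-bit IT mask the way Thumb2ITBlockPass::InsertITBlocks does:
    // set bit Pos for each follower on the opposite condition ('e'), then
    // mark the block length with a terminator bit at the final position.
    unsigned encodeITMask(const char *Followers) {
      unsigned Mask = 0, Pos = 3;
      for (const char *P = Followers; *P && Pos; ++P, --Pos)
        if (*P == 'e')
          Mask |= (1u << Pos);   // opposite-condition slot
      Mask |= (1u << Pos);        // terminator: lowest set bit
      return Mask;
    }

    int main() {
      std::printf("it  -> %#x\n", encodeITMask(""));   // 0x8, e.g. "it ne"
      std::printf("ite -> %#x\n", encodeITMask("e"));  // 0xc, e.g. "ite le"
      std::printf("itt -> %#x\n", encodeITMask("t"));  // 0x4, e.g. "itt eq"
      return 0;
    }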