#include "llvm/CallingConv.h"
#include "llvm/InlineAsm.h"
#include "llvm/Attributes.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CallSite.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
/// a sequence of several shifts, adds, and multiplies for this target.
bool isIntDivCheap() const { return IntDivIsCheap; }
+ /// isSlowDivBypassed - Returns true if target has indicated at least one
+ /// type should be bypassed.
+ bool isSlowDivBypassed() const { return !BypassSlowDivTypes.empty(); }
+
+ /// getBypassSlowDivTypes - Returns map of slow types for division or
+ /// remainder with corresponding fast types
+ const DenseMap<Type *, Type *> &getBypassSlowDivTypes() const {
+ return BypassSlowDivTypes;
+ }
+
/// isPow2DivCheap() - Return true if pow2 div is cheaper than a chain of
/// srl/add/sra.
bool isPow2DivCheap() const { return Pow2DivIsCheap; }
/// of instructions not containing an integer divide.
void setIntDivIsCheap(bool isCheap = true) { IntDivIsCheap = isCheap; }
+ /// addBypassSlowDivType - Tells the code generator which types to bypass.
+ void addBypassSlowDivType(Type *slow_type, Type *fast_type) {
+ BypassSlowDivTypes[slow_type] = fast_type;
+ }
+
/// setPow2DivIsCheap - Tells the code generator that it shouldn't generate
/// srl/add/sra for a signed divide by power of two, and let the target handle
/// it.
/// set to true unconditionally.
bool IntDivIsCheap;
+ /// BypassSlowDivTypes - Tells the code generator to bypass slow divide or
+ /// remainder instructions. For example, SlowDivBypass[i32,u8] tells the code
+ /// generator to bypass 32-bit signed integer div/rem with an 8-bit unsigned
+ /// integer div/rem when the operands are positive and less than 256.
+ DenseMap <Type *, Type *> BypassSlowDivTypes;
+
/// Pow2DivIsCheap - Tells the code generator that it shouldn't generate
/// srl/add/sra for a signed divide by power of two, and let the target handle
/// it.
--- /dev/null
+//===- llvm/Transforms/Utils/BypassSlowDivision.h --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an optimization for div and rem on architectures that
+// execute short instructions significantly faster than longer instructions.
+// For example, on Intel Atom 32-bit divides are slow enough that during
+// runtime it is profitable to check the value of the operands, and if they are
+// positive and less than 256 use an unsigned 8-bit divide.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
+#define TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
+
+#include "llvm/Function.h"
+
+/// This optimization identifies DIV instructions that can be
+/// profitably bypassed and carried out with a shorter, faster divide.
+bool bypassSlowDivision(llvm::Function &F,
+ llvm::Function::iterator &I,
+ const llvm::DenseMap<llvm::Type *, llvm::Type *> &BypassTypeMap);
+
+#endif
+//===- llvm/Transforms/Utils/BypassSlowDivision.h --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an optimization for div and rem on architectures that
+// execute short instructions significantly faster than longer instructions.
+// For example, on Intel Atom 32-bit divides are slow enough that during
+// runtime it is profitable to check the value of the operands, and if they are
+// positive and less than 256 use an unsigned 8-bit divide.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
+#define TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
+
+#include "llvm/Function.h"
+
+/// This optimization identifies DIV instructions that can be
+/// profitably bypassed and carried out with a shorter, faster divide.
+bool bypassSlowDivision(llvm::Function &F,
+ llvm::Function::iterator &I,
+ const llvm::DenseMap<llvm::Type *, llvm::Type *> &BypassTypeMap);
+
+#endif
return NULL;
}
-
EVT TargetLowering::getSetCCResultType(EVT VT) const {
assert(!VT.isVector() && "No default SetCC type for vectors!");
return PointerTy.SimpleTy;
"Support BMI2 instructions">;
def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
+ "HasSlowDivide", "true",
+ "Use small divide for positive values less than 256">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
def : AtomProc<"atom", [ProcIntelAtom, FeatureSSE3, FeatureCMPXCHG16B,
- FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP]>;
+ FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
+ FeatureSlowDivide]>;
// "Arrandale" along with corei3 and corei5
def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureSlowBTMem, FeatureFastUAMem,
setSchedulingPreference(Sched::RegPressure);
setStackPointerRegisterToSaveRestore(X86StackPtr);
+ // Bypass i32 with i8 on Atom when compiling with O2
+ if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
+ addBypassSlowDivType(Type::getInt32Ty(getGlobalContext()), Type::getInt8Ty(getGlobalContext()));
+
if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
, HasVectorUAMem(false)
, HasCmpxchg16b(false)
, UseLeaForSP(false)
+ , HasSlowDivide(false)
, PostRAScheduler(false)
, stackAlignment(4)
// FIXME: this is a known good value for Yonah. How about others?
/// the stack pointer. This is an optimization for Intel Atom processors.
bool UseLeaForSP;
+ /// HasSlowDivide - True if smaller divides are significantly faster than
+ /// full divides and should be used when possible.
+ bool HasSlowDivide;
+
/// PostRAScheduler - True if using post-register-allocation scheduler.
bool PostRAScheduler;
bool hasVectorUAMem() const { return HasVectorUAMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
+ bool hasSlowDivide() const { return HasSlowDivide; }
bool isAtom() const { return X86ProcFamily == IntelAtom; }
#include "llvm/Transforms/Utils/AddrModeMatcher.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace llvm::PatternMatch;
PFI = getAnalysisIfAvailable<ProfileInfo>();
OptSize = F.hasFnAttr(Attribute::OptimizeForSize);
- // First pass, eliminate blocks that contain only PHI nodes and an
+ /// This optimization identifies DIV instructions that can be
+ /// profitably bypassed and carried out with a shorter, faster divide.
+ if (TLI && TLI->isSlowDivBypassed()) {
+ const DenseMap<Type *, Type *> &BypassTypeMap = TLI->getBypassSlowDivTypes();
+
+ for (Function::iterator I = F.begin(); I != F.end(); I++) {
+ EverMadeChange |= bypassSlowDivision(F,
+ I,
+ BypassTypeMap);
+ }
+ }
+
+ // Eliminate blocks that contain only PHI nodes and an
// unconditional branch.
EverMadeChange |= EliminateMostlyEmptyBlocks(F);
--- /dev/null
+//===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an optimization for div and rem on architectures that
+// execute short instructions significantly faster than longer instructions.
+// For example, on Intel Atom 32-bit divides are slow enough that during
+// runtime it is profitable to check the value of the operands, and if they are
+// positive and less than 256 use an unsigned 8-bit divide.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "bypass-slow-division"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+
+using namespace llvm;
+
+namespace llvm {
+ struct DivOpInfo {
+ bool SignedOp;
+ Value *Dividend;
+ Value *Divisor;
+
+ DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor)
+ : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
+ };
+
+ struct DivPhiNodes {
+ PHINode *Quotient;
+ PHINode *Remainder;
+
+ DivPhiNodes(PHINode *InQuotient, PHINode *InRemainder)
+ : Quotient(InQuotient), Remainder(InRemainder) {}
+ };
+
+ template<>
+ struct DenseMapInfo<DivOpInfo> {
+ static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) {
+ return Val1.SignedOp == Val2.SignedOp &&
+ Val1.Dividend == Val2.Dividend &&
+ Val1.Divisor == Val2.Divisor;
+ }
+
+ static DivOpInfo getEmptyKey() {
+ return DivOpInfo(false, 0, 0);
+ }
+
+ static DivOpInfo getTombstoneKey() {
+ return DivOpInfo(true, 0, 0);
+ }
+
+ static unsigned getHashValue(const DivOpInfo &Val) {
+ return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
+ reinterpret_cast<uintptr_t>(Val.Divisor)) ^
+ (unsigned)Val.SignedOp;
+ }
+ };
+
+ typedef DenseMap<DivOpInfo, DivPhiNodes> DivCacheTy;
+}
+
+// insertFastDiv - Substitutes the div/rem instruction with code that checks the
+// value of the operands and uses a shorter-faster div/rem instruction when
+// possible and the longer-slower div/rem instruction otherwise.
+static void insertFastDiv(Function &F,
+ Function::iterator &I,
+ BasicBlock::iterator &J,
+ IntegerType *BypassType,
+ bool UseDivOp,
+ bool UseSignedOp,
+ DivCacheTy &PerBBDivCache)
+{
+ // Get instruction operands
+ Instruction *Instr = J;
+ Value *Dividend = Instr->getOperand(0);
+ Value *Divisor = Instr->getOperand(1);
+
+ if (dyn_cast<ConstantInt>(Divisor) != 0 ||
+ (dyn_cast<ConstantInt>(Dividend) != 0 &&
+ dyn_cast<ConstantInt>(Divisor) != 0)) {
+ // Operations with immediate values should have
+ // been solved and replaced during compile time.
+ return;
+ }
+
+ // Basic Block is split before divide
+ BasicBlock *MainBB = I;
+ BasicBlock *SuccessorBB = I->splitBasicBlock(J);
+ I++; //advance iterator I to successorBB
+
+ // Add new basic block for slow divide operation
+ BasicBlock *SlowBB = BasicBlock::Create(F.getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ SlowBB->moveBefore(SuccessorBB);
+ IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin());
+ Value *SlowQuotientV;
+ Value *SlowRemainderV;
+ if (UseSignedOp) {
+ SlowQuotientV = SlowBuilder.CreateSDiv(Dividend, Divisor);
+ SlowRemainderV = SlowBuilder.CreateSRem(Dividend, Divisor);
+ } else {
+ SlowQuotientV = SlowBuilder.CreateUDiv(Dividend, Divisor);
+ SlowRemainderV = SlowBuilder.CreateURem(Dividend, Divisor);
+ }
+ SlowBuilder.CreateBr(SuccessorBB);
+
+ // Add new basic block for fast divide operation
+ BasicBlock *FastBB = BasicBlock::Create(F.getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ FastBB->moveBefore(SlowBB);
+ IRBuilder<> FastBuilder(FastBB, FastBB->begin());
+ Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor, BypassType);
+ Value *ShortDividendV = FastBuilder.CreateCast(Instruction::Trunc, Dividend, BypassType);
+
+ // udiv/urem because optimization only handles positive numbers
+ Value *ShortQuotientV = FastBuilder.CreateExactUDiv(ShortDividendV,
+ ShortDivisorV);
+ Value *ShortRemainderV = FastBuilder.CreateURem(ShortDividendV,
+ ShortDivisorV);
+ Value *FastQuotientV = FastBuilder.CreateCast(Instruction::ZExt,
+ ShortQuotientV,
+ Dividend->getType());
+ Value *FastRemainderV = FastBuilder.CreateCast(Instruction::ZExt,
+ ShortRemainderV,
+ Dividend->getType());
+ FastBuilder.CreateBr(SuccessorBB);
+
+ // Phi nodes for result of div and rem
+ IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin());
+ PHINode *QuoPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
+ QuoPhi->addIncoming(SlowQuotientV, SlowBB);
+ QuoPhi->addIncoming(FastQuotientV, FastBB);
+ PHINode *RemPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
+ RemPhi->addIncoming(SlowRemainderV, SlowBB);
+ RemPhi->addIncoming(FastRemainderV, FastBB);
+
+ // Replace Instr with appropriate phi node
+ if (UseDivOp) {
+ Instr->replaceAllUsesWith(QuoPhi);
+ } else {
+ Instr->replaceAllUsesWith(RemPhi);
+ }
+ Instr->eraseFromParent();
+
+ // Combine operands into a single value with OR for value testing below
+ MainBB->getInstList().back().eraseFromParent();
+ IRBuilder<> MainBuilder(MainBB, MainBB->end());
+ Value *OrV = MainBuilder.CreateOr(Dividend, Divisor);
+
+ // BitMask is inverted to check if the operands are
+ // larger than the bypass type
+ uint64_t BitMask = ~BypassType->getBitMask();
+ Value *AndV = MainBuilder.CreateAnd(OrV, BitMask);
+
+ // Compare operand values and branch
+ Value *ZeroV = MainBuilder.getInt32(0);
+ Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
+ MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);
+
+ // point iterator J at first instruction of successorBB
+ J = I->begin();
+
+ // Cache phi nodes to be used later in place of other instances
+ // of div or rem with the same sign, dividend, and divisor
+ DivOpInfo Key(UseSignedOp, Dividend, Divisor);
+ DivPhiNodes Value(QuoPhi, RemPhi);
+ PerBBDivCache.insert(std::pair<DivOpInfo, DivPhiNodes>(Key, Value));
+}
+
+// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder if
+// operands and operation are identical. Otherwise call insertFastDiv to perform
+// the optimization and cache the resulting dividend and remainder.
+static void reuseOrInsertFastDiv(Function &F,
+ Function::iterator &I,
+ BasicBlock::iterator &J,
+ IntegerType *BypassType,
+ bool UseDivOp,
+ bool UseSignedOp,
+ DivCacheTy &PerBBDivCache)
+{
+ // Get instruction operands
+ Instruction *Instr = J;
+ DivOpInfo Key(UseSignedOp, Instr->getOperand(0), Instr->getOperand(1));
+ DivCacheTy::const_iterator CacheI = PerBBDivCache.find(Key);
+
+ if (CacheI == PerBBDivCache.end()) {
+ // If previous instance does not exist, insert fast div
+ insertFastDiv(F, I, J, BypassType, UseDivOp, UseSignedOp, PerBBDivCache);
+ return;
+ }
+
+ // Replace operation value with previously generated phi node
+ DivPhiNodes Value = CacheI->second;
+ if (UseDivOp) {
+ // Replace all uses of div instruction with quotient phi node
+ J->replaceAllUsesWith(Value.Quotient);
+ } else {
+ // Replace all uses of rem instruction with remainder phi node
+ J->replaceAllUsesWith(Value.Remainder);
+ }
+
+ // Advance to next operation
+ J++;
+
+ // Remove redundant operation
+ Instr->eraseFromParent();
+}
+
+// bypassSlowDivision - This optimization identifies DIV instructions that can
+// be profitably bypassed and carried out with a shorter, faster divide.
+bool bypassSlowDivision(Function &F,
+ Function::iterator &I,
+ const llvm::DenseMap<Type *, Type *> &BypassTypeMap)
+{
+ DivCacheTy DivCache;
+
+ bool MadeChange = false;
+ for (BasicBlock::iterator J = I->begin(); J != I->end(); J++) {
+
+ // Get instruction details
+ unsigned Opcode = J->getOpcode();
+ bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
+ bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem;
+ bool UseSignedOp = Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
+
+ // Only optimize div or rem ops
+ if (!UseDivOp && !UseRemOp) {
+ continue;
+ }
+ // Continue if div/rem type is not bypassed
+ DenseMap<Type *, Type *>::const_iterator BT = BypassTypeMap.find(J->getType());
+ if (BT == BypassTypeMap.end()) {
+ continue;
+ }
+
+ IntegerType *BypassType = (IntegerType *)BT->second;
+ reuseOrInsertFastDiv(F, I, J, BypassType, UseDivOp, UseSignedOp, DivCache);
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
BasicBlockUtils.cpp
BreakCriticalEdges.cpp
BuildLibCalls.cpp
+ BypassSlowDivision.cpp
CloneFunction.cpp
CloneModule.cpp
CmpInstAnalysis.cpp
--- /dev/null
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
+
+define i32 @test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = sdiv i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = srem i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient_and_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+; CEECK-NOT: idivl
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, %b
+ %resultrem = srem i32 %a, %b
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK: test_use_div_and_idiv
+; CHECK: idivl
+; CHECK: divb
+; CHECK: divl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+ %resultidiv = sdiv i32 %a, %b
+ %resultdiv = udiv i32 %a, %b
+ %result = add i32 %resultidiv, %resultdiv
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_imm() nounwind {
+; CHECK: test_use_div_imm_imm
+; CHECK: movl $64
+ %resultdiv = sdiv i32 256, 4
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_div_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_div_reg_imm
+; CEHCK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_rem_reg_imm
+; CEHCK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultrem = srem i32 %a, 33
+ ret i32 %resultrem
+}
+
+define i32 @test_use_divrem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_divrem_reg_imm
+; CEHCK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ %resultrem = srem i32 %a, 33
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_div_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_rem_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}