//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "t2-reduce-size"
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
-#include "ARMBaseRegisterInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Function.h" // To access Function attributes
+#include "llvm/IR/Function.h" // To access Function attributes
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
+#define DEBUG_TYPE "t2-reduce-size"
+
STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones");
STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones");
STATISTIC(NumLdSts, "Number of 32-bit load / store reduced to 16-bit ones");
{ ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 1,0,1 },
{ ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 1,0,1 },
{ ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 1,0,1 },
- // FIXME: tMOVi8 and tMVN also partially update CPSR but they are less
- // likely to cause issue in the loop. As a size / performance workaround,
- // they are not marked as such.
- { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,0,0 },
- { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,0,0 },
+ { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,1,0 },
// FIXME: Do we need the 16-bit 'S' variant?
{ ARM::t2MOVr,ARM::tMOVr, 0, 0, 0, 0, 0, 1,0, 0,0,0 },
{ ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 1,0,0 },
const Thumb2InstrInfo *TII;
const ARMSubtarget *STI;
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "Thumb2 instruction size reduction pass";
}
/// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
DenseMap<unsigned, unsigned> ReduceOpcodeMap;
- bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use,
- bool IsSelfLoop);
+ bool canAddPseudoFlagDep(MachineInstr *Use, bool IsSelfLoop);
bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
bool is2Addr, ARMCC::CondCodes Pred,
const ReduceEntry &Entry);
bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
- const ReduceEntry &Entry, bool LiveCPSR,
- MachineInstr *CPSRDef, bool IsSelfLoop);
+ const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop);
/// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
/// instruction.
bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
- const ReduceEntry &Entry,
- bool LiveCPSR, MachineInstr *CPSRDef,
+ const ReduceEntry &Entry, bool LiveCPSR,
bool IsSelfLoop);
/// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
/// non-two-address instruction.
bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
- const ReduceEntry &Entry,
- bool LiveCPSR, MachineInstr *CPSRDef,
+ const ReduceEntry &Entry, bool LiveCPSR,
bool IsSelfLoop);
/// ReduceMI - Attempt to reduce MI, return true on success.
bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
- bool LiveCPSR, MachineInstr *CPSRDef,
- bool IsSelfLoop);
+ bool LiveCPSR, bool IsSelfLoop);
/// ReduceMBB - Reduce width of instructions in the specified basic block.
bool ReduceMBB(MachineBasicBlock &MBB);
bool OptimizeSize;
bool MinimizeSize;
+
+ // Last instruction to define CPSR in the current block.
+ MachineInstr *CPSRDef;
+ // Was CPSR last defined by a high latency instruction?
+ // When CPSRDef is null, this refers to CPSR defs in predecessors.
+ bool HighLatencyCPSR;
+
+ struct MBBInfo {
+ // The flags leaving this block have high latency.
+ bool HighLatencyCPSR;
+ // Has this block been visited yet?
+ bool Visited;
+
+ MBBInfo() : HighLatencyCPSR(false), Visited(false) {}
+ };
+
+ SmallVector<MBBInfo, 8> BlockInfo;
};
char Thumb2SizeReduce::ID = 0;
}
return false;
}
+// Check for a likely high-latency flag def.
+static bool isHighLatencyCPSR(MachineInstr *Def) {
+ switch(Def->getOpcode()) {
+ case ARM::FMSTAT:
+ case ARM::tMUL:
+ return true;
+ }
+ return false;
+}
+
/// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations,
/// the 's' 16-bit instruction partially update CPSR. Abort the
/// transformation to avoid adding false dependency on last CPSR setting
/// In this case it would have been ok to narrow the mul.w to muls since there
/// are indirect RAW dependency between the muls and the mul.w
bool
-Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use,
- bool FirstInSelfLoop) {
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
// Disable the check for -Oz (aka OptimizeForSizeHarder).
if (MinimizeSize || !STI->avoidCPSRPartialUpdate())
return false;
- if (!Def)
+ if (!CPSRDef)
// If this BB loops back to itself, conservatively avoid narrowing the
// first instruction that does partial flag update.
- return FirstInSelfLoop;
+ return HighLatencyCPSR || FirstInSelfLoop;
SmallSet<unsigned, 2> Defs;
- for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = Def->getOperand(i);
+ for (const MachineOperand &MO : CPSRDef->operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isUse())
continue;
unsigned Reg = MO.getReg();
Defs.insert(Reg);
}
- for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = Use->getOperand(i);
+ for (const MachineOperand &MO : Use->operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isDef())
continue;
unsigned Reg = MO.getReg();
return false;
}
+ // If the current CPSR has high latency, try to avoid the false dependency.
+ if (HighLatencyCPSR)
+ return true;
+
+ // tMOVi8 usually doesn't start long dependency chains, and there are a lot
+ // of them, so always shrink them when CPSR doesn't have high latency.
+ if (Use->getOpcode() == ARM::t2MOVi ||
+ Use->getOpcode() == ARM::t2MOVi16)
+ return false;
+
// No read-after-write dependency. The narrowing will add false dependency.
return true;
}
static bool VerifyLowRegs(MachineInstr *MI) {
unsigned Opc = MI->getOpcode();
- bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA ||
- Opc == ARM::t2LDMDB || Opc == ARM::t2LDMIA_UPD ||
- Opc == ARM::t2LDMDB_UPD);
- bool isLROk = (Opc == ARM::t2STMIA_UPD || Opc == ARM::t2STMDB_UPD);
+ bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA_UPD);
+ bool isLROk = (Opc == ARM::t2STMDB_UPD);
bool isSPOk = isPCOk || isLROk;
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (MI->getOperand(1).getReg() == ARM::SP) {
Opc = Entry.NarrowOpc2;
ImmLimit = Entry.Imm2Limit;
- HasOffReg = false;
}
Scale = 4;
HasShift = true;
OpNum = 4;
break;
- case ARM::t2LDMIA:
- case ARM::t2LDMDB: {
+ case ARM::t2LDMIA: {
unsigned BaseReg = MI->getOperand(0).getReg();
- if (!isARMLowRegister(BaseReg) || Entry.WideOpc != ARM::t2LDMIA)
- return false;
+ assert(isARMLowRegister(BaseReg));
// For the non-writeback version (this one), the base register must be
// one of the registers being loaded.
bool isOK = false;
- for (unsigned i = 4; i < MI->getNumOperands(); ++i) {
+ for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
if (MI->getOperand(i).getReg() == BaseReg) {
isOK = true;
break;
break;
}
case ARM::t2LDMIA_UPD:
- case ARM::t2LDMDB_UPD:
case ARM::t2STMIA_UPD:
case ARM::t2STMDB_UPD: {
OpNum = 0;
unsigned OffsetReg = 0;
bool OffsetKill = false;
+ bool OffsetInternal = false;
if (HasShift) {
OffsetReg = MI->getOperand(2).getReg();
OffsetKill = MI->getOperand(2).isKill();
+ OffsetInternal = MI->getOperand(2).isInternalRead();
if (MI->getOperand(3).getImm())
// Thumb1 addressing mode doesn't support shift.
assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!");
if (HasOffReg)
- MIB.addReg(OffsetReg, getKillRegState(OffsetKill));
+ MIB.addReg(OffsetReg, getKillRegState(OffsetKill) |
+ getInternalReadRegState(OffsetInternal));
}
// Transfer the rest of operands.
bool
Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
const ReduceEntry &Entry,
- bool LiveCPSR, MachineInstr *CPSRDef,
- bool IsSelfLoop) {
+ bool LiveCPSR, bool IsSelfLoop) {
unsigned Opc = MI->getOpcode();
if (Opc == ARM::t2ADDri) {
// If the source register is SP, try to reduce to tADDrSPi, otherwise
// it's a normal reduce.
if (MI->getOperand(1).getReg() != ARM::SP) {
- if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
+ if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
return true;
- return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
}
// Try to reduce to tADDrSPi.
unsigned Imm = MI->getOperand(2).getImm();
if (Entry.LowRegs1 && !VerifyLowRegs(MI))
return false;
- if (MI->mayLoad() || MI->mayStore())
+ if (MI->mayLoadOrStore())
return ReduceLoadStore(MBB, MI, Entry);
switch (Opc) {
switch (Opc) {
default: break;
case ARM::t2ADDSri: {
- if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
+ if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
return true;
// fallthrough
}
case ARM::t2ADDSrr:
- return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
}
}
break;
case ARM::t2UXTB:
case ARM::t2UXTH:
if (MI->getOperand(2).getImm() == 0)
- return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
break;
case ARM::t2MOVi16:
// Can convert only 'pure' immediate operands, not immediates obtained as
// globals' addresses.
if (MI->getOperand(1).isImm())
- return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
break;
case ARM::t2CMPrr: {
// Try to reduce to the lo-reg only version first. Why there are two
// source insn opcode. So for now, we hack a local entry record to use.
static const ReduceEntry NarrowEntry =
{ ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1,0 };
- if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef, IsSelfLoop))
+ if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, IsSelfLoop))
return true;
- return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
}
}
return false;
bool
Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
const ReduceEntry &Entry,
- bool LiveCPSR, MachineInstr *CPSRDef,
- bool IsSelfLoop) {
+ bool LiveCPSR, bool IsSelfLoop) {
if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
return false;
// Avoid adding a false dependency on partial flag update by some 16-bit
// instructions which has the 's' bit set.
if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
- canAddPseudoFlagDep(CPSRDef, MI, IsSelfLoop))
+ canAddPseudoFlagDep(MI, IsSelfLoop))
return false;
// Add the 16-bit instruction.
bool
Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
const ReduceEntry &Entry,
- bool LiveCPSR, MachineInstr *CPSRDef,
- bool IsSelfLoop) {
+ bool LiveCPSR, bool IsSelfLoop) {
if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
return false;
// Avoid adding a false dependency on partial flag update by some 16-bit
// instructions which has the 's' bit set.
if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
- canAddPseudoFlagDep(CPSRDef, MI, IsSelfLoop))
+ canAddPseudoFlagDep(MI, IsSelfLoop))
return false;
// Add the 16-bit instruction.
static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) {
bool HasDef = false;
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI.getOperand(i);
+ for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isUse())
continue;
if (MO.getReg() != ARM::CPSR)
}
static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) {
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI.getOperand(i);
+ for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isDef())
continue;
if (MO.getReg() != ARM::CPSR)
}
bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
- bool LiveCPSR, MachineInstr *CPSRDef,
- bool IsSelfLoop) {
+ bool LiveCPSR, bool IsSelfLoop) {
unsigned Opcode = MI->getOpcode();
DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode);
if (OPI == ReduceOpcodeMap.end())
// Don't attempt normal reductions on "special" cases for now.
if (Entry.Special)
- return ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+ return ReduceSpecial(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
// Try to transform to a 16-bit two-address instruction.
if (Entry.NarrowOpc2 &&
- ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
+ ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
return true;
// Try to transform to a 16-bit non-two-address instruction.
if (Entry.NarrowOpc1 &&
- ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
+ ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
return true;
return false;
// Yes, CPSR could be livein.
bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
- MachineInstr *CPSRDef = 0;
- MachineInstr *BundleMI = 0;
+ MachineInstr *BundleMI = nullptr;
+
+ CPSRDef = nullptr;
+ HighLatencyCPSR = false;
+
+ // Check predecessors for the latest CPSRDef.
+ for (auto *Pred : MBB.predecessors()) {
+ const MBBInfo &PInfo = BlockInfo[Pred->getNumber()];
+ if (!PInfo.Visited) {
+ // Since blocks are visited in RPO, this must be a back-edge.
+ continue;
+ }
+ if (PInfo.HighLatencyCPSR) {
+ HighLatencyCPSR = true;
+ break;
+ }
+ }
// If this BB loops back to itself, conservatively avoid narrowing the
// first instruction that does partial flag update.
MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),E = MBB.instr_end();
MachineBasicBlock::instr_iterator NextMII;
for (; MII != E; MII = NextMII) {
- NextMII = llvm::next(MII);
+ NextMII = std::next(MII);
MachineInstr *MI = &*MII;
if (MI->isBundle()) {
BundleMI = MI;
continue;
}
+ if (MI->isDebugValue())
+ continue;
LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
// Does NextMII belong to the same bundle as MI?
bool NextInSameBundle = NextMII != E && NextMII->isBundledWithPred();
- if (ReduceMI(MBB, MI, LiveCPSR, CPSRDef, IsSelfLoop)) {
+ if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) {
Modified = true;
- MachineBasicBlock::instr_iterator I = prior(NextMII);
+ MachineBasicBlock::instr_iterator I = std::prev(NextMII);
MI = &*I;
// Removing and reinserting the first instruction in a bundle will break
// up the bundle. Fix the bundling if it was broken.
MachineOperand *MO = BundleMI->findRegisterDefOperand(ARM::CPSR);
if (MO && !MO->isDead())
LiveCPSR = true;
+ MO = BundleMI->findRegisterUseOperand(ARM::CPSR);
+ if (MO && !MO->isKill())
+ LiveCPSR = true;
}
bool DefCPSR = false;
LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR);
if (MI->isCall()) {
// Calls don't really set CPSR.
- CPSRDef = 0;
+ CPSRDef = nullptr;
+ HighLatencyCPSR = false;
IsSelfLoop = false;
} else if (DefCPSR) {
// This is the last CPSR defining instruction.
CPSRDef = MI;
+ HighLatencyCPSR = isHighLatencyCPSR(CPSRDef);
IsSelfLoop = false;
}
}
+ MBBInfo &Info = BlockInfo[MBB.getNumber()];
+ Info.HighLatencyCPSR = HighLatencyCPSR;
+ Info.Visited = true;
return Modified;
}
bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
- const TargetMachine &TM = MF.getTarget();
- TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
- STI = &TM.getSubtarget<ARMSubtarget>();
+ STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget());
+ if (STI->isThumb1Only() || STI->prefers32BitThumb())
+ return false;
+
+ TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo());
// Optimizing / minimizing size?
- AttributeSet FnAttrs = MF.getFunction()->getAttributes();
- OptimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeForSize);
- MinimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::MinSize);
+ OptimizeSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
+ MinimizeSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+
+ BlockInfo.clear();
+ BlockInfo.resize(MF.getNumBlockIDs());
+ // Visit blocks in reverse post-order so LastCPSRDef is known for all
+ // predecessors.
+ ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
bool Modified = false;
- for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
- Modified |= ReduceMBB(*I);
+ for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator
+ I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
+ Modified |= ReduceMBB(**I);
return Modified;
}