unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
MachineTraceMetrics::Trace BlockTrace);
bool
- preservesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
+ improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
MachineTraceMetrics::Trace BlockTrace,
SmallVectorImpl<MachineInstr *> &InsInstrs,
- DenseMap<unsigned, unsigned> &InstrIdxForVirtReg);
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+ bool NewCodeHasLessInsts);
bool preservesResourceLen(MachineBasicBlock *MBB,
MachineTraceMetrics::Trace BlockTrace,
SmallVectorImpl<MachineInstr *> &InsInstrs,
return NewRootLatency;
}
-/// True when the new instruction sequence does not
-/// lengthen the critical path. The DAGCombine code sequence ends in MI
-/// (Machine Instruction) Root. The new code sequence ends in MI NewRoot. A
-/// necessary condition for the new sequence to replace the old sequence is that
-/// it cannot lengthen the critical path. This is decided by the formula
+/// True when the new instruction sequence does not lengthen the critical path
+/// and the new sequence has fewer instructions, or when the new sequence
+/// strictly shortens the critical path.
+/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
+/// The new code sequence ends in MI NewRoot. A necessary condition for the new
+/// sequence to replace the old sequence is that it cannot lengthen the critical
+/// path. This is decided by the formula:
/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack).
-/// The slack is the number of cycles Root can be delayed before the critical
-/// patch becomes longer.
-bool MachineCombiner::preservesCriticalPathLen(
+/// If the new sequence has an equal-length critical path but does not reduce
+/// the number of instructions (NewCodeHasLessInsts is false), then it is not
+/// considered an improvement. The slack is the number of cycles Root can be
+/// delayed before the critical path becomes longer.
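+/// For example (hypothetical cycle counts): if NewRootDepth + NewRootLatency
+/// and RootDepth + RootLatency + RootSlack both evaluate to 5, the sequences
+/// tie, and the new one is accepted only when NewCodeHasLessInsts is true.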
+bool MachineCombiner::improvesCriticalPathLen(
MachineBasicBlock *MBB, MachineInstr *Root,
MachineTraceMetrics::Trace BlockTrace,
SmallVectorImpl<MachineInstr *> &InsInstrs,
- DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+ bool NewCodeHasLessInsts) {
assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
// NewRoot is the last instruction in the \p InsInstrs vector.
dbgs() << " RootDepth + RootLatency + RootSlack "
<< RootDepth + RootLatency + RootSlack << "\n";);
- /// True when the new sequence does not lengthen the critical path.
- return ((NewRootDepth + NewRootLatency) <=
- (RootDepth + RootLatency + RootSlack));
+ unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+ unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
+
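+  // A tie in cycle count is acceptable only when the new sequence also has
+  // fewer instructions; otherwise require a strictly shorter critical path.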
+  if (NewCodeHasLessInsts)
+    return NewCycleCount <= OldCycleCount;
+  return NewCycleCount < OldCycleCount;
}
/// Helper routine to convert instructions into scheduling classes (SC)
Traces->verifyAnalysis();
TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
InstrIdxForVirtReg);
+ unsigned NewInstCount = InsInstrs.size();
+ unsigned OldInstCount = DelInstrs.size();
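+  // InsInstrs holds the proposed replacement sequence; DelInstrs holds the
+  // matched original sequence it would replace.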
// Found pattern, but did not generate alternative sequence.
// This can happen e.g. when an immediate could not be materialized
// in a single instruction.
- if (!InsInstrs.size())
+ if (!NewInstCount)
continue;
// Substitute when we optimize for codesize and the new sequence has
// fewer instructions OR
-  // the new sequence neither lengthens the critical path nor increases
-  // resource pressure.
+  // the new sequence improves the critical path without increasing
+  // resource pressure.
- if (doSubstitute(InsInstrs.size(), DelInstrs.size()) ||
- (preservesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
- InstrIdxForVirtReg) &&
+ if (doSubstitute(NewInstCount, OldInstCount) ||
+ (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
+ InstrIdxForVirtReg,
+ NewInstCount < OldInstCount) &&
preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
for (auto *InstrPtr : InsInstrs)
MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
return isHighLatencyDef(DefMI->getOpcode());
}
-/// If the input instruction is part of a chain of dependent ops that are
-/// suitable for reassociation, return the earlier instruction in the sequence
-/// that defines its first operand, otherwise return a nullptr.
-/// If the instruction's operands must be commuted to be considered a
-/// reassociation candidate, Commuted will be set to true.
-static MachineInstr *isReassocCandidate(const MachineInstr &Inst,
- unsigned AssocOpcode,
- bool checkPrevOneUse,
- bool &Commuted) {
- if (Inst.getOpcode() != AssocOpcode)
- return nullptr;
-
- MachineOperand Op1 = Inst.getOperand(1);
- MachineOperand Op2 = Inst.getOperand(2);
-
- const MachineBasicBlock *MBB = Inst.getParent();
+static bool hasVirtualRegDefsInBasicBlock(const MachineInstr &Inst,
+ const MachineBasicBlock *MBB) {
+ assert(Inst.getNumOperands() == 3 && "Reassociation needs binary operators");
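+  // Operand 0 is the def; operands 1 and 2 are the source operands.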
+ const MachineOperand &Op1 = Inst.getOperand(1);
+ const MachineOperand &Op2 = Inst.getOperand(2);
const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
// We need virtual register definitions.
MI1 = MRI.getUniqueVRegDef(Op1.getReg());
if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
MI2 = MRI.getUniqueVRegDef(Op2.getReg());
-
+
// And they need to be in the trace (otherwise, they won't have a depth).
- if (!MI1 || !MI2 || MI1->getParent() != MBB || MI2->getParent() != MBB)
- return nullptr;
-
- Commuted = false;
- if (MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode) {
+ if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB)
+ return true;
+
+ return false;
+}
+
+static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) {
+ const MachineBasicBlock *MBB = Inst.getParent();
+ const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg());
+ MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg());
+ unsigned AssocOpcode = Inst.getOpcode();
+
+  // If only the instruction defining the second source operand has the same
+  // opcode, the operands must be commuted.
+ Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode;
+ if (Commuted)
std::swap(MI1, MI2);
- Commuted = true;
- }
- // Avoid reassociating operands when it won't provide any benefit. If both
- // operands are produced by instructions of this type, we may already
- // have the optimal sequence.
- if (MI2->getOpcode() == AssocOpcode)
- return nullptr;
-
- // The instruction must only be used by the other instruction that we
- // reassociate with.
- if (checkPrevOneUse && !MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
- return nullptr;
-
- // We must match a simple chain of dependent ops.
- // TODO: This check is not necessary for the earliest instruction in the
- // sequence. Instead of a sequence of 3 dependent instructions with the same
- // opcode, we only need to find a sequence of 2 dependent instructions with
- // the same opcode plus 1 other instruction that adds to the height of the
- // trace.
- if (MI1->getOpcode() != AssocOpcode)
- return nullptr;
-  return MI1;
+  // 1. The previous instruction must be the same type as Inst.
+  // 2. The previous instruction must have virtual register definitions for its
+  //    operands in the same basic block as Inst.
+  // 3. The previous instruction's result must only be used by Inst.
+  if (MI1->getOpcode() == AssocOpcode &&
+      hasVirtualRegDefsInBasicBlock(*MI1, MBB) &&
+      MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
+    return true;
+
+  return false;
}
-/// Select a pattern based on how the operands of each associative operation
-/// need to be commuted.
-static MachineCombinerPattern::MC_PATTERN getPattern(bool CommutePrev,
- bool CommuteRoot) {
- if (CommutePrev) {
- if (CommuteRoot)
- return MachineCombinerPattern::MC_REASSOC_XA_YB;
- return MachineCombinerPattern::MC_REASSOC_XA_BY;
- } else {
- if (CommuteRoot)
- return MachineCombinerPattern::MC_REASSOC_AX_YB;
- return MachineCombinerPattern::MC_REASSOC_AX_BY;
- }
+/// Return true if the input instruction is part of a chain of dependent ops
+/// that are suitable for reassociation, otherwise return false.
+/// If the instruction's operands must be commuted to have a previous
+/// instruction of the same type define the first source operand, Commuted will
+/// be set to true.
+static bool isReassocCandidate(const MachineInstr &Inst, unsigned AssocOpcode,
+ bool &Commuted) {
+ // 1. The instruction must have the correct type.
+ // 2. The instruction must have virtual register definitions for its
+ // operands in the same basic block.
+ // 3. The instruction must have a reassociatable sibling.
+ if (Inst.getOpcode() == AssocOpcode &&
+ hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) &&
+ hasReassocSibling(Inst, Commuted))
+ return true;
+
+ return false;
}
+// FIXME: This has the potential to be expensive (compile time) while not
+// improving the code at all. Some ways to limit the overhead:
+// 1. Track successful transforms; bail out if hit rate gets too low.
+// 2. Only enable at -O3 or some other non-default optimization level.
+// 3. Pre-screen pattern candidates here: if an operand of the previous
+// instruction is known to not increase the critical path, then don't match
+// that pattern.
bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const {
if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
return false;
+ // TODO: There is nothing x86-specific here except the instruction type.
+ // This logic could be hoisted into the machine combiner pass itself.
+
+ // Look for this reassociation pattern:
+ // B = A op X (Prev)
+ // C = B op Y (Root)
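+  // which is reassociated into:
+  //   B = X op Y
+  //   C = A op B
+  // so that X op Y no longer depends on the computation of A.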
+
// TODO: There are many more associative instruction types to match:
// 1. Other forms of scalar FP add (non-AVX)
// 2. Other data types (double, integer, vectors)
// 3. Other math / logic operations (mul, and, or)
unsigned AssocOpcode = X86::VADDSSrr;
- // TODO: There is nothing x86-specific here except the instruction type.
- // This logic could be hoisted into the machine combiner pass itself.
- bool CommuteRoot;
- if (MachineInstr *Prev = isReassocCandidate(Root, AssocOpcode, true,
- CommuteRoot)) {
- bool CommutePrev;
- if (isReassocCandidate(*Prev, AssocOpcode, false, CommutePrev)) {
- // We found a sequence of instructions that may be suitable for a
- // reassociation of operands to increase ILP.
- Patterns.push_back(getPattern(CommutePrev, CommuteRoot));
- return true;
+ bool Commute = false;
+ if (isReassocCandidate(Root, AssocOpcode, Commute)) {
+ // We found a sequence of instructions that may be suitable for a
+ // reassociation of operands to increase ILP. Specify each commutation
+ // possibility for the Prev instruction in the sequence and let the
+ // machine combiner decide if changing the operands is worthwhile.
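+    // In the pattern names, 'BY' vs. 'YB' encodes which source operand of
+    // Root is defined by Prev, and 'AX' vs. 'XA' encodes the operand order
+    // tried for Prev.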
+ if (Commute) {
+ Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB);
+ Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB);
+ } else {
+ Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY);
+ Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY);
}
+ return true;
}
-
+
return false;
}
// Select the previous instruction in the sequence based on the input pattern.
MachineInstr *Prev = nullptr;
- if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_BY ||
- Pattern == MachineCombinerPattern::MC_REASSOC_XA_BY)
- Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
- else if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_YB ||
- Pattern == MachineCombinerPattern::MC_REASSOC_XA_YB)
- Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
- else
- llvm_unreachable("Unknown pattern for machine combiner");
+ switch (Pattern) {
+ case MachineCombinerPattern::MC_REASSOC_AX_BY:
+ case MachineCombinerPattern::MC_REASSOC_XA_BY:
+ Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ break;
+ case MachineCombinerPattern::MC_REASSOC_AX_YB:
+ case MachineCombinerPattern::MC_REASSOC_XA_YB:
+ Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
+ }
+ assert(Prev && "Unknown pattern for machine combiner");
reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
return;
ret float %t2
}
-; Verify that the first two adds are independent regardless of how the inputs are
-; commuted. The destination registers are used as source registers for the third add.
-
-define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds1:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %t0, %x2
- %t2 = fadd float %t1, %x3
- ret float %t2
-}
-
-define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds2:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %x2, %t0
- %t2 = fadd float %t1, %x3
- ret float %t2
-}
-
-define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds3:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %t0, %x2
- %t2 = fadd float %x3, %t1
- ret float %t2
-}
-
-define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds4:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %x2, %t0
- %t2 = fadd float %x3, %t1
- ret float %t2
-}
-
-; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
-; produced because that would cost more compile time.
-
-define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
-; CHECK-LABEL: reassociate_adds5:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm5, %xmm4, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm7, %xmm6, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %t0, %x2
- %t2 = fadd float %t1, %x3
- %t3 = fadd float %t2, %x4
- %t4 = fadd float %t3, %x5
- %t5 = fadd float %t4, %x6
- %t6 = fadd float %t5, %x7
- ret float %t6
-}
--- /dev/null
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s
+
+; Verify that the first two adds are independent regardless of how the inputs are
+; commuted. The destination registers are used as source registers for the third add.
+
+define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %t0, %x2
+ %t2 = fadd float %t1, %x3
+ ret float %t2
+}
+
+define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %x2, %t0
+ %t2 = fadd float %t1, %x3
+ ret float %t2
+}
+
+define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds3:
+; CHECK: # BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %t0, %x2
+ %t2 = fadd float %x3, %t1
+ ret float %t2
+}
+
+define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds4:
+; CHECK: # BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %x2, %t0
+ %t2 = fadd float %x3, %t1
+ ret float %t2
+}
+
+; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
+; produced because that would cost more compile time.
+
+define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
+; CHECK-LABEL: reassociate_adds5:
+; CHECK: # BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm5, %xmm4, %xmm1
+; CHECK-NEXT: vaddss %xmm6, %xmm1, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm7, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %t0, %x2
+ %t2 = fadd float %t1, %x3
+ %t3 = fadd float %t2, %x4
+ %t4 = fadd float %t3, %x5
+ %t5 = fadd float %t4, %x6
+ %t6 = fadd float %t5, %x7
+ ret float %t6
+}
+
+; Verify that we only need two associative operations to reassociate the operands.
+; Also, we should reassociate such that the result of the high latency division
+; is used by the final 'add' rather than reassociating the %x3 operand with the
+; division. The latter reassociation would not improve anything.
+
+define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds6:
+; CHECK: # BB#0:
+; CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fdiv float %x0, %x1
+ %t1 = fadd float %x2, %t0
+ %t2 = fadd float %x3, %t1
+ ret float %t2
+}