From 8b1496c922b6a21296f7d42172df45bf205d5419 Mon Sep 17 00:00:00 2001
From: Andrew Trick
Date: Wed, 28 Nov 2012 05:13:28 +0000
Subject: [PATCH] misched: Analysis that partitions the DAG into subtrees.

This is a simple, cheap infrastructure for analyzing the shape of a DAG. It
recognizes uniform DAGs that take the shape of bottom-up subtrees, such as the
included matrix multiplication example. This is useful for heuristics that
balance register pressure with ILP. Two canonical expressions of the heuristic
are implemented in scheduling modes: -misched=ilpmin and -misched=ilpmax.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168773 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/ScheduleDFS.h | 108 ++++++++++++---
 lib/CodeGen/MachineScheduler.cpp   |  71 +++++++---
 lib/CodeGen/ScheduleDAGInstrs.cpp  | 207 +++++++++++++++++++++++------
 test/CodeGen/X86/misched-matrix.ll |  68 ++++++++++
 4 files changed, 377 insertions(+), 77 deletions(-)

diff --git a/include/llvm/CodeGen/ScheduleDFS.h b/include/llvm/CodeGen/ScheduleDFS.h
index 1aa40584217..fbbadd95ad1 100644
--- a/include/llvm/CodeGen/ScheduleDFS.h
+++ b/include/llvm/CodeGen/ScheduleDFS.h
@@ -14,38 +14,41 @@
 #ifndef LLVM_CODEGEN_SCHEDULEDAGILP_H
 #define LLVM_CODEGEN_SCHEDULEDAGILP_H
 
+#include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/Support/DataTypes.h"
 #include <vector>
 
 namespace llvm {
 
 class raw_ostream;
+class IntEqClasses;
 class ScheduleDAGInstrs;
 class SUnit;
 
 /// \brief Represent the ILP of the subDAG rooted at a DAG node.
+///
+/// When computed using bottom-up DFS, this metric assumes that the DAG is a
+/// forest of trees with roots at the bottom of the schedule branching upward.
 struct ILPValue {
   unsigned InstrCount;
-  unsigned Cycles;
-
-  ILPValue(): InstrCount(0), Cycles(0) {}
-
-  ILPValue(unsigned count, unsigned cycles):
-    InstrCount(count), Cycles(cycles) {}
+  /// Length may either correspond to depth or height, depending on direction,
+  /// and cycles or nodes depending on context.
+  unsigned Length;
 
-  bool isValid() const { return Cycles > 0; }
+  ILPValue(unsigned count, unsigned length):
+    InstrCount(count), Length(length) {}
 
   // Order by the ILP metric's value.
   bool operator<(ILPValue RHS) const {
-    return (uint64_t)InstrCount * RHS.Cycles
-      < (uint64_t)Cycles * RHS.InstrCount;
+    return (uint64_t)InstrCount * RHS.Length
+      < (uint64_t)Length * RHS.InstrCount;
   }
   bool operator>(ILPValue RHS) const {
     return RHS < *this;
   }
   bool operator<=(ILPValue RHS) const {
-    return (uint64_t)InstrCount * RHS.Cycles
-      <= (uint64_t)Cycles * RHS.InstrCount;
+    return (uint64_t)InstrCount * RHS.Length
+      <= (uint64_t)Length * RHS.InstrCount;
   }
   bool operator>=(ILPValue RHS) const {
     return RHS <= *this;
@@ -58,25 +61,88 @@ struct ILPValue {
 #endif
 };
 
-/// \brief Compute the values of each DAG node for an ILP metric.
+/// \brief Compute the values of each DAG node for various metrics during DFS.
 ///
-/// This metric assumes that the DAG is a forest of trees with roots at the
-/// bottom of the schedule.
-class ScheduleDAGILP {
+/// ILPValues summarize the DAG subtree rooted at each node up to
+/// SubtreeLimit. ILPValues are also valid for interior nodes of a subtree, not
+/// just the root.
+class SchedDFSResult {
+  friend class SchedDFSImpl;
+
+  /// \brief Per-SUnit data computed during DFS for various metrics.
+  struct NodeData {
+    unsigned InstrCount;
+    unsigned SubtreeID;
+
+    NodeData(): InstrCount(0), SubtreeID(0) {}
+  };
+
+  /// \brief Record a connection between subtrees and the connection level.
+  struct Connection {
+    unsigned TreeID;
+    unsigned Level;
+
+    Connection(unsigned tree, unsigned level): TreeID(tree), Level(level) {}
+  };
+
   bool IsBottomUp;
-  std::vector<ILPValue> ILPValues;
+  unsigned SubtreeLimit;
+  /// DFS results for each SUnit in this DAG.
+  std::vector<NodeData> DFSData;
+
+  // For each subtree discovered during DFS, record its connections to other
+  // subtrees.
+  std::vector<SmallVector<Connection, 4> > SubtreeConnections;
+
+  /// Cache the current connection level of each subtree.
+  /// This mutable array is updated during scheduling.
+  std::vector<unsigned> SubtreeConnectLevels;
 
 public:
-  ScheduleDAGILP(bool IsBU): IsBottomUp(IsBU) {}
+  SchedDFSResult(bool IsBU, unsigned lim)
+    : IsBottomUp(IsBU), SubtreeLimit(lim) {}
+
+  /// \brief Clear the results.
+  void clear() {
+    DFSData.clear();
+    SubtreeConnections.clear();
+    SubtreeConnectLevels.clear();
+  }
 
   /// \brief Initialize the result data with the size of the DAG.
-  void resize(unsigned NumSUnits);
+  void resize(unsigned NumSUnits) {
+    DFSData.resize(NumSUnits);
+  }
 
-  /// \brief Compute the ILP metric for the subDAG at this root.
-  void computeILP(const SUnit *Root);
+  /// \brief Compute various metrics for the DAG with given roots.
+  void compute(ArrayRef<SUnit *> Roots);
 
   /// \brief Get the ILP value for a DAG node.
-  ILPValue getILP(const SUnit *SU);
+  ///
+  /// A leaf node has an ILP of 1/1.
+  ILPValue getILP(const SUnit *SU) {
+    return ILPValue(DFSData[SU->NodeNum].InstrCount, 1 + SU->getDepth());
+  }
+
+  /// \brief The number of subtrees detected in this DAG.
+  unsigned getNumSubtrees() const { return SubtreeConnectLevels.size(); }
+
+  /// \brief Get the ID of the subtree the given DAG node belongs to.
+  unsigned getSubtreeID(const SUnit *SU) {
+    return DFSData[SU->NodeNum].SubtreeID;
+  }
+
+  /// \brief Get the connection level of a subtree.
+  ///
+  /// For bottom-up trees, the connection level is the latency depth (in cycles)
+  /// of the deepest connection to another subtree.
+  unsigned getSubtreeLevel(unsigned SubtreeID) {
+    return SubtreeConnectLevels[SubtreeID];
+  }
+
+  /// \brief Scheduler callback to update SubtreeConnectLevels when a tree is
+  /// initially scheduled.
+  void scheduleTree(unsigned SubtreeID);
 };
 
 raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val);
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 69e8b83b361..e27bb0dd1bd 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -2054,58 +2054,99 @@ ConvergingSchedRegistry("converge", "Standard converging scheduler.",
 
 namespace {
 /// \brief Order nodes by the ILP metric.
 struct ILPOrder {
-  ScheduleDAGILP *ILP;
+  SchedDFSResult *DFSResult;
+  BitVector *ScheduledTrees;
   bool MaximizeILP;
 
-  ILPOrder(ScheduleDAGILP *ilp, bool MaxILP): ILP(ilp), MaximizeILP(MaxILP) {}
+  ILPOrder(SchedDFSResult *dfs, BitVector *schedtrees, bool MaxILP)
+    : DFSResult(dfs), ScheduledTrees(schedtrees), MaximizeILP(MaxILP) {}
 
   /// \brief Apply a less-than relation on node priority.
+  ///
+  /// (Return true if A comes after B in the Q.)
   bool operator()(const SUnit *A, const SUnit *B) const {
-    // Return true if A comes after B in the Q.
+    unsigned SchedTreeA = DFSResult->getSubtreeID(A);
+    unsigned SchedTreeB = DFSResult->getSubtreeID(B);
+    if (SchedTreeA != SchedTreeB) {
+      // Unscheduled trees have lower priority.
+      if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB))
+        return ScheduledTrees->test(SchedTreeB);
+
+      // Trees with shallower connections have lower priority.
+      if (DFSResult->getSubtreeLevel(SchedTreeA)
+          != DFSResult->getSubtreeLevel(SchedTreeB)) {
+        return DFSResult->getSubtreeLevel(SchedTreeA)
+          < DFSResult->getSubtreeLevel(SchedTreeB);
+      }
+    }
     if (MaximizeILP)
-      return ILP->getILP(A) < ILP->getILP(B);
+      return DFSResult->getILP(A) < DFSResult->getILP(B);
     else
-      return ILP->getILP(A) > ILP->getILP(B);
+      return DFSResult->getILP(A) > DFSResult->getILP(B);
   }
 };
 
 /// \brief Schedule based on the ILP metric.
 class ILPScheduler : public MachineSchedStrategy {
-  ScheduleDAGILP ILP;
+  /// In case all subtrees are eventually connected to a common root through
+  /// data dependence (e.g. reduction), place an upper limit on their size.
+  ///
+  /// FIXME: A subtree limit is generally good, but in the situation commented
+  /// above, where multiple similar subtrees feed a common root, we should
+  /// only split at a point where the resulting subtrees will be balanced.
+  /// (a motivating test case must be found).
+  static const unsigned SubtreeLimit = 16;
+
+  SchedDFSResult DFSResult;
+  BitVector ScheduledTrees;
   ILPOrder Cmp;
 
   std::vector<SUnit*> ReadyQ;
 
 public:
   ILPScheduler(bool MaximizeILP)
-    : ILP(/*BottomUp=*/true), Cmp(&ILP, MaximizeILP) {}
+    : DFSResult(/*BottomUp=*/true, SubtreeLimit),
+      Cmp(&DFSResult, &ScheduledTrees, MaximizeILP) {}
 
   virtual void initialize(ScheduleDAGMI *DAG) {
     ReadyQ.clear();
-    ILP.resize(DAG->SUnits.size());
+    DFSResult.clear();
+    DFSResult.resize(DAG->SUnits.size());
+    ScheduledTrees.clear();
   }
 
   virtual void registerRoots() {
-    for (std::vector<SUnit*>::const_iterator
-           I = ReadyQ.begin(), E = ReadyQ.end(); I != E; ++I) {
-      ILP.computeILP(*I);
-    }
+    DFSResult.compute(ReadyQ);
+    ScheduledTrees.resize(DFSResult.getNumSubtrees());
   }
 
   /// Implement MachineSchedStrategy interface.
   /// -----------------------------------------
 
+  /// Callback to select the highest priority node from the ready Q.
   virtual SUnit *pickNode(bool &IsTopNode) {
     if (ReadyQ.empty()) return NULL;
     pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
     SUnit *SU = ReadyQ.back();
     ReadyQ.pop_back();
     IsTopNode = false;
-    DEBUG(dbgs() << "*** Scheduling " << *SU->getInstr()
-          << " ILP: " << ILP.getILP(SU) << '\n');
+    DEBUG(dbgs() << "*** Scheduling " << "SU(" << SU->NodeNum << "): "
+          << *SU->getInstr()
+          << " ILP: " << DFSResult.getILP(SU)
+          << " Tree: " << DFSResult.getSubtreeID(SU) << " @"
+          << DFSResult.getSubtreeLevel(DFSResult.getSubtreeID(SU))<< '\n');
     return SU;
   }
 
-  virtual void schedNode(SUnit *, bool) {}
+  /// Callback after a node is scheduled. Mark a newly scheduled tree, notify
+  /// DFSResults, and resort the priority Q.
+  virtual void schedNode(SUnit *SU, bool IsTopNode) {
+    assert(!IsTopNode && "SchedDFSResult needs bottom-up");
+    if (!ScheduledTrees.test(DFSResult.getSubtreeID(SU))) {
+      ScheduledTrees.set(DFSResult.getSubtreeID(SU));
+      DFSResult.scheduleTree(DFSResult.getSubtreeID(SU));
+      std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+    }
+  }
 
   virtual void releaseTopNode(SUnit *) { /*only called for top roots*/ }
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index e9eaff1b182..2b00b596d3b 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "sched-instrs"
+#define DEBUG_TYPE "misched"
 #include "llvm/Operator.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -949,6 +949,120 @@ std::string ScheduleDAGInstrs::getDAGName() const {
   return "dag." + BB->getFullName();
 }
 
+//===----------------------------------------------------------------------===//
+// SchedDFSResult Implementation
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+/// \brief Internal state used to compute SchedDFSResult.
+class SchedDFSImpl {
+  SchedDFSResult &R;
+
+  /// Join DAG nodes into equivalence classes by their subtree.
+  IntEqClasses SubtreeClasses;
+  /// List PredSU, SuccSU pairs that represent data edges between subtrees.
+  std::vector<std::pair<const SUnit*, const SUnit*> > ConnectionPairs;
+
+public:
+  SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSData.size()) {}
+
+  /// SubtreeID is initialized to zero, set to itself to flag the root of a
+  /// subtree, set to the parent to indicate an interior node,
+  /// then set to a representative subtree ID during finalization.
+  bool isVisited(const SUnit *SU) const {
+    return R.DFSData[SU->NodeNum].SubtreeID;
+  }
+
+  /// Initialize this node's instruction count. We don't need to flag the node
+  /// visited until visitPostorder because the DAG cannot have cycles.
+  void visitPreorder(const SUnit *SU) {
+    R.DFSData[SU->NodeNum].InstrCount = SU->getInstr()->isTransient() ? 0 : 1;
+  }
+
+  /// Mark this node as either the root of a subtree or an interior
+  /// node. Increment the parent node's instruction count.
+  void visitPostorder(const SUnit *SU, const SDep *PredDep, const SUnit *Parent) {
+    R.DFSData[SU->NodeNum].SubtreeID = SU->NodeNum;
+
+    // Join the child to its parent if they are connected via data dependence
+    // and do not exceed the limit.
+    if (!Parent || PredDep->getKind() != SDep::Data)
+      return;
+
+    unsigned PredCnt = R.DFSData[SU->NodeNum].InstrCount;
+    if (PredCnt > R.SubtreeLimit)
+      return;
+
+    R.DFSData[SU->NodeNum].SubtreeID = Parent->NodeNum;
+
+    // Add the recently finished predecessor's bottom-up descendant count.
+    R.DFSData[Parent->NodeNum].InstrCount += PredCnt;
+    SubtreeClasses.join(Parent->NodeNum, SU->NodeNum);
+  }
+
+  /// Determine whether the DFS cross edge should be considered a subtree edge
+  /// or a connection between subtrees.
+  void visitCross(const SDep &PredDep, const SUnit *Succ) {
+    if (PredDep.getKind() == SDep::Data) {
+      // If this is a cross edge to a root, join the subtrees. This happens when
+      // the root was first reached by a non-data dependence.
+      unsigned NodeNum = PredDep.getSUnit()->NodeNum;
+      unsigned PredCnt = R.DFSData[NodeNum].InstrCount;
+      if (R.DFSData[NodeNum].SubtreeID == NodeNum && PredCnt < R.SubtreeLimit) {
+        R.DFSData[NodeNum].SubtreeID = Succ->NodeNum;
+        R.DFSData[Succ->NodeNum].InstrCount += PredCnt;
+        SubtreeClasses.join(Succ->NodeNum, NodeNum);
+        return;
+      }
+    }
+    ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ));
+  }
+
+  /// Set each node's subtree ID to the representative ID and record connections
+  /// between trees.
+  void finalize() {
+    SubtreeClasses.compress();
+    R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
+    R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
+    DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
+    for (unsigned Idx = 0, End = R.DFSData.size(); Idx != End; ++Idx) {
+      R.DFSData[Idx].SubtreeID = SubtreeClasses[Idx];
+      DEBUG(dbgs() << " SU(" << Idx << ") in tree "
+            << R.DFSData[Idx].SubtreeID << '\n');
+    }
+    for (std::vector<std::pair<const SUnit*, const SUnit*> >::const_iterator
+           I = ConnectionPairs.begin(), E = ConnectionPairs.end();
+         I != E; ++I) {
+      unsigned PredTree = SubtreeClasses[I->first->NodeNum];
+      unsigned SuccTree = SubtreeClasses[I->second->NodeNum];
+      if (PredTree == SuccTree)
+        continue;
+      unsigned Depth = I->first->getDepth();
+      addConnection(PredTree, SuccTree, Depth);
+      addConnection(SuccTree, PredTree, Depth);
+    }
+  }
+
+protected:
+  /// Called by finalize() to record a connection between trees.
+  void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) {
+    if (!Depth)
+      return;
+
+    SmallVectorImpl<SchedDFSResult::Connection> &Connections =
+      R.SubtreeConnections[FromTree];
+    for (SmallVectorImpl<SchedDFSResult::Connection>::iterator
+           I = Connections.begin(), E = Connections.end(); I != E; ++I) {
+      if (I->TreeID == ToTree) {
+        I->Level = std::max(I->Level, Depth);
+        return;
+      }
+    }
+    Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
+  }
+};
+} // namespace llvm
+
 namespace {
 /// \brief Manage the stack used by a reverse depth-first search over the DAG.
 class SchedDAGReverseDFS {
@@ -961,7 +1075,10 @@ public:
   }
 
   void advance() { ++DFSStack.back().second; }
 
-  void backtrack() { DFSStack.pop_back(); }
+  const SDep *backtrack() {
+    DFSStack.pop_back();
+    return DFSStack.empty() ? 0 : llvm::prior(DFSStack.back().second);
+  }
 
   const SUnit *getCurr() const { return DFSStack.back().first; }
 
@@ -973,57 +1090,65 @@ public:
 };
 } // anonymous
 
-void ScheduleDAGILP::resize(unsigned NumSUnits) {
-  ILPValues.resize(NumSUnits);
-}
-
-ILPValue ScheduleDAGILP::getILP(const SUnit *SU) {
-  return ILPValues[SU->NodeNum];
-}
-
-// A leaf node has an ILP of 1/1.
-static ILPValue initILP(const SUnit *SU) {
-  unsigned Cnt = SU->getInstr()->isTransient() ? 0 : 1;
-  return ILPValue(Cnt, 1 + SU->getDepth());
-}
-
 /// Compute an ILP metric for all nodes in the subDAG reachable via depth-first
 /// search from this root.
-void ScheduleDAGILP::computeILP(const SUnit *Root) {
+void SchedDFSResult::compute(ArrayRef<SUnit *> Roots) {
   if (!IsBottomUp)
     llvm_unreachable("Top-down ILP metric is unimplemnted");
 
-  SchedDAGReverseDFS DFS;
-  // Mark a node visited by validating it.
-  ILPValues[Root->NodeNum] = initILP(Root);
-  DFS.follow(Root);
-  for (;;) {
-    // Traverse the leftmost path as far as possible.
-    while (DFS.getPred() != DFS.getPredEnd()) {
-      const SUnit *PredSU = DFS.getPred()->getSUnit();
-      DFS.advance();
-      // If the pred is already valid, skip it.
-      if (ILPValues[PredSU->NodeNum].isValid())
-        continue;
-      ILPValues[PredSU->NodeNum] = initILP(PredSU);
-      DFS.follow(PredSU);
+  SchedDFSImpl Impl(*this);
+  for (ArrayRef<SUnit *>::const_iterator
+         RootI = Roots.begin(), RootE = Roots.end(); RootI != RootE; ++RootI) {
+    SchedDAGReverseDFS DFS;
+    Impl.visitPreorder(*RootI);
+    DFS.follow(*RootI);
+    for (;;) {
+      // Traverse the leftmost path as far as possible.
+      while (DFS.getPred() != DFS.getPredEnd()) {
+        const SDep &PredDep = *DFS.getPred();
+        DFS.advance();
+        // If the pred is already valid, skip it. We may preorder visit a node
+        // with InstrCount==0 more than once, but it won't affect heuristics
+        // because we don't care about cross edges to leaf copies.
+        if (Impl.isVisited(PredDep.getSUnit())) {
+          Impl.visitCross(PredDep, DFS.getCurr());
+          continue;
+        }
+        Impl.visitPreorder(PredDep.getSUnit());
+        DFS.follow(PredDep.getSUnit());
+      }
+      // Visit the top of the stack in postorder and backtrack.
+      const SUnit *Child = DFS.getCurr();
+      const SDep *PredDep = DFS.backtrack();
+      Impl.visitPostorder(Child, PredDep, PredDep ? DFS.getCurr() : 0);
+      if (DFS.isComplete())
+        break;
     }
-    // Visit the top of the stack in postorder and backtrack.
-    unsigned PredCount = ILPValues[DFS.getCurr()->NodeNum].InstrCount;
-    DFS.backtrack();
-    if (DFS.isComplete())
-      break;
-    // Add the recently finished predecessor's bottom-up descendent count.
-    ILPValues[DFS.getCurr()->NodeNum].InstrCount += PredCount;
+  }
+  Impl.finalize();
+}
+
+/// The root of the given SubtreeID was just scheduled. For all subtrees
+/// connected to this tree, record the depth of the connection so that the
+/// nearest connected subtrees can be prioritized.
+void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
+  for (SmallVectorImpl<Connection>::const_iterator
+         I = SubtreeConnections[SubtreeID].begin(),
+         E = SubtreeConnections[SubtreeID].end(); I != E; ++I) {
+    SubtreeConnectLevels[I->TreeID] =
+      std::max(SubtreeConnectLevels[I->TreeID], I->Level);
+    DEBUG(dbgs() << " Tree: " << I->TreeID
+          << " @" << SubtreeConnectLevels[I->TreeID] << '\n');
   }
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void ILPValue::print(raw_ostream &OS) const {
-  if (!isValid())
+  OS << InstrCount << " / " << Length << " = ";
+  if (!Length)
     OS << "BADILP";
-  OS << InstrCount << " / " << Cycles << " = "
-    << format("%g", ((double)InstrCount / Cycles));
+  else
+    OS << format("%g", ((double)InstrCount / Length));
 }
 
 void ILPValue::dump() const {
diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll
index 413e76468a3..f5566e5e5de 100644
--- a/test/CodeGen/X86/misched-matrix.ll
+++ b/test/CodeGen/X86/misched-matrix.ll
@@ -1,6 +1,12 @@
 ; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
 ; RUN:          -misched-topdown -verify-machineinstrs \
 ; RUN:          | FileCheck %s -check-prefix=TOPDOWN
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN:          -misched=ilpmin -verify-machineinstrs \
+; RUN:          | FileCheck %s -check-prefix=ILPMIN
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN:          -misched=ilpmax -verify-machineinstrs \
+; RUN:          | FileCheck %s -check-prefix=ILPMAX
 ;
 ; Verify that the MI scheduler minimizes register pressure for a
 ; uniform set of bottom-up subtrees (unrolled matrix multiply).
@@ -17,6 +23,68 @@
 ; TOPDOWN: movl %{{.*}}, 8(
 ; TOPDOWN: movl %{{.*}}, 12(
 ; TOPDOWN: %for.end
+;
+; For -misched=ilpmin, verify that each expression subtree is
+; scheduled independently, and that the imull/adds are interleaved.
+;
+; ILPMIN: %for.body
+; ILPMIN: movl %{{.*}}, (
+; ILPMIN: imull
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: movl %{{.*}}, 4(
+; ILPMIN: imull
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: movl %{{.*}}, 8(
+; ILPMIN: imull
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: movl %{{.*}}, 12(
+; ILPMIN: %for.end
+;
+; For -misched=ilpmax, verify that each expression subtree is
+; scheduled independently, and that the imull/adds are clustered.
+;
+; ILPMAX: %for.body
+; ILPMAX: movl %{{.*}}, (
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: movl %{{.*}}, 4(
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: movl %{{.*}}, 8(
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: movl %{{.*}}, 12(
+; ILPMAX: %for.end
 
 define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2, [4 x i32]* noalias nocapture %m3) nounwind uwtable ssp {
-- 
2.34.1
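What follows is a minimal standalone sketch, not part of the patch, of the fractional comparison that ILPValue::operator< implements above: InstrCount/Length ratios are ordered by cross-multiplication in 64 bits, so no floating point is needed. The names ILPValueSketch, Thin, and Wide are illustrative only and do not appear in LLVM.

#include <assert.h>
#include <stdint.h>

struct ILPValueSketch {
  unsigned InstrCount; // instructions in the subtree rooted at the node
  unsigned Length;     // bottom-up depth of the node, in nodes or cycles

  // a/b < c/d  <=>  a*d < c*b, widened to 64 bits to avoid overflow.
  bool operator<(ILPValueSketch RHS) const {
    return (uint64_t)InstrCount * RHS.Length
         < (uint64_t)Length * RHS.InstrCount;
  }
};

int main() {
  // A tall, thin subtree (4 instructions over depth 4, ILP = 1.0) orders
  // below a short, wide one (6 instructions over depth 3, ILP = 2.0).
  ILPValueSketch Thin = {4, 4};
  ILPValueSketch Wide = {6, 3};
  assert(Thin < Wide && !(Wide < Thin));
  return 0;
}

Under this ordering, -misched=ilpmax pops the node with the larger ratio first, exposing parallelism (the clustered imull runs in the ILPMAX checks), while -misched=ilpmin pops the smaller ratio first, which tends to finish one subtree before starting the next and, per the test comment above, keeps register pressure down.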