const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
-
+ AluInstCount = 0;
+ FetchInstCount = 0;
}
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
QSrc.clear();
}
+/// Computes the integer quotient 248 / \p GPRCount.
+/// NOTE(review): 248 is presumably the number of GPRs that wavefronts can
+/// share, making the result a wavefront-count ceiling — confirm against the
+/// hardware documentation.
+/// \param GPRCount GPRs required; must be non-zero (checked by an assert).
+static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+ assert (GPRCount && "GPRCount cannot be 0");
+ const unsigned GPRBudget = 248;
+ return GPRBudget / GPRCount;
+}
+
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
SUnit *SU = 0;
NextInstKind = IDOther;
bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
(!Available[IDFetch].empty() || !Available[IDOther].empty());
+ if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+ // We use the heuristic provided by AMD Accelerated Parallel Processing
+ // OpenCL Programming Guide :
+ // The approx. number of WF that allows TEX inst to hide ALU inst is :
+ // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
+ // The ratio is computed in floating point: dividing the two integer counts
+ // directly would truncate it (down to 0 whenever ALU < Fetch). The
+ // denominator cannot be 0 because Available[IDFetch] is not empty here.
+ float ALUFetchRationEstimate =
+ static_cast<float>(AluInstCount + AvailablesAluCount() +
+ Pending[IDAlu].size()) /
+ (FetchInstCount + Available[IDFetch].size());
+ if (ALUFetchRationEstimate <= 0.0f) {
+ // No ALU instruction is counted, available or pending: the formula above
+ // would require an unbounded number of wavefronts (and converting
+ // 62.5f / 0 to unsigned is undefined), so always allow the switch.
+ AllowSwitchFromAlu = true;
+ } else {
+ unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+ DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+ // We assume the local GPR requirements to be "dominated" by the requirement
+ // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
+ // after TEX are indeed likely to consume or generate values from/for the
+ // TEX clause.
+ // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+ // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+ // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
+ // (TODO : use RegisterPressure)
+ // If we are going to use too many GPR, we flush Fetch instruction to lower
+ // register pressure on 128 bits regs.
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+ if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+ AllowSwitchFromAlu = true;
+ }
+ }
+
+
// We want to schedule AR defs as soon as possible to make sure they aren't
// put in a different ALU clause from their uses.
if (!SU && !UnscheduledARDefs.empty()) {
}
if (CurInstKind == IDAlu) {
+ AluInstCount ++;
switch (getAluKind(SU)) {
case AluT_XYZW:
CurEmitted += 4;
if (CurInstKind != IDFetch) {
MoveUnits(Pending[IDFetch], Available[IDFetch]);
- }
+ } else
+ FetchInstCount++;
}
static bool
return UnslotedSU;
}
-bool R600SchedStrategy::isAvailablesAluEmpty() const {
- return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
- AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
- AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
- AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
- AvailableAlus[AluPredX].empty();
+/// Returns the total number of entries held across the AvailableAlus queues
+/// (AluAny, AluT_XYZW, AluT_X, AluT_Y, AluT_Z, AluT_W, AluDiscarded and
+/// AluPredX). Pending[IDAlu] is not included.
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+ unsigned Total = 0;
+ Total += AvailableAlus[AluAny].size();
+ Total += AvailableAlus[AluT_XYZW].size();
+ Total += AvailableAlus[AluT_X].size();
+ Total += AvailableAlus[AluT_Y].size();
+ Total += AvailableAlus[AluT_Z].size();
+ Total += AvailableAlus[AluT_W].size();
+ Total += AvailableAlus[AluDiscarded].size();
+ Total += AvailableAlus[AluPredX].size();
+ return Total;
}
SUnit* R600SchedStrategy::pickAlu() {
- while (!isAvailablesAluEmpty()) {
+ while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
if (!OccupedSlotsMask) {
// Bottom up scheduling : predX must come first
if (!AvailableAlus[AluPredX].empty()) {