From 5a50ceeaea980962c1982ad535226c7ab06c971c Mon Sep 17 00:00:00 2001
From: Evan Cheng
Date: Thu, 7 Oct 2010 01:50:48 +0000
Subject: [PATCH] Model operand cycles of vldm / vstm; also fixes scheduling
 itineraries of vldr / vstr, etc.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@115898 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMBaseInstrInfo.cpp | 91 ++++++++++++++++++++++++++---
 lib/Target/ARM/ARMInstrNEON.td      |  4 +-
 lib/Target/ARM/ARMInstrVFP.td       | 16 ++---
 lib/Target/ARM/ARMSchedule.td       |  6 +-
 lib/Target/ARM/ARMScheduleA8.td     | 36 +++++++-----
 lib/Target/ARM/ARMScheduleA9.td     | 22 +++++--
 lib/Target/ARM/ARMScheduleV6.td     | 10 +++-
 7 files changed, 143 insertions(+), 42 deletions(-)

diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index fcb422fcde6..823126be30c 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1667,6 +1667,41 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     default:
       DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
       break;
+    case ARM::VLDMD:
+    case ARM::VLDMS:
+    case ARM::VLDMD_UPD:
+    case ARM::VLDMS_UPD: {
+      int RegNo = (int)(DefIdx+1) - DefTID.getNumOperands() + 1;
+      if (RegNo <= 0) {
+        // Def is the address writeback.
+        DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+        break;
+      }
+      if (Subtarget.isCortexA8()) {
+        // (regno / 2) + (regno % 2) + 1
+        DefCycle = RegNo / 2 + 1;
+        if (RegNo % 2)
+          ++DefCycle;
+      } else if (Subtarget.isCortexA9()) {
+        DefCycle = RegNo;
+        bool isSLoad = false;
+        switch (DefTID.getOpcode()) {
+        default: break;
+        case ARM::VLDMS:
+        case ARM::VLDMS_UPD:
+          isSLoad = true;
+          break;
+        }
+        // If there is an odd number of 'S' registers or the access is not
+        // 64-bit aligned, it takes an extra cycle.
+        if ((isSLoad && (RegNo % 2)) || DefAlign < 8)
+          ++DefCycle;
+      } else {
+        // Assume the worst.
+        DefCycle = RegNo + 2;
+      }
+      break;
+    }
     case ARM::LDM_RET:
     case ARM::LDM:
     case ARM::LDM_UPD:
@@ -1677,7 +1712,12 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     case ARM::t2LDM:
     case ARM::t2LDM_UPD: {
       LdmBypass = 1;
-      unsigned RegNo = (DefIdx+1) - DefTID.getNumOperands() + 1;
+      int RegNo = (int)(DefIdx+1) - DefTID.getNumOperands() + 1;
+      if (RegNo <= 0) {
+        // Def is the address writeback.
+        DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+        break;
+      }
       if (Subtarget.isCortexA8()) {
         // 4 registers would be issued: 1, 2, 1.
         // 5 registers would be issued: 1, 2, 2.
@@ -1710,6 +1750,40 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     default:
       UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
       break;
+    case ARM::VSTMD:
+    case ARM::VSTMS:
+    case ARM::VSTMD_UPD:
+    case ARM::VSTMS_UPD: {
+      int RegNo = (int)(UseIdx+1) - UseTID.getNumOperands() + 1;
+      if (RegNo <= 0) {
+        UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
+        break;
+      }
+      if (Subtarget.isCortexA8()) {
+        // (regno / 2) + (regno % 2) + 1
+        UseCycle = RegNo / 2 + 1;
+        if (RegNo % 2)
+          ++UseCycle;
+      } else if (Subtarget.isCortexA9()) {
+        UseCycle = RegNo;
+        bool isSStore = false;
+        switch (UseTID.getOpcode()) {
+        default: break;
+        case ARM::VSTMS:
+        case ARM::VSTMS_UPD:
+          isSStore = true;
+          break;
+        }
+        // If there is an odd number of 'S' registers or the access is not
+        // 64-bit aligned, it takes an extra cycle.
+        if ((isSStore && (RegNo % 2)) || UseAlign < 8)
+          ++UseCycle;
+      } else {
+        // Assume the worst.
+        UseCycle = RegNo + 2;
+      }
+      break;
+    }
     case ARM::STM:
     case ARM::STM_UPD:
     case ARM::tSTM_UPD:
@@ -1717,14 +1791,16 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     case ARM::tPOP:
     case ARM::t2STM:
     case ARM::t2STM_UPD: {
-      unsigned RegNo = UseIdx - UseTID.getNumOperands() + 1;
+      int RegNo = (int)(UseIdx+1) - UseTID.getNumOperands() + 1;
+      if (RegNo <= 0) {
+        UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
+        break;
+      }
       if (Subtarget.isCortexA8()) {
-        // 4 registers would be issued: 1, 2, 1.
-        // 5 registers would be issued: 1, 2, 2.
         UseCycle = RegNo / 2;
         if (UseCycle < 2)
           UseCycle = 2;
-        // Result latency is issue cycle + 2: E2.
+        // Read in E3.
         UseCycle += 2;
       } else if (Subtarget.isCortexA9()) {
         UseCycle = (RegNo / 2);
@@ -1732,12 +1808,11 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
         // then it takes an extra AGU (Address Generation Unit) cycle.
         if ((RegNo % 2) || UseAlign < 8)
           ++UseCycle;
-        // Result latency is AGU cycles + 2.
-        UseCycle += 2;
       } else {
         // Assume the worst.
-        UseCycle = RegNo + 2;
+        UseCycle = 1;
       }
+      break;
    }
    }
 
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index e3ff884ee3c..6f3f6a0f0f8 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -132,13 +132,13 @@ def nModImm : Operand<i32> {
 // Use VLDM to load a Q register as a D register pair.
 // This is a pseudo instruction that is expanded to VLDMD after reg alloc.
 def VLDMQ
-  : PseudoVFPLdStM<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoadm, "",
+  : PseudoVFPLdStM<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoad_m, "",
                    [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]>;
 
 // Use VSTM to store a Q register as a D register pair.
 // This is a pseudo instruction that is expanded to VSTMD after reg alloc.
 def VSTMQ
-  : PseudoVFPLdStM<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStorem, "",
+  : PseudoVFPLdStM<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStore_m, "",
                    [(store (v2f64 QPR:$src), addrmode4:$addr)]>;
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 53d181b13b1..24344008f30 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -78,20 +78,20 @@ def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr),
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 def VLDMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts,
-                           variable_ops), IndexModeNone, IIC_fpLoadm,
+                           variable_ops), IndexModeNone, IIC_fpLoad_m,
                   "vldm${addr:submode}${p}\t$addr, $dsts", "", []> {
   let Inst{20} = 1;
 }
 
 def VLDMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts,
-                           variable_ops), IndexModeNone, IIC_fpLoadm,
+                           variable_ops), IndexModeNone, IIC_fpLoad_m,
                   "vldm${addr:submode}${p}\t$addr, $dsts", "", []> {
   let Inst{20} = 1;
 }
 
 def VLDMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                        reglist:$dsts, variable_ops),
-                      IndexModeUpd, IIC_fpLoadm,
+                      IndexModeUpd, IIC_fpLoad_mu,
                       "vldm${addr:submode}${p}\t$addr!, $dsts",
                       "$addr.addr = $wb", []> {
   let Inst{20} = 1;
@@ -99,7 +99,7 @@ def VLDMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
 
 def VLDMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                        reglist:$dsts, variable_ops),
-                      IndexModeUpd, IIC_fpLoadm,
+                      IndexModeUpd, IIC_fpLoad_mu,
                       "vldm${addr:submode}${p}\t$addr!, $dsts",
                       "$addr.addr = $wb", []> {
   let Inst{20} = 1;
@@ -108,20 +108,20 @@ def VLDMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
 
 let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
 def VSTMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs,
-                           variable_ops), IndexModeNone, IIC_fpStorem,
+                           variable_ops), IndexModeNone, IIC_fpStore_m,
                   "vstm${addr:submode}${p}\t$addr, $srcs", "", []> {
   let Inst{20} = 0;
 }
 
 def VSTMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs,
-                           variable_ops), IndexModeNone, IIC_fpStorem,
+                           variable_ops), IndexModeNone, IIC_fpStore_m,
                   "vstm${addr:submode}${p}\t$addr, $srcs", "", []> {
   let Inst{20} = 0;
 }
 
 def VSTMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                        reglist:$srcs, variable_ops),
-                      IndexModeUpd, IIC_fpStorem,
+                      IndexModeUpd, IIC_fpStore_mu,
                       "vstm${addr:submode}${p}\t$addr!, $srcs",
                       "$addr.addr = $wb", []> {
   let Inst{20} = 0;
@@ -129,7 +129,7 @@ def VSTMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
 
 def VSTMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                        reglist:$srcs, variable_ops),
-                      IndexModeUpd, IIC_fpStorem,
+                      IndexModeUpd, IIC_fpStore_mu,
                       "vstm${addr:submode}${p}\t$addr!, $srcs",
                       "$addr.addr = $wb", []> {
   let Inst{20} = 0;
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index b7ce8322ba2..d4abc3534de 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -120,10 +120,12 @@ def IIC_fpSQRT32 : InstrItinClass;
 def IIC_fpSQRT64 : InstrItinClass;
 def IIC_fpLoad32 : InstrItinClass;
 def IIC_fpLoad64 : InstrItinClass;
-def IIC_fpLoadm : InstrItinClass<0>; // micro-coded
+def IIC_fpLoad_m : InstrItinClass<0>; // micro-coded
+def IIC_fpLoad_mu : InstrItinClass<0>; // micro-coded
 def IIC_fpStore32 : InstrItinClass;
 def IIC_fpStore64 : InstrItinClass;
-def IIC_fpStorem : InstrItinClass<0>; // micro-coded
+def IIC_fpStore_m : InstrItinClass<0>; // micro-coded
+def IIC_fpStore_mu : InstrItinClass<0>; // micro-coded
 def IIC_VLD1 : InstrItinClass;
 def IIC_VLD2 : InstrItinClass;
 def IIC_VLD3 : InstrItinClass;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index 714bf2e6353..ac4da75e960 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -414,54 +414,58 @@ def CortexA8Itineraries : ProcessorItineraries<
   InstrItinData<IIC_fpLoad32, [InstrStage<2, [A8_Issue], 0>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>],
+                               InstrStage<2, [A8_NLSPipe]>],
                               [2, 1]>,
   //
   // Double-precision FP Load
   // use A8_Issue to enforce the 1 load/store per cycle limit
   InstrItinData<IIC_fpLoad64, [InstrStage<2, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0], 0>,
-                               InstrStage<1, [A8_Pipe1]>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>],
+                               InstrStage<2, [A8_NLSPipe]>],
                               [2, 1]>,
   //
   // FP Load Multiple
   // use A8_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpLoadm, [InstrStage<3, [A8_Issue], 0>,
-                              InstrStage<2, [A8_Pipe0], 0>,
-                              InstrStage<2, [A8_Pipe1]>,
+  InstrItinData<IIC_fpLoad_m, [InstrStage<2, [A8_Issue], 0>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+                               InstrStage<1, [A8_NLSPipe]>], [1, 1, 1, 2]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<2, [A8_Issue], 0>,
+                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_LdSt0], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1, 1, 1, 2]>,
   //
   // Single-precision FP Store
   // use A8_Issue to enforce the 1 load/store per cycle limit
   InstrItinData<IIC_fpStore32,[InstrStage<2, [A8_Issue], 0>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>],
+                               InstrStage<2, [A8_NLSPipe]>],
                               [1, 1]>,
   //
   // Double-precision FP Store
   // use A8_Issue to enforce the 1 load/store per cycle limit
   InstrItinData<IIC_fpStore64,[InstrStage<2, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0], 0>,
-                               InstrStage<1, [A8_Pipe1]>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>],
+                               InstrStage<2, [A8_NLSPipe]>],
                               [1, 1]>,
   //
   // FP Store Multiple
   // use A8_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpStorem, [InstrStage<3, [A8_Issue], 0>,
-                               InstrStage<2, [A8_Pipe0], 0>,
-                               InstrStage<2, [A8_Pipe1]>,
+  InstrItinData<IIC_fpStore_m,[InstrStage<2, [A8_Issue], 0>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+                               InstrStage<1, [A8_NLSPipe]>], [1, 1, 1, 1]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<2, [A8_Issue], 0>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                InstrStage<1, [A8_LdSt0], 0>,
+                                InstrStage<1, [A8_NLSPipe]>], [2, 1, 1, 1, 1]>,
 
   // NEON
   // Issue through integer pipeline, and execute in NEON unit.
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 6dd27157030..27745e65300 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -629,11 +629,18 @@ def CortexA9Itineraries : ProcessorItineraries<
                               [2, 1]>,
   //
   // FP Load Multiple
-  InstrItinData<IIC_fpLoadm, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
                               InstrStage<2, [A9_DRegsN], 0, Reserved>,
                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                               InstrStage<1, [A9_MUX0], 0>,
-                              InstrStage<1, [A9_NPipe]>]>,
+                              InstrStage<1, [A9_NPipe]>], [1, 1, 1, 1]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN], 0, Reserved>,
+                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_NPipe]>], [2, 1, 1, 1]>,
   //
   // Single-precision FP Store
   InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
@@ -652,11 +659,18 @@ def CortexA9Itineraries : ProcessorItineraries<
                               [1, 1]>,
   //
   // FP Store Multiple
-  InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
                               InstrStage<2, [A9_DRegsN], 0, Reserved>,
                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                               InstrStage<1, [A9_MUX0], 0>,
-                              InstrStage<1, [A9_NPipe]>]>,
+                              InstrStage<1, [A9_NPipe]>], [1, 1, 1, 1]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                                InstrStage<2, [A9_DRegsN], 0, Reserved>,
+                                InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_NPipe]>], [2, 1, 1, 1]>,
   // NEON
   // Issue through integer pipeline, and execute in NEON unit.
   // VLD1
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td
index f1dbdbc5a71..b845130e370 100644
--- a/lib/Target/ARM/ARMScheduleV6.td
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -254,7 +254,10 @@ def ARMV6Itineraries : ProcessorItineraries<
   InstrItinData<IIC_fpLoad64, [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
   //
   // FP Load Multiple
-  InstrItinData<IIC_fpLoadm, [InstrStage<3, [V6_Pipe]>]>,
+  InstrItinData<IIC_fpLoad_m, [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 5]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 1, 1, 5]>,
   //
   // Single-precision FP Store
   InstrItinData<IIC_fpStore32,[InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
@@ -264,5 +267,8 @@ def ARMV6Itineraries : ProcessorItineraries<
   InstrItinData<IIC_fpStore64,[InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
   //
   // FP Store Multiple
-  InstrItinData<IIC_fpStorem, [InstrStage<3, [V6_Pipe]>]>
+  InstrItinData<IIC_fpStore_m,[InstrStage<3, [V6_Pipe]>], [2, 2, 2, 2]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 2, 2, 2]>
 ]>;
-- 
2.34.1
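
A quick way to sanity-check the per-register formulas in the ARMBaseInstrInfo.cpp hunks above is to model them standalone. The C++ sketch below mirrors the VLDM def-cycle computation from the patch: RegNo is the 1-based position of a register in the vldm transfer list, Align is the access alignment in bytes, and IsSReg marks the 'S'-register forms (VLDMS / VLDMS_UPD). The Cpu enum, the function name, and the small table printed by main are illustrative stand-ins, not LLVM API.

#include <cstdio>

// Models the cycle in which the RegNo-th register loaded by a vldm
// becomes available, following the subtarget cases in the patch.
enum class Cpu { CortexA8, CortexA9, Other };

static int vldmDefCycle(Cpu cpu, int RegNo, bool IsSReg, unsigned Align) {
  int DefCycle = 0;
  switch (cpu) {
  case Cpu::CortexA8:
    // (regno / 2) + (regno % 2) + 1: two registers complete per cycle,
    // with one extra cycle when the position is odd.
    DefCycle = RegNo / 2 + 1;
    if (RegNo % 2)
      ++DefCycle;
    break;
  case Cpu::CortexA9:
    // Roughly one register per cycle; an odd 'S'-register position or
    // an access that is not 64-bit aligned costs an extra cycle.
    DefCycle = RegNo;
    if ((IsSReg && (RegNo % 2)) || Align < 8)
      ++DefCycle;
    break;
  case Cpu::Other:
    // Assume the worst, as the patch does for unknown subtargets.
    DefCycle = RegNo + 2;
    break;
  }
  return DefCycle;
}

int main() {
  // Cycle at which each of the first four D registers of a 64-bit
  // aligned vldm becomes available, per subtarget model.
  for (int RegNo = 1; RegNo <= 4; ++RegNo)
    std::printf("reg %d: A8=%d A9=%d other=%d\n", RegNo,
                vldmDefCycle(Cpu::CortexA8, RegNo, false, 8),
                vldmDefCycle(Cpu::CortexA9, RegNo, false, 8),
                vldmDefCycle(Cpu::Other, RegNo, false, 8));
  return 0;
}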
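
The bracketed lists that the itinerary entries gain in this patch ([1, 1, 1, 2], [2, 1, 1, 1, 2], and so on) are operand-cycle lists: entry k gives the cycle at which machine operand k is defined or read, and registers in the variadic list beyond the last entry are handled by the RegNo adjustments above. Under the usual itinerary convention, a def in cycle D feeding a use in cycle U yields an operand latency of roughly D - U + 1. A minimal sketch of that combination follows; the cycle lists in main are hypothetical stand-ins, not copied from any specific CPU table above.

#include <algorithm>
#include <cstdio>
#include <vector>

// Combines a def-operand cycle with a use-operand cycle into a latency,
// using the D - U + 1 convention (clamped at 1) that itinerary-based
// latency computations follow.
static int operandLatency(const std::vector<int> &DefCycles, unsigned DefIdx,
                          const std::vector<int> &UseCycles, unsigned UseIdx) {
  int DefCycle = DefCycles.at(DefIdx); // cycle the value is produced
  int UseCycle = UseCycles.at(UseIdx); // cycle the consumer reads it
  return std::max(1, DefCycle - UseCycle + 1);
}

int main() {
  // Hypothetical "FP load multiple + update" style def list: address
  // writeback defined at cycle 2, base and predicate operands read at
  // cycle 1, first loaded register defined at cycle 2.
  std::vector<int> vldmDefs = {2, 1, 1, 1, 2};
  // Hypothetical consumer reading its source operand in cycle 1.
  std::vector<int> useCycles = {3, 1, 1};
  std::printf("latency = %d\n", operandLatency(vldmDefs, 4, useCycles, 1));
  return 0;
}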