X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FARM%2FARMScheduleA9.td;h=49fedf63f8bca209b925b0ac6d908deb31dad507;hb=83ec87755ed4d07f6650d6727fb762052bd0041c;hp=14197c824da874574e463ce275f5f177a039068f;hpb=5d42c567c901508e80ab10ddba1bb30a5007d742;p=oota-llvm.git diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 14197c824da..49fedf63f8b 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -16,166 +16,417 @@ // Reference Manual". // // Functional units -def A9_Pipe0 : FuncUnit; // pipeline 0 -def A9_Pipe1 : FuncUnit; // pipeline 1 -def A9_LSPipe : FuncUnit; // LS pipe -def A9_NPipe : FuncUnit; // NEON ALU/MUL pipe +def A9_Issue0 : FuncUnit; // Issue 0 +def A9_Issue1 : FuncUnit; // Issue 1 +def A9_Branch : FuncUnit; // Branch +def A9_ALU0 : FuncUnit; // ALU / MUL pipeline 0 +def A9_ALU1 : FuncUnit; // ALU pipeline 1 +def A9_AGU : FuncUnit; // Address generation unit for ld / st +def A9_NPipe : FuncUnit; // NEON pipeline +def A9_MUX0 : FuncUnit; // AGU + NEON/FPU multiplexer +def A9_LSUnit : FuncUnit; // L/S Unit def A9_DRegsVFP: FuncUnit; // FP register set, VFP side def A9_DRegsN : FuncUnit; // FP register set, NEON side -// Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1 -// +// Bypasses +def A9_LdBypass : Bypass; + def CortexA9Itineraries : ProcessorItineraries< - [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1], [], [ + [A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0, + A9_LSUnit, A9_DRegsVFP, A9_DRegsN], + [A9_LdBypass], [ // Two fully-pipelined integer ALU pipelines // // Move instructions, unconditional - InstrItinData], [1]>, - InstrItinData], [1, 1]>, - InstrItinData], [1, 1]>, - InstrItinData], [2, 2, 1]>, - InstrItinData, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [5]>, // // MVN instructions - InstrItinData], [1]>, - InstrItinData], [1, 1]>, - InstrItinData], [1, 1]>, - InstrItinData], [2, 2, 1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [NoBypass, A9_LdBypass]>, + InstrItinData, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1]>, + InstrItinData, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [3, 1, 1]>, // // No operand cycles - InstrItinData]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>]>, // // Binary Instructions that produce a result - InstrItinData], [2, 2]>, - InstrItinData], [2, 2, 2]>, - InstrItinData], [2, 2, 1]>, - InstrItinData], [2, 2, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [NoBypass, A9_LdBypass]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>, + InstrItinData, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>, + InstrItinData, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>, + InstrItinData, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [3, 1, 1, 1], + [NoBypass, A9_LdBypass, NoBypass, NoBypass]>, // // Bitwise Instructions that produce a result - InstrItinData], [2, 2]>, - InstrItinData], [2, 2, 2]>, - InstrItinData], [2, 2, 1]>, - InstrItinData], [2, 2, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>, + InstrItinData, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>, // // Unary Instructions that produce a result - InstrItinData], [2, 2]>, - InstrItinData], [2, 1]>, + + // CLZ, RBIT, etc. + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + + // BFC, BFI, UBFX, SBFX + InstrItinData, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1]>, + // // Zero and sign extension instructions - InstrItinData], [2, 1]>, - InstrItinData], [3, 1, 1]>, - InstrItinData],[3, 1, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2, 1]>, + InstrItinData, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [3, 1, 1]>, + InstrItinData, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>, // // Compare instructions - InstrItinData], [2]>, - InstrItinData], [2, 2]>, - InstrItinData], [2, 1]>, - InstrItinData], [2, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1], [A9_LdBypass]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [A9_LdBypass, A9_LdBypass]>, + InstrItinData, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [1, 1], [A9_LdBypass, NoBypass]>, + InstrItinData, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>, // // Test instructions - InstrItinData], [2]>, - InstrItinData], [2, 2]>, - InstrItinData], [2, 1]>, - InstrItinData], [2, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>, // // Move instructions, conditional - InstrItinData], [2]>, - InstrItinData], [2, 1]>, - InstrItinData], [2, 1]>, - InstrItinData], [2, 1, 1]>, + // FIXME: Correctly model the extra input dep on the destination. + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>, // Integer multiply pipeline // - InstrItinData, - InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>, - InstrItinData, - InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>, - InstrItinData, - InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>, - InstrItinData, - InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>, - InstrItinData, - InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>, - InstrItinData, - InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>, + InstrItinData, + InstrStage<2, [A9_ALU0]>], [3, 1, 1]>, + InstrItinData, + InstrStage<2, [A9_ALU0]>], + [3, 1, 1, 1]>, + InstrItinData, + InstrStage<2, [A9_ALU0]>], [4, 1, 1]>, + InstrItinData, + InstrStage<2, [A9_ALU0]>], + [4, 1, 1, 1]>, + InstrItinData, + InstrStage<3, [A9_ALU0]>], [4, 5, 1, 1]>, + InstrItinData, + InstrStage<3, [A9_ALU0]>], + [4, 5, 1, 1]>, // Integer load pipeline // FIXME: The timings are some rough approximations // // Immediate offset - InstrItinData, - InstrStage<1, [A9_LSPipe]>], [3, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 1], [A9_LdBypass]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 1], [A9_LdBypass]>, + // FIXME: If address is 64-bit aligned, AGU cycles is 1. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1], [A9_LdBypass]>, // // Register offset - InstrItinData, - InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1], [A9_LdBypass]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 1, 1], [A9_LdBypass]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1, 1], [A9_LdBypass]>, // // Scaled register offset - InstrItinData, - InstrStage<2, [A9_LSPipe]>], [4, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit], 0>], + [4, 1, 1], [A9_LdBypass]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [5, 1, 1], [A9_LdBypass]>, // // Immediate offset with update - InstrItinData, - InstrStage<2, [A9_LSPipe]>], [3, 2, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 2, 1], [A9_LdBypass]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1], [A9_LdBypass]>, // // Register offset with update - InstrItinData, - InstrStage<2, [A9_LSPipe]>], [3, 2, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 2, 1, 1], [A9_LdBypass]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1, 1], [A9_LdBypass]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1, 1], [A9_LdBypass]>, // // Scaled register offset with update - InstrItinData, - InstrStage<2, [A9_LSPipe]>], [4, 3, 1, 1]>, - // - // Load multiple - InstrItinData, - InstrStage<1, [A9_LSPipe]>]>, - + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1, 1], [A9_LdBypass]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [5, 4, 1, 1], [A9_LdBypass]>, + // + // Load multiple, def is the 5th operand. + // FIXME: This assumes 3 to 4 registers. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + // + // Load multiple + update, defs are the 1st and 5th operands. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, // // Load multiple plus branch - InstrItinData, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>, + InstrStage<1, [A9_Branch]>], + [1, 2, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + // + // Pop, def is the 3rd operand. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 3], + [NoBypass, NoBypass, A9_LdBypass]>, + // + // Pop + branch, def is the 3rd operand. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>, + InstrStage<1, [A9_Branch]>], + [1, 1, 3], + [NoBypass, NoBypass, A9_LdBypass]>, // // iLoadi + iALUr for t2LDRpci_pic. - InstrItinData, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [4, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [2, 1]>, // Integer store pipeline /// // Immediate offset - InstrItinData, - InstrStage<1, [A9_LSPipe]>], [3, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1]>, + // FIXME: If address is 64-bit aligned, AGU cycles is 1. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1]>, // // Register offset - InstrItinData, - InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // Scaled register offset - InstrItinData, - InstrStage<2, [A9_LSPipe]>], [3, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // Immediate offset with update - InstrItinData, - InstrStage<1, [A9_LSPipe]>], [2, 3, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>, // // Register offset with update - InstrItinData, - InstrStage<1, [A9_LSPipe]>], [2, 3, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1, 1]>, // // Scaled register offset with update - InstrItinData, - InstrStage<2, [A9_LSPipe]>], [3, 3, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1, 1]>, // // Store multiple - InstrItinData, - InstrStage<1, [A9_LSPipe]>]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<2, [A9_LSUnit]>]>, + // + // Store multiple + update + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<2, [A9_LSUnit]>], [2]>, + + // + // Preload + InstrItinData], [1, 1]>, + // Branch // // no delay slots, so the latency of a branch is unimportant - InstrItinData]>, + InstrItinData, + InstrStage<1, [A9_Issue1], 0>, + InstrStage<1, [A9_Branch]>]>, // VFP and NEON shares the same register file. This means that every VFP // instruction should wait for full completion of the consecutive NEON @@ -195,687 +446,1382 @@ def CortexA9Itineraries : ProcessorItineraries< // Issue through integer pipeline, and execute in NEON unit. // FP Special Register to Integer Register File Move - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe]>], + [1]>, // // Single-precision FP Unary - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Double-precision FP Unary - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Single-precision FP Compare - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 4 cycles InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Double-precision FP Compare - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 4 cycles InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Single to Double FP Convert - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Double to Single FP Convert - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Single to Half FP Convert - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Half to Single FP Convert - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, // // Single-Precision FP to Integer Convert - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Double-Precision FP to Integer Convert - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Integer to Single-Precision FP Convert - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Integer to Double-Precision FP Convert - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Single-precision FP ALU - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, // // Double-precision FP ALU - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, // // Single-precision FP Multiply - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<6, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [5, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [5, 1, 1]>, // // Double-precision FP Multiply - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<7, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 1, 1]>, + InstrStage<2, [A9_NPipe]>], + [6, 1, 1]>, // // Single-precision FP MAC - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<9, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [8, 0, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [8, 1, 1, 1]>, // // Double-precision FP MAC - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<10, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [9, 0, 1, 1]>, + InstrStage<2, [A9_NPipe]>], + [9, 1, 1, 1]>, // // Single-precision FP DIV - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<16, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<10, [A9_NPipe]>], [15, 1, 1]>, + InstrStage<10, [A9_NPipe]>], + [15, 1, 1]>, // // Double-precision FP DIV - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<26, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<20, [A9_NPipe]>], [25, 1, 1]>, + InstrStage<20, [A9_NPipe]>], + [25, 1, 1]>, // // Single-precision FP SQRT - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<18, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<13, [A9_NPipe]>], [17, 1]>, + InstrStage<13, [A9_NPipe]>], + [17, 1]>, // // Double-precision FP SQRT - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<33, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<28, [A9_NPipe]>], [32, 1]>, + InstrStage<28, [A9_NPipe]>], + [32, 1]>, // // Integer to Single-precision Move - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra 1 latency cycle since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Integer to Double-precision Move - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra 1 latency cycle since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1, 1]>, // // Single-precision to Integer Move - InstrItinData, - InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1]>, + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>], + [2, 1]>, // // Double-precision to Integer Move - InstrItinData, - InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>], + [2, 1, 1]>, // // Single-precision FP Load - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, // // Double-precision FP Load - InstrItinData, + // FIXME: Result latency is 1 if address is 64-bit aligned. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1]>, // // FP Load Multiple - InstrItinData, + // FIXME: assumes 2 doubles which requires 2 LS cycles. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, + // + // FP Load Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, // // Single-precision FP Store - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, // // Double-precision FP Store - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, // // FP Store Multiple - InstrItinData, + // FIXME: assumes 2 doubles which requires 2 LS cycles. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, + // + // FP Store Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, // NEON - // Issue through integer pipeline, and execute in NEON unit. - // FIXME: Neon pipeline and LdSt unit are multiplexed. - // Add some syntactic sugar to model this! // VLD1 - // FIXME: We don't model this instruction properly - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, + // VLD1x2 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, + // VLD1x3 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 1]>, + // VLD1x4 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, + // VLD1u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 2, 1]>, + // VLD1x2u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 2, 1]>, + // VLD1x3u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, + // VLD1x4u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 2, 1]>, + // + // VLD1ln + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 1, 1, 1]>, + // + // VLD1lnu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 2, 1, 1, 1, 1]>, + // + // VLD1dup + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1]>, + // + // VLD1dupu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1, 1]>, // // VLD2 - // FIXME: We don't model this instruction properly - InstrItinData, - // Extra latency cycles since wbck is 6 cycles + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, + // + // VLD2x2 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 1]>, + // + // VLD2ln + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 1, 1, 1, 1]>, + // + // VLD2u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1, 1]>, + // + // VLD2x2u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 2, 1]>, + // + // VLD2lnu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 2, 1, 1, 1, 1, 1]>, + // + // VLD2dup + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, + // + // VLD2dupu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1]>, // // VLD3 - // FIXME: We don't model this instruction properly - InstrItinData, - // Extra latency cycles since wbck is 6 cycles - InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>], [2, 2, 2, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, + // + // VLD3ln + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<11,[A9_DRegsVFP], 0, Reserved>, + InstrStage<5, [A9_NPipe], 0>, + InstrStage<5, [A9_LSUnit]>], + [5, 5, 6, 1, 1, 1, 1, 2]>, + // + // VLD3u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1]>, + // + // VLD3lnu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<11,[A9_DRegsVFP], 0, Reserved>, + InstrStage<5, [A9_NPipe], 0>, + InstrStage<5, [A9_LSUnit]>], + [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>, + // + // VLD3dup + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, + // + // VLD3dupu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1, 1]>, // // VLD4 - // FIXME: We don't model this instruction properly - InstrItinData, - // Extra latency cycles since wbck is 6 cycles - InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>], [2, 2, 2, 2, 1]>, - // - // VST - // FIXME: We don't model this instruction properly - InstrItinData, - // Extra latency cycles since wbck is 6 cycles - InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1], 0>, - InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_NPipe]>]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 1]>, + // + // VLD4ln + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>, + // + // VLD4u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 2, 1]>, + // + // VLD4lnu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VLD4dup + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 1]>, + // + // VLD4dupu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 2, 1, 1]>, + // + // VST1 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, + // + // VST1x2 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST1x3 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST1x4 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST1u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1]>, + // + // VST1x2u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST1x3u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST1x4u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST1ln + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, + // + // VST1lnu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1]>, + // + // VST2 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST2x2 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST2u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST2x2u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST2ln + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST2lnu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST3 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST3u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST3ln + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST3lnu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST4 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4u + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST4ln + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4lnu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // // Double-register Integer Unary - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2]>, + InstrStage<1, [A9_NPipe]>], + [4, 2]>, // // Quad-register Integer Unary - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2]>, + InstrStage<1, [A9_NPipe]>], + [4, 2]>, // // Double-register Integer Q-Unary - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Quad-register Integer CountQ-Unary - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, // // Double-register Integer Binary - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, // // Quad-register Integer Binary - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, // // Double-register Integer Subtract - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 1]>, // // Quad-register Integer Subtract - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 1]>, // // Double-register Integer Shift - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [3, 1, 1]>, // // Quad-register Integer Shift - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [3, 1, 1]>, // // Double-register Integer Shift (4 cycle) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, // // Quad-register Integer Shift (4 cycle) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, // // Double-register Integer Binary (4 cycle) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 2]>, // // Quad-register Integer Binary (4 cycle) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 2]>, // // Double-register Integer Subtract (4 cycle) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 1]>, // // Quad-register Integer Subtract (4 cycle) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 1]>, // // Double-register Integer Count - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, // // Quad-register Integer Count // Result written in N3, but that is relative to the last cycle of multicycle, // so we use 4 for those cases - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [4, 2, 2]>, + InstrStage<2, [A9_NPipe]>], + [4, 2, 2]>, // // Double-register Absolute Difference and Accumulate - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [6, 3, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 2, 1]>, // // Quad-register Absolute Difference and Accumulate - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 2, 1]>, // // Double-register Integer Pair Add Long - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [6, 3, 1]>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 1]>, // // Quad-register Integer Pair Add Long - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 3, 1]>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 1]>, // // Double-register Integer Multiply (.8, .16) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [6, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [6, 2, 2]>, // // Quad-register Integer Multiply (.8, .16) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [7, 2, 2]>, + InstrStage<2, [A9_NPipe]>], + [7, 2, 2]>, // // Double-register Integer Multiply (.32) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [7, 2, 1]>, + InstrStage<2, [A9_NPipe]>], + [7, 2, 1]>, // // Quad-register Integer Multiply (.32) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<4, [A9_NPipe]>], [9, 2, 1]>, + InstrStage<4, [A9_NPipe]>], + [9, 2, 1]>, // // Double-register Integer Multiply-Accumulate (.8, .16) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [6, 3, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 2, 2]>, // // Double-register Integer Multiply-Accumulate (.32) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [7, 3, 2, 1]>, + InstrStage<2, [A9_NPipe]>], + [7, 3, 2, 1]>, // // Quad-register Integer Multiply-Accumulate (.8, .16) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [7, 3, 2, 2]>, + InstrStage<2, [A9_NPipe]>], + [7, 3, 2, 2]>, // // Quad-register Integer Multiply-Accumulate (.32) - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<4, [A9_NPipe]>], [9, 3, 2, 1]>, + InstrStage<4, [A9_NPipe]>], + [9, 3, 2, 1]>, + + // + // Move + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1,1]>, // // Move Immediate - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [3]>, + InstrStage<1, [A9_NPipe]>], + [3]>, // // Double-register Permute Move - InstrItinData, - // FIXME: all latencies are arbitrary, no information is available - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_LSPipe]>], [2, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, // // Quad-register Permute Move - // Result written in N2, but that is relative to the last cycle of multicycle, - // so we use 3 for those cases - InstrItinData, - // FIXME: all latencies are arbitrary, no information is available - InstrStage<4, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, // // Integer to Single-precision Move - InstrItinData, - // FIXME: all latencies are arbitrary, no information is available + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, // // Integer to Double-precision Move - InstrItinData, - // FIXME: all latencies are arbitrary, no information is available + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [1, 1, 1]>, // // Single-precision to Integer Move - InstrItinData, - // FIXME: all latencies are arbitrary, no information is available + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, // // Double-precision to Integer Move - InstrItinData, - // FIXME: all latencies are arbitrary, no information is available + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 2, 1]>, // // Integer to Lane Move - InstrItinData, - // FIXME: all latencies are arbitrary, no information is available + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<4, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 1, 1]>, + InstrStage<2, [A9_NPipe]>], + [3, 1, 1]>, // + // Vector narrow move + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 1]>, + // // Double-register FP Unary - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [5, 2]>, + InstrStage<1, [A9_NPipe]>], + [5, 2]>, // // Quad-register FP Unary // Result written in N5, but that is relative to the last cycle of multicycle, // so we use 6 for those cases - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 2]>, + InstrStage<2, [A9_NPipe]>], + [6, 2]>, // // Double-register FP Binary // FIXME: We're using this itin for many instructions and [2, 2] here is too // optimistic. - InstrItinData, - // Extra latency cycles since wbck is 7 cycles + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [5, 2, 2]>, + + // + // VPADD, etc. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [5, 2, 2]>, + InstrStage<1, [A9_NPipe]>], + [5, 1, 1]>, + // + // Double-register FP VMUL + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [5, 2, 1]>, // // Quad-register FP Binary // Result written in N5, but that is relative to the last cycle of multicycle, // so we use 6 for those cases // FIXME: We're using this itin for many instructions and [2, 2] here is too // optimistic. - InstrItinData, - // Extra latency cycles since wbck is 8 cycles + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, + InstrStage<2, [A9_NPipe]>], + [6, 2, 2]>, + // + // Quad-register FP VMUL + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [6, 2, 1]>, // // Double-register FP Multiple-Accumulate - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 2, 1]>, // // Quad-register FP Multiple-Accumulate // Result written in N9, but that is relative to the last cycle of multicycle, // so we use 10 for those cases - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>, + InstrStage<4, [A9_NPipe]>], + [8, 4, 2, 1]>, // // Double-register Reciprical Step - InstrItinData, - // Extra latency cycles since wbck is 7 cycles - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 10 cycles + InstrStage<11, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [9, 2, 2]>, // // Quad-register Reciprical Step - InstrItinData, - // Extra latency cycles since wbck is 9 cycles - InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<4, [A9_NPipe]>], [8, 2, 2]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 11 cycles + InstrStage<12, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [10, 2, 2]>, // // Double-register Permute - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 2, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 2, 1, 1]>, // // Quad-register Permute // Result written in N2, but that is relative to the last cycle of multicycle, // so we use 3 for those cases - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 3, 1, 1]>, + InstrStage<2, [A9_NPipe]>], + [3, 3, 1, 1]>, // // Quad-register Permute (3 cycle issue) // Result written in N2, but that is relative to the last cycle of multicycle, // so we use 4 for those cases - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<3, [A9_LSPipe]>], [4, 4, 1, 1]>, + InstrStage<3, [A9_NPipe]>], + [4, 4, 1, 1]>, // // Double-register VEXT - InstrItinData, - // Extra latency cycles since wbck is 7 cycles + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, + InstrStage<1, [A9_NPipe]>], + [2, 1, 1]>, // // Quad-register VEXT - InstrItinData, - // Extra latency cycles since wbck is 9 cycles + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 1, 1]>, + InstrStage<2, [A9_NPipe]>], + [3, 1, 2]>, // // VTB - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 2, 1]>, - InstrItinData, + InstrStage<2, [A9_NPipe]>], + [3, 2, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 2, 2, 1]>, - InstrItinData, + InstrStage<2, [A9_NPipe]>], + [3, 2, 2, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 1]>, - InstrItinData, + InstrStage<3, [A9_NPipe]>], + [4, 2, 2, 3, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 3, 1]>, + InstrStage<3, [A9_NPipe]>], + [4, 2, 2, 3, 3, 1]>, // // VTBX - InstrItinData, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 1, 2, 1]>, - InstrItinData, + InstrStage<2, [A9_NPipe]>], + [3, 1, 2, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [3, 1, 2, 2, 1]>, - InstrItinData, + InstrStage<2, [A9_NPipe]>], + [3, 1, 2, 2, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<3, [A9_NPipe]>], [4, 1, 2, 2, 3, 1]>, - InstrItinData, + InstrStage<3, [A9_NPipe]>], + [4, 1, 2, 2, 3, 1]>, + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> + InstrStage<2, [A9_NPipe]>], + [4, 1, 2, 2, 3, 3, 1]> ]>;