ARM let processInstruction() tranforms chain.
[oota-llvm.git] / lib / Target / ARM / ARMScheduleA9.td
index c199ef7f2b2a40561b8dd9ad1fb0ed6604e1e90a..49fedf63f8bca209b925b0ac6d908deb31dad507 100644 (file)
@@ -24,8 +24,7 @@ def A9_ALU1    : FuncUnit; // ALU pipeline 1
 def A9_AGU     : FuncUnit; // Address generation unit for ld / st
 def A9_NPipe   : FuncUnit; // NEON pipeline
 def A9_MUX0    : FuncUnit; // AGU + NEON/FPU multiplexer
-def A9_LS0     : FuncUnit; // L/S Units, 32-bit per unit. Fake FU to limit l/s.
-def A9_LS1     : FuncUnit; // L/S Units, 32-bit per unit.
+def A9_LSUnit  : FuncUnit; // L/S Unit
 def A9_DRegsVFP: FuncUnit; // FP register set, VFP side
 def A9_DRegsN  : FuncUnit; // FP register set, NEON side
 
@@ -34,7 +33,7 @@ def A9_LdBypass : Bypass;
 
 def CortexA9Itineraries : ProcessorItineraries<
   [A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0,
-   A9_LS0, A9_LS1, A9_DRegsVFP, A9_DRegsN],
+   A9_LSUnit, A9_DRegsVFP, A9_DRegsN],
   [A9_LdBypass], [
   // Two fully-pipelined integer ALU pipelines
 
@@ -51,6 +50,16 @@ def CortexA9Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_ALU0, A9_ALU1]>,
                                InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_AGU], 0>,
+                               InstrStage<1, [A9_LSUnit]>], [5]>,
   //
   // MVN instructions
   InstrItinData<IIC_iMVNi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
@@ -124,7 +133,8 @@ def CortexA9Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iCMPr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_ALU0, A9_ALU1]>],
                                [1, 1], [A9_LdBypass, A9_LdBypass]>,
-  InstrItinData<IIC_iCMPsi  , [InstrStage<2, [A9_ALU0, A9_ALU1]>],
+  InstrItinData<IIC_iCMPsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>],
                                 [1, 1], [A9_LdBypass, NoBypass]>,
   InstrItinData<IIC_iCMPsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<3, [A9_ALU0, A9_ALU1]>],
@@ -150,6 +160,10 @@ def CortexA9Itineraries : ProcessorItineraries<
                                InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
   InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMOVix2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
 
   // Integer multiply pipeline
   //
@@ -174,113 +188,113 @@ def CortexA9Itineraries : ProcessorItineraries<
   // Immediate offset
   InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<1, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [3, 1], [A9_LdBypass]>,
   InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [4, 1], [A9_LdBypass]>,
   // FIXME: If address is 64-bit aligned, AGU cycles is 1.
   InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [3, 3, 1], [A9_LdBypass]>,
   //
   // Register offset
   InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<1, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [3, 1, 1], [A9_LdBypass]>,
   InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [4, 1, 1], [A9_LdBypass]>,
   InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [3, 3, 1, 1], [A9_LdBypass]>,
   //
   // Scaled register offset
   InstrItinData<IIC_iLoad_si  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<1, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit], 0>],
                                 [4, 1, 1], [A9_LdBypass]>,
   InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [5, 1, 1], [A9_LdBypass]>,
   //
   // Immediate offset with update
   InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<1, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [3, 2, 1], [A9_LdBypass]>,
   InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [4, 3, 1], [A9_LdBypass]>,
   //
   // Register offset with update
   InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<1, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [3, 2, 1, 1], [A9_LdBypass]>,
   InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [4, 3, 1, 1], [A9_LdBypass]>,
   InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [3, 3, 1, 1], [A9_LdBypass]>,
   //
   // Scaled register offset with update
   InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<1, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>],
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
                                 [4, 3, 1, 1], [A9_LdBypass]>,
   InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                   InstrStage<1, [A9_MUX0], 0>,
-                                  InstrStage<2, [A9_AGU]>,
-                                  InstrStage<1, [A9_LS0, A9_LS1]>],
+                                  InstrStage<2, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>],
                                  [5, 4, 1, 1], [A9_LdBypass]>,
   //
   // Load multiple, def is the 5th operand.
   // FIXME: This assumes 3 to 4 registers.
   InstrItinData<IIC_iLoad_m  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
-                                InstrStage<2, [A9_AGU]>,
-                                InstrStage<2, [A9_LS0, A9_LS1]>],
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
                                [1, 1, 1, 1, 3],
                          [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
   //
   // Load multiple + update, defs are the 1st and 5th operands.
   InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
-                                InstrStage<2, [A9_AGU]>,
-                                InstrStage<2, [A9_LS0, A9_LS1]>],
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
                                [2, 1, 1, 1, 3],
                          [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
   //
   // Load multiple plus branch
   InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
-                                InstrStage<1, [A9_AGU]>,
-                                InstrStage<2, [A9_LS0, A9_LS1]>,
+                                InstrStage<1, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>,
                                 InstrStage<1, [A9_Branch]>],
                                [1, 2, 1, 1, 3],
                          [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
@@ -288,16 +302,16 @@ def CortexA9Itineraries : ProcessorItineraries<
   // Pop, def is the 3rd operand.
   InstrItinData<IIC_iPop  ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
-                                InstrStage<2, [A9_AGU]>,
-                                InstrStage<2, [A9_LS0, A9_LS1]>],
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
                                [1, 1, 3],
                                [NoBypass, NoBypass, A9_LdBypass]>,
   //
   // Pop + branch, def is the 3rd operand.
   InstrItinData<IIC_iPop_Br,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
-                                InstrStage<2, [A9_AGU]>,
-                                InstrStage<2, [A9_LS0, A9_LS1]>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>,
                                 InstrStage<1, [A9_Branch]>],
                                [1, 1, 3],
                                [NoBypass, NoBypass, A9_LdBypass]>,
@@ -306,8 +320,8 @@ def CortexA9Itineraries : ProcessorItineraries<
   // iLoadi + iALUr for t2LDRpci_pic.
   InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
-                                InstrStage<1, [A9_AGU]>,
-                                InstrStage<1, [A9_LS0, A9_LS1]>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<1, [A9_LSUnit]>,
                                 InstrStage<1, [A9_ALU0, A9_ALU1]>],
                                [2, 1]>,
 
@@ -316,92 +330,96 @@ def CortexA9Itineraries : ProcessorItineraries<
   // Immediate offset
   InstrItinData<IIC_iStore_i  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<1, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>], [1, 1]>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
   InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>], [1, 1]>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
   // FIXME: If address is 64-bit aligned, AGU cycles is 1.
   InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>], [1, 1]>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
   //
   // Register offset
   InstrItinData<IIC_iStore_r  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<1, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>], [1, 1, 1]>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
   InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>], [1, 1, 1]>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
   InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                  InstrStage<1, [A9_MUX0], 0>,
-                                 InstrStage<2, [A9_AGU]>,
-                                 InstrStage<1, [A9_LS0, A9_LS1]>], [1, 1, 1]>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
   //
   // Scaled register offset
   InstrItinData<IIC_iStore_si ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                   InstrStage<1, [A9_MUX0], 0>,
-                                  InstrStage<1, [A9_AGU]>,
-                                  InstrStage<1, [A9_LS0, A9_LS1]>], [1, 1, 1]>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
   InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                   InstrStage<1, [A9_MUX0], 0>,
-                                  InstrStage<2, [A9_AGU]>,
-                                  InstrStage<1, [A9_LS0, A9_LS1]>], [1, 1, 1]>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
   //
   // Immediate offset with update
   InstrItinData<IIC_iStore_iu ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                   InstrStage<1, [A9_MUX0], 0>,
-                                  InstrStage<1, [A9_AGU]>,
-                                  InstrStage<1, [A9_LS0, A9_LS1]>], [2, 1, 1]>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>,
   InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                   InstrStage<1, [A9_MUX0], 0>,
-                                  InstrStage<2, [A9_AGU]>,
-                                  InstrStage<1, [A9_LS0, A9_LS1]>], [3, 1, 1]>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>,
   //
   // Register offset with update
   InstrItinData<IIC_iStore_ru ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                   InstrStage<1, [A9_MUX0], 0>,
-                                  InstrStage<1, [A9_AGU]>,
-                                  InstrStage<1, [A9_LS0, A9_LS1]>],
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>],
                                  [2, 1, 1, 1]>,
   InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                   InstrStage<1, [A9_MUX0], 0>,
-                                  InstrStage<2, [A9_AGU]>,
-                                  InstrStage<1, [A9_LS0, A9_LS1]>],
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
                                  [3, 1, 1, 1]>,
   InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                   InstrStage<1, [A9_MUX0], 0>,
-                                  InstrStage<2, [A9_AGU]>,
-                                  InstrStage<1, [A9_LS0, A9_LS1]>],
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
                                  [3, 1, 1, 1]>,
   //
   // Scaled register offset with update
   InstrItinData<IIC_iStore_siu,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                     InstrStage<1, [A9_MUX0], 0>,
-                                    InstrStage<1, [A9_AGU]>,
-                                    InstrStage<1, [A9_LS0, A9_LS1]>],
+                                    InstrStage<1, [A9_AGU], 0>,
+                                    InstrStage<1, [A9_LSUnit]>],
                                    [2, 1, 1, 1]>,
   InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                     InstrStage<1, [A9_MUX0], 0>,
-                                    InstrStage<2, [A9_AGU]>,
-                                    InstrStage<1, [A9_LS0, A9_LS1]>],
+                                    InstrStage<2, [A9_AGU], 1>,
+                                    InstrStage<1, [A9_LSUnit]>],
                                    [3, 1, 1, 1]>,
   //
   // Store multiple
   InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
-                                InstrStage<1, [A9_AGU]>,
-                                InstrStage<2, [A9_LS0, A9_LS1]>]>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<2, [A9_LSUnit]>]>,
   //
   // Store multiple + update
   InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
-                                InstrStage<1, [A9_AGU]>,
-                                InstrStage<2, [A9_LS0, A9_LS1]>], [2]>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<2, [A9_LSUnit]>], [2]>,
+
+  //
+  // Preload
+  InstrItinData<IIC_Preload,   [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>,
 
   // Branch
   //
@@ -428,983 +446,1382 @@ def CortexA9Itineraries : ProcessorItineraries<
   // Issue through integer pipeline, and execute in NEON unit.
 
   // FP Special Register to Integer Register File Move
-  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                              InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                              InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                               InstrStage<1, [A9_MUX0], 0>,
-                              InstrStage<1, [A9_NPipe]>]>,
+                              InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                              InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                              InstrStage<1, [A9_NPipe]>],
+                             [1]>,
   //
   // Single-precision FP Unary
-  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra latency cycles since wbck is 2 cycles
                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [1, 1]>,
   //
   // Double-precision FP Unary
-  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra latency cycles since wbck is 2 cycles
                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [1, 1]>,
 
   //
   // Single-precision FP Compare
-  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra latency cycles since wbck is 4 cycles
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [1, 1]>,
   //
   // Double-precision FP Compare
-  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra latency cycles since wbck is 4 cycles
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [1, 1]>,
   //
   // Single to Double FP Convert
-  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 1]>,
   //
   // Double to Single FP Convert
-  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 1]>,
 
   //
   // Single to Half FP Convert
-  InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 1]>,
   //
   // Half to Single FP Convert
-  InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [2, 1]>,
 
   //
   // Single-Precision FP to Integer Convert
-  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 1]>,
   //
   // Double-Precision FP to Integer Convert
-  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 1]>,
   //
   // Integer to Single-Precision FP Convert
-  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 1]>,
   //
   // Integer to Double-Precision FP Convert
-  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 1]>,
   //
   // Single-precision FP ALU
-  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 1, 1]>,
   //
   // Double-precision FP ALU
-  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 1, 1]>,
   //
   // Single-precision FP Multiply
-  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<6, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<6, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [5, 1, 1]>,
   //
   // Double-precision FP Multiply
-  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<7, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<7, [A9_DRegsN],   0, Reserved>,
                                InstrStage<2, [A9_NPipe]>],
                               [6, 1, 1]>,
   //
   // Single-precision FP MAC
-  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
-                              [8, 0, 1, 1]>,
+                              [8, 1, 1, 1]>,
   //
   // Double-precision FP MAC
-  InstrItinData<IIC_fpMAC64 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
-                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
-                               InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
                                InstrStage<2,  [A9_NPipe]>],
-                              [9, 0, 1, 1]>,
+                              [9, 1, 1, 1]>,
   //
   // Single-precision FP DIV
-  InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
-                               InstrStage<16, [A9_DRegsN],  0, Reserved>,
-                               InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<16, [A9_DRegsN],  0, Reserved>,
                                InstrStage<10, [A9_NPipe]>],
                               [15, 1, 1]>,
   //
   // Double-precision FP DIV
-  InstrItinData<IIC_fpDIV64 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
-                               InstrStage<26, [A9_DRegsN],  0, Reserved>,
-                               InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<26, [A9_DRegsN],  0, Reserved>,
                                InstrStage<20, [A9_NPipe]>],
                               [25, 1, 1]>,
   //
   // Single-precision FP SQRT
-  InstrItinData<IIC_fpSQRT32, [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
-                               InstrStage<18, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<18, [A9_DRegsN],   0, Reserved>,
                                InstrStage<13, [A9_NPipe]>],
                               [17, 1]>,
   //
   // Double-precision FP SQRT
-  InstrItinData<IIC_fpSQRT64, [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
-                               InstrStage<33, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<33, [A9_DRegsN],   0, Reserved>,
                                InstrStage<28, [A9_NPipe]>],
                               [32, 1]>,
 
   //
   // Integer to Single-precision Move
-  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra 1 latency cycle since wbck is 2 cycles
                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [1, 1]>,
   //
   // Integer to Double-precision Move
-  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra 1 latency cycle since wbck is 2 cycles
                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [1, 1, 1]>,
   //
   // Single-precision to Integer Move
-  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
-                              [1, 1]>,
+  //
+  // On A9 move-from-VFP is free to issue with no stall if other VFP
+  // operations are in flight. I assume it still can't dual-issue though.
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>],
+                              [2, 1]>,
   //
   // Double-precision to Integer Move
-  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
-                              [1, 1, 1]>,
+  //
+  // On A9 move-from-VFP is free to issue with no stall if other VFP
+  // operations are in flight. I assume it still can't dual-issue though.
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>],
+                              [2, 1, 1]>,
   //
   // Single-precision FP Load
-  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [1, 1]>,
   //
   // Double-precision FP Load
   // FIXME: Result latency is 1 if address is 64-bit aligned.
-  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [2, 1]>,
   //
   // FP Load Multiple
-  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>], [1, 1, 1, 1]>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>,
   //
   // FP Load Multiple + update
-  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>], [2, 1, 1, 1]>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>,
   //
   // Single-precision FP Store
-  InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [1, 1]>,
   //
   // Double-precision FP Store
-  InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [1, 1]>,
   //
   // FP Store Multiple
-  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>], [1, 1, 1, 1]>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>,
   //
   // FP Store Multiple + update
-  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                                InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
-                                InstrStage<1, [A9_NPipe]>], [2, 1, 1, 1]>,
+                                InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                                InstrStage<1, [A9_NPipe], 0>,
+                                InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>,
   // NEON
   // VLD1
-  // FIXME: Conservatively assume insufficent alignment.
-  InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<2, [A9_NPipe]>],
-                              [2, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
   // VLD1x2
-  InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<2, [A9_NPipe]>],
-                              [2, 2, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
   // VLD1x3
-  InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<3, [A9_NPipe]>],
-                              [2, 2, 3, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 1]>,
   // VLD1x4
-  InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<3, [A9_NPipe]>],
-                              [2, 2, 3, 3, 1]>,
-  // VLD1u
-  InstrItinData<IIC_VLD1u,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 1]>,
+  // VLD1u
+  InstrItinData<IIC_VLD1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<2, [A9_NPipe]>],
-                              [2, 2, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 2, 1]>,
   // VLD1x2u
-  InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<2, [A9_NPipe]>],
-                              [2, 2, 2, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 2, 1]>,
   // VLD1x3u
-  InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<3, [A9_NPipe]>],
-                              [2, 2, 3, 2, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 1]>,
   // VLD1x4u
-  InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<3, [A9_NPipe]>],
-                              [2, 2, 3, 3, 2, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 2, 1]>,
   //
-  // VLD2
-  InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 7 cycles
+  // VLD1ln
+  InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 1, 1, 1]>,
+  //
+  // VLD1lnu
+  InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<2, [A9_NPipe]>],
-                              [3, 3, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 2, 1, 1, 1, 1]>,
+  //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1, 1]>,
+  //
+  // VLD2
+  InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1]>,
   //
   // VLD2x2
-  InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<3, [A9_NPipe]>],
-                              [3, 4, 3, 4, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 3, 2, 3, 1]>,
   //
   // VLD2ln
-  InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<3, [A9_NPipe]>],
-                              [4, 4, 1, 1, 1, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 1, 1, 1, 1]>,
   //
   // VLD2u
-  InstrItinData<IIC_VLD2u,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 7 cycles
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<2, [A9_NPipe]>],
-                              [3, 3, 2, 1, 1, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 2, 1, 1, 1]>,
   //
   // VLD2x2u
-  InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<3, [A9_NPipe]>],
-                              [3, 4, 3, 4, 2, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 3, 2, 3, 2, 1]>,
   //
   // VLD2lnu
-  InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<3, [A9_NPipe]>],
-                              [4, 4, 2, 1, 1, 1, 1, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 2, 1, 1, 1, 1, 1]>,
   //
-  // VLD3
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
-                              [2, 2, 2, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1]>,
   //
-  // VLD4
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 2, 1, 1]>,
+  //
+  // VLD3
+  InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
-                              [2, 2, 2, 2, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 1]>,
   //
-  // VST
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VST,      [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // VLD3ln
+  InstrItinData<IIC_VLD3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 1, 1, 1, 1, 2]>,
   //
-  // Double-register Integer Unary
-  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // VLD3u
+  InstrItinData<IIC_VLD3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
-                              [4, 2]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 2, 1]>,
   //
-  // Quad-register Integer Unary
-  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // VLD3lnu
+  InstrItinData<IIC_VLD3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
-                              [4, 2]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>,
   //
-  // Double-register Integer Q-Unary
-  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
-                              [4, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 1]>,
   //
-  // Quad-register Integer CountQ-Unary
-  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
-                              [4, 1]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 2, 1, 1]>,
   //
-  // Double-register Integer Binary
-  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // VLD4
+  InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
-                              [3, 2, 2]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 1]>,
   //
-  // Quad-register Integer Binary
-  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // VLD4ln
+  InstrItinData<IIC_VLD4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>],
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4u
+  InstrItinData<IIC_VLD4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 2, 1]>,
+  //
+  // VLD4lnu
+  InstrItinData<IIC_VLD4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 2, 1, 1]>,
+  //
+  // VST1
+  InstrItinData<IIC_VST1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  //
+  // VST1x2
+  InstrItinData<IIC_VST1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST1x3
+  InstrItinData<IIC_VST1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST1x4
+  InstrItinData<IIC_VST1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1u
+  InstrItinData<IIC_VST1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST1x2u
+  InstrItinData<IIC_VST1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST1x3u
+  InstrItinData<IIC_VST1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST1x4u
+  InstrItinData<IIC_VST1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1ln
+  InstrItinData<IIC_VST1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  //
+  // VST1lnu
+  InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST2
+  InstrItinData<IIC_VST2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2x2
+  InstrItinData<IIC_VST2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2u
+  InstrItinData<IIC_VST2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST2x2u
+  InstrItinData<IIC_VST2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2ln
+  InstrItinData<IIC_VST2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2lnu
+  InstrItinData<IIC_VST2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST3
+  InstrItinData<IIC_VST3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3u
+  InstrItinData<IIC_VST3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST3ln
+  InstrItinData<IIC_VST3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3lnu
+  InstrItinData<IIC_VST3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST4
+  InstrItinData<IIC_VST4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4u
+  InstrItinData<IIC_VST4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4ln
+  InstrItinData<IIC_VST4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4lnu
+  InstrItinData<IIC_VST4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+
+  //
+  // Double-register Integer Unary
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Quad-register Integer CountQ-Unary
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
                               [3, 2, 2]>,
   //
-  // Double-register Integer Subtract
-  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  // Quad-register Integer Binary
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [3, 2, 1]>,
   //
   // Quad-register Integer Subtract
-  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [3, 2, 1]>,
   //
   // Double-register Integer Shift
-  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [3, 1, 1]>,
   //
   // Quad-register Integer Shift
-  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [3, 1, 1]>,
   //
   // Double-register Integer Shift (4 cycle)
-  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 1, 1]>,
   //
   // Quad-register Integer Shift (4 cycle)
-  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 1, 1]>,
   //
   // Double-register Integer Binary (4 cycle)
-  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 2, 2]>,
   //
   // Quad-register Integer Binary (4 cycle)
-  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 2, 2]>,
   //
   // Double-register Integer Subtract (4 cycle)
-  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 2, 1]>,
   //
   // Quad-register Integer Subtract (4 cycle)
-  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [4, 2, 1]>,
 
   //
   // Double-register Integer Count
-  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [3, 2, 2]>,
   //
   // Quad-register Integer Count
   // Result written in N3, but that is relative to the last cycle of multicycle,
   // so we use 4 for those cases
-  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [4, 2, 2]>,
   //
   // Double-register Absolute Difference and Accumulate
-  InstrItinData<IIC_VABAD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [6, 3, 2, 1]>,
   //
   // Quad-register Absolute Difference and Accumulate
-  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [6, 3, 2, 1]>,
   //
   // Double-register Integer Pair Add Long
-  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [6, 3, 1]>,
   //
   // Quad-register Integer Pair Add Long
-  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [6, 3, 1]>,
 
   //
   // Double-register Integer Multiply (.8, .16)
-  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [6, 2, 2]>,
   //
   // Quad-register Integer Multiply (.8, .16)
-  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [7, 2, 2]>,
 
   //
   // Double-register Integer Multiply (.32)
-  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [7, 2, 1]>,
   //
   // Quad-register Integer Multiply (.32)
-  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 9 cycles
                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<4, [A9_NPipe]>],
                               [9, 2, 1]>,
   //
   // Double-register Integer Multiply-Accumulate (.8, .16)
-  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [6, 3, 2, 2]>,
   //
   // Double-register Integer Multiply-Accumulate (.32)
-  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [7, 3, 2, 1]>,
   //
   // Quad-register Integer Multiply-Accumulate (.8, .16)
-  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [7, 3, 2, 2]>,
   //
   // Quad-register Integer Multiply-Accumulate (.32)
-  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 9 cycles
                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<4, [A9_NPipe]>],
                               [9, 3, 2, 1]>,
 
   //
   // Move
-  InstrItinData<IIC_VMOV,     [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VMOV,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [1,1]>,
   //
   // Move Immediate
-  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [3]>,
   //
   // Double-register Permute Move
-  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [2, 1]>,
   //
   // Quad-register Permute Move
-  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [2, 1]>,
   //
   // Integer to Single-precision Move
-  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
-                              [2, 1]>,
+                              [1, 1]>,
   //
   // Integer to Double-precision Move
-  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
-                              [2, 1, 1]>,
+                              [1, 1, 1]>,
   //
   // Single-precision to Integer Move
-  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [2, 1]>,
   //
   // Double-precision to Integer Move
-  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [2, 2, 1]>,
   //
   // Integer to Lane Move
-  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
-                               InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<2, [A9_NPipe]>],
                               [3, 1, 1]>,
 
   //
   // Vector narrow move
-  InstrItinData<IIC_VMOVN,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMOVN,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [3, 1]>,
   //
   // Double-register FP Unary
-  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [5, 2]>,
   //
   // Quad-register FP Unary
   // Result written in N5, but that is relative to the last cycle of multicycle,
   // so we use 6 for those cases
-  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [6, 2]>,
   //
   // Double-register FP Binary
   // FIXME: We're using this itin for many instructions and [2, 2] here is too
   // optimistic.
-  InstrItinData<IIC_VBIND,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 7 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [5, 2, 2]>,
+
+  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 1, 1]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2, 1]>,
   //
   // Quad-register FP Binary
   // Result written in N5, but that is relative to the last cycle of multicycle,
   // so we use 6 for those cases
   // FIXME: We're using this itin for many instructions and [2, 2] here is too
   // optimistic.
-  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 8 cycles
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<2, [A9_NPipe]>],
                               [6, 2, 2]>,
   //
-  // Double-register FP Multiple-Accumulate
-  InstrItinData<IIC_VMACD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 2, 1]>,
+  //
+  // Double-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<2, [A9_NPipe]>],
                               [6, 3, 2, 1]>,
   //
   // Quad-register FP Multiple-Accumulate
   // Result written in N9, but that is relative to the last cycle of multicycle,
   // so we use 10 for those cases
-  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 9 cycles
                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<4, [A9_NPipe]>],
                               [8, 4, 2, 1]>,
   //
   // Double-register Reciprical Step
-  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 7 cycles
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<2, [A9_NPipe]>],
-                              [6, 2, 2]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 10 cycles
+                               InstrStage<11, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [9, 2, 2]>,
   //
   // Quad-register Reciprical Step
-  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 9 cycles
-                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<4, [A9_NPipe]>],
-                              [8, 2, 2]>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 11 cycles
+                               InstrStage<12, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [10, 2, 2]>,
   //
   // Double-register Permute
-  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_NPipe]>],
                               [2, 2, 1, 1]>,
   //
   // Quad-register Permute
   // Result written in N2, but that is relative to the last cycle of multicycle,
   // so we use 3 for those cases
-  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [3, 3, 1, 1]>,
   //
   // Quad-register Permute (3 cycle issue)
   // Result written in N2, but that is relative to the last cycle of multicycle,
   // so we use 4 for those cases
-  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 8 cycles
                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<3, [A9_NPipe]>],
                               [4, 4, 1, 1]>,
 
   //
   // Double-register VEXT
-  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 7 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<1, [A9_NPipe]>],
                               [2, 1, 1]>,
   //
   // Quad-register VEXT
-  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 9 cycles
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
                                InstrStage<2, [A9_NPipe]>],
-                              [3, 1, 1]>,
+                              [3, 1, 2]>,
   //
   // VTB
-  InstrItinData<IIC_VTB1,     [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [3, 2, 1]>,
-  InstrItinData<IIC_VTB2,     [InstrStage<2, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<2, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [3, 2, 2, 1]>,
-  InstrItinData<IIC_VTB3,     [InstrStage<2, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<2, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 8 cycles
                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<3, [A9_NPipe]>],
                               [4, 2, 2, 3, 1]>,
-  InstrItinData<IIC_VTB4,     [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 8 cycles
                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<3, [A9_NPipe]>],
                               [4, 2, 2, 3, 3, 1]>,
   //
   // VTBX
-  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [3, 1, 2, 1]>,
-  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [3, 1, 2, 2, 1]>,
-  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 8 cycles
                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<3, [A9_NPipe]>],
                               [4, 1, 2, 2, 3, 1]>,
-  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 8 cycles
                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<2, [A9_NPipe]>],
                               [4, 1, 2, 2, 3, 3, 1]>
 ]>;