Add operand cycles for vldr / vstr.
[oota-llvm.git] / lib / Target / ARM / ARMScheduleA9.td
1 //=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the itinerary class data for the ARM Cortex A9 processors.
11 //
12 //===----------------------------------------------------------------------===//
13
14 //
15 // Ad-hoc scheduling information derived from the rather vague "Cortex-A9
16 // Technical Reference Manual".
17 //
18 // Functional units
19 def A9_Pipe0   : FuncUnit; // Integer pipeline 0 (dual issue with Pipe1)
20 def A9_Pipe1   : FuncUnit; // Integer pipeline 1 (dual issue with Pipe0)
21 def A9_AGU     : FuncUnit; // Address generation unit for ld / st
22 def A9_NPipe   : FuncUnit; // NEON ALU/MUL pipeline
23 def A9_DRegsVFP: FuncUnit; // FP register set, VFP side (artificial FU)
24 def A9_DRegsN  : FuncUnit; // FP register set, NEON side (artificial FU)
25 def A9_MUX0    : FuncUnit; // AGU + NEON/FPU multiplexer
26
27 // Bypasses
28 def A9_LdBypass : Bypass; // Named in load / ALU / MVN / compare bypass lists
29
30 // Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1
31 //
32 def CortexA9Itineraries : ProcessorItineraries<
33   [A9_Pipe0, A9_Pipe1, A9_AGU, A9_NPipe, A9_DRegsVFP, A9_DRegsN, A9_MUX0],
34   [A9_LdBypass], [
35   // Two fully-pipelined integer ALU pipelines
36
37   //
38   // Move instructions, unconditional
39   InstrItinData<IIC_iMOVi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
40   InstrItinData<IIC_iMOVr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
41   InstrItinData<IIC_iMOVsi  , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
42   InstrItinData<IIC_iMOVsr  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
43   InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
44                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
45   //
46   // MVN instructions
47   InstrItinData<IIC_iMVNi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
48                               [1]>,
49   InstrItinData<IIC_iMVNr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
50                               [1, 1], [NoBypass, A9_LdBypass]>,
51   InstrItinData<IIC_iMVNsi  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
52                               [2, 1]>,
53   InstrItinData<IIC_iMVNsr  , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>],
54                               [3, 1, 1]>,
55   //
56   // No operand cycles
57   InstrItinData<IIC_iALUx   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
58   //
59   // Binary Instructions that produce a result
60   InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
61                             [1, 1], [NoBypass, A9_LdBypass]>,
62   InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
63                             [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>,
64   InstrItinData<IIC_iALUsi, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
65                             [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>,
66   InstrItinData<IIC_iALUsir,[InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
67                             [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>,
68   InstrItinData<IIC_iALUsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>],
69                             [3, 1, 1, 1],
70                             [NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
71   //
72   // Bitwise Instructions that produce a result
73   InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
74   InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1, 1]>,
75   InstrItinData<IIC_iBITsi, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
76   InstrItinData<IIC_iBITsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [3, 1, 1, 1]>,
77   //
78   // Unary Instructions that produce a result
79
80   // CLZ, RBIT, etc.
81   InstrItinData<IIC_iUNAr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
82
83   // BFC, BFI, UBFX, SBFX
84   InstrItinData<IIC_iUNAsi  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
85
86   //
87   // Zero and sign extension instructions
88   InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
89   InstrItinData<IIC_iEXTAr, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [3, 1, 1]>,
90   InstrItinData<IIC_iEXTAsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>],[3, 1, 1, 1]>,
91   //
92   // Compare instructions
93   InstrItinData<IIC_iCMPi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
94                               [1], [A9_LdBypass]>,
95   InstrItinData<IIC_iCMPr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
96                               [1, 1], [A9_LdBypass, A9_LdBypass]>,
97   InstrItinData<IIC_iCMPsi  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
98                               [1, 1], [A9_LdBypass, NoBypass]>,
99   InstrItinData<IIC_iCMPsr  , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>],
100                               [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
101   //
102   // Test instructions
103   InstrItinData<IIC_iTSTi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
104   InstrItinData<IIC_iTSTr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
105   InstrItinData<IIC_iTSTsi  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
106   InstrItinData<IIC_iTSTsr  , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [1, 1, 1]>,
107   //
108   // Move instructions, conditional
109   // FIXME: Correctly model the extra input dep on the destination.
110   InstrItinData<IIC_iCMOVi  , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
111   InstrItinData<IIC_iCMOVr  , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
112   InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
113   InstrItinData<IIC_iCMOVsr , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
114
115   // Integer multiply pipeline
116   //
117   InstrItinData<IIC_iMUL16   , [InstrStage<1, [A9_Pipe1], 0>,
118                                 InstrStage<2, [A9_Pipe0]>], [3, 1, 1]>,
119   InstrItinData<IIC_iMAC16   , [InstrStage<1, [A9_Pipe1], 0>,
120                                 InstrStage<2, [A9_Pipe0]>], [3, 1, 1, 1]>,
121   InstrItinData<IIC_iMUL32   , [InstrStage<1, [A9_Pipe1], 0>,
122                                 InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>,
123   InstrItinData<IIC_iMAC32   , [InstrStage<1, [A9_Pipe1], 0>,
124                                 InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 1]>,
125   InstrItinData<IIC_iMUL64   , [InstrStage<1, [A9_Pipe1], 0>,
126                                 InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>,
127   InstrItinData<IIC_iMAC64   , [InstrStage<1, [A9_Pipe1], 0>,
128                                 InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>,
129   // Integer load pipeline
130   // FIXME: The timings are some rough approximations
131   //
132   // Immediate offset
133   InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A9_Pipe1]>,
134                                  InstrStage<1, [A9_MUX0, A9_AGU]>],
135                                 [3, 1], [A9_LdBypass]>,
136   InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Pipe1]>,
137                                  InstrStage<2, [A9_MUX0, A9_AGU]>],
138                                 [4, 1], [A9_LdBypass]>,
139   // FIXME: If address is 64-bit aligned, AGU cycles is 1.
140   InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Pipe1]>,
141                                  InstrStage<2, [A9_MUX0, A9_AGU]>],
142                                 [3, 3, 1], [A9_LdBypass]>,
143   //
144   // Register offset
145   InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A9_Pipe1]>,
146                                  InstrStage<1, [A9_MUX0, A9_AGU]>],
147                                 [3, 1, 1], [A9_LdBypass]>,
148   InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Pipe1]>,
149                                  InstrStage<2, [A9_MUX0, A9_AGU]>],
150                                 [4, 1, 1], [A9_LdBypass]>,
151   InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Pipe1]>,
152                                  InstrStage<2, [A9_MUX0, A9_AGU]>],
153                                 [3, 3, 1, 1], [A9_LdBypass]>,
154   //
155   // Scaled register offset
156   InstrItinData<IIC_iLoad_si  , [InstrStage<1, [A9_Pipe1]>,
157                                  InstrStage<1, [A9_MUX0, A9_AGU]>],
158                                 [4, 1, 1], [A9_LdBypass]>,
159   InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Pipe1]>,
160                                  InstrStage<2, [A9_MUX0, A9_AGU]>],
161                                 [5, 1, 1], [A9_LdBypass]>,
162   //
163   // Immediate offset with update
164   InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A9_Pipe1]>,
165                                  InstrStage<1, [A9_MUX0, A9_AGU]>],
166                                 [3, 2, 1], [A9_LdBypass]>,
167   InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Pipe1]>,
168                                  InstrStage<2, [A9_MUX0, A9_AGU]>],
169                                 [4, 3, 1], [A9_LdBypass]>,
170   //
171   // Register offset with update
172   InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A9_Pipe1]>,
173                                  InstrStage<1, [A9_MUX0, A9_AGU]>],
174                                 [3, 2, 1, 1], [A9_LdBypass]>,
175   InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Pipe1]>,
176                                  InstrStage<2, [A9_MUX0, A9_AGU]>],
177                                 [4, 3, 1, 1], [A9_LdBypass]>,
178   InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Pipe1]>,
179                                  InstrStage<2, [A9_MUX0, A9_AGU]>],
180                                 [3, 3, 1, 1], [A9_LdBypass]>,
181   //
182   // Scaled register offset with update
183   InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Pipe1]>,
184                                  InstrStage<1, [A9_MUX0, A9_AGU]>],
185                                 [4, 3, 1, 1], [A9_LdBypass]>,
186   InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Pipe1]>,
187                                   InstrStage<2, [A9_MUX0, A9_AGU]>],
188                                  [5, 4, 1, 1], [A9_LdBypass]>,
189   //
190   // Load multiple
191   InstrItinData<IIC_iLoadm   , [InstrStage<1, [A9_Pipe1]>,
192                                 InstrStage<2, [A9_MUX0, A9_AGU]>],
193                                [3], [A9_LdBypass]>,
194
195   //
196   // Load multiple plus branch
197   InstrItinData<IIC_iLoadmBr , [InstrStage<1, [A9_Pipe1]>,
198                                 InstrStage<1, [A9_MUX0, A9_AGU]>,
199                                 InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
200
201   //
202   // iLoadi + iALUr for t2LDRpci_pic.
203   InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Pipe1]>,
204                                 InstrStage<1, [A9_MUX0, A9_AGU]>,
205                                 InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
206                                [2, 1]>,
207
208   // Integer store pipeline
209   //
210   // Immediate offset
211   InstrItinData<IIC_iStore_i  , [InstrStage<1, [A9_Pipe1]>,
212                                  InstrStage<1, [A9_MUX0, A9_AGU]>], [1, 1]>,
213   InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Pipe1]>,
214                                  InstrStage<2, [A9_MUX0, A9_AGU]>], [1, 1]>,
215   // FIXME: If address is 64-bit aligned, AGU cycles is 1.
216   InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Pipe1]>,
217                                  InstrStage<2, [A9_MUX0, A9_AGU]>], [1, 1]>,
218   //
219   // Register offset
220   InstrItinData<IIC_iStore_r  , [InstrStage<1, [A9_Pipe1]>,
221                                  InstrStage<1, [A9_MUX0, A9_AGU]>], [1, 1, 1]>,
222   InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Pipe1]>,
223                                  InstrStage<2, [A9_MUX0, A9_AGU]>], [1, 1, 1]>,
224   InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Pipe1]>,
225                                  InstrStage<2, [A9_MUX0, A9_AGU]>], [1, 1, 1]>,
226   //
227   // Scaled register offset
228   InstrItinData<IIC_iStore_si , [InstrStage<1, [A9_Pipe1]>,
229                                  InstrStage<1, [A9_MUX0, A9_AGU]>], [1, 1, 1]>,
230   InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Pipe1]>,
231                                  InstrStage<2, [A9_MUX0, A9_AGU]>], [1, 1, 1]>,
232   //
233   // Immediate offset with update
234   InstrItinData<IIC_iStore_iu , [InstrStage<1, [A9_Pipe1]>,
235                                  InstrStage<1, [A9_MUX0, A9_AGU]>], [2, 1, 1]>,
236   InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Pipe1]>,
237                                   InstrStage<2, [A9_MUX0, A9_AGU]>], [3, 1, 1]>,
238   //
239   // Register offset with update
240   InstrItinData<IIC_iStore_ru , [InstrStage<1, [A9_Pipe1]>,
241                                  InstrStage<1, [A9_MUX0, A9_AGU]>],
242                                 [2, 1, 1, 1]>,
243   InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Pipe1]>,
244                                   InstrStage<2, [A9_MUX0, A9_AGU]>],
245                                  [3, 1, 1, 1]>,
246   InstrItinData<IIC_iStore_d_ru,[InstrStage<1, [A9_Pipe1]>,
247                                  InstrStage<2, [A9_MUX0, A9_AGU]>],
248                                 [3, 1, 1, 1]>,
249   //
250   // Scaled register offset with update
251   InstrItinData<IIC_iStore_siu, [InstrStage<1, [A9_Pipe1]>,
252                                  InstrStage<1, [A9_MUX0, A9_AGU]>],
253                                 [2, 1, 1, 1]>,
254   InstrItinData<IIC_iStore_bh_siu,[InstrStage<1, [A9_Pipe1]>,
255                                    InstrStage<2, [A9_MUX0, A9_AGU]>],
256                                   [3, 1, 1, 1]>,
257   //
258   // Store multiple
259   InstrItinData<IIC_iStorem  , [InstrStage<1, [A9_Pipe1]>,
260                                 InstrStage<1, [A9_MUX0, A9_AGU]>]>,
261   // Branch
262   //
263   // no delay slots, so the latency of a branch is unimportant
264   InstrItinData<IIC_Br       , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
265
266   // VFP and NEON share the same register file. This means that every VFP
267   // instruction should wait for full completion of the preceding NEON
268   // instruction and vice-versa. We model this behavior with two artificial FUs:
269   // DRegsVFP and DRegsN.
270   //
271   // Every VFP instruction:
272   //  - Acquires DRegsVFP resource for 1 cycle
273   //  - Reserves DRegsN resource for the whole duration (including time to
274   //    register file writeback!).
275   // Every NEON instruction does the same but with FUs swapped.
276   //
277   // Since the reserved FU cannot be acquired, this models precisely
278   // "cross-domain" stalls.
279
280   // VFP
281   // Issue through integer pipeline, and execute in NEON unit.
282
283   // FP Special Register to Integer Register File Move
284   InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
285                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
286                               InstrStage<1, [A9_Pipe1]>,
287                               InstrStage<1, [A9_MUX0, A9_NPipe]>]>,
288   //
289   // Single-precision FP Unary
290   InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
291                                // Extra latency cycles since wbck is 2 cycles
292                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
293                                InstrStage<1, [A9_Pipe1]>,
294                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
295                               [1, 1]>,
296   //
297   // Double-precision FP Unary
298   InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
299                                // Extra latency cycles since wbck is 2 cycles
300                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
301                                InstrStage<1, [A9_Pipe1]>,
302                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
303                               [1, 1]>,
304
305   //
306   // Single-precision FP Compare
307   InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
308                                // Extra latency cycles since wbck is 4 cycles
309                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
310                                InstrStage<1, [A9_Pipe1]>,
311                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
312                               [1, 1]>,
313   //
314   // Double-precision FP Compare
315   InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
316                                // Extra latency cycles since wbck is 4 cycles
317                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
318                                InstrStage<1, [A9_Pipe1]>,
319                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
320                               [1, 1]>,
321   //
322   // Single to Double FP Convert
323   InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
324                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
325                                InstrStage<1, [A9_Pipe1]>,
326                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
327                               [4, 1]>,
328   //
329   // Double to Single FP Convert
330   InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
331                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
332                                InstrStage<1, [A9_Pipe1]>,
333                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
334                               [4, 1]>,
335
336   //
337   // Single to Half FP Convert
338   InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
339                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
340                                InstrStage<1, [A9_Pipe1]>,
341                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
342                               [4, 1]>,
343   //
344   // Half to Single FP Convert
345   InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
346                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
347                                InstrStage<1, [A9_Pipe1]>,
348                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
349                               [2, 1]>,
350
351   //
352   // Single-Precision FP to Integer Convert
353   InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
354                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
355                                InstrStage<1, [A9_Pipe1]>,
356                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
357                               [4, 1]>,
358   //
359   // Double-Precision FP to Integer Convert
360   InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
361                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
362                                InstrStage<1, [A9_Pipe1]>,
363                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
364                               [4, 1]>,
365   //
366   // Integer to Single-Precision FP Convert
367   InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
368                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
369                                InstrStage<1, [A9_Pipe1]>,
370                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
371                               [4, 1]>,
372   //
373   // Integer to Double-Precision FP Convert
374   InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
375                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
376                                InstrStage<1, [A9_Pipe1]>,
377                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
378                               [4, 1]>,
379   //
380   // Single-precision FP ALU
381   InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
382                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
383                                InstrStage<1, [A9_Pipe1]>,
384                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
385                               [4, 1, 1]>,
386   //
387   // Double-precision FP ALU
388   InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
389                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
390                                InstrStage<1, [A9_Pipe1]>,
391                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
392                               [4, 1, 1]>,
393   //
394   // Single-precision FP Multiply
395   InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
396                                InstrStage<6, [A9_DRegsN],   0, Reserved>,
397                                InstrStage<1, [A9_Pipe1]>,
398                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
399                               [5, 1, 1]>,
400   //
401   // Double-precision FP Multiply
402   InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
403                                InstrStage<7, [A9_DRegsN],   0, Reserved>,
404                                InstrStage<1, [A9_Pipe1]>,
405                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
406                               [6, 1, 1]>,
407   //
408   // Single-precision FP MAC
409   InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
410                                InstrStage<9, [A9_DRegsN],   0, Reserved>,
411                                InstrStage<1, [A9_Pipe1]>,
412                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
413                               [8, 0, 1, 1]>,
414   //
415   // Double-precision FP MAC
416   InstrItinData<IIC_fpMAC64 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
417                                InstrStage<10, [A9_DRegsN],  0, Reserved>,
418                                InstrStage<1,  [A9_Pipe1]>,
419                                InstrStage<2,  [A9_MUX0, A9_NPipe]>],
420                               [9, 0, 1, 1]>,
421   //
422   // Single-precision FP DIV
423   InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
424                                InstrStage<16, [A9_DRegsN],  0, Reserved>,
425                                InstrStage<1,  [A9_Pipe1]>,
426                                InstrStage<10, [A9_MUX0, A9_NPipe]>],
427                               [15, 1, 1]>,
428   //
429   // Double-precision FP DIV
430   InstrItinData<IIC_fpDIV64 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
431                                InstrStage<26, [A9_DRegsN],  0, Reserved>,
432                                InstrStage<1,  [A9_Pipe1]>,
433                                InstrStage<20, [A9_MUX0, A9_NPipe]>],
434                               [25, 1, 1]>,
435   //
436   // Single-precision FP SQRT
437   InstrItinData<IIC_fpSQRT32, [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
438                                InstrStage<18, [A9_DRegsN],   0, Reserved>,
439                                InstrStage<1,  [A9_Pipe1]>,
440                                InstrStage<13, [A9_MUX0, A9_NPipe]>],
441                               [17, 1]>,
442   //
443   // Double-precision FP SQRT
444   InstrItinData<IIC_fpSQRT64, [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
445                                InstrStage<33, [A9_DRegsN],   0, Reserved>,
446                                InstrStage<1,  [A9_Pipe1]>,
447                                InstrStage<28, [A9_MUX0, A9_NPipe]>],
448                               [32, 1]>,
449
450   //
451   // Integer to Single-precision Move
452   InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
453                                // Extra 1 latency cycle since wbck is 2 cycles
454                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
455                                InstrStage<1, [A9_Pipe1]>,
456                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
457                               [1, 1]>,
458   //
459   // Integer to Double-precision Move
460   InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
461                                // Extra 1 latency cycle since wbck is 2 cycles
462                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
463                                InstrStage<1, [A9_Pipe1]>,
464                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
465                               [1, 1, 1]>,
466   //
467   // Single-precision to Integer Move
468   InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
469                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
470                                InstrStage<1, [A9_Pipe1]>,
471                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
472                               [1, 1]>,
473   //
474   // Double-precision to Integer Move
475   InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
476                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
477                                InstrStage<1, [A9_Pipe1]>,
478                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
479                               [1, 1, 1]>,
480   //
481   // Single-precision FP Load
482   InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
483                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
484                                InstrStage<1, [A9_Pipe1], 0>,
485                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
486                               [1, 1]>,
487   //
488   // Double-precision FP Load
489   // FIXME: Result latency is 1 if address is 64-bit aligned.
490   InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
491                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
492                                InstrStage<1, [A9_Pipe1], 0>,
493                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
494                               [2, 1]>,
495   //
496   // FP Load Multiple
497   InstrItinData<IIC_fpLoadm,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
498                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
499                                InstrStage<1, [A9_Pipe1], 0>,
500                                InstrStage<1, [A9_MUX0, A9_NPipe]>]>,
501   //
502   // Single-precision FP Store
503   InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
504                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
505                                InstrStage<1, [A9_Pipe1], 0>,
506                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
507                               [1, 1]>,
508   //
509   // Double-precision FP Store
510   InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
511                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
512                                InstrStage<1, [A9_Pipe1], 0>,
513                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
514                               [1, 1]>,
515   //
516   // FP Store Multiple
517   InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
518                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
519                                InstrStage<1, [A9_Pipe1], 0>,
520                                InstrStage<1, [A9_MUX0, A9_NPipe]>]>,
521   // NEON
522   // Issue through integer pipeline, and execute in NEON unit.
523   // VLD1
524   // FIXME: We don't model this instruction properly
525   InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_DRegsN],   0, Required>,
526                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
527                                InstrStage<1, [A9_Pipe1], 0>,
528                                InstrStage<1, [A9_MUX0, A9_NPipe]>]>,
529   //
530   // VLD2
531   // FIXME: We don't model this instruction properly
532   InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_DRegsN],   0, Required>,
533                                // Extra latency cycles since wbck is 6 cycles
534                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
535                                InstrStage<1, [A9_Pipe1], 0>,
536                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
537                               [2, 2, 1]>,
538   //
539   // VLD3
540   // FIXME: We don't model this instruction properly
541   InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_DRegsN],   0, Required>,
542                                // Extra latency cycles since wbck is 6 cycles
543                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
544                                InstrStage<1, [A9_Pipe1], 0>,
545                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
546                               [2, 2, 2, 1]>,
547   //
548   // VLD4
549   // FIXME: We don't model this instruction properly
550   InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_DRegsN],   0, Required>,
551                                // Extra latency cycles since wbck is 6 cycles
552                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
553                                InstrStage<1, [A9_Pipe1], 0>,
554                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
555                               [2, 2, 2, 2, 1]>,
556   //
557   // VST
558   // FIXME: We don't model this instruction properly
559   InstrItinData<IIC_VST,      [InstrStage<1, [A9_DRegsN],   0, Required>,
560                                // Extra latency cycles since wbck is 6 cycles
561                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
562                                InstrStage<1, [A9_Pipe1], 0>,
563                                InstrStage<1, [A9_MUX0, A9_NPipe]>]>,
564   //
565   // Double-register Integer Unary
566   InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
567                                // Extra latency cycles since wbck is 6 cycles
568                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
569                                InstrStage<1, [A9_Pipe1]>,
570                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
571                               [4, 2]>,
572   //
573   // Quad-register Integer Unary
574   InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
575                                // Extra latency cycles since wbck is 6 cycles
576                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
577                                InstrStage<1, [A9_Pipe1]>,
578                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
579                               [4, 2]>,
580   //
581   // Double-register Integer Q-Unary
582   InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A9_DRegsN],   0, Required>,
583                                // Extra latency cycles since wbck is 6 cycles
584                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
585                                InstrStage<1, [A9_Pipe1]>,
586                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
587                               [4, 1]>,
588   //
589   // Quad-register Integer Q-Unary
590   InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
591                                // Extra latency cycles since wbck is 6 cycles
592                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
593                                InstrStage<1, [A9_Pipe1]>,
594                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
595                               [4, 1]>,
596   //
597   // Double-register Integer Binary
598   InstrItinData<IIC_VBINiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
599                                // Extra latency cycles since wbck is 6 cycles
600                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
601                                InstrStage<1, [A9_Pipe1]>,
602                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
603                               [3, 2, 2]>,
604   //
605   // Quad-register Integer Binary
606   InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
607                                // Extra latency cycles since wbck is 6 cycles
608                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
609                                InstrStage<1, [A9_Pipe1]>,
610                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
611                               [3, 2, 2]>,
612   //
613   // Double-register Integer Subtract
614   InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
615                                // Extra latency cycles since wbck is 6 cycles
616                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
617                                InstrStage<1, [A9_Pipe1]>,
618                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
619                               [3, 2, 1]>,
620   //
621   // Quad-register Integer Subtract
622   InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
623                                // Extra latency cycles since wbck is 6 cycles
624                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
625                                InstrStage<1, [A9_Pipe1]>,
626                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
627                               [3, 2, 1]>,
628   //
629   // Double-register Integer Shift
630   InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
631                                // Extra latency cycles since wbck is 6 cycles
632                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
633                                InstrStage<1, [A9_Pipe1]>,
634                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
635                               [3, 1, 1]>,
636   //
637   // Quad-register Integer Shift
638   InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
639                                // Extra latency cycles since wbck is 6 cycles
640                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
641                                InstrStage<1, [A9_Pipe1]>,
642                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
643                               [3, 1, 1]>,
644   //
645   // Double-register Integer Shift (4 cycle)
646   InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
647                                // Extra latency cycles since wbck is 6 cycles
648                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
649                                InstrStage<1, [A9_Pipe1]>,
650                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
651                               [4, 1, 1]>,
652   //
653   // Quad-register Integer Shift (4 cycle)
654   InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
655                                // Extra latency cycles since wbck is 6 cycles
656                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
657                                InstrStage<1, [A9_Pipe1]>,
658                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
659                               [4, 1, 1]>,
660   //
661   // Double-register Integer Binary (4 cycle)
662   InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
663                                // Extra latency cycles since wbck is 6 cycles
664                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
665                                InstrStage<1, [A9_Pipe1]>,
666                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
667                               [4, 2, 2]>,
668   //
669   // Quad-register Integer Binary (4 cycle)
670   InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
671                                // Extra latency cycles since wbck is 6 cycles
672                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
673                                InstrStage<1, [A9_Pipe1]>,
674                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
675                               [4, 2, 2]>,
676   //
677   // Double-register Integer Subtract (4 cycle)
678   InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
679                                // Extra latency cycles since wbck is 6 cycles
680                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
681                                InstrStage<1, [A9_Pipe1]>,
682                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
683                               [4, 2, 1]>,
684   //
685   // Quad-register Integer Subtract (4 cycle)
686   InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
687                                // Extra latency cycles since wbck is 6 cycles
688                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
689                                InstrStage<1, [A9_Pipe1]>,
690                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
691                               [4, 2, 1]>,
692
693   //
694   // Double-register Integer Count
695   InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
696                                // Extra latency cycles since wbck is 6 cycles
697                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
698                                InstrStage<1, [A9_Pipe1]>,
699                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
700                               [3, 2, 2]>,
701   //
702   // Quad-register Integer Count
703   // Result written in N3, but that is relative to the last cycle of multicycle,
704   // so we use 4 for those cases
705   InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
706                                // Extra latency cycles since wbck is 7 cycles
707                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
708                                InstrStage<1, [A9_Pipe1]>,
709                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
710                               [4, 2, 2]>,
711   //
712   // Double-register Absolute Difference and Accumulate
713   InstrItinData<IIC_VABAD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
714                                // Extra latency cycles since wbck is 6 cycles
715                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
716                                InstrStage<1, [A9_Pipe1]>,
717                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
718                               [6, 3, 2, 1]>,
719   //
720   // Quad-register Absolute Difference and Accumulate
721   InstrItinData<IIC_VABAQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
722                                // Extra latency cycles since wbck is 6 cycles
723                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
724                                InstrStage<1, [A9_Pipe1]>,
725                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
726                               [6, 3, 2, 1]>,
727   //
728   // Double-register Integer Pair Add Long
729   InstrItinData<IIC_VPALiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
730                                // Extra latency cycles since wbck is 6 cycles
731                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
732                                InstrStage<1, [A9_Pipe1]>,
733                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
734                               [6, 3, 1]>,
735   //
736   // Quad-register Integer Pair Add Long
737   InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
738                                // Extra latency cycles since wbck is 6 cycles
739                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
740                                InstrStage<1, [A9_Pipe1]>,
741                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
742                               [6, 3, 1]>,
743
744   //
745   // Double-register Integer Multiply (.8, .16)
746   InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_DRegsN],   0, Required>,
747                                // Extra latency cycles since wbck is 6 cycles
748                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
749                                InstrStage<1, [A9_Pipe1]>,
750                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
751                               [6, 2, 2]>,
752   //
753   // Quad-register Integer Multiply (.8, .16)
754   InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
755                                // Extra latency cycles since wbck is 7 cycles
756                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
757                                InstrStage<1, [A9_Pipe1]>,
758                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
759                               [7, 2, 2]>,
760
761   //
762   // Double-register Integer Multiply (.32)
763   InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_DRegsN],   0, Required>,
764                                // Extra latency cycles since wbck is 7 cycles
765                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
766                                InstrStage<1, [A9_Pipe1]>,
767                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
768                               [7, 2, 1]>,
769   //
770   // Quad-register Integer Multiply (.32)
771   InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
772                                // Extra latency cycles since wbck is 9 cycles
773                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
774                                InstrStage<1, [A9_Pipe1]>,
775                                InstrStage<4, [A9_MUX0, A9_NPipe]>],
776                               [9, 2, 1]>,
777   //
778   // Double-register Integer Multiply-Accumulate (.8, .16)
779   InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_DRegsN],   0, Required>,
780                                // Extra latency cycles since wbck is 6 cycles
781                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
782                                InstrStage<1, [A9_Pipe1]>,
783                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
784                               [6, 3, 2, 2]>,
785   //
786   // Double-register Integer Multiply-Accumulate (.32)
787   InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_DRegsN],   0, Required>,
788                                // Extra latency cycles since wbck is 7 cycles
789                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
790                                InstrStage<1, [A9_Pipe1]>,
791                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
792                               [7, 3, 2, 1]>,
793   //
794   // Quad-register Integer Multiply-Accumulate (.8, .16)
795   InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
796                                // Extra latency cycles since wbck is 7 cycles
797                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
798                                InstrStage<1, [A9_Pipe1]>,
799                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
800                               [7, 3, 2, 2]>,
801   //
802   // Quad-register Integer Multiply-Accumulate (.32)
803   InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
804                                // Extra latency cycles since wbck is 9 cycles
805                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
806                                InstrStage<1, [A9_Pipe1]>,
807                                InstrStage<4, [A9_MUX0, A9_NPipe]>],
808                               [9, 3, 2, 1]>,
809
810   //
811   // Move
812   InstrItinData<IIC_VMOV,     [InstrStage<1, [A9_DRegsN],   0, Required>,
813                                InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
814                                InstrStage<1, [A9_Pipe1]>,
815                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
816                               [1,1]>,
817   //
818   // Move Immediate
819   InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A9_DRegsN],   0, Required>,
820                                // Extra latency cycles since wbck is 6 cycles
821                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
822                                InstrStage<1, [A9_Pipe1]>,
823                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
824                               [3]>,
825   //
826   // Double-register Permute Move
827   InstrItinData<IIC_VMOVD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
828   // FIXME: all latencies are arbitrary, no information is available
829                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
830                                InstrStage<1, [A9_Pipe1]>,
831                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
832                               [2, 1]>,
833   //
834   // Quad-register Permute Move
835   InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
836   // FIXME: all latencies are arbitrary, no information is available
837                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
838                                InstrStage<1, [A9_Pipe1]>,
839                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
840                               [2, 1]>,
841   //
842   // Integer to Single-precision Move
843   InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
844   // FIXME: all latencies are arbitrary, no information is available
845                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
846                                InstrStage<1, [A9_Pipe1]>,
847                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
848                               [2, 1]>,
849   //
850   // Integer to Double-precision Move
851   InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
852   // FIXME: all latencies are arbitrary, no information is available
853                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
854                                InstrStage<1, [A9_Pipe1]>,
855                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
856                               [2, 1, 1]>,
857   //
858   // Single-precision to Integer Move
859   InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
860   // FIXME: all latencies are arbitrary, no information is available
861                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
862                                InstrStage<1, [A9_Pipe1]>,
863                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
864                               [2, 1]>,
865   //
866   // Double-precision to Integer Move
867   InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
868   // FIXME: all latencies are arbitrary, no information is available
869                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
870                                InstrStage<1, [A9_Pipe1]>,
871                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
872                               [2, 2, 1]>,
873   //
874   // Integer to Lane Move
875   InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_DRegsN],   0, Required>,
876   // FIXME: all latencies are arbitrary, no information is available
877                                InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
878                                InstrStage<1, [A9_Pipe1]>,
879                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
880                               [3, 1, 1]>,
881
882   //
883   // Vector narrow move
884   InstrItinData<IIC_VMOVN,    [InstrStage<1, [A9_DRegsN],   0, Required>,
885                                // Extra latency cycles since wbck is 6 cycles
886                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
887                                InstrStage<1, [A9_Pipe1]>,
888                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
889                               [3, 1]>,
890   //
891   // Double-register FP Unary
892   InstrItinData<IIC_VUNAD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
893                                // Extra latency cycles since wbck is 6 cycles
894                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
895                                InstrStage<1, [A9_Pipe1]>,
896                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
897                               [5, 2]>,
898   //
899   // Quad-register FP Unary
900   // Result written in N5, but that is relative to the last cycle of multicycle,
901   // so we use 6 for those cases
902   InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
903                                // Extra latency cycles since wbck is 7 cycles
904                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
905                                InstrStage<1, [A9_Pipe1]>,
906                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
907                               [6, 2]>,
908   //
909   // Double-register FP Binary
910   // FIXME: We're using this itin for many instructions and [2, 2] here is too
911   // optimistic.
912   InstrItinData<IIC_VBIND,    [InstrStage<1, [A9_DRegsN],   0, Required>,
913                                // Extra latency cycles since wbck is 6 cycles
914                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
915                                InstrStage<1, [A9_Pipe1]>,
916                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
917                               [5, 2, 2]>,
918   //
919   // Quad-register FP Binary
920   // Result written in N5, but that is relative to the last cycle of multicycle,
921   // so we use 6 for those cases
922   // FIXME: We're using this itin for many instructions and [2, 2] here is too
923   // optimistic.
924   InstrItinData<IIC_VBINQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
925                                // Extra latency cycles since wbck is 7 cycles
926                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
927                                InstrStage<1, [A9_Pipe1]>,
928                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
929                               [6, 2, 2]>,
930   //
931   // Double-register FP Multiply-Accumulate
932   InstrItinData<IIC_VMACD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
933                                // Extra latency cycles since wbck is 7 cycles
934                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
935                                InstrStage<1, [A9_Pipe1]>,
936                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
937                               [6, 3, 2, 1]>,
938   //
939   // Quad-register FP Multiply-Accumulate
940   // Result written in N9, but that is relative to the last cycle of multicycle,
941   // so we use 10 for those cases
942   InstrItinData<IIC_VMACQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
943                                // Extra latency cycles since wbck is 9 cycles
944                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
945                                InstrStage<1, [A9_Pipe1]>,
946                                InstrStage<4, [A9_MUX0, A9_NPipe]>],
947                               [8, 4, 2, 1]>,
948   //
949   // Double-register Reciprocal Step
950   InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
951                                // Extra latency cycles since wbck is 7 cycles
952                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
953                                InstrStage<1, [A9_Pipe1]>,
954                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
955                               [6, 2, 2]>,
956   //
957   // Quad-register Reciprocal Step
958   InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
959                                // Extra latency cycles since wbck is 9 cycles
960                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
961                                InstrStage<1, [A9_Pipe1]>,
962                                InstrStage<4, [A9_MUX0, A9_NPipe]>],
963                               [8, 2, 2]>,
964   //
965   // Double-register Permute
966   InstrItinData<IIC_VPERMD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
967                                // Extra latency cycles since wbck is 6 cycles
968                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
969                                InstrStage<1, [A9_Pipe1]>,
970                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
971                               [2, 2, 1, 1]>,
972   //
973   // Quad-register Permute
974   // Result written in N2, but that is relative to the last cycle of multicycle,
975   // so we use 3 for those cases
976   InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
977                                // Extra latency cycles since wbck is 7 cycles
978                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
979                                InstrStage<1, [A9_Pipe1]>,
980                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
981                               [3, 3, 1, 1]>,
982   //
983   // Quad-register Permute (3 cycle issue)
984   // Result written in N2, but that is relative to the last cycle of multicycle,
985   // so we use 4 for those cases
986   InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A9_DRegsN],   0, Required>,
987                                // Extra latency cycles since wbck is 8 cycles
988                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
989                                InstrStage<1, [A9_Pipe1]>,
990                                InstrStage<3, [A9_MUX0, A9_NPipe]>],
991                               [4, 4, 1, 1]>,
992
993   //
994   // Double-register VEXT
995   InstrItinData<IIC_VEXTD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
996                                // Extra latency cycles since wbck is 6 cycles
997                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
998                                InstrStage<1, [A9_Pipe1]>,
999                                InstrStage<1, [A9_MUX0, A9_NPipe]>],
1000                               [2, 1, 1]>,
1001   //
1002   // Quad-register VEXT
1003   InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1004                                // Extra latency cycles since wbck is 7 cycles
1005                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1006                                InstrStage<1, [A9_Pipe1]>,
1007                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
1008                               [3, 1, 1]>,
1009   //
1010   // VTB
1011   InstrItinData<IIC_VTB1,     [InstrStage<1, [A9_DRegsN],   0, Required>,
1012                                // Extra latency cycles since wbck is 7 cycles
1013                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1014                                InstrStage<1, [A9_Pipe1]>,
1015                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
1016                               [3, 2, 1]>,
1017   InstrItinData<IIC_VTB2,     [InstrStage<2, [A9_DRegsN],   0, Required>,
1018                                // Extra latency cycles since wbck is 7 cycles
1019                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1020                                InstrStage<1, [A9_Pipe1]>,
1021                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
1022                               [3, 2, 2, 1]>,
1023   InstrItinData<IIC_VTB3,     [InstrStage<2, [A9_DRegsN],   0, Required>,
1024                                // Extra latency cycles since wbck is 8 cycles
1025                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1026                                InstrStage<1, [A9_Pipe1]>,
1027                                InstrStage<3, [A9_MUX0, A9_NPipe]>],
1028                               [4, 2, 2, 3, 1]>,
1029   InstrItinData<IIC_VTB4,     [InstrStage<1, [A9_DRegsN],   0, Required>,
1030                                // Extra latency cycles since wbck is 8 cycles
1031                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1032                                InstrStage<1, [A9_Pipe1]>,
1033                                InstrStage<3, [A9_MUX0, A9_NPipe]>],
1034                               [4, 2, 2, 3, 3, 1]>,
1035   //
1036   // VTBX
1037   InstrItinData<IIC_VTBX1,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1038                                // Extra latency cycles since wbck is 7 cycles
1039                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1040                                InstrStage<1, [A9_Pipe1]>,
1041                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
1042                               [3, 1, 2, 1]>,
1043   InstrItinData<IIC_VTBX2,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1044                                // Extra latency cycles since wbck is 7 cycles
1045                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1046                                InstrStage<1, [A9_Pipe1]>,
1047                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
1048                               [3, 1, 2, 2, 1]>,
1049   InstrItinData<IIC_VTBX3,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1050                                // Extra latency cycles since wbck is 8 cycles
1051                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1052                                InstrStage<1, [A9_Pipe1]>,
1053                                InstrStage<3, [A9_MUX0, A9_NPipe]>],
1054                               [4, 1, 2, 2, 3, 1]>,
1055   InstrItinData<IIC_VTBX4,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1056                                // Extra latency cycles since wbck is 8 cycles
1057                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1058                                InstrStage<1, [A9_Pipe1]>,
1059                                InstrStage<2, [A9_MUX0, A9_NPipe]>],
1060                               [4, 1, 2, 2, 3, 3, 1]>
1061 ]>;