Some rough approximations for load/stores on A9
[oota-llvm.git] / lib / Target / ARM / ARMScheduleA9.td
1 //=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
2 // 
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 // 
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the itinerary class data for the ARM Cortex A9 processors.
11 //
12 //===----------------------------------------------------------------------===//
13
14 //
15 // Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical
16 // Reference Manual".
17 //
18 // Functional units
19 def A9_Issue   : FuncUnit; // issue; used to enforce 1 load/store per cycle
20 def A9_Pipe0   : FuncUnit; // pipeline 0 (dual issue is A9_Pipe0 | A9_Pipe1)
21 def A9_Pipe1   : FuncUnit; // pipeline 1 (dual issue is A9_Pipe0 | A9_Pipe1)
22 def A9_LSPipe  : FuncUnit; // LS (load/store) pipe
23 def A9_NPipe   : FuncUnit; // NEON ALU/MUL pipe
24 def A9_DRegsVFP: FuncUnit; // artificial FU: FP register set, VFP side
25 def A9_DRegsN  : FuncUnit; // artificial FU: FP register set, NEON side
26
27 // Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1
28 //
29 def CortexA9Itineraries : ProcessorItineraries<
30   [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1, A9_Issue], [
31   // Two fully-pipelined integer ALU pipelines
32   // FIXME: There are no operand latencies for these instructions at all!
33   //
34   // Move instructions, unconditional
35   InstrItinData<IIC_iMOVi    , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
36   InstrItinData<IIC_iMOVr    , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
37   InstrItinData<IIC_iMOVsi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
38   InstrItinData<IIC_iMOVsr   , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>,
39   //
40   // No operand cycles
41   InstrItinData<IIC_iALUx    , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
42   //
43   // Binary Instructions that produce a result
44   InstrItinData<IIC_iALUi    , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
45   InstrItinData<IIC_iALUr    , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 2]>,
46   InstrItinData<IIC_iALUsi   , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>,
47   InstrItinData<IIC_iALUsr   , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1, 1]>,
48   //
49   // Unary Instructions that produce a result
50   InstrItinData<IIC_iUNAr    , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
51   InstrItinData<IIC_iUNAsi   , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
52   InstrItinData<IIC_iUNAsr   , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
53   //
54   // Compare instructions
55   InstrItinData<IIC_iCMPi    , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
56   InstrItinData<IIC_iCMPr    , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
57   InstrItinData<IIC_iCMPsi   , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
58   InstrItinData<IIC_iCMPsr   , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
59   //
60   // Move instructions, conditional
61   InstrItinData<IIC_iCMOVi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
62   InstrItinData<IIC_iCMOVr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
63   InstrItinData<IIC_iCMOVsi  , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
64   InstrItinData<IIC_iCMOVsr  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
65
66   // Integer multiply pipeline
67   //
68   InstrItinData<IIC_iMUL16   , [InstrStage<1, [A9_Pipe1], 0>,
69                                 InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>,
70   InstrItinData<IIC_iMAC16   , [InstrStage<1, [A9_Pipe1], 0>,
71                                 InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>,
72   InstrItinData<IIC_iMUL32   , [InstrStage<1, [A9_Pipe1], 0>,
73                                 InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>,
74   InstrItinData<IIC_iMAC32   , [InstrStage<1, [A9_Pipe1], 0>,
75                                 InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>,
76   InstrItinData<IIC_iMUL64   , [InstrStage<2, [A9_Pipe1], 0>,
77                                 InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>,
78   InstrItinData<IIC_iMAC64   , [InstrStage<2, [A9_Pipe1], 0>,
79                                 InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>,
80   // Integer load pipeline
81   // FIXME: The timings are some rough approximations
82   //
83   // Immediate offset
84   InstrItinData<IIC_iLoadi   , [InstrStage<1, [A9_Pipe1]>,
85                                 InstrStage<1, [A9_LSPipe]>], [3, 1]>,
86   //
87   // Register offset
88   InstrItinData<IIC_iLoadr   , [InstrStage<1, [A9_Pipe1]>,
89                                 InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>,
90   //
91   // Scaled register offset
92   InstrItinData<IIC_iLoadsi  , [InstrStage<1, [A9_Pipe1]>,
93                                 InstrStage<2, [A9_LSPipe]>], [4, 1, 1]>,
94   //
95   // Immediate offset with update
96   InstrItinData<IIC_iLoadiu  , [InstrStage<1, [A9_Pipe1]>,
97                                 InstrStage<2, [A9_LSPipe]>], [3, 2, 1]>,
98   //
99   // Register offset with update
100   InstrItinData<IIC_iLoadru  , [InstrStage<1, [A9_Pipe1]>,
101                                 InstrStage<2, [A9_LSPipe]>], [3, 2, 1, 1]>,
102   //
103   // Scaled register offset with update
104   InstrItinData<IIC_iLoadsiu , [InstrStage<1, [A9_Pipe1]>,
105                                 InstrStage<2, [A9_LSPipe]>], [4, 3, 1, 1]>,
106   //
107   // Load multiple
108   InstrItinData<IIC_iLoadm   , [InstrStage<1, [A9_Pipe1]>,
109                                 InstrStage<1, [A9_LSPipe]>]>,
110
111   // Integer store pipeline
112   //
113   // Immediate offset
114   InstrItinData<IIC_iStorei  , [InstrStage<1, [A9_Pipe1]>,
115                                 InstrStage<1, [A9_LSPipe]>], [3, 1]>,
116   //
117   // Register offset
118   InstrItinData<IIC_iStorer  , [InstrStage<1, [ A9_Pipe1]>,
119                                 InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>,
120   //
121   // Scaled register offset
122   InstrItinData<IIC_iStoresi , [InstrStage<1, [A9_Pipe1]>,
123                                 InstrStage<2, [A9_LSPipe]>], [3, 1, 1]>,
124   //
125   // Immediate offset with update
126   InstrItinData<IIC_iStoreiu , [InstrStage<1, [A9_Pipe1]>,
127                                 InstrStage<1, [A9_LSPipe]>], [2, 3, 1]>,
128   //
129   // Register offset with update
130   InstrItinData<IIC_iStoreru , [InstrStage<1, [A9_Pipe1]>,
131                                 InstrStage<1, [A9_LSPipe]>], [2, 3, 1, 1]>,
132   //
133   // Scaled register offset with update
134   InstrItinData<IIC_iStoresiu, [InstrStage<1, [A9_Pipe1]>,
135                                 InstrStage<2, [A9_LSPipe]>], [3, 3, 1, 1]>,
136   //
137   // Store multiple
138   InstrItinData<IIC_iStorem  , [InstrStage<1, [A9_Pipe1]>,
139                                 InstrStage<1, [A9_LSPipe]>]>,
140   // Branch
141   //
142   // no delay slots, so the latency of a branch is unimportant
143   InstrItinData<IIC_Br       , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
144
145   // VFP and NEON share the same register file. This means that every VFP
146   // instruction should wait for full completion of the consecutive NEON
147   // instruction and vice-versa. We model this behavior with two artificial FUs:
148   // DRegsVFP and DRegsN.
149   //
150   // Every VFP instruction:
151   //  - Acquires DRegsVFP resource for 1 cycle
152   //  - Reserves DRegsN resource for the whole duration (including time to
153   //    register file writeback!).
154   // Every NEON instruction does the same but with FUs swapped.
155   //
156   // Since the reserved FU cannot be acquired, this precisely models
157   // "cross-domain" stalls.
158
159   // VFP
160   // Issue through integer pipeline, and execute in NEON unit.
161
162   // FP Special Register to Integer Register File Move
163   InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
164                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
165                               InstrStage<1, [A9_Pipe1]>,
166                               InstrStage<1, [A9_NPipe]>]>,
167   //
168   // Single-precision FP Unary
169   InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
170                                // Extra latency cycles since wbck is 2 cycles
171                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
172                                InstrStage<1, [A9_Pipe1]>,
173                                InstrStage<1, [A9_NPipe]>], [1, 1]>,
174   //
175   // Double-precision FP Unary
176   InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
177                                // Extra latency cycles since wbck is 2 cycles
178                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
179                                InstrStage<1, [A9_Pipe1]>,
180                                InstrStage<1, [A9_NPipe]>], [1, 1]>,
181
182   //
183   // Single-precision FP Compare
184   InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
185                                // Extra latency cycles since wbck is 4 cycles
186                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
187                                InstrStage<1, [A9_Pipe1]>,
188                                InstrStage<1, [A9_NPipe]>], [1, 1]>,
189   //
190   // Double-precision FP Compare
191   InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
192                                // Extra latency cycles since wbck is 4 cycles
193                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
194                                InstrStage<1, [A9_Pipe1]>,
195                                InstrStage<1, [A9_NPipe]>], [1, 1]>,
196   //
197   // Single to Double FP Convert
198   InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
199                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
200                                InstrStage<1, [A9_Pipe1]>,
201                                InstrStage<1, [A9_NPipe]>], [4, 1]>,
202   //
203   // Double to Single FP Convert
204   InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
205                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
206                                InstrStage<1, [A9_Pipe1]>,
207                                InstrStage<1, [A9_NPipe]>], [4, 1]>,
208
209   //
210   // Single to Half FP Convert
211   InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
212                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
213                                InstrStage<1, [A9_Pipe1]>,
214                                InstrStage<1, [A9_NPipe]>], [4, 1]>,
215   //
216   // Half to Single FP Convert
217   InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
218                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
219                                InstrStage<1, [A9_Pipe1]>,
220                                InstrStage<1, [A9_NPipe]>], [2, 1]>,
221
222   //
223   // Single-Precision FP to Integer Convert
224   InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
225                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
226                                InstrStage<1, [A9_Pipe1]>,
227                                InstrStage<1, [A9_NPipe]>], [4, 1]>,
228   //
229   // Double-Precision FP to Integer Convert
230   InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
231                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
232                                InstrStage<1, [A9_Pipe1]>,
233                                InstrStage<1, [A9_NPipe]>], [4, 1]>,
234   //
235   // Integer to Single-Precision FP Convert
236   InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
237                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
238                                InstrStage<1, [A9_Pipe1]>,
239                                InstrStage<1, [A9_NPipe]>], [4, 1]>,
240   //
241   // Integer to Double-Precision FP Convert
242   InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
243                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
244                                InstrStage<1, [A9_Pipe1]>,
245                                InstrStage<1, [A9_NPipe]>], [4, 1]>,
246   //
247   // Single-precision FP ALU
248   InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
249                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
250                                InstrStage<1, [A9_Pipe1]>,
251                                InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
252   //
253   // Double-precision FP ALU
254   InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
255                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
256                                InstrStage<1, [A9_Pipe1]>,
257                                InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
258   //
259   // Single-precision FP Multiply
260   InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
261                                InstrStage<6, [A9_DRegsN],   0, Reserved>,
262                                InstrStage<1, [A9_Pipe1]>,
263                                InstrStage<1, [A9_NPipe]>], [5, 1, 1]>,
264   //
265   // Double-precision FP Multiply
266   InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
267                                InstrStage<7, [A9_DRegsN],   0, Reserved>,
268                                InstrStage<1, [A9_Pipe1]>,
269                                InstrStage<2, [A9_NPipe]>], [6, 1, 1]>,
270   //
271   // Single-precision FP MAC
272   InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
273                                InstrStage<9, [A9_DRegsN],   0, Reserved>,
274                                InstrStage<1, [A9_Pipe1]>,
275                                InstrStage<1, [A9_NPipe]>], [8, 0, 1, 1]>,
276   //
277   // Double-precision FP MAC
278   InstrItinData<IIC_fpMAC64 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
279                                InstrStage<10, [A9_DRegsN],  0, Reserved>,
280                                InstrStage<1,  [A9_Pipe1]>,
281                                InstrStage<2,  [A9_NPipe]>], [9, 0, 1, 1]>,
282   //
283   // Single-precision FP DIV
284   InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
285                                InstrStage<16, [A9_DRegsN],  0, Reserved>,
286                                InstrStage<1,  [A9_Pipe1]>,
287                                InstrStage<10, [A9_NPipe]>], [15, 1, 1]>,
288   //
289   // Double-precision FP DIV
290   InstrItinData<IIC_fpDIV64 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
291                                InstrStage<26, [A9_DRegsN],  0, Reserved>,
292                                InstrStage<1,  [A9_Pipe1]>,
293                                InstrStage<20, [A9_NPipe]>], [25, 1, 1]>,
294   //
295   // Single-precision FP SQRT
296   InstrItinData<IIC_fpSQRT32, [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
297                                InstrStage<18, [A9_DRegsN],   0, Reserved>,
298                                InstrStage<1,  [A9_Pipe1]>,
299                                InstrStage<13, [A9_NPipe]>], [17, 1]>,
300   //
301   // Double-precision FP SQRT
302   InstrItinData<IIC_fpSQRT64, [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
303                                InstrStage<33, [A9_DRegsN],   0, Reserved>,
304                                InstrStage<1,  [A9_Pipe1]>,
305                                InstrStage<28, [A9_NPipe]>], [32, 1]>,
306
307   //
308   // Integer to Single-precision Move
309   InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
310                                // Extra 1 latency cycle since wbck is 2 cycles
311                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
312                                InstrStage<1, [A9_Pipe1]>,
313                                InstrStage<1, [A9_NPipe]>], [1, 1]>,
314   //
315   // Integer to Double-precision Move
316   InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
317                                // Extra 1 latency cycle since wbck is 2 cycles
318                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
319                                InstrStage<1, [A9_Pipe1]>,
320                                InstrStage<1, [A9_NPipe]>], [1, 1, 1]>,
321   //
322   // Single-precision to Integer Move
323   InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
324                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
325                                InstrStage<1, [A9_Pipe1]>,
326                                InstrStage<1, [A9_NPipe]>], [1, 1]>,
327   //
328   // Double-precision to Integer Move
329   InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
330                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
331                                InstrStage<1, [A9_Pipe1]>,
332                                InstrStage<1, [A9_NPipe]>], [1, 1, 1]>,
333   //
334   // Single-precision FP Load
335   // use A9_Issue to enforce the 1 load/store per cycle limit
336   InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
337                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
338                                InstrStage<1, [A9_Issue], 0>, 
339                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
340                                InstrStage<1, [A9_LSPipe], 0>,
341                                InstrStage<1, [A9_NPipe]>]>,
342   //
343   // Double-precision FP Load
344   // use A9_Issue to enforce the 1 load/store per cycle limit
345   InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
346                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
347                                InstrStage<1, [A9_Issue], 0>, 
348                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
349                                InstrStage<1, [A9_LSPipe], 0>,
350                                InstrStage<1, [A9_NPipe]>]>,
351   //
352   // FP Load Multiple
353   // use A9_Issue to enforce the 1 load/store per cycle limit
354   InstrItinData<IIC_fpLoadm,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
355                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
356                                InstrStage<1, [A9_Issue], 0>, 
357                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
358                                InstrStage<1, [A9_LSPipe], 0>,
359                                InstrStage<1, [A9_NPipe]>]>,
360   //
361   // Single-precision FP Store
362   // use A9_Issue to enforce the 1 load/store per cycle limit
363   InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
364                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
365                                InstrStage<1, [A9_Issue], 0>, 
366                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
367                                InstrStage<1, [A9_LSPipe], 0>,
368                                InstrStage<1, [A9_NPipe]>]>,
369   //
370   // Double-precision FP Store
371   // use A9_Issue to enforce the 1 load/store per cycle limit
372   InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
373                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
374                                InstrStage<1, [A9_Issue], 0>, 
375                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
376                                InstrStage<1, [A9_LSPipe], 0>,
377                                InstrStage<1, [A9_NPipe]>]>,
378   //
379   // FP Store Multiple
380   // use A9_Issue to enforce the 1 load/store per cycle limit
381   InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
382                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
383                                InstrStage<1, [A9_Issue], 0>, 
384                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
385                                InstrStage<1, [A9_LSPipe], 0>,
386                                InstrStage<1, [A9_NPipe]>]>,
387   // NEON
388   // Issue through integer pipeline, and execute in NEON unit.
389   // FIXME: Neon pipeline and LdSt unit are multiplexed. 
390   //        Add some syntactic sugar to model this!
391   // VLD1
392   // FIXME: We don't model this instruction properly
393   InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_DRegsN],   0, Required>,
394                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
395                                InstrStage<1, [A9_Issue], 0>, 
396                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
397                                InstrStage<1, [A9_LSPipe], 0>,
398                                InstrStage<1, [A9_NPipe]>]>,
399   //
400   // VLD2
401   // FIXME: We don't model this instruction properly
402   InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_DRegsN],   0, Required>,
403                                // Extra latency cycles since wbck is 6 cycles
404                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
405                                InstrStage<1, [A9_Issue], 0>, 
406                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
407                                InstrStage<1, [A9_LSPipe], 0>,
408                                InstrStage<1, [A9_NPipe]>], [2, 2, 1]>,
409   //
410   // VLD3
411   // FIXME: We don't model this instruction properly
412   InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_DRegsN],   0, Required>,
413                                // Extra latency cycles since wbck is 6 cycles
414                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
415                                InstrStage<1, [A9_Issue], 0>, 
416                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
417                                InstrStage<1, [A9_LSPipe], 0>,
418                                InstrStage<1, [A9_NPipe]>], [2, 2, 2, 1]>,
419   //
420   // VLD4
421   // FIXME: We don't model this instruction properly
422   InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_DRegsN],   0, Required>,
423                                // Extra latency cycles since wbck is 6 cycles
424                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
425                                InstrStage<1, [A9_Issue], 0>, 
426                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
427                                InstrStage<1, [A9_LSPipe], 0>,
428                                InstrStage<1, [A9_NPipe]>], [2, 2, 2, 2, 1]>,
429   //
430   // VST
431   // FIXME: We don't model this instruction properly
432   InstrItinData<IIC_VST,      [InstrStage<1, [A9_DRegsN],   0, Required>,
433                                // Extra latency cycles since wbck is 6 cycles
434                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
435                                InstrStage<1, [A9_Issue], 0>, 
436                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
437                                InstrStage<1, [A9_LSPipe], 0>,
438                                InstrStage<1, [A9_NPipe]>]>,
439   //
440   // Double-register Integer Unary
441   InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
442                                // Extra latency cycles since wbck is 6 cycles
443                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
444                                InstrStage<1, [A9_Pipe1]>,
445                                InstrStage<1, [A9_NPipe]>], [4, 2]>,
446   //
447   // Quad-register Integer Unary
448   InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
449                                // Extra latency cycles since wbck is 6 cycles
450                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
451                                InstrStage<1, [A9_Pipe1]>,
452                                InstrStage<1, [A9_NPipe]>], [4, 2]>,
453   //
454   // Double-register Integer Q-Unary
455   InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A9_DRegsN],   0, Required>,
456                                // Extra latency cycles since wbck is 6 cycles
457                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
458                                InstrStage<1, [A9_Pipe1]>,
459                                InstrStage<1, [A9_NPipe]>], [4, 1]>,
460   //
461   // Quad-register Integer Q-Unary
462   InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
463                                // Extra latency cycles since wbck is 6 cycles
464                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
465                                InstrStage<1, [A9_Pipe1]>,
466                                InstrStage<1, [A9_NPipe]>], [4, 1]>,
467   //
468   // Double-register Integer Binary
469   InstrItinData<IIC_VBINiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
470                                // Extra latency cycles since wbck is 6 cycles
471                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
472                                InstrStage<1, [A9_Pipe1]>,
473                                InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
474   //
475   // Quad-register Integer Binary
476   InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
477                                // Extra latency cycles since wbck is 6 cycles
478                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
479                                InstrStage<1, [A9_Pipe1]>,
480                                InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
481   //
482   // Double-register Integer Subtract
483   InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
484                                // Extra latency cycles since wbck is 6 cycles
485                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
486                                InstrStage<1, [A9_Pipe1]>,
487                                InstrStage<1, [A9_NPipe]>], [3, 2, 1]>,
488   //
489   // Quad-register Integer Subtract
490   InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
491                                // Extra latency cycles since wbck is 6 cycles
492                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
493                                InstrStage<1, [A9_Pipe1]>,
494                                InstrStage<1, [A9_NPipe]>], [3, 2, 1]>,
495   //
496   // Double-register Integer Shift
497   InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
498                                // Extra latency cycles since wbck is 6 cycles
499                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
500                                InstrStage<1, [A9_Pipe1]>,
501                                InstrStage<1, [A9_NPipe]>], [3, 1, 1]>,
502   //
503   // Quad-register Integer Shift
504   InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
505                                // Extra latency cycles since wbck is 6 cycles
506                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
507                                InstrStage<1, [A9_Pipe1]>,
508                                InstrStage<1, [A9_NPipe]>], [3, 1, 1]>,
509   //
510   // Double-register Integer Shift (4 cycle)
511   InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
512                                // Extra latency cycles since wbck is 6 cycles
513                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
514                                InstrStage<1, [A9_Pipe1]>,
515                                InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
516   //
517   // Quad-register Integer Shift (4 cycle)
518   InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
519                                // Extra latency cycles since wbck is 6 cycles
520                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
521                                InstrStage<1, [A9_Pipe1]>,
522                                InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
523   //
524   // Double-register Integer Binary (4 cycle)
525   InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
526                                // Extra latency cycles since wbck is 6 cycles
527                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
528                                InstrStage<1, [A9_Pipe1]>,
529                                InstrStage<1, [A9_NPipe]>], [4, 2, 2]>,
530   //
531   // Quad-register Integer Binary (4 cycle)
532   InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
533                                // Extra latency cycles since wbck is 6 cycles
534                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
535                                InstrStage<1, [A9_Pipe1]>,
536                                InstrStage<1, [A9_NPipe]>], [4, 2, 2]>,
537   //
538   // Double-register Integer Subtract (4 cycle): IIC_VSUBi4D, not IIC_VSUBiD
539   InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
540                                // Extra latency cycles since wbck is 6 cycles
541                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
542                                InstrStage<1, [A9_Pipe1]>,
543                                InstrStage<1, [A9_NPipe]>], [4, 2, 1]>,
544   //
545   // Quad-register Integer Subtract (4 cycle): IIC_VSUBi4Q, not IIC_VSUBiQ
546   InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
547                                // Extra latency cycles since wbck is 6 cycles
548                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
549                                InstrStage<1, [A9_Pipe1]>,
550                                InstrStage<1, [A9_NPipe]>], [4, 2, 1]>,
551
552   //
553   // Double-register Integer Count
554   InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
555                                // Extra latency cycles since wbck is 6 cycles
556                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
557                                InstrStage<1, [A9_Pipe1]>,
558                                InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
559   //
560   // Quad-register Integer Count
561   // Result written in N3, but that is relative to the last cycle of multicycle,
562   // so we use 4 for those cases
563   InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
564                                // Extra latency cycles since wbck is 7 cycles
565                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
566                                InstrStage<1, [A9_Pipe1]>,
567                                InstrStage<2, [A9_NPipe]>], [4, 2, 2]>,
568   //
569   // Double-register Absolute Difference and Accumulate
570   InstrItinData<IIC_VABAD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
571                                // Extra latency cycles since wbck is 6 cycles
572                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
573                                InstrStage<1, [A9_Pipe1]>,
574                                InstrStage<1, [A9_NPipe]>], [6, 3, 2, 1]>,
575   //
576   // Quad-register Absolute Difference and Accumulate
577   InstrItinData<IIC_VABAQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
578                                // Extra latency cycles since wbck is 6 cycles
579                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
580                                InstrStage<1, [A9_Pipe1]>,
581                                InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>,
582   //
583   // Double-register Integer Pair Add Long
584   InstrItinData<IIC_VPALiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
585                                // Extra latency cycles since wbck is 6 cycles
586                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
587                                InstrStage<1, [A9_Pipe1]>,
588                                InstrStage<1, [A9_NPipe]>], [6, 3, 1]>,
589   //
590   // Quad-register Integer Pair Add Long
591   InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
592                                // Extra latency cycles since wbck is 6 cycles
593                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
594                                InstrStage<1, [A9_Pipe1]>,
595                                InstrStage<2, [A9_NPipe]>], [6, 3, 1]>,
596
597   //
598   // Double-register Integer Multiply (.8, .16)
599   InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_DRegsN],   0, Required>,
600                                // Extra latency cycles since wbck is 6 cycles
601                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
602                                InstrStage<1, [A9_Pipe1]>,
603                                InstrStage<1, [A9_NPipe]>], [6, 2, 2]>,
604   //
605   // Quad-register Integer Multiply (.8, .16)
606   InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
607                                // Extra latency cycles since wbck is 7 cycles
608                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
609                                InstrStage<1, [A9_Pipe1]>,
610                                InstrStage<2, [A9_NPipe]>], [7, 2, 2]>,
611
612   //
613   // Double-register Integer Multiply (.32)
614   InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_DRegsN],   0, Required>,
615                                // Extra latency cycles since wbck is 7 cycles
616                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
617                                InstrStage<1, [A9_Pipe1]>,
618                                InstrStage<2, [A9_NPipe]>], [7, 2, 1]>,
619   //
620   // Quad-register Integer Multiply (.32)
621   InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
622                                // Extra latency cycles since wbck is 9 cycles
623                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
624                                InstrStage<1, [A9_Pipe1]>,
625                                InstrStage<4, [A9_NPipe]>], [9, 2, 1]>,
626   //
627   // Double-register Integer Multiply-Accumulate (.8, .16)
628   InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_DRegsN],   0, Required>,
629                                // Extra latency cycles since wbck is 6 cycles
630                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
631                                InstrStage<1, [A9_Pipe1]>,
632                                InstrStage<1, [A9_NPipe]>], [6, 3, 2, 2]>,
633   //
634   // Double-register Integer Multiply-Accumulate (.32)
635   InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_DRegsN],   0, Required>,
636                                // Extra latency cycles since wbck is 7 cycles
637                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
638                                InstrStage<1, [A9_Pipe1]>,
639                                InstrStage<2, [A9_NPipe]>], [7, 3, 2, 1]>,
640   //
641   // Quad-register Integer Multiply-Accumulate (.8, .16)
642   InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
643                                // Extra latency cycles since wbck is 7 cycles
644                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
645                                InstrStage<1, [A9_Pipe1]>,
646                                InstrStage<2, [A9_NPipe]>], [7, 3, 2, 2]>,
647   //
648   // Quad-register Integer Multiply-Accumulate (.32)
649   InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
650                                // Extra latency cycles since wbck is 9 cycles
651                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
652                                InstrStage<1, [A9_Pipe1]>,
653                                InstrStage<4, [A9_NPipe]>], [9, 3, 2, 1]>,
654   //
655   // Move Immediate
656   InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A9_DRegsN],   0, Required>,
657                                // Extra latency cycles since wbck is 6 cycles
658                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
659                                InstrStage<1, [A9_Pipe1]>,
660                                InstrStage<1, [A9_NPipe]>], [3]>,
661   //
662   // Double-register Permute Move
663   InstrItinData<IIC_VMOVD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
664   // FIXME: all latencies are arbitrary, no information is available
665                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
666                                InstrStage<1, [A9_Pipe1]>,
667                                InstrStage<1, [A9_LSPipe]>], [2, 1]>,
668   //
669   // Quad-register Permute Move
670   // Result written in N2, but that is relative to the last cycle of multicycle,
671   // so we use 3 for those cases
672   InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
673   // FIXME: all latencies are arbitrary, no information is available
674                                InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
675                                InstrStage<1, [A9_Pipe1]>,
676                                InstrStage<2, [A9_NPipe]>], [3, 1]>,
677   //
678   // Integer to Single-precision Move
679   InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
680   // FIXME: all latencies are arbitrary, no information is available
681                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
682                                InstrStage<1, [A9_Pipe1]>,
683                                InstrStage<1, [A9_NPipe]>], [2, 1]>,
684   //
685   // Integer to Double-precision Move
686   InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
687   // FIXME: all latencies are arbitrary, no information is available
688                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
689                                InstrStage<1, [A9_Pipe1]>,
690                                InstrStage<1, [A9_NPipe]>], [2, 1, 1]>,
691   //
692   // Single-precision to Integer Move
693   InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
694   // FIXME: all latencies are arbitrary, no information is available
695                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
696                                InstrStage<1, [A9_Pipe1]>,
697                                InstrStage<1, [A9_NPipe]>], [2, 1]>,
698   //
699   // Double-precision to Integer Move
700   InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
701   // FIXME: all latencies are arbitrary, no information is available
702                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
703                                InstrStage<1, [A9_Pipe1]>,
704                                InstrStage<1, [A9_NPipe]>], [2, 2, 1]>,
705   //
706   // Integer to Lane Move
707   InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_DRegsN],   0, Required>,
708   // FIXME: all latencies are arbitrary, no information is available
709                                InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
710                                InstrStage<1, [A9_Pipe1]>,
711                                InstrStage<2, [A9_NPipe]>], [3, 1, 1]>,
712
713   //
714   // Double-register FP Unary
715   InstrItinData<IIC_VUNAD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
716                                // Extra latency cycles since wbck is 6 cycles
717                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
718                                InstrStage<1, [A9_Pipe1]>,
719                                InstrStage<1, [A9_NPipe]>], [5, 2]>,
720   //
721   // Quad-register FP Unary
722   // Result written in N5, but that is relative to the last cycle of multicycle,
723   // so we use 6 for those cases
724   InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
725                                // Extra latency cycles since wbck is 7 cycles
726                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
727                                InstrStage<1, [A9_Pipe1]>,
728                                InstrStage<2, [A9_NPipe]>], [6, 2]>,
729   //
730   // Double-register FP Binary
731   // FIXME: We're using this itin for many instructions and [2, 2] here is too
732   // optimistic.
733   InstrItinData<IIC_VBIND,    [InstrStage<1, [A9_DRegsN],   0, Required>,
734                                // Extra latency cycles since wbck is 6 cycles
735                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
736                                InstrStage<1, [A9_Pipe1]>,
737                                InstrStage<1, [A9_NPipe]>], [5, 2, 2]>,
738   //
739   // Quad-register FP Binary
740   // Result written in N5, but that is relative to the last cycle of multicycle,
741   // so we use 6 for those cases
742   // FIXME: We're using this itin for many instructions and [2, 2] here is too
743   // optimistic.
744   InstrItinData<IIC_VBINQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
745                                // Extra latency cycles since wbck is 7 cycles
746                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
747                                InstrStage<1, [A9_Pipe1]>,
748                                InstrStage<2, [A9_NPipe]>], [6, 2, 2]>,
749   //
750   // Double-register FP Multiply-Accumulate
751   InstrItinData<IIC_VMACD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
752                                // Extra latency cycles since wbck is 7 cycles
753                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
754                                InstrStage<1, [A9_Pipe1]>,
755                                InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>,
756   //
757   // Quad-register FP Multiply-Accumulate
758   // Result written in N9, but that is relative to the last cycle of multicycle,
759   // so we use 10 for those cases
760   InstrItinData<IIC_VMACQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
761                                // Extra latency cycles since wbck is 9 cycles
762                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
763                                InstrStage<1, [A9_Pipe1]>,
764                                InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>,
765   //
766   // Double-register Reciprocal Step
767   InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
768                                // Extra latency cycles since wbck is 7 cycles
769                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
770                                InstrStage<1, [A9_Pipe1]>,
771                                InstrStage<2, [A9_NPipe]>], [6, 2, 2]>,
772   //
773   // Quad-register Reciprocal Step
774   InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
775                                // Extra latency cycles since wbck is 9 cycles
776                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
777                                InstrStage<1, [A9_Pipe1]>,
778                                InstrStage<4, [A9_NPipe]>], [8, 2, 2]>,
779   //
780   // Double-register Permute
781   InstrItinData<IIC_VPERMD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
782                                // Extra latency cycles since wbck is 6 cycles
783                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
784                                InstrStage<1, [A9_Pipe1]>,
785                                InstrStage<1, [A9_NPipe]>], [2, 2, 1, 1]>,
786   //
787   // Quad-register Permute
788   // Result written in N2, but that is relative to the last cycle of multicycle,
789   // so we use 3 for those cases
790   InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
791                                // Extra latency cycles since wbck is 7 cycles
792                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
793                                InstrStage<1, [A9_Pipe1]>,
794                                InstrStage<2, [A9_NPipe]>], [3, 3, 1, 1]>,
795   //
796   // Quad-register Permute (3 cycle issue)
797   // Result written in N2, but that is relative to the last cycle of multicycle,
798   // so we use 4 for those cases
799   InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A9_DRegsN],   0, Required>,
800                                // Extra latency cycles since wbck is 8 cycles
801                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
802                                InstrStage<1, [A9_Pipe1]>,
803                                InstrStage<3, [A9_LSPipe]>], [4, 4, 1, 1]>,
804
805   //
806   // Double-register VEXT
807   InstrItinData<IIC_VEXTD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
808                                // Extra latency cycles since wbck is 6 cycles
809                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
810                                InstrStage<1, [A9_Pipe1]>,
811                                InstrStage<1, [A9_NPipe]>], [2, 1, 1]>,
812   //
813   // Quad-register VEXT
814   InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
815                                // Extra latency cycles since wbck is 7 cycles
816                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
817                                InstrStage<1, [A9_Pipe1]>,
818                                InstrStage<2, [A9_NPipe]>], [3, 1, 1]>,
819   //
820   // VTB
821   InstrItinData<IIC_VTB1,     [InstrStage<1, [A9_DRegsN],   0, Required>,
822                                // Extra latency cycles since wbck is 7 cycles
823                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
824                                InstrStage<1, [A9_Pipe1]>,
825                                InstrStage<2, [A9_NPipe]>], [3, 2, 1]>,
826   InstrItinData<IIC_VTB2,     [InstrStage<2, [A9_DRegsN],   0, Required>,
827                                // Extra latency cycles since wbck is 7 cycles
828                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
829                                InstrStage<1, [A9_Pipe1]>,
830                                InstrStage<2, [A9_NPipe]>], [3, 2, 2, 1]>,
831   InstrItinData<IIC_VTB3,     [InstrStage<2, [A9_DRegsN],   0, Required>,
832                                // Extra latency cycles since wbck is 8 cycles
833                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
834                                InstrStage<1, [A9_Pipe1]>,
835                                InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 1]>,
836   InstrItinData<IIC_VTB4,     [InstrStage<1, [A9_DRegsN],   0, Required>,
837                                // Extra latency cycles since wbck is 8 cycles
838                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
839                                InstrStage<1, [A9_Pipe1]>,
840                                InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 3, 1]>,
841   //
842   // VTBX
843   InstrItinData<IIC_VTBX1,    [InstrStage<1, [A9_DRegsN],   0, Required>,
844                                // Extra latency cycles since wbck is 7 cycles
845                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
846                                InstrStage<1, [A9_Pipe1]>,
847                                InstrStage<2, [A9_NPipe]>], [3, 1, 2, 1]>,
848   InstrItinData<IIC_VTBX2,    [InstrStage<1, [A9_DRegsN],   0, Required>,
849                                // Extra latency cycles since wbck is 7 cycles
850                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
851                                InstrStage<1, [A9_Pipe1]>,
852                                InstrStage<2, [A9_NPipe]>], [3, 1, 2, 2, 1]>,
853   InstrItinData<IIC_VTBX3,    [InstrStage<1, [A9_DRegsN],   0, Required>,
854                                // Extra latency cycles since wbck is 8 cycles
855                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
856                                InstrStage<1, [A9_Pipe1]>,
857                                InstrStage<3, [A9_NPipe]>], [4, 1, 2, 2, 3, 1]>,
858   InstrItinData<IIC_VTBX4,    [InstrStage<1, [A9_DRegsN],   0, Required>,
859                                // Extra latency cycles since wbck is 8 cycles
860                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
861                                InstrStage<1, [A9_Pipe1]>,
862                                InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]>
863 ]>;