1 //=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines the itinerary class data for the ARM Cortex A9 processors.
12 //===----------------------------------------------------------------------===//
15 // Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical
19 def A9_Issue : FuncUnit; // issue; load/store itineraries hold it to enforce 1 load/store per cycle
20 def A9_Pipe0 : FuncUnit; // pipeline 0 (dual issue is represented as A9_Pipe0 | A9_Pipe1)
21 def A9_Pipe1 : FuncUnit; // pipeline 1 (second half of the dual-issue pair)
22 def A9_LSPipe : FuncUnit; // LS pipe (presumably load/store; used by the FP/NEON load and store itineraries)
23 def A9_NPipe : FuncUnit; // NEON ALU/MUL pipe
24 def A9_DRegsVFP: FuncUnit; // FP register set, VFP side (artificial FU: Required by VFP ops, Reserved by NEON ops)
25 def A9_DRegsN : FuncUnit; // FP register set, NEON side (artificial FU: Required by NEON ops, Reserved by VFP ops)
27 // Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1
29 def CortexA9Itineraries : ProcessorItineraries<
30 [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1, A9_Issue], [
31 // VFP and NEON share the same register file. This means that every VFP
32 // instruction should wait for full completion of the consecutive NEON
33 // instruction and vice-versa. We model this behavior with two artificial FUs:
34 // DRegsVFP and DRegsN.
36 // Every VFP instruction:
37 // - Acquires DRegsVFP resource for 1 cycle
38 // - Reserves DRegsN resource for the whole duration (including time to
39 // register file writeback!).
40 // Every NEON instruction does the same but with FUs swapped.
42 // Since the reserved FU cannot be acquired this models precisely "cross-domain"
46 // Issue through integer pipeline, and execute in NEON unit.
48 // FP Special Register to Integer Register File Move
49 InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
50 InstrStage<2, [A9_DRegsN], 0, Reserved>,
51 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
52 InstrStage<1, [A9_NPipe]>]>,
54 // Single-precision FP Unary
55 InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
56 // Extra latency cycles since wbck is 2 cycles
57 InstrStage<3, [A9_DRegsN], 0, Reserved>,
58 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
59 InstrStage<1, [A9_NPipe]>], [1, 1]>,
61 // Double-precision FP Unary
62 InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
63 // Extra latency cycles since wbck is 2 cycles
64 InstrStage<3, [A9_DRegsN], 0, Reserved>,
65 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
66 InstrStage<1, [A9_NPipe]>], [1, 1]>,
69 // Single-precision FP Compare
70 InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
71 // Extra latency cycles since wbck is 4 cycles
72 InstrStage<5, [A9_DRegsN], 0, Reserved>,
73 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
74 InstrStage<1, [A9_NPipe]>], [1, 1]>,
76 // Double-precision FP Compare
77 InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
78 // Extra latency cycles since wbck is 4 cycles
79 InstrStage<5, [A9_DRegsN], 0, Reserved>,
80 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
81 InstrStage<1, [A9_NPipe]>], [1, 1]>,
83 // Single to Double FP Convert
84 InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
85 InstrStage<5, [A9_DRegsN], 0, Reserved>,
86 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
87 InstrStage<1, [A9_NPipe]>], [4, 1]>,
89 // Double to Single FP Convert
90 InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
91 InstrStage<5, [A9_DRegsN], 0, Reserved>,
92 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
93 InstrStage<1, [A9_NPipe]>], [4, 1]>,
96 // Single to Half FP Convert
97 InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
98 InstrStage<5, [A9_DRegsN], 0, Reserved>,
99 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
100 InstrStage<1, [A9_NPipe]>], [4, 1]>,
102 // Half to Single FP Convert
103 InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
104 InstrStage<3, [A9_DRegsN], 0, Reserved>,
105 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
106 InstrStage<1, [A9_NPipe]>], [2, 1]>,
109 // Single-Precision FP to Integer Convert
110 InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
111 InstrStage<5, [A9_DRegsN], 0, Reserved>,
112 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
113 InstrStage<1, [A9_NPipe]>], [4, 1]>,
115 // Double-Precision FP to Integer Convert
116 InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
117 InstrStage<5, [A9_DRegsN], 0, Reserved>,
118 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
119 InstrStage<1, [A9_NPipe]>], [4, 1]>,
121 // Integer to Single-Precision FP Convert
122 InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
123 InstrStage<5, [A9_DRegsN], 0, Reserved>,
124 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
125 InstrStage<1, [A9_NPipe]>], [4, 1]>,
127 // Integer to Double-Precision FP Convert
128 InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
129 InstrStage<5, [A9_DRegsN], 0, Reserved>,
130 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
131 InstrStage<1, [A9_NPipe]>], [4, 1]>,
133 // Single-precision FP ALU
134 InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
135 InstrStage<5, [A9_DRegsN], 0, Reserved>,
136 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
137 InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
139 // Double-precision FP ALU
140 InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
141 InstrStage<5, [A9_DRegsN], 0, Reserved>,
142 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
143 InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
145 // Single-precision FP Multiply
146 InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
147 InstrStage<6, [A9_DRegsN], 0, Reserved>,
148 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
149 InstrStage<1, [A9_NPipe]>], [5, 1, 1]>,
151 // Double-precision FP Multiply
152 InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
153 InstrStage<7, [A9_DRegsN], 0, Reserved>,
154 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
155 InstrStage<2, [A9_NPipe]>], [6, 1, 1]>,
157 // Single-precision FP MAC
158 InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
159 InstrStage<9, [A9_DRegsN], 0, Reserved>,
160 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
161 InstrStage<1, [A9_NPipe]>], [8, 0, 1, 1]>,
163 // Double-precision FP MAC
164 InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
165 InstrStage<10, [A9_DRegsN], 0, Reserved>,
166 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
167 InstrStage<2, [A9_NPipe]>], [9, 0, 1, 1]>,
169 // Single-precision FP DIV
170 InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
171 InstrStage<16, [A9_DRegsN], 0, Reserved>,
172 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
173 InstrStage<10, [A9_NPipe]>], [15, 1, 1]>,
175 // Double-precision FP DIV
176 InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
177 InstrStage<26, [A9_DRegsN], 0, Reserved>,
178 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
179 InstrStage<20, [A9_NPipe]>], [25, 1, 1]>,
181 // Single-precision FP SQRT
182 InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
183 InstrStage<18, [A9_DRegsN], 0, Reserved>,
184 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
185 InstrStage<13, [A9_NPipe]>], [17, 1]>,
187 // Double-precision FP SQRT
188 InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
189 InstrStage<33, [A9_DRegsN], 0, Reserved>,
190 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
191 InstrStage<28, [A9_NPipe]>], [32, 1]>,
194 // Integer to Single-precision Move
195 InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
196 // Extra 1 latency cycle since wbck is 2 cycles
197 InstrStage<3, [A9_DRegsN], 0, Reserved>,
198 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
199 InstrStage<1, [A9_NPipe]>], [1, 1]>,
201 // Integer to Double-precision Move
202 InstrItinData<IIC_fpMOVID, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
203 // Extra 1 latency cycle since wbck is 2 cycles
204 InstrStage<3, [A9_DRegsN], 0, Reserved>,
205 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
206 InstrStage<1, [A9_NPipe]>], [1, 1, 1]>,
208 // Single-precision to Integer Move
209 InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
210 InstrStage<2, [A9_DRegsN], 0, Reserved>,
211 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
212 InstrStage<1, [A9_NPipe]>], [1, 1]>,
214 // Double-precision to Integer Move
215 InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
216 InstrStage<2, [A9_DRegsN], 0, Reserved>,
217 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
218 InstrStage<1, [A9_NPipe]>], [1, 1, 1]>,
220 // Single-precision FP Load
221 // use A9_Issue to enforce the 1 load/store per cycle limit
222 InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
223 InstrStage<2, [A9_DRegsN], 0, Reserved>,
224 InstrStage<1, [A9_Issue], 0>,
225 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
226 InstrStage<1, [A9_LSPipe], 0>,
227 InstrStage<1, [A9_NPipe]>]>,
229 // Double-precision FP Load
230 // use A9_Issue to enforce the 1 load/store per cycle limit
231 InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
232 InstrStage<2, [A9_DRegsN], 0, Reserved>,
233 InstrStage<1, [A9_Issue], 0>,
234 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
235 InstrStage<1, [A9_LSPipe], 0>,
236 InstrStage<1, [A9_NPipe]>]>,
239 // use A9_Issue to enforce the 1 load/store per cycle limit
240 InstrItinData<IIC_fpLoadm, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
241 InstrStage<2, [A9_DRegsN], 0, Reserved>,
242 InstrStage<1, [A9_Issue], 0>,
243 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
244 InstrStage<1, [A9_LSPipe], 0>,
245 InstrStage<1, [A9_NPipe]>]>,
247 // Single-precision FP Store
248 // use A9_Issue to enforce the 1 load/store per cycle limit
249 InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
250 InstrStage<2, [A9_DRegsN], 0, Reserved>,
251 InstrStage<1, [A9_Issue], 0>,
252 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
253 InstrStage<1, [A9_LSPipe], 0>,
254 InstrStage<1, [A9_NPipe]>]>,
256 // Double-precision FP Store
257 // use A9_Issue to enforce the 1 load/store per cycle limit
258 InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
259 InstrStage<2, [A9_DRegsN], 0, Reserved>,
260 InstrStage<1, [A9_Issue], 0>,
261 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
262 InstrStage<1, [A9_LSPipe], 0>,
263 InstrStage<1, [A9_NPipe]>]>,
266 // use A9_Issue to enforce the 1 load/store per cycle limit
267 InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
268 InstrStage<2, [A9_DRegsN], 0, Reserved>,
269 InstrStage<1, [A9_Issue], 0>,
270 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
271 InstrStage<1, [A9_LSPipe], 0>,
272 InstrStage<1, [A9_NPipe]>]>,
274 // Issue through integer pipeline, and execute in NEON unit.
275 // FIXME: Neon pipeline and LdSt unit are multiplexed.
276 // Add some syntactic sugar to model this!
278 // FIXME: We don't model this instruction properly
279 InstrItinData<IIC_VLD1, [InstrStage<1, [A9_DRegsN], 0, Required>,
280 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
281 InstrStage<1, [A9_Issue], 0>,
282 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
283 InstrStage<1, [A9_LSPipe], 0>,
284 InstrStage<1, [A9_NPipe]>]>,
287 // FIXME: We don't model this instruction properly
288 InstrItinData<IIC_VLD2, [InstrStage<1, [A9_DRegsN], 0, Required>,
289 // Extra latency cycles since wbck is 6 cycles
290 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
291 InstrStage<1, [A9_Issue], 0>,
292 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
293 InstrStage<1, [A9_LSPipe], 0>,
294 InstrStage<1, [A9_NPipe]>], [2, 2, 1]>,
297 // FIXME: We don't model this instruction properly
298 InstrItinData<IIC_VLD3, [InstrStage<1, [A9_DRegsN], 0, Required>,
299 // Extra latency cycles since wbck is 6 cycles
300 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
301 InstrStage<1, [A9_Issue], 0>,
302 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
303 InstrStage<1, [A9_LSPipe], 0>,
304 InstrStage<1, [A9_NPipe]>], [2, 2, 2, 1]>,
307 // FIXME: We don't model this instruction properly
308 InstrItinData<IIC_VLD4, [InstrStage<1, [A9_DRegsN], 0, Required>,
309 // Extra latency cycles since wbck is 6 cycles
310 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
311 InstrStage<1, [A9_Issue], 0>,
312 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
313 InstrStage<1, [A9_LSPipe], 0>,
314 InstrStage<1, [A9_NPipe]>], [2, 2, 2, 2, 1]>,
317 // FIXME: We don't model this instruction properly
318 InstrItinData<IIC_VST, [InstrStage<1, [A9_DRegsN], 0, Required>,
319 // Extra latency cycles since wbck is 6 cycles
320 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
321 InstrStage<1, [A9_Issue], 0>,
322 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
323 InstrStage<1, [A9_LSPipe], 0>,
324 InstrStage<1, [A9_NPipe]>]>,
326 // Double-register Integer Unary
327 InstrItinData<IIC_VUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
328 // Extra latency cycles since wbck is 6 cycles
329 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
330 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
331 InstrStage<1, [A9_NPipe]>], [4, 2]>,
333 // Quad-register Integer Unary
334 InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
335 // Extra latency cycles since wbck is 6 cycles
336 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
337 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
338 InstrStage<1, [A9_NPipe]>], [4, 2]>,
340 // Double-register Integer Q-Unary
341 InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
342 // Extra latency cycles since wbck is 6 cycles
343 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
344 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
345 InstrStage<1, [A9_NPipe]>], [4, 1]>,
347 // Quad-register Integer Q-Unary
348 InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
349 // Extra latency cycles since wbck is 6 cycles
350 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
351 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
352 InstrStage<1, [A9_NPipe]>], [4, 1]>,
354 // Double-register Integer Binary
355 InstrItinData<IIC_VBINiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
356 // Extra latency cycles since wbck is 6 cycles
357 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
358 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
359 InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
361 // Quad-register Integer Binary
362 InstrItinData<IIC_VBINiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
363 // Extra latency cycles since wbck is 6 cycles
364 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
365 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
366 InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
368 // Double-register Integer Subtract
369 InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
370 // Extra latency cycles since wbck is 6 cycles
371 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
372 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
373 InstrStage<1, [A9_NPipe]>], [3, 2, 1]>,
375 // Quad-register Integer Subtract
376 InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
377 // Extra latency cycles since wbck is 6 cycles
378 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
379 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
380 InstrStage<1, [A9_NPipe]>], [3, 2, 1]>,
382 // Double-register Integer Shift
383 InstrItinData<IIC_VSHLiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
384 // Extra latency cycles since wbck is 6 cycles
385 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
386 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
387 InstrStage<1, [A9_NPipe]>], [3, 1, 1]>,
389 // Quad-register Integer Shift
390 InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
391 // Extra latency cycles since wbck is 6 cycles
392 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
393 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
394 InstrStage<1, [A9_NPipe]>], [3, 1, 1]>,
396 // Double-register Integer Shift (4 cycle)
397 InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A9_DRegsN], 0, Required>,
398 // Extra latency cycles since wbck is 6 cycles
399 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
400 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
401 InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
403 // Quad-register Integer Shift (4 cycle)
404 InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
405 // Extra latency cycles since wbck is 6 cycles
406 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
407 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
408 InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
410 // Double-register Integer Binary (4 cycle)
411 InstrItinData<IIC_VBINi4D, [InstrStage<1, [A9_DRegsN], 0, Required>,
412 // Extra latency cycles since wbck is 6 cycles
413 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
414 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
415 InstrStage<1, [A9_NPipe]>], [4, 2, 2]>,
417 // Quad-register Integer Binary (4 cycle)
418 InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
419 // Extra latency cycles since wbck is 6 cycles
420 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
421 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
422 InstrStage<1, [A9_NPipe]>], [4, 2, 2]>,
424 // Double-register Integer Subtract (4 cycle); keyed by the i4 class so it does not shadow the 3-cycle IIC_VSUBiD entry
425 InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A9_DRegsN], 0, Required>,
426 // Extra latency cycles since wbck is 6 cycles (VFP side stays reserved for 7)
427 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
428 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
429 InstrStage<1, [A9_NPipe]>], [4, 2, 1]>,
431 // Quad-register Integer Subtract (4 cycle); keyed by the i4 class so it does not shadow the 3-cycle IIC_VSUBiQ entry
432 InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
433 // Extra latency cycles since wbck is 6 cycles (VFP side stays reserved for 7)
434 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
435 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
436 InstrStage<1, [A9_NPipe]>], [4, 2, 1]>,
439 // Double-register Integer Count
440 InstrItinData<IIC_VCNTiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
441 // Extra latency cycles since wbck is 6 cycles
442 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
443 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
444 InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
446 // Quad-register Integer Count
447 // Result written in N3, but that is relative to the last cycle of multicycle,
448 // so we use 4 for those cases
449 InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
450 // Extra latency cycles since wbck is 7 cycles
451 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
452 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
453 InstrStage<2, [A9_NPipe]>], [4, 2, 2]>,
455 // Double-register Absolute Difference and Accumulate
456 InstrItinData<IIC_VABAD, [InstrStage<1, [A9_DRegsN], 0, Required>,
457 // Extra latency cycles since wbck is 6 cycles
458 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
459 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
460 InstrStage<1, [A9_NPipe]>], [6, 3, 2, 1]>,
462 // Quad-register Absolute Difference and Accumulate
463 InstrItinData<IIC_VABAQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
464 // Extra latency cycles since wbck is 6 cycles
465 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
466 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
467 InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>,
469 // Double-register Integer Pair Add Long
470 InstrItinData<IIC_VPALiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
471 // Extra latency cycles since wbck is 6 cycles
472 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
473 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
474 InstrStage<1, [A9_NPipe]>], [6, 3, 1]>,
476 // Quad-register Integer Pair Add Long
477 InstrItinData<IIC_VPALiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
478 // Extra latency cycles since wbck is 6 cycles
479 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
480 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
481 InstrStage<2, [A9_NPipe]>], [6, 3, 1]>,
484 // Double-register Integer Multiply (.8, .16)
485 InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_DRegsN], 0, Required>,
486 // Extra latency cycles since wbck is 6 cycles
487 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
488 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
489 InstrStage<1, [A9_NPipe]>], [6, 2, 2]>,
491 // Quad-register Integer Multiply (.8, .16)
492 InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
493 // Extra latency cycles since wbck is 7 cycles
494 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
495 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
496 InstrStage<2, [A9_NPipe]>], [7, 2, 2]>,
499 // Double-register Integer Multiply (.32)
500 InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_DRegsN], 0, Required>,
501 // Extra latency cycles since wbck is 7 cycles
502 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
503 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
504 InstrStage<2, [A9_NPipe]>], [7, 2, 1]>,
506 // Quad-register Integer Multiply (.32)
507 InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
508 // Extra latency cycles since wbck is 9 cycles
509 InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
510 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
511 InstrStage<4, [A9_NPipe]>], [9, 2, 1]>,
513 // Double-register Integer Multiply-Accumulate (.8, .16)
514 InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_DRegsN], 0, Required>,
515 // Extra latency cycles since wbck is 6 cycles
516 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
517 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
518 InstrStage<1, [A9_NPipe]>], [6, 3, 2, 2]>,
520 // Double-register Integer Multiply-Accumulate (.32)
521 InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_DRegsN], 0, Required>,
522 // Extra latency cycles since wbck is 7 cycles
523 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
524 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
525 InstrStage<2, [A9_NPipe]>], [7, 3, 2, 1]>,
527 // Quad-register Integer Multiply-Accumulate (.8, .16)
528 InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
529 // Extra latency cycles since wbck is 7 cycles
530 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
531 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
532 InstrStage<2, [A9_NPipe]>], [7, 3, 2, 2]>,
534 // Quad-register Integer Multiply-Accumulate (.32)
535 InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
536 // Extra latency cycles since wbck is 9 cycles
537 InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
538 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
539 InstrStage<4, [A9_NPipe]>], [9, 3, 2, 1]>,
542 InstrItinData<IIC_VMOVImm, [InstrStage<1, [A9_DRegsN], 0, Required>,
543 // Extra latency cycles since wbck is 6 cycles
544 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
545 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
546 InstrStage<1, [A9_NPipe]>], [3]>,
548 // Double-register Permute Move
549 InstrItinData<IIC_VMOVD, [InstrStage<1, [A9_DRegsN], 0, Required>,
550 // FIXME: all latencies are arbitrary, no information is available
551 InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
552 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
553 InstrStage<1, [A9_LSPipe]>], [2, 1]>,
555 // Quad-register Permute Move
556 // Result written in N2, but that is relative to the last cycle of multicycle,
557 // so we use 3 for those cases
558 InstrItinData<IIC_VMOVQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
559 // FIXME: all latencies are arbitrary, no information is available
560 InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
561 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
562 InstrStage<2, [A9_NPipe]>], [3, 1]>,
564 // Integer to Single-precision Move
565 InstrItinData<IIC_VMOVIS , [InstrStage<1, [A9_DRegsN], 0, Required>,
566 // FIXME: all latencies are arbitrary, no information is available
567 InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
568 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
569 InstrStage<1, [A9_NPipe]>], [2, 1]>,
571 // Integer to Double-precision Move
572 InstrItinData<IIC_VMOVID , [InstrStage<1, [A9_DRegsN], 0, Required>,
573 // FIXME: all latencies are arbitrary, no information is available
574 InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
575 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
576 InstrStage<1, [A9_NPipe]>], [2, 1, 1]>,
578 // Single-precision to Integer Move
579 InstrItinData<IIC_VMOVSI , [InstrStage<1, [A9_DRegsN], 0, Required>,
580 // FIXME: all latencies are arbitrary, no information is available
581 InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
582 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
583 InstrStage<1, [A9_NPipe]>], [2, 1]>,
585 // Double-precision to Integer Move
586 InstrItinData<IIC_VMOVDI , [InstrStage<1, [A9_DRegsN], 0, Required>,
587 // FIXME: all latencies are arbitrary, no information is available
588 InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
589 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
590 InstrStage<1, [A9_NPipe]>], [2, 2, 1]>,
592 // Integer to Lane Move
593 InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_DRegsN], 0, Required>,
594 // FIXME: all latencies are arbitrary, no information is available
595 InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
596 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
597 InstrStage<2, [A9_NPipe]>], [3, 1, 1]>,
600 // Double-register FP Unary
601 InstrItinData<IIC_VUNAD, [InstrStage<1, [A9_DRegsN], 0, Required>,
602 // Extra latency cycles since wbck is 6 cycles
603 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
604 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
605 InstrStage<1, [A9_NPipe]>], [5, 2]>,
607 // Quad-register FP Unary
608 // Result written in N5, but that is relative to the last cycle of multicycle,
609 // so we use 6 for those cases
610 InstrItinData<IIC_VUNAQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
611 // Extra latency cycles since wbck is 7 cycles
612 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
613 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
614 InstrStage<2, [A9_NPipe]>], [6, 2]>,
616 // Double-register FP Binary
617 // FIXME: We're using this itin for many instructions and [2, 2] here is too
619 InstrItinData<IIC_VBIND, [InstrStage<1, [A9_DRegsN], 0, Required>,
620 // Extra latency cycles since wbck is 6 cycles
621 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
622 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
623 InstrStage<1, [A9_NPipe]>], [5, 2, 2]>,
625 // Quad-register FP Binary
626 // Result written in N5, but that is relative to the last cycle of multicycle,
627 // so we use 6 for those cases
628 // FIXME: We're using this itin for many instructions and [2, 2] here is too
630 InstrItinData<IIC_VBINQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
631 // Extra latency cycles since wbck is 7 cycles
632 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
633 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
634 InstrStage<2, [A9_NPipe]>], [6, 2, 2]>,
636 // Double-register FP Multiply-Accumulate
637 InstrItinData<IIC_VMACD, [InstrStage<1, [A9_DRegsN], 0, Required>,
638 // Extra latency cycles since wbck is 7 cycles
639 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
640 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
641 InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>,
643 // Quad-register FP Multiply-Accumulate
644 // Result written in N9, but that is relative to the last cycle of multicycle,
645 // so we use 10 for those cases
646 InstrItinData<IIC_VMACQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
647 // Extra latency cycles since wbck is 9 cycles
648 InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
649 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
650 InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>,
652 // Double-register Reciprocal Step
653 InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_DRegsN], 0, Required>,
654 // Extra latency cycles since wbck is 7 cycles
655 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
656 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
657 InstrStage<2, [A9_NPipe]>], [6, 2, 2]>,
659 // Quad-register Reciprocal Step
660 InstrItinData<IIC_VRECSQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
661 // Extra latency cycles since wbck is 9 cycles
662 InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
663 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
664 InstrStage<4, [A9_NPipe]>], [8, 2, 2]>,
666 // Double-register Permute
667 InstrItinData<IIC_VPERMD, [InstrStage<1, [A9_DRegsN], 0, Required>,
668 // Extra latency cycles since wbck is 6 cycles
669 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
670 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
671 InstrStage<1, [A9_NPipe]>], [2, 2, 1, 1]>,
673 // Quad-register Permute
674 // Result written in N2, but that is relative to the last cycle of multicycle,
675 // so we use 3 for those cases
676 InstrItinData<IIC_VPERMQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
677 // Extra latency cycles since wbck is 7 cycles
678 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
679 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
680 InstrStage<2, [A9_NPipe]>], [3, 3, 1, 1]>,
682 // Quad-register Permute (3 cycle issue)
683 // Result written in N2, but that is relative to the last cycle of multicycle,
684 // so we use 4 for those cases
685 InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A9_DRegsN], 0, Required>,
686 // Extra latency cycles since wbck is 8 cycles
687 InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
688 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
689 InstrStage<3, [A9_LSPipe]>], [4, 4, 1, 1]>,
692 // Double-register VEXT
693 InstrItinData<IIC_VEXTD, [InstrStage<1, [A9_DRegsN], 0, Required>,
694 // Extra latency cycles since wbck is 6 cycles
695 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
696 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
697 InstrStage<1, [A9_NPipe]>], [2, 1, 1]>,
699 // Quad-register VEXT
700 InstrItinData<IIC_VEXTQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
701 // Extra latency cycles since wbck is 7 cycles
702 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
703 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
704 InstrStage<2, [A9_NPipe]>], [3, 1, 1]>,
707 InstrItinData<IIC_VTB1, [InstrStage<1, [A9_DRegsN], 0, Required>,
708 // Extra latency cycles since wbck is 7 cycles
709 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
710 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
711 InstrStage<2, [A9_NPipe]>], [3, 2, 1]>,
712 InstrItinData<IIC_VTB2, [InstrStage<2, [A9_DRegsN], 0, Required>, // NOTE(review): DRegsN held 2 cycles here but 1 in IIC_VTB1/IIC_VTB4 -- confirm intended
713 // Extra latency cycles since wbck is 7 cycles
714 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
715 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
716 InstrStage<2, [A9_NPipe]>], [3, 2, 2, 1]>,
717 InstrItinData<IIC_VTB3, [InstrStage<2, [A9_DRegsN], 0, Required>, // NOTE(review): DRegsN held 2 cycles here but 1 in IIC_VTB1/IIC_VTB4 -- confirm intended
718 // Extra latency cycles since wbck is 8 cycles
719 InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
720 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
721 InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 1]>,
722 InstrItinData<IIC_VTB4, [InstrStage<1, [A9_DRegsN], 0, Required>,
723 // Extra latency cycles since wbck is 8 cycles
724 InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
725 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
726 InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 3, 1]>,
729 InstrItinData<IIC_VTBX1, [InstrStage<1, [A9_DRegsN], 0, Required>,
730 // Extra latency cycles since wbck is 7 cycles
731 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
732 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
733 InstrStage<2, [A9_NPipe]>], [3, 1, 2, 1]>,
734 InstrItinData<IIC_VTBX2, [InstrStage<1, [A9_DRegsN], 0, Required>,
735 // Extra latency cycles since wbck is 7 cycles
736 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
737 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
738 InstrStage<2, [A9_NPipe]>], [3, 1, 2, 2, 1]>,
739 InstrItinData<IIC_VTBX3, [InstrStage<1, [A9_DRegsN], 0, Required>,
740 // Extra latency cycles since wbck is 8 cycles
741 InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
742 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
743 InstrStage<3, [A9_NPipe]>], [4, 1, 2, 2, 3, 1]>,
744 InstrItinData<IIC_VTBX4, [InstrStage<1, [A9_DRegsN], 0, Required>,
745 // Extra latency cycles since wbck is 8 cycles
746 InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
747 InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
748 InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> // NOTE(review): NPipe held 2 cycles here but 3 in IIC_VTB4/IIC_VTBX3 -- confirm intended