1 ; RUN: llc -show-mc-encoding -march=arm -mcpu=cortex-a8 -mattr=+neon < %s | FileCheck %s
3 ; FIXME: We cannot currently test the following instructions, which are
4 ; currently marked as for-disassembly only in the .td files:
; Equality compares, 64-bit d-register forms: an icmp/fcmp eq followed by a
; sext of the <N x i1> mask should select a single vceq with the encoding
; pinned on each directive line below.
10 define <8 x i8> @vceq_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
11 %tmp1 = load <8 x i8>* %A
12 %tmp2 = load <8 x i8>* %B
13 ; CHECK: vceq.i8 d16, d16, d17 @ encoding: [0xb1,0x08,0x40,0xf3]
14 %tmp3 = icmp eq <8 x i8> %tmp1, %tmp2
15 %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
20 define <4 x i16> @vceq_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
21 %tmp1 = load <4 x i16>* %A
22 %tmp2 = load <4 x i16>* %B
23 ; CHECK: vceq.i16 d16, d16, d17 @ encoding: [0xb1,0x08,0x50,0xf3]
24 %tmp3 = icmp eq <4 x i16> %tmp1, %tmp2
25 %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
30 define <2 x i32> @vceq_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
31 %tmp1 = load <2 x i32>* %A
32 %tmp2 = load <2 x i32>* %B
33 ; CHECK: vceq.i32 d16, d16, d17 @ encoding: [0xb1,0x08,0x60,0xf3]
34 %tmp3 = icmp eq <2 x i32> %tmp1, %tmp2
35 %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
; fcmp oeq selects the float variant (vceq.f32).
40 define <2 x i32> @vceq_2xfloat(<2 x float>* %A, <2 x float>* %B) nounwind {
41 %tmp1 = load <2 x float>* %A
42 %tmp2 = load <2 x float>* %B
43 ; CHECK: vceq.f32 d16, d16, d17 @ encoding: [0xa1,0x0e,0x40,0xf2]
44 %tmp3 = fcmp oeq <2 x float> %tmp1, %tmp2
45 %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
; Equality compares, 128-bit q-register forms of the same patterns as above.
50 define <16 x i8> @vceq_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
51 %tmp1 = load <16 x i8>* %A
52 %tmp2 = load <16 x i8>* %B
53 ; CHECK: vceq.i8 q8, q8, q9 @ encoding: [0xf2,0x08,0x40,0xf3]
54 %tmp3 = icmp eq <16 x i8> %tmp1, %tmp2
55 %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
60 define <8 x i16> @vceq_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
61 %tmp1 = load <8 x i16>* %A
62 %tmp2 = load <8 x i16>* %B
63 ; CHECK: vceq.i16 q8, q8, q9 @ encoding: [0xf2,0x08,0x50,0xf3]
64 %tmp3 = icmp eq <8 x i16> %tmp1, %tmp2
65 %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
70 define <4 x i32> @vceq_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
71 %tmp1 = load <4 x i32>* %A
72 %tmp2 = load <4 x i32>* %B
73 ; CHECK: vceq.i32 q8, q8, q9 @ encoding: [0xf2,0x08,0x60,0xf3]
74 %tmp3 = icmp eq <4 x i32> %tmp1, %tmp2
75 %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
; fcmp oeq selects the float variant (vceq.f32) on q registers.
80 define <4 x i32> @vceq_4xfloat(<4 x float>* %A, <4 x float>* %B) nounwind {
81 %tmp1 = load <4 x float>* %A
82 %tmp2 = load <4 x float>* %B
83 ; CHECK: vceq.f32 q8, q8, q9 @ encoding: [0xe2,0x0e,0x40,0xf2]
84 %tmp3 = fcmp oeq <4 x float> %tmp1, %tmp2
85 %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
; Signed greater-or-equal compares, d-register forms: icmp sge + sext should
; select vcge.sN with the pinned encoding. The directive line in vcges_4xi16
; was moved before the icmp to match every sibling test in this file; its
; position relative to the other directives (and hence FileCheck behavior)
; is unchanged.
90 define <8 x i8> @vcges_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
91 %tmp1 = load <8 x i8>* %A
92 %tmp2 = load <8 x i8>* %B
93 ; CHECK: vcge.s8 d16, d16, d17 @ encoding: [0xb1,0x03,0x40,0xf2]
94 %tmp3 = icmp sge <8 x i8> %tmp1, %tmp2
95 %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
100 define <4 x i16> @vcges_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
101 %tmp1 = load <4 x i16>* %A
102 %tmp2 = load <4 x i16>* %B
104 ; CHECK: vcge.s16 d16, d16, d17 @ encoding: [0xb1,0x03,0x50,0xf2]
103 %tmp3 = icmp sge <4 x i16> %tmp1, %tmp2
105 %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
110 define <2 x i32> @vcges_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
111 %tmp1 = load <2 x i32>* %A
112 %tmp2 = load <2 x i32>* %B
113 ; CHECK: vcge.s32 d16, d16, d17 @ encoding: [0xb1,0x03,0x60,0xf2]
114 %tmp3 = icmp sge <2 x i32> %tmp1, %tmp2
115 %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
; Unsigned greater-or-equal compares, d-register forms: icmp uge + sext
; should select vcge.uN. The directive line in vcgeu_2xi32 was moved before
; the icmp to match every sibling test in this file; its position relative
; to the other directives (and hence FileCheck behavior) is unchanged.
120 define <8 x i8> @vcgeu_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
121 %tmp1 = load <8 x i8>* %A
122 %tmp2 = load <8 x i8>* %B
123 ; CHECK: vcge.u8 d16, d16, d17 @ encoding: [0xb1,0x03,0x40,0xf3]
124 %tmp3 = icmp uge <8 x i8> %tmp1, %tmp2
125 %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
130 define <4 x i16> @vcgeu_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
131 %tmp1 = load <4 x i16>* %A
132 %tmp2 = load <4 x i16>* %B
133 ; CHECK: vcge.u16 d16, d16, d17 @ encoding: [0xb1,0x03,0x50,0xf3]
134 %tmp3 = icmp uge <4 x i16> %tmp1, %tmp2
135 %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
140 define <2 x i32> @vcgeu_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
141 %tmp1 = load <2 x i32>* %A
142 %tmp2 = load <2 x i32>* %B
144 ; CHECK: vcge.u32 d16, d16, d17 @ encoding: [0xb1,0x03,0x60,0xf3]
143 %tmp3 = icmp uge <2 x i32> %tmp1, %tmp2
145 %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
; fcmp oge + sext should select vcge.f32 (d-register form).
149 ; CHECK: vcge_2xfloat
150 define <2 x i32> @vcge_2xfloat(<2 x float>* %A, <2 x float>* %B) nounwind {
151 %tmp1 = load <2 x float>* %A
152 %tmp2 = load <2 x float>* %B
153 ; CHECK: vcge.f32 d16, d16, d17 @ encoding: [0xa1,0x0e,0x40,0xf3]
154 %tmp3 = fcmp oge <2 x float> %tmp1, %tmp2
155 %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
; Signed greater-or-equal compares, q-register forms (icmp sge + sext ->
; vcge.sN on q registers).
160 define <16 x i8> @vcges_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
161 %tmp1 = load <16 x i8>* %A
162 %tmp2 = load <16 x i8>* %B
163 ; CHECK: vcge.s8 q8, q8, q9 @ encoding: [0xf2,0x03,0x40,0xf2]
164 %tmp3 = icmp sge <16 x i8> %tmp1, %tmp2
165 %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
170 define <8 x i16> @vcges_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
171 %tmp1 = load <8 x i16>* %A
172 %tmp2 = load <8 x i16>* %B
173 ; CHECK: vcge.s16 q8, q8, q9 @ encoding: [0xf2,0x03,0x50,0xf2]
174 %tmp3 = icmp sge <8 x i16> %tmp1, %tmp2
175 %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
180 define <4 x i32> @vcges_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
181 %tmp1 = load <4 x i32>* %A
182 %tmp2 = load <4 x i32>* %B
183 ; CHECK: vcge.s32 q8, q8, q9 @ encoding: [0xf2,0x03,0x60,0xf2]
184 %tmp3 = icmp sge <4 x i32> %tmp1, %tmp2
185 %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
; Unsigned greater-or-equal compares, q-register forms (icmp uge + sext ->
; vcge.uN on q registers).
190 define <16 x i8> @vcgeu_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
191 %tmp1 = load <16 x i8>* %A
192 %tmp2 = load <16 x i8>* %B
193 ; CHECK: vcge.u8 q8, q8, q9 @ encoding: [0xf2,0x03,0x40,0xf3]
194 %tmp3 = icmp uge <16 x i8> %tmp1, %tmp2
195 %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
200 define <8 x i16> @vcgeu_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
201 %tmp1 = load <8 x i16>* %A
202 %tmp2 = load <8 x i16>* %B
203 ; CHECK: vcge.u16 q8, q8, q9 @ encoding: [0xf2,0x03,0x50,0xf3]
204 %tmp3 = icmp uge <8 x i16> %tmp1, %tmp2
205 %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
210 define <4 x i32> @vcgeu_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
211 %tmp1 = load <4 x i32>* %A
212 %tmp2 = load <4 x i32>* %B
213 ; CHECK: vcge.u32 q8, q8, q9 @ encoding: [0xf2,0x03,0x60,0xf3]
214 %tmp3 = icmp uge <4 x i32> %tmp1, %tmp2
215 %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
; fcmp oge + sext should select vcge.f32 (q-register form).
219 ; CHECK: vcge_4xfloat
220 define <4 x i32> @vcge_4xfloat(<4 x float>* %A, <4 x float>* %B) nounwind {
221 %tmp1 = load <4 x float>* %A
222 %tmp2 = load <4 x float>* %B
223 ; CHECK: vcge.f32 q8, q8, q9 @ encoding: [0xe2,0x0e,0x40,0xf3]
224 %tmp3 = fcmp oge <4 x float> %tmp1, %tmp2
225 %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
; Intrinsics lowered to vacge.f32 in the tests below (d- and q-register forms).
229 declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone
230 declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone
; The vacged intrinsic should select vacge.f32 (d-register form).
; Bug fix: the encoding directive below was a plain comment missing its
; directive prefix, so FileCheck never verified this instruction; the prefix
; is restored so the d-register vacge encoding is actually checked.
232 ; CHECK: vacge_2xfloat
233 define <2 x i32> @vacge_2xfloat(<2 x float>* %A, <2 x float>* %B) nounwind {
234 %tmp1 = load <2 x float>* %A
235 %tmp2 = load <2 x float>* %B
236 ; CHECK: vacge.f32 d16, d16, d17 @ encoding: [0xb1,0x0e,0x40,0xf3]
237 %tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2)
; The vacgeq intrinsic should select vacge.f32 (q-register form).
241 ; CHECK: vacge_4xfloat
242 define <4 x i32> @vacge_4xfloat(<4 x float>* %A, <4 x float>* %B) nounwind {
243 %tmp1 = load <4 x float>* %A
244 %tmp2 = load <4 x float>* %B
245 ; CHECK: vacge.f32 q8, q8, q9 @ encoding: [0xf2,0x0e,0x40,0xf3]
246 %tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2)
; Signed greater-than compares, d-register forms (icmp sgt + sext -> vcgt.sN).
251 define <8 x i8> @vcgts_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
252 %tmp1 = load <8 x i8>* %A
253 %tmp2 = load <8 x i8>* %B
254 ; CHECK: vcgt.s8 d16, d16, d17 @ encoding: [0xa1,0x03,0x40,0xf2]
255 %tmp3 = icmp sgt <8 x i8> %tmp1, %tmp2
256 %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
261 define <4 x i16> @vcgts_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
262 %tmp1 = load <4 x i16>* %A
263 %tmp2 = load <4 x i16>* %B
264 ; CHECK: vcgt.s16 d16, d16, d17 @ encoding: [0xa1,0x03,0x50,0xf2]
265 %tmp3 = icmp sgt <4 x i16> %tmp1, %tmp2
266 %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
271 define <2 x i32> @vcgts_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
272 %tmp1 = load <2 x i32>* %A
273 %tmp2 = load <2 x i32>* %B
274 ; CHECK: vcgt.s32 d16, d16, d17 @ encoding: [0xa1,0x03,0x60,0xf2]
275 %tmp3 = icmp sgt <2 x i32> %tmp1, %tmp2
276 %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
; Unsigned greater-than compares, d-register forms (icmp ugt + sext -> vcgt.uN).
281 define <8 x i8> @vcgtu_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
282 %tmp1 = load <8 x i8>* %A
283 %tmp2 = load <8 x i8>* %B
284 ; CHECK: vcgt.u8 d16, d16, d17 @ encoding: [0xa1,0x03,0x40,0xf3]
285 %tmp3 = icmp ugt <8 x i8> %tmp1, %tmp2
286 %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
291 define <4 x i16> @vcgtu_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
292 %tmp1 = load <4 x i16>* %A
293 %tmp2 = load <4 x i16>* %B
294 ; CHECK: vcgt.u16 d16, d16, d17 @ encoding: [0xa1,0x03,0x50,0xf3]
295 %tmp3 = icmp ugt <4 x i16> %tmp1, %tmp2
296 %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
301 define <2 x i32> @vcgtu_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
302 %tmp1 = load <2 x i32>* %A
303 %tmp2 = load <2 x i32>* %B
304 ; CHECK: vcgt.u32 d16, d16, d17 @ encoding: [0xa1,0x03,0x60,0xf3]
305 %tmp3 = icmp ugt <2 x i32> %tmp1, %tmp2
306 %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
; fcmp ogt + sext should select vcgt.f32 (d-register form).
310 ; CHECK: vcgt_2xfloat
311 define <2 x i32> @vcgt_2xfloat(<2 x float>* %A, <2 x float>* %B) nounwind {
312 %tmp1 = load <2 x float>* %A
313 %tmp2 = load <2 x float>* %B
314 ; CHECK: vcgt.f32 d16, d16, d17 @ encoding: [0xa1,0x0e,0x60,0xf3]
315 %tmp3 = fcmp ogt <2 x float> %tmp1, %tmp2
316 %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
; Signed greater-than compares, q-register forms (icmp sgt + sext -> vcgt.sN).
321 define <16 x i8> @vcgts_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
322 %tmp1 = load <16 x i8>* %A
323 %tmp2 = load <16 x i8>* %B
324 ; CHECK: vcgt.s8 q8, q8, q9 @ encoding: [0xe2,0x03,0x40,0xf2]
325 %tmp3 = icmp sgt <16 x i8> %tmp1, %tmp2
326 %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
331 define <8 x i16> @vcgts_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
332 %tmp1 = load <8 x i16>* %A
333 %tmp2 = load <8 x i16>* %B
334 ; CHECK: vcgt.s16 q8, q8, q9 @ encoding: [0xe2,0x03,0x50,0xf2]
335 %tmp3 = icmp sgt <8 x i16> %tmp1, %tmp2
336 %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
341 define <4 x i32> @vcgts_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
342 %tmp1 = load <4 x i32>* %A
343 %tmp2 = load <4 x i32>* %B
344 ; CHECK: vcgt.s32 q8, q8, q9 @ encoding: [0xe2,0x03,0x60,0xf2]
345 %tmp3 = icmp sgt <4 x i32> %tmp1, %tmp2
346 %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
; Unsigned greater-than compares, q-register forms (icmp ugt + sext -> vcgt.uN).
351 define <16 x i8> @vcgtu_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
352 %tmp1 = load <16 x i8>* %A
353 %tmp2 = load <16 x i8>* %B
354 ; CHECK: vcgt.u8 q8, q8, q9 @ encoding: [0xe2,0x03,0x40,0xf3]
355 %tmp3 = icmp ugt <16 x i8> %tmp1, %tmp2
356 %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
361 define <8 x i16> @vcgtu_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
362 %tmp1 = load <8 x i16>* %A
363 %tmp2 = load <8 x i16>* %B
364 ; CHECK: vcgt.u16 q8, q8, q9 @ encoding: [0xe2,0x03,0x50,0xf3]
365 %tmp3 = icmp ugt <8 x i16> %tmp1, %tmp2
366 %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
371 define <4 x i32> @vcgtu_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
372 %tmp1 = load <4 x i32>* %A
373 %tmp2 = load <4 x i32>* %B
374 ; CHECK: vcgt.u32 q8, q8, q9 @ encoding: [0xe2,0x03,0x60,0xf3]
375 %tmp3 = icmp ugt <4 x i32> %tmp1, %tmp2
376 %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
; fcmp ogt + sext should select vcgt.f32 (q-register form).
380 ; CHECK: vcgt_4xfloat
381 define <4 x i32> @vcgt_4xfloat(<4 x float>* %A, <4 x float>* %B) nounwind {
382 %tmp1 = load <4 x float>* %A
383 %tmp2 = load <4 x float>* %B
384 ; CHECK: vcgt.f32 q8, q8, q9 @ encoding: [0xe2,0x0e,0x60,0xf3]
385 %tmp3 = fcmp ogt <4 x float> %tmp1, %tmp2
386 %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
; Intrinsics lowered to vacgt.f32 in the tests below (d- and q-register forms).
390 declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
391 declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone
; The vacgtd/vacgtq intrinsics should select vacgt.f32 (d- and q-register
; forms respectively).
393 ; CHECK: vacgt_2xfloat
394 define <2 x i32> @vacgt_2xfloat(<2 x float>* %A, <2 x float>* %B) nounwind {
395 %tmp1 = load <2 x float>* %A
396 %tmp2 = load <2 x float>* %B
397 ; CHECK: vacgt.f32 d16, d16, d17 @ encoding: [0xb1,0x0e,0x60,0xf3]
398 %tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2)
402 ; CHECK: vacgt_4xfloat
403 define <4 x i32> @vacgt_4xfloat(<4 x float>* %A, <4 x float>* %B) nounwind {
404 %tmp1 = load <4 x float>* %A
405 %tmp2 = load <4 x float>* %B
406 ; CHECK: vacgt.f32 q8, q8, q9 @ encoding: [0xf2,0x0e,0x60,0xf3]
407 %tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2)