1 ; RUN: llc -show-mc-encoding -march=arm -mcpu=cortex-a8 -mattr=+neon < %s | FileCheck %s
; CHECK: vadd_8xi8
define <8 x i8> @vadd_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
; CHECK: vadd.i8 d16, d17, d16 @ encoding: [0xa0,0x08,0x41,0xf2]
	%tmp3 = add <8 x i8> %tmp1, %tmp2
	ret <8 x i8> %tmp3
}

; CHECK: vadd_4xi16
define <4 x i16> @vadd_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
; CHECK: vadd.i16 d16, d17, d16 @ encoding: [0xa0,0x08,0x51,0xf2]
	%tmp3 = add <4 x i16> %tmp1, %tmp2
	ret <4 x i16> %tmp3
}

; CHECK: vadd_1xi64
define <1 x i64> @vadd_1xi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
	%tmp1 = load <1 x i64>* %A
	%tmp2 = load <1 x i64>* %B
; CHECK: vadd.i64 d16, d17, d16 @ encoding: [0xa0,0x08,0x71,0xf2]
	%tmp3 = add <1 x i64> %tmp1, %tmp2
	ret <1 x i64> %tmp3
}

; CHECK: vadd_2xi32
define <2 x i32> @vadd_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
; CHECK: vadd.i32 d16, d17, d16 @ encoding: [0xa0,0x08,0x61,0xf2]
	%tmp3 = add <2 x i32> %tmp1, %tmp2
	ret <2 x i32> %tmp3
}

; CHECK: vadd_2xfloat
define <2 x float> @vadd_2xfloat(<2 x float>* %A, <2 x float>* %B) nounwind {
	%tmp1 = load <2 x float>* %A
	%tmp2 = load <2 x float>* %B
; CHECK: vadd.f32 d16, d16, d17 @ encoding: [0xa1,0x0d,0x40,0xf2]
	%tmp3 = fadd <2 x float> %tmp1, %tmp2
	ret <2 x float> %tmp3
}

; CHECK: vadd_4xfloat
define <4 x float> @vadd_4xfloat(<4 x float>* %A, <4 x float>* %B) nounwind {
	%tmp1 = load <4 x float>* %A
	%tmp2 = load <4 x float>* %B
; CHECK: vadd.f32 q8, q8, q9 @ encoding: [0xe2,0x0d,0x40,0xf2]
	%tmp3 = fadd <4 x float> %tmp1, %tmp2
	ret <4 x float> %tmp3
}
; CHECK: vaddls_8xi8
define <8 x i16> @vaddls_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
	%tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
	%tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
; CHECK: vaddl.s8 q8, d17, d16 @ encoding: [0xa0,0x00,0xc1,0xf2]
	%tmp5 = add <8 x i16> %tmp3, %tmp4
	ret <8 x i16> %tmp5
}

; CHECK: vaddls_4xi16
define <4 x i32> @vaddls_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
	%tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
	%tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
; CHECK: vaddl.s16 q8, d17, d16 @ encoding: [0xa0,0x00,0xd1,0xf2]
	%tmp5 = add <4 x i32> %tmp3, %tmp4
	ret <4 x i32> %tmp5
}

; CHECK: vaddls_2xi32
define <2 x i64> @vaddls_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
	%tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
	%tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
; CHECK: vaddl.s32 q8, d17, d16 @ encoding: [0xa0,0x00,0xe1,0xf2]
	%tmp5 = add <2 x i64> %tmp3, %tmp4
	ret <2 x i64> %tmp5
}
; CHECK: vaddlu_8xi8
define <8 x i16> @vaddlu_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
	%tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
	%tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
; CHECK: vaddl.u8 q8, d17, d16 @ encoding: [0xa0,0x00,0xc1,0xf3]
	%tmp5 = add <8 x i16> %tmp3, %tmp4
	ret <8 x i16> %tmp5
}

; CHECK: vaddlu_4xi16
define <4 x i32> @vaddlu_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
	%tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
	%tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
; CHECK: vaddl.u16 q8, d17, d16 @ encoding: [0xa0,0x00,0xd1,0xf3]
	%tmp5 = add <4 x i32> %tmp3, %tmp4
	ret <4 x i32> %tmp5
}

; CHECK: vaddlu_2xi32
define <2 x i64> @vaddlu_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
	%tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
	%tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
; CHECK: vaddl.u32 q8, d17, d16 @ encoding: [0xa0,0x00,0xe1,0xf3]
	%tmp5 = add <2 x i64> %tmp3, %tmp4
	ret <2 x i64> %tmp5
}
; CHECK: vaddws_8xi8
define <8 x i16> @vaddws_8xi8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i8>* %B
	%tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
; CHECK: vaddw.s8 q8, q8, d18 @ encoding: [0xa2,0x01,0xc0,0xf2]
	%tmp4 = add <8 x i16> %tmp1, %tmp3
	ret <8 x i16> %tmp4
}

; CHECK: vaddws_4xi16
define <4 x i32> @vaddws_4xi16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i16>* %B
	%tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
; CHECK: vaddw.s16 q8, q8, d18 @ encoding: [0xa2,0x01,0xd0,0xf2]
	%tmp4 = add <4 x i32> %tmp1, %tmp3
	ret <4 x i32> %tmp4
}

; CHECK: vaddws_2xi32
define <2 x i64> @vaddws_2xi32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i32>* %B
	%tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
; CHECK: vaddw.s32 q8, q8, d18 @ encoding: [0xa2,0x01,0xe0,0xf2]
	%tmp4 = add <2 x i64> %tmp1, %tmp3
	ret <2 x i64> %tmp4
}
; CHECK: vaddwu_8xi8
define <8 x i16> @vaddwu_8xi8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i8>* %B
	%tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
; CHECK: vaddw.u8 q8, q8, d18 @ encoding: [0xa2,0x01,0xc0,0xf3]
	%tmp4 = add <8 x i16> %tmp1, %tmp3
	ret <8 x i16> %tmp4
}

; CHECK: vaddwu_4xi16
define <4 x i32> @vaddwu_4xi16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i16>* %B
	%tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
; CHECK: vaddw.u16 q8, q8, d18 @ encoding: [0xa2,0x01,0xd0,0xf3]
	%tmp4 = add <4 x i32> %tmp1, %tmp3
	ret <4 x i32> %tmp4
}

; CHECK: vaddwu_2xi32
define <2 x i64> @vaddwu_2xi32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i32>* %B
	%tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
; CHECK: vaddw.u32 q8, q8, d18 @ encoding: [0xa2,0x01,0xe0,0xf3]
	%tmp4 = add <2 x i64> %tmp1, %tmp3
	ret <2 x i64> %tmp4
}
declare <8 x i8>  @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

; CHECK: vhadds_8xi8
define <8 x i8> @vhadds_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
; CHECK: vhadd.s8 d16, d16, d17 @ encoding: [0xa1,0x00,0x40,0xf2]
	%tmp3 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}

; CHECK: vhadds_4xi16
define <4 x i16> @vhadds_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
; CHECK: vhadd.s16 d16, d16, d17 @ encoding: [0xa1,0x00,0x50,0xf2]
	%tmp3 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}

; CHECK: vhadds_2xi32
define <2 x i32> @vhadds_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
; CHECK: vhadd.s32 d16, d16, d17 @ encoding: [0xa1,0x00,0x60,0xf2]
	%tmp3 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}
declare <8 x i8>  @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

; CHECK: vhaddu_8xi8
define <8 x i8> @vhaddu_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
; CHECK: vhadd.u8 d16, d16, d17 @ encoding: [0xa1,0x00,0x40,0xf3]
	%tmp3 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}

; CHECK: vhaddu_4xi16
define <4 x i16> @vhaddu_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
; CHECK: vhadd.u16 d16, d16, d17 @ encoding: [0xa1,0x00,0x50,0xf3]
	%tmp3 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}

; CHECK: vhaddu_2xi32
define <2 x i32> @vhaddu_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
; CHECK: vhadd.u32 d16, d16, d17 @ encoding: [0xa1,0x00,0x60,0xf3]
	%tmp3 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}
declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; CHECK: vhadds_16xi8
define <16 x i8> @vhadds_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
; CHECK: vhadd.s8 q8, q8, q9 @ encoding: [0xe2,0x00,0x40,0xf2]
	%tmp3 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}

; CHECK: vhadds_8xi16
define <8 x i16> @vhadds_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
; CHECK: vhadd.s16 q8, q8, q9 @ encoding: [0xe2,0x00,0x50,0xf2]
	%tmp3 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}

; CHECK: vhadds_4xi32
define <4 x i32> @vhadds_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
; CHECK: vhadd.s32 q8, q8, q9 @ encoding: [0xe2,0x00,0x60,0xf2]
	%tmp3 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}
declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; CHECK: vhaddu_16xi8
define <16 x i8> @vhaddu_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
; CHECK: vhadd.u8 q8, q8, q9 @ encoding: [0xe2,0x00,0x40,0xf3]
	%tmp3 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}

; CHECK: vhaddu_8xi16
define <8 x i16> @vhaddu_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
; CHECK: vhadd.u16 q8, q8, q9 @ encoding: [0xe2,0x00,0x50,0xf3]
	%tmp3 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}

; CHECK: vhaddu_4xi32
define <4 x i32> @vhaddu_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
; CHECK: vhadd.u32 q8, q8, q9 @ encoding: [0xe2,0x00,0x60,0xf3]
	%tmp3 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}
declare <8 x i8>  @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

; CHECK: vrhadds_8xi8
define <8 x i8> @vrhadds_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
; CHECK: vrhadd.s8 d16, d16, d17 @ encoding: [0xa1,0x01,0x40,0xf2]
	%tmp3 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}

; CHECK: vrhadds_4xi16
define <4 x i16> @vrhadds_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
; CHECK: vrhadd.s16 d16, d16, d17 @ encoding: [0xa1,0x01,0x50,0xf2]
	%tmp3 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}

; CHECK: vrhadds_2xi32
define <2 x i32> @vrhadds_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
; CHECK: vrhadd.s32 d16, d16, d17 @ encoding: [0xa1,0x01,0x60,0xf2]
	%tmp3 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}
declare <8 x i8>  @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

; CHECK: vrhaddu_8xi8
define <8 x i8> @vrhaddu_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
; CHECK: vrhadd.u8 d16, d16, d17 @ encoding: [0xa1,0x01,0x40,0xf3]
	%tmp3 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}

; CHECK: vrhaddu_4xi16
define <4 x i16> @vrhaddu_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
; CHECK: vrhadd.u16 d16, d16, d17 @ encoding: [0xa1,0x01,0x50,0xf3]
	%tmp3 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}

; CHECK: vrhaddu_2xi32
define <2 x i32> @vrhaddu_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
; CHECK: vrhadd.u32 d16, d16, d17 @ encoding: [0xa1,0x01,0x60,0xf3]
	%tmp3 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}
declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; CHECK: vrhadds_16xi8
define <16 x i8> @vrhadds_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
; CHECK: vrhadd.s8 q8, q8, q9 @ encoding: [0xe2,0x01,0x40,0xf2]
	%tmp3 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}

; CHECK: vrhadds_8xi16
define <8 x i16> @vrhadds_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
; CHECK: vrhadd.s16 q8, q8, q9 @ encoding: [0xe2,0x01,0x50,0xf2]
	%tmp3 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}

; CHECK: vrhadds_4xi32
define <4 x i32> @vrhadds_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
; CHECK: vrhadd.s32 q8, q8, q9 @ encoding: [0xe2,0x01,0x60,0xf2]
	%tmp3 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}
declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; CHECK: vrhaddu_16xi8
define <16 x i8> @vrhaddu_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
; CHECK: vrhadd.u8 q8, q8, q9 @ encoding: [0xe2,0x01,0x40,0xf3]
	%tmp3 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}

; CHECK: vrhaddu_8xi16
define <8 x i16> @vrhaddu_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
; CHECK: vrhadd.u16 q8, q8, q9 @ encoding: [0xe2,0x01,0x50,0xf3]
	%tmp3 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}

; CHECK: vrhaddu_4xi32
define <4 x i32> @vrhaddu_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
; CHECK: vrhadd.u32 q8, q8, q9 @ encoding: [0xe2,0x01,0x60,0xf3]
	%tmp3 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}
declare <8 x i8>  @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) nounwind readnone

; CHECK: vqadds_8xi8
define <8 x i8> @vqadds_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
; CHECK: vqadd.s8 d16, d16, d17 @ encoding: [0xb1,0x00,0x40,0xf2]
	%tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}

; CHECK: vqadds_4xi16
define <4 x i16> @vqadds_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
; CHECK: vqadd.s16 d16, d16, d17 @ encoding: [0xb1,0x00,0x50,0xf2]
	%tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}

; CHECK: vqadds_2xi32
define <2 x i32> @vqadds_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
; CHECK: vqadd.s32 d16, d16, d17 @ encoding: [0xb1,0x00,0x60,0xf2]
	%tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}

; CHECK: vqadds_1xi64
define <1 x i64> @vqadds_1xi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
	%tmp1 = load <1 x i64>* %A
	%tmp2 = load <1 x i64>* %B
; CHECK: vqadd.s64 d16, d16, d17 @ encoding: [0xb1,0x00,0x70,0xf2]
	%tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
	ret <1 x i64> %tmp3
}
declare <8 x i8>  @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone

; CHECK: vqaddu_8xi8
define <8 x i8> @vqaddu_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
; CHECK: vqadd.u8 d16, d16, d17 @ encoding: [0xb1,0x00,0x40,0xf3]
	%tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}

; CHECK: vqaddu_4xi16
define <4 x i16> @vqaddu_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
; CHECK: vqadd.u16 d16, d16, d17 @ encoding: [0xb1,0x00,0x50,0xf3]
	%tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}

; CHECK: vqaddu_2xi32
define <2 x i32> @vqaddu_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
; CHECK: vqadd.u32 d16, d16, d17 @ encoding: [0xb1,0x00,0x60,0xf3]
	%tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}

; CHECK: vqaddu_1xi64
define <1 x i64> @vqaddu_1xi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
	%tmp1 = load <1 x i64>* %A
	%tmp2 = load <1 x i64>* %B
; CHECK: vqadd.u64 d16, d16, d17 @ encoding: [0xb1,0x00,0x70,0xf3]
	%tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
	ret <1 x i64> %tmp3
}
declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

; CHECK: vqadds_16xi8
define <16 x i8> @vqadds_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
; CHECK: vqadd.s8 q8, q8, q9 @ encoding: [0xf2,0x00,0x40,0xf2]
	%tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}

; CHECK: vqadds_8xi16
define <8 x i16> @vqadds_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
; CHECK: vqadd.s16 q8, q8, q9 @ encoding: [0xf2,0x00,0x50,0xf2]
	%tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}

; CHECK: vqadds_4xi32
define <4 x i32> @vqadds_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
; CHECK: vqadd.s32 q8, q8, q9 @ encoding: [0xf2,0x00,0x60,0xf2]
	%tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}

; CHECK: vqadds_2xi64
define <2 x i64> @vqadds_2xi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i64>* %B
; CHECK: vqadd.s64 q8, q8, q9 @ encoding: [0xf2,0x00,0x70,0xf2]
	%tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
	ret <2 x i64> %tmp3
}
declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

; CHECK: vqaddu_16xi8
define <16 x i8> @vqaddu_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
; CHECK: vqadd.u8 q8, q8, q9 @ encoding: [0xf2,0x00,0x40,0xf3]
	%tmp3 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}

; CHECK: vqaddu_8xi16
define <8 x i16> @vqaddu_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
; CHECK: vqadd.u16 q8, q8, q9 @ encoding: [0xf2,0x00,0x50,0xf3]
	%tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}

; CHECK: vqaddu_4xi32
define <4 x i32> @vqaddu_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
; CHECK: vqadd.u32 q8, q8, q9 @ encoding: [0xf2,0x00,0x60,0xf3]
	%tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}

; CHECK: vqaddu_2xi64
define <2 x i64> @vqaddu_2xi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i64>* %B
; CHECK: vqadd.u64 q8, q8, q9 @ encoding: [0xf2,0x00,0x70,0xf3]
	%tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
	ret <2 x i64> %tmp3
}
declare <8 x i8>  @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

; CHECK: vaddhn_8xi16
define <8 x i8> @vaddhn_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
; CHECK: vaddhn.i16 d16, q8, q9 @ encoding: [0xa2,0x04,0xc0,0xf2]
	%tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i8> %tmp3
}

; CHECK: vaddhn_4xi32
define <4 x i16> @vaddhn_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
; CHECK: vaddhn.i32 d16, q8, q9 @ encoding: [0xa2,0x04,0xd0,0xf2]
	%tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i16> %tmp3
}

; CHECK: vaddhn_2xi64
define <2 x i32> @vaddhn_2xi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i64>* %B
; CHECK: vaddhn.i64 d16, q8, q9 @ encoding: [0xa2,0x04,0xe0,0xf2]
	%tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
	ret <2 x i32> %tmp3
}
declare <8 x i8>  @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

; CHECK: vraddhn_8xi16
define <8 x i8> @vraddhn_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
; CHECK: vraddhn.i16 d16, q8, q9 @ encoding: [0xa2,0x04,0xc0,0xf3]
	%tmp3 = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i8> %tmp3
}

; CHECK: vraddhn_4xi32
define <4 x i16> @vraddhn_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
; CHECK: vraddhn.i32 d16, q8, q9 @ encoding: [0xa2,0x04,0xd0,0xf3]
	%tmp3 = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i16> %tmp3
}

; CHECK: vraddhn_2xi64
define <2 x i32> @vraddhn_2xi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i64>* %B
; CHECK: vraddhn.i64 d16, q8, q9 @ encoding: [0xa2,0x04,0xe0,0xf3]
	%tmp3 = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
	ret <2 x i32> %tmp3
}