; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
; Signed absolute-difference-long: sabd intrinsic result zero-extended to the
; wider element type (should select sabdl / sabdl2 for the high-half variants).
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
; High-half (sabdl2) variants: operands are the upper halves of 128-bit vectors,
; extracted with shufflevector before the sabd call.
define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
%load1 = load <16 x i8>* %A
%load2 = load <16 x i8>* %B
%tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
%load1 = load <8 x i16>* %A
%load2 = load <8 x i16>* %B
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
%load1 = load <4 x i32>* %A
%load2 = load <4 x i32>* %B
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
; Unsigned absolute-difference-long: uabd intrinsic result zero-extended to the
; wider element type (should select uabdl / uabdl2 for the high-half variants).
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
; High-half (uabdl2) variants: operands are the upper halves of 128-bit vectors.
define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
%load1 = load <16 x i8>* %A
%load2 = load <16 x i8>* %B
%tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
%load1 = load <8 x i16>* %A
%load2 = load <8 x i16>* %B
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
%load1 = load <4 x i32>* %A
%load2 = load <4 x i32>* %B
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
; Vector floating-point absolute difference (fabd) intrinsic tests.
define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
%tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
; Same-width signed absolute difference (sabd) intrinsic tests plus declares.
define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
; Same-width unsigned absolute difference (uabd) intrinsic tests plus declares.
define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
; Saturating absolute value (sqabs) intrinsic tests plus declares.
define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
%tmp1 = load <8 x i8>* %A
%tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
%tmp1 = load <16 x i8>* %A
%tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
%tmp1 = load <4 x i16>* %A
%tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
%tmp1 = load <8 x i16>* %A
%tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
%tmp1 = load <2 x i32>* %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
%tmp1 = load <4 x i32>* %A
%tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
; Saturating negate (sqneg) intrinsic tests plus declares.
define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
%tmp1 = load <8 x i8>* %A
%tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
%tmp1 = load <16 x i8>* %A
%tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
%tmp1 = load <4 x i16>* %A
%tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
%tmp1 = load <8 x i16>* %A
%tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
%tmp1 = load <2 x i32>* %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
%tmp1 = load <4 x i32>* %A
%tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
; Integer absolute value (abs) intrinsic tests, including the v1i64 and plain
; i64 scalar forms, plus declares.
define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
%tmp1 = load <8 x i8>* %A
%tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
%tmp1 = load <16 x i8>* %A
%tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
%tmp1 = load <4 x i16>* %A
%tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
%tmp1 = load <8 x i16>* %A
%tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
%tmp1 = load <2 x i32>* %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
%tmp1 = load <4 x i32>* %A
%tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
%abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
%abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
; Signed absolute-difference-and-accumulate-long: sabd result zero-extended and
; added to a wider accumulator (should select sabal / sabal2).
define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
%tmp5 = add <8 x i16> %tmp3, %tmp4.1
define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal4s:
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
%tmp5 = add <4 x i32> %tmp3, %tmp4.1
define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2d:
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
; NOTE(review): %tmp4.1.1 below is never used — it looks like a leftover
; duplicate of %tmp4.1; kept as-is because CHECK lines may depend on the
; resulting codegen. TODO: confirm and delete upstream.
%tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64>
%tmp5 = add <2 x i64> %tmp3, %tmp4.1
define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal2_8h:
%load1 = load <16 x i8>* %A
%load2 = load <16 x i8>* %B
%tmp3 = load <8 x i16>* %C
%tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
%tmp5 = add <8 x i16> %tmp3, %tmp4.1
define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal2_4s:
%load1 = load <8 x i16>* %A
%load2 = load <8 x i16>* %B
%tmp3 = load <4 x i32>* %C
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
%tmp5 = add <4 x i32> %tmp3, %tmp4.1
define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2_2d:
%load1 = load <4 x i32>* %A
%load2 = load <4 x i32>* %B
%tmp3 = load <2 x i64>* %C
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
%tmp5 = add <2 x i64> %tmp3, %tmp4.1
; Unsigned absolute-difference-and-accumulate-long: uabd result zero-extended
; and added to a wider accumulator (should select uabal / uabal2).
define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal8h:
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
%tmp5 = add <8 x i16> %tmp3, %tmp4.1
define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal4s:
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
%tmp5 = add <4 x i32> %tmp3, %tmp4.1
define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2d:
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
%tmp5 = add <2 x i64> %tmp3, %tmp4.1
define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal2_8h:
%load1 = load <16 x i8>* %A
%load2 = load <16 x i8>* %B
%tmp3 = load <8 x i16>* %C
%tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
%tmp5 = add <8 x i16> %tmp3, %tmp4.1
define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal2_4s:
%load1 = load <8 x i16>* %A
%load2 = load <8 x i16>* %B
%tmp3 = load <4 x i32>* %C
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
%tmp5 = add <4 x i32> %tmp3, %tmp4.1
define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2_2d:
%load1 = load <4 x i32>* %A
%load2 = load <4 x i32>* %B
%tmp3 = load <2 x i64>* %C
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
%tmp5 = add <2 x i64> %tmp3, %tmp4.1
; Signed absolute-difference-and-accumulate (same width): sabd result added to
; an accumulator of the same element type (should select saba).
define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: saba_8b:
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = load <8 x i8>* %C
%tmp5 = add <8 x i8> %tmp3, %tmp4
define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: saba_16b:
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
%tmp4 = load <16 x i8>* %C
%tmp5 = add <16 x i8> %tmp3, %tmp4
define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = load <4 x i16>* %C
%tmp5 = add <4 x i16> %tmp3, %tmp4
define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
%tmp4 = load <8 x i16>* %C
%tmp5 = add <8 x i16> %tmp3, %tmp4
define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = load <2 x i32>* %C
%tmp5 = add <2 x i32> %tmp3, %tmp4
define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
%tmp4 = load <4 x i32>* %C
%tmp5 = add <4 x i32> %tmp3, %tmp4
; Unsigned absolute-difference-and-accumulate (same width): uabd result added
; to an accumulator of the same element type (should select uaba).
define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_8b:
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = load <8 x i8>* %C
%tmp5 = add <8 x i8> %tmp3, %tmp4
define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_16b:
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
%tmp4 = load <16 x i8>* %C
%tmp5 = add <16 x i8> %tmp3, %tmp4
define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_4h:
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = load <4 x i16>* %C
%tmp5 = add <4 x i16> %tmp3, %tmp4
define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_8h:
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
%tmp4 = load <8 x i16>* %C
%tmp5 = add <8 x i16> %tmp3, %tmp4
define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_2s:
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = load <2 x i32>* %C
%tmp5 = add <2 x i32> %tmp3, %tmp4
define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_4s:
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
%tmp4 = load <4 x i32>* %C
%tmp5 = add <4 x i32> %tmp3, %tmp4
; Scalar SISD floating-point absolute difference (fabd s/d register forms).
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
%vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
%vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
; abdl with one operand built as a dup (splat via two insertelements) and the
; other taken from the high half of a 128-bit vector.
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
%res1 = zext <2 x i32> %res to <2 x i64>
define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
%res1 = zext <2 x i32> %res to <2 x i64>