1 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; Named struct types used as the return types of the vld2lane intrinsics in
; this file: each one holds two vectors of the same vector type.
3 %struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
4 %struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
5 %struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
6 %struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
8 %struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
9 %struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
10 %struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
; vld2lanei8: loads a <8 x i8> from %B, passes it as both vector operands of
; llvm.arm.neon.vld2lane.v8i8 together with the pointer %A, then adds the two
; vectors extracted from the returned struct.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
12 define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
15 %tmp1 = load <8 x i8>* %B
16 %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
17 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
18 %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
19 %tmp5 = add <8 x i8> %tmp3, %tmp4
; vld2lanei16: bitcasts %A to i8*, loads a <4 x i16> from %B, passes it as both
; vector operands of llvm.arm.neon.vld2lane.v4i16, then adds the two vectors
; extracted from the returned struct.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
23 define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
26 %tmp0 = bitcast i16* %A to i8*
27 %tmp1 = load <4 x i16>* %B
28 %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
29 %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
30 %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
31 %tmp5 = add <4 x i16> %tmp3, %tmp4
; vld2lanei32: bitcasts %A to i8*, loads a <2 x i32> from %B, passes it as both
; vector operands of llvm.arm.neon.vld2lane.v2i32, then adds the two vectors
; extracted from the returned struct.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
35 define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
38 %tmp0 = bitcast i32* %A to i8*
39 %tmp1 = load <2 x i32>* %B
40 %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
41 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
42 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
43 %tmp5 = add <2 x i32> %tmp3, %tmp4
; vld2lanef: bitcasts %A to i8*, loads a <2 x float> from %B, passes it as both
; vector operands of llvm.arm.neon.vld2lane.v2f32, then combines the two
; vectors extracted from the returned struct with fadd.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
47 define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
50 %tmp0 = bitcast float* %A to i8*
51 %tmp1 = load <2 x float>* %B
52 %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
53 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
54 %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
55 %tmp5 = fadd <2 x float> %tmp3, %tmp4
; vld2laneQi16: bitcasts %A to i8*, loads a <8 x i16> from %B, passes it as
; both vector operands of llvm.arm.neon.vld2lane.v8i16, then adds the two
; vectors extracted from the returned struct.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
59 define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
62 %tmp0 = bitcast i16* %A to i8*
63 %tmp1 = load <8 x i16>* %B
64 %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1)
65 %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
66 %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
67 %tmp5 = add <8 x i16> %tmp3, %tmp4
; vld2laneQi32: bitcasts %A to i8*, loads a <4 x i32> from %B, passes it as
; both vector operands of llvm.arm.neon.vld2lane.v4i32, then adds the two
; vectors extracted from the returned struct.
; Unlike the other vld2lane calls in this file, the first trailing i32 operand
; here is 2 rather than 1.
; NOTE(review): that operand is presumably the lane index (and the last one
; the alignment) -- confirm against the intrinsic definition.
71 define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
74 %tmp0 = bitcast i32* %A to i8*
75 %tmp1 = load <4 x i32>* %B
76 %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
77 %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
78 %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
79 %tmp5 = add <4 x i32> %tmp3, %tmp4
; vld2laneQf: bitcasts %A to i8*, loads a <4 x float> from %B, passes it as
; both vector operands of llvm.arm.neon.vld2lane.v4f32, then combines the two
; vectors extracted from the returned struct with fadd.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
83 define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
86 %tmp0 = bitcast float* %A to i8*
87 %tmp1 = load <4 x float>* %B
88 %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
89 %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
90 %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
91 %tmp5 = fadd <4 x float> %tmp3, %tmp4
; Declarations of the vld2lane intrinsics called by the tests above, one per
; vector type. Each takes an i8* pointer, two vectors of that type and two i32
; operands, returns the matching two-vector struct, and is marked
; nounwind readonly.
95 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
96 declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
97 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
98 declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly
100 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
101 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
102 declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly
; Named struct types used as the return types of the vld3lane intrinsics in
; this file: each one holds three vectors of the same vector type.
104 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
105 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
106 %struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
107 %struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
109 %struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
110 %struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
111 %struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
; vld3lanei8: loads a <8 x i8> from %B, passes it as all three vector operands
; of llvm.arm.neon.vld3lane.v8i8 together with the pointer %A, then adds the
; three vectors extracted from the returned struct.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
113 define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
116 %tmp1 = load <8 x i8>* %B
117 %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
118 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
119 %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
120 %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
121 %tmp6 = add <8 x i8> %tmp3, %tmp4
122 %tmp7 = add <8 x i8> %tmp5, %tmp6
; vld3lanei16: bitcasts %A to i8*, loads a <4 x i16> from %B, passes it as all
; three vector operands of llvm.arm.neon.vld3lane.v4i16, then adds the three
; vectors extracted from the returned struct.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
126 define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
129 %tmp0 = bitcast i16* %A to i8*
130 %tmp1 = load <4 x i16>* %B
131 %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
132 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
133 %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
134 %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
135 %tmp6 = add <4 x i16> %tmp3, %tmp4
136 %tmp7 = add <4 x i16> %tmp5, %tmp6
; vld3lanei32: bitcasts %A to i8*, loads a <2 x i32> from %B, passes it as all
; three vector operands of llvm.arm.neon.vld3lane.v2i32, then adds the three
; vectors extracted from the returned struct.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
140 define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
143 %tmp0 = bitcast i32* %A to i8*
144 %tmp1 = load <2 x i32>* %B
145 %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
146 %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
147 %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
148 %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
149 %tmp6 = add <2 x i32> %tmp3, %tmp4
150 %tmp7 = add <2 x i32> %tmp5, %tmp6
; vld3lanef: bitcasts %A to i8*, loads a <2 x float> from %B, passes it as all
; three vector operands of llvm.arm.neon.vld3lane.v2f32, then returns the fadd
; sum of the three vectors extracted from the returned struct.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
154 define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
157 %tmp0 = bitcast float* %A to i8*
158 %tmp1 = load <2 x float>* %B
159 %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
160 %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
161 %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
162 %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
163 %tmp6 = fadd <2 x float> %tmp3, %tmp4
164 %tmp7 = fadd <2 x float> %tmp5, %tmp6
165 ret <2 x float> %tmp7
; vld3laneQi16: bitcasts %A to i8*, loads a <8 x i16> from %B, passes it as all
; three vector operands of llvm.arm.neon.vld3lane.v8i16, then adds the three
; vectors extracted from the returned struct.
; The ";CHECK:" line below is a FileCheck directive that matches this
; function's label in the llc output.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
168 define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
169 ;CHECK: vld3laneQi16:
171 %tmp0 = bitcast i16* %A to i8*
172 %tmp1 = load <8 x i16>* %B
173 %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1)
174 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
175 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
176 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
177 %tmp6 = add <8 x i16> %tmp3, %tmp4
178 %tmp7 = add <8 x i16> %tmp5, %tmp6
; vld3laneQi32: bitcasts %A to i8*, loads a <4 x i32> from %B, passes it as all
; three vector operands of llvm.arm.neon.vld3lane.v4i32, then adds the three
; vectors extracted from the returned struct.
; Unlike the other vld3lane calls in this file, the first trailing i32 operand
; here is 3 rather than 1.
; NOTE(review): that operand is presumably the lane index (and the last one
; the alignment) -- confirm against the intrinsic definition.
; The ";CHECK:" line below is a FileCheck directive that matches this
; function's label in the llc output.
182 define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
183 ;CHECK: vld3laneQi32:
185 %tmp0 = bitcast i32* %A to i8*
186 %tmp1 = load <4 x i32>* %B
187 %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
188 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
189 %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
190 %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
191 %tmp6 = add <4 x i32> %tmp3, %tmp4
192 %tmp7 = add <4 x i32> %tmp5, %tmp6
; vld3laneQf: bitcasts %A to i8*, loads a <4 x float> from %B, passes it as all
; three vector operands of llvm.arm.neon.vld3lane.v4f32, then returns the fadd
; sum of the three vectors extracted from the returned struct.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
196 define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
199 %tmp0 = bitcast float* %A to i8*
200 %tmp1 = load <4 x float>* %B
201 %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
202 %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
203 %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
204 %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
205 %tmp6 = fadd <4 x float> %tmp3, %tmp4
206 %tmp7 = fadd <4 x float> %tmp5, %tmp6
207 ret <4 x float> %tmp7
; Declarations of the vld3lane intrinsics called by the tests above, one per
; vector type. Each takes an i8* pointer, three vectors of that type and two
; i32 operands, returns the matching three-vector struct, and is marked
; nounwind readonly.
210 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
211 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
212 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
213 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
215 declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
216 declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
217 declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
; Named struct types used as the return types of the vld4lane intrinsics in
; this file: each one holds four vectors of the same vector type.
219 %struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
220 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
221 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
222 %struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
224 %struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
225 %struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
226 %struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
; vld4lanei8: loads a <8 x i8> from %B, passes it as all four vector operands
; of llvm.arm.neon.vld4lane.v8i8 together with the pointer %A, then adds the
; four vectors extracted from the returned struct (pairwise, then the two
; partial sums).
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
228 define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
231 %tmp1 = load <8 x i8>* %B
232 %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
233 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
234 %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
235 %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
236 %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
237 %tmp7 = add <8 x i8> %tmp3, %tmp4
238 %tmp8 = add <8 x i8> %tmp5, %tmp6
239 %tmp9 = add <8 x i8> %tmp7, %tmp8
; vld4lanei16: bitcasts %A to i8*, loads a <4 x i16> from %B, passes it as all
; four vector operands of llvm.arm.neon.vld4lane.v4i16, then adds the four
; vectors extracted from the returned struct (pairwise, then the two partial
; sums).
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
243 define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
246 %tmp0 = bitcast i16* %A to i8*
247 %tmp1 = load <4 x i16>* %B
248 %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
249 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
250 %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
251 %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
252 %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
253 %tmp7 = add <4 x i16> %tmp3, %tmp4
254 %tmp8 = add <4 x i16> %tmp5, %tmp6
255 %tmp9 = add <4 x i16> %tmp7, %tmp8
; vld4lanei32: bitcasts %A to i8*, loads a <2 x i32> from %B, passes it as all
; four vector operands of llvm.arm.neon.vld4lane.v2i32, then adds the four
; vectors extracted from the returned struct (pairwise, then the two partial
; sums).
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
259 define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
262 %tmp0 = bitcast i32* %A to i8*
263 %tmp1 = load <2 x i32>* %B
264 %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
265 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
266 %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
267 %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
268 %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
269 %tmp7 = add <2 x i32> %tmp3, %tmp4
270 %tmp8 = add <2 x i32> %tmp5, %tmp6
271 %tmp9 = add <2 x i32> %tmp7, %tmp8
; vld4lanef: bitcasts %A to i8*, loads a <2 x float> from %B, passes it as all
; four vector operands of llvm.arm.neon.vld4lane.v2f32, then returns the fadd
; sum of the four vectors extracted from the returned struct (pairwise, then
; the two partial sums).
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
275 define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
278 %tmp0 = bitcast float* %A to i8*
279 %tmp1 = load <2 x float>* %B
280 %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
281 %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
282 %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
283 %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
284 %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
285 %tmp7 = fadd <2 x float> %tmp3, %tmp4
286 %tmp8 = fadd <2 x float> %tmp5, %tmp6
287 %tmp9 = fadd <2 x float> %tmp7, %tmp8
288 ret <2 x float> %tmp9
; vld4laneQi16: bitcasts %A to i8*, loads a <8 x i16> from %B, passes it as all
; four vector operands of llvm.arm.neon.vld4lane.v8i16, then adds the four
; vectors extracted from the returned struct (pairwise, then the two partial
; sums).
; The ";CHECK:" line below is a FileCheck directive that matches this
; function's label in the llc output.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
291 define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
292 ;CHECK: vld4laneQi16:
294 %tmp0 = bitcast i16* %A to i8*
295 %tmp1 = load <8 x i16>* %B
296 %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1)
297 %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
298 %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
299 %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
300 %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
301 %tmp7 = add <8 x i16> %tmp3, %tmp4
302 %tmp8 = add <8 x i16> %tmp5, %tmp6
303 %tmp9 = add <8 x i16> %tmp7, %tmp8
; vld4laneQi32: bitcasts %A to i8*, loads a <4 x i32> from %B, passes it as all
; four vector operands of llvm.arm.neon.vld4lane.v4i32, then adds the four
; vectors extracted from the returned struct (pairwise, then the two partial
; sums).
; The ";CHECK:" line below is a FileCheck directive that matches this
; function's label in the llc output.
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
307 define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
308 ;CHECK: vld4laneQi32:
310 %tmp0 = bitcast i32* %A to i8*
311 %tmp1 = load <4 x i32>* %B
312 %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1, i32 1)
313 %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
314 %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
315 %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
316 %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
317 %tmp7 = add <4 x i32> %tmp3, %tmp4
318 %tmp8 = add <4 x i32> %tmp5, %tmp6
319 %tmp9 = add <4 x i32> %tmp7, %tmp8
; vld4laneQf: bitcasts %A to i8*, loads a <4 x float> from %B, passes it as all
; four vector operands of llvm.arm.neon.vld4lane.v4f32, then returns the fadd
; sum of the four vectors extracted from the returned struct (pairwise, then
; the two partial sums).
; NOTE(review): the trailing "i32 1, i32 1" operands are presumably the lane
; index and the alignment -- confirm against the intrinsic definition.
323 define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
326 %tmp0 = bitcast float* %A to i8*
327 %tmp1 = load <4 x float>* %B
328 %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
329 %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
330 %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
331 %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
332 %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
333 %tmp7 = fadd <4 x float> %tmp3, %tmp4
334 %tmp8 = fadd <4 x float> %tmp5, %tmp6
335 %tmp9 = fadd <4 x float> %tmp7, %tmp8
336 ret <4 x float> %tmp9
; Declarations of the vld4lane intrinsics called by the tests above, one per
; vector type. Each takes an i8* pointer, four vectors of that type and two
; i32 operands, returns the matching four-vector struct, and is marked
; nounwind readonly.
339 declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
340 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
341 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
342 declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
344 declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
345 declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
346 declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly