1 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; vld1lanei8: loads one i8 from %A (marked align 8) and inserts it into lane 3
; of the <8 x i8> vector loaded from %B. The expected output is a single-lane
; vld1.8 into d16[3] with no alignment qualifier on [r0].
; NOTE(review): presumably the 8-bit lane form has no alignment field, which is
; why align 8 is dropped here - confirm against the VLD1 encoding.
3 define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
5 ;Check the (default) alignment value.
6 ;CHECK: vld1.8 {d16[3]}, [r0]
7 %tmp1 = load <8 x i8>* %B
8 %tmp2 = load i8* %A, align 8
9 %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
; vld1lanei16: loads one i16 from %A (marked align 8) and inserts it into lane 2
; of the <4 x i16> vector loaded from %B. The expected vld1.16 targets d16[2]
; and carries a :16 qualifier, i.e. the 8-byte IR alignment is reduced to the
; 16-bit maximum stated in the comment below.
13 define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
15 ;Check the alignment value. Max for this instruction is 16 bits:
16 ;CHECK: vld1.16 {d16[2]}, [r0, :16]
17 %tmp1 = load <4 x i16>* %B
18 %tmp2 = load i16* %A, align 8
19 %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
; vld1lanei32: loads one i32 from %A (marked align 8) and inserts it into lane 1
; of the <2 x i32> vector loaded from %B. The expected vld1.32 targets d16[1]
; and carries a :32 qualifier, i.e. the 8-byte IR alignment is reduced to the
; 32-bit maximum stated in the comment below.
23 define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
25 ;Check the alignment value. Max for this instruction is 32 bits:
26 ;CHECK: vld1.32 {d16[1]}, [r0, :32]
27 %tmp1 = load <2 x i32>* %B
28 %tmp2 = load i32* %A, align 8
29 %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
; vld1lanef: float counterpart of the i32 lane-load test. Loads one float from
; %A (marked align 4) and inserts it into lane 1 of the <2 x float> vector
; loaded from %B; the expected vld1.32 targets d16[1] with no qualifier on [r0].
; NOTE(review): the i32 test in this file expects :32 for an align-8 scalar
; load; whether align 4 should also produce :32 here depends on the backend
; revision this test was written against - confirm before changing the pattern.
33 define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
35 ;CHECK: vld1.32 {d16[1]}, [r0]
36 %tmp1 = load <2 x float>* %B
37 %tmp2 = load float* %A, align 4
38 %tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
; vld1laneQi8: 128-bit variant. Loads one i8 from %A (marked align 8) and
; inserts it into lane 9 of the <16 x i8> vector loaded from %B. The expected
; output addresses that element as d17[1] (lane 9 falls in the second 8-byte
; half of the vector) and has no alignment qualifier.
42 define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
44 ;CHECK: vld1.8 {d17[1]}, [r0]
45 %tmp1 = load <16 x i8>* %B
46 %tmp2 = load i8* %A, align 8
47 %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
; vld1laneQi16: 128-bit variant. Loads one i16 from %A (marked align 8) and
; inserts it into lane 5 of the <8 x i16> vector loaded from %B. The expected
; output addresses that element as d17[1] (lane 5 is index 1 of the upper
; four-element half) with a :16 alignment qualifier.
51 define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
53 ;CHECK: vld1.16 {d17[1]}, [r0, :16]
54 %tmp1 = load <8 x i16>* %B
55 %tmp2 = load i16* %A, align 8
56 %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
; vld1laneQi32: 128-bit variant. Loads one i32 from %A (marked align 8) and
; inserts it into lane 3 of the <4 x i32> vector loaded from %B. The expected
; output addresses that element as d17[1] (lane 3 is index 1 of the upper
; two-element half) with a :32 alignment qualifier.
60 define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
62 ;CHECK: vld1.32 {d17[1]}, [r0, :32]
63 %tmp1 = load <4 x i32>* %B
64 %tmp2 = load i32* %A, align 8
65 %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
; vld1laneQf: 128-bit float variant. Loads one float from %A (no explicit
; alignment on the load) and inserts it into lane 0 of the <4 x float> vector
; loaded from %B. The expected vld1.32 targets d16[0] with no alignment
; qualifier on [r0].
69 define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
71 ;CHECK: vld1.32 {d16[0]}, [r0]
72 %tmp1 = load <4 x float>* %B
73 %tmp2 = load float* %A
74 %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
; Two-member aggregate types used as the return types of the vld2lane
; intrinsics in this file. The first four hold 64-bit vectors, the last three
; hold 128-bit vectors.
78 %struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
79 %struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
80 %struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
81 %struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
83 %struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
84 %struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
85 %struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
; vld2lanei8: calls the v8i8 vld2lane intrinsic on %A, passing the vector loaded
; from %B as both input vectors, with lane operand 1 and alignment operand 4.
; The two returned vectors are extracted and added together. The expected
; vld2.8 loads lane 1 of d16/d17 with a :16 qualifier, i.e. the requested
; alignment is reduced to the 16-bit maximum stated in the comment below.
87 define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
89 ;Check the alignment value. Max for this instruction is 16 bits:
90 ;CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16]
91 %tmp1 = load <8 x i8>* %B
92 %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
93 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
94 %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
95 %tmp5 = add <8 x i8> %tmp3, %tmp4
; vld2lanei16: bitcasts %A to i8* and calls the v4i16 vld2lane intrinsic with
; the vector loaded from %B as both input vectors, lane operand 1 and alignment
; operand 8. The two returned vectors are extracted and added together. The
; expected vld2.16 loads lane 1 of d16/d17 with a :32 qualifier, i.e. the
; requested alignment is reduced to the 32-bit maximum stated below.
99 define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
101 ;Check the alignment value. Max for this instruction is 32 bits:
102 ;CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32]
103 %tmp0 = bitcast i16* %A to i8*
104 %tmp1 = load <4 x i16>* %B
105 %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
106 %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
107 %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
108 %tmp5 = add <4 x i16> %tmp3, %tmp4
; vld2lanei32: bitcasts %A to i8* and calls the v2i32 vld2lane intrinsic with
; the vector loaded from %B as both input vectors, lane operand 1 and alignment
; operand 1. The two returned vectors are extracted and added together.
112 define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
115 %tmp0 = bitcast i32* %A to i8*
116 %tmp1 = load <2 x i32>* %B
117 %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
118 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
119 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
120 %tmp5 = add <2 x i32> %tmp3, %tmp4
; vld2lanef: float counterpart of the v2i32 test. Bitcasts %A to i8* and calls
; the v2f32 vld2lane intrinsic with the vector loaded from %B as both input
; vectors, lane operand 1 and alignment operand 1. The two returned vectors are
; combined with fadd and the sum is returned.
124 define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
127 %tmp0 = bitcast float* %A to i8*
128 %tmp1 = load <2 x float>* %B
129 %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
130 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
131 %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
132 %tmp5 = fadd <2 x float> %tmp3, %tmp4
133 ret <2 x float> %tmp5
; vld2laneQi16: 128-bit variant. Bitcasts %A to i8* and calls the v8i16 vld2lane
; intrinsic with the vector loaded from %B as both input vectors, lane operand 5
; and alignment operand 1. The two returned vectors are extracted and added.
; The expected vld2.16 addresses lane 5 as index 1 of d17 and d19 (the upper
; halves of two 128-bit values) and has no alignment qualifier.
136 define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
137 ;CHECK: vld2laneQi16:
138 ;Check the (default) alignment.
139 ;CHECK: vld2.16 {d17[1], d19[1]}, [r0]
140 %tmp0 = bitcast i16* %A to i8*
141 %tmp1 = load <8 x i16>* %B
142 %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
143 %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
144 %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
145 %tmp5 = add <8 x i16> %tmp3, %tmp4
; vld2laneQi32: 128-bit variant. Bitcasts %A to i8* and calls the v4i32 vld2lane
; intrinsic with the vector loaded from %B as both input vectors, lane operand 2
; and alignment operand 16. The two returned vectors are extracted and added.
; The expected vld2.32 addresses lane 2 as index 0 of d17 and d19 and carries a
; :64 qualifier, i.e. the requested alignment is reduced to the 64-bit maximum
; stated in the comment below.
149 define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
150 ;CHECK: vld2laneQi32:
151 ;Check the alignment value. Max for this instruction is 64 bits:
152 ;CHECK: vld2.32 {d17[0], d19[0]}, [r0, :64]
153 %tmp0 = bitcast i32* %A to i8*
154 %tmp1 = load <4 x i32>* %B
155 %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
156 %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
157 %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
158 %tmp5 = add <4 x i32> %tmp3, %tmp4
; vld2laneQf: 128-bit float variant. Bitcasts %A to i8* and calls the v4f32
; vld2lane intrinsic with the vector loaded from %B as both input vectors, lane
; operand 1 and alignment operand 1. The two returned vectors are combined with
; fadd and the sum is returned.
162 define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
165 %tmp0 = bitcast float* %A to i8*
166 %tmp1 = load <4 x float>* %B
167 %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
168 %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
169 %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
170 %tmp5 = fadd <4 x float> %tmp3, %tmp4
171 ret <4 x float> %tmp5
; Declarations of the vld2lane intrinsics called by the tests above. Each takes
; an i8* address, two input vectors of the overloaded type and two i32 operands
; (used by the callers as lane index and alignment), returns the matching
; two-vector struct, and is marked nounwind readonly.
174 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
175 declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
176 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
177 declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly
179 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
180 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
181 declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly
; Three-member aggregate types used as the return types of the vld3lane
; intrinsics in this file. The first four hold 64-bit vectors, the last three
; hold 128-bit vectors.
183 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
184 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
185 %struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
186 %struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
188 %struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
189 %struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
190 %struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
; vld3lanei8: calls the v8i8 vld3lane intrinsic on %A, passing the vector loaded
; from %B as all three input vectors, with lane operand 1 and alignment
; operand 1. The three returned vectors are extracted and summed.
192 define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
195 %tmp1 = load <8 x i8>* %B
196 %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
197 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
198 %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
199 %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
200 %tmp6 = add <8 x i8> %tmp3, %tmp4
201 %tmp7 = add <8 x i8> %tmp5, %tmp6
; vld3lanei16: bitcasts %A to i8* and calls the v4i16 vld3lane intrinsic with
; the vector loaded from %B as all three input vectors, lane operand 1 and
; alignment operand 8. The three returned vectors are extracted and summed.
; Although an alignment of 8 is requested, the expected vld3.16 on lane 1 of
; d16/d17/d18 has no qualifier on [r0], matching the comment below that VLD3
; does not support alignment.
205 define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
207 ;Check the (default) alignment value. VLD3 does not support alignment.
208 ;CHECK: vld3.16 {d16[1], d17[1], d18[1]}, [r0]
209 %tmp0 = bitcast i16* %A to i8*
210 %tmp1 = load <4 x i16>* %B
211 %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
212 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
213 %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
214 %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
215 %tmp6 = add <4 x i16> %tmp3, %tmp4
216 %tmp7 = add <4 x i16> %tmp5, %tmp6
; vld3lanei32: bitcasts %A to i8* and calls the v2i32 vld3lane intrinsic with
; the vector loaded from %B as all three input vectors, lane operand 1 and
; alignment operand 1. The three returned vectors are extracted and summed.
220 define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
223 %tmp0 = bitcast i32* %A to i8*
224 %tmp1 = load <2 x i32>* %B
225 %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
226 %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
227 %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
228 %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
229 %tmp6 = add <2 x i32> %tmp3, %tmp4
230 %tmp7 = add <2 x i32> %tmp5, %tmp6
; vld3lanef: float counterpart of the v2i32 test. Bitcasts %A to i8* and calls
; the v2f32 vld3lane intrinsic with the vector loaded from %B as all three input
; vectors, lane operand 1 and alignment operand 1. The three returned vectors
; are combined with fadd and the sum is returned.
234 define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
237 %tmp0 = bitcast float* %A to i8*
238 %tmp1 = load <2 x float>* %B
239 %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
240 %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
241 %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
242 %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
243 %tmp6 = fadd <2 x float> %tmp3, %tmp4
244 %tmp7 = fadd <2 x float> %tmp5, %tmp6
245 ret <2 x float> %tmp7
; vld3laneQi16: 128-bit variant. Bitcasts %A to i8* and calls the v8i16 vld3lane
; intrinsic with the vector loaded from %B as all three input vectors, lane
; operand 1 and alignment operand 8. The three returned vectors are extracted
; and summed. The expected vld3.16 loads lane 1 of every second D register
; (d16, d18, d20) and, despite the requested alignment, has no qualifier on
; [r0], matching the comment below that VLD3 does not support alignment.
248 define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
249 ;CHECK: vld3laneQi16:
250 ;Check the (default) alignment value. VLD3 does not support alignment.
251 ;CHECK: vld3.16 {d16[1], d18[1], d20[1]}, [r0]
252 %tmp0 = bitcast i16* %A to i8*
253 %tmp1 = load <8 x i16>* %B
254 %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
255 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
256 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
257 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
258 %tmp6 = add <8 x i16> %tmp3, %tmp4
259 %tmp7 = add <8 x i16> %tmp5, %tmp6
; vld3laneQi32: 128-bit variant. Bitcasts %A to i8* and calls the v4i32 vld3lane
; intrinsic with the vector loaded from %B as all three input vectors, lane
; operand 3 and alignment operand 1. The three returned vectors are extracted
; and summed. The pattern below anchors matching at this function's label.
263 define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
264 ;CHECK: vld3laneQi32:
266 %tmp0 = bitcast i32* %A to i8*
267 %tmp1 = load <4 x i32>* %B
268 %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
269 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
270 %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
271 %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
272 %tmp6 = add <4 x i32> %tmp3, %tmp4
273 %tmp7 = add <4 x i32> %tmp5, %tmp6
; vld3laneQf: 128-bit float variant. Bitcasts %A to i8* and calls the v4f32
; vld3lane intrinsic with the vector loaded from %B as all three input vectors,
; lane operand 1 and alignment operand 1. The three returned vectors are
; combined with fadd and the sum is returned.
277 define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
280 %tmp0 = bitcast float* %A to i8*
281 %tmp1 = load <4 x float>* %B
282 %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
283 %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
284 %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
285 %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
286 %tmp6 = fadd <4 x float> %tmp3, %tmp4
287 %tmp7 = fadd <4 x float> %tmp5, %tmp6
288 ret <4 x float> %tmp7
; Declarations of the vld3lane intrinsics called by the tests above. Each takes
; an i8* address, three input vectors of the overloaded type and two i32
; operands (used by the callers as lane index and alignment), returns the
; matching three-vector struct, and is marked nounwind readonly.
291 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
292 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
293 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
294 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
296 declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
297 declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
298 declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
; Four-member aggregate types used as the return types of the vld4lane
; intrinsics in this file. The first four hold 64-bit vectors, the last three
; hold 128-bit vectors.
300 %struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
301 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
302 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
303 %struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
305 %struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
306 %struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
307 %struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
; vld4lanei8: calls the v8i8 vld4lane intrinsic on %A, passing the vector loaded
; from %B as all four input vectors, with lane operand 1 and alignment
; operand 8. The four returned vectors are extracted and summed pairwise. The
; expected vld4.8 loads lane 1 of d16-d19 with a :32 qualifier, i.e. the
; requested alignment is reduced to the 32-bit maximum stated below.
309 define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
311 ;Check the alignment value. Max for this instruction is 32 bits:
312 ;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32]
313 %tmp1 = load <8 x i8>* %B
314 %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
315 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
316 %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
317 %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
318 %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
319 %tmp7 = add <8 x i8> %tmp3, %tmp4
320 %tmp8 = add <8 x i8> %tmp5, %tmp6
321 %tmp9 = add <8 x i8> %tmp7, %tmp8
; vld4lanei16: bitcasts %A to i8* and calls the v4i16 vld4lane intrinsic with
; the vector loaded from %B as all four input vectors, lane operand 1 and
; alignment operand 4. The four returned vectors are extracted and summed
; pairwise. Four i16 elements are loaded (8 bytes in total), so the requested
; 4-byte alignment is smaller than the access and the expected vld4.16 carries
; no qualifier on [r0].
325 define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
327 ;Check that a power-of-two alignment smaller than the total size of the memory
328 ;being loaded is ignored.
329 ;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0]
330 %tmp0 = bitcast i16* %A to i8*
331 %tmp1 = load <4 x i16>* %B
332 %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
333 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
334 %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
335 %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
336 %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
337 %tmp7 = add <4 x i16> %tmp3, %tmp4
338 %tmp8 = add <4 x i16> %tmp5, %tmp6
339 %tmp9 = add <4 x i16> %tmp7, %tmp8
; vld4lanei32: bitcasts %A to i8* and calls the v2i32 vld4lane intrinsic with
; the vector loaded from %B as all four input vectors, lane operand 1 and
; alignment operand 8. The four returned vectors are extracted and summed
; pairwise. Four i32 elements are loaded (16 bytes in total); the 8-byte
; alignment is still emitted, as a :64 qualifier on the expected vld4.32.
343 define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
345 ;Check the alignment value. An 8-byte alignment is allowed here even though
346 ;it is smaller than the total size of the memory being loaded.
347 ;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :64]
348 %tmp0 = bitcast i32* %A to i8*
349 %tmp1 = load <2 x i32>* %B
350 %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
351 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
352 %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
353 %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
354 %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
355 %tmp7 = add <2 x i32> %tmp3, %tmp4
356 %tmp8 = add <2 x i32> %tmp5, %tmp6
357 %tmp9 = add <2 x i32> %tmp7, %tmp8
; vld4lanef: float counterpart of the v2i32 test. Bitcasts %A to i8* and calls
; the v2f32 vld4lane intrinsic with the vector loaded from %B as all four input
; vectors, lane operand 1 and alignment operand 1. The four returned vectors
; are combined pairwise with fadd and the sum is returned.
361 define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
364 %tmp0 = bitcast float* %A to i8*
365 %tmp1 = load <2 x float>* %B
366 %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
367 %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
368 %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
369 %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
370 %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
371 %tmp7 = fadd <2 x float> %tmp3, %tmp4
372 %tmp8 = fadd <2 x float> %tmp5, %tmp6
373 %tmp9 = fadd <2 x float> %tmp7, %tmp8
374 ret <2 x float> %tmp9
; vld4laneQi16: 128-bit variant. Bitcasts %A to i8* and calls the v8i16 vld4lane
; intrinsic with the vector loaded from %B as all four input vectors, lane
; operand 1 and alignment operand 16. The four returned vectors are extracted
; and summed pairwise. The expected vld4.16 loads lane 1 of every second D
; register (d16, d18, d20, d22) with a :64 qualifier, i.e. the requested
; alignment is reduced to the 64-bit maximum stated below.
377 define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
378 ;CHECK: vld4laneQi16:
379 ;Check the alignment value. Max for this instruction is 64 bits:
380 ;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0, :64]
381 %tmp0 = bitcast i16* %A to i8*
382 %tmp1 = load <8 x i16>* %B
383 %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
384 %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
385 %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
386 %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
387 %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
388 %tmp7 = add <8 x i16> %tmp3, %tmp4
389 %tmp8 = add <8 x i16> %tmp5, %tmp6
390 %tmp9 = add <8 x i16> %tmp7, %tmp8
; vld4laneQi32: 128-bit variant. Bitcasts %A to i8* and calls the v4i32 vld4lane
; intrinsic with the vector loaded from %B as all four input vectors, lane
; operand 2 and alignment operand 1. The four returned vectors are extracted
; and summed pairwise. The expected vld4.32 addresses lane 2 as index 0 of the
; odd D registers (d17, d19, d21, d23) and has no alignment qualifier.
394 define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
395 ;CHECK: vld4laneQi32:
396 ;Check the (default) alignment.
397 ;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
398 %tmp0 = bitcast i32* %A to i8*
399 %tmp1 = load <4 x i32>* %B
400 %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
401 %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
402 %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
403 %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
404 %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
405 %tmp7 = add <4 x i32> %tmp3, %tmp4
406 %tmp8 = add <4 x i32> %tmp5, %tmp6
407 %tmp9 = add <4 x i32> %tmp7, %tmp8
; vld4laneQf: 128-bit float variant. Bitcasts %A to i8* and calls the v4f32
; vld4lane intrinsic with the vector loaded from %B as all four input vectors,
; lane operand 1 and alignment operand 1. The four returned vectors are
; combined pairwise with fadd and the sum is returned.
411 define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
414 %tmp0 = bitcast float* %A to i8*
415 %tmp1 = load <4 x float>* %B
416 %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
417 %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
418 %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
419 %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
420 %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
421 %tmp7 = fadd <4 x float> %tmp3, %tmp4
422 %tmp8 = fadd <4 x float> %tmp5, %tmp6
423 %tmp9 = fadd <4 x float> %tmp7, %tmp8
424 ret <4 x float> %tmp9
; Declarations of the vld4lane intrinsics called by the tests above. Each takes
; an i8* address, four input vectors of the overloaded type and two i32
; operands (used by the callers as lane index and alignment), returns the
; matching four-vector struct, and is marked nounwind readonly.
427 declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
428 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
429 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
430 declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
432 declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
433 declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
434 declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly