1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
; Named aggregate types used by the multi-vector tests in this file. Each one
; wraps a single array of 2, 3 or 4 identical NEON vectors; the name encodes
; element type, lane count and array length (e.g. int8x16x2_t is
; [2 x <16 x i8>]).
3 %struct.int8x16x2_t = type { [2 x <16 x i8>] }
4 %struct.int16x8x2_t = type { [2 x <8 x i16>] }
5 %struct.int32x4x2_t = type { [2 x <4 x i32>] }
6 %struct.int64x2x2_t = type { [2 x <2 x i64>] }
7 %struct.float32x4x2_t = type { [2 x <4 x float>] }
8 %struct.float64x2x2_t = type { [2 x <2 x double>] }
9 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
10 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
11 %struct.int32x2x2_t = type { [2 x <2 x i32>] }
12 %struct.int64x1x2_t = type { [2 x <1 x i64>] }
13 %struct.float32x2x2_t = type { [2 x <2 x float>] }
14 %struct.float64x1x2_t = type { [2 x <1 x double>] }
15 %struct.int8x16x3_t = type { [3 x <16 x i8>] }
16 %struct.int16x8x3_t = type { [3 x <8 x i16>] }
17 %struct.int32x4x3_t = type { [3 x <4 x i32>] }
18 %struct.int64x2x3_t = type { [3 x <2 x i64>] }
19 %struct.float32x4x3_t = type { [3 x <4 x float>] }
20 %struct.float64x2x3_t = type { [3 x <2 x double>] }
21 %struct.int8x8x3_t = type { [3 x <8 x i8>] }
22 %struct.int16x4x3_t = type { [3 x <4 x i16>] }
23 %struct.int32x2x3_t = type { [3 x <2 x i32>] }
24 %struct.int64x1x3_t = type { [3 x <1 x i64>] }
25 %struct.float32x2x3_t = type { [3 x <2 x float>] }
26 %struct.float64x1x3_t = type { [3 x <1 x double>] }
27 %struct.int8x16x4_t = type { [4 x <16 x i8>] }
28 %struct.int16x8x4_t = type { [4 x <8 x i16>] }
29 %struct.int32x4x4_t = type { [4 x <4 x i32>] }
30 %struct.int64x2x4_t = type { [4 x <2 x i64>] }
31 %struct.float32x4x4_t = type { [4 x <4 x float>] }
32 %struct.float64x2x4_t = type { [4 x <2 x double>] }
33 %struct.int8x8x4_t = type { [4 x <8 x i8>] }
34 %struct.int16x4x4_t = type { [4 x <4 x i16>] }
35 %struct.int32x2x4_t = type { [4 x <2 x i32>] }
36 %struct.int64x1x4_t = type { [4 x <1 x i64>] }
37 %struct.float32x2x4_t = type { [4 x <2 x float>] }
38 %struct.float64x1x4_t = type { [4 x <1 x double>] }
; Scalar-load splat: one i8 is loaded from %a, placed in lane 0 of an undef
; <16 x i8>, and broadcast to all 16 lanes by the all-zero shuffle mask.
; The CHECK line requires an `ld1r` on the .16b arrangement addressed via x0.
40 define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
41 ; CHECK-LABEL: test_vld1q_dup_s8
42 ; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
44 %0 = load i8* %a, align 1
45 %1 = insertelement <16 x i8> undef, i8 %0, i32 0
46 %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
; Scalar-load splat: one i16 is loaded from %a, placed in lane 0 of an undef
; <8 x i16>, and broadcast to all 8 lanes by the all-zero shuffle mask.
; The CHECK line requires an `ld1r` on the .8h arrangement addressed via x0.
50 define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
51 ; CHECK-LABEL: test_vld1q_dup_s16
52 ; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
54 %0 = load i16* %a, align 2
55 %1 = insertelement <8 x i16> undef, i16 %0, i32 0
56 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
; Scalar-load splat: one i32 is loaded from %a, placed in lane 0 of an undef
; <4 x i32>, and broadcast to all 4 lanes by the all-zero shuffle mask.
; The CHECK line requires an `ld1r` on the .4s arrangement addressed via x0.
60 define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
61 ; CHECK-LABEL: test_vld1q_dup_s32
62 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
64 %0 = load i32* %a, align 4
65 %1 = insertelement <4 x i32> undef, i32 %0, i32 0
66 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
; Scalar-load splat: one i64 is loaded from %a, placed in lane 0 of an undef
; <2 x i64>, and broadcast to both lanes by the all-zero shuffle mask.
; The CHECK line requires an `ld1r` on the .2d arrangement addressed via x0.
70 define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
71 ; CHECK-LABEL: test_vld1q_dup_s64
72 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
74 %0 = load i64* %a, align 8
75 %1 = insertelement <2 x i64> undef, i64 %0, i32 0
76 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
; Scalar-load splat: one float is loaded from %a, placed in lane 0 of an undef
; <4 x float>, and broadcast to all 4 lanes by the all-zero shuffle mask.
; The CHECK line requires an `ld1r` on the .4s arrangement addressed via x0.
80 define <4 x float> @test_vld1q_dup_f32(float* %a) {
81 ; CHECK-LABEL: test_vld1q_dup_f32
82 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
84 %0 = load float* %a, align 4
85 %1 = insertelement <4 x float> undef, float %0, i32 0
86 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
; Scalar-load splat: one double is loaded from %a, placed in lane 0 of an
; undef <2 x double>, broadcast to both lanes by the all-zero shuffle mask,
; and the splat is returned.
; The CHECK line requires an `ld1r` on the .2d arrangement addressed via x0.
90 define <2 x double> @test_vld1q_dup_f64(double* %a) {
91 ; CHECK-LABEL: test_vld1q_dup_f64
92 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
94 %0 = load double* %a, align 8
95 %1 = insertelement <2 x double> undef, double %0, i32 0
96 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
97 ret <2 x double> %lane
; 64-bit-vector scalar-load splat: one i8 is loaded from %a, placed in lane 0
; of an undef <8 x i8>, and broadcast to all 8 lanes by the all-zero mask.
; The CHECK line requires an `ld1r` on the .8b arrangement addressed via x0.
100 define <8 x i8> @test_vld1_dup_s8(i8* %a) {
101 ; CHECK-LABEL: test_vld1_dup_s8
102 ; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
104 %0 = load i8* %a, align 1
105 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
106 %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
; 64-bit-vector scalar-load splat: one i16 is loaded from %a, placed in lane 0
; of an undef <4 x i16>, and broadcast to all 4 lanes by the all-zero mask.
; The CHECK line requires an `ld1r` on the .4h arrangement addressed via x0.
110 define <4 x i16> @test_vld1_dup_s16(i16* %a) {
111 ; CHECK-LABEL: test_vld1_dup_s16
112 ; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
114 %0 = load i16* %a, align 2
115 %1 = insertelement <4 x i16> undef, i16 %0, i32 0
116 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
; 64-bit-vector scalar-load splat: one i32 is loaded from %a, placed in lane 0
; of an undef <2 x i32>, and broadcast to both lanes by the all-zero mask.
; The CHECK line requires an `ld1r` on the .2s arrangement addressed via x0.
120 define <2 x i32> @test_vld1_dup_s32(i32* %a) {
121 ; CHECK-LABEL: test_vld1_dup_s32
122 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
124 %0 = load i32* %a, align 4
125 %1 = insertelement <2 x i32> undef, i32 %0, i32 0
126 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
; Single-lane case: one i64 is loaded from %a and placed in lane 0 of a
; <1 x i64>. No shufflevector appears in the visible body.
; The CHECK line requires an `ld1r` on the .1d arrangement addressed via x0.
130 define <1 x i64> @test_vld1_dup_s64(i64* %a) {
131 ; CHECK-LABEL: test_vld1_dup_s64
132 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
134 %0 = load i64* %a, align 8
135 %1 = insertelement <1 x i64> undef, i64 %0, i32 0
; 64-bit-vector scalar-load splat: one float is loaded from %a, placed in
; lane 0 of an undef <2 x float>, broadcast to both lanes by the all-zero
; mask, and the splat is returned.
; The CHECK line requires an `ld1r` on the .2s arrangement addressed via x0.
139 define <2 x float> @test_vld1_dup_f32(float* %a) {
140 ; CHECK-LABEL: test_vld1_dup_f32
141 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
143 %0 = load float* %a, align 4
144 %1 = insertelement <2 x float> undef, float %0, i32 0
145 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
146 ret <2 x float> %lane
; Single-lane case: one double is loaded from %a and placed in lane 0 of a
; <1 x double>. No shufflevector appears in the visible body.
; The CHECK line requires an `ld1r` on the .1d arrangement addressed via x0.
149 define <1 x double> @test_vld1_dup_f64(double* %a) {
150 ; CHECK-LABEL: test_vld1_dup_f64
151 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
153 %0 = load double* %a, align 8
154 %1 = insertelement <1 x double> undef, double %0, i32 0
; Two-vector load-and-duplicate: calls @llvm.arm.neon.vld2lane.v16i8 on %a
; with undef input vectors, broadcasts lane 0 of each result with an all-zero
; shuffle mask, and returns both splats packed in %struct.int8x16x2_t.
; NOTE(review): the trailing i32 operands (0, 1) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld2r` with two .16b registers addressed via x0.
158 define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
159 ; CHECK-LABEL: test_vld2q_dup_s8
160 ; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
162 %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
163 %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
164 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
165 %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
166 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
167 %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
168 %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
169 ret %struct.int8x16x2_t %.fca.0.1.insert
; Two-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld2lane.v8i16 with undef input vectors; lane 0 of each
; result is broadcast by an all-zero shuffle mask and both splats are returned
; packed in %struct.int16x8x2_t.
; NOTE(review): the trailing i32 operands (0, 2) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld2r` with two .8h registers addressed via x0.
172 define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
173 ; CHECK-LABEL: test_vld2q_dup_s16
174 ; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
176 %0 = bitcast i16* %a to i8*
177 %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
178 %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
179 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
180 %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
181 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
182 %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
183 %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
184 ret %struct.int16x8x2_t %.fca.0.1.insert
; Two-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld2lane.v4i32 with undef input vectors; lane 0 of each
; result is broadcast by an all-zero shuffle mask and both splats are returned
; packed in %struct.int32x4x2_t.
; NOTE(review): the trailing i32 operands (0, 4) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld2r` with two .4s registers addressed via x0.
187 define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
188 ; CHECK-LABEL: test_vld2q_dup_s32
189 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
191 %0 = bitcast i32* %a to i8*
192 %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
193 %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
194 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
195 %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
196 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
197 %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
198 %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
199 ret %struct.int32x4x2_t %.fca.0.1.insert
; Two-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld2lane.v2i64 with undef input vectors; lane 0 of each
; result is broadcast by an all-zero shuffle mask and both splats are returned
; packed in %struct.int64x2x2_t.
; NOTE(review): the trailing i32 operands (0, 8) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld2r` with two .2d registers addressed via x0.
202 define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
203 ; CHECK-LABEL: test_vld2q_dup_s64
204 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
206 %0 = bitcast i64* %a to i8*
207 %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
208 %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
209 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
210 %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
211 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
212 %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
213 %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
214 ret %struct.int64x2x2_t %.fca.0.1.insert
; Two-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld2lane.v4f32 with undef input vectors; lane 0 of each
; result is broadcast by an all-zero shuffle mask and both splats are returned
; packed in %struct.float32x4x2_t.
; NOTE(review): the trailing i32 operands (0, 4) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld2r` with two .4s registers addressed via x0.
217 define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
218 ; CHECK-LABEL: test_vld2q_dup_f32
219 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
221 %0 = bitcast float* %a to i8*
222 %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
223 %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
224 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
225 %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
226 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
227 %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
228 %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
229 ret %struct.float32x4x2_t %.fca.0.1.insert
; Two-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld2lane.v2f64 with undef input vectors; lane 0 of each
; result is broadcast by an all-zero shuffle mask and both splats are returned
; packed in %struct.float64x2x2_t.
; NOTE(review): the trailing i32 operands (0, 8) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld2r` with two .2d registers addressed via x0.
232 define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
233 ; CHECK-LABEL: test_vld2q_dup_f64
234 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
236 %0 = bitcast double* %a to i8*
237 %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
238 %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
239 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
240 %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
241 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
242 %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
243 %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
244 ret %struct.float64x2x2_t %.fca.0.1.insert
; 64-bit two-vector load-and-duplicate: calls @llvm.arm.neon.vld2lane.v8i8 on
; %a with undef input vectors, broadcasts lane 0 of each result with an
; all-zero shuffle mask, and returns both splats in %struct.int8x8x2_t.
; NOTE(review): the trailing i32 operands (0, 1) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld2r` with two .8b registers addressed via x0.
247 define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
248 ; CHECK-LABEL: test_vld2_dup_s8
249 ; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
251 %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
252 %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
253 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
254 %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
255 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
256 %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
257 %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
258 ret %struct.int8x8x2_t %.fca.0.1.insert
; 64-bit two-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld2lane.v4i16 with undef input vectors; lane 0 of each
; result is broadcast by an all-zero shuffle mask and both splats are returned
; packed in %struct.int16x4x2_t.
; NOTE(review): the trailing i32 operands (0, 2) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld2r` with two .4h registers addressed via x0.
261 define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
262 ; CHECK-LABEL: test_vld2_dup_s16
263 ; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
265 %0 = bitcast i16* %a to i8*
266 %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
267 %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
268 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
269 %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
270 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
271 %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
272 %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
273 ret %struct.int16x4x2_t %.fca.0.1.insert
; 64-bit two-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld2lane.v2i32 with undef input vectors; lane 0 of each
; result is broadcast by an all-zero shuffle mask and both splats are returned
; packed in %struct.int32x2x2_t.
; NOTE(review): the trailing i32 operands (0, 4) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld2r` with two .2s registers addressed via x0.
276 define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
277 ; CHECK-LABEL: test_vld2_dup_s32
278 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
280 %0 = bitcast i32* %a to i8*
281 %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
282 %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
283 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
284 %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
285 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
286 %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
287 %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
288 ret %struct.int32x2x2_t %.fca.0.1.insert
; Single-lane two-vector case: unlike the other vld2 tests this calls the
; whole-vector intrinsic @llvm.arm.neon.vld2.v1i64 (not the lane form) on %a
; bitcast to i8*, and returns the two <1 x i64> results packed in
; %struct.int64x1x2_t with no shuffle.
; NOTE(review): the trailing `i32 8` is presumably the alignment -- confirm.
; The CHECK line requires a two-register `ld1` on .1d addressed via x0.
291 define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
292 ; CHECK-LABEL: test_vld2_dup_s64
293 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
295 %0 = bitcast i64* %a to i8*
296 %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
297 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
298 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
299 %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
300 %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
301 ret %struct.int64x1x2_t %.fca.0.1.insert
; 64-bit two-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld2lane.v2f32 with undef input vectors; lane 0 of each
; result is broadcast by an all-zero shuffle mask and both splats are returned
; packed in %struct.float32x2x2_t.
; NOTE(review): the trailing i32 operands (0, 4) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld2r` with two .2s registers addressed via x0.
304 define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
305 ; CHECK-LABEL: test_vld2_dup_f32
306 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
308 %0 = bitcast float* %a to i8*
309 %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
310 %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
311 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
312 %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
313 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
314 %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
315 %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
316 ret %struct.float32x2x2_t %.fca.0.1.insert
; Single-lane two-vector case: calls the whole-vector intrinsic
; @llvm.arm.neon.vld2.v1f64 (not the lane form) on %a bitcast to i8*, and
; returns the two <1 x double> results packed in %struct.float64x1x2_t with
; no shuffle.
; NOTE(review): the trailing `i32 8` is presumably the alignment -- confirm.
; The CHECK line requires a two-register `ld1` on .1d addressed via x0.
319 define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
320 ; CHECK-LABEL: test_vld2_dup_f64
321 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
323 %0 = bitcast double* %a to i8*
324 %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
325 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
326 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
327 %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
328 %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
329 ret %struct.float64x1x2_t %.fca.0.1.insert
; Three-vector load-and-duplicate: calls @llvm.arm.neon.vld3lane.v16i8 on %a
; with undef input vectors, broadcasts lane 0 of each of the three results
; with an all-zero shuffle mask, and returns them in %struct.int8x16x3_t.
; NOTE(review): the trailing i32 operands (0, 1) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld3r` with three .16b registers addressed via x0.
332 define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
333 ; CHECK-LABEL: test_vld3q_dup_s8
334 ; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
336 %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
337 %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
338 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
339 %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
340 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
341 %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
342 %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
343 %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
344 %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
345 %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
346 ret %struct.int8x16x3_t %.fca.0.2.insert
; Three-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld3lane.v8i16 with undef input vectors; lane 0 of each of
; the three results is broadcast by an all-zero shuffle mask and the splats
; are returned packed in %struct.int16x8x3_t.
; NOTE(review): the trailing i32 operands (0, 2) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld3r` with three .8h registers addressed via x0.
349 define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
350 ; CHECK-LABEL: test_vld3q_dup_s16
351 ; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
353 %0 = bitcast i16* %a to i8*
354 %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
355 %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
356 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
357 %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
358 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
359 %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
360 %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
361 %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
362 %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
363 %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
364 ret %struct.int16x8x3_t %.fca.0.2.insert
; Three-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld3lane.v4i32 with undef input vectors; lane 0 of each of
; the three results is broadcast by an all-zero shuffle mask and the splats
; are returned packed in %struct.int32x4x3_t.
; NOTE(review): the trailing i32 operands (0, 4) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld3r` with three .4s registers addressed via x0.
367 define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
368 ; CHECK-LABEL: test_vld3q_dup_s32
369 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
371 %0 = bitcast i32* %a to i8*
372 %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
373 %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
374 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
375 %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
376 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
377 %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
378 %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
379 %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
380 %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
381 %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
382 ret %struct.int32x4x3_t %.fca.0.2.insert
; Three-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld3lane.v2i64 with undef input vectors; lane 0 of each of
; the three results is broadcast by an all-zero shuffle mask and the splats
; are returned packed in %struct.int64x2x3_t.
; NOTE(review): the trailing i32 operands (0, 8) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld3r` with three .2d registers addressed via x0.
385 define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
386 ; CHECK-LABEL: test_vld3q_dup_s64
387 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
389 %0 = bitcast i64* %a to i8*
390 %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
391 %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
392 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
393 %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
394 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
395 %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
396 %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
397 %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
398 %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
399 %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
400 ret %struct.int64x2x3_t %.fca.0.2.insert
; Three-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld3lane.v4f32 with undef input vectors; lane 0 of each of
; the three results is broadcast by an all-zero shuffle mask and the splats
; are returned packed in %struct.float32x4x3_t.
; NOTE(review): the trailing i32 operands (0, 4) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld3r` with three .4s registers addressed via x0.
403 define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
404 ; CHECK-LABEL: test_vld3q_dup_f32
405 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
407 %0 = bitcast float* %a to i8*
408 %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
409 %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
410 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
411 %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
412 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
413 %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
414 %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
415 %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
416 %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
417 %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
418 ret %struct.float32x4x3_t %.fca.0.2.insert
; Three-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld3lane.v2f64 with undef input vectors; lane 0 of each of
; the three results is broadcast by an all-zero shuffle mask and the splats
; are returned packed in %struct.float64x2x3_t.
; NOTE(review): the trailing i32 operands (0, 8) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld3r` with three .2d registers addressed via x0.
421 define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
422 ; CHECK-LABEL: test_vld3q_dup_f64
423 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
425 %0 = bitcast double* %a to i8*
426 %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
427 %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
428 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
429 %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
430 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
431 %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
432 %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
433 %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
434 %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
435 %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
436 ret %struct.float64x2x3_t %.fca.0.2.insert
; 64-bit three-vector load-and-duplicate: calls @llvm.arm.neon.vld3lane.v8i8
; on %a with undef input vectors, broadcasts lane 0 of each of the three
; results with an all-zero shuffle mask, and returns them in
; %struct.int8x8x3_t.
; NOTE(review): the trailing i32 operands (0, 1) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld3r` with three .8b registers addressed via x0.
439 define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
440 ; CHECK-LABEL: test_vld3_dup_s8
441 ; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
443 %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
444 %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
445 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
446 %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
447 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
448 %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
449 %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
450 %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
451 %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
452 %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
453 ret %struct.int8x8x3_t %.fca.0.2.insert
; 64-bit three-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld3lane.v4i16 with undef input vectors; lane 0 of each of
; the three results is broadcast by an all-zero shuffle mask and the splats
; are returned packed in %struct.int16x4x3_t.
; NOTE(review): the trailing i32 operands (0, 2) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld3r` with three .4h registers addressed via x0.
456 define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
457 ; CHECK-LABEL: test_vld3_dup_s16
458 ; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
460 %0 = bitcast i16* %a to i8*
461 %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
462 %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
463 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
464 %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
465 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
466 %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
467 %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
468 %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
469 %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
470 %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
471 ret %struct.int16x4x3_t %.fca.0.2.insert
; 64-bit three-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld3lane.v2i32 with undef input vectors; lane 0 of each of
; the three results is broadcast by an all-zero shuffle mask and the splats
; are returned packed in %struct.int32x2x3_t.
; NOTE(review): the trailing i32 operands (0, 4) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld3r` with three .2s registers addressed via x0.
474 define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
475 ; CHECK-LABEL: test_vld3_dup_s32
476 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
478 %0 = bitcast i32* %a to i8*
479 %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
480 %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
481 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
482 %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
483 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
484 %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
485 %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
486 %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
487 %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
488 %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
489 ret %struct.int32x2x3_t %.fca.0.2.insert
; Single-lane three-vector case: calls the whole-vector intrinsic
; @llvm.arm.neon.vld3.v1i64 (not the lane form) on %a bitcast to i8*, and
; returns the three <1 x i64> results packed in %struct.int64x1x3_t with no
; shuffle.
; NOTE(review): the trailing `i32 8` is presumably the alignment -- confirm.
; The CHECK line requires a three-register `ld1` on .1d addressed via x0.
492 define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
493 ; CHECK-LABEL: test_vld3_dup_s64
494 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
496 %0 = bitcast i64* %a to i8*
497 %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
498 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
499 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
500 %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
501 %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
502 %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
503 %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
504 ret %struct.int64x1x3_t %.fca.0.2.insert
; 64-bit three-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld3lane.v2f32 with undef input vectors; lane 0 of each of
; the three results is broadcast by an all-zero shuffle mask and the splats
; are returned packed in %struct.float32x2x3_t.
; NOTE(review): the trailing i32 operands (0, 4) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld3r` with three .2s registers addressed via x0.
507 define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
508 ; CHECK-LABEL: test_vld3_dup_f32
509 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
511 %0 = bitcast float* %a to i8*
512 %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
513 %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
514 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
515 %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
516 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
517 %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
518 %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
519 %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
520 %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
521 %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
522 ret %struct.float32x2x3_t %.fca.0.2.insert
; Single-lane three-vector case: calls the whole-vector intrinsic
; @llvm.arm.neon.vld3.v1f64 (not the lane form) on %a bitcast to i8*, and
; returns the three <1 x double> results packed in %struct.float64x1x3_t with
; no shuffle.
; NOTE(review): the trailing `i32 8` is presumably the alignment -- confirm.
; The CHECK line requires a three-register `ld1` on .1d addressed via x0.
525 define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
526 ; CHECK-LABEL: test_vld3_dup_f64
527 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
529 %0 = bitcast double* %a to i8*
530 %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
531 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
532 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
533 %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
534 %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
535 %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
536 %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
537 ret %struct.float64x1x3_t %.fca.0.2.insert
; Four-vector load-and-duplicate: calls @llvm.arm.neon.vld4lane.v16i8 on %a
; with undef input vectors, broadcasts lane 0 of each of the four results
; with an all-zero shuffle mask, and returns them in %struct.int8x16x4_t.
; NOTE(review): the trailing i32 operands (0, 1) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld4r` with four .16b registers addressed via x0.
540 define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
541 ; CHECK-LABEL: test_vld4q_dup_s8
542 ; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
544 %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
545 %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
546 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
547 %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
548 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
549 %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
550 %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
551 %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
552 %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
553 %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
554 %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
555 %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
556 %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
557 ret %struct.int8x16x4_t %.fca.0.3.insert
; Four-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld4lane.v8i16 with undef input vectors; lane 0 of each of
; the four results is broadcast by an all-zero shuffle mask and the splats
; are returned packed in %struct.int16x8x4_t.
; NOTE(review): the trailing i32 operands (0, 2) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld4r` with four .8h registers addressed via x0.
560 define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
561 ; CHECK-LABEL: test_vld4q_dup_s16
562 ; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
564 %0 = bitcast i16* %a to i8*
565 %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
566 %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
567 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
568 %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
569 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
570 %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
571 %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
572 %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
573 %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
574 %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
575 %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
576 %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
577 %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
578 ret %struct.int16x8x4_t %.fca.0.3.insert
; Four-vector load-and-duplicate: %a is bitcast to i8* and passed to
; @llvm.arm.neon.vld4lane.v4i32 with undef input vectors; lane 0 of each of
; the four results is broadcast by an all-zero shuffle mask and the splats
; are returned packed in %struct.int32x4x4_t.
; NOTE(review): the trailing i32 operands (0, 4) are presumably lane index and
; alignment -- confirm against the intrinsic definition.
; The CHECK line requires an `ld4r` with four .4s registers addressed via x0.
581 define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
582 ; CHECK-LABEL: test_vld4q_dup_s32
583 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
585 %0 = bitcast i32* %a to i8*
586 %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
587 %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
588 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
589 %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
590 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
591 %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
592 %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
593 %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
594 %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
595 %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
596 %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
597 %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
598 %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
599 ret %struct.int32x4x4_t %.fca.0.3.insert
; test_vld4q_dup_s64: calls the vld4lane.v2i64 intrinsic on lane 0 with undef
; input vectors, splats lane 0 of each of the four results (all-zero shuffle
; mask) and packs them into %struct.int64x2x4_t. The expected output is one
; ld4r on four .2d registers addressed through [x0].
602 define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
603 ; CHECK-LABEL: test_vld4q_dup_s64
604 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
606 %0 = bitcast i64* %a to i8*
; NOTE(review): the final i32 operand (8) looks like an alignment hint - confirm.
607 %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
608 %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
609 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
610 %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
611 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
612 %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
613 %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
614 %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
615 %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
616 %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
617 %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
618 %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
619 %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
620 ret %struct.int64x2x4_t %.fca.0.3.insert
; test_vld4q_dup_f32: calls the vld4lane.v4f32 intrinsic on lane 0 with undef
; input vectors, splats lane 0 of each of the four results (all-zero shuffle
; mask) and packs them into %struct.float32x4x4_t. The expected output is one
; ld4r on four .4s registers addressed through [x0].
623 define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
624 ; CHECK-LABEL: test_vld4q_dup_f32
625 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
627 %0 = bitcast float* %a to i8*
; NOTE(review): the final i32 operand (4) looks like an alignment hint - confirm.
628 %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
629 %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
630 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
631 %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
632 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
633 %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
634 %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
635 %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
636 %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
637 %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
638 %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
639 %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
640 %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
641 ret %struct.float32x4x4_t %.fca.0.3.insert
; test_vld4q_dup_f64: calls the vld4lane.v2f64 intrinsic on lane 0 with undef
; input vectors, splats lane 0 of each of the four results (all-zero shuffle
; mask) and packs them into %struct.float64x2x4_t. The expected output is one
; ld4r on four .2d registers addressed through [x0].
644 define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
645 ; CHECK-LABEL: test_vld4q_dup_f64
646 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
648 %0 = bitcast double* %a to i8*
; NOTE(review): the final i32 operand (8) looks like an alignment hint - confirm.
649 %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
650 %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
651 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
652 %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
653 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
654 %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
655 %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
656 %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
657 %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
658 %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
659 %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
660 %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
661 %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
662 ret %struct.float64x2x4_t %.fca.0.3.insert
; test_vld4_dup_s8: calls the vld4lane.v8i8 intrinsic on lane 0 with undef
; input vectors, splats lane 0 of each of the four results (all-zero shuffle
; mask) and packs them into %struct.int8x8x4_t. %a is already an i8*, so it is
; passed without a bitcast. The expected output is one ld4r on four .8b
; registers addressed through [x0].
665 define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
666 ; CHECK-LABEL: test_vld4_dup_s8
667 ; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
; NOTE(review): the final i32 operand (1) looks like an alignment hint - confirm.
669 %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
670 %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
671 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
672 %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
673 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
674 %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
675 %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
676 %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
677 %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
678 %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
679 %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
680 %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
681 %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
682 ret %struct.int8x8x4_t %.fca.0.3.insert
; test_vld4_dup_s16: calls the vld4lane.v4i16 intrinsic on lane 0 with undef
; input vectors, splats lane 0 of each of the four results (all-zero shuffle
; mask) and packs them into %struct.int16x4x4_t. The expected output is one
; ld4r on four .4h registers addressed through [x0].
685 define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
686 ; CHECK-LABEL: test_vld4_dup_s16
687 ; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
689 %0 = bitcast i16* %a to i8*
; NOTE(review): the final i32 operand (2) looks like an alignment hint - confirm.
690 %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
691 %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
692 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
693 %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
694 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
695 %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
696 %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
697 %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
698 %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
699 %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
700 %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
701 %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
702 %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
703 ret %struct.int16x4x4_t %.fca.0.3.insert
; test_vld4_dup_s32: calls the vld4lane.v2i32 intrinsic on lane 0 with undef
; input vectors, splats lane 0 of each of the four results (all-zero shuffle
; mask) and packs them into %struct.int32x2x4_t. The expected output is one
; ld4r on four .2s registers addressed through [x0].
706 define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
707 ; CHECK-LABEL: test_vld4_dup_s32
708 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
710 %0 = bitcast i32* %a to i8*
; NOTE(review): the final i32 operand (4) looks like an alignment hint - confirm.
711 %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
712 %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
713 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
714 %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
715 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
716 %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
717 %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
718 %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
719 %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
720 %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
721 %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
722 %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
723 %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
724 ret %struct.int32x2x4_t %.fca.0.3.insert
; test_vld4_dup_s64: this test calls the plain (non-lane) vld4.v1i64 intrinsic
; and uses no shufflevector; the four <1 x i64> results are copied straight
; into %struct.int64x1x4_t. The expected output is an ld1 (not ld4r) listing
; four .1d registers, addressed through [x0].
727 define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
728 ; CHECK-LABEL: test_vld4_dup_s64
729 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
731 %0 = bitcast i64* %a to i8*
; NOTE(review): the final i32 operand (8) looks like an alignment hint - confirm.
732 %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
733 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
734 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
735 %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
736 %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
737 %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
738 %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
739 %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
740 %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
741 ret %struct.int64x1x4_t %.fca.0.3.insert
; test_vld4_dup_f32: calls the vld4lane.v2f32 intrinsic on lane 0 with undef
; input vectors, splats lane 0 of each of the four results (all-zero shuffle
; mask) and packs them into %struct.float32x2x4_t. The expected output is one
; ld4r on four .2s registers addressed through [x0].
744 define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
745 ; CHECK-LABEL: test_vld4_dup_f32
746 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
748 %0 = bitcast float* %a to i8*
; NOTE(review): the final i32 operand (4) looks like an alignment hint - confirm.
749 %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
750 %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
751 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
752 %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
753 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
754 %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
755 %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
756 %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
757 %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
758 %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
759 %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
760 %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
761 %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
762 ret %struct.float32x2x4_t %.fca.0.3.insert
; test_vld4_dup_f64: this test calls the plain (non-lane) vld4.v1f64 intrinsic
; and uses no shufflevector; the four <1 x double> results are copied straight
; into %struct.float64x1x4_t. The expected output is an ld1 (not ld4r) listing
; four .1d registers, addressed through [x0].
765 define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
766 ; CHECK-LABEL: test_vld4_dup_f64
767 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
769 %0 = bitcast double* %a to i8*
; NOTE(review): the final i32 operand (8) looks like an alignment hint - confirm.
770 %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
771 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
772 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
773 %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
774 %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
775 %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
776 %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
777 %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
778 %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
779 ret %struct.float64x1x4_t %.fca.0.3.insert
; test_vld1q_lane_s8: loads one i8 from %a (align 1) and inserts it into lane
; 15 of %b. The expected output is a single-lane ld1 on a .b element addressed
; through [x0]; the pattern accepts any numeric lane index.
782 define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
783 ; CHECK-LABEL: test_vld1q_lane_s8
784 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
786 %0 = load i8* %a, align 1
787 %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
788 ret <16 x i8> %vld1_lane
; test_vld1q_lane_s16: loads one i16 from %a (align 2) and inserts it into
; lane 7 of %b. The expected output is a single-lane ld1 on a .h element
; addressed through [x0]; the pattern accepts any numeric lane index.
791 define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
792 ; CHECK-LABEL: test_vld1q_lane_s16
793 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
795 %0 = load i16* %a, align 2
796 %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
797 ret <8 x i16> %vld1_lane
; test_vld1q_lane_s32: loads one i32 from %a (align 4) and inserts it into
; lane 3 of %b. The expected output is a single-lane ld1 on a .s element
; addressed through [x0]; the pattern accepts any numeric lane index.
800 define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
801 ; CHECK-LABEL: test_vld1q_lane_s32
802 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
804 %0 = load i32* %a, align 4
805 %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
806 ret <4 x i32> %vld1_lane
; test_vld1q_lane_s64: loads one i64 from %a (align 8) and inserts it into
; lane 1 of %b. The expected output is a single-lane ld1 on a .d element
; addressed through [x0]; the pattern accepts any numeric lane index.
809 define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
810 ; CHECK-LABEL: test_vld1q_lane_s64
811 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
813 %0 = load i64* %a, align 8
814 %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
815 ret <2 x i64> %vld1_lane
; test_vld1q_lane_f32: loads one float from %a (align 4) and inserts it into
; lane 3 of %b. The expected output is a single-lane ld1 on a .s element
; addressed through [x0]; the pattern accepts any numeric lane index.
818 define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
819 ; CHECK-LABEL: test_vld1q_lane_f32
820 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
822 %0 = load float* %a, align 4
823 %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
824 ret <4 x float> %vld1_lane
; test_vld1q_lane_f64: loads one double from %a (align 8) and inserts it into
; lane 1 of %b. The expected output is a single-lane ld1 on a .d element
; addressed through [x0]; the pattern accepts any numeric lane index.
827 define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
828 ; CHECK-LABEL: test_vld1q_lane_f64
829 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
831 %0 = load double* %a, align 8
832 %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
833 ret <2 x double> %vld1_lane
; test_vld1_lane_s8: loads one i8 from %a (align 1) and inserts it into lane 7
; of the 8-element vector %b. The expected output is a single-lane ld1 on a .b
; element addressed through [x0]; the pattern accepts any numeric lane index.
836 define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
837 ; CHECK-LABEL: test_vld1_lane_s8
838 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
840 %0 = load i8* %a, align 1
841 %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
842 ret <8 x i8> %vld1_lane
; test_vld1_lane_s16: loads one i16 from %a (align 2) and inserts it into lane
; 3 of the 4-element vector %b. The expected output is a single-lane ld1 on a
; .h element addressed through [x0]; the pattern accepts any numeric lane index.
845 define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
846 ; CHECK-LABEL: test_vld1_lane_s16
847 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
849 %0 = load i16* %a, align 2
850 %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
851 ret <4 x i16> %vld1_lane
; test_vld1_lane_s32: loads one i32 from %a (align 4) and inserts it into lane
; 1 of the 2-element vector %b. The expected output is a single-lane ld1 on a
; .s element addressed through [x0]; the pattern accepts any numeric lane index.
854 define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
855 ; CHECK-LABEL: test_vld1_lane_s32
856 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
858 %0 = load i32* %a, align 4
859 %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
860 ret <2 x i32> %vld1_lane
; test_vld1_lane_s64: loads one i64 from %a (align 8) and inserts it into lane
; 0 of an undef <1 x i64>; the %b argument is not used by the body. The
; expected output is ld1r on a .1d register addressed through [x0], not a
; single-lane ld1.
863 define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
864 ; CHECK-LABEL: test_vld1_lane_s64
865 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
867 %0 = load i64* %a, align 8
868 %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
869 ret <1 x i64> %vld1_lane
; test_vld1_lane_f32: loads one float from %a (align 4) and inserts it into
; lane 1 of the 2-element vector %b. The expected output is a single-lane ld1
; on a .s element addressed through [x0]; the pattern accepts any numeric lane
; index.
872 define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
873 ; CHECK-LABEL: test_vld1_lane_f32
874 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
876 %0 = load float* %a, align 4
877 %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
878 ret <2 x float> %vld1_lane
; test_vld1_lane_f64: loads one double from %a (align 8) and inserts it into
; lane 0 of an undef <1 x double>; the %b argument is not used by the body.
; The expected output is ld1r on a .1d register addressed through [x0], not a
; single-lane ld1.
881 define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
882 ; CHECK-LABEL: test_vld1_lane_f64
883 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
885 %0 = load double* %a, align 8
886 %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
887 ret <1 x double> %vld1_lane
; test_vld2q_lane_s16: unpacks the two <8 x i16> vectors from %b.coerce, passes
; them to the vld2lane.v8i16 intrinsic with lane index 7, and repacks both
; results into %struct.int16x8x2_t. The expected output is a two-register
; single-lane ld2 on .h elements addressed through [x0].
890 define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
891 ; CHECK-LABEL: test_vld2q_lane_s16
892 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
894 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
895 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
896 %0 = bitcast i16* %a to i8*
; NOTE(review): the final i32 operand (2) looks like an alignment hint - confirm.
897 %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
898 %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
899 %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
900 %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
901 %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
902 ret %struct.int16x8x2_t %.fca.0.1.insert
; test_vld2q_lane_s32: unpacks the two <4 x i32> vectors from %b.coerce, passes
; them to the vld2lane.v4i32 intrinsic with lane index 3, and repacks both
; results into %struct.int32x4x2_t. The expected output is a two-register
; single-lane ld2 on .s elements addressed through [x0].
905 define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
906 ; CHECK-LABEL: test_vld2q_lane_s32
907 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
909 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
910 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
911 %0 = bitcast i32* %a to i8*
; NOTE(review): the final i32 operand (4) looks like an alignment hint - confirm.
912 %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
913 %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
914 %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
915 %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
916 %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
917 ret %struct.int32x4x2_t %.fca.0.1.insert
; test_vld2q_lane_s64: unpacks the two <2 x i64> vectors from %b.coerce, passes
; them to the vld2lane.v2i64 intrinsic with lane index 1, and repacks both
; results into %struct.int64x2x2_t. The expected output is a two-register
; single-lane ld2 on .d elements addressed through [x0].
920 define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
921 ; CHECK-LABEL: test_vld2q_lane_s64
922 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
924 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
925 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
926 %0 = bitcast i64* %a to i8*
; NOTE(review): the final i32 operand (8) looks like an alignment hint - confirm.
927 %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
928 %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
929 %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
930 %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
931 %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
932 ret %struct.int64x2x2_t %.fca.0.1.insert
; test_vld2q_lane_f32: unpacks the two <4 x float> vectors from %b.coerce,
; passes them to the vld2lane.v4f32 intrinsic with lane index 3, and repacks
; both results into %struct.float32x4x2_t. The expected output is a
; two-register single-lane ld2 on .s elements addressed through [x0].
935 define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
936 ; CHECK-LABEL: test_vld2q_lane_f32
937 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
939 %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
940 %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
941 %0 = bitcast float* %a to i8*
; NOTE(review): the final i32 operand (4) looks like an alignment hint - confirm.
942 %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
943 %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
944 %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
945 %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
946 %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
947 ret %struct.float32x4x2_t %.fca.0.1.insert
; test_vld2q_lane_f64: unpacks the two <2 x double> vectors from %b.coerce,
; passes them to the vld2lane.v2f64 intrinsic with lane index 1, and repacks
; both results into %struct.float64x2x2_t. The expected output is a
; two-register single-lane ld2 on .d elements addressed through [x0].
950 define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
951 ; CHECK-LABEL: test_vld2q_lane_f64
952 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
954 %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
955 %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
956 %0 = bitcast double* %a to i8*
; NOTE(review): the final i32 operand (8) looks like an alignment hint - confirm.
957 %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
958 %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
959 %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
960 %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
961 %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
962 ret %struct.float64x2x2_t %.fca.0.1.insert
; test_vld2_lane_s8: unpacks the two <8 x i8> vectors from %b.coerce, passes
; them to the vld2lane.v8i8 intrinsic with lane index 7, and repacks both
; results into %struct.int8x8x2_t. %a is already an i8*, so it is passed
; without a bitcast. The expected output is a two-register single-lane ld2 on
; .b elements addressed through [x0].
965 define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
966 ; CHECK-LABEL: test_vld2_lane_s8
967 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
969 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
970 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
; NOTE(review): the final i32 operand (1) looks like an alignment hint - confirm.
971 %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
972 %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
973 %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
974 %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
975 %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
976 ret %struct.int8x8x2_t %.fca.0.1.insert
; test_vld2_lane_s16: unpacks the two <4 x i16> vectors from %b.coerce, passes
; them to the vld2lane.v4i16 intrinsic with lane index 3, and repacks both
; results into %struct.int16x4x2_t. The expected output is a two-register
; single-lane ld2 on .h elements addressed through [x0].
979 define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
980 ; CHECK-LABEL: test_vld2_lane_s16
981 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
983 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
984 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
985 %0 = bitcast i16* %a to i8*
; NOTE(review): the final i32 operand (2) looks like an alignment hint - confirm.
986 %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
987 %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
988 %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
989 %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
990 %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
991 ret %struct.int16x4x2_t %.fca.0.1.insert
; test_vld2_lane_s32: unpacks the two <2 x i32> vectors from %b.coerce, passes
; them to the vld2lane.v2i32 intrinsic with lane index 1, and repacks both
; results into %struct.int32x2x2_t. The expected output is a two-register
; single-lane ld2 on .s elements addressed through [x0].
994 define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
995 ; CHECK-LABEL: test_vld2_lane_s32
996 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
998 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
999 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
1000 %0 = bitcast i32* %a to i8*
; NOTE(review): the final i32 operand (4) looks like an alignment hint - confirm.
1001 %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
1002 %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
1003 %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
1004 %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
1005 %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
1006 ret %struct.int32x2x2_t %.fca.0.1.insert
; test_vld2_lane_s64: unpacks the two <1 x i64> vectors from %b.coerce, passes
; them to the vld2lane.v1i64 intrinsic with lane index 0 (the only lane of a
; one-element vector), and repacks both results into %struct.int64x1x2_t. The
; expected output is a two-register single-lane ld2 on .d elements addressed
; through [x0].
1009 define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
1010 ; CHECK-LABEL: test_vld2_lane_s64
1011 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1013 %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
1014 %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
1015 %0 = bitcast i64* %a to i8*
; NOTE(review): the final i32 operand (8) looks like an alignment hint - confirm.
1016 %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
1017 %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
1018 %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
1019 %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
1020 %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
1021 ret %struct.int64x1x2_t %.fca.0.1.insert
; test_vld2_lane_f32: unpacks the two <2 x float> vectors from %b.coerce,
; passes them to the vld2lane.v2f32 intrinsic with lane index 1, and repacks
; both results into %struct.float32x2x2_t. The expected output is a
; two-register single-lane ld2 on .s elements addressed through [x0].
1024 define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
1025 ; CHECK-LABEL: test_vld2_lane_f32
1026 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1028 %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
1029 %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
1030 %0 = bitcast float* %a to i8*
; NOTE(review): the final i32 operand (4) looks like an alignment hint - confirm.
1031 %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
1032 %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
1033 %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
1034 %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
1035 %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
1036 ret %struct.float32x2x2_t %.fca.0.1.insert
; test_vld2_lane_f64: unpacks the two <1 x double> vectors from %b.coerce,
; passes them to the vld2lane.v1f64 intrinsic with lane index 0 (the only lane
; of a one-element vector), and repacks both results into
; %struct.float64x1x2_t. The expected output is a two-register single-lane ld2
; on .d elements addressed through [x0].
1039 define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
1040 ; CHECK-LABEL: test_vld2_lane_f64
1041 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1043 %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
1044 %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
1045 %0 = bitcast double* %a to i8*
; NOTE(review): the final i32 operand (8) looks like an alignment hint - confirm.
1046 %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
1047 %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
1048 %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
1049 %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
1050 %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
1051 ret %struct.float64x1x2_t %.fca.0.1.insert
; test_vld3q_lane_s16: unpacks the three <8 x i16> vectors from %b.coerce,
; passes them to the vld3lane.v8i16 intrinsic with lane index 7, and repacks
; the three results into %struct.int16x8x3_t. The expected output is a
; three-register single-lane ld3 on .h elements addressed through [x0].
1054 define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
1055 ; CHECK-LABEL: test_vld3q_lane_s16
1056 ; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1058 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
1059 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
1060 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
1061 %0 = bitcast i16* %a to i8*
; NOTE(review): the final i32 operand (2) looks like an alignment hint - confirm.
1062 %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
1063 %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
1064 %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
1065 %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
1066 %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
1067 %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
1068 %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
1069 ret %struct.int16x8x3_t %.fca.0.2.insert
; test_vld3q_lane_s32: unpacks the three <4 x i32> vectors from %b.coerce,
; passes them to the vld3lane.v4i32 intrinsic with lane index 3, and repacks
; the three results into %struct.int32x4x3_t. The expected output is a
; three-register single-lane ld3 on .s elements addressed through [x0].
1072 define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
1073 ; CHECK-LABEL: test_vld3q_lane_s32
1074 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1076 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
1077 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
1078 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
1079 %0 = bitcast i32* %a to i8*
; NOTE(review): the final i32 operand (4) looks like an alignment hint - confirm.
1080 %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
1081 %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
1082 %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
1083 %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
1084 %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
1085 %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
1086 %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
1087 ret %struct.int32x4x3_t %.fca.0.2.insert
; test_vld3q_lane_s64: unpacks the three <2 x i64> vectors from %b.coerce,
; passes them to the vld3lane.v2i64 intrinsic with lane index 1, and repacks
; the three results into %struct.int64x2x3_t. The expected output is a
; three-register single-lane ld3 on .d elements addressed through [x0].
1090 define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
1091 ; CHECK-LABEL: test_vld3q_lane_s64
1092 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1094 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
1095 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
1096 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
1097 %0 = bitcast i64* %a to i8*
; NOTE(review): the final i32 operand (8) looks like an alignment hint - confirm.
1098 %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
1099 %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
1100 %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
1101 %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
1102 %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
1103 %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
1104 %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
1105 ret %struct.int64x2x3_t %.fca.0.2.insert
; test_vld3q_lane_f32: unpacks the three <4 x float> members of %b.coerce,
; casts %a to i8*, and calls @llvm.arm.neon.vld3lane.v4f32 with trailing
; operands i32 3 and i32 4 (presumably lane index and alignment -- confirm).
; The three results are repacked into %struct.float32x4x3_t. The FileCheck
; pattern below expects a lane-indexed ld3 on three .s registers with base
; address [x0].
1108 define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
1109 ; CHECK-LABEL: test_vld3q_lane_f32
1110 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1112 %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
1113 %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
1114 %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
1115 %0 = bitcast float* %a to i8*
1116 %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
1117 %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
1118 %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
1119 %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
1120 %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
1121 %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
1122 %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
1123 ret %struct.float32x4x3_t %.fca.0.2.insert
; test_vld3q_lane_f64: unpacks the three <2 x double> members of %b.coerce,
; casts %a to i8*, and calls @llvm.arm.neon.vld3lane.v2f64 with trailing
; operands i32 1 and i32 8 (presumably lane index and alignment -- confirm).
; The three results are repacked into %struct.float64x2x3_t. The FileCheck
; pattern below expects a lane-indexed ld3 on three .d registers with base
; address [x0].
1126 define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
1127 ; CHECK-LABEL: test_vld3q_lane_f64
1128 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1130 %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
1131 %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
1132 %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
1133 %0 = bitcast double* %a to i8*
1134 %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
1135 %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
1136 %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
1137 %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
1138 %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
1139 %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
1140 %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
1141 ret %struct.float64x2x3_t %.fca.0.2.insert
; test_vld3_lane_s8: unpacks the three <8 x i8> members of %b.coerce and calls
; @llvm.arm.neon.vld3lane.v8i8 on %a directly (already i8*, so no bitcast),
; with trailing operands i32 7 and i32 1 (presumably lane index and
; alignment -- confirm). The three results are repacked into
; %struct.int8x8x3_t. The FileCheck pattern below expects a lane-indexed ld3
; on three .b registers with base address [x0].
1144 define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
1145 ; CHECK-LABEL: test_vld3_lane_s8
1146 ; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1148 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
1149 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
1150 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
1151 %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
1152 %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
1153 %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
1154 %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
1155 %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
1156 %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
1157 %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
1158 ret %struct.int8x8x3_t %.fca.0.2.insert
; test_vld3_lane_s16: unpacks the three <4 x i16> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld3lane.v4i16 with trailing operands
; i32 3 and i32 2 (presumably lane index and alignment -- confirm). The three
; results are repacked into %struct.int16x4x3_t. The FileCheck pattern below
; expects a lane-indexed ld3 on three .h registers with base address [x0].
1161 define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
1162 ; CHECK-LABEL: test_vld3_lane_s16
1163 ; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1165 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
1166 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
1167 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
1168 %0 = bitcast i16* %a to i8*
1169 %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
1170 %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
1171 %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
1172 %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
1173 %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
1174 %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
1175 %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
1176 ret %struct.int16x4x3_t %.fca.0.2.insert
; test_vld3_lane_s32: unpacks the three <2 x i32> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld3lane.v2i32 with trailing operands
; i32 1 and i32 4 (presumably lane index and alignment -- confirm). The three
; results are repacked into %struct.int32x2x3_t. The FileCheck pattern below
; expects a lane-indexed ld3 on three .s registers with base address [x0].
1179 define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
1180 ; CHECK-LABEL: test_vld3_lane_s32
1181 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1183 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
1184 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
1185 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
1186 %0 = bitcast i32* %a to i8*
1187 %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
1188 %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
1189 %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
1190 %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
1191 %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
1192 %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
1193 %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
1194 ret %struct.int32x2x3_t %.fca.0.2.insert
; test_vld3_lane_s64: unpacks the three <1 x i64> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld3lane.v1i64 with trailing operands
; i32 0 and i32 8 (presumably lane index and alignment -- confirm). The three
; results are repacked into %struct.int64x1x3_t. The FileCheck pattern below
; expects a lane-indexed ld3 on three .d registers with base address [x0].
1197 define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
1198 ; CHECK-LABEL: test_vld3_lane_s64
1199 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1201 %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
1202 %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
1203 %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
1204 %0 = bitcast i64* %a to i8*
1205 %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
1206 %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
1207 %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
1208 %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
1209 %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
1210 %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
1211 %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
1212 ret %struct.int64x1x3_t %.fca.0.2.insert
; test_vld3_lane_f32: unpacks the three <2 x float> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld3lane.v2f32 with trailing operands
; i32 1 and i32 4 (presumably lane index and alignment -- confirm). The three
; results are repacked into %struct.float32x2x3_t. The FileCheck pattern below
; expects a lane-indexed ld3 on three .s registers with base address [x0].
1215 define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
1216 ; CHECK-LABEL: test_vld3_lane_f32
1217 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1219 %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
1220 %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
1221 %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
1222 %0 = bitcast float* %a to i8*
1223 %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
1224 %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
1225 %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
1226 %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
1227 %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
1228 %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
1229 %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
1230 ret %struct.float32x2x3_t %.fca.0.2.insert
; test_vld3_lane_f64: unpacks the three <1 x double> members of %b.coerce,
; casts %a to i8*, and calls @llvm.arm.neon.vld3lane.v1f64 with trailing
; operands i32 0 and i32 8 (presumably lane index and alignment -- confirm).
; The three results are repacked into %struct.float64x1x3_t. The FileCheck
; pattern below expects a lane-indexed ld3 on three .d registers with base
; address [x0].
1233 define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
1234 ; CHECK-LABEL: test_vld3_lane_f64
1235 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1237 %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
1238 %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
1239 %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
1240 %0 = bitcast double* %a to i8*
1241 %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
1242 %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
1243 %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
1244 %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
1245 %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
1246 %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
1247 %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
1248 ret %struct.float64x1x3_t %.fca.0.2.insert
; test_vld4q_lane_s8: unpacks the four <16 x i8> members of %b.coerce and calls
; @llvm.arm.neon.vld4lane.v16i8 on %a directly (already i8*, so no bitcast),
; with trailing operands i32 15 and i32 1 (presumably lane index and
; alignment -- confirm). The four results are repacked into
; %struct.int8x16x4_t. The FileCheck pattern below expects a lane-indexed ld4
; on four .b registers with base address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1251 define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
1252 ; CHECK-LABEL: test_vld4q_lane_s8
1253 ; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1255 %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
1256 %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1257 %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1258 %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
1259 %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
1260 %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
1261 %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
1262 %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
1263 %vld3_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 3
1264 %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
1265 %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
1266 %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
1267 %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld3_lane.fca.3.extract, 0, 3
1268 ret %struct.int8x16x4_t %.fca.0.3.insert
; test_vld4q_lane_s16: unpacks the four <8 x i16> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld4lane.v8i16 with trailing operands
; i32 7 and i32 2 (presumably lane index and alignment -- confirm). The four
; results are repacked into %struct.int16x8x4_t. The FileCheck pattern below
; expects a lane-indexed ld4 on four .h registers with base address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1271 define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
1272 ; CHECK-LABEL: test_vld4q_lane_s16
1273 ; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1275 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
1276 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
1277 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
1278 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
1279 %0 = bitcast i16* %a to i8*
1280 %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
1281 %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
1282 %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
1283 %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
1284 %vld3_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 3
1285 %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
1286 %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
1287 %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
1288 %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld3_lane.fca.3.extract, 0, 3
1289 ret %struct.int16x8x4_t %.fca.0.3.insert
; test_vld4q_lane_s32: unpacks the four <4 x i32> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld4lane.v4i32 with trailing operands
; i32 3 and i32 4 (presumably lane index and alignment -- confirm). The four
; results are repacked into %struct.int32x4x4_t. The FileCheck pattern below
; expects a lane-indexed ld4 on four .s registers with base address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1292 define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
1293 ; CHECK-LABEL: test_vld4q_lane_s32
1294 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1296 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
1297 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
1298 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
1299 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
1300 %0 = bitcast i32* %a to i8*
1301 %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
1302 %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
1303 %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
1304 %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
1305 %vld3_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 3
1306 %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
1307 %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
1308 %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
1309 %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld3_lane.fca.3.extract, 0, 3
1310 ret %struct.int32x4x4_t %.fca.0.3.insert
; test_vld4q_lane_s64: unpacks the four <2 x i64> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld4lane.v2i64 with trailing operands
; i32 1 and i32 8 (presumably lane index and alignment -- confirm). The four
; results are repacked into %struct.int64x2x4_t. The FileCheck pattern below
; expects a lane-indexed ld4 on four .d registers with base address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1313 define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
1314 ; CHECK-LABEL: test_vld4q_lane_s64
1315 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1317 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
1318 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
1319 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
1320 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
1321 %0 = bitcast i64* %a to i8*
1322 %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
1323 %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
1324 %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
1325 %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
1326 %vld3_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 3
1327 %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
1328 %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
1329 %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
1330 %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld3_lane.fca.3.extract, 0, 3
1331 ret %struct.int64x2x4_t %.fca.0.3.insert
; test_vld4q_lane_f32: unpacks the four <4 x float> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld4lane.v4f32 with trailing operands
; i32 3 and i32 4 (presumably lane index and alignment -- confirm). The four
; results are repacked into %struct.float32x4x4_t. The FileCheck pattern below
; expects a lane-indexed ld4 on four .s registers with base address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1334 define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
1335 ; CHECK-LABEL: test_vld4q_lane_f32
1336 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1338 %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
1339 %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
1340 %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
1341 %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
1342 %0 = bitcast float* %a to i8*
1343 %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
1344 %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
1345 %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
1346 %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
1347 %vld3_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 3
1348 %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
1349 %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
1350 %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
1351 %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld3_lane.fca.3.extract, 0, 3
1352 ret %struct.float32x4x4_t %.fca.0.3.insert
; test_vld4q_lane_f64: unpacks the four <2 x double> members of %b.coerce,
; casts %a to i8*, and calls @llvm.arm.neon.vld4lane.v2f64 with trailing
; operands i32 1 and i32 8 (presumably lane index and alignment -- confirm).
; The four results are repacked into %struct.float64x2x4_t. The FileCheck
; pattern below expects a lane-indexed ld4 on four .d registers with base
; address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1355 define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
1356 ; CHECK-LABEL: test_vld4q_lane_f64
1357 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1359 %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
1360 %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
1361 %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
1362 %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
1363 %0 = bitcast double* %a to i8*
1364 %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
1365 %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
1366 %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
1367 %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
1368 %vld3_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 3
1369 %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
1370 %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
1371 %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
1372 %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld3_lane.fca.3.extract, 0, 3
1373 ret %struct.float64x2x4_t %.fca.0.3.insert
; test_vld4_lane_s8: unpacks the four <8 x i8> members of %b.coerce and calls
; @llvm.arm.neon.vld4lane.v8i8 on %a directly (already i8*, so no bitcast),
; with trailing operands i32 7 and i32 1 (presumably lane index and
; alignment -- confirm). The four results are repacked into
; %struct.int8x8x4_t. The FileCheck pattern below expects a lane-indexed ld4
; on four .b registers with base address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1376 define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
1377 ; CHECK-LABEL: test_vld4_lane_s8
1378 ; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1380 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
1381 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
1382 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
1383 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
1384 %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
1385 %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
1386 %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
1387 %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
1388 %vld3_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 3
1389 %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
1390 %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
1391 %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
1392 %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld3_lane.fca.3.extract, 0, 3
1393 ret %struct.int8x8x4_t %.fca.0.3.insert
; test_vld4_lane_s16: unpacks the four <4 x i16> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld4lane.v4i16 with trailing operands
; i32 3 and i32 2 (presumably lane index and alignment -- confirm). The four
; results are repacked into %struct.int16x4x4_t. The FileCheck pattern below
; expects a lane-indexed ld4 on four .h registers with base address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1396 define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
1397 ; CHECK-LABEL: test_vld4_lane_s16
1398 ; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1400 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
1401 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
1402 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
1403 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
1404 %0 = bitcast i16* %a to i8*
1405 %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
1406 %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
1407 %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
1408 %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
1409 %vld3_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 3
1410 %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
1411 %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
1412 %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
1413 %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld3_lane.fca.3.extract, 0, 3
1414 ret %struct.int16x4x4_t %.fca.0.3.insert
; test_vld4_lane_s32: unpacks the four <2 x i32> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld4lane.v2i32 with trailing operands
; i32 1 and i32 4 (presumably lane index and alignment -- confirm). The four
; results are repacked into %struct.int32x2x4_t. The FileCheck pattern below
; expects a lane-indexed ld4 on four .s registers with base address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1417 define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
1418 ; CHECK-LABEL: test_vld4_lane_s32
1419 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1421 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
1422 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
1423 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
1424 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
1425 %0 = bitcast i32* %a to i8*
1426 %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
1427 %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
1428 %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
1429 %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
1430 %vld3_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 3
1431 %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
1432 %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
1433 %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
1434 %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld3_lane.fca.3.extract, 0, 3
1435 ret %struct.int32x2x4_t %.fca.0.3.insert
; test_vld4_lane_s64: unpacks the four <1 x i64> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld4lane.v1i64 with trailing operands
; i32 0 and i32 8 (presumably lane index and alignment -- confirm). The four
; results are repacked into %struct.int64x1x4_t. The FileCheck pattern below
; expects a lane-indexed ld4 on four .d registers with base address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1438 define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
1439 ; CHECK-LABEL: test_vld4_lane_s64
1440 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1442 %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
1443 %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
1444 %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
1445 %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
1446 %0 = bitcast i64* %a to i8*
1447 %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
1448 %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
1449 %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
1450 %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
1451 %vld3_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 3
1452 %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
1453 %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
1454 %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
1455 %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld3_lane.fca.3.extract, 0, 3
1456 ret %struct.int64x1x4_t %.fca.0.3.insert
; test_vld4_lane_f32: unpacks the four <2 x float> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld4lane.v2f32 with trailing operands
; i32 1 and i32 4 (presumably lane index and alignment -- confirm). The four
; results are repacked into %struct.float32x2x4_t. The FileCheck pattern below
; expects a lane-indexed ld4 on four .s registers with base address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1459 define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
1460 ; CHECK-LABEL: test_vld4_lane_f32
1461 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1463 %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
1464 %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
1465 %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
1466 %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
1467 %0 = bitcast float* %a to i8*
1468 %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
1469 %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
1470 %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
1471 %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
1472 %vld3_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 3
1473 %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
1474 %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
1475 %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
1476 %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld3_lane.fca.3.extract, 0, 3
1477 ret %struct.float32x2x4_t %.fca.0.3.insert
; test_vld4_lane_f64: unpacks the four <1 x double> members of %b.coerce, casts
; %a to i8*, and calls @llvm.arm.neon.vld4lane.v1f64 with trailing operands
; i32 0 and i32 8 (presumably lane index and alignment -- confirm). The four
; results are repacked into %struct.float64x1x4_t. The FileCheck pattern below
; expects a lane-indexed ld4 on four .d registers with base address [x0].
; NOTE: the call result is named %vld3_lane although it holds the vld4lane
; result; the name is only an SSA label.
1480 define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
1481 ; CHECK-LABEL: test_vld4_lane_f64
1482 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1484 %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
1485 %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
1486 %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
1487 %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
1488 %0 = bitcast double* %a to i8*
1489 %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
1490 %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
1491 %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
1492 %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
1493 %vld3_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 3
1494 %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
1495 %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
1496 %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
1497 %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld3_lane.fca.3.extract, 0, 3
1498 ret %struct.float64x1x4_t %.fca.0.3.insert
; Single-lane store: extracts element 15 of the <16 x i8> argument and stores
; it to %a with align 1. The pattern below expects one st1 on a .b lane
; addressed through x0.
define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
; CHECK-LABEL: test_vst1q_lane_s8
; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <16 x i8> %b, i32 15
  store i8 %0, i8* %a, align 1
  ret void
}
; Single-lane store: extracts element 7 of the <8 x i16> argument and stores
; it to %a with align 2. The pattern below expects one st1 on a .h lane
; addressed through x0.
define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
; CHECK-LABEL: test_vst1q_lane_s16
; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <8 x i16> %b, i32 7
  store i16 %0, i16* %a, align 2
  ret void
}
; Single-lane store: extracts element 3 of the <4 x i32> argument and stores
; it to %a with align 4. The pattern below expects one st1 on a .s lane
; addressed through x0.
define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
; CHECK-LABEL: test_vst1q_lane_s32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <4 x i32> %b, i32 3
  store i32 %0, i32* %a, align 4
  ret void
}
; Single-lane store: extracts element 1 of the <2 x i64> argument and stores
; it to %a with align 8. The pattern below expects one st1 on a .d lane
; addressed through x0.
define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
; CHECK-LABEL: test_vst1q_lane_s64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <2 x i64> %b, i32 1
  store i64 %0, i64* %a, align 8
  ret void
}
; Single-lane store: extracts element 3 of the <4 x float> argument and stores
; it to %a with align 4. The pattern below expects one st1 on a .s lane
; addressed through x0.
define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
; CHECK-LABEL: test_vst1q_lane_f32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <4 x float> %b, i32 3
  store float %0, float* %a, align 4
  ret void
}
; Single-lane store: extracts element 1 of the <2 x double> argument and
; stores it to %a with align 8. The pattern below expects one st1 on a .d lane
; addressed through x0.
define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
; CHECK-LABEL: test_vst1q_lane_f64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <2 x double> %b, i32 1
  store double %0, double* %a, align 8
  ret void
}
; Single-lane store: extracts element 7 of the <8 x i8> argument and stores it
; to %a with align 1. The pattern below expects one st1 on a .b lane addressed
; through x0.
define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
; CHECK-LABEL: test_vst1_lane_s8
; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <8 x i8> %b, i32 7
  store i8 %0, i8* %a, align 1
  ret void
}
; Single-lane store: extracts element 3 of the <4 x i16> argument and stores
; it to %a with align 2. The pattern below expects one st1 on a .h lane
; addressed through x0.
define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
; CHECK-LABEL: test_vst1_lane_s16
; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <4 x i16> %b, i32 3
  store i16 %0, i16* %a, align 2
  ret void
}
; Single-lane store: extracts element 1 of the <2 x i32> argument and stores
; it to %a with align 4. The pattern below expects one st1 on a .s lane
; addressed through x0.
define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
; CHECK-LABEL: test_vst1_lane_s32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <2 x i32> %b, i32 1
  store i32 %0, i32* %a, align 4
  ret void
}
; Single-lane store: extracts the only element (index 0) of the <1 x i64>
; argument and stores it to %a with align 8. The pattern below expects one st1
; on a .d lane addressed through x0.
define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
; CHECK-LABEL: test_vst1_lane_s64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <1 x i64> %b, i32 0
  store i64 %0, i64* %a, align 8
  ret void
}
; Single-lane store: extracts element 1 of the <2 x float> argument and stores
; it to %a with align 4. The pattern below expects one st1 on a .s lane
; addressed through x0.
define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
; CHECK-LABEL: test_vst1_lane_f32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <2 x float> %b, i32 1
  store float %0, float* %a, align 4
  ret void
}
; Single-lane store: extracts the only element (index 0) of the <1 x double>
; argument and stores it to %a with align 8. The pattern below expects one st1
; on a .d lane addressed through x0.
define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
; CHECK-LABEL: test_vst1_lane_f64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %0 = extractelement <1 x double> %b, i32 0
  store double %0, double* %a, align 8
  ret void
}
; Two-register lane store: splits %b.coerce into its two <16 x i8> members and
; passes them to llvm.arm.neon.vst2lane.v16i8 with trailing operands 15 and 1
; (presumably lane index and alignment). The pattern below expects one st2 on
; .b lanes addressed through x0.
define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s8
; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1)
  ret void
}
; Two-register lane store: splits %b.coerce into its two <8 x i16> members and
; passes them to llvm.arm.neon.vst2lane.v8i16 with trailing operands 7 and 2
; (presumably lane index and alignment). The pattern below expects one st2 on
; .h lanes addressed through x0.
define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s16
; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i16* %a to i8*
  tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
  ret void
}
; Two-register lane store: splits %b.coerce into its two <4 x i32> members and
; passes them to llvm.arm.neon.vst2lane.v4i32 with trailing operands 3 and 4
; (presumably lane index and alignment). The pattern below expects one st2 on
; .s lanes addressed through x0.
define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i32* %a to i8*
  tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
  ret void
}
; Two-register lane store: splits %b.coerce into its two <2 x i64> members and
; passes them to llvm.arm.neon.vst2lane.v2i64 with trailing operands 1 and 8
; (presumably lane index and alignment). The pattern below expects one st2 on
; .d lanes addressed through x0.
define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i64* %a to i8*
  tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
  ret void
}
; Two-register lane store: splits %b.coerce into its two <4 x float> members
; and passes them to llvm.arm.neon.vst2lane.v4f32 with trailing operands 3 and
; 4 (presumably lane index and alignment). The pattern below expects one st2
; on .s lanes addressed through x0.
define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_f32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast float* %a to i8*
  tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
  ret void
}
; Two-register lane store: splits %b.coerce into its two <2 x double> members
; and passes them to llvm.arm.neon.vst2lane.v2f64 with trailing operands 1 and
; 8 (presumably lane index and alignment). The pattern below expects one st2
; on .d lanes addressed through x0.
define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_f64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast double* %a to i8*
  tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
  ret void
}
; Two-register lane store: splits %b.coerce into its two <8 x i8> members and
; passes them to llvm.arm.neon.vst2lane.v8i8 with trailing operands 7 and 1
; (presumably lane index and alignment). The pattern below expects one st2 on
; .b lanes addressed through x0.
define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s8
; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
  ret void
}
; Two-register lane store: splits %b.coerce into its two <4 x i16> members and
; passes them to llvm.arm.neon.vst2lane.v4i16 with trailing operands 3 and 2
; (presumably lane index and alignment). The pattern below expects one st2 on
; .h lanes addressed through x0.
define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s16
; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i16* %a to i8*
  tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
  ret void
}
; Two-register lane store: splits %b.coerce into its two <2 x i32> members and
; passes them to llvm.arm.neon.vst2lane.v2i32 with trailing operands 1 and 4
; (presumably lane index and alignment). The pattern below expects one st2 on
; .s lanes addressed through x0.
define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i32* %a to i8*
  tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
  ret void
}
; Two-register lane store: splits %b.coerce into its two <1 x i64> members and
; passes them to llvm.arm.neon.vst2lane.v1i64 with trailing operands 0 and 8
; (presumably lane index and alignment). The pattern below expects one st2 on
; .d lanes addressed through x0.
define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i64* %a to i8*
  tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
  ret void
}
; Two-register lane store: splits %b.coerce into its two <2 x float> members
; and passes them to llvm.arm.neon.vst2lane.v2f32 with trailing operands 1 and
; 4 (presumably lane index and alignment). The pattern below expects one st2
; on .s lanes addressed through x0.
define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_f32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast float* %a to i8*
  tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
  ret void
}
; Two-register lane store: splits %b.coerce into its two <1 x double> members
; and passes them to llvm.arm.neon.vst2lane.v1f64 with trailing operands 0 and
; 8 (presumably lane index and alignment). The pattern below expects one st2
; on .d lanes addressed through x0.
define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_f64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast double* %a to i8*
  tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <16 x i8> members
; and passes them to llvm.arm.neon.vst3lane.v16i8 with trailing operands 15
; and 1 (presumably lane index and alignment). The pattern below expects one
; st3 on .b lanes addressed through x0.
define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s8
; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <8 x i16> members
; and passes them to llvm.arm.neon.vst3lane.v8i16 with trailing operands 7 and
; 2 (presumably lane index and alignment). The pattern below expects one st3
; on .h lanes addressed through x0.
define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s16
; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i16* %a to i8*
  tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <4 x i32> members
; and passes them to llvm.arm.neon.vst3lane.v4i32 with trailing operands 3 and
; 4 (presumably lane index and alignment). The pattern below expects one st3
; on .s lanes addressed through x0.
define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i32* %a to i8*
  tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <2 x i64> members
; and passes them to llvm.arm.neon.vst3lane.v2i64 with trailing operands 1 and
; 8 (presumably lane index and alignment). The pattern below expects one st3
; on .d lanes addressed through x0.
define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i64* %a to i8*
  tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <4 x float>
; members and passes them to llvm.arm.neon.vst3lane.v4f32 with trailing
; operands 3 and 4 (presumably lane index and alignment). The pattern below
; expects one st3 on .s lanes addressed through x0.
define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_f32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast float* %a to i8*
  tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <2 x double>
; members and passes them to llvm.arm.neon.vst3lane.v2f64 with trailing
; operands 1 and 8 (presumably lane index and alignment). The pattern below
; expects one st3 on .d lanes addressed through x0.
define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_f64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast double* %a to i8*
  tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <8 x i8> members
; and passes them to llvm.arm.neon.vst3lane.v8i8 with trailing operands 7 and
; 1 (presumably lane index and alignment). The pattern below expects one st3
; on .b lanes addressed through x0.
define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s8
; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <4 x i16> members
; and passes them to llvm.arm.neon.vst3lane.v4i16 with trailing operands 3 and
; 2 (presumably lane index and alignment). The pattern below expects one st3
; on .h lanes addressed through x0.
define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s16
; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i16* %a to i8*
  tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <2 x i32> members
; and passes them to llvm.arm.neon.vst3lane.v2i32 with trailing operands 1 and
; 4 (presumably lane index and alignment). The pattern below expects one st3
; on .s lanes addressed through x0.
define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i32* %a to i8*
  tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <1 x i64> members
; and passes them to llvm.arm.neon.vst3lane.v1i64 with trailing operands 0 and
; 8 (presumably lane index and alignment). The pattern below expects one st3
; on .d lanes addressed through x0.
define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i64* %a to i8*
  tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <2 x float>
; members and passes them to llvm.arm.neon.vst3lane.v2f32 with trailing
; operands 1 and 4 (presumably lane index and alignment). The pattern below
; expects one st3 on .s lanes addressed through x0.
define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_f32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast float* %a to i8*
  tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
  ret void
}
; Three-register lane store: splits %b.coerce into its three <1 x double>
; members and passes them to llvm.arm.neon.vst3lane.v1f64 with trailing
; operands 0 and 8 (presumably lane index and alignment). The pattern below
; expects one st3 on .d lanes addressed through x0.
define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_f64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast double* %a to i8*
  tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <16 x i8> members
; and passes them to llvm.arm.neon.vst4lane.v16i8 with trailing operands 15
; and 2 (presumably lane index and alignment). The pattern below expects one
; st4 on .b lanes addressed through x0.
; NOTE(review): unlike test_vst2q_lane_s8 and test_vst3q_lane_s8, %a is typed
; i16* here (hence the bitcast) and the last operand is 2, not 1. This looks
; like a copy-paste leftover; the signature is kept as-is - confirm before
; changing it.
define void @test_vst4q_lane_s8(i16* %a, [4 x <16 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s8
; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i16* %a to i8*
  tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %0, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 2)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <8 x i16> members
; and passes them to llvm.arm.neon.vst4lane.v8i16 with trailing operands 7 and
; 2 (presumably lane index and alignment). The pattern below expects one st4
; on .h lanes addressed through x0.
define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s16
; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i16* %a to i8*
  tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <4 x i32> members
; and passes them to llvm.arm.neon.vst4lane.v4i32 with trailing operands 3 and
; 4 (presumably lane index and alignment). The pattern below expects one st4
; on .s lanes addressed through x0.
define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s32
; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i32* %a to i8*
  tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <2 x i64> members
; and passes them to llvm.arm.neon.vst4lane.v2i64 with trailing operands 1 and
; 8 (presumably lane index and alignment). The pattern below expects one st4
; on .d lanes addressed through x0.
define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s64
; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i64* %a to i8*
  tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <4 x float> members
; and passes them to llvm.arm.neon.vst4lane.v4f32 with trailing operands 3 and
; 4 (presumably lane index and alignment). The pattern below expects one st4
; on .s lanes addressed through x0.
define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_f32
; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast float* %a to i8*
  tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <2 x double>
; members and passes them to llvm.arm.neon.vst4lane.v2f64 with trailing
; operands 1 and 8 (presumably lane index and alignment). The pattern below
; expects one st4 on .d lanes addressed through x0.
define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_f64
; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast double* %a to i8*
  tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <8 x i8> members
; and passes them to llvm.arm.neon.vst4lane.v8i8 with trailing operands 7 and
; 1 (presumably lane index and alignment). The pattern below expects one st4
; on .b lanes addressed through x0.
define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst4_lane_s8
; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
  tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <4 x i16> members
; and passes them to llvm.arm.neon.vst4lane.v4i16 with trailing operands 3 and
; 2 (presumably lane index and alignment). The pattern below expects one st4
; on .h lanes addressed through x0.
define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst4_lane_s16
; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i16* %a to i8*
  tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <2 x i32> members
; and passes them to llvm.arm.neon.vst4lane.v2i32 with trailing operands 1 and
; 4 (presumably lane index and alignment). The pattern below expects one st4
; on .s lanes addressed through x0.
define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst4_lane_s32
; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i32* %a to i8*
  tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <1 x i64> members
; and passes them to llvm.arm.neon.vst4lane.v1i64 with trailing operands 0 and
; 8 (presumably lane index and alignment). The pattern below expects one st4
; on .d lanes addressed through x0.
define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst4_lane_s64
; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast i64* %a to i8*
  tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <2 x float> members
; and passes them to llvm.arm.neon.vst4lane.v2f32 with trailing operands 1 and
; 4 (presumably lane index and alignment). The pattern below expects one st4
; on .s lanes addressed through x0.
define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vst4_lane_f32
; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast float* %a to i8*
  tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
  ret void
}
; Four-register lane store: splits %b.coerce into its four <1 x double>
; members and passes them to llvm.arm.neon.vst4lane.v1f64 with trailing
; operands 0 and 8 (presumably lane index and alignment). The pattern below
; expects one st4 on .d lanes addressed through x0.
define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vst4_lane_f64
; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
  ; The intrinsic takes an untyped byte pointer.
  %0 = bitcast double* %a to i8*
  tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
  ret void
}
2036 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2037 declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2038 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2039 declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2040 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2041 declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2042 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2043 declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2044 declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2045 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
2046 declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2047 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
2048 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2049 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2050 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2051 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2052 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2053 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2054 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2055 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2056 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2057 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
2058 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2059 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
2060 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2061 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2062 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2063 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2064 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2065 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2066 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2067 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2068 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2069 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
2070 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2071 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
2072 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2073 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
2074 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2075 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2076 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2077 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; External declarations of the llvm.arm.neon.vst2lane intrinsic, one overload
; per vector type (v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8i8, v4i16,
; v2i32, v1i64, v2f32, v1f64). Each returns void and takes an i8*, two vectors
; of the overload's type, and two i32 operands.
; NOTE(review): the trailing i32 operands are presumably the lane index and
; the alignment -- confirm against the intrinsic definitions.
2078 declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2079 declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2080 declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2081 declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2082 declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2083 declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2084 declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2085 declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2086 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2087 declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2088 declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2089 declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
; External declarations of the llvm.arm.neon.vst3lane intrinsic, one overload
; per vector type (v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8i8, v4i16,
; v2i32, v1i64, v2f32, v1f64). Each returns void and takes an i8*, three
; vectors of the overload's type, and two i32 operands.
; NOTE(review): the trailing i32 operands are presumably the lane index and
; the alignment -- confirm against the intrinsic definitions.
2090 declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2091 declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2092 declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2093 declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2094 declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2095 declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2096 declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2097 declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2098 declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2099 declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2100 declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2101 declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; External declarations of the llvm.arm.neon.vst4lane intrinsic, one overload
; per vector type (v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8i8, v4i16,
; v2i32, v1i64, v2f32, v1f64). Each returns void and takes an i8*, four
; vectors of the overload's type, and two i32 operands.
; NOTE(review): the trailing i32 operands are presumably the lane index and
; the alignment -- confirm against the intrinsic definitions.
2102 declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2103 declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2104 declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2105 declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2106 declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2107 declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2108 declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2109 declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2110 declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2111 declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2112 declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2113 declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)