; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
; NEON "array of vectors" aggregate types returned by the vldN test functions.
%struct.int8x16x2_t = type { [2 x <16 x i8>] }
%struct.int16x8x2_t = type { [2 x <8 x i16>] }
%struct.int32x4x2_t = type { [2 x <4 x i32>] }
%struct.int64x2x2_t = type { [2 x <2 x i64>] }
%struct.float32x4x2_t = type { [2 x <4 x float>] }
%struct.float64x2x2_t = type { [2 x <2 x double>] }
%struct.int8x8x2_t = type { [2 x <8 x i8>] }
%struct.int16x4x2_t = type { [2 x <4 x i16>] }
%struct.int32x2x2_t = type { [2 x <2 x i32>] }
%struct.int64x1x2_t = type { [2 x <1 x i64>] }
%struct.float32x2x2_t = type { [2 x <2 x float>] }
%struct.float64x1x2_t = type { [2 x <1 x double>] }
%struct.int8x16x3_t = type { [3 x <16 x i8>] }
%struct.int16x8x3_t = type { [3 x <8 x i16>] }
%struct.int32x4x3_t = type { [3 x <4 x i32>] }
%struct.int64x2x3_t = type { [3 x <2 x i64>] }
%struct.float32x4x3_t = type { [3 x <4 x float>] }
%struct.float64x2x3_t = type { [3 x <2 x double>] }
%struct.int8x8x3_t = type { [3 x <8 x i8>] }
%struct.int16x4x3_t = type { [3 x <4 x i16>] }
%struct.int32x2x3_t = type { [3 x <2 x i32>] }
%struct.int64x1x3_t = type { [3 x <1 x i64>] }
%struct.float32x2x3_t = type { [3 x <2 x float>] }
%struct.float64x1x3_t = type { [3 x <1 x double>] }
%struct.int8x16x4_t = type { [4 x <16 x i8>] }
%struct.int16x8x4_t = type { [4 x <8 x i16>] }
%struct.int32x4x4_t = type { [4 x <4 x i32>] }
%struct.int64x2x4_t = type { [4 x <2 x i64>] }
%struct.float32x4x4_t = type { [4 x <4 x float>] }
%struct.float64x2x4_t = type { [4 x <2 x double>] }
%struct.int8x8x4_t = type { [4 x <8 x i8>] }
%struct.int16x4x4_t = type { [4 x <4 x i16>] }
%struct.int32x2x4_t = type { [4 x <2 x i32>] }
%struct.int64x1x4_t = type { [4 x <1 x i64>] }
%struct.float32x2x4_t = type { [4 x <2 x float>] }
%struct.float64x1x4_t = type { [4 x <1 x double>] }
define <16 x i8> @test_vld1q_s8(i8* readonly %a) {
; CHECK: test_vld1q_s8
; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
  %vld1 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %a, i32 1)
  ret <16 x i8> %vld1
}
define <8 x i16> @test_vld1q_s16(i16* readonly %a) {
; CHECK: test_vld1q_s16
; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
  %1 = bitcast i16* %a to i8*
  %vld1 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %1, i32 2)
  ret <8 x i16> %vld1
}
define <4 x i32> @test_vld1q_s32(i32* readonly %a) {
; CHECK: test_vld1q_s32
; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
  %1 = bitcast i32* %a to i8*
  %vld1 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %1, i32 4)
  ret <4 x i32> %vld1
}
define <2 x i64> @test_vld1q_s64(i64* readonly %a) {
; CHECK: test_vld1q_s64
; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
  %1 = bitcast i64* %a to i8*
  %vld1 = tail call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %1, i32 8)
  ret <2 x i64> %vld1
}
define <4 x float> @test_vld1q_f32(float* readonly %a) {
; CHECK: test_vld1q_f32
; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
  %1 = bitcast float* %a to i8*
  %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %1, i32 4)
  ret <4 x float> %vld1
}
define <2 x double> @test_vld1q_f64(double* readonly %a) {
; CHECK: test_vld1q_f64
; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
  %1 = bitcast double* %a to i8*
  %vld1 = tail call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %1, i32 8)
  ret <2 x double> %vld1
}
define <8 x i8> @test_vld1_s8(i8* readonly %a) {
; CHECK: test_vld1_s8
; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
  %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
  ret <8 x i8> %vld1
}
define <4 x i16> @test_vld1_s16(i16* readonly %a) {
; CHECK: test_vld1_s16
; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
  %1 = bitcast i16* %a to i8*
  %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
  ret <4 x i16> %vld1
}
define <2 x i32> @test_vld1_s32(i32* readonly %a) {
; CHECK: test_vld1_s32
; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
  %1 = bitcast i32* %a to i8*
  %vld1 = tail call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %1, i32 4)
  ret <2 x i32> %vld1
}
define <1 x i64> @test_vld1_s64(i64* readonly %a) {
; CHECK: test_vld1_s64
; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
  %1 = bitcast i64* %a to i8*
  %vld1 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %1, i32 8)
  ret <1 x i64> %vld1
}
define <2 x float> @test_vld1_f32(float* readonly %a) {
; CHECK: test_vld1_f32
; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
  %1 = bitcast float* %a to i8*
  %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %1, i32 4)
  ret <2 x float> %vld1
}
define <1 x double> @test_vld1_f64(double* readonly %a) {
; CHECK: test_vld1_f64
; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
  %1 = bitcast double* %a to i8*
  %vld1 = tail call <1 x double> @llvm.arm.neon.vld1.v1f64(i8* %1, i32 8)
  ret <1 x double> %vld1
}
define <8 x i8> @test_vld1_p8(i8* readonly %a) {
; CHECK: test_vld1_p8
; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
  %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
  ret <8 x i8> %vld1
}
define <4 x i16> @test_vld1_p16(i16* readonly %a) {
; CHECK: test_vld1_p16
; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
  %1 = bitcast i16* %a to i8*
  %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
  ret <4 x i16> %vld1
}
define %struct.int8x16x2_t @test_vld2q_s8(i8* readonly %a) {
; CHECK: test_vld2q_s8
; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
  %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %a, i32 1)
  %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2.fca.1.extract, 0, 1
  ret %struct.int8x16x2_t %.fca.0.1.insert
}
define %struct.int16x8x2_t @test_vld2q_s16(i16* readonly %a) {
; CHECK: test_vld2q_s16
; CHECK: ld2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
  %1 = bitcast i16* %a to i8*
  %vld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8* %1, i32 2)
  %vld2.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2.fca.1.extract, 0, 1
  ret %struct.int16x8x2_t %.fca.0.1.insert
}
define %struct.int32x4x2_t @test_vld2q_s32(i32* readonly %a) {
; CHECK: test_vld2q_s32
; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
  %1 = bitcast i32* %a to i8*
  %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %1, i32 4)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2.fca.1.extract, 0, 1
  ret %struct.int32x4x2_t %.fca.0.1.insert
}
define %struct.int64x2x2_t @test_vld2q_s64(i64* readonly %a) {
; CHECK: test_vld2q_s64
; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
  %1 = bitcast i64* %a to i8*
  %vld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8* %1, i32 8)
  %vld2.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2.fca.1.extract, 0, 1
  ret %struct.int64x2x2_t %.fca.0.1.insert
}
define %struct.float32x4x2_t @test_vld2q_f32(float* readonly %a) {
; CHECK: test_vld2q_f32
; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
  %1 = bitcast float* %a to i8*
  %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
  %vld2.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2.fca.1.extract, 0, 1
  ret %struct.float32x4x2_t %.fca.0.1.insert
}
define %struct.float64x2x2_t @test_vld2q_f64(double* readonly %a) {
; CHECK: test_vld2q_f64
; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
  %1 = bitcast double* %a to i8*
  %vld2 = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8* %1, i32 8)
  %vld2.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2.fca.1.extract, 0, 1
  ret %struct.float64x2x2_t %.fca.0.1.insert
}
define %struct.int8x8x2_t @test_vld2_s8(i8* readonly %a) {
; CHECK: test_vld2_s8
; CHECK: ld2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
  %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %a, i32 1)
  %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2.fca.1.extract, 0, 1
  ret %struct.int8x8x2_t %.fca.0.1.insert
}
define %struct.int16x4x2_t @test_vld2_s16(i16* readonly %a) {
; CHECK: test_vld2_s16
; CHECK: ld2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
  %1 = bitcast i16* %a to i8*
  %vld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8* %1, i32 2)
  %vld2.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2.fca.1.extract, 0, 1
  ret %struct.int16x4x2_t %.fca.0.1.insert
}
define %struct.int32x2x2_t @test_vld2_s32(i32* readonly %a) {
; CHECK: test_vld2_s32
; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
  %1 = bitcast i32* %a to i8*
  %vld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %1, i32 4)
  %vld2.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2.fca.1.extract, 0, 1
  ret %struct.int32x2x2_t %.fca.0.1.insert
}
; vld2 of <1 x i64> lowers to ld1 with two registers (no ld2.1d form).
define %struct.int64x1x2_t @test_vld2_s64(i64* readonly %a) {
; CHECK: test_vld2_s64
; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
  %1 = bitcast i64* %a to i8*
  %vld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %1, i32 8)
  %vld2.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2.fca.1.extract, 0, 1
  ret %struct.int64x1x2_t %.fca.0.1.insert
}
define %struct.float32x2x2_t @test_vld2_f32(float* readonly %a) {
; CHECK: test_vld2_f32
; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
  %1 = bitcast float* %a to i8*
  %vld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %1, i32 4)
  %vld2.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2.fca.1.extract, 0, 1
  ret %struct.float32x2x2_t %.fca.0.1.insert
}
; vld2 of <1 x double> lowers to ld1 with two registers (no ld2.1d form).
define %struct.float64x1x2_t @test_vld2_f64(double* readonly %a) {
; CHECK: test_vld2_f64
; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
  %1 = bitcast double* %a to i8*
  %vld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %1, i32 8)
  %vld2.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2.fca.1.extract, 0, 1
  ret %struct.float64x1x2_t %.fca.0.1.insert
}
define %struct.int8x16x3_t @test_vld3q_s8(i8* readonly %a) {
; CHECK: test_vld3q_s8
; CHECK: ld3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
  %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %a, i32 1)
  %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3.fca.2.extract, 0, 2
  ret %struct.int8x16x3_t %.fca.0.2.insert
}
define %struct.int16x8x3_t @test_vld3q_s16(i16* readonly %a) {
; CHECK: test_vld3q_s16
; CHECK: ld3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
  %1 = bitcast i16* %a to i8*
  %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8* %1, i32 2)
  %vld3.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3.fca.2.extract, 0, 2
  ret %struct.int16x8x3_t %.fca.0.2.insert
}
define %struct.int32x4x3_t @test_vld3q_s32(i32* readonly %a) {
; CHECK: test_vld3q_s32
; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
  %1 = bitcast i32* %a to i8*
  %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %1, i32 4)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3.fca.2.extract, 0, 2
  ret %struct.int32x4x3_t %.fca.0.2.insert
}
define %struct.int64x2x3_t @test_vld3q_s64(i64* readonly %a) {
; CHECK: test_vld3q_s64
; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
  %1 = bitcast i64* %a to i8*
  %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8* %1, i32 8)
  %vld3.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3.fca.2.extract, 0, 2
  ret %struct.int64x2x3_t %.fca.0.2.insert
}
define %struct.float32x4x3_t @test_vld3q_f32(float* readonly %a) {
; CHECK: test_vld3q_f32
; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
  %1 = bitcast float* %a to i8*
  %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %1, i32 4)
  %vld3.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3.fca.2.extract, 0, 2
  ret %struct.float32x4x3_t %.fca.0.2.insert
}
define %struct.float64x2x3_t @test_vld3q_f64(double* readonly %a) {
; CHECK: test_vld3q_f64
; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
  %1 = bitcast double* %a to i8*
  %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8* %1, i32 8)
  %vld3.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3.fca.2.extract, 0, 2
  ret %struct.float64x2x3_t %.fca.0.2.insert
}
define %struct.int8x8x3_t @test_vld3_s8(i8* readonly %a) {
; CHECK: test_vld3_s8
; CHECK: ld3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
  %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %a, i32 1)
  %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3.fca.2.extract, 0, 2
  ret %struct.int8x8x3_t %.fca.0.2.insert
}
define %struct.int16x4x3_t @test_vld3_s16(i16* readonly %a) {
; CHECK: test_vld3_s16
; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
  %1 = bitcast i16* %a to i8*
  %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %1, i32 2)
  %vld3.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3.fca.2.extract, 0, 2
  ret %struct.int16x4x3_t %.fca.0.2.insert
}
define %struct.int32x2x3_t @test_vld3_s32(i32* readonly %a) {
; CHECK: test_vld3_s32
; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
  %1 = bitcast i32* %a to i8*
  %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8* %1, i32 4)
  %vld3.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3.fca.2.extract, 0, 2
  ret %struct.int32x2x3_t %.fca.0.2.insert
}
; vld3 of <1 x i64> lowers to ld1 with three registers (no ld3.1d form).
define %struct.int64x1x3_t @test_vld3_s64(i64* readonly %a) {
; CHECK: test_vld3_s64
; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
  %1 = bitcast i64* %a to i8*
  %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %1, i32 8)
  %vld3.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3.fca.2.extract, 0, 2
  ret %struct.int64x1x3_t %.fca.0.2.insert
}
define %struct.float32x2x3_t @test_vld3_f32(float* readonly %a) {
; CHECK: test_vld3_f32
; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
  %1 = bitcast float* %a to i8*
  %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8* %1, i32 4)
  %vld3.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3.fca.2.extract, 0, 2
  ret %struct.float32x2x3_t %.fca.0.2.insert
}
; vld3 of <1 x double> lowers to ld1 with three registers (no ld3.1d form).
define %struct.float64x1x3_t @test_vld3_f64(double* readonly %a) {
; CHECK: test_vld3_f64
; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
  %1 = bitcast double* %a to i8*
  %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %1, i32 8)
  %vld3.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3.fca.2.extract, 0, 2
  ret %struct.float64x1x3_t %.fca.0.2.insert
}
define %struct.int8x16x4_t @test_vld4q_s8(i8* readonly %a) {
; CHECK: test_vld4q_s8
; CHECK: ld4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
  %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %a, i32 1)
  %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
  %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
  %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
  %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
  %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4.fca.3.extract, 0, 3
  ret %struct.int8x16x4_t %.fca.0.3.insert
}
define %struct.int16x8x4_t @test_vld4q_s16(i16* readonly %a) {
; CHECK: test_vld4q_s16
; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
  %1 = bitcast i16* %a to i8*
  %vld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %1, i32 2)
  %vld4.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0
  %vld4.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1
  %vld4.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2
  %vld4.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3
  %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4.fca.3.extract, 0, 3
  ret %struct.int16x8x4_t %.fca.0.3.insert
}
define %struct.int32x4x4_t @test_vld4q_s32(i32* readonly %a) {
; CHECK: test_vld4q_s32
; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
  %1 = bitcast i32* %a to i8*
  %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8* %1, i32 4)
  %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0
  %vld4.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1
  %vld4.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2
  %vld4.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3
  %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4.fca.3.extract, 0, 3
  ret %struct.int32x4x4_t %.fca.0.3.insert
}
define %struct.int64x2x4_t @test_vld4q_s64(i64* readonly %a) {
; CHECK: test_vld4q_s64
; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
  %1 = bitcast i64* %a to i8*
  %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8* %1, i32 8)
  %vld4.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0
  %vld4.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1
  %vld4.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2
  %vld4.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3
  %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4.fca.3.extract, 0, 3
  ret %struct.int64x2x4_t %.fca.0.3.insert
}
define %struct.float32x4x4_t @test_vld4q_f32(float* readonly %a) {
; CHECK: test_vld4q_f32
; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
  %1 = bitcast float* %a to i8*
  %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
  %vld4.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 0
  %vld4.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1
  %vld4.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2
  %vld4.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3
  %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4.fca.3.extract, 0, 3
  ret %struct.float32x4x4_t %.fca.0.3.insert
}
define %struct.float64x2x4_t @test_vld4q_f64(double* readonly %a) {
; CHECK: test_vld4q_f64
; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
  %1 = bitcast double* %a to i8*
  %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8* %1, i32 8)
  %vld4.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0
  %vld4.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1
  %vld4.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2
  %vld4.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3
  %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4.fca.3.extract, 0, 3
  ret %struct.float64x2x4_t %.fca.0.3.insert
}
define %struct.int8x8x4_t @test_vld4_s8(i8* readonly %a) {
; CHECK: test_vld4_s8
; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
  %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %a, i32 1)
  %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
  %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
  %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
  %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
  %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4.fca.3.extract, 0, 3
  ret %struct.int8x8x4_t %.fca.0.3.insert
}
; test_vld4_s16: 4-way interleaved load of <4 x i16>; expects a single ld4 {.4h x4}.
568 define %struct.int16x4x4_t @test_vld4_s16(i16* readonly %a) {
569 ; CHECK: test_vld4_s16
570 ; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
571 %1 = bitcast i16* %a to i8*
572 %vld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %1, i32 2)
573 %vld4.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0
574 %vld4.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1
575 %vld4.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2
576 %vld4.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3
577 %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4.fca.0.extract, 0, 0
578 %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4.fca.1.extract, 0, 1
579 %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4.fca.2.extract, 0, 2
580 %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4.fca.3.extract, 0, 3
581 ret %struct.int16x4x4_t %.fca.0.3.insert
; test_vld4_s32: 4-way interleaved load of <2 x i32>; expects a single ld4 {.2s x4}.
584 define %struct.int32x2x4_t @test_vld4_s32(i32* readonly %a) {
585 ; CHECK: test_vld4_s32
586 ; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
587 %1 = bitcast i32* %a to i8*
588 %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8* %1, i32 4)
589 %vld4.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0
590 %vld4.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1
591 %vld4.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2
592 %vld4.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3
593 %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4.fca.0.extract, 0, 0
594 %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4.fca.1.extract, 0, 1
595 %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4.fca.2.extract, 0, 2
596 %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4.fca.3.extract, 0, 3
597 ret %struct.int32x2x4_t %.fca.0.3.insert
; test_vld4_s64: <1 x i64> has one element, so there is nothing to de-interleave;
; lowering uses a plain 4-register ld1 {.1d x4} instead of ld4.
600 define %struct.int64x1x4_t @test_vld4_s64(i64* readonly %a) {
601 ; CHECK: test_vld4_s64
602 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
603 %1 = bitcast i64* %a to i8*
604 %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %1, i32 8)
605 %vld4.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0
606 %vld4.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1
607 %vld4.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2
608 %vld4.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3
609 %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4.fca.0.extract, 0, 0
610 %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4.fca.1.extract, 0, 1
611 %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4.fca.2.extract, 0, 2
612 %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4.fca.3.extract, 0, 3
613 ret %struct.int64x1x4_t %.fca.0.3.insert
; test_vld4_f32: 4-way interleaved load of <2 x float>; expects a single ld4 {.2s x4}.
616 define %struct.float32x2x4_t @test_vld4_f32(float* readonly %a) {
617 ; CHECK: test_vld4_f32
618 ; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
619 %1 = bitcast float* %a to i8*
620 %vld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8* %1, i32 4)
621 %vld4.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 0
622 %vld4.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 1
623 %vld4.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 2
624 %vld4.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 3
625 %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4.fca.0.extract, 0, 0
626 %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4.fca.1.extract, 0, 1
627 %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4.fca.2.extract, 0, 2
628 %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4.fca.3.extract, 0, 3
629 ret %struct.float32x2x4_t %.fca.0.3.insert
; test_vld4_f64: single-element <1 x double> vectors need no de-interleave;
; lowering uses ld1 {.1d x4} rather than ld4 (matches the s64 case above).
632 define %struct.float64x1x4_t @test_vld4_f64(double* readonly %a) {
633 ; CHECK: test_vld4_f64
634 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
635 %1 = bitcast double* %a to i8*
636 %vld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %1, i32 8)
637 %vld4.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 0
638 %vld4.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 1
639 %vld4.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 2
640 %vld4.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 3
641 %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4.fca.0.extract, 0, 0
642 %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4.fca.1.extract, 0, 1
643 %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4.fca.2.extract, 0, 2
644 %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4.fca.3.extract, 0, 3
645 ret %struct.float64x1x4_t %.fca.0.3.insert
648 declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32)
649 declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32)
650 declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32)
651 declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32)
652 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32)
653 declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32)
654 declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32)
655 declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
656 declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
657 declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32)
658 declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32)
659 declare <1 x double> @llvm.arm.neon.vld1.v1f64(i8*, i32)
660 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
661 declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8*, i32)
662 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
663 declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8*, i32)
664 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32)
665 declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8*, i32)
666 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
667 declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8*, i32)
668 declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
669 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
670 declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
671 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
672 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32)
673 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8*, i32)
674 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
675 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8*, i32)
676 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
677 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8*, i32)
678 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
679 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
680 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8*, i32)
681 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
682 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8*, i32)
683 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
684 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
685 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
686 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8*, i32)
687 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8*, i32)
688 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32)
689 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8*, i32)
690 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
691 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
692 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8*, i32)
693 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
694 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8*, i32)
695 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
; test_vst1q_s8: contiguous store of one <16 x i8>; expects st1 {.16b}.
697 define void @test_vst1q_s8(i8* %a, <16 x i8> %b) {
698 ; CHECK: test_vst1q_s8
699 ; CHECK: st1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
700 tail call void @llvm.arm.neon.vst1.v16i8(i8* %a, <16 x i8> %b, i32 1)
; test_vst1q_s16: contiguous store of one <8 x i16>; expects st1 {.8h}.
704 define void @test_vst1q_s16(i16* %a, <8 x i16> %b) {
705 ; CHECK: test_vst1q_s16
706 ; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
707 %1 = bitcast i16* %a to i8*
708 tail call void @llvm.arm.neon.vst1.v8i16(i8* %1, <8 x i16> %b, i32 2)
; test_vst1q_s32: contiguous store of one <4 x i32>; expects st1 {.4s}.
712 define void @test_vst1q_s32(i32* %a, <4 x i32> %b) {
713 ; CHECK: test_vst1q_s32
714 ; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
715 %1 = bitcast i32* %a to i8*
716 tail call void @llvm.arm.neon.vst1.v4i32(i8* %1, <4 x i32> %b, i32 4)
; test_vst1q_s64: contiguous store of one <2 x i64>; expects st1 {.2d}.
720 define void @test_vst1q_s64(i64* %a, <2 x i64> %b) {
721 ; CHECK: test_vst1q_s64
722 ; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
723 %1 = bitcast i64* %a to i8*
724 tail call void @llvm.arm.neon.vst1.v2i64(i8* %1, <2 x i64> %b, i32 8)
; test_vst1q_f32: contiguous store of one <4 x float>; expects st1 {.4s}.
728 define void @test_vst1q_f32(float* %a, <4 x float> %b) {
729 ; CHECK: test_vst1q_f32
730 ; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
731 %1 = bitcast float* %a to i8*
732 tail call void @llvm.arm.neon.vst1.v4f32(i8* %1, <4 x float> %b, i32 4)
; test_vst1q_f64: contiguous store of one <2 x double>; expects st1 {.2d}.
736 define void @test_vst1q_f64(double* %a, <2 x double> %b) {
737 ; CHECK: test_vst1q_f64
738 ; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
739 %1 = bitcast double* %a to i8*
740 tail call void @llvm.arm.neon.vst1.v2f64(i8* %1, <2 x double> %b, i32 8)
; test_vst1_s8: contiguous store of one <8 x i8> (64-bit reg); expects st1 {.8b}.
744 define void @test_vst1_s8(i8* %a, <8 x i8> %b) {
745 ; CHECK: test_vst1_s8
746 ; CHECK: st1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
747 tail call void @llvm.arm.neon.vst1.v8i8(i8* %a, <8 x i8> %b, i32 1)
; test_vst1_s16: contiguous store of one <4 x i16>; expects st1 {.4h}.
751 define void @test_vst1_s16(i16* %a, <4 x i16> %b) {
752 ; CHECK: test_vst1_s16
753 ; CHECK: st1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
754 %1 = bitcast i16* %a to i8*
755 tail call void @llvm.arm.neon.vst1.v4i16(i8* %1, <4 x i16> %b, i32 2)
; test_vst1_s32: contiguous store of one <2 x i32>; expects st1 {.2s}.
759 define void @test_vst1_s32(i32* %a, <2 x i32> %b) {
760 ; CHECK: test_vst1_s32
761 ; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
762 %1 = bitcast i32* %a to i8*
763 tail call void @llvm.arm.neon.vst1.v2i32(i8* %1, <2 x i32> %b, i32 4)
; test_vst1_s64: contiguous store of one <1 x i64>; expects st1 {.1d}.
767 define void @test_vst1_s64(i64* %a, <1 x i64> %b) {
768 ; CHECK: test_vst1_s64
769 ; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
770 %1 = bitcast i64* %a to i8*
771 tail call void @llvm.arm.neon.vst1.v1i64(i8* %1, <1 x i64> %b, i32 8)
; test_vst1_f32: contiguous store of one <2 x float>; expects st1 {.2s}.
775 define void @test_vst1_f32(float* %a, <2 x float> %b) {
776 ; CHECK: test_vst1_f32
777 ; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
778 %1 = bitcast float* %a to i8*
779 tail call void @llvm.arm.neon.vst1.v2f32(i8* %1, <2 x float> %b, i32 4)
; test_vst1_f64: contiguous store of one <1 x double>; expects st1 {.1d}.
783 define void @test_vst1_f64(double* %a, <1 x double> %b) {
784 ; CHECK: test_vst1_f64
785 ; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
786 %1 = bitcast double* %a to i8*
787 tail call void @llvm.arm.neon.vst1.v1f64(i8* %1, <1 x double> %b, i32 8)
; test_vst2q_s8: 2-way interleaving store of <16 x i8> pair; expects st2 {.16b x2}.
791 define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
792 ; CHECK: test_vst2q_s8
793 ; CHECK: st2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
794 %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
795 %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
796 tail call void @llvm.arm.neon.vst2.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 1)
; test_vst2q_s16: 2-way interleaving store of <8 x i16> pair; expects st2 {.8h x2}.
800 define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
801 ; CHECK: test_vst2q_s16
802 ; CHECK: st2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
803 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
804 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
805 %1 = bitcast i16* %a to i8*
806 tail call void @llvm.arm.neon.vst2.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 2)
; test_vst2q_s32: 2-way interleaving store of <4 x i32> pair; expects st2 {.4s x2}.
810 define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
811 ; CHECK: test_vst2q_s32
812 ; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
813 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
814 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
815 %1 = bitcast i32* %a to i8*
816 tail call void @llvm.arm.neon.vst2.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 4)
; test_vst2q_s64: 2-way interleaving store of <2 x i64> pair; expects st2 {.2d x2}.
820 define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
821 ; CHECK: test_vst2q_s64
822 ; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
823 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
824 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
825 %1 = bitcast i64* %a to i8*
826 tail call void @llvm.arm.neon.vst2.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 8)
; test_vst2q_f32: 2-way interleaving store of <4 x float> pair; expects st2 {.4s x2}.
830 define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) {
831 ; CHECK: test_vst2q_f32
832 ; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
833 %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
834 %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
835 %1 = bitcast float* %a to i8*
836 tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 4)
; test_vst2q_f64: 2-way interleaving store of <2 x double> pair; expects st2 {.2d x2}.
840 define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) {
841 ; CHECK: test_vst2q_f64
842 ; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
843 %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
844 %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
845 %1 = bitcast double* %a to i8*
846 tail call void @llvm.arm.neon.vst2.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 8)
; test_vst2_s8: 2-way interleaving store of <8 x i8> pair; expects st2 {.8b x2}.
850 define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
851 ; CHECK: test_vst2_s8
852 ; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
853 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
854 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
855 tail call void @llvm.arm.neon.vst2.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 1)
; test_vst2_s16: 2-way interleaving store of <4 x i16> pair; expects st2 {.4h x2}.
859 define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
860 ; CHECK: test_vst2_s16
861 ; CHECK: st2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
862 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
863 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
864 %1 = bitcast i16* %a to i8*
865 tail call void @llvm.arm.neon.vst2.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 2)
; test_vst2_s32: 2-way interleaving store of <2 x i32> pair; expects st2 {.2s x2}.
869 define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
870 ; CHECK: test_vst2_s32
871 ; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
872 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
873 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
874 %1 = bitcast i32* %a to i8*
875 tail call void @llvm.arm.neon.vst2.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 4)
; test_vst2_s64: single-element <1 x i64> vectors need no interleave;
; lowering uses st1 {.1d x2} rather than st2.
879 define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
880 ; CHECK: test_vst2_s64
881 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
882 %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
883 %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
884 %1 = bitcast i64* %a to i8*
885 tail call void @llvm.arm.neon.vst2.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 8)
; test_vst2_f32: 2-way interleaving store of <2 x float> pair; expects st2 {.2s x2}.
889 define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) {
890 ; CHECK: test_vst2_f32
891 ; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
892 %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
893 %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
894 %1 = bitcast float* %a to i8*
895 tail call void @llvm.arm.neon.vst2.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 4)
; test_vst2_f64: single-element <1 x double> vectors need no interleave;
; lowering uses st1 {.1d x2} rather than st2 (matches the s64 case above).
899 define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) {
900 ; CHECK: test_vst2_f64
901 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
902 %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
903 %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
904 %1 = bitcast double* %a to i8*
905 tail call void @llvm.arm.neon.vst2.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 8)
; test_vst3q_s8: 3-way interleaving store of <16 x i8> triple; expects st3 {.16b x3}.
909 define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
910 ; CHECK: test_vst3q_s8
911 ; CHECK: st3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
912 %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
913 %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
914 %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
915 tail call void @llvm.arm.neon.vst3.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 1)
; test_vst3q_s16: 3-way interleaving store of <8 x i16> triple; expects st3 {.8h x3}.
919 define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
920 ; CHECK: test_vst3q_s16
921 ; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
922 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
923 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
924 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
925 %1 = bitcast i16* %a to i8*
926 tail call void @llvm.arm.neon.vst3.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 2)
; test_vst3q_s32: 3-way interleaving store of <4 x i32> triple; expects st3 {.4s x3}.
930 define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
931 ; CHECK: test_vst3q_s32
932 ; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
933 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
934 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
935 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
936 %1 = bitcast i32* %a to i8*
937 tail call void @llvm.arm.neon.vst3.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 4)
; test_vst3q_s64: 3-way interleaving store of <2 x i64> triple; expects st3 {.2d x3}.
941 define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
942 ; CHECK: test_vst3q_s64
943 ; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
944 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
945 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
946 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
947 %1 = bitcast i64* %a to i8*
948 tail call void @llvm.arm.neon.vst3.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 8)
; test_vst3q_f32: 3-way interleaving store of <4 x float> triple; expects st3 {.4s x3}.
952 define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) {
953 ; CHECK: test_vst3q_f32
954 ; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
955 %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
956 %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
957 %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
958 %1 = bitcast float* %a to i8*
959 tail call void @llvm.arm.neon.vst3.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 4)
; test_vst3q_f64: 3-way interleaving store of <2 x double> triple; expects st3 {.2d x3}.
963 define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) {
964 ; CHECK: test_vst3q_f64
965 ; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
966 %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
967 %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
968 %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
969 %1 = bitcast double* %a to i8*
970 tail call void @llvm.arm.neon.vst3.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 8)
; test_vst3_s8: 3-way interleaving store of <8 x i8> triple; expects st3 {.8b x3}.
974 define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
975 ; CHECK: test_vst3_s8
976 ; CHECK: st3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
977 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
978 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
979 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
980 tail call void @llvm.arm.neon.vst3.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 1)
; test_vst3_s16: 3-way interleaving store of <4 x i16> triple; expects st3 {.4h x3}.
984 define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
985 ; CHECK: test_vst3_s16
986 ; CHECK: st3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
987 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
988 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
989 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
990 %1 = bitcast i16* %a to i8*
991 tail call void @llvm.arm.neon.vst3.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 2)
; test_vst3_s32: 3-way interleaving store of <2 x i32> triple; expects st3 {.2s x3}.
995 define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
996 ; CHECK: test_vst3_s32
997 ; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
998 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
999 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
1000 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
1001 %1 = bitcast i32* %a to i8*
1002 tail call void @llvm.arm.neon.vst3.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 4)
; test_vst3_s64: single-element <1 x i64> vectors need no interleave;
; lowering uses st1 {.1d x3} rather than st3.
1006 define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
1007 ; CHECK: test_vst3_s64
1008 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1009 %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
1010 %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
1011 %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
1012 %1 = bitcast i64* %a to i8*
1013 tail call void @llvm.arm.neon.vst3.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 8)
; test_vst3_f32: 3-way interleaving store of <2 x float> triple; expects st3 {.2s x3}.
1017 define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) {
1018 ; CHECK: test_vst3_f32
1019 ; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1020 %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
1021 %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
1022 %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
1023 %1 = bitcast float* %a to i8*
1024 tail call void @llvm.arm.neon.vst3.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 4)
; test_vst3_f64: single-element <1 x double> vectors need no interleave;
; lowering uses st1 {.1d x3} rather than st3 (matches the s64 case above).
1028 define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) {
1029 ; CHECK: test_vst3_f64
1030 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1031 %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
1032 %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
1033 %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
1034 %1 = bitcast double* %a to i8*
1035 tail call void @llvm.arm.neon.vst3.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 8)
; test_vst4q_s8: 4-way interleaving store of <16 x i8> quad; expects st4 {.16b x4}.
1039 define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
1040 ; CHECK: test_vst4q_s8
1041 ; CHECK: st4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
1042 %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
1043 %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1044 %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1045 %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
1046 tail call void @llvm.arm.neon.vst4.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 1)
; test_vst4q_s16: 4-way interleaving store of <8 x i16> quad; expects st4 {.8h x4}.
1050 define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
1051 ; CHECK: test_vst4q_s16
1052 ; CHECK: st4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
1053 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
1054 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
1055 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
1056 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
1057 %1 = bitcast i16* %a to i8*
1058 tail call void @llvm.arm.neon.vst4.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 2)
; test_vst4q_s32: 4-way interleaving store of <4 x i32> quad; expects st4 {.4s x4}.
1062 define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
1063 ; CHECK: test_vst4q_s32
1064 ; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1065 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
1066 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
1067 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
1068 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
1069 %1 = bitcast i32* %a to i8*
1070 tail call void @llvm.arm.neon.vst4.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 4)
; test_vst4q_s64: 4-way interleaving store of <2 x i64> quad; expects st4 {.2d x4}.
1074 define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
1075 ; CHECK: test_vst4q_s64
1076 ; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1077 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
1078 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
1079 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
1080 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
1081 %1 = bitcast i64* %a to i8*
1082 tail call void @llvm.arm.neon.vst4.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 8)
; test_vst4q_f32: 4-way interleaving store of <4 x float> quad; expects st4 {.4s x4}.
1086 define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) {
1087 ; CHECK: test_vst4q_f32
1088 ; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1089 %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
1090 %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
1091 %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
1092 %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
1093 %1 = bitcast float* %a to i8*
1094 tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 4)
; test_vst4q_f64: 4-way interleaving store of <2 x double> quad; expects st4 {.2d x4}.
1098 define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) {
1099 ; CHECK: test_vst4q_f64
1100 ; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1101 %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
1102 %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
1103 %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
1104 %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
1105 %1 = bitcast double* %a to i8*
1106 tail call void @llvm.arm.neon.vst4.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 8)
; test_vst4_s8: 4-way interleaving store of <8 x i8> quad; expects st4 {.8b x4}.
1110 define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
1111 ; CHECK: test_vst4_s8
1112 ; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
1113 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
1114 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
1115 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
1116 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
1117 tail call void @llvm.arm.neon.vst4.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 1)
; test_vst4_s16: 4-way interleaving store of <4 x i16> quad; expects st4 {.4h x4}.
1121 define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
1122 ; CHECK: test_vst4_s16
1123 ; CHECK: st4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
1124 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
1125 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
1126 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
1127 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
1128 %1 = bitcast i16* %a to i8*
1129 tail call void @llvm.arm.neon.vst4.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 2)
; Interleaved 4-element store of 64-bit i32 vectors: vst4_s32 must select ST4.2s.
define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
; CHECK: test_vst4_s32
; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
  %1 = bitcast i32* %a to i8*
  tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 4)
  ret void
}
; vst4 of <1 x i64> vectors: there is no ST4.1d, so AArch64 lowers this to a
; four-register ST1 {vN.1d-vM.1d} — hence the st1 check below is intentional.
define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
; CHECK: test_vst4_s64
; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
  %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
  %1 = bitcast i64* %a to i8*
  tail call void @llvm.arm.neon.vst4.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 8)
  ret void
}
; Interleaved 4-element store of 64-bit f32 vectors: vst4_f32 must select ST4.2s.
define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) {
; CHECK: test_vst4_f32
; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
  %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
  %1 = bitcast float* %a to i8*
  tail call void @llvm.arm.neon.vst4.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 4)
  ret void
}
; vst4 of <1 x double> vectors: there is no ST4.1d, so AArch64 lowers this to a
; four-register ST1 {vN.1d-vM.1d} — hence the st1 check below is intentional.
define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) {
; CHECK: test_vst4_f64
; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
  %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
  %1 = bitcast double* %a to i8*
  tail call void @llvm.arm.neon.vst4.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 8)
  ret void
}
; Declarations of the ARM NEON store intrinsics exercised above (the ARM-named
; intrinsics are shared with the AArch64 backend in this test).

; Single-vector stores (vst1).
declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32)
declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32)
declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32)
declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32)
declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32)
declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32)
declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32)
declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32)
declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32)
declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
declare void @llvm.arm.neon.vst1.v1f64(i8*, <1 x double>, i32)

; Two-vector interleaved stores (vst2).
declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
declare void @llvm.arm.neon.vst2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32)
declare void @llvm.arm.neon.vst2.v2f64(i8*, <2 x double>, <2 x double>, i32)
declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32)
declare void @llvm.arm.neon.vst2.v1f64(i8*, <1 x double>, <1 x double>, i32)

; Three-vector interleaved stores (vst3).
declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
declare void @llvm.arm.neon.vst3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
declare void @llvm.arm.neon.vst3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
declare void @llvm.arm.neon.vst3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)

; Four-vector interleaved stores (vst4).
declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
declare void @llvm.arm.neon.vst4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
declare void @llvm.arm.neon.vst4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
declare void @llvm.arm.neon.vst4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)