1 ; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
2 ; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
; Reverse the sixteen bytes loaded from %a (mask 15..0) and store them to %c.
; The patterns expect a vshf.b whose destination register is first loaded
; through a %lo-relocated address (presumably the shuffle control vector --
; confirm) and whose two source operands are both the vector loaded from $5.
; The function-label pattern right after the define anchors the following
; patterns to this function, matching every other test in this file.
4 define void @vshf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
; CHECK: vshf_v16i8_0:
7 %1 = load <16 x i8>* %a
8 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
9 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
10 ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
11 ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[PTR_A]])
12 ; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R1]]
13 store <16 x i8> %2, <16 x i8>* %c
14 ; CHECK-DAG: st.b [[R3]], 0($4)
17 ; CHECK: .size vshf_v16i8_0
; Every mask index is 1 and the second operand is undef, so the result is
; byte 1 of %a replicated to all sixteen lanes. The patterns expect this to
; be selected as splati.b with element index 1 rather than a vshf.b.
20 define void @vshf_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
21 ; CHECK: vshf_v16i8_1:
23 %1 = load <16 x i8>* %a
24 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
25 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
26 ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][1]
27 store <16 x i8> %2, <16 x i8>* %c
28 ; CHECK-DAG: st.b [[R3]], 0($4)
31 ; CHECK: .size vshf_v16i8_1
; Two-operand shuffle in which every mask index is >= 16, i.e. only lanes of
; the second operand (%2, loaded from %b) are selected; %1 is loaded but no
; pattern refers to it. The patterns expect a vshf.b with the vector loaded
; from $6 as both sources, and a destination pre-loaded through a
; %lo-relocated address (presumably the control vector -- confirm).
34 define void @vshf_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
35 ; CHECK: vshf_v16i8_2:
37 %1 = load <16 x i8>* %a
38 %2 = load <16 x i8>* %b
39 ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
40 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 16>
41 ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
42 ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[PTR_A]])
43 ; CHECK-DAG: vshf.b [[R3]], [[R2]], [[R2]]
44 store <16 x i8> %3, <16 x i8>* %c
45 ; CHECK-DAG: st.b [[R3]], 0($4)
48 ; CHECK: .size vshf_v16i8_2
; Two-operand shuffle whose mask selects lanes from both %1 (indices < 16)
; and %2 (indices >= 16). The patterns expect a vshf.b that takes the vector
; from $6 before the vector from $5; the in-body note explains the order.
51 define void @vshf_v16i8_3(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
52 ; CHECK: vshf_v16i8_3:
54 %1 = load <16 x i8>* %a
55 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
56 %2 = load <16 x i8>* %b
57 ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
58 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 17, i32 24, i32 25, i32 18, i32 19, i32 20, i32 28, i32 19, i32 1, i32 8, i32 9, i32 2, i32 3, i32 4, i32 12, i32 3>
59 ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
60 ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[PTR_A]])
61 ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
62 ; the operands to get the right answer.
63 ; CHECK-DAG: vshf.b [[R3]], [[R2]], [[R1]]
64 store <16 x i8> %3, <16 x i8>* %c
65 ; CHECK-DAG: st.b [[R3]], 0($4)
68 ; CHECK: .size vshf_v16i8_3
; Both shuffle operands are the same value %1, so mask indices 1 and 17 both
; name byte 1 of that vector and the result is a splat of byte 1. The
; patterns expect splati.b with element index 1.
71 define void @vshf_v16i8_4(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
72 ; CHECK: vshf_v16i8_4:
74 %1 = load <16 x i8>* %a
75 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
76 %2 = shufflevector <16 x i8> %1, <16 x i8> %1, <16 x i32> <i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17>
77 ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][1]
78 store <16 x i8> %2, <16 x i8>* %c
79 ; CHECK-DAG: st.b [[R3]], 0($4)
82 ; CHECK: .size vshf_v16i8_4
; Reverse the eight halfwords loaded from %a (mask 7..0) and store to %c.
; The patterns expect a vshf.h with the vector from $5 as both sources and a
; destination pre-loaded through a %lo-relocated address (presumably the
; control vector -- confirm).
85 define void @vshf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
86 ; CHECK: vshf_v8i16_0:
88 %1 = load <8 x i16>* %a
89 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
90 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
91 ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
92 ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[PTR_A]])
93 ; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R1]]
94 store <8 x i16> %2, <8 x i16>* %c
95 ; CHECK-DAG: st.h [[R3]], 0($4)
98 ; CHECK: .size vshf_v8i16_0
; Every mask index is 1 and the second operand is undef, so halfword 1 of %a
; is replicated to all eight lanes. The patterns expect splati.h with
; element index 1.
101 define void @vshf_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
102 ; CHECK: vshf_v8i16_1:
104 %1 = load <8 x i16>* %a
105 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
106 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
107 ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][1]
108 store <8 x i16> %2, <8 x i16>* %c
109 ; CHECK-DAG: st.h [[R3]], 0($4)
112 ; CHECK: .size vshf_v8i16_1
; Two-operand shuffle in which every mask index is >= 8, so only lanes of the
; second operand (%2, loaded from %b) are selected; %1 is loaded but no
; pattern refers to it. The patterns expect a vshf.h with the vector from $6
; as both sources.
115 define void @vshf_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
116 ; CHECK: vshf_v8i16_2:
118 %1 = load <8 x i16>* %a
119 %2 = load <8 x i16>* %b
120 ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
121 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 8>
122 ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
123 ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[PTR_A]])
124 ; CHECK-DAG: vshf.h [[R3]], [[R2]], [[R2]]
125 store <8 x i16> %3, <8 x i16>* %c
126 ; CHECK-DAG: st.h [[R3]], 0($4)
129 ; CHECK: .size vshf_v8i16_2
; Two-operand shuffle whose mask selects halfwords from both %1 (indices < 8)
; and %2 (indices >= 8). The patterns expect a vshf.h that takes the vector
; from $6 before the vector from $5; the in-body note explains the order.
132 define void @vshf_v8i16_3(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
133 ; CHECK: vshf_v8i16_3:
135 %1 = load <8 x i16>* %a
136 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
137 %2 = load <8 x i16>* %b
138 ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
139 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 8, i32 9, i32 2, i32 3, i32 4, i32 12, i32 3>
140 ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
141 ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[PTR_A]])
142 ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
143 ; the operands to get the right answer.
144 ; CHECK-DAG: vshf.h [[R3]], [[R2]], [[R1]]
145 store <8 x i16> %3, <8 x i16>* %c
146 ; CHECK-DAG: st.h [[R3]], 0($4)
149 ; CHECK: .size vshf_v8i16_3
; Both shuffle operands are the same value %1, so mask indices 1 and 9 both
; name halfword 1 of that vector and the result is a splat of halfword 1.
; The patterns expect splati.h with element index 1.
152 define void @vshf_v8i16_4(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
153 ; CHECK: vshf_v8i16_4:
155 %1 = load <8 x i16>* %a
156 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
157 %2 = shufflevector <8 x i16> %1, <8 x i16> %1, <8 x i32> <i32 1, i32 9, i32 1, i32 9, i32 1, i32 9, i32 1, i32 9>
158 ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][1]
159 store <8 x i16> %2, <8 x i16>* %c
160 ; CHECK-DAG: st.h [[R3]], 0($4)
163 ; CHECK: .size vshf_v8i16_4
166 ; Note: v4i32 only has one 4-element set so it's impossible to get a vshf.w
167 ; instruction when using a single vector.
; Reverse the four words loaded from %a (mask 3,2,1,0) and store to %c.
; The patterns expect shf.w with immediate 27 instead of a vshf.w.
; 27 = 0b00011011, i.e. the mask indices 3,2,1,0 packed as 2-bit fields with
; the first index in the least-significant field.
169 define void @vshf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
170 ; CHECK: vshf_v4i32_0:
172 %1 = load <4 x i32>* %a
173 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
174 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
175 ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 27
176 store <4 x i32> %2, <4 x i32>* %c
177 ; CHECK-DAG: st.w [[R3]], 0($4)
180 ; CHECK: .size vshf_v4i32_0
; Splat of word 1 of %a (mask 1,1,1,1, second operand undef). The patterns
; expect shf.w with immediate 85 (0b01010101: index 1 in each 2-bit field)
; rather than a splati.w or vshf.w.
183 define void @vshf_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
184 ; CHECK: vshf_v4i32_1:
186 %1 = load <4 x i32>* %a
187 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
188 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
189 ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
190 store <4 x i32> %2, <4 x i32>* %c
191 ; CHECK-DAG: st.w [[R3]], 0($4)
194 ; CHECK: .size vshf_v4i32_1
; Two-operand shuffle in which every mask index is >= 4, so only words of the
; second operand (%2, loaded from %b) are selected; %1 is loaded but no
; pattern refers to it. The patterns expect shf.w on the vector from $6 with
; immediate 36 (0b00100100: lane indices 0,1,2,0 as 2-bit fields, first
; index in the least-significant field).
197 define void @vshf_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
198 ; CHECK: vshf_v4i32_2:
200 %1 = load <4 x i32>* %a
201 %2 = load <4 x i32>* %b
202 ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
203 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 4, i32 5, i32 6, i32 4>
204 ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R2]], 36
205 store <4 x i32> %3, <4 x i32>* %c
206 ; CHECK-DAG: st.w [[R3]], 0($4)
209 ; CHECK: .size vshf_v4i32_2
; Two-operand shuffle whose mask takes word 1 from %1 and words 5,6,4 from
; %2. This is the v4i32 case in this file where the patterns expect a real
; vshf.w, taking the vector from $6 before the vector from $5.
212 define void @vshf_v4i32_3(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
213 ; CHECK: vshf_v4i32_3:
215 %1 = load <4 x i32>* %a
216 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
217 %2 = load <4 x i32>* %b
218 ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
219 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 5, i32 6, i32 4>
220 ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
221 ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0([[PTR_A]])
222 ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
223 ; the operands to get the right answer.
224 ; CHECK-DAG: vshf.w [[R3]], [[R2]], [[R1]]
225 store <4 x i32> %3, <4 x i32>* %c
226 ; CHECK-DAG: st.w [[R3]], 0($4)
229 ; CHECK: .size vshf_v4i32_3
; Both shuffle operands are the same value %1, so mask indices 1 and 5 both
; name word 1 of that vector and the result is a splat of word 1. The
; patterns expect shf.w with immediate 85 (0b01010101: index 1 in each
; 2-bit field).
232 define void @vshf_v4i32_4(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
233 ; CHECK: vshf_v4i32_4:
235 %1 = load <4 x i32>* %a
236 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
237 %2 = shufflevector <4 x i32> %1, <4 x i32> %1, <4 x i32> <i32 1, i32 5, i32 5, i32 1>
238 ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
239 store <4 x i32> %2, <4 x i32>* %c
240 ; CHECK-DAG: st.w [[R3]], 0($4)
243 ; CHECK: .size vshf_v4i32_4
; Swap the two doublewords loaded from %a (mask 1,0) and store to %c.
; The patterns expect a vshf.d with the vector from $5 as both sources and a
; destination pre-loaded through a %lo-relocated address (presumably the
; control vector -- confirm).
246 define void @vshf_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
247 ; CHECK: vshf_v2i64_0:
249 %1 = load <2 x i64>* %a
250 ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
251 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
252 ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
253 ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[PTR_A]])
254 ; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R1]]
255 store <2 x i64> %2, <2 x i64>* %c
256 ; CHECK-DAG: st.d [[R3]], 0($4)
259 ; CHECK: .size vshf_v2i64_0
; Splat of doubleword 1 of %a (mask 1,1, second operand undef). The patterns
; expect splati.d with element index 1.
262 define void @vshf_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
263 ; CHECK: vshf_v2i64_1:
265 %1 = load <2 x i64>* %a
266 ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
267 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
268 ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
269 store <2 x i64> %2, <2 x i64>* %c
270 ; CHECK-DAG: st.d [[R3]], 0($4)
273 ; CHECK: .size vshf_v2i64_1
; Two-operand shuffle with mask 3,2: both indices are >= 2, so only the
; doublewords of the second operand (%2, loaded from %b) are selected, in
; swapped order; %1 is loaded but no pattern refers to it. The patterns
; expect a vshf.d with the vector from $6 as both sources.
276 define void @vshf_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
277 ; CHECK: vshf_v2i64_2:
279 %1 = load <2 x i64>* %a
280 %2 = load <2 x i64>* %b
281 ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
282 %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 3, i32 2>
283 ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
284 ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[PTR_A]])
285 ; CHECK-DAG: vshf.d [[R3]], [[R2]], [[R2]]
286 store <2 x i64> %3, <2 x i64>* %c
287 ; CHECK-DAG: st.d [[R3]], 0($4)
290 ; CHECK: .size vshf_v2i64_2
; Two-operand shuffle with mask 1,2: doubleword 1 of %1 followed by
; doubleword 0 of %2. The patterns expect a vshf.d that takes the vector
; from $6 before the vector from $5; the in-body note explains the order.
293 define void @vshf_v2i64_3(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
294 ; CHECK: vshf_v2i64_3:
296 %1 = load <2 x i64>* %a
297 ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
298 %2 = load <2 x i64>* %b
299 ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
300 %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 2>
301 ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
302 ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[PTR_A]])
303 ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
304 ; the operands to get the right answer.
305 ; CHECK-DAG: vshf.d [[R3]], [[R2]], [[R1]]
306 store <2 x i64> %3, <2 x i64>* %c
307 ; CHECK-DAG: st.d [[R3]], 0($4)
310 ; CHECK: .size vshf_v2i64_3
; Both shuffle operands are the same value %1, so mask indices 1 and 3 both
; name doubleword 1 of that vector and the result is a splat of doubleword 1.
; The patterns expect splati.d with element index 1.
313 define void @vshf_v2i64_4(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
314 ; CHECK: vshf_v2i64_4:
316 %1 = load <2 x i64>* %a
317 ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
318 %2 = shufflevector <2 x i64> %1, <2 x i64> %1, <2 x i32> <i32 1, i32 3>
319 ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
320 store <2 x i64> %2, <2 x i64>* %c
321 ; CHECK-DAG: st.d [[R3]], 0($4)
324 ; CHECK: .size vshf_v2i64_4
; The mask applies the same permutation 1,3,2,0 inside each group of four
; bytes (offsets +0, +4, +8, +12). The patterns expect shf.b with immediate
; 45 (0b00101101: indices 1,3,2,0 as 2-bit fields, first index in the
; least-significant field).
327 define void @shf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
328 ; CHECK: shf_v16i8_0:
330 %1 = load <16 x i8>* %a
331 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
332 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 2, i32 0, i32 5, i32 7, i32 6, i32 4, i32 9, i32 11, i32 10, i32 8, i32 13, i32 15, i32 14, i32 12>
333 ; CHECK-DAG: shf.b [[R3:\$w[0-9]+]], [[R1]], 45
334 store <16 x i8> %2, <16 x i8>* %c
335 ; CHECK-DAG: st.b [[R3]], 0($4)
338 ; CHECK: .size shf_v16i8_0
; The mask reverses each group of four halfwords (3,2,1,0 then 7,6,5,4).
; The patterns expect shf.h with immediate 27 (0b00011011: indices 3,2,1,0
; as 2-bit fields, first index in the least-significant field).
341 define void @shf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
342 ; CHECK: shf_v8i16_0:
344 %1 = load <8 x i16>* %a
345 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
346 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
347 ; CHECK-DAG: shf.h [[R3:\$w[0-9]+]], [[R1]], 27
348 store <8 x i16> %2, <8 x i16>* %c
349 ; CHECK-DAG: st.h [[R3]], 0($4)
352 ; CHECK: .size shf_v8i16_0
; Reverse the four words loaded from %a (mask 3,2,1,0). The patterns expect
; shf.w with immediate 27 (0b00011011: indices 3,2,1,0 as 2-bit fields,
; first index in the least-significant field). The IR body is the same as
; in vshf_v4i32_0.
355 define void @shf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
356 ; CHECK: shf_v4i32_0:
358 %1 = load <4 x i32>* %a
359 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
360 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
361 ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 27
362 store <4 x i32> %2, <4 x i32>* %c
363 ; CHECK-DAG: st.w [[R3]], 0($4)
366 ; CHECK: .size shf_v4i32_0
369 ; shf.d does not exist
; The mask alternates the even-indexed bytes of %1 (0,2,..,14) with the
; even-indexed bytes of %2 (16,18,..,30). The patterns expect a single
; ilvev.b on the vectors loaded from $5 and $6.
371 define void @ilvev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
372 ; CHECK: ilvev_v16i8_0:
374 %1 = load <16 x i8>* %a
375 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
376 %2 = load <16 x i8>* %b
377 ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
378 %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
379 <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
380 ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
381 store <16 x i8> %3, <16 x i8>* %c
382 ; CHECK-DAG: st.b [[R3]], 0($4)
385 ; CHECK: .size ilvev_v16i8_0
; The mask alternates the even-indexed halfwords of %1 (0,2,4,6) with the
; even-indexed halfwords of %2 (8,10,12,14). The patterns expect a single
; ilvev.h on the vectors loaded from $5 and $6.
388 define void @ilvev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
389 ; CHECK: ilvev_v8i16_0:
391 %1 = load <8 x i16>* %a
392 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
393 %2 = load <8 x i16>* %b
394 ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
395 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
396 ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
397 store <8 x i16> %3, <8 x i16>* %c
398 ; CHECK-DAG: st.h [[R3]], 0($4)
401 ; CHECK: .size ilvev_v8i16_0
; The mask alternates the even-indexed words of %1 (0,2) with the
; even-indexed words of %2 (4,6). The patterns expect a single ilvev.w on
; the vectors loaded from $5 and $6.
404 define void @ilvev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
405 ; CHECK: ilvev_v4i32_0:
407 %1 = load <4 x i32>* %a
408 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
409 %2 = load <4 x i32>* %b
410 ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
411 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
412 ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
413 store <4 x i32> %3, <4 x i32>* %c
414 ; CHECK-DAG: st.w [[R3]], 0($4)
417 ; CHECK: .size ilvev_v4i32_0
; Mask 0,2 takes doubleword 0 of %1 followed by doubleword 0 of %2. The
; patterns expect a single ilvev.d on the vectors loaded from $5 and $6.
420 define void @ilvev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
421 ; CHECK: ilvev_v2i64_0:
423 %1 = load <2 x i64>* %a
424 ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
425 %2 = load <2 x i64>* %b
426 ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
427 %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
428 ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
429 store <2 x i64> %3, <2 x i64>* %c
430 ; CHECK-DAG: st.d [[R3]], 0($4)
433 ; CHECK: .size ilvev_v2i64_0
; The mask alternates the odd-indexed bytes of %1 (1,3,..,15) with the
; odd-indexed bytes of %2 (17,19,..,31). The patterns expect a single
; ilvod.b on the vectors loaded from $5 and $6.
436 define void @ilvod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
437 ; CHECK: ilvod_v16i8_0:
439 %1 = load <16 x i8>* %a
440 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
441 %2 = load <16 x i8>* %b
442 ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
443 %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
444 <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
445 ; CHECK-DAG: ilvod.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
446 store <16 x i8> %3, <16 x i8>* %c
447 ; CHECK-DAG: st.b [[R3]], 0($4)
450 ; CHECK: .size ilvod_v16i8_0
; The mask alternates the odd-indexed halfwords of %1 (1,3,5,7) with the
; odd-indexed halfwords of %2 (9,11,13,15). The patterns expect a single
; ilvod.h on the vectors loaded from $5 and $6.
453 define void @ilvod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
454 ; CHECK: ilvod_v8i16_0:
456 %1 = load <8 x i16>* %a
457 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
458 %2 = load <8 x i16>* %b
459 ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
460 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
461 ; CHECK-DAG: ilvod.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
462 store <8 x i16> %3, <8 x i16>* %c
463 ; CHECK-DAG: st.h [[R3]], 0($4)
466 ; CHECK: .size ilvod_v8i16_0
; The mask alternates the odd-indexed words of %1 (1,3) with the odd-indexed
; words of %2 (5,7). The patterns expect a single ilvod.w on the vectors
; loaded from $5 and $6.
469 define void @ilvod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
470 ; CHECK: ilvod_v4i32_0:
472 %1 = load <4 x i32>* %a
473 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
474 %2 = load <4 x i32>* %b
475 ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
476 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
477 ; CHECK-DAG: ilvod.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
478 store <4 x i32> %3, <4 x i32>* %c
479 ; CHECK-DAG: st.w [[R3]], 0($4)
482 ; CHECK: .size ilvod_v4i32_0
; Mask 1,3 takes doubleword 1 of %1 followed by doubleword 1 of %2. The
; patterns expect a single ilvod.d on the vectors loaded from $5 and $6.
485 define void @ilvod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
486 ; CHECK: ilvod_v2i64_0:
488 %1 = load <2 x i64>* %a
489 ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
490 %2 = load <2 x i64>* %b
491 ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
492 %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
493 ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
494 store <2 x i64> %3, <2 x i64>* %c
495 ; CHECK-DAG: st.d [[R3]], 0($4)
498 ; CHECK: .size ilvod_v2i64_0
; The mask interleaves bytes 0..7 of %1 with bytes 0..7 of %2 (indices
; 16..23), i.e. the lower-indexed half of each operand. The patterns expect
; a single ilvl.b on the vectors loaded from $5 and $6.
501 define void @ilvl_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
502 ; CHECK: ilvl_v16i8_0:
504 %1 = load <16 x i8>* %a
505 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
506 %2 = load <16 x i8>* %b
507 ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
508 %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
509 <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
510 ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
511 store <16 x i8> %3, <16 x i8>* %c
512 ; CHECK-DAG: st.b [[R3]], 0($4)
515 ; CHECK: .size ilvl_v16i8_0
; The mask interleaves halfwords 0..3 of %1 with halfwords 0..3 of %2
; (indices 8..11), i.e. the lower-indexed half of each operand. The patterns
; expect a single ilvl.h on the vectors loaded from $5 and $6.
518 define void @ilvl_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
519 ; CHECK: ilvl_v8i16_0:
521 %1 = load <8 x i16>* %a
522 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
523 %2 = load <8 x i16>* %b
524 ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
525 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
526 ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
527 store <8 x i16> %3, <8 x i16>* %c
528 ; CHECK-DAG: st.h [[R3]], 0($4)
531 ; CHECK: .size ilvl_v8i16_0
; The mask interleaves words 0,1 of %1 with words 0,1 of %2 (indices 4,5),
; i.e. the lower-indexed half of each operand. The patterns expect a single
; ilvl.w on the vectors loaded from $5 and $6.
534 define void @ilvl_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
535 ; CHECK: ilvl_v4i32_0:
537 %1 = load <4 x i32>* %a
538 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
539 %2 = load <4 x i32>* %b
540 ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
541 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
542 ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
543 store <4 x i32> %3, <4 x i32>* %c
544 ; CHECK-DAG: st.w [[R3]], 0($4)
547 ; CHECK: .size ilvl_v4i32_0
; Mask 0,2 takes doubleword 0 of %1 followed by doubleword 0 of %2. This is
; the same mask as ilvev_v2i64_0, and the patterns accept ilvev.d here (see
; the in-body note on the equivalence for two-element vectors).
550 define void @ilvl_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
551 ; CHECK: ilvl_v2i64_0:
553 %1 = load <2 x i64>* %a
554 ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
555 %2 = load <2 x i64>* %b
556 ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
557 %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
558 ; ilvl.d and ilvev.d are equivalent for v2i64
559 ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
560 store <2 x i64> %3, <2 x i64>* %c
561 ; CHECK-DAG: st.d [[R3]], 0($4)
564 ; CHECK: .size ilvl_v2i64_0
; The mask interleaves bytes 8..15 of %1 with bytes 8..15 of %2 (indices
; 24..31), i.e. the higher-indexed half of each operand. The patterns expect
; a single ilvr.b on the vectors loaded from $5 and $6.
567 define void @ilvr_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
568 ; CHECK: ilvr_v16i8_0:
570 %1 = load <16 x i8>* %a
571 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
572 %2 = load <16 x i8>* %b
573 ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
574 %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
575 <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
576 ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
577 store <16 x i8> %3, <16 x i8>* %c
578 ; CHECK-DAG: st.b [[R3]], 0($4)
581 ; CHECK: .size ilvr_v16i8_0
; The mask interleaves halfwords 4..7 of %1 with halfwords 4..7 of %2
; (indices 12..15), i.e. the higher-indexed half of each operand. The
; patterns expect a single ilvr.h on the vectors loaded from $5 and $6.
584 define void @ilvr_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
585 ; CHECK: ilvr_v8i16_0:
587 %1 = load <8 x i16>* %a
588 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
589 %2 = load <8 x i16>* %b
590 ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
591 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
592 ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
593 store <8 x i16> %3, <8 x i16>* %c
594 ; CHECK-DAG: st.h [[R3]], 0($4)
597 ; CHECK: .size ilvr_v8i16_0
; The mask interleaves words 2,3 of %1 with words 2,3 of %2 (indices 6,7),
; i.e. the higher-indexed half of each operand. The patterns expect a single
; ilvr.w on the vectors loaded from $5 and $6.
600 define void @ilvr_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
601 ; CHECK: ilvr_v4i32_0:
603 %1 = load <4 x i32>* %a
604 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
605 %2 = load <4 x i32>* %b
606 ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
607 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
608 ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
609 store <4 x i32> %3, <4 x i32>* %c
610 ; CHECK-DAG: st.w [[R3]], 0($4)
613 ; CHECK: .size ilvr_v4i32_0
; Mask 1,3 takes doubleword 1 of %1 followed by doubleword 1 of %2. This is
; the same mask as ilvod_v2i64_0, and the patterns accept ilvod.d here (see
; the in-body note on the equivalence for two-element vectors).
616 define void @ilvr_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
617 ; CHECK: ilvr_v2i64_0:
619 %1 = load <2 x i64>* %a
620 ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
621 %2 = load <2 x i64>* %b
622 ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
623 %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
624 ; ilvr.d and ilvod.d are equivalent for v2i64
625 ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
626 store <2 x i64> %3, <2 x i64>* %c
627 ; CHECK-DAG: st.d [[R3]], 0($4)
630 ; CHECK: .size ilvr_v2i64_0
; The mask takes all even-indexed bytes of %1 (0,2,..,14) followed by all
; even-indexed bytes of %2 (16,18,..,30). The patterns expect a single
; pckev.b on the vectors loaded from $5 and $6.
633 define void @pckev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
634 ; CHECK: pckev_v16i8_0:
636 %1 = load <16 x i8>* %a
637 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
638 %2 = load <16 x i8>* %b
639 ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
640 %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
641 <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
642 ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
643 store <16 x i8> %3, <16 x i8>* %c
644 ; CHECK-DAG: st.b [[R3]], 0($4)
647 ; CHECK: .size pckev_v16i8_0
; The mask takes all even-indexed halfwords of %1 (0,2,4,6) followed by all
; even-indexed halfwords of %2 (8,10,12,14). The patterns expect a single
; pckev.h on the vectors loaded from $5 and $6.
650 define void @pckev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
651 ; CHECK: pckev_v8i16_0:
653 %1 = load <8 x i16>* %a
654 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
655 %2 = load <8 x i16>* %b
656 ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
657 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
658 ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
659 store <8 x i16> %3, <8 x i16>* %c
660 ; CHECK-DAG: st.h [[R3]], 0($4)
663 ; CHECK: .size pckev_v8i16_0
; The mask takes the even-indexed words of %1 (0,2) followed by the
; even-indexed words of %2 (4,6). The patterns expect a single pckev.w on
; the vectors loaded from $5 and $6.
666 define void @pckev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
667 ; CHECK: pckev_v4i32_0:
669 %1 = load <4 x i32>* %a
670 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
671 %2 = load <4 x i32>* %b
672 ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
673 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
674 ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
675 store <4 x i32> %3, <4 x i32>* %c
676 ; CHECK-DAG: st.w [[R3]], 0($4)
679 ; CHECK: .size pckev_v4i32_0
; Mask 0,2 takes doubleword 0 of %1 followed by doubleword 0 of %2. This is
; the same mask as ilvev_v2i64_0, and the patterns accept ilvev.d here (see
; the in-body note on the equivalence for two-element vectors).
682 define void @pckev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
683 ; CHECK: pckev_v2i64_0:
685 %1 = load <2 x i64>* %a
686 ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
687 %2 = load <2 x i64>* %b
688 ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
689 %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
690 ; pckev.d and ilvev.d are equivalent for v2i64
691 ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
692 store <2 x i64> %3, <2 x i64>* %c
693 ; CHECK-DAG: st.d [[R3]], 0($4)
696 ; CHECK: .size pckev_v2i64_0
; The mask takes all odd-indexed bytes of %1 (1,3,..,15) followed by all
; odd-indexed bytes of %2 (17,19,..,31). The patterns expect a single
; pckod.b on the vectors loaded from $5 and $6.
699 define void @pckod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
700 ; CHECK: pckod_v16i8_0:
702 %1 = load <16 x i8>* %a
703 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
704 %2 = load <16 x i8>* %b
705 ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
706 %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
707 <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
708 ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
709 store <16 x i8> %3, <16 x i8>* %c
710 ; CHECK-DAG: st.b [[R3]], 0($4)
713 ; CHECK: .size pckod_v16i8_0
; The mask takes all odd-indexed halfwords of %1 (1,3,5,7) followed by all
; odd-indexed halfwords of %2 (9,11,13,15). The patterns expect a single
; pckod.h on the vectors loaded from $5 and $6.
716 define void @pckod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
717 ; CHECK: pckod_v8i16_0:
719 %1 = load <8 x i16>* %a
720 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
721 %2 = load <8 x i16>* %b
722 ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
723 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
724 ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
725 store <8 x i16> %3, <8 x i16>* %c
726 ; CHECK-DAG: st.h [[R3]], 0($4)
729 ; CHECK: .size pckod_v8i16_0
; The mask takes the odd-indexed words of %1 (1,3) followed by the
; odd-indexed words of %2 (5,7). The patterns expect a single pckod.w on the
; vectors loaded from $5 and $6.
732 define void @pckod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
733 ; CHECK: pckod_v4i32_0:
735 %1 = load <4 x i32>* %a
736 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
737 %2 = load <4 x i32>* %b
738 ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
739 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
740 ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
741 store <4 x i32> %3, <4 x i32>* %c
742 ; CHECK-DAG: st.w [[R3]], 0($4)
745 ; CHECK: .size pckod_v4i32_0
; Mask 1,3 takes doubleword 1 of %1 followed by doubleword 1 of %2. This is
; the same mask as ilvod_v2i64_0, and the patterns accept ilvod.d here (see
; the in-body note on the equivalence for two-element vectors).
748 define void @pckod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
749 ; CHECK: pckod_v2i64_0:
751 %1 = load <2 x i64>* %a
752 ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
753 %2 = load <2 x i64>* %b
754 ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
755 %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
756 ; pckod.d and ilvod.d are equivalent for v2i64
757 ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
758 store <2 x i64> %3, <2 x i64>* %c
759 ; CHECK-DAG: st.d [[R3]], 0($4)
762 ; CHECK: .size pckod_v2i64_0
; Splat of byte 4 of %a: every mask index is 4 and the second operand is
; undef. The patterns expect splati.b with element index 4. Unlike the tests
; above, this function takes only %c and %a.
765 define void @splati_v16i8_0(<16 x i8>* %c, <16 x i8>* %a) nounwind {
766 ; CHECK: splati_v16i8_0:
768 %1 = load <16 x i8>* %a
769 ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
770 %2 = shufflevector <16 x i8> %1, <16 x i8> undef,
771 <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
772 ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][4]
773 store <16 x i8> %2, <16 x i8>* %c
774 ; CHECK-DAG: st.b [[R3]], 0($4)
777 ; CHECK: .size splati_v16i8_0
; Splat of halfword 4 of %a: every mask index is 4 and the second operand is
; undef. The patterns expect splati.h with element index 4.
780 define void @splati_v8i16_0(<8 x i16>* %c, <8 x i16>* %a) nounwind {
781 ; CHECK: splati_v8i16_0:
783 %1 = load <8 x i16>* %a
784 ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
785 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
786 ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][4]
787 store <8 x i16> %2, <8 x i16>* %c
788 ; CHECK-DAG: st.h [[R3]], 0($4)
791 ; CHECK: .size splati_v8i16_0
; Splat of word 3 of %a: every mask index is 3 and the second operand is
; undef. The patterns accept shf.w with immediate 255 (0b11111111: index 3
; in each 2-bit field) in place of a splati.w, as the in-body note says.
794 define void @splati_v4i32_0(<4 x i32>* %c, <4 x i32>* %a) nounwind {
795 ; CHECK: splati_v4i32_0:
797 %1 = load <4 x i32>* %a
798 ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
799 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
800 ; shf.w and splati.w are equivalent
801 ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 255
802 store <4 x i32> %2, <4 x i32>* %c
803 ; CHECK-DAG: st.w [[R3]], 0($4)
806 ; CHECK: .size splati_v4i32_0
; Splat of doubleword 1 of %a: both mask indices are 1 and the second
; operand is undef. The patterns expect splati.d with element index 1.
809 define void @splati_v2i64_0(<2 x i64>* %c, <2 x i64>* %a) nounwind {
810 ; CHECK: splati_v2i64_0:
812 %1 = load <2 x i64>* %a
813 ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
814 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
815 ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
816 store <2 x i64> %2, <2 x i64>* %c
817 ; CHECK-DAG: st.d [[R3]], 0($4)
820 ; CHECK: .size splati_v2i64_0