1 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
2 ; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
3 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
7 ; Verify that the DAG combiner correctly folds bitwise operations across
8 ; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
9 ; basic and always-safe patterns. Also test that the DAG combiner will combine
10 ; target-specific shuffle instructions where reasonable.
12 target triple = "x86_64-unknown-unknown"
; Declarations of the SSE2 shuffle intrinsics exercised by the tests below.
; Each takes an i8 immediate encoding the shuffle mask:
;   pshuf.d  -> PSHUFD:  shuffles all four i32 lanes.
;   pshufl.w -> PSHUFLW: shuffles the low four i16 lanes; the high four
;                        lanes pass through unchanged (see the
;                        "xmm0[3,2,1,0,4,5,6,7]" CHECK patterns below).
;   pshufh.w -> PSHUFHW: shuffles the high four i16 lanes; the low four
;                        lanes pass through unchanged (see the
;                        "xmm0[0,1,2,3,7,6,5,4]" CHECK patterns below).
14 declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
15 declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
16 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
18 define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
19 ; ALL-LABEL: combine_pshufd1:
20 ; ALL: # BB#0: # %entry
23 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
24 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
28 define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
29 ; ALL-LABEL: combine_pshufd2:
30 ; ALL: # BB#0: # %entry
33 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
34 %b.cast = bitcast <4 x i32> %b to <8 x i16>
35 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
36 %c.cast = bitcast <8 x i16> %c to <4 x i32>
37 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
41 define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
42 ; ALL-LABEL: combine_pshufd3:
43 ; ALL: # BB#0: # %entry
46 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
47 %b.cast = bitcast <4 x i32> %b to <8 x i16>
48 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
49 %c.cast = bitcast <8 x i16> %c to <4 x i32>
50 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
54 define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
55 ; SSE-LABEL: combine_pshufd4:
56 ; SSE: # BB#0: # %entry
57 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
60 ; AVX-LABEL: combine_pshufd4:
61 ; AVX: # BB#0: # %entry
62 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
65 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
66 %b.cast = bitcast <4 x i32> %b to <8 x i16>
67 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
68 %c.cast = bitcast <8 x i16> %c to <4 x i32>
69 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
73 define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
74 ; SSE-LABEL: combine_pshufd5:
75 ; SSE: # BB#0: # %entry
76 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
79 ; AVX-LABEL: combine_pshufd5:
80 ; AVX: # BB#0: # %entry
81 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
84 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
85 %b.cast = bitcast <4 x i32> %b to <8 x i16>
86 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
87 %c.cast = bitcast <8 x i16> %c to <4 x i32>
88 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
92 define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
93 ; SSE-LABEL: combine_pshufd6:
94 ; SSE: # BB#0: # %entry
95 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
98 ; AVX-LABEL: combine_pshufd6:
99 ; AVX: # BB#0: # %entry
100 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
103 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
104 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
108 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
109 ; ALL-LABEL: combine_pshuflw1:
110 ; ALL: # BB#0: # %entry
113 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
114 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
118 define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
119 ; ALL-LABEL: combine_pshuflw2:
120 ; ALL: # BB#0: # %entry
123 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
124 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
125 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
129 define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
130 ; SSE-LABEL: combine_pshuflw3:
131 ; SSE: # BB#0: # %entry
132 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
135 ; AVX-LABEL: combine_pshuflw3:
136 ; AVX: # BB#0: # %entry
137 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
140 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
141 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
142 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
146 define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
147 ; SSE-LABEL: combine_pshufhw1:
148 ; SSE: # BB#0: # %entry
149 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
152 ; AVX-LABEL: combine_pshufhw1:
153 ; AVX: # BB#0: # %entry
154 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
157 %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
158 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
159 %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
163 define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
164 ; SSE-LABEL: combine_bitwise_ops_test1:
166 ; SSE-NEXT: pand %xmm1, %xmm0
167 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
170 ; AVX-LABEL: combine_bitwise_ops_test1:
172 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
173 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
175 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
176 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
177 %and = and <4 x i32> %shuf1, %shuf2
181 define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
182 ; SSE-LABEL: combine_bitwise_ops_test2:
184 ; SSE-NEXT: por %xmm1, %xmm0
185 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
188 ; AVX-LABEL: combine_bitwise_ops_test2:
190 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
191 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
193 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
194 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
195 %or = or <4 x i32> %shuf1, %shuf2
199 define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
200 ; SSE-LABEL: combine_bitwise_ops_test3:
202 ; SSE-NEXT: pxor %xmm1, %xmm0
203 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
206 ; AVX-LABEL: combine_bitwise_ops_test3:
208 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
209 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
211 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
212 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
213 %xor = xor <4 x i32> %shuf1, %shuf2
217 define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
218 ; SSE-LABEL: combine_bitwise_ops_test4:
220 ; SSE-NEXT: pand %xmm1, %xmm0
221 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
224 ; AVX-LABEL: combine_bitwise_ops_test4:
226 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
227 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
229 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
230 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
231 %and = and <4 x i32> %shuf1, %shuf2
235 define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
236 ; SSE-LABEL: combine_bitwise_ops_test5:
238 ; SSE-NEXT: por %xmm1, %xmm0
239 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
242 ; AVX-LABEL: combine_bitwise_ops_test5:
244 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
245 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
247 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
248 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
249 %or = or <4 x i32> %shuf1, %shuf2
253 define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
254 ; SSE-LABEL: combine_bitwise_ops_test6:
256 ; SSE-NEXT: pxor %xmm1, %xmm0
257 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
260 ; AVX-LABEL: combine_bitwise_ops_test6:
262 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
263 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
265 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
266 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
267 %xor = xor <4 x i32> %shuf1, %shuf2
272 ; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
273 ; are not performing a swizzle operation.
275 define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
276 ; SSE2-LABEL: combine_bitwise_ops_test1b:
278 ; SSE2-NEXT: pand %xmm1, %xmm0
279 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
280 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
281 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
284 ; SSSE3-LABEL: combine_bitwise_ops_test1b:
286 ; SSSE3-NEXT: pand %xmm1, %xmm0
287 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
288 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
289 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
292 ; SSE41-LABEL: combine_bitwise_ops_test1b:
294 ; SSE41-NEXT: pand %xmm1, %xmm0
295 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
298 ; AVX1-LABEL: combine_bitwise_ops_test1b:
300 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
301 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
304 ; AVX2-LABEL: combine_bitwise_ops_test1b:
306 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
307 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
309 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
310 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
311 %and = and <4 x i32> %shuf1, %shuf2
315 define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
316 ; SSE2-LABEL: combine_bitwise_ops_test2b:
318 ; SSE2-NEXT: por %xmm1, %xmm0
319 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
320 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
321 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
324 ; SSSE3-LABEL: combine_bitwise_ops_test2b:
326 ; SSSE3-NEXT: por %xmm1, %xmm0
327 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
328 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
329 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
332 ; SSE41-LABEL: combine_bitwise_ops_test2b:
334 ; SSE41-NEXT: por %xmm1, %xmm0
335 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
338 ; AVX1-LABEL: combine_bitwise_ops_test2b:
340 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
341 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
344 ; AVX2-LABEL: combine_bitwise_ops_test2b:
346 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
347 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
349 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
350 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
351 %or = or <4 x i32> %shuf1, %shuf2
355 define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
356 ; SSE2-LABEL: combine_bitwise_ops_test3b:
358 ; SSE2-NEXT: xorps %xmm1, %xmm0
359 ; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
362 ; SSSE3-LABEL: combine_bitwise_ops_test3b:
364 ; SSSE3-NEXT: xorps %xmm1, %xmm0
365 ; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
368 ; SSE41-LABEL: combine_bitwise_ops_test3b:
370 ; SSE41-NEXT: pxor %xmm1, %xmm0
371 ; SSE41-NEXT: pxor %xmm1, %xmm1
372 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
375 ; AVX1-LABEL: combine_bitwise_ops_test3b:
377 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
378 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
379 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
382 ; AVX2-LABEL: combine_bitwise_ops_test3b:
384 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
385 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
386 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
388 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
389 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
390 %xor = xor <4 x i32> %shuf1, %shuf2
394 define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
395 ; SSE2-LABEL: combine_bitwise_ops_test4b:
397 ; SSE2-NEXT: pand %xmm1, %xmm0
398 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
399 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
400 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
403 ; SSSE3-LABEL: combine_bitwise_ops_test4b:
405 ; SSSE3-NEXT: pand %xmm1, %xmm0
406 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
407 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
408 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
411 ; SSE41-LABEL: combine_bitwise_ops_test4b:
413 ; SSE41-NEXT: pand %xmm1, %xmm0
414 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
417 ; AVX1-LABEL: combine_bitwise_ops_test4b:
419 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
420 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
423 ; AVX2-LABEL: combine_bitwise_ops_test4b:
425 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
426 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
428 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
429 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
430 %and = and <4 x i32> %shuf1, %shuf2
434 define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
435 ; SSE2-LABEL: combine_bitwise_ops_test5b:
437 ; SSE2-NEXT: por %xmm1, %xmm0
438 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
439 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
440 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
443 ; SSSE3-LABEL: combine_bitwise_ops_test5b:
445 ; SSSE3-NEXT: por %xmm1, %xmm0
446 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
447 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
448 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
451 ; SSE41-LABEL: combine_bitwise_ops_test5b:
453 ; SSE41-NEXT: por %xmm1, %xmm0
454 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
457 ; AVX1-LABEL: combine_bitwise_ops_test5b:
459 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
460 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
463 ; AVX2-LABEL: combine_bitwise_ops_test5b:
465 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
466 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
468 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
469 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
470 %or = or <4 x i32> %shuf1, %shuf2
474 define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
475 ; SSE2-LABEL: combine_bitwise_ops_test6b:
477 ; SSE2-NEXT: xorps %xmm1, %xmm0
478 ; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
481 ; SSSE3-LABEL: combine_bitwise_ops_test6b:
483 ; SSSE3-NEXT: xorps %xmm1, %xmm0
484 ; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
487 ; SSE41-LABEL: combine_bitwise_ops_test6b:
489 ; SSE41-NEXT: pxor %xmm1, %xmm0
490 ; SSE41-NEXT: pxor %xmm1, %xmm1
491 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
494 ; AVX1-LABEL: combine_bitwise_ops_test6b:
496 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
497 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
498 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
501 ; AVX2-LABEL: combine_bitwise_ops_test6b:
503 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
504 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
505 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
507 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
508 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
509 %xor = xor <4 x i32> %shuf1, %shuf2
513 define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
514 ; SSE2-LABEL: combine_bitwise_ops_test1c:
516 ; SSE2-NEXT: andps %xmm1, %xmm0
517 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
520 ; SSSE3-LABEL: combine_bitwise_ops_test1c:
522 ; SSSE3-NEXT: andps %xmm1, %xmm0
523 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
526 ; SSE41-LABEL: combine_bitwise_ops_test1c:
528 ; SSE41-NEXT: pand %xmm1, %xmm0
529 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
530 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
533 ; AVX1-LABEL: combine_bitwise_ops_test1c:
535 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
536 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
537 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
540 ; AVX2-LABEL: combine_bitwise_ops_test1c:
542 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
543 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
544 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
546 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
547 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
548 %and = and <4 x i32> %shuf1, %shuf2
552 define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
553 ; SSE2-LABEL: combine_bitwise_ops_test2c:
555 ; SSE2-NEXT: orps %xmm1, %xmm0
556 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
559 ; SSSE3-LABEL: combine_bitwise_ops_test2c:
561 ; SSSE3-NEXT: orps %xmm1, %xmm0
562 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
565 ; SSE41-LABEL: combine_bitwise_ops_test2c:
567 ; SSE41-NEXT: por %xmm1, %xmm0
568 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
569 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
572 ; AVX1-LABEL: combine_bitwise_ops_test2c:
574 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
575 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
576 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
579 ; AVX2-LABEL: combine_bitwise_ops_test2c:
581 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
582 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
583 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
585 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
586 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
587 %or = or <4 x i32> %shuf1, %shuf2
591 define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
592 ; SSE2-LABEL: combine_bitwise_ops_test3c:
594 ; SSE2-NEXT: xorps %xmm1, %xmm0
595 ; SSE2-NEXT: xorps %xmm1, %xmm1
596 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
599 ; SSSE3-LABEL: combine_bitwise_ops_test3c:
601 ; SSSE3-NEXT: xorps %xmm1, %xmm0
602 ; SSSE3-NEXT: xorps %xmm1, %xmm1
603 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
606 ; SSE41-LABEL: combine_bitwise_ops_test3c:
608 ; SSE41-NEXT: pxor %xmm1, %xmm0
609 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
610 ; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
613 ; AVX-LABEL: combine_bitwise_ops_test3c:
615 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
616 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
617 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
619 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
620 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
621 %xor = xor <4 x i32> %shuf1, %shuf2
625 define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
626 ; SSE2-LABEL: combine_bitwise_ops_test4c:
628 ; SSE2-NEXT: andps %xmm1, %xmm0
629 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
630 ; SSE2-NEXT: movaps %xmm2, %xmm0
633 ; SSSE3-LABEL: combine_bitwise_ops_test4c:
635 ; SSSE3-NEXT: andps %xmm1, %xmm0
636 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
637 ; SSSE3-NEXT: movaps %xmm2, %xmm0
640 ; SSE41-LABEL: combine_bitwise_ops_test4c:
642 ; SSE41-NEXT: pand %xmm1, %xmm0
643 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
644 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
647 ; AVX1-LABEL: combine_bitwise_ops_test4c:
649 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
650 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
651 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
654 ; AVX2-LABEL: combine_bitwise_ops_test4c:
656 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
657 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
658 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
660 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
661 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
662 %and = and <4 x i32> %shuf1, %shuf2
666 define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
667 ; SSE2-LABEL: combine_bitwise_ops_test5c:
669 ; SSE2-NEXT: orps %xmm1, %xmm0
670 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
671 ; SSE2-NEXT: movaps %xmm2, %xmm0
674 ; SSSE3-LABEL: combine_bitwise_ops_test5c:
676 ; SSSE3-NEXT: orps %xmm1, %xmm0
677 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
678 ; SSSE3-NEXT: movaps %xmm2, %xmm0
681 ; SSE41-LABEL: combine_bitwise_ops_test5c:
683 ; SSE41-NEXT: por %xmm1, %xmm0
684 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
685 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
688 ; AVX1-LABEL: combine_bitwise_ops_test5c:
690 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
691 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
692 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
695 ; AVX2-LABEL: combine_bitwise_ops_test5c:
697 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
698 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
699 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
701 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
702 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
703 %or = or <4 x i32> %shuf1, %shuf2
707 define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
708 ; SSE2-LABEL: combine_bitwise_ops_test6c:
710 ; SSE2-NEXT: xorps %xmm1, %xmm0
711 ; SSE2-NEXT: xorps %xmm1, %xmm1
712 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
713 ; SSE2-NEXT: movaps %xmm1, %xmm0
716 ; SSSE3-LABEL: combine_bitwise_ops_test6c:
718 ; SSSE3-NEXT: xorps %xmm1, %xmm0
719 ; SSSE3-NEXT: xorps %xmm1, %xmm1
720 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
721 ; SSSE3-NEXT: movaps %xmm1, %xmm0
724 ; SSE41-LABEL: combine_bitwise_ops_test6c:
726 ; SSE41-NEXT: pxor %xmm1, %xmm0
727 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
728 ; SSE41-NEXT: pxor %xmm0, %xmm0
729 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
732 ; AVX1-LABEL: combine_bitwise_ops_test6c:
734 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
735 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
736 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
737 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
740 ; AVX2-LABEL: combine_bitwise_ops_test6c:
742 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
743 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
744 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
745 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
747 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
748 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
749 %xor = xor <4 x i32> %shuf1, %shuf2
753 define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
754 ; SSE-LABEL: combine_nested_undef_test1:
756 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
759 ; AVX-LABEL: combine_nested_undef_test1:
761 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
763 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
764 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
768 define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
769 ; SSE-LABEL: combine_nested_undef_test2:
771 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
774 ; AVX-LABEL: combine_nested_undef_test2:
776 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
778 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
779 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
783 define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
784 ; SSE-LABEL: combine_nested_undef_test3:
786 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
789 ; AVX-LABEL: combine_nested_undef_test3:
791 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
793 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
794 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
798 define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
799 ; SSE-LABEL: combine_nested_undef_test4:
801 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
804 ; AVX1-LABEL: combine_nested_undef_test4:
806 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
809 ; AVX2-LABEL: combine_nested_undef_test4:
811 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
813 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
814 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
818 define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
819 ; SSE-LABEL: combine_nested_undef_test5:
821 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
824 ; AVX-LABEL: combine_nested_undef_test5:
826 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
828 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
829 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
833 define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
834 ; SSE-LABEL: combine_nested_undef_test6:
836 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
839 ; AVX-LABEL: combine_nested_undef_test6:
841 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
843 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
844 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
848 define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
849 ; SSE-LABEL: combine_nested_undef_test7:
851 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
854 ; AVX-LABEL: combine_nested_undef_test7:
856 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
858 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
859 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
863 define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
864 ; SSE-LABEL: combine_nested_undef_test8:
866 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
869 ; AVX-LABEL: combine_nested_undef_test8:
871 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
873 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
874 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
878 define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
879 ; SSE-LABEL: combine_nested_undef_test9:
881 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
884 ; AVX-LABEL: combine_nested_undef_test9:
886 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
888 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
889 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
893 define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
894 ; SSE-LABEL: combine_nested_undef_test10:
896 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
899 ; AVX-LABEL: combine_nested_undef_test10:
901 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
903 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
904 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
908 define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
909 ; SSE-LABEL: combine_nested_undef_test11:
911 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
914 ; AVX-LABEL: combine_nested_undef_test11:
916 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
918 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
919 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
923 define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
924 ; SSE-LABEL: combine_nested_undef_test12:
926 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
929 ; AVX1-LABEL: combine_nested_undef_test12:
931 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
934 ; AVX2-LABEL: combine_nested_undef_test12:
936 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
938 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
939 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
943 ; The following pair of shuffles is folded into vector %A.
944 define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
945 ; ALL-LABEL: combine_nested_undef_test13:
948 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
949 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
953 ; The following pair of shuffles is folded into vector %B.
954 define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
955 ; SSE-LABEL: combine_nested_undef_test14:
957 ; SSE-NEXT: movaps %xmm1, %xmm0
960 ; AVX-LABEL: combine_nested_undef_test14:
962 ; AVX-NEXT: vmovaps %xmm1, %xmm0
964 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
965 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
970 ; Verify that we don't optimize the following cases. We expect more than one shuffle.
972 ; FIXME: Many of these already don't make sense, and the rest should stop
973 ; making sense with the new vector shuffle lowering. Revisit at least testing for
; Negative cases (tests 15-21): the final mask still needs lanes from both %A
; and %B, so the pair must NOT collapse to a single shuffle. The CHECK lines
; expect two-instruction sequences (shufps pairs, or pshufd + blend).
976 define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
977 ; SSE2-LABEL: combine_nested_undef_test15:
979 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
980 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
981 ; SSE2-NEXT: movaps %xmm1, %xmm0
984 ; SSSE3-LABEL: combine_nested_undef_test15:
986 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
987 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
988 ; SSSE3-NEXT: movaps %xmm1, %xmm0
991 ; SSE41-LABEL: combine_nested_undef_test15:
993 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
994 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
995 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
998 ; AVX1-LABEL: combine_nested_undef_test15:
1000 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1001 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1002 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1005 ; AVX2-LABEL: combine_nested_undef_test15:
1007 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
1008 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1009 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1011 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
1012 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1016 define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
1017 ; SSE2-LABEL: combine_nested_undef_test16:
1019 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
1020 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
1021 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1024 ; SSSE3-LABEL: combine_nested_undef_test16:
1026 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
1027 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
1028 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1031 ; SSE41-LABEL: combine_nested_undef_test16:
1033 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1034 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1037 ; AVX1-LABEL: combine_nested_undef_test16:
1039 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1040 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1043 ; AVX2-LABEL: combine_nested_undef_test16:
1045 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1046 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1048 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1049 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1053 define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
1054 ; SSE2-LABEL: combine_nested_undef_test17:
1056 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
1057 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
1060 ; SSSE3-LABEL: combine_nested_undef_test17:
1062 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
1063 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
1066 ; SSE41-LABEL: combine_nested_undef_test17:
1068 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1069 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1072 ; AVX1-LABEL: combine_nested_undef_test17:
1074 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1075 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1078 ; AVX2-LABEL: combine_nested_undef_test17:
1080 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1081 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1083 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
1084 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
; test18 only reads %B lanes after the fold, so a single pshufd of xmm1 is expected.
1088 define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
1089 ; SSE-LABEL: combine_nested_undef_test18:
1091 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
1094 ; AVX-LABEL: combine_nested_undef_test18:
1096 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
1098 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1099 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
1103 define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
1104 ; SSE2-LABEL: combine_nested_undef_test19:
1106 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1107 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,0]
1108 ; SSE2-NEXT: movaps %xmm1, %xmm0
1111 ; SSSE3-LABEL: combine_nested_undef_test19:
1113 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1114 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,0]
1115 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1118 ; SSE41-LABEL: combine_nested_undef_test19:
1120 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1121 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
1124 ; AVX1-LABEL: combine_nested_undef_test19:
1126 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1127 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
1130 ; AVX2-LABEL: combine_nested_undef_test19:
1132 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1133 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
1135 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
1136 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
1140 define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
1141 ; SSE2-LABEL: combine_nested_undef_test20:
1143 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
1144 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1145 ; SSE2-NEXT: movaps %xmm1, %xmm0
1148 ; SSSE3-LABEL: combine_nested_undef_test20:
1150 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
1151 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1152 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1155 ; SSE41-LABEL: combine_nested_undef_test20:
1157 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1158 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1161 ; AVX1-LABEL: combine_nested_undef_test20:
1163 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1164 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1167 ; AVX2-LABEL: combine_nested_undef_test20:
1169 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1170 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1172 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
1173 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1177 define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
1178 ; SSE2-LABEL: combine_nested_undef_test21:
1180 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
1181 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
1182 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1185 ; SSSE3-LABEL: combine_nested_undef_test21:
1187 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
1188 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
1189 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1192 ; SSE41-LABEL: combine_nested_undef_test21:
1194 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1195 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1198 ; AVX1-LABEL: combine_nested_undef_test21:
1200 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1201 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1204 ; AVX2-LABEL: combine_nested_undef_test21:
1206 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1207 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
1209 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
1210 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
; Tests 22-28: the outer undef shuffle only keeps lanes from one source of the
; inner shuffle, so the pair folds to a single pshufd of that source
; (xmm1 for 22-24, xmm0 for 25-28, per the CHECK lines).
1215 ; Test that we correctly combine shuffles according to rule
1216 ; shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
1218 define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
1219 ; SSE-LABEL: combine_nested_undef_test22:
1221 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1224 ; AVX-LABEL: combine_nested_undef_test22:
1226 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1228 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1229 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
1233 define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
1234 ; SSE-LABEL: combine_nested_undef_test23:
1236 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1239 ; AVX-LABEL: combine_nested_undef_test23:
1241 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1243 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1244 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1248 define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
1249 ; SSE-LABEL: combine_nested_undef_test24:
1251 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
1254 ; AVX-LABEL: combine_nested_undef_test24:
1256 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
1258 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1259 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
1263 define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
1264 ; SSE-LABEL: combine_nested_undef_test25:
1266 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1269 ; AVX1-LABEL: combine_nested_undef_test25:
1271 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1274 ; AVX2-LABEL: combine_nested_undef_test25:
1276 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
1278 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
1279 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
1283 define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
1284 ; SSE-LABEL: combine_nested_undef_test26:
1286 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1289 ; AVX-LABEL: combine_nested_undef_test26:
1291 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1293 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
1294 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
1298 define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
1299 ; SSE-LABEL: combine_nested_undef_test27:
1301 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1304 ; AVX1-LABEL: combine_nested_undef_test27:
1306 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1309 ; AVX2-LABEL: combine_nested_undef_test27:
1311 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
1313 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
1314 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
1318 define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
1319 ; SSE-LABEL: combine_nested_undef_test28:
1321 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1324 ; AVX-LABEL: combine_nested_undef_test28:
1326 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1328 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
1329 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
; combine_test1-10: two-input shuffle pairs where the second shuffle reuses %b.
; The CHECK lines expect each pair to fold to at most one move, blend, or
; unpck instruction (float variants 1-5, integer variants 6-10 mirror them).
1333 define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
1334 ; SSE-LABEL: combine_test1:
1336 ; SSE-NEXT: movaps %xmm1, %xmm0
1339 ; AVX-LABEL: combine_test1:
1341 ; AVX-NEXT: vmovaps %xmm1, %xmm0
1343 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1344 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1348 define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
1349 ; SSE2-LABEL: combine_test2:
1351 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1352 ; SSE2-NEXT: movaps %xmm1, %xmm0
1355 ; SSSE3-LABEL: combine_test2:
1357 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1358 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1361 ; SSE41-LABEL: combine_test2:
1363 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1366 ; AVX-LABEL: combine_test2:
1368 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1370 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1371 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1375 define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
1376 ; SSE-LABEL: combine_test3:
1378 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1381 ; AVX-LABEL: combine_test3:
1383 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1385 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1386 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1390 define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
1391 ; SSE-LABEL: combine_test4:
1393 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1394 ; SSE-NEXT: movapd %xmm1, %xmm0
1397 ; AVX-LABEL: combine_test4:
1399 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1401 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1402 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1406 define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
1407 ; SSE2-LABEL: combine_test5:
1409 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1410 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1413 ; SSSE3-LABEL: combine_test5:
1415 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1416 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1419 ; SSE41-LABEL: combine_test5:
1421 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1424 ; AVX-LABEL: combine_test5:
1426 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1428 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1429 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1433 define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
1434 ; SSE-LABEL: combine_test6:
1436 ; SSE-NEXT: movaps %xmm1, %xmm0
1439 ; AVX-LABEL: combine_test6:
1441 ; AVX-NEXT: vmovaps %xmm1, %xmm0
1443 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1444 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1448 define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
1449 ; SSE2-LABEL: combine_test7:
1451 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1452 ; SSE2-NEXT: movaps %xmm1, %xmm0
1455 ; SSSE3-LABEL: combine_test7:
1457 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1458 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1461 ; SSE41-LABEL: combine_test7:
1463 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1466 ; AVX1-LABEL: combine_test7:
1468 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1471 ; AVX2-LABEL: combine_test7:
1473 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1475 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1476 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1480 define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
1481 ; SSE-LABEL: combine_test8:
1483 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1486 ; AVX-LABEL: combine_test8:
1488 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1490 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1491 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1495 define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
1496 ; SSE-LABEL: combine_test9:
1498 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1499 ; SSE-NEXT: movdqa %xmm1, %xmm0
1502 ; AVX-LABEL: combine_test9:
1504 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1506 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1507 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1511 define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
1512 ; SSE2-LABEL: combine_test10:
1514 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1515 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1518 ; SSSE3-LABEL: combine_test10:
1520 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1521 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1524 ; SSE41-LABEL: combine_test10:
1526 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1529 ; AVX1-LABEL: combine_test10:
1531 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1534 ; AVX2-LABEL: combine_test10:
1536 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1538 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1539 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
; combine_test11-20: the mirror image of tests 1-10 — here the second shuffle
; reuses %a as its second operand. Expected codegen per CHECK lines is again a
; single move/blend/unpck (test11 and test16 fold away entirely: no CHECK-NEXT).
1543 define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
1544 ; ALL-LABEL: combine_test11:
1547 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1548 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1552 define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
1553 ; SSE2-LABEL: combine_test12:
1555 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1556 ; SSE2-NEXT: movaps %xmm1, %xmm0
1559 ; SSSE3-LABEL: combine_test12:
1561 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1562 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1565 ; SSE41-LABEL: combine_test12:
1567 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1570 ; AVX-LABEL: combine_test12:
1572 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1574 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1575 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1579 define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
1580 ; SSE-LABEL: combine_test13:
1582 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1585 ; AVX-LABEL: combine_test13:
1587 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1589 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1590 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1594 define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
1595 ; SSE-LABEL: combine_test14:
1597 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1600 ; AVX-LABEL: combine_test14:
1602 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1604 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1605 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1609 define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
1610 ; SSE2-LABEL: combine_test15:
1612 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1613 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1616 ; SSSE3-LABEL: combine_test15:
1618 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1619 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1622 ; SSE41-LABEL: combine_test15:
1624 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1627 ; AVX-LABEL: combine_test15:
1629 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1631 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1632 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1636 define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
1637 ; ALL-LABEL: combine_test16:
1640 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1641 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1645 define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
1646 ; SSE2-LABEL: combine_test17:
1648 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1649 ; SSE2-NEXT: movaps %xmm1, %xmm0
1652 ; SSSE3-LABEL: combine_test17:
1654 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1655 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1658 ; SSE41-LABEL: combine_test17:
1660 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1663 ; AVX1-LABEL: combine_test17:
1665 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1668 ; AVX2-LABEL: combine_test17:
1670 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1672 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1673 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1677 define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
1678 ; SSE-LABEL: combine_test18:
1680 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1683 ; AVX-LABEL: combine_test18:
1685 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1687 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1688 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1692 define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
1693 ; SSE-LABEL: combine_test19:
1695 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1698 ; AVX-LABEL: combine_test19:
1700 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1702 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1703 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1707 define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
1708 ; SSE2-LABEL: combine_test20:
1710 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1711 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1714 ; SSSE3-LABEL: combine_test20:
1716 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1717 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1720 ; SSE41-LABEL: combine_test20:
1722 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1725 ; AVX1-LABEL: combine_test20:
1727 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1730 ; AVX2-LABEL: combine_test20:
1732 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1734 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1735 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; Splits a <8 x i32> into its even/odd qword pairs with two self-shuffles:
; %1 (lanes 0,1,4,5) is stored through %ptr; the CHECK lines expect a
; punpcklqdq for the store value and a punpckhqdq for the other half.
; NOTE(review): the ret is outside this excerpt — presumably %2 is returned.
1739 define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
1740 ; SSE-LABEL: combine_test21:
1742 ; SSE-NEXT: movdqa %xmm0, %xmm2
1743 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1744 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1745 ; SSE-NEXT: movdqa %xmm2, (%rdi)
1748 ; AVX1-LABEL: combine_test21:
1750 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1751 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
1752 ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1753 ; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
1754 ; AVX1-NEXT: vzeroupper
1757 ; AVX2-LABEL: combine_test21:
1759 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1760 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
1761 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1762 ; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
1763 ; AVX2-NEXT: vzeroupper
1765 %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1766 %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1767 store <4 x i32> %1, <4 x i32>* %ptr, align 16
; Concatenating two loaded <2 x float> values (high 4 lanes undef) should
; lower to a single movq load plus a movhpd merge of the second pointer.
1771 define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
1772 ; SSE-LABEL: combine_test22:
1774 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1775 ; SSE-NEXT: movhpd (%rsi), %xmm0
1778 ; AVX-LABEL: combine_test22:
1780 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1781 ; AVX-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
1783 ; Current AVX2 lowering of this is still awful, not adding a test case.
1784 %1 = load <2 x float>* %a, align 8
1785 %2 = load <2 x float>* %b, align 8
1786 %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Tests 1b-4b: shuffle pairs with repeated or oddly-placed %b lanes in the
; outer mask; the CHECK lines pin the current (not necessarily optimal)
; two-instruction or single-shuffle lowerings.
1790 ; Check some negative cases.
1791 ; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1793 define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
1794 ; SSE-LABEL: combine_test1b:
1796 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
1797 ; SSE-NEXT: movaps %xmm1, %xmm0
1800 ; AVX-LABEL: combine_test1b:
1802 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
1804 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1805 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
1809 define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
1810 ; SSE2-LABEL: combine_test2b:
1812 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0]
1813 ; SSE2-NEXT: movaps %xmm1, %xmm0
1816 ; SSSE3-LABEL: combine_test2b:
1818 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
1821 ; SSE41-LABEL: combine_test2b:
1823 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
1826 ; AVX-LABEL: combine_test2b:
1828 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0]
1830 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1831 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
1835 define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
1836 ; SSE2-LABEL: combine_test3b:
1838 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1839 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1842 ; SSSE3-LABEL: combine_test3b:
1844 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1845 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1848 ; SSE41-LABEL: combine_test3b:
1850 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1851 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1854 ; AVX-LABEL: combine_test3b:
1856 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1857 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1859 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
1860 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
1864 define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
1865 ; SSE-LABEL: combine_test4b:
1867 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
1868 ; SSE-NEXT: movaps %xmm1, %xmm0
1871 ; AVX-LABEL: combine_test4b:
1873 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
1875 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1876 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
; Illegal-type variants: <4 x i8> is not a legal x86 vector type, so the
; operands are widened (movd + punpcklbw/punpcklwd on SSE2/SSSE3, pmovzxbd on
; SSE4.1/AVX) before the same blend/unpck folds as the legal-type tests above.
1881 ; Verify that we correctly fold shuffles even when we use illegal vector types.
1883 define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
1884 ; SSE2-LABEL: combine_test1c:
1886 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1887 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1888 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1889 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1890 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1891 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1892 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1895 ; SSSE3-LABEL: combine_test1c:
1897 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1898 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1899 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1900 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1901 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1902 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1903 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1906 ; SSE41-LABEL: combine_test1c:
1908 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1909 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1910 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1913 ; AVX1-LABEL: combine_test1c:
1915 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1916 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1917 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1920 ; AVX2-LABEL: combine_test1c:
1922 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1923 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1924 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1926 %A = load <4 x i8>* %a
1927 %B = load <4 x i8>* %b
1928 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1929 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1933 define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
1934 ; SSE2-LABEL: combine_test2c:
1936 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1937 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1938 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1939 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1940 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1941 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1942 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1945 ; SSSE3-LABEL: combine_test2c:
1947 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1948 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1949 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1950 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1951 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1952 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1953 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1956 ; SSE41-LABEL: combine_test2c:
1958 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1959 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1960 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1963 ; AVX-LABEL: combine_test2c:
1965 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1966 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1967 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1969 %A = load <4 x i8>* %a
1970 %B = load <4 x i8>* %b
1971 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
1972 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1976 define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
1977 ; SSE2-LABEL: combine_test3c:
1979 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1980 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1981 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1982 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1983 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1984 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1985 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1988 ; SSSE3-LABEL: combine_test3c:
1990 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1991 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1992 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1993 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1994 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1995 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1996 ; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1999 ; SSE41-LABEL: combine_test3c:
2001 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2002 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2003 ; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2006 ; AVX-LABEL: combine_test3c:
2008 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2009 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2010 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2012 %A = load <4 x i8>* %a
2013 %B = load <4 x i8>* %b
2014 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2015 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2019 define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
2020 ; SSE2-LABEL: combine_test4c:
2022 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2023 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2024 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2025 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2026 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2027 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2028 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2029 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2032 ; SSSE3-LABEL: combine_test4c:
2034 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2035 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2036 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2037 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2038 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2039 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2040 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2041 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2044 ; SSE41-LABEL: combine_test4c:
2046 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2047 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2048 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
2051 ; AVX1-LABEL: combine_test4c:
2053 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2054 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2055 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
2058 ; AVX2-LABEL: combine_test4c:
2060 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2061 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2062 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
2064 %A = load <4 x i8>* %a
2065 %B = load <4 x i8>* %b
2066 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
2067 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
2072 ; The following test cases are generated from this C++ code
2074 ;__m128 blend_01(__m128 a, __m128 b)
2077 ; s = _mm_blend_ps( s, b, 1<<0 );
2078 ; s = _mm_blend_ps( s, b, 1<<1 );
2082 ;__m128 blend_02(__m128 a, __m128 b)
2085 ; s = _mm_blend_ps( s, b, 1<<0 );
2086 ; s = _mm_blend_ps( s, b, 1<<2 );
2090 ;__m128 blend_123(__m128 a, __m128 b)
2093 ; s = _mm_blend_ps( s, b, 1<<1 );
2094 ; s = _mm_blend_ps( s, b, 1<<2 );
2095 ; s = _mm_blend_ps( s, b, 1<<3 );
2099 ; Ideally, we should collapse the following shuffles into a single one.
2101 define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
2102 ; SSE2-LABEL: combine_blend_01:
2104 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2107 ; SSSE3-LABEL: combine_blend_01:
2109 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2112 ; SSE41-LABEL: combine_blend_01:
2114 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2117 ; AVX-LABEL: combine_blend_01:
2119 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2121 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
2122 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
2123 ret <4 x float> %shuffle6
2126 define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
2127 ; SSE2-LABEL: combine_blend_02:
2129 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
2130 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
2131 ; SSE2-NEXT: movaps %xmm1, %xmm0
2134 ; SSSE3-LABEL: combine_blend_02:
2136 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
2137 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
2138 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2141 ; SSE41-LABEL: combine_blend_02:
2143 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
2146 ; AVX-LABEL: combine_blend_02:
2148 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
2150 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
2151 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
2152 ret <4 x float> %shuffle6
2155 define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
2156 ; SSE2-LABEL: combine_blend_123:
2158 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2159 ; SSE2-NEXT: movaps %xmm1, %xmm0
2162 ; SSSE3-LABEL: combine_blend_123:
2164 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2165 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2168 ; SSE41-LABEL: combine_blend_123:
2170 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2173 ; AVX-LABEL: combine_blend_123:
2175 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2177 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
2178 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
2179 %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
2180 ret <4 x float> %shuffle12
2183 define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
2184 ; SSE-LABEL: combine_test_movhl_1:
2186 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2187 ; SSE-NEXT: movdqa %xmm1, %xmm0
2190 ; AVX-LABEL: combine_test_movhl_1:
2192 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2194 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
2195 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
2199 define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
2200 ; SSE-LABEL: combine_test_movhl_2:
2202 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2203 ; SSE-NEXT: movdqa %xmm1, %xmm0
2206 ; AVX-LABEL: combine_test_movhl_2:
2208 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2210 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
2211 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
2215 define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
2216 ; SSE-LABEL: combine_test_movhl_3:
2218 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2219 ; SSE-NEXT: movdqa %xmm1, %xmm0
2222 ; AVX-LABEL: combine_test_movhl_3:
2224 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2226 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
2227 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
2232 ; Verify that we fold shuffles according to the rule:
2233 ; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
2235 define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
2236 ; SSE2-LABEL: combine_undef_input_test1:
2238 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2241 ; SSSE3-LABEL: combine_undef_input_test1:
2243 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2246 ; SSE41-LABEL: combine_undef_input_test1:
2248 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2251 ; AVX-LABEL: combine_undef_input_test1:
2253 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2255 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2256 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2260 define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
2261 ; SSE-LABEL: combine_undef_input_test2:
2263 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2266 ; AVX-LABEL: combine_undef_input_test2:
2268 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2270 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2271 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2275 define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
2276 ; SSE-LABEL: combine_undef_input_test3:
2278 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2281 ; AVX-LABEL: combine_undef_input_test3:
2283 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2285 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2286 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2290 define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
2291 ; SSE-LABEL: combine_undef_input_test4:
2293 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2294 ; SSE-NEXT: movapd %xmm1, %xmm0
2297 ; AVX-LABEL: combine_undef_input_test4:
2299 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2301 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2302 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2306 define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
2307 ; SSE2-LABEL: combine_undef_input_test5:
2309 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2310 ; SSE2-NEXT: movapd %xmm1, %xmm0
2313 ; SSSE3-LABEL: combine_undef_input_test5:
2315 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2316 ; SSSE3-NEXT: movapd %xmm1, %xmm0
2319 ; SSE41-LABEL: combine_undef_input_test5:
2321 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2324 ; AVX-LABEL: combine_undef_input_test5:
2326 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2328 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2329 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2334 ; Verify that we fold shuffles according to the rule:
2335 ; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2337 define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
2338 ; ALL-LABEL: combine_undef_input_test6:
2341 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2342 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2346 define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
2347 ; SSE2-LABEL: combine_undef_input_test7:
2349 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2352 ; SSSE3-LABEL: combine_undef_input_test7:
2354 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2357 ; SSE41-LABEL: combine_undef_input_test7:
2359 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2362 ; AVX-LABEL: combine_undef_input_test7:
2364 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2366 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2367 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2371 define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
2372 ; SSE2-LABEL: combine_undef_input_test8:
2374 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2377 ; SSSE3-LABEL: combine_undef_input_test8:
2379 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2382 ; SSE41-LABEL: combine_undef_input_test8:
2384 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2387 ; AVX-LABEL: combine_undef_input_test8:
2389 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2391 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2392 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2396 define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
2397 ; SSE-LABEL: combine_undef_input_test9:
2399 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
2402 ; AVX-LABEL: combine_undef_input_test9:
2404 ; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
2406 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2407 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2411 define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
2412 ; ALL-LABEL: combine_undef_input_test10:
2415 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2416 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2420 define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
2421 ; SSE2-LABEL: combine_undef_input_test11:
2423 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2426 ; SSSE3-LABEL: combine_undef_input_test11:
2428 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2431 ; SSE41-LABEL: combine_undef_input_test11:
2433 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2436 ; AVX-LABEL: combine_undef_input_test11:
2438 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2440 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2441 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
2445 define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
2446 ; SSE-LABEL: combine_undef_input_test12:
2448 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2451 ; AVX-LABEL: combine_undef_input_test12:
2453 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2455 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2456 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
2460 define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
2461 ; SSE-LABEL: combine_undef_input_test13:
2463 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2466 ; AVX-LABEL: combine_undef_input_test13:
2468 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2470 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2471 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
2475 define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
2476 ; SSE-LABEL: combine_undef_input_test14:
2478 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2479 ; SSE-NEXT: movapd %xmm1, %xmm0
2482 ; AVX-LABEL: combine_undef_input_test14:
2484 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2486 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2487 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
2491 define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
2492 ; SSE2-LABEL: combine_undef_input_test15:
2494 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2495 ; SSE2-NEXT: movapd %xmm1, %xmm0
2498 ; SSSE3-LABEL: combine_undef_input_test15:
2500 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2501 ; SSSE3-NEXT: movapd %xmm1, %xmm0
2504 ; SSE41-LABEL: combine_undef_input_test15:
2506 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2509 ; AVX-LABEL: combine_undef_input_test15:
2511 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2513 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2514 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2519 ; Verify that shuffles are canonicalized according to the rules:
2520 ; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
2522 ; This allows us to trigger the following combine rule:
2523 ; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2525 ; As a result, all the shuffle pairs in each function below should be
2526 ; combined into a single legal shuffle operation.
2528 define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
2529 ; ALL-LABEL: combine_undef_input_test16:
2532 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2533 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
2537 define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
2538 ; SSE2-LABEL: combine_undef_input_test17:
2540 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2543 ; SSSE3-LABEL: combine_undef_input_test17:
2545 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2548 ; SSE41-LABEL: combine_undef_input_test17:
2550 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2553 ; AVX-LABEL: combine_undef_input_test17:
2555 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2557 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2558 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
2562 define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
2563 ; SSE2-LABEL: combine_undef_input_test18:
2565 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2568 ; SSSE3-LABEL: combine_undef_input_test18:
2570 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2573 ; SSE41-LABEL: combine_undef_input_test18:
2575 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2578 ; AVX-LABEL: combine_undef_input_test18:
2580 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2582 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2583 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
2587 define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
2588 ; SSE-LABEL: combine_undef_input_test19:
2590 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
2593 ; AVX-LABEL: combine_undef_input_test19:
2595 ; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
2597 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2598 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
2602 define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
2603 ; ALL-LABEL: combine_undef_input_test20:
2606 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2607 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2611 ; These tests are designed to test the ability to combine away unnecessary
2612 ; operations feeding into a shuffle. The AVX cases are the important ones as
2613 ; they leverage operations which cannot be done naturally on the entire vector
2614 ; and thus are decomposed into multiple smaller operations.
2616 define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
2617 ; SSE-LABEL: combine_unneeded_subvector1:
2619 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
2620 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
2621 ; SSE-NEXT: movdqa %xmm0, %xmm1
2624 ; AVX1-LABEL: combine_unneeded_subvector1:
2626 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2627 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
2628 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
2629 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2632 ; AVX2-LABEL: combine_unneeded_subvector1:
2634 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
2635 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
2636 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
2638 %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
2639 %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
2643 define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
2644 ; SSE-LABEL: combine_unneeded_subvector2:
2646 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
2647 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
2648 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
2651 ; AVX1-LABEL: combine_unneeded_subvector2:
2653 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2654 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
2655 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2656 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2657 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2660 ; AVX2-LABEL: combine_unneeded_subvector2:
2662 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
2663 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2664 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2666 %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
2667 %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
2671 define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
2672 ; SSE2-LABEL: combine_insertps1:
2674 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2675 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
2676 ; SSE2-NEXT: movaps %xmm1, %xmm0
2679 ; SSSE3-LABEL: combine_insertps1:
2681 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2682 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
2683 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2686 ; SSE41-LABEL: combine_insertps1:
2688 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
2691 ; AVX-LABEL: combine_insertps1:
2693 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
2696 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
2697 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
2701 define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
2702 ; SSE2-LABEL: combine_insertps2:
2704 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
2705 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
2706 ; SSE2-NEXT: movaps %xmm1, %xmm0
2709 ; SSSE3-LABEL: combine_insertps2:
2711 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
2712 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
2713 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2716 ; SSE41-LABEL: combine_insertps2:
2718 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
2721 ; AVX-LABEL: combine_insertps2:
2723 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
2726 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
2727 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2731 define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
2732 ; SSE2-LABEL: combine_insertps3:
2734 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
2735 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2738 ; SSSE3-LABEL: combine_insertps3:
2740 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
2741 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2744 ; SSE41-LABEL: combine_insertps3:
2746 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2749 ; AVX-LABEL: combine_insertps3:
2751 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2754 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
2755 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
2759 define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
2760 ; SSE2-LABEL: combine_insertps4:
2762 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
2763 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2766 ; SSSE3-LABEL: combine_insertps4:
2768 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
2769 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2772 ; SSE41-LABEL: combine_insertps4:
2774 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
2777 ; AVX-LABEL: combine_insertps4:
2779 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
2782 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
2783 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
2787 define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
2788 ; SSE-LABEL: PR22377:
2789 ; SSE: # BB#0: # %entry
2790 ; SSE-NEXT: movaps %xmm0, %xmm1
2791 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
2792 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
2793 ; SSE-NEXT: addps %xmm0, %xmm1
2794 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2797 ; AVX-LABEL: PR22377:
2798 ; AVX: # BB#0: # %entry
2799 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
2800 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
2801 ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1
2802 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2805 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
2806 %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2807 %r2 = fadd <4 x float> %s1, %s2
2808 %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
2812 define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
2813 ; SSE2-LABEL: PR22390:
2814 ; SSE2: # BB#0: # %entry
2815 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2816 ; SSE2-NEXT: movaps %xmm0, %xmm2
2817 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
2818 ; SSE2-NEXT: addps %xmm0, %xmm2
2819 ; SSE2-NEXT: movaps %xmm2, %xmm0
2822 ; SSSE3-LABEL: PR22390:
2823 ; SSSE3: # BB#0: # %entry
2824 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2825 ; SSSE3-NEXT: movaps %xmm0, %xmm2
2826 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
2827 ; SSSE3-NEXT: addps %xmm0, %xmm2
2828 ; SSSE3-NEXT: movaps %xmm2, %xmm0
2831 ; SSE41-LABEL: PR22390:
2832 ; SSE41: # BB#0: # %entry
2833 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2834 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
2835 ; SSE41-NEXT: addps %xmm1, %xmm0
2838 ; AVX-LABEL: PR22390:
2839 ; AVX: # BB#0: # %entry
2840 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2841 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
2842 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
2845 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
2846 %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
2847 %r2 = fadd <4 x float> %s1, %s2
2851 define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
2852 ; SSE2-LABEL: PR22412:
2853 ; SSE2: # BB#0: # %entry
2854 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2855 ; SSE2-NEXT: movapd %xmm2, %xmm0
2856 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
2857 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
2858 ; SSE2-NEXT: movaps %xmm3, %xmm1
2861 ; SSSE3-LABEL: PR22412:
2862 ; SSSE3: # BB#0: # %entry
2863 ; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2864 ; SSSE3-NEXT: movapd %xmm2, %xmm0
2865 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
2866 ; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
2867 ; SSSE3-NEXT: movaps %xmm3, %xmm1
2870 ; SSE41-LABEL: PR22412:
2871 ; SSE41: # BB#0: # %entry
2872 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
2873 ; SSE41-NEXT: movapd %xmm0, %xmm1
2874 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
2875 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
2876 ; SSE41-NEXT: movaps %xmm1, %xmm0
2877 ; SSE41-NEXT: movaps %xmm3, %xmm1
2880 ; AVX1-LABEL: PR22412:
2881 ; AVX1: # BB#0: # %entry
2882 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
2883 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
2884 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
2887 ; AVX2-LABEL: PR22412:
2888 ; AVX2: # BB#0: # %entry
2889 ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
2890 ; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
2891 ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
2894 %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2895 %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>