1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
; @A: swap the two 128-bit halves of %a. The mask takes elements 4-7 of %a
; for the low half of the result and elements 0-3 for the high half; %b is
; not referenced by the mask. A single vperm2f128 is expected.
5 define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
7 ; ALL: ## BB#0: ## %entry
8 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
11 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
12 ret <8 x float> %shuffle
; @B: keep the low 128-bit half of %a (elements 0-3) and take the high half
; of %b (mask indices 12-15 select elements 4-7 of the second operand).
; Both halves stay in place, so a blend (vblendpd) is expected instead of a
; lane permute.
15 define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
17 ; ALL: ## BB#0: ## %entry
18 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
21 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
22 ret <8 x float> %shuffle
; @C: duplicate the low 128-bit half of %a (elements 0-3) into both halves
; of the result; %b is not referenced by the mask. The expected lowering
; inserts xmm0 into the upper half with vinsertf128.
25 define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
27 ; ALL: ## BB#0: ## %entry
28 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
31 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
32 ret <8 x float> %shuffle
; @D: duplicate the high 128-bit half of %a (elements 4-7) into both halves
; of the result; %b is not referenced by the mask. A single vperm2f128 is
; expected.
35 define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
37 ; ALL: ## BB#0: ## %entry
38 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
41 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
42 ret <8 x float> %shuffle
; @E: byte-vector variant of the high-half duplication. The mask selects
; bytes 16-31 of %a for both 128-bit halves of the result; %b is not
; referenced. The shared check prefix expects vperm2f128 for both run lines.
45 define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
47 ; ALL: ## BB#0: ## %entry
48 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
51 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
52 ret <32 x i8> %shuffle
; @E2: two-source lane shuffle on 64-bit integers. Mask indices 6,7 select
; the high 128-bit half of %b for the low half of the result; indices 0,1
; select the low half of %a for the high half. The shared check prefix
; expects vperm2f128 for both run lines.
55 define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
57 ; ALL: ## BB#0: ## %entry
58 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
61 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
62 ret <4 x i64> %shuffle
; @Ei: same high-half duplication mask as @E (bytes 16-31 twice), but applied
; to %a plus a splat of 1. The integer add precedes the shuffle; the AVX1 run
; expects vperm2f128 and the AVX2 run expects the integer form vperm2i128.
65 define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
67 ; AVX1: ## BB#0: ## %entry
68 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
69 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
70 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
71 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
75 ; AVX2: ## BB#0: ## %entry
76 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
77 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
80 ; add forces execution domain
81 %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
82 %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
83 ret <32 x i8> %shuffle
; @E2i: same mask as @E2 (high half of %b, then low half of the first
; operand), but the first operand is %a plus a splat of 1. The AVX1 run
; expects vperm2f128 and the AVX2 run expects vperm2i128.
86 define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
88 ; AVX1: ## BB#0: ## %entry
89 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
90 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
94 ; AVX2: ## BB#0: ## %entry
95 ; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
96 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
97 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
100 ; add forces execution domain
101 %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
102 %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
103 ret <4 x i64> %shuffle
; @E3i: lane shuffle with undef mask elements on 32-bit integers. The low
; half of the result takes elements 5 and 7 of %a plus 1 (positions 0 and 2
; are undef), which is compatible with selecting that operand's whole high
; half; the high half takes elements 4-7 of %b (mask indices 12-15).
106 define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
108 ; AVX1: ## BB#0: ## %entry
109 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
110 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
111 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
112 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
116 ; AVX2: ## BB#0: ## %entry
117 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
118 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
119 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
122 ; add forces execution domain
123 %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
124 %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
125 ret <8 x i32> %shuffle
; @E4i: the low half of the result is the low 128-bit half of %b (mask
; indices 16-23); the high half is the low half of %a plus a splat of 1
; (indices 0-7). The expected lowering is a 128-bit insert: vinsertf128 for
; the AVX1 run, vinserti128 for the AVX2 run.
128 define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
130 ; AVX1: ## BB#0: ## %entry
131 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
132 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
136 ; AVX2: ## BB#0: ## %entry
137 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
138 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
141 ; add forces execution domain
142 %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
143 %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
144 ret <16 x i16> %shuffle
; @E5i: same mask as @E4i, but both vector operands are loaded from the
; pointer arguments %a and %b. A splat of 1 is added to the vector loaded
; from %a before the shuffle. The result is the low half of the %b vector
; followed by the low half of the incremented %a vector.
147 define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
149 ; AVX1: ## BB#0: ## %entry
150 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0
151 ; AVX1-NEXT: vmovaps (%rsi), %ymm1
152 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
153 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
157 ; AVX2: ## BB#0: ## %entry
158 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
159 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1
160 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
161 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
164 %c = load <16 x i16>, <16 x i16>* %a
165 %d = load <16 x i16>, <16 x i16>* %b
166 %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
167 %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
168 ret <16 x i16> %shuffle
171 ;;;; Cases with undef indices mixed in the mask
; @F: the defined mask elements are 6,7 (high half of %a) in the low half of
; the result and 9,11 (elements 1 and 3 of %b, i.e. its low half) in the
; high half; every other position is undef. The expected lowering treats
; this as "high half of %a, low half of %b" with one vperm2f128.
173 define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
175 ; ALL: ## BB#0: ## %entry
176 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
179 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
180 ret <8 x float> %shuffle
; @F2: only elements 6,7 of %a are requested, at positions 2,3 of each
; 128-bit half; the remaining positions are undef. The expected lowering
; duplicates the high half of %a with one vperm2f128.
183 define <8 x float> @F2(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
185 ; ALL: ## BB#0: ## %entry
186 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
189 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
190 ret <8 x float> %shuffle
; @F3: the low half of the result requests elements 6,7 of %a (its high
; half); the high half requests mask indices 10,11, i.e. elements 2,3 of %b
; (its low half). Other positions are undef. One vperm2f128 is expected.
193 define <8 x float> @F3(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
195 ; ALL: ## BB#0: ## %entry
196 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
199 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
200 ret <8 x float> %shuffle
; @F4: the low half of the result requests elements 6,7 of %a (its high
; half); the high half requests mask indices 14,15, i.e. elements 6,7 of %b
; (its high half). Other positions are undef. One vperm2f128 is expected.
203 define <8 x float> @F4(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
205 ; ALL: ## BB#0: ## %entry
206 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
209 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
210 ret <8 x float> %shuffle
; @F5: the low half of the result is partially undef (only elements 6,7 of
; %a are requested) and the high half is fully defined as elements 4-7 of
; %a. Both halves resolve to the high half of %a, so one vperm2f128 is
; expected.
213 define <8 x float> @F5(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
215 ; ALL: ## BB#0: ## %entry
216 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
219 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
220 ret <8 x float> %shuffle
; @F6: the low half of the result is partially undef (only elements 6,7 of
; %a are requested); the high half is fully defined as mask indices 8-11,
; i.e. elements 0-3 of %b (its low half). One vperm2f128 is expected.
223 define <8 x float> @F6(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
225 ; ALL: ## BB#0: ## %entry
226 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
229 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
230 ret <8 x float> %shuffle
; @F7: the low half of the result is fully defined as elements 4-7 of %a;
; the high half is partially undef and requests only elements 6,7 of %a.
; Both halves resolve to the high half of %a, so one vperm2f128 is expected.
233 define <8 x float> @F7(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
235 ; ALL: ## BB#0: ## %entry
236 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
239 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
240 ret <8 x float> %shuffle
; @F8: the low half of the result is fully defined as elements 4-7 of %a;
; the high half is partially undef and requests mask indices 14,15, i.e.
; elements 6,7 of %b (its high half). One vperm2f128 is expected.
243 define <8 x float> @F8(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
245 ; ALL: ## BB#0: ## %entry
246 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
249 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
250 ret <8 x float> %shuffle
253 ;;;; Cases where a single vperm2f128 is not sufficient
; @G: the low half of the result requests elements 6,7 of %a in place, but
; the high half requests mask index 12 (element 4 of %b) at position 5,
; i.e. the first element of %b's high half lands in the second slot of the
; lane. A whole-lane permute cannot move elements within a lane, so the
; expected output is vperm2f128 followed by an in-lane vpermilps.
255 define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
257 ; ALL: ## BB#0: ## %entry
258 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
259 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
262 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
263 ret <8 x float> %shuffle
266 ;; Test zero mask generation.
267 ;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
268 ;; Prefer xor+vblendpd over vperm2f128 because the xor+blend sequence has better performance.
; @vperm2z_0x08: shuffle %a with a constant whose low two elements are 0.0.
; Mask indices 4,5 put those zeros in the low half of the result and indices
; 0,1 move the low half of %a into the high half. Data crosses lanes, so a
; zeroing vperm2f128 is expected.
270 define <4 x double> @vperm2z_0x08(<4 x double> %a) {
271 ; ALL-LABEL: vperm2z_0x08:
273 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
275 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
; @vperm2z_0x18: mask indices 4,5 put the constant zeros in the low half of
; the result and indices 2,3 keep the high half of %a in place. No data
; crosses lanes, so vxorpd + vblendpd is expected instead of vperm2f128.
279 define <4 x double> @vperm2z_0x18(<4 x double> %a) {
280 ; ALL-LABEL: vperm2z_0x18:
282 ; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
283 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
285 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; @vperm2z_0x28: operand order is swapped relative to @vperm2z_0x08 (the
; zero constant is the first operand, %a the second). Mask indices 0,1 put
; zeros in the low half and indices 4,5 move the low half of %a into the
; high half, so the same zeroing vperm2f128 is expected.
289 define <4 x double> @vperm2z_0x28(<4 x double> %a) {
290 ; ALL-LABEL: vperm2z_0x28:
292 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
294 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; @vperm2z_0x38: the zero constant is the first operand. Mask indices 0,1
; put zeros in the low half and indices 6,7 keep the high half of %a in
; place, so vxorpd + vblendpd is expected.
298 define <4 x double> @vperm2z_0x38(<4 x double> %a) {
299 ; ALL-LABEL: vperm2z_0x38:
301 ; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
302 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
304 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
; @vperm2z_0x80: mask indices 0,1 keep the low half of %a in place and
; indices 4,5 put the constant zeros in the high half. No data crosses
; lanes, so vxorpd + vblendpd is expected.
308 define <4 x double> @vperm2z_0x80(<4 x double> %a) {
309 ; ALL-LABEL: vperm2z_0x80:
311 ; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
312 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
314 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; @vperm2z_0x81: mask indices 2,3 move the high half of %a into the low half
; of the result and indices 4,5 put the constant zeros in the high half.
; Data crosses lanes, so a zeroing vperm2f128 is expected.
318 define <4 x double> @vperm2z_0x81(<4 x double> %a) {
319 ; ALL-LABEL: vperm2z_0x81:
321 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
323 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; @vperm2z_0x82: the zero constant is the first operand. Mask indices 4,5
; keep the low half of %a (the second operand) in the low half of the result
; and indices 0,1 put zeros in the high half, so vxorpd + vblendpd is
; expected.
327 define <4 x double> @vperm2z_0x82(<4 x double> %a) {
328 ; ALL-LABEL: vperm2z_0x82:
330 ; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
331 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
333 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
; @vperm2z_0x83: the zero constant is the first operand. Mask indices 6,7
; move the high half of %a (the second operand) into the low half of the
; result and indices 0,1 put zeros in the high half. Data crosses lanes, so
; a zeroing vperm2f128 is expected.
337 define <4 x double> @vperm2z_0x83(<4 x double> %a) {
338 ; ALL-LABEL: vperm2z_0x83:
340 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
342 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
346 ;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
348 define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
349 ; AVX1-LABEL: vperm2z_int_0x83:
351 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
352 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
353 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
354 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
355 ; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
356 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
359 ; AVX2-LABEL: vperm2z_int_0x83:
361 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
362 ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
364 %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
365 %c = add <4 x i64> %b, %s