1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
; Swap the two 128-bit halves of %a: mask <4,5,6,7,0,1,2,3> picks the upper
; four floats first, then the lower four (%b is not referenced by the mask).
; Both run configurations expect a single vperm2f128 with ymm0[2,3,0,1].
4 define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
6 ; ALL: ## BB#0: ## %entry
7 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
10 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
11 ret <8 x float> %shuffle
; Keep the lower half of %a and take the upper half of %b: mask indices 0-3
; select from %a, indices 12-15 select elements 4-7 of %b. Each half stays in
; its original position, and the expected lowering is a vblendpd.
14 define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
16 ; ALL: ## BB#0: ## %entry
17 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
20 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
21 ret <8 x float> %shuffle
; Duplicate the lower half of %a into both halves of the result (mask
; <0,1,2,3,0,1,2,3>). The expected lowering inserts xmm0 into the upper half
; of ymm0 with vinsertf128 instead of using a lane permute.
24 define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
26 ; ALL: ## BB#0: ## %entry
27 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
30 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
31 ret <8 x float> %shuffle
; Duplicate the upper half of %a into both halves of the result (mask
; <4,5,6,7,4,5,6,7>). Expected lowering: vperm2f128 with ymm0[2,3,2,3].
34 define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
36 ; ALL: ## BB#0: ## %entry
37 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
40 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
41 ret <8 x float> %shuffle
; Byte-vector version of the "upper half in both halves" shuffle: the mask
; repeats elements 16-31 of %a twice. No integer arithmetic is involved, and
; both run configurations expect the floating-point vperm2f128 form.
44 define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
46 ; ALL: ## BB#0: ## %entry
47 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
50 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
51 ret <32 x i8> %shuffle
; Two-operand lane shuffle on <4 x i64>: mask <6,7,0,1> places the upper half
; of %b in the low half of the result and the lower half of %a in the high
; half. Both run configurations expect vperm2f128 ymm1[2,3],ymm0[0,1].
54 define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
56 ; ALL: ## BB#0: ## %entry
57 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
60 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
61 ret <4 x i64> %shuffle
; Integer-domain variant of the byte shuffle that repeats elements 16-31 of
; its first operand in both halves; here the first operand is %a plus a splat
; of 1.
; With only AVX the checks expect the upper half to be extracted, incremented
; with a 128-bit vpaddb, reinserted, and then permuted with vperm2f128.
; With AVX2 they expect a 256-bit vpaddb followed by the integer vperm2i128.
64 define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
66 ; AVX1: ## BB#0: ## %entry
67 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
68 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
69 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
70 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
74 ; AVX2: ## BB#0: ## %entry
75 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
76 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
79 ; add forces execution domain
80 %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
81 %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
82 ret <32 x i8> %shuffle
; Integer-domain variant of the <6,7,0,1> two-operand shuffle: %a is first
; incremented by a splat of 1, then the upper half of %b and the lower half of
; the sum are combined.
; With only AVX the checks expect a 128-bit vpaddq on xmm0 (the mask uses only
; elements 0-1 of the sum) followed by vperm2f128. With AVX2 they expect the
; constant to be broadcast, a 256-bit vpaddq, and then vperm2i128.
85 define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
87 ; AVX1: ## BB#0: ## %entry
88 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
89 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
93 ; AVX2: ## BB#0: ## %entry
94 ; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
95 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
96 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
99 ; add forces execution domain
100 %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
101 %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
102 ret <4 x i64> %shuffle
; Integer-domain shuffle with undef mask entries: the low half of the result
; takes elements 5 and 7 of (%a + 1) with positions 0 and 2 undefined, and
; the high half takes elements 4-7 of %b (indices 12-15).
; With only AVX the checks expect extract / 128-bit vpaddd / reinsert, then
; vperm2f128 ymm0[2,3],ymm1[2,3]. With AVX2 they expect a broadcast constant,
; a 256-bit vpaddd, and vperm2i128 with the same lane selection.
105 define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
107 ; AVX1: ## BB#0: ## %entry
108 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
109 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
110 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
111 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
115 ; AVX2: ## BB#0: ## %entry
116 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
117 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
118 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
121 ; add forces execution domain
122 %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
123 %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
124 ret <8 x i32> %shuffle
; Integer-domain shuffle that builds the result from two lower halves: mask
; indices 16-23 select elements 0-7 of %b for the low half, and indices 0-7
; select the lower half of (%a + 1) for the high half.
; The checks expect the add followed by inserting xmm0 into the upper half of
; ymm1: vinsertf128 with only AVX (after a 128-bit vpaddw), vinserti128 with
; AVX2 (after a 256-bit vpaddw).
127 define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
129 ; AVX1: ## BB#0: ## %entry
130 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
131 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
135 ; AVX2: ## BB#0: ## %entry
136 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
137 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
140 ; add forces execution domain
141 %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
142 %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
143 ret <16 x i16> %shuffle
; Memory-operand variant of the "two lower halves" integer shuffle: both
; vectors are loaded through the pointer arguments, the first is incremented
; by a splat of 1, and the mask then places the lower half of the second
; vector in the low half of the result and the lower half of the sum in the
; high half.
; The checks expect two 256-bit loads, the add, and an insert of xmm0 into the
; upper half of ymm1 (vinsertf128 with only AVX, vinserti128 with AVX2).
; The function is marked readonly rather than readnone because its body
; dereferences both pointer arguments.
146 define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readonly ssp {
148 ; AVX1: ## BB#0: ## %entry
149 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0
150 ; AVX1-NEXT: vmovaps (%rsi), %ymm1
151 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
152 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
156 ; AVX2: ## BB#0: ## %entry
157 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
158 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1
159 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
160 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
163 %c = load <16 x i16>, <16 x i16>* %a
164 %d = load <16 x i16>, <16 x i16>* %b
165 %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
166 %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
167 ret <16 x i16> %shuffle
170 ;;;; Cases with undef indices mixed in the mask
; Undef entries in both halves of the mask: the defined low-half entries
; (6, 7) come from the upper half of %a, and the defined high-half entries
; (9, 11) come from the lower half of %b at matching in-lane positions.
; Expected lowering: vperm2f128 ymm0[2,3],ymm1[0,1].
172 define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
174 ; ALL: ## BB#0: ## %entry
175 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
178 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
179 ret <8 x float> %shuffle
; Both halves of the mask are <undef, undef, 6, 7>: the defined entries all
; come from the upper half of %a. Expected lowering: vperm2f128 ymm0[2,3,2,3].
182 define <8 x float> @F2(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
184 ; ALL: ## BB#0: ## %entry
185 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
188 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
189 ret <8 x float> %shuffle
; Low half <undef, undef, 6, 7> comes from the upper half of %a; high half
; <undef, undef, 10, 11> comes from the lower half of %b (elements 2-3).
; Expected lowering: vperm2f128 ymm0[2,3],ymm1[0,1].
192 define <8 x float> @F3(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
194 ; ALL: ## BB#0: ## %entry
195 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
198 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
199 ret <8 x float> %shuffle
; Low half <undef, undef, 6, 7> comes from the upper half of %a; high half
; <undef, undef, 14, 15> comes from the upper half of %b (elements 6-7).
; Expected lowering: vperm2f128 ymm0[2,3],ymm1[2,3].
202 define <8 x float> @F4(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
204 ; ALL: ## BB#0: ## %entry
205 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
208 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
209 ret <8 x float> %shuffle
; Partially undef low half (<undef, undef, 6, 7>) with a fully defined high
; half (<4, 5, 6, 7>); every defined entry comes from the upper half of %a.
; Expected lowering: vperm2f128 ymm0[2,3,2,3].
212 define <8 x float> @F5(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
214 ; ALL: ## BB#0: ## %entry
215 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
218 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
219 ret <8 x float> %shuffle
; Partially undef low half (<undef, undef, 6, 7>, upper half of %a) with a
; fully defined high half (<8, 9, 10, 11>, lower half of %b).
; Expected lowering: vperm2f128 ymm0[2,3],ymm1[0,1].
222 define <8 x float> @F6(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
224 ; ALL: ## BB#0: ## %entry
225 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
228 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
229 ret <8 x float> %shuffle
; Fully defined low half (<4, 5, 6, 7>) with a partially undef high half
; (<undef, undef, 6, 7>); every defined entry comes from the upper half of %a.
; Expected lowering: vperm2f128 ymm0[2,3,2,3].
232 define <8 x float> @F7(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
234 ; ALL: ## BB#0: ## %entry
235 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
238 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
239 ret <8 x float> %shuffle
; Fully defined low half (<4, 5, 6, 7>, upper half of %a) with a partially
; undef high half (<undef, undef, 14, 15>, upper half of %b).
; Expected lowering: vperm2f128 ymm0[2,3],ymm1[2,3].
242 define <8 x float> @F8(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
244 ; ALL: ## BB#0: ## %entry
245 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
248 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
249 ret <8 x float> %shuffle
252 ;;;; Cases that must not be lowered to a single vperm2f128 (an extra in-lane shuffle is required)
; Mask that is not a pure 128-bit lane move: result element 5 takes index 12
; (element 4 of %b), i.e. in-lane position 0 is moved to in-lane position 1.
; The checks therefore expect a vperm2f128 selecting the upper halves of %a
; and %b, followed by a vpermilps that fixes up the in-lane order.
254 define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
256 ; ALL: ## BB#0: ## %entry
257 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
258 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
261 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
262 ret <8 x float> %shuffle
265 ;; Test zero mask generation.
266 ;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
267 ;; Where a blend with zero suffices, prefer xor+vblendpd over vperm2f128 because the xor+blend sequence has better performance.
; Zeroing shuffle: the second operand is the constant <0.0, 0.0, undef, undef>.
; Mask <4,5,0,1> puts the constant's two zeros in the low half of the result
; and the lower half of %a in the high half.
; Expected lowering: vperm2f128 producing zero,zero,ymm0[0,1].
; NOTE(review): the 0x08 suffix presumably names the vperm2f128 immediate - confirm.
269 define <4 x double> @vperm2z_0x08(<4 x double> %a) {
270 ; ALL-LABEL: vperm2z_0x08:
272 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
274 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
; Zeroing shuffle where the kept half does not move: mask <4,5,2,3> takes the
; constant's two zeros for the low half and leaves the upper half of %a in the
; high half. The checks expect a zeroed register (vxorpd) blended with ymm0
; via vblendpd rather than a vperm2f128.
278 define <4 x double> @vperm2z_0x18(<4 x double> %a) {
279 ; ALL-LABEL: vperm2z_0x18:
281 ; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
282 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
284 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; Zeroing shuffle with the operands swapped: the constant
; <0.0, 0.0, undef, undef> is the first operand and %a the second.
; Mask <0,1,4,5> takes the two zeros for the low half and the lower half of %a
; (indices 4-5) for the high half.
; Expected lowering: vperm2f128 producing zero,zero,ymm0[0,1].
288 define <4 x double> @vperm2z_0x28(<4 x double> %a) {
289 ; ALL-LABEL: vperm2z_0x28:
291 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
293 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; Zeroing shuffle with the constant as first operand where the kept half does
; not move: mask <0,1,6,7> takes the two zeros for the low half and the upper
; half of %a (indices 6-7) for the high half. The checks expect vxorpd plus
; vblendpd rather than a vperm2f128.
297 define <4 x double> @vperm2z_0x38(<4 x double> %a) {
298 ; ALL-LABEL: vperm2z_0x38:
300 ; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
301 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
303 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
; Zero the high half: mask <0,1,4,5> keeps the lower half of %a in the low
; half and takes the constant operand's two zeros for the high half.
; Expected lowering: vperm2f128 producing ymm0[0,1],zero,zero.
307 define <4 x double> @vperm2z_0x80(<4 x double> %a) {
308 ; ALL-LABEL: vperm2z_0x80:
310 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],zero,zero
312 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; Move the upper half of %a down and zero the high half: mask <2,3,4,5> takes
; elements 2-3 of %a for the low half and the constant operand's two zeros for
; the high half.
; Expected lowering: vperm2f128 producing ymm0[2,3],zero,zero.
316 define <4 x double> @vperm2z_0x81(<4 x double> %a) {
317 ; ALL-LABEL: vperm2z_0x81:
319 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
321 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; Same result shape as keeping %a's lower half and zeroing the high half, but
; with the constant as the first operand: mask <4,5,0,1> takes elements 0-1 of
; %a (indices 4-5) for the low half and the constant's two zeros for the high
; half.
; Expected lowering: vperm2f128 producing ymm0[0,1],zero,zero.
325 define <4 x double> @vperm2z_0x82(<4 x double> %a) {
326 ; ALL-LABEL: vperm2z_0x82:
328 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],zero,zero
330 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
; Constant-first form of "upper half of %a moved down, high half zeroed":
; mask <6,7,0,1> takes elements 2-3 of %a (indices 6-7) for the low half and
; the constant's two zeros for the high half.
; Expected lowering: vperm2f128 producing ymm0[2,3],zero,zero.
334 define <4 x double> @vperm2z_0x83(<4 x double> %a) {
335 ; ALL-LABEL: vperm2z_0x83:
337 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
339 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
343 ;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
345 define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
346 ; ALL-LABEL: vperm2z_int_0x83:
348 ; AVX1: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
349 ; AVX2: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
350 %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
351 %c = add <4 x i64> %b, %s