1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; Variable shift left of <8 x i64>: %a is shifted by the per-lane amounts in %b.
; The checks use the ALL prefix shared by both RUN lines and expect a single
; vpsllvq operating on zmm registers.
9 define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
10 ; ALL-LABEL: var_shift_v8i64:
12 ; ALL-NEXT: vpsllvq %zmm1, %zmm0, %zmm0
14 %shift = shl <8 x i64> %a, %b
; Variable shift left of <16 x i32>: %a is shifted by the per-lane amounts in %b.
; The checks use the ALL prefix shared by both RUN lines and expect a single
; vpsllvd operating on zmm registers.
18 define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
19 ; ALL-LABEL: var_shift_v16i32:
21 ; ALL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
23 %shift = shl <16 x i32> %a, %b
; Variable shift left of <32 x i16>: %a is shifted by the per-lane amounts in %b.
; The two RUN configurations are checked separately:
;  - the avx512dq run expects the work to be done on ymm registers: the shift
;    amounts are interleaved with a zeroed register (ymm4), the values are
;    unpacked into dword lanes, shifted with vpsllvd, moved back down with
;    vpsrld $16 and re-packed with vpackusdw, once for ymm0 and once for ymm1;
;  - the avx512bw run expects a single vpsllvw on zmm registers.
27 define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
28 ; AVX512DQ-LABEL: var_shift_v32i16:
30 ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
31 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
32 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
33 ; AVX512DQ-NEXT: vpsllvd %ymm5, %ymm6, %ymm5
34 ; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
35 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
36 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
37 ; AVX512DQ-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
38 ; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
39 ; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
40 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
41 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
42 ; AVX512DQ-NEXT: vpsllvd %ymm2, %ymm5, %ymm2
43 ; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2
44 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
45 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
46 ; AVX512DQ-NEXT: vpsllvd %ymm3, %ymm1, %ymm1
47 ; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
48 ; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
51 ; AVX512BW-LABEL: var_shift_v32i16:
53 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
55 %shift = shl <32 x i16> %a, %b
; Variable shift left of <64 x i8>: %a is shifted by the per-lane amounts in %b.
; The avx512dq run expects a three-step shift-and-blend sequence on each ymm
; half: candidate results shifted by 4 (vpsllw $4 masked with 240), by 2
; (vpsllw $2 masked with 252) and by 1 (vpaddb of the value with itself) are
; selected with vpblendvb. The blend control starts as the shift amount moved
; left by 5 (vpsllw $5) and is doubled with vpaddb before each later step.
; NOTE(review): no check lines for the avx512bw run are visible for this
; function in this view - confirm whether that is intentional.
59 define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
60 ; AVX512DQ-LABEL: var_shift_v64i8:
62 ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm4
63 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
64 ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
65 ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2
66 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
67 ; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm4
68 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
69 ; AVX512DQ-NEXT: vpand %ymm6, %ymm4, %ymm4
70 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2
71 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
72 ; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm4
73 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2
74 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
75 ; AVX512DQ-NEXT: vpsllw $4, %ymm1, %ymm2
76 ; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2
77 ; AVX512DQ-NEXT: vpsllw $5, %ymm3, %ymm3
78 ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
79 ; AVX512DQ-NEXT: vpsllw $2, %ymm1, %ymm2
80 ; AVX512DQ-NEXT: vpand %ymm6, %ymm2, %ymm2
81 ; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm3
82 ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
83 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm2
84 ; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm3
85 ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
87 %shift = shl <64 x i8> %a, %b
92 ; Uniform Variable Shifts
; Uniform variable shift left of <8 x i64>: element 0 of %b is splatted to all
; lanes (shufflevector with a zeroinitializer mask) and used as the shift
; amount. Both RUN lines (shared ALL prefix) expect a single vpsllq that takes
; the count from an xmm register and shifts the whole zmm register.
95 define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
96 ; ALL-LABEL: splatvar_shift_v8i64:
98 ; ALL-NEXT: vpsllq %xmm1, %zmm0, %zmm0
100 %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
101 %shift = shl <8 x i64> %a, %splat
; Uniform variable shift left of <16 x i32>: element 0 of %b is splatted to all
; lanes (shufflevector with a zeroinitializer mask) and used as the shift
; amount. Both RUN lines (shared ALL prefix) expect a vxorps/vmovss pair that
; rebuilds the count in xmm1 from a zeroed xmm2, followed by one vpslld on zmm.
; NOTE(review): the vxorps/vmovss pair presumably isolates the low 32-bit count
; with the upper bits cleared - confirm against the instruction reference.
105 define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
106 ; ALL-LABEL: splatvar_shift_v16i32:
108 ; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
109 ; ALL-NEXT: vmovss %xmm1, %xmm2, %xmm1
110 ; ALL-NEXT: vpslld %xmm1, %zmm0, %zmm0
112 %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
113 %shift = shl <16 x i32> %a, %splat
114 ret <16 x i32> %shift
; Uniform variable shift left of <32 x i16>: element 0 of %b is splatted to all
; lanes (shufflevector with a zeroinitializer mask) and used as the shift
; amount. In both configurations the count is round-tripped through eax with
; movzwl (zero-extending the low 16 bits) before the shift:
;  - the avx512dq run expects two vpsllw instructions, one per ymm half;
;  - the avx512bw run expects one vpsllw on the full zmm register.
117 define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
118 ; AVX512DQ-LABEL: splatvar_shift_v32i16:
120 ; AVX512DQ-NEXT: vmovd %xmm2, %eax
121 ; AVX512DQ-NEXT: movzwl %ax, %eax
122 ; AVX512DQ-NEXT: vmovd %eax, %xmm2
123 ; AVX512DQ-NEXT: vpsllw %xmm2, %ymm0, %ymm0
124 ; AVX512DQ-NEXT: vpsllw %xmm2, %ymm1, %ymm1
125 ; AVX512DQ-NEXT: retq
127 ; AVX512BW-LABEL: splatvar_shift_v32i16:
129 ; AVX512BW-NEXT: vmovd %xmm1, %eax
130 ; AVX512BW-NEXT: movzwl %ax, %eax
131 ; AVX512BW-NEXT: vmovd %eax, %xmm1
132 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
133 ; AVX512BW-NEXT: retq
134 %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
135 %shift = shl <32 x i16> %a, %splat
136 ret <32 x i16> %shift
; Uniform variable shift left of <64 x i8>: element 0 of %b is splatted to all
; lanes (shufflevector with a zeroinitializer mask) and used as the shift
; amount. The avx512dq run expects the amount to be broadcast with
; vpbroadcastb, then a shift-by-4 / shift-by-2 / shift-by-1 sequence selected
; with vpblendvb. The three blend controls (ymm2, ymm6, ymm7) are computed
; once while processing ymm0 and reused unchanged for ymm1.
; NOTE(review): no check lines for the avx512bw run are visible for this
; function in this view - confirm whether that is intentional.
139 define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
140 ; AVX512DQ-LABEL: splatvar_shift_v64i8:
142 ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
143 ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm3
144 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
145 ; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
146 ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2
147 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
148 ; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm3
149 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
150 ; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3
151 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm6
152 ; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0
153 ; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm3
154 ; AVX512DQ-NEXT: vpaddb %ymm6, %ymm6, %ymm7
155 ; AVX512DQ-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
156 ; AVX512DQ-NEXT: vpsllw $4, %ymm1, %ymm3
157 ; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
158 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
159 ; AVX512DQ-NEXT: vpsllw $2, %ymm1, %ymm2
160 ; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2
161 ; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
162 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm2
163 ; AVX512DQ-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
164 ; AVX512DQ-NEXT: retq
166 %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
167 %shift = shl <64 x i8> %a, %splat
; Shift left of <8 x i64> by a non-uniform constant vector (1, 7, 31, 62
; repeated twice). Both RUN lines (shared ALL prefix) expect a single vpsllvq
; whose shift amounts come from a rip-relative memory operand.
175 define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
176 ; ALL-LABEL: constant_shift_v8i64:
178 ; ALL-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
180 %shift = shl <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
; Shift left of <16 x i32> by a non-uniform constant vector (4, 5, 6, 7, 8, 9,
; 8, 7 repeated twice). Both RUN lines (shared ALL prefix) expect a single
; vpsllvd whose shift amounts come from a rip-relative memory operand.
184 define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
185 ; ALL-LABEL: constant_shift_v16i32:
187 ; ALL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
189 %shift = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
190 ret <16 x i32> %shift
; Shift left of <32 x i16> by a non-uniform constant vector (0 through 15,
; repeated twice).
;  - the avx512dq run expects the shift to be performed as a multiply: one
;    constant of powers of two (1, 2, 4, ... 32768) is loaded and applied with
;    vpmullw to each ymm half;
;  - the avx512bw run expects a single vpsllvw on zmm with the shift amounts
;    read from a rip-relative memory operand.
193 define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
194 ; AVX512DQ-LABEL: constant_shift_v32i16:
196 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
197 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
198 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
199 ; AVX512DQ-NEXT: retq
201 ; AVX512BW-LABEL: constant_shift_v32i16:
203 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
204 ; AVX512BW-NEXT: retq
205 %shift = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
206 ret <32 x i16> %shift
; Shift left of <64 x i8> by a non-uniform constant vector (0..7 followed by
; 7..0, repeated four times). The avx512dq run expects the constant amounts to
; be loaded into ymm4 and fed through the same shift-by-4 / shift-by-2 /
; shift-by-1 vpblendvb sequence used for variable byte shifts. The blend
; controls (ymm4, ymm6, ymm7) are computed once while processing ymm0 and
; reused unchanged for ymm1.
; NOTE(review): no check lines for the avx512bw run are visible for this
; function in this view - confirm whether that is intentional.
209 define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
210 ; AVX512DQ-LABEL: constant_shift_v64i8:
212 ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
213 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
214 ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
215 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
216 ; AVX512DQ-NEXT: vpsllw $5, %ymm4, %ymm4
217 ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
218 ; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
219 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
220 ; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2
221 ; AVX512DQ-NEXT: vpaddb %ymm4, %ymm4, %ymm6
222 ; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
223 ; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2
224 ; AVX512DQ-NEXT: vpaddb %ymm6, %ymm6, %ymm7
225 ; AVX512DQ-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0
226 ; AVX512DQ-NEXT: vpsllw $4, %ymm1, %ymm2
227 ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
228 ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
229 ; AVX512DQ-NEXT: vpsllw $2, %ymm1, %ymm2
230 ; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2
231 ; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
232 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm2
233 ; AVX512DQ-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
234 ; AVX512DQ-NEXT: retq
235 %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
240 ; Uniform Constant Shifts
; Shift left of <8 x i64> by the same constant (7) in every lane. Both RUN
; lines (shared ALL prefix) expect a single immediate-form vpsllq $7 on zmm.
243 define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
244 ; ALL-LABEL: splatconstant_shift_v8i64:
246 ; ALL-NEXT: vpsllq $7, %zmm0, %zmm0
248 %shift = shl <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
; Shift left of <16 x i32> by the same constant (5) in every lane. Both RUN
; lines (shared ALL prefix) expect a single immediate-form vpslld $5 on zmm.
252 define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
253 ; ALL-LABEL: splatconstant_shift_v16i32:
255 ; ALL-NEXT: vpslld $5, %zmm0, %zmm0
257 %shift = shl <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
258 ret <16 x i32> %shift
; Shift left of <32 x i16> by the same constant (3) in every lane.
;  - the avx512dq run expects two immediate-form vpsllw $3 instructions, one
;    per ymm half;
;  - the avx512bw run expects one immediate-form vpsllw $3 on the zmm register.
261 define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
262 ; AVX512DQ-LABEL: splatconstant_shift_v32i16:
264 ; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0
265 ; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1
266 ; AVX512DQ-NEXT: retq
268 ; AVX512BW-LABEL: splatconstant_shift_v32i16:
270 ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0
271 ; AVX512BW-NEXT: retq
272 %shift = shl <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
273 ret <32 x i16> %shift
276 define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
277 ; AVX512DQ-LABEL: splatconstant_shift_v64i8:
279 ; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0
280 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
281 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
282 ; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1
283 ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
284 ; AVX512DQ-NEXT: retq
286 ; AVX512BW-LABEL: splatconstant_shift_v64i8:
288 ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0
289 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
290 ; AVX512BW-NEXT: retq
291 %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>