1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
3 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
4 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
5 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
6 ; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
9 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
10 target triple = "x86_64-unknown-linux-gnu"
14 ; SCALAR: extractelement <16 x float*>
15 ; SCALAR-NEXT: load float
16 ; SCALAR-NEXT: insertelement <16 x float>
17 ; SCALAR-NEXT: extractelement <16 x float*>
18 ; SCALAR-NEXT: load float
20 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
21 ; KNL_64-LABEL: test1:
23 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
24 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
25 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
28 ; KNL_32-LABEL: test1:
30 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
31 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
32 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
33 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
38 ; SKX-NEXT: kxnorw %k0, %k0, %k1
39 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
40 ; SKX-NEXT: vmovaps %zmm1, %zmm0
43 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
44 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
46 %sext_ind = sext <16 x i32> %ind to <16 x i64>
47 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
49 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
; Declarations of the masked gather intrinsics exercised by the wide-vector
; tests above (16 x i32 / 16 x float / 8 x i32 element forms).
53 declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
54 declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
55 declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
59 ; SCALAR: extractelement <16 x float*>
60 ; SCALAR-NEXT: load float
61 ; SCALAR-NEXT: insertelement <16 x float>
62 ; SCALAR-NEXT: br label %else
64 ; SCALAR-NEXT: %res.phi.else = phi
65 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
66 ; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
67 ; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2
69 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
70 ; KNL_64-LABEL: test2:
72 ; KNL_64-NEXT: kmovw %esi, %k1
73 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
74 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
77 ; KNL_32-LABEL: test2:
79 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
80 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
81 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
82 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
87 ; SKX-NEXT: kmovw %esi, %k1
88 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
89 ; SKX-NEXT: vmovaps %zmm1, %zmm0
92 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
93 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
95 %sext_ind = sext <16 x i32> %ind to <16 x i64>
96 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
97 %imask = bitcast i16 %mask to <16 x i1>
98 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
102 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
103 ; KNL_64-LABEL: test3:
105 ; KNL_64-NEXT: kmovw %esi, %k1
106 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
107 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
110 ; KNL_32-LABEL: test3:
112 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
113 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
114 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
115 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
120 ; SKX-NEXT: kmovw %esi, %k1
121 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
122 ; SKX-NEXT: vmovaps %zmm1, %zmm0
125 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
126 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
128 %sext_ind = sext <16 x i32> %ind to <16 x i64>
129 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
130 %imask = bitcast i16 %mask to <16 x i1>
131 %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
136 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
137 ; KNL_64-LABEL: test4:
139 ; KNL_64-NEXT: kmovw %esi, %k1
140 ; KNL_64-NEXT: kmovw %k1, %k2
141 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
142 ; KNL_64-NEXT: vmovaps %zmm1, %zmm2
143 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
144 ; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
147 ; KNL_32-LABEL: test4:
149 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
150 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
151 ; KNL_32-NEXT: kmovw %k1, %k2
152 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
153 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2
154 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
155 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
160 ; SKX-NEXT: kmovw %esi, %k1
161 ; SKX-NEXT: kmovw %k1, %k2
162 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
163 ; SKX-NEXT: vmovaps %zmm1, %zmm2
164 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
165 ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
168 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
169 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
171 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
172 %imask = bitcast i16 %mask to <16 x i1>
173 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
174 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
175 %res = add <16 x i32> %gt1, %gt2
180 ; SCALAR-LABEL: test5
181 ; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
182 ; SCALAR-NEXT: %ToStore0 = icmp eq i1 %Mask0, true
183 ; SCALAR-NEXT: br i1 %ToStore0, label %cond.store, label %else
184 ; SCALAR: cond.store:
185 ; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0
186 ; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
187 ; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
188 ; SCALAR-NEXT: br label %else
190 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
191 ; SCALAR-NEXT: %ToStore1 = icmp eq i1 %Mask1, true
192 ; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2
194 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
195 ; KNL_64-LABEL: test5:
197 ; KNL_64-NEXT: kmovw %esi, %k1
198 ; KNL_64-NEXT: kmovw %k1, %k2
199 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
200 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
203 ; KNL_32-LABEL: test5:
205 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
206 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
207 ; KNL_32-NEXT: kmovw %k1, %k2
208 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
209 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
214 ; SKX-NEXT: kmovw %esi, %k1
215 ; SKX-NEXT: kmovw %k1, %k2
216 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
217 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
220 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
221 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
223 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
224 %imask = bitcast i16 %mask to <16 x i1>
225 call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
226 call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
; Declarations of the masked scatter intrinsics used by the store-side tests
; (8- and 16-element i32 forms).
230 declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
231 declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
234 ; SCALAR-LABEL: test6
235 ; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
236 ; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1
237 ; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
238 ; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
239 ; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2
240 ; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
241 ; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
243 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
244 ; KNL_64-LABEL: test6:
246 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
247 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
248 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
249 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
250 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
253 ; KNL_32-LABEL: test6:
255 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
256 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
257 ; KNL_32-NEXT: kxnorw %k0, %k0, %k2
258 ; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
259 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
260 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
265 ; SKX-NEXT: kxnorw %k0, %k0, %k1
266 ; SKX-NEXT: kxnorw %k0, %k0, %k2
267 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
268 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
269 ; SKX-NEXT: vmovaps %zmm2, %zmm0
272 %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
274 call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
278 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
280 ; KNL_64-LABEL: test7:
282 ; KNL_64-NEXT: movzbl %sil, %eax
283 ; KNL_64-NEXT: kmovw %eax, %k1
284 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
285 ; KNL_64-NEXT: kmovw %k1, %k2
286 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
287 ; KNL_64-NEXT: vmovaps %zmm1, %zmm2
288 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
289 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
292 ; KNL_32-LABEL: test7:
294 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
295 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
296 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
297 ; KNL_32-NEXT: kmovw %k1, %k2
298 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
299 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2
300 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
301 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
306 ; SKX-NEXT: kmovb %esi, %k1
307 ; SKX-NEXT: kmovw %k1, %k2
308 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
309 ; SKX-NEXT: vmovaps %zmm1, %zmm2
310 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
311 ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
314 %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
315 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
317 %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
318 %imask = bitcast i8 %mask to <8 x i1>
319 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
320 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
321 %res = add <8 x i32> %gt1, %gt2
325 ; No uniform base in this case, index <8 x i64> contains addresses,
326 ; each gather call will be split into two
327 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
328 ; KNL_64-LABEL: test8:
330 ; KNL_64-NEXT: kmovw %edi, %k1
331 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
332 ; KNL_64-NEXT: kmovw %k2, %k3
333 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
334 ; KNL_64-NEXT: kmovw %k1, %k3
335 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
336 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
337 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
338 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
339 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
340 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
343 ; KNL_32-LABEL: test8:
345 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
346 ; KNL_32-NEXT: kmovw %k1, %k2
347 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
348 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2
349 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
350 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
355 ; SKX-NEXT: kmovw %edi, %k1
356 ; SKX-NEXT: kshiftrw $8, %k1, %k2
357 ; SKX-NEXT: kmovw %k2, %k3
358 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
359 ; SKX-NEXT: kmovw %k1, %k3
360 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
361 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4
362 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
363 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
364 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
365 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
368 ; SKX_32-LABEL: test8:
370 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
371 ; SKX_32-NEXT: kmovw %k1, %k2
372 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
373 ; SKX_32-NEXT: vmovaps %zmm1, %zmm2
374 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
375 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
378 %imask = bitcast i16 %mask to <16 x i1>
379 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
380 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
381 %res = add <16 x i32> %gt1, %gt2
385 %struct.RT = type { i8, [10 x [20 x i32]], i8 }
386 %struct.ST = type { i32, double, %struct.RT }
388 ; Masked gather for aggregate types
389 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
392 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
393 ; KNL_64-LABEL: test9:
394 ; KNL_64: # BB#0: # %entry
395 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
396 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
397 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
398 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
399 ; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
400 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
401 ; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
402 ; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
403 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
404 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
405 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
406 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
407 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
408 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
409 ; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
410 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
411 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
412 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
413 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
416 ; KNL_32-LABEL: test9:
417 ; KNL_32: # BB#0: # %entry
418 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
419 ; KNL_32-NEXT: vpbroadcastd .LCPI8_0, %ymm3
420 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
421 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
422 ; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3
423 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
424 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
425 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
426 ; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1
427 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
428 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
429 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
430 ; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
434 ; SKX: # BB#0: # %entry
435 ; SKX-NEXT: vpbroadcastq %rdi, %zmm2
436 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
437 ; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
438 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
439 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
440 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
441 ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
442 ; SKX-NEXT: kxnorw %k0, %k0, %k1
443 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
446 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
447 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
449 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
450 %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
454 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
455 ; KNL_64-LABEL: test10:
456 ; KNL_64: # BB#0: # %entry
457 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
458 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
459 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
460 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
461 ; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
462 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
463 ; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
464 ; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
465 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
466 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
467 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
468 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
469 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
470 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
471 ; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
472 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
473 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
474 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
475 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
478 ; KNL_32-LABEL: test10:
479 ; KNL_32: # BB#0: # %entry
480 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
481 ; KNL_32-NEXT: vpbroadcastd .LCPI9_0, %ymm3
482 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
483 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
484 ; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3
485 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
486 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
487 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
488 ; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1
489 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
490 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
491 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
492 ; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
496 ; SKX: # BB#0: # %entry
497 ; SKX-NEXT: vpbroadcastq %rdi, %zmm2
498 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
499 ; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
500 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
501 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
502 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
503 ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
504 ; SKX-NEXT: kxnorw %k0, %k0, %k1
505 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
508 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
509 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
511 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
512 %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
516 ; Splat index in GEP, requires broadcast
517 define <16 x float> @test11(float* %base, i32 %ind) {
518 ; KNL_64-LABEL: test11:
520 ; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
521 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
522 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
525 ; KNL_32-LABEL: test11:
527 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
528 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1
529 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
530 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
535 ; SKX-NEXT: vpbroadcastd %esi, %zmm1
536 ; SKX-NEXT: kxnorw %k0, %k0, %k1
537 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
540 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
541 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
543 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
545 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
549 ; We are checking the uniform base here. It is taken directly from input to vgatherdps
550 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
551 ; KNL_64-LABEL: test12:
553 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
554 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
555 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
558 ; KNL_32-LABEL: test12:
560 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
561 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
562 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
563 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
568 ; SKX-NEXT: kxnorw %k0, %k0, %k1
569 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
570 ; SKX-NEXT: vmovaps %zmm1, %zmm0
573 %sext_ind = sext <16 x i32> %ind to <16 x i64>
574 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
576 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
580 ; The same as the previous, but the mask is undefined
581 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
582 ; KNL_64-LABEL: test13:
584 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
585 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
588 ; KNL_32-LABEL: test13:
590 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
591 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
592 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
597 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
598 ; SKX-NEXT: vmovaps %zmm1, %zmm0
601 %sext_ind = sext <16 x i32> %ind to <16 x i64>
602 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
604 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
608 ; The base pointer is not a splat, so a uniform base cannot be found
609 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
610 ; KNL_64-LABEL: test14:
612 ; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
613 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
614 ; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
615 ; KNL_64-NEXT: vmovd %esi, %xmm1
616 ; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
617 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
618 ; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
619 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
620 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
621 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
622 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
623 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
626 ; KNL_32-LABEL: test14:
628 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
629 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
630 ; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
631 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
632 ; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
633 ; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
638 ; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
639 ; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
640 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
641 ; SKX-NEXT: vmovd %esi, %xmm1
642 ; SKX-NEXT: vpbroadcastd %xmm1, %ymm1
643 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
644 ; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
645 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
646 ; SKX-NEXT: kshiftrw $8, %k0, %k1
647 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
648 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
649 ; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
652 ; SKX_32-LABEL: test14:
654 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
655 ; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
656 ; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
657 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
658 ; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
659 ; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
662 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
663 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
665 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
667 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
; Declarations of the narrow (sub-512-bit) masked gather intrinsics used by the
; "gather smaller than existing instruction" tests below.
671 declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
672 declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
673 declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
675 ; Gather smaller than existing instruction
676 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
678 ; KNL_64-LABEL: test15:
680 ; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2
681 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
682 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
683 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0
684 ; KNL_64-NEXT: vpsllq $63, %zmm0, %zmm0
685 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1
686 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
689 ; KNL_32-LABEL: test15:
691 ; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2
692 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
693 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
694 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
695 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0
696 ; KNL_32-NEXT: vpsllvq .LCPI14_0, %zmm0, %zmm0
697 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1
698 ; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
703 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
704 ; SKX-NEXT: vpmovd2m %xmm1, %k1
705 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
706 ; SKX-NEXT: vmovaps %zmm1, %zmm0
709 ; SKX_32-LABEL: test15:
711 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
712 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
713 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
714 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
715 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
718 %sext_ind = sext <4 x i32> %ind to <4 x i64>
719 %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
720 %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
724 ; Gather smaller than existing instruction
725 define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
727 ; KNL_64-LABEL: test16:
729 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
730 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
731 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
732 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
733 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
734 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
735 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
736 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
737 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
738 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
741 ; KNL_32-LABEL: test16:
743 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
744 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
745 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
746 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
747 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
748 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
749 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
750 ; KNL_32-NEXT: vpsllvq .LCPI15_0, %zmm1, %zmm1
751 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
752 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
753 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
758 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
759 ; SKX-NEXT: vpmovd2m %xmm1, %k1
760 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
761 ; SKX-NEXT: vmovaps %zmm2, %zmm0
764 ; SKX_32-LABEL: test16:
766 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
767 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
768 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
769 ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
770 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
773 %sext_ind = sext <4 x i32> %ind to <4 x i64>
774 %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
775 %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
779 define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
781 ; KNL_64-LABEL: test17:
783 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
784 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
785 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
786 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
787 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
788 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
791 ; KNL_32-LABEL: test17:
793 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
794 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
795 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
796 ; KNL_32-NEXT: vpsllvq .LCPI16_0, %zmm1, %zmm1
797 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
798 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
799 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
804 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
805 ; SKX-NEXT: vpmovq2m %xmm1, %k1
806 ; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
807 ; SKX-NEXT: vmovaps %zmm2, %zmm0
810 ; SKX_32-LABEL: test17:
812 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
813 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
814 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
815 ; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
816 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
819 %sext_ind = sext <2 x i32> %ind to <2 x i64>
820 %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
821 %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
; Declarations of the narrow masked scatter intrinsics used by the widening
; tests below (4- and 2-element integer/float forms).
825 declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
826 declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
827 declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
828 declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
829 declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
831 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
833 ; KNL_64-LABEL: test18:
835 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
836 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
837 ; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
838 ; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
839 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
840 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
843 ; KNL_32-LABEL: test18:
845 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
846 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
847 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
848 ; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
849 ; KNL_32-NEXT: vpsllvq .LCPI17_0, %zmm2, %zmm2
850 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
851 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
856 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
857 ; SKX-NEXT: vpmovd2m %xmm2, %k1
858 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
861 ; SKX_32-LABEL: test18:
863 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
864 ; SKX_32-NEXT: vpmovd2m %xmm2, %k1
865 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
867 call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
871 define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
873 ; KNL_64-LABEL: test19:
875 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
876 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
877 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
878 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
879 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
880 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
881 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
882 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
885 ; KNL_32-LABEL: test19:
887 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
888 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
889 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
890 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
891 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
892 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
893 ; KNL_32-NEXT: vpsllvq .LCPI18_0, %zmm1, %zmm1
894 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
895 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
900 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
901 ; SKX-NEXT: vpmovd2m %xmm1, %k1
902 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
905 ; SKX_32-LABEL: test19:
907 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
908 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
909 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
910 ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
912 %gep = getelementptr double, double* %ptr, <4 x i64> %ind
913 call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
917 ; Data type requires widening
918 define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
920 ; KNL_64-LABEL: test20:
922 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
923 ; KNL_64-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero
924 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
925 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
926 ; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
927 ; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
928 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
929 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
932 ; KNL_32-LABEL: test20:
934 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
935 ; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero
936 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
937 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
938 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
939 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
940 ; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
941 ; KNL_32-NEXT: vpsllvq .LCPI19_0, %zmm2, %zmm2
942 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
943 ; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
948 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
949 ; SKX-NEXT: vpmovq2m %xmm2, %k0
950 ; SKX-NEXT: kshiftlw $2, %k0, %k0
951 ; SKX-NEXT: kshiftrw $2, %k0, %k1
952 ; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
955 ; SKX_32-LABEL: test20:
957 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
958 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
959 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0
960 ; SKX_32-NEXT: kshiftlw $2, %k0, %k0
961 ; SKX_32-NEXT: kshiftrw $2, %k0, %k1
962 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
964 call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
968 ; Data type requires promotion
969 define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
971 ; KNL_64-LABEL: test21:
973 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
974 ; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
975 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
976 ; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
977 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
978 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
981 ; KNL_32-LABEL: test21:
983 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
984 ; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
985 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
986 ; KNL_32-NEXT: vpsllvq .LCPI20_0, %zmm2, %zmm2
987 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
988 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
993 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
994 ; SKX-NEXT: vpmovq2m %xmm2, %k0
995 ; SKX-NEXT: kshiftlw $2, %k0, %k0
996 ; SKX-NEXT: kshiftrw $2, %k0, %k1
997 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
998 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1001 ; SKX_32-LABEL: test21:
1003 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1004 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0
1005 ; SKX_32-NEXT: kshiftlw $2, %k0, %k0
1006 ; SKX_32-NEXT: kshiftrw $2, %k0, %k1
1007 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1008 ; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1010 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
1014 ; The result type requires widening
1015 declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
1017 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
1020 ; KNL_64-LABEL: test22:
1022 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1023 ; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
1024 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
1025 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1026 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1027 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
1028 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
1029 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
1030 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
1031 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
1032 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
1035 ; KNL_32-LABEL: test22:
1037 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1038 ; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
1039 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
1040 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1041 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1042 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1043 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
1044 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
1045 ; KNL_32-NEXT: vpsllvq .LCPI21_0, %zmm1, %zmm1
1046 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1047 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
1048 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1051 ; SKX-LABEL: test22:
1053 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1054 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1055 ; SKX-NEXT: vpmovq2m %xmm1, %k0
1056 ; SKX-NEXT: kshiftlw $2, %k0, %k0
1057 ; SKX-NEXT: kshiftrw $2, %k0, %k1
1058 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
1059 ; SKX-NEXT: vmovaps %zmm2, %zmm0
1062 ; SKX_32-LABEL: test22:
1064 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1065 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1066 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
1067 ; SKX_32-NEXT: kshiftlw $2, %k0, %k0
1068 ; SKX_32-NEXT: kshiftrw $2, %k0, %k1
1069 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1070 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
1071 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
1073 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1074 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1075 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
; Narrow-vector gather declarations: v2i32 requires element promotion and
; v2i64 maps directly to a 128-bit vpgatherqq (used by test23..test26).
1079 declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
1080 declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
1082 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1084 ; KNL_64-LABEL: test23:
1086 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
1087 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1088 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
1089 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
1090 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1091 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
1094 ; KNL_32-LABEL: test23:
1096 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
1097 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1098 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1099 ; KNL_32-NEXT: vpsllvq .LCPI22_0, %zmm1, %zmm1
1100 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1101 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1102 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1105 ; SKX-LABEL: test23:
1107 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1108 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1109 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1110 ; SKX-NEXT: vmovaps %zmm2, %zmm0
1113 ; SKX_32-LABEL: test23:
1115 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1116 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1117 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1118 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
1119 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
1121 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1122 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1123 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
1127 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
1128 ; KNL_64-LABEL: test24:
1130 ; KNL_64-NEXT: movb $3, %al
1131 ; KNL_64-NEXT: movzbl %al, %eax
1132 ; KNL_64-NEXT: kmovw %eax, %k1
1133 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1134 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1137 ; KNL_32-LABEL: test24:
1139 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1140 ; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
1141 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
1142 ; KNL_32-NEXT: vpsllvq .LCPI23_1, %zmm1, %zmm1
1143 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1144 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1145 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1148 ; SKX-LABEL: test24:
1150 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1151 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1152 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1155 ; SKX_32-LABEL: test24:
1157 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1158 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1159 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
1160 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
1162 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1163 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1164 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1168 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
1170 ; KNL_64-LABEL: test25:
1172 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
1173 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1174 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
1175 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
1176 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1177 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
1180 ; KNL_32-LABEL: test25:
1182 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
1183 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1184 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1185 ; KNL_32-NEXT: vpsllvq .LCPI24_0, %zmm1, %zmm1
1186 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1187 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1188 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1191 ; SKX-LABEL: test25:
1193 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1194 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1195 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1196 ; SKX-NEXT: vmovaps %zmm2, %zmm0
1199 ; SKX_32-LABEL: test25:
1201 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1202 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1203 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1204 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
1205 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
1207 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1208 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1209 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
1213 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
1215 ; KNL_64-LABEL: test26:
1217 ; KNL_64-NEXT: movb $3, %al
1218 ; KNL_64-NEXT: movzbl %al, %eax
1219 ; KNL_64-NEXT: kmovw %eax, %k1
1220 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1221 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1224 ; KNL_32-LABEL: test26:
1226 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1227 ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
1228 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
1229 ; KNL_32-NEXT: vpsllvq .LCPI25_1, %zmm2, %zmm2
1230 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
1231 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1232 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1235 ; SKX-LABEL: test26:
1237 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1238 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1239 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1242 ; SKX_32-LABEL: test26:
1244 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1245 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1246 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
1247 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
1249 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1250 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1251 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
1255 ; Result type requires widening; all-ones mask
1256 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
1258 ; KNL_64-LABEL: test27:
1260 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1261 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
1262 ; KNL_64-NEXT: movb $3, %al
1263 ; KNL_64-NEXT: movzbl %al, %eax
1264 ; KNL_64-NEXT: kmovw %eax, %k1
1265 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
1268 ; KNL_32-LABEL: test27:
1270 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1271 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1272 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
1273 ; KNL_32-NEXT: movb $3, %cl
1274 ; KNL_32-NEXT: movzbl %cl, %ecx
1275 ; KNL_32-NEXT: kmovw %ecx, %k1
1276 ; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
1279 ; SKX-LABEL: test27:
1281 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
1282 ; SKX-NEXT: movb $3, %al
1283 ; SKX-NEXT: kmovb %eax, %k1
1284 ; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
1286 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1287 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1288 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
1292 ; Data type requires promotion, mask is all-ones
1293 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
1296 ; KNL_64-LABEL: test28:
1298 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1299 ; KNL_64-NEXT: movb $3, %al
1300 ; KNL_64-NEXT: movzbl %al, %eax
1301 ; KNL_64-NEXT: kmovw %eax, %k1
1302 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1305 ; KNL_32-LABEL: test28:
1307 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1308 ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
1309 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
1310 ; KNL_32-NEXT: vpsllvq .LCPI27_1, %zmm2, %zmm2
1311 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
1312 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1315 ; SKX-LABEL: test28:
1317 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1318 ; SKX-NEXT: movb $3, %al
1319 ; SKX-NEXT: kmovb %eax, %k1
1320 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1323 ; SKX_32-LABEL: test28:
1325 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1326 ; SKX_32-NEXT: movb $3, %al
1327 ; SKX_32-NEXT: kmovb %eax, %k1
1328 ; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1330 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
1335 ; SCALAR-LABEL: test29
1336 ; SCALAR: extractelement <16 x float*>
1337 ; SCALAR-NEXT: load float
1338 ; SCALAR-NEXT: insertelement <16 x float>
1339 ; SCALAR-NEXT: extractelement <16 x float*>
1340 ; SCALAR-NEXT: load float
1342 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
1343 ; KNL_64-LABEL: test29:
1345 ; KNL_64-NEXT: movw $44, %ax
1346 ; KNL_64-NEXT: kmovw %eax, %k1
1347 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1348 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1351 ; KNL_32-LABEL: test29:
1353 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1354 ; KNL_32-NEXT: movw $44, %cx
1355 ; KNL_32-NEXT: kmovw %ecx, %k1
1356 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1357 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1360 ; SKX-LABEL: test29:
1362 ; SKX-NEXT: movw $44, %ax
1363 ; SKX-NEXT: kmovw %eax, %k1
1364 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1365 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1368 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
1369 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
1371 %sext_ind = sext <16 x i32> %ind to <16 x i64>
1372 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
1374 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
1375 ret <16 x float>%res
1378 ; Check non-power-of-2 case. It should be scalarized.
1379 declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
1380 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1381 ; KNL_64-LABEL: test30:
1383 ; KNL_64-NEXT: andl $1, %edx
1384 ; KNL_64-NEXT: kmovw %edx, %k1
1385 ; KNL_64-NEXT: andl $1, %esi
1386 ; KNL_64-NEXT: kmovw %esi, %k2
1387 ; KNL_64-NEXT: movl %edi, %eax
1388 ; KNL_64-NEXT: andl $1, %eax
1389 ; KNL_64-NEXT: kmovw %eax, %k0
1390 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
1391 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
1392 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1393 ; KNL_64-NEXT: # implicit-def: %XMM0
1394 ; KNL_64-NEXT: testb $1, %dil
1395 ; KNL_64-NEXT: je .LBB29_2
1396 ; KNL_64-NEXT: # BB#1: # %cond.load
1397 ; KNL_64-NEXT: vmovq %xmm1, %rax
1398 ; KNL_64-NEXT: vmovd (%rax), %xmm0
1399 ; KNL_64-NEXT: .LBB29_2: # %else
1400 ; KNL_64-NEXT: kmovw %k2, %eax
1401 ; KNL_64-NEXT: movl %eax, %ecx
1402 ; KNL_64-NEXT: andl $1, %ecx
1403 ; KNL_64-NEXT: testb %cl, %cl
1404 ; KNL_64-NEXT: je .LBB29_4
1405 ; KNL_64-NEXT: # BB#3: # %cond.load1
1406 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
1407 ; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
1408 ; KNL_64-NEXT: .LBB29_4: # %else2
1409 ; KNL_64-NEXT: kmovw %k1, %ecx
1410 ; KNL_64-NEXT: movl %ecx, %edx
1411 ; KNL_64-NEXT: andl $1, %edx
1412 ; KNL_64-NEXT: testb %dl, %dl
1413 ; KNL_64-NEXT: je .LBB29_6
1414 ; KNL_64-NEXT: # BB#5: # %cond.load4
1415 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
1416 ; KNL_64-NEXT: vmovq %xmm1, %rdx
1417 ; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0
1418 ; KNL_64-NEXT: .LBB29_6: # %else5
1419 ; KNL_64-NEXT: kmovw %k0, %edx
1420 ; KNL_64-NEXT: vmovd %edx, %xmm1
1421 ; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
1422 ; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
1423 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
1424 ; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
1427 ; KNL_32-LABEL: test30:
1429 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1430 ; KNL_32-NEXT: andl $1, %eax
1431 ; KNL_32-NEXT: kmovw %eax, %k1
1432 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1433 ; KNL_32-NEXT: andl $1, %eax
1434 ; KNL_32-NEXT: kmovw %eax, %k2
1435 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1436 ; KNL_32-NEXT: movl %eax, %ecx
1437 ; KNL_32-NEXT: andl $1, %ecx
1438 ; KNL_32-NEXT: kmovw %ecx, %k0
1439 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
1440 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1441 ; KNL_32-NEXT: # implicit-def: %XMM0
1442 ; KNL_32-NEXT: testb $1, %al
1443 ; KNL_32-NEXT: je .LBB29_2
1444 ; KNL_32-NEXT: # BB#1: # %cond.load
1445 ; KNL_32-NEXT: vmovd %xmm1, %eax
1446 ; KNL_32-NEXT: vmovd (%eax), %xmm0
1447 ; KNL_32-NEXT: .LBB29_2: # %else
1448 ; KNL_32-NEXT: kmovw %k2, %eax
1449 ; KNL_32-NEXT: movl %eax, %ecx
1450 ; KNL_32-NEXT: andl $1, %ecx
1451 ; KNL_32-NEXT: testb %cl, %cl
1452 ; KNL_32-NEXT: je .LBB29_4
1453 ; KNL_32-NEXT: # BB#3: # %cond.load1
1454 ; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
1455 ; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
1456 ; KNL_32-NEXT: .LBB29_4: # %else2
1457 ; KNL_32-NEXT: kmovw %k1, %ecx
1458 ; KNL_32-NEXT: movl %ecx, %edx
1459 ; KNL_32-NEXT: andl $1, %edx
1460 ; KNL_32-NEXT: testb %dl, %dl
1461 ; KNL_32-NEXT: je .LBB29_6
1462 ; KNL_32-NEXT: # BB#5: # %cond.load4
1463 ; KNL_32-NEXT: vpextrd $2, %xmm1, %edx
1464 ; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0
1465 ; KNL_32-NEXT: .LBB29_6: # %else5
1466 ; KNL_32-NEXT: kmovw %k0, %edx
1467 ; KNL_32-NEXT: vmovd %edx, %xmm1
1468 ; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
1469 ; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
1470 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
1471 ; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
1474 ; SKX-LABEL: test30:
1476 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
1477 ; SKX-NEXT: vpmovd2m %xmm2, %k1
1478 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
1479 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
1480 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
1481 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1482 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
1483 ; SKX-NEXT: # implicit-def: %XMM0
1484 ; SKX-NEXT: andb $1, %al
1485 ; SKX-NEXT: je .LBB29_2
1486 ; SKX-NEXT: # BB#1: # %cond.load
1487 ; SKX-NEXT: vmovq %xmm1, %rax
1488 ; SKX-NEXT: vmovd (%rax), %xmm0
1489 ; SKX-NEXT: .LBB29_2: # %else
1490 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
1491 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
1492 ; SKX-NEXT: andb $1, %al
1493 ; SKX-NEXT: je .LBB29_4
1494 ; SKX-NEXT: # BB#3: # %cond.load1
1495 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
1496 ; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
1497 ; SKX-NEXT: .LBB29_4: # %else2
1498 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
1499 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
1500 ; SKX-NEXT: andb $1, %al
1501 ; SKX-NEXT: je .LBB29_6
1502 ; SKX-NEXT: # BB#5: # %cond.load4
1503 ; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1
1504 ; SKX-NEXT: vmovq %xmm1, %rax
1505 ; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
1506 ; SKX-NEXT: .LBB29_6: # %else5
1507 ; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
1508 ; SKX-NEXT: vmovaps %zmm3, %zmm0
1511 ; SKX_32-LABEL: test30:
1513 ; SKX_32-NEXT: subl $12, %esp
1514 ; SKX_32-NEXT: .Ltmp0:
1515 ; SKX_32-NEXT: .cfi_def_cfa_offset 16
1516 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
1517 ; SKX_32-NEXT: vpmovd2m %xmm2, %k1
1518 ; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp)
1519 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
1520 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm2
1521 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
1522 ; SKX_32-NEXT: # implicit-def: %XMM1
1523 ; SKX_32-NEXT: andb $1, %al
1524 ; SKX_32-NEXT: je .LBB29_2
1525 ; SKX_32-NEXT: # BB#1: # %cond.load
1526 ; SKX_32-NEXT: vmovd %xmm2, %eax
1527 ; SKX_32-NEXT: vmovd (%eax), %xmm1
1528 ; SKX_32-NEXT: .LBB29_2: # %else
1529 ; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp)
1530 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
1531 ; SKX_32-NEXT: andb $1, %al
1532 ; SKX_32-NEXT: je .LBB29_4
1533 ; SKX_32-NEXT: # BB#3: # %cond.load1
1534 ; SKX_32-NEXT: vpextrd $1, %xmm2, %eax
1535 ; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm1, %xmm1
1536 ; SKX_32-NEXT: .LBB29_4: # %else2
1537 ; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm0
1538 ; SKX_32-NEXT: kmovb %k1, (%esp)
1539 ; SKX_32-NEXT: movb (%esp), %al
1540 ; SKX_32-NEXT: andb $1, %al
1541 ; SKX_32-NEXT: je .LBB29_6
1542 ; SKX_32-NEXT: # BB#5: # %cond.load4
1543 ; SKX_32-NEXT: vpextrd $2, %xmm2, %eax
1544 ; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm1
1545 ; SKX_32-NEXT: .LBB29_6: # %else5
1546 ; SKX_32-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
1547 ; SKX_32-NEXT: addl $12, %esp
1550 %sext_ind = sext <3 x i32> %ind to <3 x i64>
1551 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
1552 %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
1556 declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
1561 define <16 x float*> @test31(<16 x float**> %ptrs) {
1562 ; KNL_64-LABEL: test31:
1564 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
1565 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
1566 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1567 ; KNL_64-NEXT: kshiftrw $8, %k1, %k1
1568 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1569 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
1570 ; KNL_64-NEXT: vmovaps %zmm3, %zmm1
1573 ; KNL_32-LABEL: test31:
1575 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
1576 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1577 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1580 ; SKX-LABEL: test31:
1582 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1583 ; SKX-NEXT: kxnorw %k0, %k0, %k2
1584 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1585 ; SKX-NEXT: kshiftrw $8, %k1, %k1
1586 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1587 ; SKX-NEXT: vmovaps %zmm2, %zmm0
1588 ; SKX-NEXT: vmovaps %zmm3, %zmm1
1591 ; SKX_32-LABEL: test31:
1593 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1594 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1595 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
1598 %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
1599 ret <16 x float*>%res
1602 define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1603 ; KNL_64-LABEL: test_gather_16i32:
1605 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1606 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1607 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1608 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1609 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1610 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1611 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1612 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1615 ; KNL_32-LABEL: test_gather_16i32:
1617 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1618 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1619 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1620 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1621 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1624 ; SKX-LABEL: test_gather_16i32:
1626 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1627 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1628 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1629 ; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
1630 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1631 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1632 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1633 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
1636 ; SKX_32-LABEL: test_gather_16i32:
1638 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1639 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1640 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1641 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1642 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
1644 %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
1647 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1648 ; KNL_64-LABEL: test_gather_16i64:
1650 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1651 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1652 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1653 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1654 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1655 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1656 ; KNL_64-NEXT: vmovaps %zmm3, %zmm0
1657 ; KNL_64-NEXT: vmovaps %zmm4, %zmm1
1660 ; KNL_32-LABEL: test_gather_16i64:
1662 ; KNL_32-NEXT: pushl %ebp
1663 ; KNL_32-NEXT: .Ltmp0:
1664 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1665 ; KNL_32-NEXT: .Ltmp1:
1666 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1667 ; KNL_32-NEXT: movl %esp, %ebp
1668 ; KNL_32-NEXT: .Ltmp2:
1669 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1670 ; KNL_32-NEXT: andl $-64, %esp
1671 ; KNL_32-NEXT: subl $64, %esp
1672 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1673 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1674 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1675 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1676 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1677 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1678 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1679 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1680 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1681 ; KNL_32-NEXT: movl %ebp, %esp
1682 ; KNL_32-NEXT: popl %ebp
1685 ; SKX-LABEL: test_gather_16i64:
1687 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1688 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1689 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1690 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1691 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1692 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1693 ; SKX-NEXT: vmovaps %zmm3, %zmm0
1694 ; SKX-NEXT: vmovaps %zmm4, %zmm1
1697 ; SKX_32-LABEL: test_gather_16i64:
1699 ; SKX_32-NEXT: pushl %ebp
1700 ; SKX_32-NEXT: .Ltmp1:
1701 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1702 ; SKX_32-NEXT: .Ltmp2:
1703 ; SKX_32-NEXT: .cfi_offset %ebp, -8
1704 ; SKX_32-NEXT: movl %esp, %ebp
1705 ; SKX_32-NEXT: .Ltmp3:
1706 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
1707 ; SKX_32-NEXT: andl $-64, %esp
1708 ; SKX_32-NEXT: subl $64, %esp
1709 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1710 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1711 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1712 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1713 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
1714 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1715 ; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
1716 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1717 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
1718 ; SKX_32-NEXT: movl %ebp, %esp
1719 ; SKX_32-NEXT: popl %ebp
1721 %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
1724 declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
1725 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
1726 ; KNL_64-LABEL: test_gather_16f32:
1728 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1729 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1730 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1731 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
1732 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1733 ; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1734 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1735 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
1738 ; KNL_32-LABEL: test_gather_16f32:
1740 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1741 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1742 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1743 ; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
1744 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1747 ; SKX-LABEL: test_gather_16f32:
1749 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1750 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1751 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1752 ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
1753 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1754 ; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1755 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1756 ; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
1759 ; SKX_32-LABEL: test_gather_16f32:
1761 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1762 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1763 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1764 ; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
1765 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
1767 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
1768 ret <16 x float> %res
1770 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
1771 ; KNL_64-LABEL: test_gather_16f64:
1773 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1774 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1775 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1776 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1777 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
1778 ; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
1779 ; KNL_64-NEXT: vmovaps %zmm3, %zmm0
1780 ; KNL_64-NEXT: vmovaps %zmm4, %zmm1
1783 ; KNL_32-LABEL: test_gather_16f64:
1785 ; KNL_32-NEXT: pushl %ebp
1786 ; KNL_32-NEXT: .Ltmp3:
1787 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1788 ; KNL_32-NEXT: .Ltmp4:
1789 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1790 ; KNL_32-NEXT: movl %esp, %ebp
1791 ; KNL_32-NEXT: .Ltmp5:
1792 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1793 ; KNL_32-NEXT: andl $-64, %esp
1794 ; KNL_32-NEXT: subl $64, %esp
1795 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1796 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1797 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1798 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
1799 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1800 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
1801 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1802 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
1803 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1804 ; KNL_32-NEXT: movl %ebp, %esp
1805 ; KNL_32-NEXT: popl %ebp
1808 ; SKX-LABEL: test_gather_16f64:
1810 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1811 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1812 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1813 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1814 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
1815 ; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
1816 ; SKX-NEXT: vmovaps %zmm3, %zmm0
1817 ; SKX-NEXT: vmovaps %zmm4, %zmm1
1820 ; SKX_32-LABEL: test_gather_16f64:
1822 ; SKX_32-NEXT: pushl %ebp
1823 ; SKX_32-NEXT: .Ltmp4:
1824 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1825 ; SKX_32-NEXT: .Ltmp5:
1826 ; SKX_32-NEXT: .cfi_offset %ebp, -8
1827 ; SKX_32-NEXT: movl %esp, %ebp
1828 ; SKX_32-NEXT: .Ltmp6:
1829 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
1830 ; SKX_32-NEXT: andl $-64, %esp
1831 ; SKX_32-NEXT: subl $64, %esp
1832 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1833 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1834 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1835 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
1836 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
1837 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
1838 ; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
1839 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
1840 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
1841 ; SKX_32-NEXT: movl %ebp, %esp
1842 ; SKX_32-NEXT: popl %ebp
1844 %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
1845 ret <16 x double> %res
; Masked gather: loads 16 doubles through %ptrs for lanes where %mask is set;
; cleared lanes take their value from the %src0 pass-through operand.
declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
1848 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1849 ; KNL_64-LABEL: test_scatter_16i32:
1851 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1852 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1853 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1854 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1855 ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
1856 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
1857 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
1860 ; KNL_32-LABEL: test_scatter_16i32:
1862 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1863 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1864 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1865 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
1868 ; SKX-LABEL: test_scatter_16i32:
1870 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1871 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1872 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1873 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1874 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
1875 ; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
1876 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
1879 ; SKX_32-LABEL: test_scatter_16i32:
1881 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1882 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1883 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1884 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
1886 call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
1889 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1890 ; KNL_64-LABEL: test_scatter_16i64:
1892 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1893 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1894 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1895 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1896 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
1897 ; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
1900 ; KNL_32-LABEL: test_scatter_16i64:
1902 ; KNL_32-NEXT: pushl %ebp
1903 ; KNL_32-NEXT: .Ltmp6:
1904 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1905 ; KNL_32-NEXT: .Ltmp7:
1906 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1907 ; KNL_32-NEXT: movl %esp, %ebp
1908 ; KNL_32-NEXT: .Ltmp8:
1909 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1910 ; KNL_32-NEXT: andl $-64, %esp
1911 ; KNL_32-NEXT: subl $64, %esp
1912 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1913 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1914 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1915 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1916 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1917 ; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
1918 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1919 ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
1920 ; KNL_32-NEXT: movl %ebp, %esp
1921 ; KNL_32-NEXT: popl %ebp
1924 ; SKX-LABEL: test_scatter_16i64:
1926 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1927 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1928 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1929 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1930 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
1931 ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
1934 ; SKX_32-LABEL: test_scatter_16i64:
1936 ; SKX_32-NEXT: pushl %ebp
1937 ; SKX_32-NEXT: .Ltmp7:
1938 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1939 ; SKX_32-NEXT: .Ltmp8:
1940 ; SKX_32-NEXT: .cfi_offset %ebp, -8
1941 ; SKX_32-NEXT: movl %esp, %ebp
1942 ; SKX_32-NEXT: .Ltmp9:
1943 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
1944 ; SKX_32-NEXT: andl $-64, %esp
1945 ; SKX_32-NEXT: subl $64, %esp
1946 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1947 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1948 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1949 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1950 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
1951 ; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
1952 ; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
1953 ; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
1954 ; SKX_32-NEXT: movl %ebp, %esp
1955 ; SKX_32-NEXT: popl %ebp
1957 call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
; Masked scatter: stores the 16 i64 lanes of %src0 through %ptrs for lanes
; where %mask is set; cleared lanes store nothing.
declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
1961 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
1962 ; KNL_64-LABEL: test_scatter_16f32:
1964 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1965 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1966 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1967 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1968 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
1969 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
1970 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
1973 ; KNL_32-LABEL: test_scatter_16f32:
1975 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1976 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1977 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1978 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
1981 ; SKX-LABEL: test_scatter_16f32:
1983 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1984 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1985 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1986 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1987 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
1988 ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
1989 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
1992 ; SKX_32-LABEL: test_scatter_16f32:
1994 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1995 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1996 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1997 ; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
1999 call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
; Masked scatter: stores the 16 float lanes of %src0 through %ptrs for lanes
; where %mask is set; cleared lanes store nothing.
declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
2003 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2004 ; KNL_64-LABEL: test_scatter_16f64:
2006 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2007 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2008 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2009 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2010 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2011 ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2014 ; KNL_32-LABEL: test_scatter_16f64:
2016 ; KNL_32-NEXT: pushl %ebp
2017 ; KNL_32-NEXT: .Ltmp9:
2018 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2019 ; KNL_32-NEXT: .Ltmp10:
2020 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2021 ; KNL_32-NEXT: movl %esp, %ebp
2022 ; KNL_32-NEXT: .Ltmp11:
2023 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2024 ; KNL_32-NEXT: andl $-64, %esp
2025 ; KNL_32-NEXT: subl $64, %esp
2026 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2027 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2028 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2029 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2030 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2031 ; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2032 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2033 ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2034 ; KNL_32-NEXT: movl %ebp, %esp
2035 ; KNL_32-NEXT: popl %ebp
2038 ; SKX-LABEL: test_scatter_16f64:
2040 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2041 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2042 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
2043 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2044 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2045 ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2048 ; SKX_32-LABEL: test_scatter_16f64:
2050 ; SKX_32-NEXT: pushl %ebp
2051 ; SKX_32-NEXT: .Ltmp10:
2052 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2053 ; SKX_32-NEXT: .Ltmp11:
2054 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2055 ; SKX_32-NEXT: movl %esp, %ebp
2056 ; SKX_32-NEXT: .Ltmp12:
2057 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2058 ; SKX_32-NEXT: andl $-64, %esp
2059 ; SKX_32-NEXT: subl $64, %esp
2060 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2061 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2062 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2063 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2064 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2065 ; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2066 ; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
2067 ; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2068 ; SKX_32-NEXT: movl %ebp, %esp
2069 ; SKX_32-NEXT: popl %ebp
2071 call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
; Masked scatter: stores the 16 double lanes of %src0 through %ptrs for lanes
; where %mask is set; cleared lanes store nothing.
declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)