; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
declare <16 x float> @llvm.x86.avx512.gather.dps.512(<16 x float>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dps.512(i8*, i16, <16 x i32>, <16 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.dpd.512(<8 x double>, i8*, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpd.512(i8*, i8, <8 x i32>, <8 x double>, i32)

declare <8 x float> @llvm.x86.avx512.gather.qps.512(<8 x float>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qps.512(i8*, i8, <8 x i64>, <8 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.qpd.512(<8 x double>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpd.512(i8*, i8, <8 x i64>, <8 x double>, i32)

; Masked FP gather followed by a masked scatter of the gathered value, covering
; each index-width/element-width combination.  The incoming mask is expected to
; be duplicated (kmovq %k1, %k2) so the gather uses the copy in %k2 while %k1
; stays available for the scatter.
define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dps:
; CHECK: kmovw %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
  %x = call <16 x float> @llvm.x86.avx512.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dps.512(i8* %stbuf, i16 %mask, <16 x i32> %ind2, <16 x float> %x, i32 4)
  ret void
}

define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dpd:
; CHECK: kmovb %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpd.512(i8* %stbuf, i8 %mask, <8 x i32> %ind2, <8 x double> %x, i32 4)
  ret void
}

define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qps:
; CHECK: kmovb %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512(i8* %stbuf, i8 %mask, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}

define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qpd:
; CHECK: kmovb %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpd.512(i8* %stbuf, i8 %mask, <8 x i64> %ind2, <8 x double> %x, i32 4)
  ret void
}
;; Integer Gather/Scatter

declare <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dpi.512(i8*, i16, <16 x i32>, <16 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.dpq.512(<8 x i64>, i8*, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpq.512(i8*, i8, <8 x i32>, <8 x i64>, i32)

declare <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpi.512(i8*, i8, <8 x i64>, <8 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.qpq.512(<8 x i64>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpq.512(i8*, i8, <8 x i64>, <8 x i64>, i32)

; Same gather-then-scatter pattern as the FP tests above, but with integer
; elements (vpgather*/vpscatter*); the mask is again copied to %k2 for the
; gather so %k1 can be reused by the scatter.
define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dd:
; CHECK: kmovw %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> %src, i8* %base, <16 x i32> %ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpi.512(i8* %stbuf, i16 %mask, <16 x i32> %ind2, <16 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qd:
; CHECK: kmovb %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> %src, i8* %base, <8 x i64> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpi.512(i8* %stbuf, i8 %mask, <8 x i64> %ind2, <8 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qq:
; CHECK: kmovb %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512(<8 x i64> %src, i8* %base, <8 x i64> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpq.512(i8* %stbuf, i8 %mask, <8 x i64> %ind2, <8 x i64> %x, i32 4)
  ret void
}

define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dq:
; CHECK: kmovb %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
  %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512(<8 x i64> %src, i8* %base, <8 x i32> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpq.512(i8* %stbuf, i8 %mask, <8 x i32> %ind2, <8 x i64> %x, i32 4)
  ret void
}
; Execution-domain tests: the gathered FP data should stay in the FP domain,
; i.e. the result is moved with vmovapd/vmovaps rather than an integer move.
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_dpd_execdomain:
; CHECK: kmovb %edi, %k1
; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, (%rdx)
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, i8 %mask, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}

define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_qpd_execdomain:
; CHECK: kmovb %edi, %k1
; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, (%rdx)
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, i8 %mask, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}

define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_dps_execdomain:
; CHECK: kmovw %edi, %k1
; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_qps_execdomain:
; CHECK: kmovb %edi, %k1
; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
  %res = call <8 x float> @llvm.x86.avx512.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, i8 %mask, i32 4)
  ret <8 x float> %res
}
; Scatter counterparts of the execution-domain tests: the loaded FP source is
; expected to be moved with vmovapd/vmovaps before the FP scatter.
define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dpd_execdomain:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.scatter.dpd.512(i8* %stbuf, i8 %mask, <8 x i32> %ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qpd_execdomain:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.scatter.qpd.512(i8* %stbuf, i8 %mask, <8 x i64> %ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dps_execdomain:
; CHECK: kmovw %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
  %x = load <16 x float>, <16 x float>* %src, align 64
  call void @llvm.x86.avx512.scatter.dps.512(i8* %stbuf, i16 %mask, <16 x i32> %ind, <16 x float> %x, i32 4)
  ret void
}

define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qps_execdomain:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
  %x = load <8 x float>, <8 x float>* %src, align 32
  call void @llvm.x86.avx512.scatter.qps.512(i8* %stbuf, i8 %mask, <8 x i64> %ind, <8 x float> %x, i32 4)
  ret void
}
; With an all-ones mask (i8 -1) the mask register is expected to be
; materialized with kxnorw instead of being loaded from a GPR.
define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_qps:
; CHECK: kxnorw %k0, %k0, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, i8 -1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512(i8* %stbuf, i8 -1, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}
declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8*, i32, i32)
declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8*, i32, i32)

; Gather/scatter prefetch hints.  The last i32 operand selects the hint level
; (vgatherpf0/vgatherpf1, vscatterpf0/vscatterpf1); masks -1, 0, 1 and 120 are
; materialized via kxnorw, kxorw and movb+kmovb respectively.
define void @prefetch(<8 x i64> %ind, i8* %base) {
; CHECK-LABEL: prefetch:
; CHECK: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: kmovb %eax, %k1
; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT: movb $120, %al
; CHECK-NEXT: kmovb %eax, %k1
; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 1)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 1)
  ret void
}
declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)

; 128/256-bit gather3 tests with qword indices: one gather with the incoming
; mask plus one with an all-ones mask (kxnorw), results added.
define <2 x double> @test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
  %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32)

; Both calls are identical, so a single gather is expected and its result is
; added to itself.
define <4 x i32> @test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
  %res = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32)

define <4 x double> @test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
  %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32)

define <8 x i32> @test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
  %res = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32)

define <4 x float> @test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqps (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32)

define <4 x i32> @test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
  %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}
declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32)

; qword indices gathering 4 floats (ymm index, xmm result).
define <4 x float> @test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqps (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32)

; Same mask used for both gathers (different scales), so it is copied to %k2
; rather than re-materialized.
define <4 x i32> @test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
  %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}
declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32)

; gather3 tests with dword indices (siv variants), same masked + all-ones
; gather-and-add pattern as the div tests above.
define <2 x double> @test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
  %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32)

; Identical calls: a single gather added to itself is expected.
define <4 x i32> @test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
  %res = call <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32)

define <4 x double> @test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %ymm0 {%k1}
; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
  %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32)

; Identical calls: a single gather added to itself is expected.
define <8 x i32> @test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0
  %res = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32)

define <4 x float> @test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdps (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32)

define <4 x i32> @test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
  %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}
declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32)

; dword indices gathering 8 floats (ymm index and result).
define <8 x float> @test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32)

; Same mask used for both gathers (different scales), so it is copied to %k2
; rather than re-materialized.
define <8 x i32> @test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm2
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
  %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i32)

; 128/256-bit scatter tests with qword indices: one scatter with an all-ones
; mask and one with the incoming mask, at different scales.
define void @test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32)

define void @test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i32)

define void @test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32)

define void @test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i32)

define void @test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32)

define void @test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i32)

define void @test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32)

define void @test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
; CHECK: kmovb %esi, %k1
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}
677 declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i32)
679 define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
680 ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
682 ; CHECK-NEXT: kmovb %esi, %k1
683 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
684 ; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
685 ; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
687 call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 2)
688 call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4)
692 declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32)
694 define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
695 ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
697 ; CHECK-NEXT: kmovb %esi, %k1
698 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
699 ; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
700 ; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
702 call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 2)
703 call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4)
707 declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i32)
709 define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
710 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
712 ; CHECK-NEXT: kmovb %esi, %k1
713 ; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
714 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
715 ; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
717 call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
718 call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
722 declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32)

; <4 x i64> scatter (vpscatterdq with a ymm payload): all-ones mask (-1,
; kxnorw into %k2) at scale 2 first, then the caller's mask %x1 (%k1)
; at scale 4.
724 define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
725 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
727 ; CHECK-NEXT: kmovb %esi, %k1
728 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
729 ; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
730 ; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
732 call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
733 call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
737 declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i32)

; <4 x float> scatter (vscatterdps, xmm): variable mask %x1 first at
; scale 2, then the all-ones mask materialized with kxnorw (reusing %k1)
; at scale 4.
739 define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
740 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
742 ; CHECK-NEXT: kmovb %esi, %k1
743 ; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
744 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
745 ; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
747 call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 2)
748 call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4)
752 declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32)

; <4 x i32> scatter (vpscatterdd, xmm): caller mask %x1 at scale 2, then
; all-ones mask (kxnorw reusing %k1) at scale 4.
754 define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
755 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
757 ; CHECK-NEXT: kmovb %esi, %k1
758 ; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
759 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
760 ; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
762 call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 2)
763 call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4)
767 declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i32)

; <8 x float> scatter (vscatterdps, ymm payload and ymm indices): caller
; mask %x1 at scale 2, then all-ones mask (kxnorw reusing %k1) at scale 4.
769 define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
770 ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
772 ; CHECK-NEXT: kmovb %esi, %k1
773 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
774 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
775 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
777 call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
778 call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
782 declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32)

; <8 x i32> scatter (vpscatterdd, ymm): caller mask %x1 at scale 2, then
; all-ones mask (kxnorw reusing %k1) at scale 4.
784 define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
785 ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
787 ; CHECK-NEXT: kmovb %esi, %k1
788 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
789 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
790 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
792 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
793 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
; Exercises constant-mask lowering for the 8-lane dword scatter: -1 becomes
; kxnorw (all ones), 0 becomes kxorw (all zeros — note the scatter instruction
; is still emitted), and small immediates 1 and 96 are loaded with movb+kmovb.
797 define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
798 ; CHECK-LABEL: scatter_mask_test:
800 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
801 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
802 ; CHECK-NEXT: kxorw %k0, %k0, %k1
803 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
804 ; CHECK-NEXT: movb $1, %al
805 ; CHECK-NEXT: kmovb %eax, %k1
806 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
807 ; CHECK-NEXT: movb $96, %al
808 ; CHECK-NEXT: kmovb %eax, %k1
809 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
811 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
812 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
813 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
814 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4)
; Constant-mask lowering for the 512-bit dword gather: masks -1 (kxnorw),
; 0 (kxorw), 1 and 220 (movw+kmovw). Each masked gather merges into a copy
; of the pass-through %src (vmovaps into zmm2/zmm3/zmm4) except the last,
; which gathers into %zmm1 directly; the four results are summed so none
; of the gathers can be dead-code-eliminated.
818 define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
819 ; CHECK-LABEL: gather_mask_test:
821 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
822 ; CHECK-NEXT: vmovaps %zmm1, %zmm2
823 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
824 ; CHECK-NEXT: kxorw %k0, %k0, %k1
825 ; CHECK-NEXT: vmovaps %zmm1, %zmm3
826 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
827 ; CHECK-NEXT: movw $1, %ax
828 ; CHECK-NEXT: kmovw %eax, %k1
829 ; CHECK-NEXT: vmovaps %zmm1, %zmm4
830 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1}
831 ; CHECK-NEXT: movw $220, %ax
832 ; CHECK-NEXT: kmovw %eax, %k1
833 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
834 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
835 ; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1
836 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
838 %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
839 %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
840 %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4)
841 %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4)
843 %res4 = fadd <16 x float> %res, %res1
844 %res5 = fadd <16 x float> %res3, %res2
845 %res6 = fadd <16 x float> %res5, %res4
846 ret <16 x float> %res6