; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s
4 declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double>, <8 x i64>, i8, i32)
6 define <8 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
7 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_512:
9 ; CHECK-NEXT: kmovb %edi, %k1
10 ; CHECK-NEXT: vcvtpd2qq {ru-sae}, %zmm0, %zmm1 {%k1}
11 ; CHECK-NEXT: vcvtpd2qq {rn-sae}, %zmm0, %zmm0
12 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
14 %res = call <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double> %x0, <8 x i64> %x1, i8 %x2, i32 2)
15 %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double> %x0, <8 x i64> %x1, i8 -1, i32 0)
16 %res2 = add <8 x i64> %res, %res1
20 declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.512(<8 x double>, <8 x i64>, i8, i32)
22 define <8 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
23 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_512:
25 ; CHECK-NEXT: kmovb %edi, %k1
26 ; CHECK-NEXT: vcvtpd2uqq {ru-sae}, %zmm0, %zmm1 {%k1}
27 ; CHECK-NEXT: vcvtpd2uqq {rn-sae}, %zmm0, %zmm0
28 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
30 %res = call <8 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.512(<8 x double> %x0, <8 x i64> %x1, i8 %x2, i32 2)
31 %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.512(<8 x double> %x0, <8 x i64> %x1, i8 -1, i32 0)
32 %res2 = add <8 x i64> %res, %res1
36 declare <8 x i64> @llvm.x86.avx512.mask.cvtps2qq.512(<8 x float>, <8 x i64>, i8, i32)
38 define <8 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
39 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_512:
41 ; CHECK-NEXT: kmovb %edi, %k1
42 ; CHECK-NEXT: vcvtps2qq {ru-sae}, %ymm0, %zmm1 {%k1}
43 ; CHECK-NEXT: vcvtps2qq {rn-sae}, %ymm0, %zmm0
44 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
46 %res = call <8 x i64> @llvm.x86.avx512.mask.cvtps2qq.512(<8 x float> %x0, <8 x i64> %x1, i8 %x2, i32 2)
47 %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvtps2qq.512(<8 x float> %x0, <8 x i64> %x1, i8 -1, i32 0)
48 %res2 = add <8 x i64> %res, %res1
52 declare <8 x i64> @llvm.x86.avx512.mask.cvtps2uqq.512(<8 x float>, <8 x i64>, i8, i32)
54 define <8 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
55 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_512:
57 ; CHECK-NEXT: kmovb %edi, %k1
58 ; CHECK-NEXT: vcvtps2uqq {ru-sae}, %ymm0, %zmm1 {%k1}
59 ; CHECK-NEXT: vcvtps2uqq {rn-sae}, %ymm0, %zmm0
60 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
62 %res = call <8 x i64> @llvm.x86.avx512.mask.cvtps2uqq.512(<8 x float> %x0, <8 x i64> %x1, i8 %x2, i32 2)
63 %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvtps2uqq.512(<8 x float> %x0, <8 x i64> %x1, i8 -1, i32 0)
64 %res2 = add <8 x i64> %res, %res1
68 declare <8 x double> @llvm.x86.avx512.mask.cvtqq2pd.512(<8 x i64>, <8 x double>, i8, i32)
70 define <8 x double>@test_int_x86_avx512_mask_cvt_qq2pd_512(<8 x i64> %x0, <8 x double> %x1, i8 %x2) {
71 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_512:
73 ; CHECK-NEXT: kmovb %edi, %k1
74 ; CHECK-NEXT: vcvtqq2pd %zmm0, %zmm1 {%k1}
75 ; CHECK-NEXT: vcvtqq2pd {rn-sae}, %zmm0, %zmm0
76 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
78 %res = call <8 x double> @llvm.x86.avx512.mask.cvtqq2pd.512(<8 x i64> %x0, <8 x double> %x1, i8 %x2, i32 4)
79 %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtqq2pd.512(<8 x i64> %x0, <8 x double> %x1, i8 -1, i32 0)
80 %res2 = fadd <8 x double> %res, %res1
81 ret <8 x double> %res2
84 declare <8 x float> @llvm.x86.avx512.mask.cvtqq2ps.512(<8 x i64>, <8 x float>, i8, i32)
86 define <8 x float>@test_int_x86_avx512_mask_cvt_qq2ps_512(<8 x i64> %x0, <8 x float> %x1, i8 %x2) {
87 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_512:
89 ; CHECK-NEXT: kmovb %edi, %k1
90 ; CHECK-NEXT: vcvtqq2ps %zmm0, %ymm1 {%k1}
91 ; CHECK-NEXT: vcvtqq2ps {rn-sae}, %zmm0, %ymm0
92 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
94 %res = call <8 x float> @llvm.x86.avx512.mask.cvtqq2ps.512(<8 x i64> %x0, <8 x float> %x1, i8 %x2, i32 4)
95 %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtqq2ps.512(<8 x i64> %x0, <8 x float> %x1, i8 -1, i32 0)
96 %res2 = fadd <8 x float> %res, %res1
100 declare <8 x i64> @llvm.x86.avx512.mask.cvttpd2qq.512(<8 x double>, <8 x i64>, i8, i32)
102 define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
103 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_512:
105 ; CHECK-NEXT: kmovb %edi, %k1
106 ; CHECK-NEXT: vcvttpd2qq %zmm0, %zmm1 {%k1}
107 ; CHECK-NEXT: vcvttpd2qq {sae}, %zmm0, %zmm0
108 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
110 %res = call <8 x i64> @llvm.x86.avx512.mask.cvttpd2qq.512(<8 x double> %x0, <8 x i64> %x1, i8 %x2, i32 4)
111 %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvttpd2qq.512(<8 x double> %x0, <8 x i64> %x1, i8 -1, i32 8)
112 %res2 = add <8 x i64> %res, %res1
116 declare <8 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.512(<8 x double>, <8 x i64>, i8, i32)
118 define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
119 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_512:
121 ; CHECK-NEXT: kmovb %edi, %k1
122 ; CHECK-NEXT: vcvttpd2uqq %zmm0, %zmm1 {%k1}
123 ; CHECK-NEXT: vcvttpd2uqq {sae}, %zmm0, %zmm0
124 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
126 %res = call <8 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.512(<8 x double> %x0, <8 x i64> %x1, i8 %x2, i32 4)
127 %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.512(<8 x double> %x0, <8 x i64> %x1, i8 -1, i32 8)
128 %res2 = add <8 x i64> %res, %res1
132 declare <8 x i64> @llvm.x86.avx512.mask.cvttps2qq.512(<8 x float>, <8 x i64>, i8, i32)
134 define <8 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
135 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_512:
137 ; CHECK-NEXT: kmovb %edi, %k1
138 ; CHECK-NEXT: vcvttps2qq %ymm0, %zmm1 {%k1}
139 ; CHECK-NEXT: vcvttps2qq {sae}, %ymm0, %zmm0
140 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
142 %res = call <8 x i64> @llvm.x86.avx512.mask.cvttps2qq.512(<8 x float> %x0, <8 x i64> %x1, i8 %x2, i32 4)
143 %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvttps2qq.512(<8 x float> %x0, <8 x i64> %x1, i8 -1, i32 8)
144 %res2 = add <8 x i64> %res, %res1
148 declare <8 x i64> @llvm.x86.avx512.mask.cvttps2uqq.512(<8 x float>, <8 x i64>, i8, i32)
150 define <8 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
151 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_512:
153 ; CHECK-NEXT: kmovb %edi, %k1
154 ; CHECK-NEXT: vcvttps2uqq %ymm0, %zmm1 {%k1}
155 ; CHECK-NEXT: vcvttps2uqq {sae}, %ymm0, %zmm0
156 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
158 %res = call <8 x i64> @llvm.x86.avx512.mask.cvttps2uqq.512(<8 x float> %x0, <8 x i64> %x1, i8 %x2, i32 4)
159 %res1 = call <8 x i64> @llvm.x86.avx512.mask.cvttps2uqq.512(<8 x float> %x0, <8 x i64> %x1, i8 -1, i32 8)
160 %res2 = add <8 x i64> %res, %res1
164 declare <8 x double> @llvm.x86.avx512.mask.cvtuqq2pd.512(<8 x i64>, <8 x double>, i8, i32)
166 define <8 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_512(<8 x i64> %x0, <8 x double> %x1, i8 %x2) {
167 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_512:
169 ; CHECK-NEXT: kmovb %edi, %k1
170 ; CHECK-NEXT: vcvtuqq2pd %zmm0, %zmm1 {%k1}
171 ; CHECK-NEXT: vcvtuqq2pd {rn-sae}, %zmm0, %zmm0
172 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
174 %res = call <8 x double> @llvm.x86.avx512.mask.cvtuqq2pd.512(<8 x i64> %x0, <8 x double> %x1, i8 %x2, i32 4)
175 %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtuqq2pd.512(<8 x i64> %x0, <8 x double> %x1, i8 -1, i32 0)
176 %res2 = fadd <8 x double> %res, %res1
177 ret <8 x double> %res2
180 declare <8 x float> @llvm.x86.avx512.mask.cvtuqq2ps.512(<8 x i64>, <8 x float>, i8, i32)
182 define <8 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_512(<8 x i64> %x0, <8 x float> %x1, i8 %x2) {
183 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_512:
185 ; CHECK-NEXT: kmovb %edi, %k1
186 ; CHECK-NEXT: vcvtuqq2ps %zmm0, %ymm1 {%k1}
187 ; CHECK-NEXT: vcvtuqq2ps {rn-sae}, %zmm0, %ymm0
188 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
190 %res = call <8 x float> @llvm.x86.avx512.mask.cvtuqq2ps.512(<8 x i64> %x0, <8 x float> %x1, i8 %x2, i32 4)
191 %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtuqq2ps.512(<8 x i64> %x0, <8 x float> %x1, i8 -1, i32 0)
192 %res2 = fadd <8 x float> %res, %res1
193 ret <8 x float> %res2
196 declare <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
197 ; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_pd_512
200 ; CHECK: vreducepd {{.*}}{%k1}
203 define <8 x double>@test_int_x86_avx512_mask_reduce_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
204 %res = call <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double> %x0, i32 8, <8 x double> %x2, i8 %x3, i32 4)
205 %res1 = call <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double> %x0, i32 4, <8 x double> %x2, i8 -1, i32 8)
206 %res2 = fadd <8 x double> %res, %res1
207 ret <8 x double> %res2
210 declare <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
211 ; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_ps_512
218 define <16 x float>@test_int_x86_avx512_mask_reduce_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
219 %res = call <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float> %x0, i32 44, <16 x float> %x2, i16 %x3, i32 8)
220 %res1 = call <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 4)
221 %res2 = fadd <16 x float> %res, %res1
222 ret <16 x float> %res2
225 declare <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8, i32)
226 ; CHECK-LABEL: @test_int_x86_avx512_mask_range_pd_512
233 define <8 x double>@test_int_x86_avx512_mask_range_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
234 %res = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %x0, <8 x double> %x1, i32 8, <8 x double> %x3, i8 %x4, i32 4)
235 %res1 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %x0, <8 x double> %x1, i32 4, <8 x double> %x3, i8 -1, i32 8)
236 %res2 = fadd <8 x double> %res, %res1
237 ret <8 x double> %res2
240 declare <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16, i32)
242 ; CHECK-LABEL: @test_int_x86_avx512_mask_range_ps_512
249 define <16 x float>@test_int_x86_avx512_mask_range_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
250 %res = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %x0, <16 x float> %x1, i32 88, <16 x float> %x3, i16 %x4, i32 4)
251 %res1 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %x0, <16 x float> %x1, i32 4, <16 x float> %x3, i16 -1, i32 8)
252 %res2 = fadd <16 x float> %res, %res1
253 ret <16 x float> %res2
256 declare <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32, i32)
258 ; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_ss
265 define <4 x float>@test_int_x86_avx512_mask_reduce_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
266 %res = call <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4, i32 4)
267 %res1 = call <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 4, i32 8)
268 %res2 = fadd <4 x float> %res, %res1
269 ret <4 x float> %res2
272 declare <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32, i32)
273 ; CHECK-LABEL: @test_int_x86_avx512_mask_range_ss
281 define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
282 %res = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4, i32 8)
283 %res1 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 4, i32 8)
284 %res2 = fadd <4 x float> %res, %res1
285 ret <4 x float> %res2
288 declare <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32, i32)
290 ; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_sd
297 define <2 x double>@test_int_x86_avx512_mask_reduce_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
298 %res = call <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4, i32 4)
299 %res1 = call <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 4, i32 8)
300 %res2 = fadd <2 x double> %res, %res1
301 ret <2 x double> %res2
304 declare <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32, i32)
305 ; CHECK-LABEL: @test_int_x86_avx512_mask_range_sd
312 define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
313 %res = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4, i32 4)
314 %res1 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 4, i32 8)
315 %res2 = fadd <2 x double> %res, %res1
316 ret <2 x double> %res2
320 declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8)
322 define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) {
323 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
325 ; CHECK-NEXT: kmovb %edi, %k1
326 ; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1}
327 ; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm2 {%k1} {z}
328 ; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0
329 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
330 ; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
332 %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
333 %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3)
334 %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1)
335 %res3 = fadd <2 x double> %res, %res1
336 %res4 = fadd <2 x double> %res2, %res3
337 ret <2 x double> %res4
340 declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8)
342 define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) {
343 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8:
345 ; CHECK-NEXT: kmovb %edi, %k1
346 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
347 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 {%k1} {z}
348 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0
349 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
350 ; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
352 %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
353 %res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3)
354 %res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1)
355 %res3 = fadd <8 x float> %res, %res1
356 %res4 = fadd <8 x float> %res2, %res3
357 ret <8 x float> %res4
360 declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16)
362 define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) {
363 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512:
365 ; CHECK-NEXT: kmovw %edi, %k1
366 ; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
367 ; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
368 ; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
369 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
370 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
372 %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
373 %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
374 %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
375 %res3 = fadd <16 x float> %res, %res1
376 %res4 = fadd <16 x float> %res2, %res3
377 ret <16 x float> %res4
380 declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x double>, i32, <8 x double>, i8)
382 define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) {
383 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512:
385 ; CHECK-NEXT: kmovb %edi, %k1
386 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
387 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
388 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0
389 ; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
390 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
392 %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
393 %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
394 %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
395 %res3 = fadd <8 x double> %res, %res1
396 %res4 = fadd <8 x double> %res3, %res2
397 ret <8 x double> %res4
400 declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>, i32, <16 x i32>, i16)
402 define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) {
403 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512:
405 ; CHECK-NEXT: kmovw %edi, %k1
406 ; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
407 ; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
408 ; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
409 ; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
410 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
412 %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
413 %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
414 %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
415 %res3 = add <16 x i32> %res, %res1
416 %res4 = add <16 x i32> %res3, %res2
420 declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i32, <8 x i64>, i8)
422 define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) {
423 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512:
425 ; CHECK-NEXT: kmovb %edi, %k1
426 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
427 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
428 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0
429 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
430 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
432 %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
433 %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
434 %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
435 %res3 = add <8 x i64> %res, %res1
436 %res4 = add <8 x i64> %res2, %res3
440 declare i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double>, i32, i8)
442 ; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_pd_512
449 define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) {
450 %res = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 2, i8 %x1)
451 %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 4, i8 -1)
452 %res2 = add i8 %res, %res1
455 declare i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float>, i32, i16)
457 ; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_ps_512
464 define i16@test_int_x86_avx512_mask_fpclass_ps_512(<16 x float> %x0, i16 %x1) {
465 %res = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 %x1)
466 %res1 = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 -1)
467 %res2 = add i16 %res, %res1
471 declare i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double>, i32, i8)
473 ; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_sd
480 define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) {
481 %res = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 2, i8 %x1)
482 %res1 = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 4, i8 -1)
483 %res2 = add i8 %res, %res1
487 declare i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float>, i32, i8)
489 ; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_ss
498 define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) {
499 %res = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 %x1)
500 %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 -1)
501 %res2 = add i8 %res, %res1
505 declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float>, <16 x float>, i16)
507 define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0, <16 x float> %x2, i16 %x3) {
508 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x2_512:
510 ; CHECK-NEXT: kmovw %edi, %k1
511 ; CHECK-NEXT: vbroadcastf32x2 %xmm0, %zmm1 {%k1}
512 ; CHECK-NEXT: vbroadcastf32x2 %xmm0, %zmm2 {%k1} {z}
513 ; CHECK-NEXT: vbroadcastf32x2 %xmm0, %zmm0
514 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
515 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
517 %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3)
518 %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %x3)
519 %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
520 %res3 = fadd <16 x float> %res, %res1
521 %res4 = fadd <16 x float> %res3, %res2
522 ret <16 x float> %res4
525 declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32>, <16 x i32>, i16)
527 define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3) {
528 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_512:
530 ; CHECK-NEXT: kmovw %edi, %k1
531 ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %zmm1 {%k1}
532 ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %zmm2 {%k1} {z}
533 ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %zmm0
534 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
535 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
537 %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3)
538 %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %x3)
539 %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
540 %res3 = add <16 x i32> %res, %res1
541 %res4 = add <16 x i32> %res3, %res2
545 declare i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32>)
547 define i16@test_int_x86_avx512_cvtd2mask_512(<16 x i32> %x0) {
548 ; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_512:
550 ; CHECK-NEXT: vpmovd2m %zmm0, %k0
551 ; CHECK-NEXT: kmovw %k0, %eax
553 %res = call i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32> %x0)
557 declare i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64>)
559 define i8@test_int_x86_avx512_cvtq2mask_512(<8 x i64> %x0) {
560 ; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_512:
562 ; CHECK-NEXT: vpmovq2m %zmm0, %k0
563 ; CHECK-NEXT: kmovb %k0, %eax
565 %res = call i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64> %x0)
569 declare <16 x i32> @llvm.x86.avx512.cvtmask2d.512(i16)
571 define <16 x i32>@test_int_x86_avx512_cvtmask2d_512(i16 %x0) {
572 ; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_512:
574 ; CHECK-NEXT: kmovw %edi, %k0
575 ; CHECK-NEXT: vpmovm2d %k0, %zmm0
577 %res = call <16 x i32> @llvm.x86.avx512.cvtmask2d.512(i16 %x0)
581 declare <8 x i64> @llvm.x86.avx512.cvtmask2q.512(i8)
583 define <8 x i64>@test_int_x86_avx512_cvtmask2q_512(i8 %x0) {
584 ; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_512:
586 ; CHECK-NEXT: kmovb %edi, %k0
587 ; CHECK-NEXT: vpmovm2q %k0, %zmm0
589 %res = call <8 x i64> @llvm.x86.avx512.cvtmask2q.512(i8 %x0)
593 declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float>, <16 x float>, i16)
595 define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2, i16 %mask) {
596 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512:
597 ; CHECK: kmovw %edi, %k1
598 ; CHECK: vshuff32x4 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
599 ; CHECK: vshuff32x4 $68, %zmm0, %zmm0, %zmm1 {%k1}
600 ; CHECK: vshuff32x4 $68, %zmm0, %zmm0, %zmm0
601 ; CHECK: vaddps %zmm1, %zmm0, %zmm0
602 ; CHECK: vaddps %zmm0, %zmm2, %zmm0
604 %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1)
605 %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask)
606 %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> zeroinitializer, i16 %mask)
607 %res4 = fadd <16 x float> %res1, %res2
608 %res5 = fadd <16 x float> %res3, %res4
609 ret <16 x float> %res5
612 declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double>, <8 x double>, i8)
614 define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2, i8 %mask) {
615 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512:
616 ; CHECK: kmovb %edi, %k1
617 ; CHECK: vshuff64x2 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
618 ; CHECK: vshuff64x2 $0, %zmm0, %zmm0, %zmm1 {%k1}
619 ; CHECK: vshuff64x2 $0, %zmm0, %zmm0, %zmm0
620 ; CHECK: vaddpd %zmm1, %zmm0, %zmm0
621 ; CHECK: vaddpd %zmm0, %zmm2, %zmm0
623 %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1)
624 %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask)
625 %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> zeroinitializer, i8 %mask)
626 %res4 = fadd <8 x double> %res1, %res2
627 %res5 = fadd <8 x double> %res3, %res4
628 ret <8 x double> %res5
631 declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32>, <16 x i32>, i16)
633 define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) {
634 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512:
635 ; CHECK: kmovw %edi, %k1
636 ; CHECK: vshufi32x4 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
637 ; CHECK: vshufi32x4 $68, %zmm0, %zmm0, %zmm1 {%k1}
638 ; CHECK: vshufi32x4 $68, %zmm0, %zmm0, %zmm0
639 ; CHECK: vpaddd %zmm1, %zmm0, %zmm0
640 ; CHECK: vpaddd %zmm0, %zmm2, %zmm0
642 %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1)
643 %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask)
644 %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
645 %res4 = add <16 x i32> %res1, %res2
646 %res5 = add <16 x i32> %res3, %res4
650 declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64>, <8 x i64>, i8)
652 define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) {
653 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512:
654 ; CHECK: kmovb %edi, %k1
655 ; CHECK: vshufi64x2 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
656 ; CHECK: vshufi64x2 $0, %zmm0, %zmm0, %zmm1 {%k1}
657 ; CHECK: vshufi64x2 $0, %zmm0, %zmm0, %zmm0
658 ; CHECK: vpaddq %zmm1, %zmm0, %zmm0
659 ; CHECK: vpaddq %zmm0, %zmm2, %zmm0
661 %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1)
662 %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask)
663 %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask)
664 %res4 = add <8 x i64> %res1, %res2
665 %res5 = add <8 x i64> %res3, %res4