def int_x86_avx512_mask_load_pd_512 : GCCBuiltin<"__builtin_ia32_loadapd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
[IntrReadArgMem]>;
+
+ // Masked scalar move intrinsics, bound to the GCC builtins
+ // __builtin_ia32_movss_mask / __builtin_ia32_movsd_mask. Each takes three
+ // vectors of the result type plus an i8 and is declared IntrNoMem.
+ // NOTE(review): the IR tests pass the operands as (src1, src2, passthrough,
+ // mask) - confirm this matches the builtin's operand order.
+ def int_x86_avx512_mask_move_ss : GCCBuiltin<"__builtin_ia32_movss_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+ def int_x86_avx512_mask_move_sd : GCCBuiltin<"__builtin_ia32_movsd_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
+ [IntrNoMem]>;
}
// Conditional store ops
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
-multiclass avx512_move_scalar <string asm, RegisterClass RC,
- SDNode OpNode, ValueType vt,
- X86MemOperand x86memop, PatFrag mem_pat> {
- let hasSideEffects = 0 in {
- def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2),
- !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128X:$dst, (vt (OpNode VR128X:$src1,
- (scalar_to_vector RC:$src2))))],
- IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG;
- let Constraints = "$src1 = $dst" in
- def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst),
- (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3),
- !strconcat(asm,
- "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"),
- [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K;
- def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>,
- EVEX, VEX_LIG;
+// Scalar move instructions (opcode 0x10 for the register/load forms, 0x11 for
+// the store forms). Register classes, value type, memory operand and load
+// fragment are all taken from the X86VectorVTInfo argument "_".
+multiclass avx512_move_scalar <string asm, SDNode OpNode,
+ X86VectorVTInfo _> {
+ // Vector-register form, built through AVX512_maskable_scalar. The select
+ // patterns that follow this multiclass use its "rr_Intk" variant.
+ defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2),
+ asm, "$src2, $src1","$src1, $src2",
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2))),
+ IIC_SSE_MOV_S_RR>, EVEX_4V;
+ // Memory-source vector form: $src1 is tied to $dst and the loaded scalar is
+ // widened with scalar_to_vector.
+ // NOTE(review): $src1 is not in the ins list written here - presumably
+ // AVX512_maskable_3src_scalar prepends it; confirm.
+ let Constraints = "$src1 = $dst" , mayLoad = 1 in
+ defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _,
+ (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src),
+ asm,"$src","$src",
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src)))))>, EVEX;
+ // Forms marked isCodeGenOnly: they use the scalar FP register class (_.FRC)
+ // for the moved element and carry the selection patterns for plain
+ // OpNode / scalar loads.
+ let isCodeGenOnly = 1 in {
+ def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.FRC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
+ (scalar_to_vector _.FRC:$src2))))],
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
+ let mayLoad = 1 in
+ def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+ _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX;
+ }
 let mayStore = 1 in {
- def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
- EVEX, VEX_LIG;
- def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src),
- !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
- [], IIC_SSE_MOV_S_MR>,
- EVEX, VEX_LIG, EVEX_K;
+ // Plain scalar store, selected for a store of the scalar register.
+ def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
+ EVEX;
+ // Masked scalar store. Its pattern list is empty, so it is only reachable
+ // through explicit Pat<> definitions or the assembler.
+ def mrk: AVX512PI<0x11, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+ [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K;
 } // mayStore
- } //hasSideEffects = 0
}
-let ExeDomain = SSEPackedSingle in
-defm VMOVSSZ : avx512_move_scalar<"movss", FR32X, X86Movss, v4f32, f32mem,
- loadf32>, XS, EVEX_CD8<32, CD8VT1>;
+// Single-precision instantiation. The mnemonic string now carries the "v"
+// prefix, and VEX_LIG is applied here rather than on each record inside the
+// multiclass.
+defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
+ VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
-let ExeDomain = SSEPackedDouble in
-defm VMOVSDZ : avx512_move_scalar<"movsd", FR64X, X86Movsd, v2f64, f64mem,
- loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+// Double-precision instantiation; differs from the above by XD, VEX_W and the
+// 64-bit EVEX_CD8 element size.
+defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
+ VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+// Scalar X86select is selected as the masked vector move. The operand list is
+// (copy of $src2 into VR128X, mask, IMPLICIT_DEF vector, copy of $src1 into
+// VR128X), and the result is copied back to the scalar register class.
def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
- (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
- VK1WM:$mask, (f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>;
+ (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
+ VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
- (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
- VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
+ (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
+ VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
(VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,\r
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0),
X86ISD::MOVDDUP, 0),
X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK,
X86ISD::MOVDDUP, 0),
+ // Masked scalar moves, handled as INTR_TYPE_SCALAR_MASK.
+ // NOTE(review): the neighbouring rows appear to be ordered alphabetically by
+ // intrinsic name; keep these two in that order.
+ X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK,
+ X86ISD::MOVSD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK,
+ X86ISD::MOVSS, 0),
X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK,
X86ISD::MOVSHDUP, 0),
X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK,
}
declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
+declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
+
+; Variable third operand (%x2) and variable mask (%x3): the merge-masked
+; register form writing %xmm2 under {%k1} is expected.
+define <4 x float>@test_int_x86_avx512_mask_move_ss_rrk(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrk:
+; CHECK: vmovss %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ ret <4 x float> %res
+}
+
+; zeroinitializer as the third operand with a variable mask: the zero-masked
+; form ({%k1} {z}) is expected.
+define <4 x float>@test_int_x86_avx512_mask_move_ss_rrkz(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrkz:
+; CHECK: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x2)
+ ret <4 x float> %res
+}
+
+; Constant all-ones mask (i8 -1): the unmasked register form is expected.
+; The %x2 parameter is not used by the body.
+define <4 x float>@test_int_x86_avx512_mask_move_ss_rr(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rr:
+; CHECK: vmovss %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 -1)
+ ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
+; Constant all-ones mask (i8 -1): the unmasked register form is expected.
+; The %x2 parameter is not used by the body.
+define <2 x double>@test_int_x86_avx512_mask_move_sd_rr(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rr:
+; CHECK: vmovsd %xmm1, %xmm0, %xmm0
+ %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 -1)
+ ret <2 x double> %res
+}
+
+; zeroinitializer as the third operand with a variable mask: the zero-masked
+; form ({%k1} {z}) is expected.
+define <2 x double>@test_int_x86_avx512_mask_move_sd_rrkz(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrkz:
+; CHECK: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
+ %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 %x2)
+ ret <2 x double> %res
+}
+
+; Variable third operand (%x2) and variable mask (%x3): the merge-masked
+; register form writing %xmm2 under {%k1} is expected.
+define <2 x double>@test_int_x86_avx512_mask_move_sd_rrk(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrk:
+; CHECK: vmovsd %xmm1, %xmm0, %xmm2 {%k1}
+ %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ ret <2 x double> %res
+}
+
// CHECK: vucomiss -516(%rdx), %xmm22
// CHECK: encoding: [0x62,0xe1,0x7c,0x08,0x2e,0xb2,0xfc,0xfd,0xff,0xff]
vucomiss -516(%rdx), %xmm22
+// Masked and zero-masked vmovss/vmovsd in AT&T syntax, covering memory-source
+// and three-register forms. Every expected encoding starts with the EVEX
+// escape byte 0x62; the zero-masked cases have bit 7 set in the fourth byte.
+// CHECK: vmovsd (%rcx), %xmm25 {%k3}
+// CHECK: encoding: [0x62,0x61,0xff,0x0b,0x10,0x09]
+ vmovsd (%rcx), %xmm25 {%k3}
+
+// CHECK: vmovsd (%rcx), %xmm25 {%k3} {z}
+// CHECK: encoding: [0x62,0x61,0xff,0x8b,0x10,0x09]
+ vmovsd (%rcx), %xmm25 {%k3} {z}
+
+// CHECK: vmovsd %xmm19, %xmm3, %xmm27 {%k3} {z}
+// CHECK: encoding: [0x62,0x21,0xe7,0x8b,0x10,0xdb]
+ vmovsd %xmm19, %xmm3, %xmm27 {%k3} {z}
+
+// CHECK: vmovss (%rcx), %xmm2 {%k4}
+// CHECK: encoding: [0x62,0xf1,0x7e,0x0c,0x10,0x11]
+ vmovss (%rcx), %xmm2 {%k4}
+
+// CHECK: vmovss (%rcx), %xmm2 {%k4} {z}
+// CHECK: encoding: [0x62,0xf1,0x7e,0x8c,0x10,0x11]
+ vmovss (%rcx), %xmm2 {%k4} {z}
+
+// CHECK: vmovss %xmm26, %xmm9, %xmm28 {%k4} {z}
+// CHECK: encoding: [0x62,0x01,0x36,0x8c,0x10,0xe2]
+ vmovss %xmm26, %xmm9, %xmm28 {%k4} {z}
+
+// The remaining cases mix low (xmm0-15) and high (xmm16-31) registers in each
+// operand position of the zero-masked three-register form.
+// CHECK: vmovsd %xmm15, %xmm22, %xmm21 {%k7} {z}
+// CHECK: encoding: [0x62,0xc1,0xcf,0x87,0x10,0xef]
+ vmovsd %xmm15, %xmm22, %xmm21 {%k7} {z}
+
+// CHECK: vmovsd %xmm8, %xmm13, %xmm3 {%k5} {z}
+// CHECK: encoding: [0x62,0xd1,0x97,0x8d,0x10,0xd8]
+ vmovsd %xmm8, %xmm13, %xmm3 {%k5} {z}
+
+// CHECK: vmovss %xmm2, %xmm27, %xmm17 {%k2} {z}
+// CHECK: encoding: [0x62,0xe1,0x26,0x82,0x10,0xca]
+ vmovss %xmm2, %xmm27, %xmm17 {%k2} {z}
+
+// CHECK: vmovss %xmm23, %xmm19, %xmm10 {%k3} {z}
+// CHECK: encoding: [0x62,0x31,0x66,0x83,0x10,0xd7]
+ vmovss %xmm23, %xmm19, %xmm10 {%k3} {z}
+
+// CHECK: vmovsd %xmm4, %xmm15, %xmm4 {%k6} {z}
+// CHECK: encoding: [0x62,0xf1,0x87,0x8e,0x10,0xe4]
+ vmovsd %xmm4, %xmm15, %xmm4 {%k6} {z}
+
+// CHECK: vmovsd %xmm14, %xmm2, %xmm20 {%k7} {z}
+// CHECK: encoding: [0x62,0xc1,0xef,0x8f,0x10,0xe6]
+ vmovsd %xmm14, %xmm2, %xmm20 {%k7} {z}
+
+// CHECK: vmovss %xmm19, %xmm11, %xmm21 {%k3} {z}
+// CHECK: encoding: [0x62,0xa1,0x26,0x8b,0x10,0xeb]
+ vmovss %xmm19, %xmm11, %xmm21 {%k3} {z}
+
+// CHECK: vmovss %xmm24, %xmm27, %xmm15 {%k2} {z}
+// CHECK: encoding: [0x62,0x11,0x26,0x82,0x10,0xf8]
+ vmovss %xmm24, %xmm27, %xmm15 {%k2} {z}
// CHECK: vcomiss xmm16, dword ptr [rcx]
// CHECK: encoding: [0x62,0xe1,0x7c,0x08,0x2f,0x01]
vcomiss xmm16, DWORD PTR [rcx]
+
+// Masked store: the expected encoding uses the EVEX escape byte 0x62.
+// CHECK: vmovss dword ptr [rcx] {k2}, xmm13
+// CHECK: encoding: [0x62,0x71,0x7e,0x0a,0x11,0x29]
+ vmovss dword ptr [rcx]{k2},xmm13
+
+// Unmasked stores from xmm13 / xmm5: the expected encodings start with the
+// VEX prefix bytes 0xc4 / 0xc5, and every displacement in this group is
+// emitted as four bytes.
+// CHECK: vmovss dword ptr [rax + 8*r14 + 4660], xmm13
+// CHECK: encoding: [0xc4,0x21,0x7a,0x11,0xac,0xf0,0x34,0x12,0x00,0x00]
+ vmovss dword ptr [rax+r14*8+0x1234],xmm13
+
+// CHECK: vmovss dword ptr [rdx + 508], xmm13
+// CHECK: encoding: [0xc5,0x7a,0x11,0xaa,0xfc,0x01,0x00,0x00]
+ vmovss dword ptr [rdx+0x1fc],xmm13
+
+// CHECK: vmovss dword ptr [rdx + 512], xmm13
+// CHECK: encoding: [0xc5,0x7a,0x11,0xaa,0x00,0x02,0x00,0x00]
+ vmovss dword ptr [rdx+0x200],xmm13
+
+// CHECK: vmovss dword ptr [rdx - 512], xmm13
+// CHECK: encoding: [0xc5,0x7a,0x11,0xaa,0x00,0xfe,0xff,0xff]
+ vmovss dword ptr [rdx-0x200],xmm13
+
+// CHECK: vmovss dword ptr [rdx - 516], xmm13
+// CHECK: encoding: [0xc5,0x7a,0x11,0xaa,0xfc,0xfd,0xff,0xff]
+ vmovss dword ptr [rdx-0x204],xmm13
+
+// CHECK: vmovss dword ptr [rdx + 508], xmm5
+// CHECK: encoding: [0xc5,0xfa,0x11,0xaa,0xfc,0x01,0x00,0x00]
+ vmovss dword ptr [rdx+0x1fc],xmm5
+
+// CHECK: vmovss dword ptr [rdx + 512], xmm5
+// CHECK: encoding: [0xc5,0xfa,0x11,0xaa,0x00,0x02,0x00,0x00]
+ vmovss dword ptr [rdx+0x200],xmm5
+
+// CHECK: vmovss dword ptr [rdx - 512], xmm5
+// CHECK: encoding: [0xc5,0xfa,0x11,0xaa,0x00,0xfe,0xff,0xff]
+ vmovss dword ptr [rdx-0x200], xmm5
+
+// CHECK: vmovss dword ptr [rdx - 516], xmm5
+// CHECK: encoding: [0xc5,0xfa,0x11,0xaa,0xfc,0xfd,0xff,0xff]
+ vmovss dword ptr [rdx-0x204],xmm5
+
+// CHECK: vmovss dword ptr [rcx], xmm13
+// CHECK: encoding: [0xc5,0x7a,0x11,0x29]
+ vmovss dword ptr [rcx],xmm13
+
+// Loads into xmm2: the unmasked form is expected as VEX (0xc5), the masked
+// and zero-masked forms as EVEX (0x62).
+// CHECK: vmovss xmm2, dword ptr [rcx]
+// CHECK: encoding: [0xc5,0xfa,0x10,0x11]
+ vmovss xmm2, dword ptr [rcx]
+
+// CHECK: vmovss xmm2 {k4}, dword ptr [rcx]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x0c,0x10,0x11]
+ vmovss xmm2{k4}, dword ptr [rcx]
+
+// CHECK: vmovss xmm2 {k4} {z}, dword ptr [rcx]
+// CHECK: encoding: [0x62,0xf1,0x7e,0x8c,0x10,0x11]
+ vmovss xmm2{k4} {z}, dword ptr [rcx]
+
+// Loads into xmm25: all expected encodings use the EVEX escape byte 0x62,
+// including the unmasked ones.
+// NOTE(review): the expected text has a space before the comma after xmm25 in
+// the unmasked cases - presumably this mirrors the printer output; confirm
+// before normalising it.
+// CHECK: vmovsd xmm25 , qword ptr [rcx]
+// CHECK: encoding: [0x62,0x61,0xff,0x08,0x10,0x09]
+ vmovsd xmm25, qword ptr [rcx]
+
+// CHECK: vmovsd xmm25 {k3}, qword ptr [rcx]
+// CHECK: encoding: [0x62,0x61,0xff,0x0b,0x10,0x09]
+ vmovsd xmm25{k3}, qword ptr [rcx]
+
+// CHECK: vmovsd xmm25 {k3} {z}, qword ptr [rcx]
+// CHECK: encoding: [0x62,0x61,0xff,0x8b,0x10,0x09]
+ vmovsd xmm25{k3} {z}, qword ptr [rcx]
+
+// CHECK: vmovsd xmm25 , qword ptr [rax + 8*r14 + 291]
+// CHECK: encoding: [0x62,0x21,0xff,0x08,0x10,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vmovsd xmm25, qword ptr [rax+r14*8+0x123]
+
+// Displacement boundary cases: 1016 and -1024 are expected as a single
+// displacement byte (0x7f / 0x80, i.e. the displacement divided by 8), while
+// 1024 and -1032 are expected as four bytes.
+// CHECK: vmovsd xmm25 , qword ptr [rdx + 1016]
+// CHECK: encoding: [0x62,0x61,0xff,0x08,0x10,0x4a,0x7f]
+ vmovsd xmm25, qword ptr [rdx+0x3f8]
+
+// CHECK: vmovsd xmm25 , qword ptr [rdx + 1024]
+// CHECK: encoding: [0x62,0x61,0xff,0x08,0x10,0x8a,0x00,0x04,0x00,0x00]
+ vmovsd xmm25, qword ptr [rdx+0x400]
+
+// CHECK: vmovsd xmm25 , qword ptr [rdx - 1024]
+// CHECK: encoding: [0x62,0x61,0xff,0x08,0x10,0x4a,0x80]
+ vmovsd xmm25, qword ptr [rdx-0x400]
+
+// CHECK: vmovsd xmm25 , qword ptr [rdx - 1032]
+// CHECK: encoding: [0x62,0x61,0xff,0x08,0x10,0x8a,0xf8,0xfb,0xff,0xff]
+ vmovsd xmm25, qword ptr [rdx-0x408]