Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtrndss">,
- Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_avx512_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtrndsd">,
- Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
+ // Masked scalar square-root intrinsics (single and double precision).
+ // Each takes three vectors of the result type followed by an i8 and an i32,
+ // and is declared IntrNoMem.
+ // NOTE(review): the codegen tests in this patch pass the operands as
+ // (src1, src2, pass-through, mask, rounding mode) - confirm against the
+ // builtin's header definition.
+ def int_x86_avx512_mask_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtrndss_mask">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtrndsd_mask">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_sqrt_pd_128 : GCCBuiltin<"__builtin_ia32_sqrtpd128_mask">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
}
}
-multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
- Intrinsic F32Int, Intrinsic F64Int,
- OpndItins itins_s, OpndItins itins_d> {
- def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst),
- (ins FR32X:$src1, FR32X:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [], itins_s.rr>, XS, EVEX_4V;
- let isCodeGenOnly = 1 in
- def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
- (ins VR128X:$src1, VR128X:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128X:$dst,
- (F32Int VR128X:$src1, VR128X:$src2))],
- itins_s.rr>, XS, EVEX_4V;
- let mayLoad = 1 in {
- def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst),
- (ins FR32X:$src1, f32mem:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
- let isCodeGenOnly = 1 in
- def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
- (ins VR128X:$src1, ssmem:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128X:$dst,
- (F32Int VR128X:$src1, sse_load_f32:$src2))],
- itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
- }
- def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst),
- (ins FR64X:$src1, FR64X:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- XD, EVEX_4V, VEX_W;
- let isCodeGenOnly = 1 in
- def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
- (ins VR128X:$src1, VR128X:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128X:$dst,
- (F64Int VR128X:$src1, VR128X:$src2))],
- itins_s.rr>, XD, EVEX_4V, VEX_W;
- let mayLoad = 1 in {
- def SDZm : SI<opc, MRMSrcMem, (outs FR64X:$dst),
- (ins FR64X:$src1, f64mem:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
- let isCodeGenOnly = 1 in
- def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
- (ins VR128X:$src1, sdmem:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128X:$dst,
- (F64Int VR128X:$src1, sse_load_f64:$src2))]>,
- XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
- }
-}
-
multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
}
+// Scalar square-root instruction family, instantiated once per element type.
+//   opc       - opcode byte shared by every form.
+//   OpcodeStr - mnemonic used by every form.
+//   _         - X86VectorVTInfo supplying RC/FRC, VT/EltVT, ScalarLdFrag and
+//               the memory operand classes.
+//   SUFF      - spliced into NAME#SUFF#Zr / NAME#SUFF#Zm by the trailing Pats.
+//   OpNode    - node matched on the scalar (FRC) register class.
+//   OpNodeRnd - node taking (src1, src2, i32 rounding mode), matched on the
+//               vector register class by the maskable forms.
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                              string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
+
+  // Register source, rounding mode FROUND_CURRENT.
+  defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                         "$src2, $src1", "$src1, $src2",
+                         (OpNodeRnd (_.VT _.RC:$src1),
+                                    (_.VT _.RC:$src2),
+                                    (i32 FROUND_CURRENT))>;
+  // Memory source. Only one element is loaded (ScalarLdFrag widened through
+  // scalar_to_vector), so the operand is the scalar memory class - the same
+  // one the isCodeGenOnly `m` form below uses - not the full-vector _.MemOp.
+  let mayLoad = 1 in
+  defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                         "$src2, $src1", "$src1, $src2",
+                         (OpNodeRnd (_.VT _.RC:$src1),
+                                    (_.VT (scalar_to_vector
+                                              (_.ScalarLdFrag addr:$src2))),
+                                    (i32 FROUND_CURRENT))>;
+
+  // Register source with an explicit rounding-control operand ($rc), which is
+  // forwarded to OpNodeRnd as an immediate; tagged EVEX_B and EVEX_RC.
+  defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
+                         "$rc, $src2, $src1", "$src1, $src2, $rc",
+                         (OpNodeRnd (_.VT _.RC:$src1),
+                                    (_.VT _.RC:$src2),
+                                    (i32 imm:$rc))>,
+                         EVEX_B, EVEX_RC;
+
+  // Forms on the scalar FP register class. They carry no patterns of their
+  // own and are marked isCodeGenOnly.
+  let isCodeGenOnly = 1 in {
+    def r : SI<opc, MRMSrcReg, (outs _.FRC:$dst),
+               (ins _.FRC:$src1, _.FRC:$src2),
+               OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+
+    let mayLoad = 1 in
+    def m : SI<opc, MRMSrcMem, (outs _.FRC:$dst),
+               (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+               OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+  }
+
+  // Select OpNode on a scalar register; the first source is IMPLICIT_DEF.
+  // NOTE(review): NAME#SUFF#Zr / NAME#SUFF#Zm presumably resolve to the `r`
+  // and `m` defs above once instantiated through avx512_sqrt_scalar_all -
+  // confirm against the generated record names.
+  def : Pat<(_.EltVT (OpNode _.FRC:$src)),
+            (!cast<Instruction>(NAME#SUFF#Zr)
+                (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
+
+  // The load is folded into the instruction only under OptForSize.
+  def : Pat<(_.EltVT (OpNode (load addr:$src))),
+            (!cast<Instruction>(NAME#SUFF#Zm)
+                (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[OptForSize]>;
+}
+
+// Instantiates avx512_sqrt_scalar for both scalar element types:
+//   SSZ - f32x_info, mnemonic suffix "ss", XS prefix, EVEX_CD8<32, CD8VT1>.
+//   SDZ - f64x_info, mnemonic suffix "sd", XD prefix, VEX_W,
+//         EVEX_CD8<64, CD8VT1>.
+// Both use fsqrt as OpNode and X86fsqrtRnds as OpNodeRnd, and are EVEX_4V.
+multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
+ defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
+ X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
+ defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
+ X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
+}
+
defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>;
-defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt",
- int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd,
- SSE_SQRTSS, SSE_SQRTSD>;
+// Scalar forms (ss/sd) of vsqrt at opcode 0x51, marked VEX_LIG.
+defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
let Predicates = [HasAVX512] in {
- def : Pat<(f32 (fsqrt FR32X:$src)),
- (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
- def : Pat<(f32 (fsqrt (load addr:$src))),
- (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[OptForSize]>;
- def : Pat<(f64 (fsqrt FR64X:$src)),
- (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>;
- def : Pat<(f64 (fsqrt (load addr:$src))),
- (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[OptForSize]>;
-
def : Pat<(f32 (X86frsqrt FR32X:$src)),
(VRSQRT14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
def : Pat<(f32 (X86frsqrt (load addr:$src))),
def : Pat<(f32 (X86frcp (load addr:$src))),
(VRCP14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>,
Requires<[OptForSize]>;
-
- def : Pat<(int_x86_sse_sqrt_ss VR128X:$src),
- (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS VR128X:$src, FR32)),
- VR128X)>;
- def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
- (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
-
- def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src),
- (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS VR128X:$src, FR64)),
- VR128X)>;
- def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
- (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
}
multiclass
def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
-def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
-def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
-def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
-def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
+def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
+def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
+def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
+def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
+// Same X86ISD::FSQRT_RND opcode as X86fsqrtRnd, but typed with the
+// STDFp2SrcRm profile (as X86fgetexpRnds is for FGETEXP_RND).
+def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", STDFp2SrcRm>;
def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", STDFp2SrcRm>;
X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT,
X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSQRT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSQRT_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
}
declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: vsqrtss {{.*}}encoding: [0x62
- %res = call <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+; Calls the masked scalar single-precision sqrt intrinsic four times with
+; different pass-through / mask / rounding operands and returns the sum of the
+; four results.
+define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_sqrt_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+; Last operand vs. expected output above: 4 -> no rounding annotation,
+; 1 -> {rd-sae}, 2 -> {ru-sae}, 3 -> {rz-sae}.
+; A zeroinitializer third operand with %mask is expected as {%k1} {z};
+; a mask of -1 is expected with no mask register at all.
+ %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2)
+ %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3)
+
+ %res.1 = fadd <4 x float> %res0, %res1
+ %res.2 = fadd <4 x float> %res2, %res3
+ %res = fadd <4 x float> %res.1, %res.2
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float>, <4 x float>) nounwind readnone
-define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: vsqrtsd {{.*}}encoding: [0x62
- %res = call <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+; Calls the masked scalar double-precision sqrt intrinsic four times with
+; different pass-through / mask / rounding operands and returns the sum of the
+; four results.
+define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_sqrt_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+; Last operand vs. expected output above: 4 -> no rounding annotation,
+; 1 -> {rd-sae}, 2 -> {ru-sae}, 3 -> {rz-sae}.
+; A zeroinitializer third operand with %mask is expected as {%k1} {z};
+; a mask of -1 is expected with no mask register at all.
+ %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2)
+ %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3)
+
+ %res.1 = fadd <2 x double> %res0, %res1
+ %res.2 = fadd <2 x double> %res2, %res3
+ %res = fadd <2 x double> %res.1, %res.2
ret <2 x double> %res
}
-declare <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double>, <2 x double>) nounwind readnone
define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
; CHECK: vcvtsd2si {{.*}}encoding: [0x62
}
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
-
define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
; CHECK: vcvtss2si {{.*}}encoding: [0x62
%res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
// CHECK: encoding: [0x62,0xf2,0xc5,0x08,0x43,0x92,0xf8,0xfb,0xff,0xff]
vgetexpsd -1032(%rdx), %xmm7, %xmm2
+// vsqrtss encodings: register forms (plain, {%k1}, {%k1} {z}), the four
+// static rounding forms, then memory forms. The displacements 508 and -512
+// are expected as a single byte (0x7f / 0x80), while 512 and -516 are
+// expected as four bytes.
+// CHECK: vsqrtss %xmm8, %xmm19, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x66,0x00,0x51,0xf0]
+ vsqrtss %xmm8, %xmm19, %xmm22
+
+// CHECK: vsqrtss %xmm8, %xmm19, %xmm22 {%k1}
+// CHECK: encoding: [0x62,0xc1,0x66,0x01,0x51,0xf0]
+ vsqrtss %xmm8, %xmm19, %xmm22 {%k1}
+
+// CHECK: vsqrtss %xmm8, %xmm19, %xmm22 {%k1} {z}
+// CHECK: encoding: [0x62,0xc1,0x66,0x81,0x51,0xf0]
+ vsqrtss %xmm8, %xmm19, %xmm22 {%k1} {z}
+
+// CHECK: vsqrtss {rn-sae}, %xmm8, %xmm19, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x66,0x10,0x51,0xf0]
+ vsqrtss {rn-sae}, %xmm8, %xmm19, %xmm22
+
+// CHECK: vsqrtss {ru-sae}, %xmm8, %xmm19, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x66,0x50,0x51,0xf0]
+ vsqrtss {ru-sae}, %xmm8, %xmm19, %xmm22
+
+// CHECK: vsqrtss {rd-sae}, %xmm8, %xmm19, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x66,0x30,0x51,0xf0]
+ vsqrtss {rd-sae}, %xmm8, %xmm19, %xmm22
+
+// CHECK: vsqrtss {rz-sae}, %xmm8, %xmm19, %xmm22
+// CHECK: encoding: [0x62,0xc1,0x66,0x70,0x51,0xf0]
+ vsqrtss {rz-sae}, %xmm8, %xmm19, %xmm22
+
+// CHECK: vsqrtss (%rcx), %xmm19, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x66,0x00,0x51,0x31]
+ vsqrtss (%rcx), %xmm19, %xmm22
+
+// CHECK: vsqrtss 291(%rax,%r14,8), %xmm19, %xmm22
+// CHECK: encoding: [0x62,0xa1,0x66,0x00,0x51,0xb4,0xf0,0x23,0x01,0x00,0x00]
+ vsqrtss 291(%rax,%r14,8), %xmm19, %xmm22
+
+// CHECK: vsqrtss 508(%rdx), %xmm19, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x66,0x00,0x51,0x72,0x7f]
+ vsqrtss 508(%rdx), %xmm19, %xmm22
+
+// CHECK: vsqrtss 512(%rdx), %xmm19, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x66,0x00,0x51,0xb2,0x00,0x02,0x00,0x00]
+ vsqrtss 512(%rdx), %xmm19, %xmm22
+
+// CHECK: vsqrtss -512(%rdx), %xmm19, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x66,0x00,0x51,0x72,0x80]
+ vsqrtss -512(%rdx), %xmm19, %xmm22
+
+// CHECK: vsqrtss -516(%rdx), %xmm19, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x66,0x00,0x51,0xb2,0xfc,0xfd,0xff,0xff]
+ vsqrtss -516(%rdx), %xmm19, %xmm22
+
+// vsqrtsd encodings: register forms (plain, {%k7}, {%k7} {z}), the four
+// static rounding forms, then memory forms. The displacements 1016 and -1024
+// are expected as a single byte (0x7f / 0x80), while 1024 and -1032 are
+// expected as four bytes.
+// CHECK: vsqrtsd %xmm12, %xmm2, %xmm26
+// CHECK: encoding: [0x62,0x41,0xef,0x08,0x51,0xd4]
+ vsqrtsd %xmm12, %xmm2, %xmm26
+
+// CHECK: vsqrtsd %xmm12, %xmm2, %xmm6 {%k7}
+// CHECK: encoding: [0x62,0xd1,0xef,0x0f,0x51,0xf4]
+ vsqrtsd %xmm12, %xmm2, %xmm6 {%k7}
+
+// CHECK: vsqrtsd %xmm12, %xmm2, %xmm6 {%k7} {z}
+// CHECK: encoding: [0x62,0xd1,0xef,0x8f,0x51,0xf4]
+ vsqrtsd %xmm12, %xmm2, %xmm6 {%k7} {z}
+
+// CHECK: vsqrtsd {rn-sae}, %xmm12, %xmm2, %xmm6
+// CHECK: encoding: [0x62,0xd1,0xef,0x18,0x51,0xf4]
+ vsqrtsd {rn-sae}, %xmm12, %xmm2, %xmm6
+
+// CHECK: vsqrtsd {ru-sae}, %xmm12, %xmm2, %xmm6
+// CHECK: encoding: [0x62,0xd1,0xef,0x58,0x51,0xf4]
+ vsqrtsd {ru-sae}, %xmm12, %xmm2, %xmm6
+
+// CHECK: vsqrtsd {rd-sae}, %xmm12, %xmm2, %xmm6
+// CHECK: encoding: [0x62,0xd1,0xef,0x38,0x51,0xf4]
+ vsqrtsd {rd-sae}, %xmm12, %xmm2, %xmm6
+
+// CHECK: vsqrtsd {rz-sae}, %xmm12, %xmm2, %xmm6
+// CHECK: encoding: [0x62,0xd1,0xef,0x78,0x51,0xf4]
+ vsqrtsd {rz-sae}, %xmm12, %xmm2, %xmm6
+
+// CHECK: vsqrtsd (%rcx), %xmm2, %xmm26
+// CHECK: encoding: [0x62,0x61,0xef,0x08,0x51,0x11]
+ vsqrtsd (%rcx), %xmm2, %xmm26
+
+// CHECK: vsqrtsd 291(%rax,%r14,8), %xmm2, %xmm26
+// CHECK: encoding: [0x62,0x21,0xef,0x08,0x51,0x94,0xf0,0x23,0x01,0x00,0x00]
+ vsqrtsd 291(%rax,%r14,8), %xmm2, %xmm26
+
+// CHECK: vsqrtsd 1016(%rdx), %xmm2, %xmm26
+// CHECK: encoding: [0x62,0x61,0xef,0x08,0x51,0x52,0x7f]
+ vsqrtsd 1016(%rdx), %xmm2, %xmm26
+
+// CHECK: vsqrtsd 1024(%rdx), %xmm2, %xmm26
+// CHECK: encoding: [0x62,0x61,0xef,0x08,0x51,0x92,0x00,0x04,0x00,0x00]
+ vsqrtsd 1024(%rdx), %xmm2, %xmm26
+
+// CHECK: vsqrtsd -1024(%rdx), %xmm2, %xmm26
+// CHECK: encoding: [0x62,0x61,0xef,0x08,0x51,0x52,0x80]
+ vsqrtsd -1024(%rdx), %xmm2, %xmm26
+
+// CHECK: vsqrtsd -1032(%rdx), %xmm2, %xmm26
+// CHECK: encoding: [0x62,0x61,0xef,0x08,0x51,0x92,0xf8,0xfb,0xff,0xff]
+ vsqrtsd -1032(%rdx), %xmm2, %xmm26
+
// CHECK: vinsertf32x4 $171, %xmm3, %zmm26, %zmm11
// CHECK: encoding: [0x62,0x73,0x2d,0x40,0x18,0xdb,0xab]
vinsertf32x4 $0xab, %xmm3, %zmm26, %zmm11