-//====- X86InstrSSE.td - Describe the X86 Instruction Set -------*- C++ -*-===//
+//====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
-// This file was developed by Evan Cheng and is distributed under the University
-// of Illinois Open Source License. See LICENSE.TXT for details.
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// SSE 'Special' Instructions
//===----------------------------------------------------------------------===//
+let isImplicitDef = 1 in {
def IMPLICIT_DEF_VR128 : I<0, Pseudo, (outs VR128:$dst), (ins),
"#IMPLICIT_DEF $dst",
[(set VR128:$dst, (v4f32 (undef)))]>,
    Requires<[HasSSE1]>;
def IMPLICIT_DEF_FR64 : I<0, Pseudo, (outs FR64:$dst), (ins),
"#IMPLICIT_DEF $dst",
[(set FR64:$dst, (undef))]>, Requires<[HasSSE2]>;
+}
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
// These are 'extloads' from a scalar to the low element of a vector, zeroing
// the top elements. These are used for the SSE 'ss' and 'sd' instruction
// forms.
def sse_load_f32 : ComplexPattern<v4f32, 4, "SelectScalarSSELoad", [],
- [SDNPHasChain]>;
+ [SDNPHasChain, SDNPMayLoad]>;
def sse_load_f64 : ComplexPattern<v2f64, 4, "SelectScalarSSELoad", [],
- [SDNPHasChain]>;
+ [SDNPHasChain, SDNPMayLoad]>;
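+// For illustration (a sketch, not part of the original patch): a pattern
+// such as
+//   [(set VR128:$dst, (F32Int sse_load_f32:$src1, imm:$src2))]
+// lets SelectScalarSSELoad match either a plain scalar load or the low
+// element of a vector load, so a single 'ss' instruction can fold both.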
def ssmem : Operand<v4f32> {
let PrintMethod = "printf32mem";
}
//===----------------------------------------------------------------------===//
// Move Instructions
+let neverHasSideEffects = 1 in
def MOVSSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
"movss\t{$src, $dst|$dst, $src}", []>;
-let isLoad = 1, isReMaterializable = 1 in
+let isSimpleLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
def MOVSSrm : SSI<0x10, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
"movss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (loadf32 addr:$src))]>;
[(set GR32:$dst, (int_x86_sse_cvtss2si
(load addr:$src)))]>;
+// Match intrinsics which expect MM and XMM operand(s).
+def Int_CVTPS2PIrr : PSI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtps2pi VR128:$src))]>;
+def Int_CVTPS2PIrm : PSI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtps2pi
+ (load addr:$src)))]>;
+def Int_CVTTPS2PIrr: PSI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttps2pi VR128:$src))]>;
+def Int_CVTTPS2PIrm: PSI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttps2pi
+ (load addr:$src)))]>;
+let isTwoAddress = 1 in {
+ def Int_CVTPI2PSrr : PSI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR64:$src2),
+ "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
+ VR64:$src2))]>;
+ def Int_CVTPI2PSrm : PSI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
+ "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
+ (load addr:$src2)))]>;
+}
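+// For illustration (a sketch, not part of the original patch; the MMX
+// result type is assumed to be the pre-x86_mmx <2 x i32>): IR such as
+//   %r = call <2 x i32> @llvm.x86.sse.cvtps2pi(<4 x float> %v)
+// is matched by Int_CVTPS2PIrr above.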
+
// Aliases for intrinsics
def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
"cvttss2si\t{$src, $dst|$dst, $src}",
// Comparison instructions
let isTwoAddress = 1 in {
- def CMPSSrr : SSI<0xC2, MRMSrcReg,
+let neverHasSideEffects = 1 in
+ def CMPSSrr : SSIi8<0xC2, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src, SSECC:$cc),
"cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
- def CMPSSrm : SSI<0xC2, MRMSrcMem,
+let neverHasSideEffects = 1, mayLoad = 1 in
+ def CMPSSrm : SSIi8<0xC2, MRMSrcMem,
(outs FR32:$dst), (ins FR32:$src1, f32mem:$src, SSECC:$cc),
"cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
}
// Aliases to match intrinsics which expect XMM operand(s).
let isTwoAddress = 1 in {
- def Int_CMPSSrr : SSI<0xC2, MRMSrcReg,
+ def Int_CMPSSrr : SSIi8<0xC2, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
VR128:$src, imm:$cc))]>;
- def Int_CMPSSrm : SSI<0xC2, MRMSrcMem,
+ def Int_CMPSSrm : SSIi8<0xC2, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f32mem:$src, SSECC:$cc),
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
// disregarded.
+let neverHasSideEffects = 1 in
def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
"movaps\t{$src, $dst|$dst, $src}", []>;
// Alias instruction to load FR32 from f128mem using movaps. Upper bits are
// disregarded.
-let isLoad = 1 in
+let isSimpleLoad = 1 in
def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
"xorps\t{$src2, $dst|$dst, $src2}",
[(set FR32:$dst, (X86fxor FR32:$src1,
(memopfsf32 addr:$src2)))]>;
-
+let neverHasSideEffects = 1 in {
def FsANDNPSrr : PSI<0x55, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
"andnps\t{$src2, $dst|$dst, $src2}", []>;
+
+let mayLoad = 1 in
def FsANDNPSrm : PSI<0x55, MRMSrcMem,
(outs FR32:$dst), (ins FR32:$src1, f128mem:$src2),
"andnps\t{$src2, $dst|$dst, $src2}", []>;
}
+}
/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms.
///
// SSE packed FP Instructions
// Move Instructions
+let neverHasSideEffects = 1 in
def MOVAPSrr : PSI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", []>;
-let isLoad = 1, isReMaterializable = 1 in
+let isSimpleLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
def MOVAPSrm : PSI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (alignedloadv4f32 addr:$src))]>;
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
+let neverHasSideEffects = 1 in
def MOVUPSrr : PSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movups\t{$src, $dst|$dst, $src}", []>;
-let isLoad = 1 in
+let isSimpleLoad = 1 in
def MOVUPSrm : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"movups\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (loadv4f32 addr:$src))]>;
[(store (v4f32 VR128:$src), addr:$dst)]>;
// Intrinsic forms of MOVUPS load and store
-let isLoad = 1 in
+let isSimpleLoad = 1 in
def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"movups\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
// Alias instructions that map zero vector to pxor / xorp* for sse.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let isReMaterializable = 1 in
def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins),
"xorps\t$dst, $dst",
- [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+ [(set VR128:$dst, (v4i32 immAllZerosV))]>;
// FR32 to 128-bit vector conversion.
def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR32:$src),
// Move to lower bits of a VR128, leaving upper bits alone.
// Three operand (but two address) aliases.
let isTwoAddress = 1 in {
+let neverHasSideEffects = 1 in
def MOVLSS2PSrr : SSI<0x10, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, FR32:$src2),
"movss\t{$src2, $dst|$dst, $src2}", []>;
let AddedComplexity = 20 in
def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
"movss\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV,
+ [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV_bc,
(v4f32 (scalar_to_vector (loadf32 addr:$src))),
MOVL_shuffle_mask)))]>;
//===----------------------------------------------------------------------===//
// Move Instructions
+let neverHasSideEffects = 1 in
def MOVSDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
"movsd\t{$src, $dst|$dst, $src}", []>;
-let isLoad = 1, isReMaterializable = 1 in
+let isSimpleLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
def MOVSDrm : SDI<0x10, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
"movsd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (loadf64 addr:$src))]>;
[(set GR32:$dst, (int_x86_sse2_cvtsd2si
(load addr:$src)))]>;
+// Match intrinsics which expect MM and XMM operand(s).
+def Int_CVTPD2PIrr : PDI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtpd2pi VR128:$src))]>;
+def Int_CVTPD2PIrm : PDI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtpd2pi
+ (load addr:$src)))]>;
+def Int_CVTTPD2PIrr: PDI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttpd2pi VR128:$src))]>;
+def Int_CVTTPD2PIrm: PDI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttpd2pi
+ (load addr:$src)))]>;
+def Int_CVTPI2PDrr : PDI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2pd VR64:$src))]>;
+def Int_CVTPI2PDrm : PDI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2pd
+ (load addr:$src)))]>;
+
// Aliases for intrinsics
def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
"cvttsd2si\t{$src, $dst|$dst, $src}",
(load addr:$src)))]>;
// Comparison instructions
-let isTwoAddress = 1 in {
- def CMPSDrr : SDI<0xC2, MRMSrcReg,
+let isTwoAddress = 1, neverHasSideEffects = 1 in {
+ def CMPSDrr : SDIi8<0xC2, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src, SSECC:$cc),
"cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
- def CMPSDrm : SDI<0xC2, MRMSrcMem,
+let mayLoad = 1 in
+ def CMPSDrm : SDIi8<0xC2, MRMSrcMem,
(outs FR64:$dst), (ins FR64:$src1, f64mem:$src, SSECC:$cc),
"cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
}
// Aliases to match intrinsics which expect XMM operand(s).
let isTwoAddress = 1 in {
- def Int_CMPSDrr : SDI<0xC2, MRMSrcReg,
+ def Int_CMPSDrr : SDIi8<0xC2, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
VR128:$src, imm:$cc))]>;
- def Int_CMPSDrm : SDI<0xC2, MRMSrcMem,
+ def Int_CMPSDrm : SDIi8<0xC2, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src, SSECC:$cc),
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
// Alias instruction to do FR64 reg-to-reg copy using movapd. Upper bits are
// disregarded.
+let neverHasSideEffects = 1 in
def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
"movapd\t{$src, $dst|$dst, $src}", []>;
// Alias instruction to load FR64 from f128mem using movapd. Upper bits are
// disregarded.
-let isLoad = 1 in
+let isSimpleLoad = 1 in
def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
[(set FR64:$dst, (X86fxor FR64:$src1,
(memopfsf64 addr:$src2)))]>;
+let neverHasSideEffects = 1 in {
def FsANDNPDrr : PDI<0x55, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
"andnpd\t{$src2, $dst|$dst, $src2}", []>;
+let mayLoad = 1 in
def FsANDNPDrm : PDI<0x55, MRMSrcMem,
(outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
"andnpd\t{$src2, $dst|$dst, $src2}", []>;
}
+}
/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms.
///
// SSE packed FP Instructions
// Move Instructions
+let neverHasSideEffects = 1 in
def MOVAPDrr : PDI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movapd\t{$src, $dst|$dst, $src}", []>;
-let isLoad = 1, isReMaterializable = 1 in
+let isSimpleLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
def MOVAPDrm : PDI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (alignedloadv2f64 addr:$src))]>;
"movapd\t{$src, $dst|$dst, $src}",
[(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
+let neverHasSideEffects = 1 in
def MOVUPDrr : PDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movupd\t{$src, $dst|$dst, $src}", []>;
-let isLoad = 1 in
+let isSimpleLoad = 1 in
def MOVUPDrm : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (loadv2f64 addr:$src))]>;
// SSE integer instructions
// Move Instructions
+let neverHasSideEffects = 1 in
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", []>;
-let isLoad = 1 in
+let isSimpleLoad = 1, mayLoad = 1 in
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqa\t{$src, $dst|$dst, $src}",
[/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
+let mayStore = 1 in
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}",
[/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
-let isLoad = 1 in
+let isSimpleLoad = 1, mayLoad = 1 in
def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqu\t{$src, $dst|$dst, $src}",
[/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
XS, Requires<[HasSSE2]>;
+let mayStore = 1 in
def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
[/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
XS, Requires<[HasSSE2]>;
// Intrinsic forms of MOVDQU load and store
-let isLoad = 1 in
+let isSimpleLoad = 1 in
def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqu\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
// PSRAQ doesn't exist in SSE[1-3].
// 128-bit logical shifts.
-let isTwoAddress = 1 in {
+let isTwoAddress = 1, neverHasSideEffects = 1 in {
def PSLLDQri : PDIi8<0x73, MRM7r,
(outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
"pslldq\t{$src2, $dst|$dst, $src2}", []>;
def MFENCE : I<0xAE, MRM6m, (outs), (ins),
"mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
-
// Alias instruction that maps an all-ones vector to pcmpeqd for sse.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let isReMaterializable = 1 in
def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins),
"pcmpeqd\t$dst, $dst",
- [(set VR128:$dst, (v2f64 immAllOnesV))]>;
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
// FR64 to 128-bit vector conversion.
def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR64:$src),
// Move to lower bits of a VR128, leaving upper bits alone.
// Three operand (but two address) aliases.
let isTwoAddress = 1 in {
+ let neverHasSideEffects = 1 in
def MOVLSD2PDrr : SDI<0x10, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, FR64:$src2),
"movsd\t{$src2, $dst|$dst, $src2}", []>;
def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"movsd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (vector_shuffle immAllZerosV,
+ (v2f64 (vector_shuffle immAllZerosV_bc,
(v2f64 (scalar_to_vector
(loadf64 addr:$src))),
MOVL_shuffle_mask)))]>;
-let AddedComplexity = 15 in
// movd / movq to XMM register zero-extends
+let AddedComplexity = 15 in {
def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (vector_shuffle immAllZerosV,
(v4i32 (scalar_to_vector GR32:$src)),
MOVL_shuffle_mask)))]>;
-let AddedComplexity = 20 in
+// This is X86-64 only.
+def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (vector_shuffle immAllZerosV_bc,
+ (v2i64 (scalar_to_vector GR64:$src)),
+ MOVL_shuffle_mask)))]>;
+}
+
+let AddedComplexity = 20 in {
def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (vector_shuffle immAllZerosV,
(v4i32 (scalar_to_vector (loadi32 addr:$src))),
MOVL_shuffle_mask)))]>;
-
-// Moving from XMM to XMM but still clear upper 64 bits.
-let AddedComplexity = 15 in
-def MOVZQI2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_movl_dq VR128:$src))]>,
- XS, Requires<[HasSSE2]>;
-let AddedComplexity = 20 in
def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_movl_dq
- (bitconvert (memopv2i64 addr:$src))))]>,
- XS, Requires<[HasSSE2]>;
+ [(set VR128:$dst,
+ (v2i64 (vector_shuffle immAllZerosV_bc,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))),
+ MOVL_shuffle_mask)))]>, XS,
+ Requires<[HasSSE2]>;
+}
+// Move from XMM to XMM, clearing the upper 64 bits. Note: there is a bug in
+// the IA-32 documentation; movq xmm1, xmm2 does clear the high bits.
+let AddedComplexity = 15 in
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
+ VR128:$src,
+ MOVL_shuffle_mask)))]>,
+ XS, Requires<[HasSSE2]>;
+
+let AddedComplexity = 20 in
+def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
+ (memopv2i64 addr:$src),
+ MOVL_shuffle_mask)))]>,
+ XS, Requires<[HasSSE2]>;
//===----------------------------------------------------------------------===//
// SSE3 Instructions
let isTwoAddress = 1 in {
def PALIGNR64rr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2, i16imm:$src3),
- "palignr\t{$src2, $dst|$dst, $src2}",
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst,
(int_x86_ssse3_palign_r
VR64:$src1, VR64:$src2,
imm:$src3))]>;
def PALIGNR64rm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2, i16imm:$src3),
- "palignr\t{$src2, $dst|$dst, $src2}",
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst,
(int_x86_ssse3_palign_r
VR64:$src1,
def PALIGNR128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i32imm:$src3),
- "palignr\t{$src2, $dst|$dst, $src2}",
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_ssse3_palign_r_128
VR128:$src1, VR128:$src2,
imm:$src3))]>, OpSize;
def PALIGNR128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i32imm:$src3),
- "palignr\t{$src2, $dst|$dst, $src2}",
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_ssse3_palign_r_128
VR128:$src1,
def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
-// 128-bit vector all zero's.
-def : Pat<(v16i8 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v8i16 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v4i32 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v2f64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-
-// 128-bit vector all one's.
-def : Pat<(v16i8 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v8i16 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v4i32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE1]>;
-
+// extload f32 -> f64. This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag combine.
+// Since these loads aren't folded into the fextend, we have to match it
+// explicitly here.
+let Predicates = [HasSSE2] in
+ def : Pat<(fextend (loadf32 addr:$src)),
+ (CVTSS2SDrm addr:$src)>;
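+// For illustration (a sketch): C code like
+//   double widen(float *p) { return *p; }
+// produces (fextend (loadf32 addr)) when the load is not folded, and the
+// pattern above selects it to a single cvtss2sd from memory.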
// Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or
// 16 bits matter.
// Move scalar to XMM zero-extended
// movd to XMM register zero-extends
let AddedComplexity = 15 in {
-def : Pat<(v8i16 (vector_shuffle immAllZerosV,
- (v8i16 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
- (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v16i8 (vector_shuffle immAllZerosV,
- (v16i8 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
- (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
// Zero a VR128, then do a MOVS{S|D} to the lower bits.
-def : Pat<(v2f64 (vector_shuffle immAllZerosV,
+def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
(v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
(MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 (vector_shuffle immAllZerosV,
+def : Pat<(v4f32 (vector_shuffle immAllZerosV_bc,
(v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
(MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
}
// Special unary SHUFPSrri case.
// FIXME: when we want non-two-address code, should we use PSHUFD?
-def : Pat<(vector_shuffle (v4f32 VR128:$src1), (undef),
- SHUFP_unary_shuffle_mask:$sm),
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (undef),
+ SHUFP_unary_shuffle_mask:$sm)),
(SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
Requires<[HasSSE1]>;
// Special unary SHUFPDrri case.
-def : Pat<(vector_shuffle (v2f64 VR128:$src1), (undef),
- SHUFP_unary_shuffle_mask:$sm),
+def : Pat<(v2f64 (vector_shuffle VR128:$src1, (undef),
+ SHUFP_unary_shuffle_mask:$sm)),
(SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
Requires<[HasSSE2]>;
// Unary v4f32 shuffle with PSHUF* in order to fold a load.
(PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>,
Requires<[HasSSE2]>;
// Special binary v4i32 shuffle cases with SHUFPS.
-def : Pat<(vector_shuffle (v4i32 VR128:$src1), (v4i32 VR128:$src2),
- PSHUFD_binary_shuffle_mask:$sm),
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, (v4i32 VR128:$src2),
+ PSHUFD_binary_shuffle_mask:$sm)),
(SHUFPSrri VR128:$src1, VR128:$src2, PSHUFD_binary_shuffle_mask:$sm)>,
Requires<[HasSSE2]>;
-def : Pat<(vector_shuffle (v4i32 VR128:$src1),
- (bc_v4i32 (memopv2i64 addr:$src2)), PSHUFD_binary_shuffle_mask:$sm),
+def : Pat<(v4i32 (vector_shuffle VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)), PSHUFD_binary_shuffle_mask:$sm)),
(SHUFPSrmi VR128:$src1, addr:$src2, PSHUFD_binary_shuffle_mask:$sm)>,
Requires<[HasSSE2]>;
+// Special binary v2i64 shuffle cases using SHUFPDrri.
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
+ SHUFP_shuffle_mask:$sm)),
+ (SHUFPDrri VR128:$src1, VR128:$src2, SHUFP_shuffle_mask:$sm)>,
+ Requires<[HasSSE2]>;
+// Special unary SHUFPDrri case (v2i64).
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, (undef),
+ SHUFP_unary_shuffle_mask:$sm)),
+ (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
+ Requires<[HasSSE2]>;
// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
let AddedComplexity = 10 in {
}
// Set lowest element and zero upper elements.
-let AddedComplexity = 20 in
-def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))),
- MOVL_shuffle_mask)),
- (MOVZQI2PQIrm addr:$src)>, Requires<[HasSSE2]>;
+let AddedComplexity = 15 in
+def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc, VR128:$src,
+ MOVL_shuffle_mask)),
+ (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
+
// FIXME: Temporary workaround since 2-wide shuffle is broken.
def : Pat<(int_x86_sse2_movs_d VR128:$src1, VR128:$src2),
(memopv2i64 addr:$src2))),
(PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+// vector -> vector casts
+def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ (Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (Int_CVTTPS2DQrr VR128:$src)>, Requires<[HasSSE2]>;
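+// For illustration (a sketch): IR such as
+//   %f = sitofp <4 x i32> %v to <4 x float>
+// now selects to cvtdq2ps via Int_CVTDQ2PSrr, and the fp_to_sint direction
+// selects to cvttps2dq via Int_CVTTPS2DQrr.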
+
// Use movaps / movups for SSE integer load / store (one byte shorter).
def : Pat<(alignedloadv4i32 addr:$src),
(MOVAPSrm addr:$src)>, Requires<[HasSSE1]>;
(MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
(MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE4.1 Instruction Templates:
+//
+// SS418I - SSE 4.1 instructions with T8 prefix.
+// SS41AI - SSE 4.1 instructions with TA prefix.
+//
+class SS418I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE41]>;
+class SS41AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE41]>;
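+// For reference (hedged; encodings are not spelled out in this patch): T8
+// and TA select the 0F 38 and 0F 3A opcode maps, so e.g. the "roundss"
+// defined via SS41AI below lands in the 0F 3A map with opcode 0x0A.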
+
+
+multiclass sse41_fp_unop_rm<bits<8> opcss, bits<8> opcps,
+ bits<8> opcsd, bits<8> opcpd,
+ string OpcodeStr,
+ Intrinsic F32Int,
+ Intrinsic V4F32Int,
+ Intrinsic F64Int,
+ Intrinsic V2F64Int,
+ bit Commutable = 0> {
+ // Intrinsic operation, reg.
+ def SSr_Int : SS41AI<opcss, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1, imm:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, mem.
+ def SSm_Int : SS41AI<opcss, MRMSrcMem,
+ (outs VR128:$dst), (ins ssmem:$src1, i32imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (F32Int sse_load_f32:$src1, imm:$src2))]>;
+
+ // Vector intrinsic operation, reg
+ def PSr_Int : SS41AI<opcps, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, mem
+ def PSm_Int : SS41AI<opcps, MRMSrcMem,
+ (outs VR128:$dst), (ins f128mem:$src1, i32imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (V4F32Int (load addr:$src1), imm:$src2))]>;
+
+ // Intrinsic operation, reg.
+ def SDr_Int : SS41AI<opcsd, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1, imm:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, mem.
+ def SDm_Int : SS41AI<opcsd, MRMSrcMem,
+ (outs VR128:$dst), (ins sdmem:$src1, i32imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (F64Int sse_load_f64:$src1, imm:$src2))]>;
+
+ // Vector intrinsic operation, reg
+ def PDr_Int : SS41AI<opcpd, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, mem
+ def PDm_Int : SS41AI<opcpd, MRMSrcMem,
+ (outs VR128:$dst), (ins f128mem:$src1, i32imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (V2F64Int (load addr:$src1), imm:$src2))]>;
+}
-// (vextract (v4i32 bc (v4f32 s2v (f32 load $addr))), 0) -> (i32 load $addr)
-def : Pat<(vector_extract
- (bc_v4i32 (v4f32 (scalar_to_vector (loadf32 addr:$src)))), (iPTR 0)),
- (MOV32rm addr:$src)>, Requires<[HasSSE2]>;
-def : Pat<(vector_extract
- (bc_v2i64 (v2f64 (scalar_to_vector (loadf64 addr:$src)))), (iPTR 0)),
- (MOV64rm addr:$src)>, Requires<[HasSSE2, In64BitMode]>;
+// FP round - roundss, roundps, roundsd, roundpd
+defm ROUND : sse41_fp_unop_rm<0x0A, 0x08, 0x0B, 0x09, "round",
+ int_x86_sse41_round_ss, int_x86_sse41_round_ps,
+ int_x86_sse41_round_sd, int_x86_sse41_round_pd>;