X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86InstrSSE.td;h=403daf5ca24b469a40f28133d5ee1964f5c90b70;hb=01998742c33f8cf247b96bf308e7e57653faa031;hp=4e495b6784b79f44f0a60dd4b6494670148ad4a3;hpb=b21495043e2c82bdaf88d44c8859cf9fc757695c;p=oota-llvm.git diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 4e495b6784b..403daf5ca24 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -19,14 +19,16 @@ def X86loadp : SDNode<"X86ISD::LOAD_PACK", SDTLoad, [SDNPHasChain]>; +def X86loadu : SDNode<"X86ISD::LOAD_UA", SDTLoad, + [SDNPHasChain]>; def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest, - [SDNPOutFlag]>; + [SDNPHasChain, SDNPOutFlag]>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest, - [SDNPOutFlag]>; + [SDNPHasChain, SDNPOutFlag]>; def X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>; def X86pextrw : SDNode<"X86ISD::PEXTRW", @@ -43,9 +45,6 @@ def X86loadpf64 : PatFrag<(ops node:$ptr), (f64 (X86loadp node:$ptr))>; def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; -def loadv16i8 : PatFrag<(ops node:$ptr), (v16i8 (load node:$ptr))>; -def loadv8i16 : PatFrag<(ops node:$ptr), (v8i16 (load node:$ptr))>; -def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>; def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; @@ -186,72 +185,75 @@ class S3I o, Format F, dag ops, string asm, list pattern> //===----------------------------------------------------------------------===// // Helpers for defining instructions that directly correspond to intrinsics. -class SS_Intr o, string asm, Intrinsic IntId> - : SSI; -class SS_Intm o, string asm, Intrinsic IntId> - : SSI; -class SD_Intr o, string asm, Intrinsic IntId> - : SDI; -class SD_Intm o, string asm, Intrinsic IntId> - : SDI; - -class SS_Intrr o, string asm, Intrinsic IntId> - : SSI o, string OpcodeStr, Intrinsic IntId> { + def r : SSI; + def m : SSI; +} + +multiclass SD_IntUnary o, string OpcodeStr, Intrinsic IntId> { + def r : SDI; + def m : SDI; +} + +class SS_Intrr o, string OpcodeStr, Intrinsic IntId> + : SSI; -class SS_Intrm o, string asm, Intrinsic IntId> - : SSI o, string OpcodeStr, Intrinsic IntId> + : SSI; -class SD_Intrr o, string asm, Intrinsic IntId> - : SDI o, string OpcodeStr, Intrinsic IntId> + : SDI; -class SD_Intrm o, string asm, Intrinsic IntId> - : SDI o, string OpcodeStr, Intrinsic IntId> + : SDI; -class PS_Intr o, string asm, Intrinsic IntId> - : PSI o, string OpcodeStr, Intrinsic IntId> + : PSI; -class PS_Intm o, string asm, Intrinsic IntId> - : PSI; -class PD_Intr o, string asm, Intrinsic IntId> - : PDI o, string OpcodeStr, Intrinsic IntId> + : PSI; +class PD_Intr o, string OpcodeStr, Intrinsic IntId> + : PDI; -class PD_Intm o, string asm, Intrinsic IntId> - : PDI; - -class PS_Intrr o, string asm, Intrinsic IntId> - : PSI o, string OpcodeStr, Intrinsic IntId> + : PDI; + +class PS_Intrr o, string OpcodeStr, Intrinsic IntId> + : PSI; -class PS_Intrm o, string asm, Intrinsic IntId> - : PSI; -class PD_Intrr o, string asm, Intrinsic IntId> - : PDI o, string OpcodeStr, Intrinsic IntId> + : PSI; +class PD_Intrr o, string OpcodeStr, Intrinsic IntId> + : PDI; -class PD_Intrm o, string asm, Intrinsic IntId> - : PDI; - -class S3D_Intrr o, string asm, Intrinsic IntId> - : S3DI; -class S3D_Intrm o, string asm, Intrinsic IntId> - : S3DI; -class S3_Intrr o, string asm, Intrinsic IntId> - : S3I; -class S3_Intrm o, string asm, Intrinsic IntId> - : S3I; +class PD_Intrm o, string OpcodeStr, Intrinsic IntId> + : PDI; // Some 'special' instructions def IMPLICIT_DEF_FR32 : I<0, Pseudo, (ops FR32:$dst), @@ -378,114 +380,47 @@ def SQRTSDm : SDI<0x51, MRMSrcMem, (ops FR64:$dst, f64mem:$src), "sqrtsd {$src, $dst|$dst, $src}", [(set FR64:$dst, (fsqrt (loadf64 addr:$src)))]>; -def RSQRTSSr : SSI<0x52, MRMSrcReg, (ops FR32:$dst, FR32:$src), - "rsqrtss {$src, $dst|$dst, $src}", []>; -def RSQRTSSm : SSI<0x52, MRMSrcMem, (ops FR32:$dst, f32mem:$src), - "rsqrtss {$src, $dst|$dst, $src}", []>; -def RCPSSr : SSI<0x53, MRMSrcReg, (ops FR32:$dst, FR32:$src), - "rcpss {$src, $dst|$dst, $src}", []>; -def RCPSSm : SSI<0x53, MRMSrcMem, (ops FR32:$dst, f32mem:$src), - "rcpss {$src, $dst|$dst, $src}", []>; - -let isTwoAddress = 1 in { -let isCommutable = 1 in { -def MAXSSrr : SSI<0x5F, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), - "maxss {$src2, $dst|$dst, $src2}", []>; -def MAXSDrr : SDI<0x5F, MRMSrcReg, (ops FR64:$dst, FR32:$src1, FR64:$src2), - "maxsd {$src2, $dst|$dst, $src2}", []>; -def MINSSrr : SSI<0x5D, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), - "minss {$src2, $dst|$dst, $src2}", []>; -def MINSDrr : SDI<0x5D, MRMSrcReg, (ops FR64:$dst, FR32:$src1, FR64:$src2), - "minsd {$src2, $dst|$dst, $src2}", []>; -} -def MAXSSrm : SSI<0x5F, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2), - "maxss {$src2, $dst|$dst, $src2}", []>; -def MAXSDrm : SDI<0x5F, MRMSrcMem, (ops FR64:$dst, FR32:$src1, f64mem:$src2), - "maxsd {$src2, $dst|$dst, $src2}", []>; -def MINSSrm : SSI<0x5D, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2), - "minss {$src2, $dst|$dst, $src2}", []>; -def MINSDrm : SDI<0x5D, MRMSrcMem, (ops FR64:$dst, FR32:$src1, f64mem:$src2), - "minsd {$src2, $dst|$dst, $src2}", []>; -} - // Aliases to match intrinsics which expect XMM operand(s). let isTwoAddress = 1 in { let isCommutable = 1 in { -def Int_ADDSSrr : SS_Intrr<0x58, "addss {$src2, $dst|$dst, $src2}", - int_x86_sse_add_ss>; -def Int_ADDSDrr : SD_Intrr<0x58, "addsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_add_sd>; -def Int_MULSSrr : SS_Intrr<0x59, "mulss {$src2, $dst|$dst, $src2}", - int_x86_sse_mul_ss>; -def Int_MULSDrr : SD_Intrr<0x59, "mulsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_mul_sd>; +def Int_ADDSSrr : SS_Intrr<0x58, "addss", int_x86_sse_add_ss>; +def Int_ADDSDrr : SD_Intrr<0x58, "addsd", int_x86_sse2_add_sd>; +def Int_MULSSrr : SS_Intrr<0x59, "mulss", int_x86_sse_mul_ss>; +def Int_MULSDrr : SD_Intrr<0x59, "mulsd", int_x86_sse2_mul_sd>; } -def Int_ADDSSrm : SS_Intrm<0x58, "addss {$src2, $dst|$dst, $src2}", - int_x86_sse_add_ss>; -def Int_ADDSDrm : SD_Intrm<0x58, "addsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_add_sd>; -def Int_MULSSrm : SS_Intrm<0x59, "mulss {$src2, $dst|$dst, $src2}", - int_x86_sse_mul_ss>; -def Int_MULSDrm : SD_Intrm<0x59, "mulsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_mul_sd>; - -def Int_DIVSSrr : SS_Intrr<0x5E, "divss {$src2, $dst|$dst, $src2}", - int_x86_sse_div_ss>; -def Int_DIVSSrm : SS_Intrm<0x5E, "divss {$src2, $dst|$dst, $src2}", - int_x86_sse_div_ss>; -def Int_DIVSDrr : SD_Intrr<0x5E, "divsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_div_sd>; -def Int_DIVSDrm : SD_Intrm<0x5E, "divsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_div_sd>; - -def Int_SUBSSrr : SS_Intrr<0x5C, "subss {$src2, $dst|$dst, $src2}", - int_x86_sse_sub_ss>; -def Int_SUBSSrm : SS_Intrm<0x5C, "subss {$src2, $dst|$dst, $src2}", - int_x86_sse_sub_ss>; -def Int_SUBSDrr : SD_Intrr<0x5C, "subsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_sub_sd>; -def Int_SUBSDrm : SD_Intrm<0x5C, "subsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_sub_sd>; +def Int_ADDSSrm : SS_Intrm<0x58, "addss", int_x86_sse_add_ss>; +def Int_ADDSDrm : SD_Intrm<0x58, "addsd", int_x86_sse2_add_sd>; +def Int_MULSSrm : SS_Intrm<0x59, "mulss", int_x86_sse_mul_ss>; +def Int_MULSDrm : SD_Intrm<0x59, "mulsd", int_x86_sse2_mul_sd>; + +def Int_DIVSSrr : SS_Intrr<0x5E, "divss", int_x86_sse_div_ss>; +def Int_DIVSSrm : SS_Intrm<0x5E, "divss", int_x86_sse_div_ss>; +def Int_DIVSDrr : SD_Intrr<0x5E, "divsd", int_x86_sse2_div_sd>; +def Int_DIVSDrm : SD_Intrm<0x5E, "divsd", int_x86_sse2_div_sd>; + +def Int_SUBSSrr : SS_Intrr<0x5C, "subss", int_x86_sse_sub_ss>; +def Int_SUBSSrm : SS_Intrm<0x5C, "subss", int_x86_sse_sub_ss>; +def Int_SUBSDrr : SD_Intrr<0x5C, "subsd", int_x86_sse2_sub_sd>; +def Int_SUBSDrm : SD_Intrm<0x5C, "subsd", int_x86_sse2_sub_sd>; } -def Int_SQRTSSr : SS_Intr<0x51, "sqrtss {$src, $dst|$dst, $src}", - int_x86_sse_sqrt_ss>; -def Int_SQRTSSm : SS_Intm<0x51, "sqrtss {$src, $dst|$dst, $src}", - int_x86_sse_sqrt_ss>; -def Int_SQRTSDr : SD_Intr<0x51, "sqrtsd {$src, $dst|$dst, $src}", - int_x86_sse2_sqrt_sd>; -def Int_SQRTSDm : SD_Intm<0x51, "sqrtsd {$src, $dst|$dst, $src}", - int_x86_sse2_sqrt_sd>; - -def Int_RSQRTSSr : SS_Intr<0x52, "rsqrtss {$src, $dst|$dst, $src}", - int_x86_sse_rsqrt_ss>; -def Int_RSQRTSSm : SS_Intm<0x52, "rsqrtss {$src, $dst|$dst, $src}", - int_x86_sse_rsqrt_ss>; -def Int_RCPSSr : SS_Intr<0x53, "rcpss {$src, $dst|$dst, $src}", - int_x86_sse_rcp_ss>; -def Int_RCPSSm : SS_Intm<0x53, "rcpss {$src, $dst|$dst, $src}", - int_x86_sse_rcp_ss>; +defm Int_SQRTSS : SS_IntUnary<0x51, "sqrtss" , int_x86_sse_sqrt_ss>; +defm Int_SQRTSD : SD_IntUnary<0x51, "sqrtsd" , int_x86_sse2_sqrt_sd>; +defm Int_RSQRTSS : SS_IntUnary<0x52, "rsqrtss", int_x86_sse_rsqrt_ss>; +defm Int_RCPSS : SS_IntUnary<0x53, "rcpss" , int_x86_sse_rcp_ss>; let isTwoAddress = 1 in { let isCommutable = 1 in { -def Int_MAXSSrr : SS_Intrr<0x5F, "maxss {$src2, $dst|$dst, $src2}", - int_x86_sse_max_ss>; -def Int_MAXSDrr : SD_Intrr<0x5F, "maxsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_max_sd>; -def Int_MINSSrr : SS_Intrr<0x5D, "minss {$src2, $dst|$dst, $src2}", - int_x86_sse_min_ss>; -def Int_MINSDrr : SD_Intrr<0x5D, "minsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_min_sd>; +def Int_MAXSSrr : SS_Intrr<0x5F, "maxss", int_x86_sse_max_ss>; +def Int_MAXSDrr : SD_Intrr<0x5F, "maxsd", int_x86_sse2_max_sd>; +def Int_MINSSrr : SS_Intrr<0x5D, "minss", int_x86_sse_min_ss>; +def Int_MINSDrr : SD_Intrr<0x5D, "minsd", int_x86_sse2_min_sd>; } -def Int_MAXSSrm : SS_Intrm<0x5F, "maxss {$src2, $dst|$dst, $src2}", - int_x86_sse_max_ss>; -def Int_MAXSDrm : SD_Intrm<0x5F, "maxsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_max_sd>; -def Int_MINSSrm : SS_Intrm<0x5D, "minss {$src2, $dst|$dst, $src2}", - int_x86_sse_min_ss>; -def Int_MINSDrm : SD_Intrm<0x5D, "minsd {$src2, $dst|$dst, $src2}", - int_x86_sse2_min_sd>; +def Int_MAXSSrm : SS_Intrm<0x5F, "maxss", int_x86_sse_max_ss>; +def Int_MAXSDrm : SD_Intrm<0x5F, "maxsd", int_x86_sse2_max_sd>; +def Int_MINSSrm : SS_Intrm<0x5D, "minss", int_x86_sse_min_ss>; +def Int_MINSDrm : SD_Intrm<0x5D, "minsd", int_x86_sse2_min_sd>; } // Conversion instructions @@ -537,14 +472,14 @@ def Int_CVTSS2SIrr: SSI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src), def Int_CVTSS2SIrm: SSI<0x2D, MRMSrcMem, (ops GR32:$dst, f32mem:$src), "cvtss2si {$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse_cvtss2si - (loadv4f32 addr:$src)))]>; + (load addr:$src)))]>; def Int_CVTSD2SIrr: SDI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src), "cvtsd2si {$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>; def Int_CVTSD2SIrm: SDI<0x2D, MRMSrcMem, (ops GR32:$dst, f128mem:$src), "cvtsd2si {$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_cvtsd2si - (loadv2f64 addr:$src)))]>; + (load addr:$src)))]>; // Aliases for intrinsics def Int_CVTTSS2SIrr: SSI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src), @@ -552,15 +487,14 @@ def Int_CVTTSS2SIrr: SSI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src), [(set GR32:$dst, (int_x86_sse_cvttss2si VR128:$src))]>; def Int_CVTTSS2SIrm: SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src), "cvttss2si {$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse_cvttss2si - (loadv4f32 addr:$src)))]>; + [(set GR32:$dst, (int_x86_sse_cvttss2si(load addr:$src)))]>; def Int_CVTTSD2SIrr: SDI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src), "cvttsd2si {$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_cvttsd2si VR128:$src))]>; def Int_CVTTSD2SIrm: SDI<0x2C, MRMSrcMem, (ops GR32:$dst, f128mem:$src), "cvttsd2si {$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_cvttsd2si - (loadv2f64 addr:$src)))]>; + (load addr:$src)))]>; let isTwoAddress = 1 in { def Int_CVTSI2SSrr: SSI<0x2A, MRMSrcReg, @@ -630,26 +564,26 @@ def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2), [(X86ucomi (v4f32 VR128:$src1), VR128:$src2)]>; def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2), "ucomiss {$src2, $src1|$src1, $src2}", - [(X86ucomi (v4f32 VR128:$src1), (loadv4f32 addr:$src2))]>; + [(X86ucomi (v4f32 VR128:$src1), (load addr:$src2))]>; def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2), "ucomisd {$src2, $src1|$src1, $src2}", [(X86ucomi (v2f64 VR128:$src1), (v2f64 VR128:$src2))]>; def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2), "ucomisd {$src2, $src1|$src1, $src2}", - [(X86ucomi (v2f64 VR128:$src1), (loadv2f64 addr:$src2))]>; + [(X86ucomi (v2f64 VR128:$src1), (load addr:$src2))]>; def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2), "comiss {$src2, $src1|$src1, $src2}", [(X86comi (v4f32 VR128:$src1), VR128:$src2)]>; def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2), "comiss {$src2, $src1|$src1, $src2}", - [(X86comi (v4f32 VR128:$src1), (loadv4f32 addr:$src2))]>; + [(X86comi (v4f32 VR128:$src1), (load addr:$src2))]>; def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2), "comisd {$src2, $src1|$src1, $src2}", [(X86comi (v2f64 VR128:$src1), (v2f64 VR128:$src2))]>; def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2), "comisd {$src2, $src1|$src1, $src2}", - [(X86comi (v2f64 VR128:$src1), (loadv2f64 addr:$src2))]>; + [(X86comi (v2f64 VR128:$src1), (load addr:$src2))]>; // Aliases of packed instructions for scalar use. These all have names that // start with 'Fs'. @@ -888,7 +822,7 @@ def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src), def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (ops VR128:$dst, i128mem:$src), "cvtdq2ps {$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps - (bc_v4i32 (loadv2i64 addr:$src))))]>, + (bitconvert (loadv2i64 addr:$src))))]>, TB, Requires<[HasSSE2]>; // SSE2 instructions with XS prefix @@ -899,7 +833,7 @@ def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src), def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (ops VR128:$dst, i64mem:$src), "cvtdq2pd {$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd - (bc_v4i32 (loadv2i64 addr:$src))))]>, + (bitconvert (loadv2i64 addr:$src))))]>, XS, Requires<[HasSSE2]>; def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src), @@ -908,7 +842,7 @@ def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src), def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (ops VR128:$dst, f128mem:$src), "cvtps2dq {$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq - (loadv4f32 addr:$src)))]>; + (load addr:$src)))]>; // SSE2 packed instructions with XS prefix def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src), "cvttps2dq {$src, $dst|$dst, $src}", @@ -917,7 +851,7 @@ def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src), def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (ops VR128:$dst, f128mem:$src), "cvttps2dq {$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttps2dq - (loadv4f32 addr:$src)))]>, + (load addr:$src)))]>, XS, Requires<[HasSSE2]>; // SSE2 packed instructions with XD prefix @@ -928,7 +862,7 @@ def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src), def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (ops VR128:$dst, f128mem:$src), "cvtpd2dq {$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq - (loadv2f64 addr:$src)))]>, + (load addr:$src)))]>, XD, Requires<[HasSSE2]>; def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src), "cvttpd2dq {$src, $dst|$dst, $src}", @@ -936,7 +870,7 @@ def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src), def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (ops VR128:$dst, f128mem:$src), "cvttpd2dq {$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (loadv2f64 addr:$src)))]>; + (load addr:$src)))]>; // SSE2 instructions without OpSize prefix def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src), @@ -946,7 +880,7 @@ def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src), def Int_CVTPS2PDrm : I<0x5A, MRMSrcReg, (ops VR128:$dst, f64mem:$src), "cvtps2pd {$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd - (loadv4f32 addr:$src)))]>, + (load addr:$src)))]>, TB, Requires<[HasSSE2]>; def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src), @@ -955,7 +889,7 @@ def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src), def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcReg, (ops VR128:$dst, f128mem:$src), "cvtpd2ps {$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps - (loadv2f64 addr:$src)))]>; + (load addr:$src)))]>; // Match intrinsics which expect XMM operand(s). // Aliases for intrinsics @@ -979,7 +913,7 @@ def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2), "cvtsd2ss {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, - (loadv2f64 addr:$src2)))]>; + (load addr:$src2)))]>; def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), "cvtss2sd {$src2, $dst|$dst, $src2}", @@ -990,7 +924,7 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f32mem:$src2), "cvtss2sd {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, - (loadv4f32 addr:$src2)))]>, XS, + (load addr:$src2)))]>, XS, Requires<[HasSSE2]>; } @@ -1067,7 +1001,7 @@ def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), "addsubps {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1, - (loadv4f32 addr:$src2)))]>; + (load addr:$src2)))]>; def ADDSUBPDrr : S3I<0xD0, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), "addsubpd {$src2, $dst|$dst, $src2}", @@ -1077,46 +1011,30 @@ def ADDSUBPDrm : S3I<0xD0, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), "addsubpd {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1, - (loadv2f64 addr:$src2)))]>; + (load addr:$src2)))]>; } -def SQRTPSr : PS_Intr<0x51, "sqrtps {$src, $dst|$dst, $src}", - int_x86_sse_sqrt_ps>; -def SQRTPSm : PS_Intm<0x51, "sqrtps {$src, $dst|$dst, $src}", - int_x86_sse_sqrt_ps>; -def SQRTPDr : PD_Intr<0x51, "sqrtpd {$src, $dst|$dst, $src}", - int_x86_sse2_sqrt_pd>; -def SQRTPDm : PD_Intm<0x51, "sqrtpd {$src, $dst|$dst, $src}", - int_x86_sse2_sqrt_pd>; - -def RSQRTPSr : PS_Intr<0x52, "rsqrtps {$src, $dst|$dst, $src}", - int_x86_sse_rsqrt_ps>; -def RSQRTPSm : PS_Intm<0x52, "rsqrtps {$src, $dst|$dst, $src}", - int_x86_sse_rsqrt_ps>; -def RCPPSr : PS_Intr<0x53, "rcpps {$src, $dst|$dst, $src}", - int_x86_sse_rcp_ps>; -def RCPPSm : PS_Intm<0x53, "rcpps {$src, $dst|$dst, $src}", - int_x86_sse_rcp_ps>; +def SQRTPSr : PS_Intr<0x51, "sqrtps", int_x86_sse_sqrt_ps>; +def SQRTPSm : PS_Intm<0x51, "sqrtps", int_x86_sse_sqrt_ps>; +def SQRTPDr : PD_Intr<0x51, "sqrtpd", int_x86_sse2_sqrt_pd>; +def SQRTPDm : PD_Intm<0x51, "sqrtpd", int_x86_sse2_sqrt_pd>; + +def RSQRTPSr : PS_Intr<0x52, "rsqrtps", int_x86_sse_rsqrt_ps>; +def RSQRTPSm : PS_Intm<0x52, "rsqrtps", int_x86_sse_rsqrt_ps>; +def RCPPSr : PS_Intr<0x53, "rcpps", int_x86_sse_rcp_ps>; +def RCPPSm : PS_Intm<0x53, "rcpps", int_x86_sse_rcp_ps>; let isTwoAddress = 1 in { let isCommutable = 1 in { -def MAXPSrr : PS_Intrr<0x5F, "maxps {$src2, $dst|$dst, $src2}", - int_x86_sse_max_ps>; -def MAXPDrr : PD_Intrr<0x5F, "maxpd {$src2, $dst|$dst, $src2}", - int_x86_sse2_max_pd>; -def MINPSrr : PS_Intrr<0x5D, "minps {$src2, $dst|$dst, $src2}", - int_x86_sse_min_ps>; -def MINPDrr : PD_Intrr<0x5D, "minpd {$src2, $dst|$dst, $src2}", - int_x86_sse2_min_pd>; +def MAXPSrr : PS_Intrr<0x5F, "maxps", int_x86_sse_max_ps>; +def MAXPDrr : PD_Intrr<0x5F, "maxpd", int_x86_sse2_max_pd>; +def MINPSrr : PS_Intrr<0x5D, "minps", int_x86_sse_min_ps>; +def MINPDrr : PD_Intrr<0x5D, "minpd", int_x86_sse2_min_pd>; } -def MAXPSrm : PS_Intrm<0x5F, "maxps {$src2, $dst|$dst, $src2}", - int_x86_sse_max_ps>; -def MAXPDrm : PD_Intrm<0x5F, "maxpd {$src2, $dst|$dst, $src2}", - int_x86_sse2_max_pd>; -def MINPSrm : PS_Intrm<0x5D, "minps {$src2, $dst|$dst, $src2}", - int_x86_sse_min_ps>; -def MINPDrm : PD_Intrm<0x5D, "minpd {$src2, $dst|$dst, $src2}", - int_x86_sse2_min_pd>; +def MAXPSrm : PS_Intrm<0x5F, "maxps", int_x86_sse_max_ps>; +def MAXPDrm : PD_Intrm<0x5F, "maxpd", int_x86_sse2_max_pd>; +def MINPSrm : PS_Intrm<0x5D, "minps", int_x86_sse_min_ps>; +def MINPDrm : PD_Intrm<0x5D, "minpd", int_x86_sse2_min_pd>; } // Logical @@ -1129,7 +1047,7 @@ def ANDPDrr : PDI<0x54, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), "andpd {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (and (bc_v2i64 (v2f64 VR128:$src1)), - (bc_v2i64 (v2f64 VR128:$src2))))]>; + (bc_v2i64 (v2f64 VR128:$src2))))]>; def ORPSrr : PSI<0x56, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), "orps {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v2i64 (or VR128:$src1, VR128:$src2)))]>; @@ -1137,7 +1055,7 @@ def ORPDrr : PDI<0x56, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), "orpd {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (or (bc_v2i64 (v2f64 VR128:$src1)), - (bc_v2i64 (v2f64 VR128:$src2))))]>; + (bc_v2i64 (v2f64 VR128:$src2))))]>; def XORPSrr : PSI<0x57, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), "xorps {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v2i64 (xor VR128:$src1, VR128:$src2)))]>; @@ -1145,7 +1063,7 @@ def XORPDrr : PDI<0x57, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), "xorpd {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (xor (bc_v2i64 (v2f64 VR128:$src1)), - (bc_v2i64 (v2f64 VR128:$src2))))]>; + (bc_v2i64 (v2f64 VR128:$src2))))]>; } def ANDPSrm : PSI<0x54, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), "andps {$src2, $dst|$dst, $src2}", @@ -1155,7 +1073,7 @@ def ANDPDrm : PDI<0x54, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), "andpd {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (and (bc_v2i64 (v2f64 VR128:$src1)), - (bc_v2i64 (loadv2f64 addr:$src2))))]>; + (bc_v2i64 (loadv2f64 addr:$src2))))]>; def ORPSrm : PSI<0x56, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), "orps {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (or VR128:$src1, @@ -1164,7 +1082,7 @@ def ORPDrm : PDI<0x56, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), "orpd {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (or (bc_v2i64 (v2f64 VR128:$src1)), - (bc_v2i64 (loadv2f64 addr:$src2))))]>; + (bc_v2i64 (loadv2f64 addr:$src2))))]>; def XORPSrm : PSI<0x57, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), "xorps {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (xor VR128:$src1, @@ -1173,7 +1091,7 @@ def XORPDrm : PDI<0x57, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), "xorpd {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (xor (bc_v2i64 (v2f64 VR128:$src1)), - (bc_v2i64 (loadv2f64 addr:$src2))))]>; + (bc_v2i64 (loadv2f64 addr:$src2))))]>; def ANDNPSrr : PSI<0x55, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), "andnps {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v2i64 (and (xor VR128:$src1, @@ -1188,12 +1106,12 @@ def ANDNPDrr : PDI<0x55, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), "andnpd {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))), - (bc_v2i64 (v2f64 VR128:$src2))))]>; + (bc_v2i64 (v2f64 VR128:$src2))))]>; def ANDNPDrm : PDI<0x55, MRMSrcMem, (ops VR128:$dst, VR128:$src1,f128mem:$src2), "andnpd {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))), - (bc_v2i64 (loadv2f64 addr:$src2))))]>; + (bc_v2i64 (loadv2f64 addr:$src2))))]>; } let isTwoAddress = 1 in { @@ -1221,7 +1139,7 @@ def CMPPDrmi : PDIi8<0xC2, MRMSrcMem, // Shuffle and unpack instructions let isTwoAddress = 1 in { -let isCommutable = 1, isConvertibleToThreeAddress = 1 in // Convert to pshufd +let isConvertibleToThreeAddress = 1 in // Convert to pshufd def SHUFPSrri : PSIi8<0xC6, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2, i32i8imm:$src3), "shufps {$src3, $src2, $dst|$dst, $src2, $src3}", @@ -1234,7 +1152,6 @@ def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem, [(set VR128:$dst, (v4f32 (vector_shuffle VR128:$src1, (load addr:$src2), SHUFP_shuffle_mask:$src3)))]>; -let isCommutable = 1 in def SHUFPDrri : PDIi8<0xC6, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2, i8imm:$src3), "shufpd {$src3, $src2, $dst|$dst, $src2, $src3}", @@ -1302,23 +1219,33 @@ def UNPCKLPDrm : PDI<0x14, MRMSrcMem, } // Horizontal ops + +class S3D_Intrr o, string OpcodeStr, Intrinsic IntId> + : S3DI; +class S3D_Intrm o, string OpcodeStr, Intrinsic IntId> + : S3DI; +class S3_Intrr o, string OpcodeStr, Intrinsic IntId> + : S3I; +class S3_Intrm o, string OpcodeStr, Intrinsic IntId> + : S3I; + let isTwoAddress = 1 in { -def HADDPSrr : S3D_Intrr<0x7C, "haddps {$src2, $dst|$dst, $src2}", - int_x86_sse3_hadd_ps>; -def HADDPSrm : S3D_Intrm<0x7C, "haddps {$src2, $dst|$dst, $src2}", - int_x86_sse3_hadd_ps>; -def HADDPDrr : S3_Intrr<0x7C, "haddpd {$src2, $dst|$dst, $src2}", - int_x86_sse3_hadd_pd>; -def HADDPDrm : S3_Intrm<0x7C, "haddpd {$src2, $dst|$dst, $src2}", - int_x86_sse3_hadd_pd>; -def HSUBPSrr : S3D_Intrr<0x7D, "hsubps {$src2, $dst|$dst, $src2}", - int_x86_sse3_hsub_ps>; -def HSUBPSrm : S3D_Intrm<0x7D, "hsubps {$src2, $dst|$dst, $src2}", - int_x86_sse3_hsub_ps>; -def HSUBPDrr : S3_Intrr<0x7D, "hsubpd {$src2, $dst|$dst, $src2}", - int_x86_sse3_hsub_pd>; -def HSUBPDrm : S3_Intrm<0x7D, "hsubpd {$src2, $dst|$dst, $src2}", - int_x86_sse3_hsub_pd>; +def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>; +def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>; +def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>; +def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>; +def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>; +def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>; +def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>; +def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>; } //===----------------------------------------------------------------------===// @@ -1401,19 +1328,19 @@ def PADDUSWrr : PDI<0xDD, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), def PADDSBrm : PDI<0xEC, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "paddsb {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_padds_b VR128:$src1, - (bc_v16i8 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; def PADDSWrm : PDI<0xED, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "paddsw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_padds_w VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; def PADDUSBrm : PDI<0xDC, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "paddusb {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_paddus_b VR128:$src1, - (bc_v16i8 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; def PADDUSWrm : PDI<0xDD, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "paddusw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_paddus_w VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; def PSUBBrr : PDI<0xF8, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), @@ -1467,22 +1394,22 @@ def PSUBSBrm : PDI<0xE8, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "psubsb {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psubs_b VR128:$src1, - (bc_v16i8 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; def PSUBSWrm : PDI<0xE9, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "psubsw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psubs_w VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; def PSUBUSBrm : PDI<0xD8, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "psubusb {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psubus_b VR128:$src1, - (bc_v16i8 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; def PSUBUSWrm : PDI<0xD9, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "psubusw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psubus_w VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; let isCommutable = 1 in { def PMULHUWrr : PDI<0xE4, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), @@ -1501,23 +1428,23 @@ def PMULUDQrr : PDI<0xF4, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), [(set VR128:$dst, (int_x86_sse2_pmulu_dq VR128:$src1, VR128:$src2))]>; } -def PMULHUWrm : PDI<0xE4, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), +def PMULHUWrm : PDI<0xE4, MRMSrcMem, (ops VR128:$dst,VR128:$src1,i128mem:$src2), "pmulhuw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_pmulhu_w VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; -def PMULHWrm : PDI<0xE5, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + (bitconvert (loadv2i64 addr:$src2))))]>; +def PMULHWrm : PDI<0xE5, MRMSrcMem, (ops VR128:$dst, VR128:$src1,i128mem:$src2), "pmulhw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_pmulh_w VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; def PMULLWrm : PDI<0xD5, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "pmullw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v8i16 (mul VR128:$src1, (bc_v8i16 (loadv2i64 addr:$src2)))))]>; -def PMULUDQrm : PDI<0xF4, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), +def PMULUDQrm : PDI<0xF4, MRMSrcMem, (ops VR128:$dst,VR128:$src1,i128mem:$src2), "pmuludq {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_pmulu_dq VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; let isCommutable = 1 in { def PMADDWDrr : PDI<0xF5, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), @@ -1529,7 +1456,7 @@ def PMADDWDrm : PDI<0xF5, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "pmaddwd {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_pmadd_wd VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; let isCommutable = 1 in { def PAVGBrr : PDI<0xE0, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), @@ -1544,11 +1471,11 @@ def PAVGWrr : PDI<0xE3, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), def PAVGBrm : PDI<0xE0, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "pavgb {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_pavg_b VR128:$src1, - (bc_v16i8 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; def PAVGWrm : PDI<0xE3, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "pavgw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_pavg_w VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; let isCommutable = 1 in { def PMAXUBrr : PDI<0xDE, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), @@ -1563,11 +1490,11 @@ def PMAXSWrr : PDI<0xEE, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), def PMAXUBrm : PDI<0xDE, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "pmaxub {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_pmaxu_b VR128:$src1, - (bc_v16i8 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; def PMAXSWrm : PDI<0xEE, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), "pmaxsw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_pmaxs_w VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; let isCommutable = 1 in { def PMINUBrr : PDI<0xDA, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), @@ -1579,14 +1506,14 @@ def PMINSWrr : PDI<0xEA, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), [(set VR128:$dst, (int_x86_sse2_pmins_w VR128:$src1, VR128:$src2))]>; } -def PMINUBrm : PDI<0xDA, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), +def PMINUBrm : PDI<0xDA, MRMSrcMem, (ops VR128:$dst, VR128:$src1,i128mem:$src2), "pminub {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_pminu_b VR128:$src1, - (bc_v16i8 (loadv2i64 addr:$src2))))]>; -def PMINSWrm : PDI<0xEA, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + (bitconvert (loadv2i64 addr:$src2))))]>; +def PMINSWrm : PDI<0xEA, MRMSrcMem, (ops VR128:$dst, VR128:$src1,i128mem:$src2), "pminsw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_pmins_w VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; let isCommutable = 1 in { @@ -1595,45 +1522,45 @@ def PSADBWrr : PDI<0xE0, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), [(set VR128:$dst, (int_x86_sse2_psad_bw VR128:$src1, VR128:$src2))]>; } -def PSADBWrm : PDI<0xE0, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), +def PSADBWrm : PDI<0xE0, MRMSrcMem, (ops VR128:$dst, VR128:$src1,i128mem:$src2), "psadbw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psad_bw VR128:$src1, - (bc_v16i8 (loadv2i64 addr:$src2))))]>; + (bitconvert (loadv2i64 addr:$src2))))]>; } let isTwoAddress = 1 in { -def PSLLWrr : PDIi8<0xF1, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), - "psllw {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psll_w VR128:$src1, - VR128:$src2))]>; -def PSLLWrm : PDIi8<0xF1, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "psllw {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psll_w VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2))))]>; +def PSLLWrr : PDI<0xF1, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "psllw {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psll_w VR128:$src1, + VR128:$src2))]>; +def PSLLWrm : PDI<0xF1, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "psllw {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psll_w VR128:$src1, + (bitconvert (loadv2i64 addr:$src2))))]>; def PSLLWri : PDIi8<0x71, MRM6r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), "psllw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psll_w VR128:$src1, (scalar_to_vector (i32 imm:$src2))))]>; -def PSLLDrr : PDIi8<0xF2, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), - "pslld {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psll_d VR128:$src1, - VR128:$src2))]>; -def PSLLDrm : PDIi8<0xF2, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "pslld {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psll_d VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2))))]>; +def PSLLDrr : PDI<0xF2, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "pslld {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psll_d VR128:$src1, + VR128:$src2))]>; +def PSLLDrm : PDI<0xF2, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "pslld {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psll_d VR128:$src1, + (bitconvert (loadv2i64 addr:$src2))))]>; def PSLLDri : PDIi8<0x72, MRM6r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), "pslld {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psll_d VR128:$src1, (scalar_to_vector (i32 imm:$src2))))]>; -def PSLLQrr : PDIi8<0xF3, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), - "psllq {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psll_q VR128:$src1, - VR128:$src2))]>; -def PSLLQrm : PDIi8<0xF3, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "psllq {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psll_q VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2))))]>; +def PSLLQrr : PDI<0xF3, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "psllq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psll_q VR128:$src1, + VR128:$src2))]>; +def PSLLQrm : PDI<0xF3, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "psllq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psll_q VR128:$src1, + (bitconvert (loadv2i64 addr:$src2))))]>; def PSLLQri : PDIi8<0x73, MRM6r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), "psllq {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psll_q VR128:$src1, @@ -1641,38 +1568,38 @@ def PSLLQri : PDIi8<0x73, MRM6r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), def PSLLDQri : PDIi8<0x73, MRM7r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), "pslldq {$src2, $dst|$dst, $src2}", []>; -def PSRLWrr : PDIi8<0xD1, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), - "psrlw {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psrl_w VR128:$src1, - VR128:$src2))]>; -def PSRLWrm : PDIi8<0xD1, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "psrlw {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psrl_w VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2))))]>; +def PSRLWrr : PDI<0xD1, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "psrlw {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psrl_w VR128:$src1, + VR128:$src2))]>; +def PSRLWrm : PDI<0xD1, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "psrlw {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psrl_w VR128:$src1, + (bitconvert (loadv2i64 addr:$src2))))]>; def PSRLWri : PDIi8<0x71, MRM2r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), "psrlw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psrl_w VR128:$src1, (scalar_to_vector (i32 imm:$src2))))]>; -def PSRLDrr : PDIi8<0xD2, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), - "psrld {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psrl_d VR128:$src1, - VR128:$src2))]>; -def PSRLDrm : PDIi8<0xD2, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "psrld {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psrl_d VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2))))]>; +def PSRLDrr : PDI<0xD2, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "psrld {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psrl_d VR128:$src1, + VR128:$src2))]>; +def PSRLDrm : PDI<0xD2, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "psrld {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psrl_d VR128:$src1, + (bitconvert (loadv2i64 addr:$src2))))]>; def PSRLDri : PDIi8<0x72, MRM2r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), "psrld {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psrl_d VR128:$src1, (scalar_to_vector (i32 imm:$src2))))]>; -def PSRLQrr : PDIi8<0xD3, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), - "psrlq {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psrl_q VR128:$src1, - VR128:$src2))]>; -def PSRLQrm : PDIi8<0xD3, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "psrlq {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psrl_q VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2))))]>; +def PSRLQrr : PDI<0xD3, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "psrlq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psrl_q VR128:$src1, + VR128:$src2))]>; +def PSRLQrm : PDI<0xD3, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "psrlq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psrl_q VR128:$src1, + (bitconvert (loadv2i64 addr:$src2))))]>; def PSRLQri : PDIi8<0x73, MRM2r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), "psrlq {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psrl_q VR128:$src1, @@ -1680,26 +1607,26 @@ def PSRLQri : PDIi8<0x73, MRM2r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), def PSRLDQri : PDIi8<0x73, MRM3r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), "psrldq {$src2, $dst|$dst, $src2}", []>; -def PSRAWrr : PDIi8<0xE1, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), - "psraw {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psra_w VR128:$src1, - VR128:$src2))]>; -def PSRAWrm : PDIi8<0xE1, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "psraw {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psra_w VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2))))]>; +def PSRAWrr : PDI<0xE1, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "psraw {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psra_w VR128:$src1, + VR128:$src2))]>; +def PSRAWrm : PDI<0xE1, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "psraw {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psra_w VR128:$src1, + (bitconvert (loadv2i64 addr:$src2))))]>; def PSRAWri : PDIi8<0x71, MRM4r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), "psraw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psra_w VR128:$src1, (scalar_to_vector (i32 imm:$src2))))]>; -def PSRADrr : PDIi8<0xE2, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), - "psrad {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psra_d VR128:$src1, - VR128:$src2))]>; -def PSRADrm : PDIi8<0xE2, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "psrad {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_psra_d VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2))))]>; +def PSRADrr : PDI<0xE2, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "psrad {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psra_d VR128:$src1, + VR128:$src2))]>; +def PSRADrm : PDI<0xE2, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "psrad {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_psra_d VR128:$src1, + (bitconvert (loadv2i64 addr:$src2))))]>; def PSRADri : PDIi8<0x72, MRM4r, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), "psrad {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_psra_d VR128:$src1, @@ -1744,69 +1671,27 @@ def PANDNrm : PDI<0xDF, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), (load addr:$src2))))]>; } -// SSE2 Integer comparison + let isTwoAddress = 1 in { -def PCMPEQBrr : PDI<0x74, MRMSrcReg, - (ops VR128:$dst, VR128:$src1, VR128:$src2), - "pcmpeqb {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpeq_b VR128:$src1, - VR128:$src2))]>; -def PCMPEQBrm : PDI<0x74, MRMSrcMem, - (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "pcmpeqb {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpeq_b VR128:$src1, - (bc_v16i8 (loadv2i64 addr:$src2))))]>; -def PCMPEQWrr : PDI<0x75, MRMSrcReg, - (ops VR128:$dst, VR128:$src1, VR128:$src2), - "pcmpeqw {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpeq_w VR128:$src1, - VR128:$src2))]>; -def PCMPEQWrm : PDI<0x75, MRMSrcMem, - (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "pcmpeqw {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpeq_w VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; -def PCMPEQDrr : PDI<0x76, MRMSrcReg, - (ops VR128:$dst, VR128:$src1, VR128:$src2), - "pcmpeqd {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpeq_d VR128:$src1, - VR128:$src2))]>; -def PCMPEQDrm : PDI<0x76, MRMSrcMem, - (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "pcmpeqd {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpeq_d VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2))))]>; +multiclass PDI_binop_rm opc, string OpcodeStr, Intrinsic IntId> { + def rr : PDI; + def rm : PDI; +} +} -def PCMPGTBrr : PDI<0x64, MRMSrcReg, - (ops VR128:$dst, VR128:$src1, VR128:$src2), - "pcmpgtb {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpgt_b VR128:$src1, - VR128:$src2))]>; -def PCMPGTBrm : PDI<0x64, MRMSrcMem, - (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "pcmpgtb {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpgt_b VR128:$src1, - (bc_v16i8 (loadv2i64 addr:$src2))))]>; -def PCMPGTWrr : PDI<0x65, MRMSrcReg, - (ops VR128:$dst, VR128:$src1, VR128:$src2), - "pcmpgtw {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpgt_w VR128:$src1, - VR128:$src2))]>; -def PCMPGTWrm : PDI<0x65, MRMSrcMem, - (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "pcmpgtw {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpgt_w VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2))))]>; -def PCMPGTDrr : PDI<0x66, MRMSrcReg, - (ops VR128:$dst, VR128:$src1, VR128:$src2), - "pcmpgtd {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpgt_d VR128:$src1, - VR128:$src2))]>; -def PCMPGTDrm : PDI<0x66, MRMSrcMem, - (ops VR128:$dst, VR128:$src1, i128mem:$src2), - "pcmpgtd {$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_pcmpgt_d VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2))))]>; +// SSE2 Integer comparison +let isTwoAddress = 1 in { +defm PCMPEQB : PDI_binop_rm<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>; +defm PCMPEQW : PDI_binop_rm<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>; +defm PCMPEQD : PDI_binop_rm<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>; +defm PCMPGTB : PDI_binop_rm<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>; +defm PCMPGTW : PDI_binop_rm<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>; +defm PCMPGTD : PDI_binop_rm<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>; } // Pack instructions @@ -1822,7 +1707,7 @@ def PACKSSWBrm : PDI<0x63, MRMSrcMem, (ops VR128:$dst, VR128:$src1, "packsswb {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v8i16 (int_x86_sse2_packsswb_128 VR128:$src1, - (bc_v8i16 (loadv2f64 addr:$src2)))))]>; + (bitconvert (loadv2f64 addr:$src2)))))]>; def PACKSSDWrr : PDI<0x6B, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), "packssdw {$src2, $dst|$dst, $src2}", @@ -1834,7 +1719,7 @@ def PACKSSDWrm : PDI<0x6B, MRMSrcMem, (ops VR128:$dst, VR128:$src1, "packssdw {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v4i32 (int_x86_sse2_packssdw_128 VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2)))))]>; + (bitconvert (loadv2i64 addr:$src2)))))]>; def PACKUSWBrr : PDI<0x67, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), "packuswb {$src2, $dst|$dst, $src2}", @@ -1846,7 +1731,7 @@ def PACKUSWBrm : PDI<0x67, MRMSrcMem, (ops VR128:$dst, VR128:$src1, "packuswb {$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v8i16 (int_x86_sse2_packuswb_128 VR128:$src1, - (bc_v8i16 (loadv2i64 addr:$src2)))))]>; + (bitconvert (loadv2i64 addr:$src2)))))]>; } // Shuffle and unpack instructions @@ -1860,7 +1745,7 @@ def PSHUFDmi : PDIi8<0x70, MRMSrcMem, (ops VR128:$dst, i128mem:$src1, i8imm:$src2), "pshufd {$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (v4i32 (vector_shuffle - (bc_v4i32 (loadv2i64 addr:$src1)), + (bc_v4i32(loadv2i64 addr:$src1)), (undef), PSHUFD_shuffle_mask:$src2)))]>; @@ -2044,7 +1929,7 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (ops GR32:$dst, VR128:$src), [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>; // Conditional store -def MASKMOVDQU : PDI<0xF7, RawFrm, (ops VR128:$src, VR128:$mask), +def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (ops VR128:$src, VR128:$mask), "maskmovdqu {$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, Imp<[EDI],[]>; @@ -2109,15 +1994,9 @@ def MWAIT : I<0xC9, RawFrm, (ops), "mwait", // Alias instructions that map zero vector to pxor / xorp* for sse. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. -def V_SET0_PI : PDI<0xEF, MRMInitReg, (ops VR128:$dst), - "pxor $dst, $dst", - [(set VR128:$dst, (v2i64 immAllZerosV))]>; -def V_SET0_PS : PSI<0x57, MRMInitReg, (ops VR128:$dst), - "xorps $dst, $dst", - [(set VR128:$dst, (v4f32 immAllZerosV))]>; -def V_SET0_PD : PDI<0x57, MRMInitReg, (ops VR128:$dst), - "xorpd $dst, $dst", - [(set VR128:$dst, (v2f64 immAllZerosV))]>; +def V_SET0 : PSI<0x57, MRMInitReg, (ops VR128:$dst), + "xorps $dst, $dst", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; def V_SETALLONES : PDI<0x76, MRMInitReg, (ops VR128:$dst), "pcmpeqd $dst, $dst", @@ -2234,7 +2113,7 @@ def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src), def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR32:$src), "movd {$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (vector_shuffle immAllZerosV, - (v4i32 (scalar_to_vector GR32:$src)), + (v4i32 (scalar_to_vector GR32:$src)), MOVL_shuffle_mask)))]>; def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src), "movd {$src, $dst|$dst, $src}", @@ -2249,7 +2128,7 @@ def MOVZQI2PQIrr : I<0x7E, MRMSrcReg, (ops VR128:$dst, VR128:$src), def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src), "movq {$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_movl_dq - (bc_v4i32 (loadv2i64 addr:$src))))]>, + (bitconvert (loadv2i64 addr:$src))))]>, XS, Requires<[HasSSE2]>; } @@ -2265,16 +2144,18 @@ def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>; // 128-bit vector all zero's. -def : Pat<(v16i8 immAllZerosV), (v16i8 (V_SET0_PI))>, Requires<[HasSSE2]>; -def : Pat<(v8i16 immAllZerosV), (v8i16 (V_SET0_PI))>, Requires<[HasSSE2]>; -def : Pat<(v4i32 immAllZerosV), (v4i32 (V_SET0_PI))>, Requires<[HasSSE2]>; +def : Pat<(v16i8 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>; +def : Pat<(v8i16 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>; +def : Pat<(v4i32 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>; +def : Pat<(v2f64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>; // 128-bit vector all one's. -def : Pat<(v16i8 immAllOnesV), (v16i8 (V_SETALLONES))>, Requires<[HasSSE2]>; -def : Pat<(v8i16 immAllOnesV), (v8i16 (V_SETALLONES))>, Requires<[HasSSE2]>; -def : Pat<(v4i32 immAllOnesV), (v4i32 (V_SETALLONES))>, Requires<[HasSSE2]>; -def : Pat<(v2i64 immAllOnesV), (v2i64 (V_SETALLONES))>, Requires<[HasSSE2]>; -def : Pat<(v4f32 immAllOnesV), (v4f32 (V_SETALLONES))>, Requires<[HasSSE1]>; +def : Pat<(v16i8 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>; +def : Pat<(v8i16 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>; +def : Pat<(v4i32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>; +def : Pat<(v4f32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE1]>; // Store 128-bit integer vector values. def : Pat<(store (v16i8 VR128:$src), addr:$dst), @@ -2286,227 +2167,199 @@ def : Pat<(store (v4i32 VR128:$src), addr:$dst), // Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or // 16-bits matter. -def : Pat<(v8i16 (X86s2vec GR32:$src)), (v8i16 (MOVDI2PDIrr GR32:$src))>, +def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>; -def : Pat<(v16i8 (X86s2vec GR32:$src)), (v16i8 (MOVDI2PDIrr GR32:$src))>, +def : Pat<(v16i8 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>; // bit_convert -def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v4i32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v4i32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>, - Requires<[HasSSE2]>; -def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>, - Requires<[HasSSE2]>; +let Predicates = [HasSSE2] in { + def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; +} // Move scalar to XMM zero-extended // movd to XMM register zero-extends let AddedComplexity = 20 in { def : Pat<(v8i16 (vector_shuffle immAllZerosV, (v8i16 (X86s2vec GR32:$src)), MOVL_shuffle_mask)), - (v8i16 (MOVZDI2PDIrr GR32:$src))>, Requires<[HasSSE2]>; + (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>; def : Pat<(v16i8 (vector_shuffle immAllZerosV, (v16i8 (X86s2vec GR32:$src)), MOVL_shuffle_mask)), - (v16i8 (MOVZDI2PDIrr GR32:$src))>, Requires<[HasSSE2]>; + (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>; // Zeroing a VR128 then do a MOVS{S|D} to the lower bits. def : Pat<(v2f64 (vector_shuffle immAllZerosV, (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)), - (v2f64 (MOVLSD2PDrr (V_SET0_PD), FR64:$src))>, Requires<[HasSSE2]>; + (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>; def : Pat<(v4f32 (vector_shuffle immAllZerosV, (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)), - (v4f32 (MOVLSS2PSrr (V_SET0_PS), FR32:$src))>, Requires<[HasSSE2]>; + (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>; } // Splat v2f64 / v2i64 let AddedComplexity = 10 in { def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), SSE_splat_v2_mask:$sm), - (v2f64 (UNPCKLPDrr VR128:$src, VR128:$src))>, Requires<[HasSSE2]>; + (UNPCKLPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), SSE_splat_v2_mask:$sm), - (v2i64 (PUNPCKLQDQrr VR128:$src, VR128:$src))>, Requires<[HasSSE2]>; + (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; } // Splat v4f32 def : Pat<(vector_shuffle (v4f32 VR128:$src), (undef), SSE_splat_mask:$sm), - (v4f32 (SHUFPSrri VR128:$src, VR128:$src, SSE_splat_mask:$sm))>, + (SHUFPSrri VR128:$src, VR128:$src, SSE_splat_mask:$sm)>, Requires<[HasSSE1]>; // Special unary SHUFPSrri case. // FIXME: when we want non two-address code, then we should use PSHUFD? def : Pat<(vector_shuffle (v4f32 VR128:$src1), (undef), SHUFP_unary_shuffle_mask:$sm), - (v4f32 (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm))>, + (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>, Requires<[HasSSE1]>; // Unary v4f32 shuffle with PSHUF* in order to fold a load. def : Pat<(vector_shuffle (loadv4f32 addr:$src1), (undef), SHUFP_unary_shuffle_mask:$sm), - (v4f32 (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm))>, + (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>, Requires<[HasSSE2]>; // Special binary v4i32 shuffle cases with SHUFPS. def : Pat<(vector_shuffle (v4i32 VR128:$src1), (v4i32 VR128:$src2), PSHUFD_binary_shuffle_mask:$sm), - (v4i32 (SHUFPSrri VR128:$src1, VR128:$src2, - PSHUFD_binary_shuffle_mask:$sm))>, Requires<[HasSSE2]>; + (SHUFPSrri VR128:$src1, VR128:$src2, PSHUFD_binary_shuffle_mask:$sm)>, + Requires<[HasSSE2]>; def : Pat<(vector_shuffle (v4i32 VR128:$src1), (bc_v4i32 (loadv2i64 addr:$src2)), PSHUFD_binary_shuffle_mask:$sm), - (v4i32 (SHUFPSrmi VR128:$src1, addr:$src2, - PSHUFD_binary_shuffle_mask:$sm))>, Requires<[HasSSE2]>; + (SHUFPSrmi VR128:$src1, addr:$src2, PSHUFD_binary_shuffle_mask:$sm)>, + Requires<[HasSSE2]>; // vector_shuffle v1, , <0, 0, 1, 1, ...> let AddedComplexity = 10 in { def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef), UNPCKL_v_undef_shuffle_mask)), - (v4f32 (UNPCKLPSrr VR128:$src, VR128:$src))>, Requires<[HasSSE2]>; + (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; def : Pat<(v16i8 (vector_shuffle VR128:$src, (undef), UNPCKL_v_undef_shuffle_mask)), - (v16i8 (PUNPCKLBWrr VR128:$src, VR128:$src))>, Requires<[HasSSE2]>; + (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef), UNPCKL_v_undef_shuffle_mask)), - (v8i16 (PUNPCKLWDrr VR128:$src, VR128:$src))>, Requires<[HasSSE2]>; + (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), UNPCKL_v_undef_shuffle_mask)), - (v4i32 (PUNPCKLDQrr VR128:$src, VR128:$src))>, Requires<[HasSSE1]>; + (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>; } let AddedComplexity = 20 in { // vector_shuffle v1, <1, 1, 3, 3> def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), MOVSHDUP_shuffle_mask)), - (v4i32 (MOVSHDUPrr VR128:$src))>, Requires<[HasSSE3]>; + (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>; def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef), MOVSHDUP_shuffle_mask)), - (v4i32 (MOVSHDUPrm addr:$src))>, Requires<[HasSSE3]>; + (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>; // vector_shuffle v1, <0, 0, 2, 2> def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), MOVSLDUP_shuffle_mask)), - (v4i32 (MOVSLDUPrr VR128:$src))>, Requires<[HasSSE3]>; + (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>; def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef), MOVSLDUP_shuffle_mask)), - (v4i32 (MOVSLDUPrm addr:$src))>, Requires<[HasSSE3]>; + (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>; } let AddedComplexity = 20 in { // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, MOVHP_shuffle_mask)), - (v4i32 (MOVLHPSrr VR128:$src1, VR128:$src2))>; + (MOVLHPSrr VR128:$src1, VR128:$src2)>; // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, MOVHLPS_shuffle_mask)), - (v4i32 (MOVHLPSrr VR128:$src1, VR128:$src2))>; + (MOVHLPSrr VR128:$src1, VR128:$src2)>; // vector_shuffle v1, undef <2, 3, ?, ?> using MOVHLPS def : Pat<(v4f32 (vector_shuffle VR128:$src1, (undef), UNPCKH_shuffle_mask)), - (v4f32 (MOVHLPSrr VR128:$src1, VR128:$src1))>; + (MOVHLPSrr VR128:$src1, VR128:$src1)>; def : Pat<(v4i32 (vector_shuffle VR128:$src1, (undef), UNPCKH_shuffle_mask)), - (v4i32 (MOVHLPSrr VR128:$src1, VR128:$src1))>; + (MOVHLPSrr VR128:$src1, VR128:$src1)>; // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS // vector_shuffle v1, (load v2) <0, 1, 4, 5> using MOVHPS def : Pat<(v4f32 (vector_shuffle VR128:$src1, (loadv4f32 addr:$src2), MOVLP_shuffle_mask)), - (v4f32 (MOVLPSrm VR128:$src1, addr:$src2))>, Requires<[HasSSE1]>; + (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>; def : Pat<(v2f64 (vector_shuffle VR128:$src1, (loadv2f64 addr:$src2), MOVLP_shuffle_mask)), - (v2f64 (MOVLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; + (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; def : Pat<(v4f32 (vector_shuffle VR128:$src1, (loadv4f32 addr:$src2), MOVHP_shuffle_mask)), - (v4f32 (MOVHPSrm VR128:$src1, addr:$src2))>, Requires<[HasSSE1]>; + (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>; def : Pat<(v2f64 (vector_shuffle VR128:$src1, (loadv2f64 addr:$src2), MOVHP_shuffle_mask)), - (v2f64 (MOVHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; + (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)), MOVLP_shuffle_mask)), - (v4i32 (MOVLPSrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; + (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2), MOVLP_shuffle_mask)), - (v2i64 (MOVLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; + (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)), MOVHP_shuffle_mask)), - (v4i32 (MOVHPSrm VR128:$src1, addr:$src2))>, Requires<[HasSSE1]>; + (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>; def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2), MOVLP_shuffle_mask)), - (v2i64 (MOVLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; + (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; // Setting the lowest element in the vector. def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, MOVL_shuffle_mask)), - (v4i32 (MOVLPSrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; + (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2, MOVL_shuffle_mask)), - (v2i64 (MOVLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; + (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; // vector_shuffle v1, v2 <4, 5, 2, 3> using MOVLPDrr (movsd) def : Pat<(v4f32 (vector_shuffle VR128:$src1, VR128:$src2, MOVLP_shuffle_mask)), - (v4f32 (MOVLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; + (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, MOVLP_shuffle_mask)), - (v4i32 (MOVLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; + (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; // Set lowest element and zero upper elements. def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV, (v2f64 (scalar_to_vector (loadf64 addr:$src))), MOVL_shuffle_mask)), - (v2i64 (MOVZQI2PQIrm addr:$src))>, Requires<[HasSSE2]>; + (MOVZQI2PQIrm addr:$src)>, Requires<[HasSSE2]>; } // FIXME: Temporary workaround since 2-wide shuffle is broken. @@ -2550,20 +2403,24 @@ def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), // Some special case pandn patterns. def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))), VR128:$src2)), - (v2i64 (PANDNrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; + (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))), VR128:$src2)), - (v2i64 (PANDNrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; + (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))), VR128:$src2)), - (v2i64 (PANDNrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; + (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))), (load addr:$src2))), - (v2i64 (PANDNrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; + (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))), (load addr:$src2))), - (v2i64 (PANDNrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; + (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))), (load addr:$src2))), - (v2i64 (PANDNrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; + (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; + +// Unaligned load +def : Pat<(v4f32 (X86loadu addr:$src)), (MOVUPSrm addr:$src)>, + Requires<[HasSSE1]>;