From: Simon Pilgrim Date: Sat, 14 Feb 2015 22:40:46 +0000 (+0000) Subject: [X86][XOP] Enable commutation for XOP instructions X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=6d5ee8a8b598850c80a4a33be616ef87906c2d8d;p=oota-llvm.git [X86][XOP] Enable commutation for XOP instructions Patch to allow XOP instructions (integer comparison and integer multiply-add) to be commuted. The comparison instructions sometimes require the compare mode to be flipped but the remaining instructions can use default commutation modes. This patch also sets the SSE domains of all the XOP instructions. Differential Revision: http://reviews.llvm.org/D7646 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229267 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 0707f021a71..57a078ef95c 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -2906,6 +2906,32 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { return nullptr; } } + case X86::VPCOMBri: case X86::VPCOMUBri: + case X86::VPCOMDri: case X86::VPCOMUDri: + case X86::VPCOMQri: case X86::VPCOMUQri: + case X86::VPCOMWri: case X86::VPCOMUWri: { + // Flip comparison mode immediate (if necessary). + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: Imm = 0x02; break; // LT -> GT + case 0x01: Imm = 0x03; break; // LE -> GE + case 0x02: Imm = 0x00; break; // GT -> LT + case 0x03: Imm = 0x01; break; // GE -> LE + case 0x04: // EQ + case 0x05: // NE + case 0x06: // FALSE + case 0x07: // TRUE + default: + break; + } + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm(Imm); + return TargetInstrInfo::commuteInstruction(MI, NewMI); + } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index f3a202d0d5d..8455b8d8467 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ -20,21 +20,23 @@ multiclass xop2op opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; } -defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>; -defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>; -defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>; -defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>; -defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>; -defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>; -defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>; -defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>; -defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>; -defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>; -defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>; -defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>; -defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>; -defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>; -defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>; +let ExeDomain = SSEPackedInt in { + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>; + defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>; +} // Scalar load 2 addr operand instructions multiclass xop2opsld opc, string OpcodeStr, Intrinsic Int, @@ -47,11 +49,6 @@ multiclass xop2opsld opc, string OpcodeStr, Intrinsic Int, [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP; } -defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, - ssmem, sse_load_f32>; -defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, - sdmem, sse_load_f64>; - multiclass xop2op128 opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rr : IXOP opc, string OpcodeStr, Intrinsic Int, [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; } -defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>; -defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>; - multiclass xop2op256 opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rrY : IXOP opc, string OpcodeStr, Intrinsic Int, [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L; } -defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>; -defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; +let ExeDomain = SSEPackedSingle in { + defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, + ssmem, sse_load_f32>; + defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>; + defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, + sdmem, sse_load_f64>; + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>; + defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; +} multiclass xop3op opc, string OpcodeStr, Intrinsic Int> { def rr : IXOP opc, string OpcodeStr, Intrinsic Int> { XOP_4VOp3; } -defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; -defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; -defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; -defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; -defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; -defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; -defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; -defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; -defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; -defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; -defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; -defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; +let ExeDomain = SSEPackedInt in { + defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; + defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; + defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; + defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; + defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; + defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; + defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; + defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; + defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; + defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; + defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; +} multiclass xop3opimm opc, string OpcodeStr, Intrinsic Int> { def ri : IXOPi8 opc, string OpcodeStr, Intrinsic Int> { (Int (bitconvert (loadv2i64 addr:$src1)), imm:$src2))]>, XOP; } -defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; -defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; -defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; -defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; +let ExeDomain = SSEPackedInt in { + defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; + defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; + defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; +} // Instruction where second source can be memory, but third must be register multiclass xop4opm2 opc, string OpcodeStr, Intrinsic Int> { + let isCommutable = 1 in def rr : IXOPi8 opc, string OpcodeStr, Intrinsic Int> { VR128:$src3))]>, XOP_4V, VEX_I8IMM; } -defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; -defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; -defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; -defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; -defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; -defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; -defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; -defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; -defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; -defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; -defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; -defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +let ExeDomain = SSEPackedInt in { + defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; + defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; + defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; + defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; + defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; + defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; + defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; + defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; + defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; + defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; + defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; + defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +} // Instruction where second source can be memory, third must be imm8 multiclass xopvpcom opc, string Suffix, Intrinsic Int> { + let isCommutable = 1 in def ri : IXOPi8 opc, string Suffix, Intrinsic Int> { } } -defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>; -defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>; -defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>; -defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>; -defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>; -defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>; -defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>; -defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>; +let ExeDomain = SSEPackedInt in { // SSE integer instructions + defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>; + defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>; + defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>; + defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>; + defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>; + defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>; + defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>; + defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>; +} // Instruction where either second or third source can be memory multiclass xop4op opc, string OpcodeStr, Intrinsic Int> { @@ -222,8 +237,10 @@ multiclass xop4op opc, string OpcodeStr, Intrinsic Int> { XOP_4V, VEX_I8IMM; } -defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; -defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +let ExeDomain = SSEPackedInt in { + defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; + defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +} multiclass xop4op256 opc, string OpcodeStr, Intrinsic Int> { def rrY : IXOPi8 opc, string OpcodeStr, Intrinsic Int> { XOP_4V, VEX_I8IMM, VEX_L; } -defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; +let ExeDomain = SSEPackedInt in + defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; multiclass xop5op opc, string OpcodeStr, Intrinsic Int128, Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { @@ -295,8 +313,11 @@ multiclass xop5op opc, string OpcodeStr, Intrinsic Int128, VEX_L; } -defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, - int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>; -defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, - int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>; +let ExeDomain = SSEPackedDouble in + defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, + int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>; + +let ExeDomain = SSEPackedSingle in + defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, + int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>; diff --git a/test/CodeGen/X86/commute-xop.ll b/test/CodeGen/X86/commute-xop.ll new file mode 100644 index 00000000000..a3e14feddbc --- /dev/null +++ b/test/CodeGen/X86/commute-xop.ll @@ -0,0 +1,184 @@ +; RUN: llc -O3 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+xop < %s | FileCheck %s + +define <16 x i8> @commute_fold_vpcomb(<16 x i8>* %a0, <16 x i8> %a1) { + ;CHECK-LABEL: commute_fold_vpcomb + ;CHECK: vpcomgtb (%rdi), %xmm0, %xmm0 + %1 = load <16 x i8>* %a0 + %2 = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %1, <16 x i8> %a1, i8 0) ; vpcomltb + ret <16 x i8> %2 +} +declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone + +define <4 x i32> @commute_fold_vpcomd(<4 x i32>* %a0, <4 x i32> %a1) { + ;CHECK-LABEL: commute_fold_vpcomd + ;CHECK: vpcomged (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %1, <4 x i32> %a1, i8 1) ; vpcomled + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone + +define <2 x i64> @commute_fold_vpcomq(<2 x i64>* %a0, <2 x i64> %a1) { + ;CHECK-LABEL: commute_fold_vpcomq + ;CHECK: vpcomltq (%rdi), %xmm0, %xmm0 + %1 = load <2 x i64>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %1, <2 x i64> %a1, i8 2) ; vpcomgtq + ret <2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone + +define <16 x i8> @commute_fold_vpcomub(<16 x i8>* %a0, <16 x i8> %a1) { + ;CHECK-LABEL: commute_fold_vpcomub + ;CHECK: vpcomleub (%rdi), %xmm0, %xmm0 + %1 = load <16 x i8>* %a0 + %2 = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %1, <16 x i8> %a1, i8 3) ; vpcomgeub + ret <16 x i8> %2 +} +declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone + +define <4 x i32> @commute_fold_vpcomud(<4 x i32>* %a0, <4 x i32> %a1) { + ;CHECK-LABEL: commute_fold_vpcomud + ;CHECK: vpcomequd (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %1, <4 x i32> %a1, i8 4) ; vpcomequd + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone + +define <2 x i64> @commute_fold_vpcomuq(<2 x i64>* %a0, <2 x i64> %a1) { + ;CHECK-LABEL: commute_fold_vpcomuq + ;CHECK: vpcomnequq (%rdi), %xmm0, %xmm0 + %1 = load <2 x i64>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %1, <2 x i64> %a1, i8 5) ; vpcomnequq + ret <2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone + +define <8 x i16> @commute_fold_vpcomuw(<8 x i16>* %a0, <8 x i16> %a1) { + ;CHECK-LABEL: commute_fold_vpcomuw + ;CHECK: vpcomfalseuw (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %1, <8 x i16> %a1, i8 6) ; vpcomfalseuw + ret <8 x i16> %2 +} +declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone + +define <8 x i16> @commute_fold_vpcomw(<8 x i16>* %a0, <8 x i16> %a1) { + ;CHECK-LABEL: commute_fold_vpcomw + ;CHECK: vpcomtruew (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %1, <8 x i16> %a1, i8 7) ; vpcomtruew + ret <8 x i16> %2 +} +declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone + +define <4 x i32> @commute_fold_vpmacsdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmacsdd + ;CHECK: vpmacsdd %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone + +define <2 x i64> @commute_fold_vpmacsdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) { + ;CHECK-LABEL: commute_fold_vpmacsdqh + ;CHECK: vpmacsdqh %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2) + ret <2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone + +define <2 x i64> @commute_fold_vpmacsdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) { + ;CHECK-LABEL: commute_fold_vpmacsdql + ;CHECK: vpmacsdql %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2) + ret <2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone + +define <4 x i32> @commute_fold_vpmacssdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmacssdd + ;CHECK: vpmacssdd %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone + +define <2 x i64> @commute_fold_vpmacssdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) { + ;CHECK-LABEL: commute_fold_vpmacssdqh + ;CHECK: vpmacssdqh %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2) + ret <2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone + +define <2 x i64> @commute_fold_vpmacssdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) { + ;CHECK-LABEL: commute_fold_vpmacssdql + ;CHECK: vpmacssdql %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2) + ret <2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone + +define <4 x i32> @commute_fold_vpmacsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmacsswd + ;CHECK: vpmacsswd %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone + +define <8 x i16> @commute_fold_vpmacssww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) { + ;CHECK-LABEL: commute_fold_vpmacssww + ;CHECK: vpmacssww %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2) + ret <8 x i16> %2 +} +declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone + +define <4 x i32> @commute_fold_vpmacswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmacswd + ;CHECK: vpmacswd %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone + +define <8 x i16> @commute_fold_vpmacsww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) { + ;CHECK-LABEL: commute_fold_vpmacsww + ;CHECK: vpmacsww %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2) + ret <8 x i16> %2 +} +declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone + +define <4 x i32> @commute_fold_vpmadcsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmadcsswd + ;CHECK: vpmadcsswd %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone + +define <4 x i32> @commute_fold_vpmadcswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmadcswd + ;CHECK: vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone + + +