From: JF Bastien
Date: Wed, 5 Aug 2015 21:04:59 +0000 (+0000)
Subject: x86 atomic: optimize a.store(reg op a.load(acquire), release)
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=8cfa23f93aeb8d3596a1ddc53b458953e487b8cf;p=oota-llvm.git

x86 atomic: optimize a.store(reg op a.load(acquire), release)

Summary:
PR24191 finds that the expected memory-register operations aren't
generated when relaxed { load ; modify ; store } is used. This is
similar to PR17281 which was addressed in D4796, but only for
memory-immediate operations (and for memory orderings up to acquire and
release).

This patch also handles some floating-point operations.

Reviewers: reames, kcc, dvyukov, nadav, morisset, chandlerc, t.p.northover, pete

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D11382

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@244128 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c9244efdc39..b8f132c28f2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -20132,6 +20132,45 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
   return sinkMBB;
 }
 
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
+                                       MachineBasicBlock *BB) const {
+  // Combine the following atomic floating-point modification pattern:
+  //   a.store(reg OP a.load(acquire), release)
+  // Transform them into:
+  //   OPss (%gpr), %xmm
+  //   movss %xmm, (%gpr)
+  // Or sd equivalent for 64-bit operations.
+  unsigned MOp, FOp;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
+  case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break;
+  case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break;
+  }
+  const X86InstrInfo *TII = Subtarget->getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  unsigned MSrc = MI->getOperand(0).getReg();
+  unsigned VSrc = MI->getOperand(5).getReg();
+  MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp))
+                                .addReg(/*Base=*/MSrc)
+                                .addImm(/*Scale=*/1)
+                                .addReg(/*Index=*/0)
+                                .addImm(0)
+                                .addReg(0);
+  MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp),
+                              MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
+                          .addReg(VSrc)
+                          .addReg(/*Base=*/MSrc)
+                          .addImm(/*Scale=*/1)
+                          .addReg(/*Index=*/0)
+                          .addImm(/*Disp=*/0)
+                          .addReg(/*Segment=*/0);
+  MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill);
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB; +} + MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -20687,6 +20726,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::CMOV_V64I1: return EmitLoweredSelect(MI, BB); + case X86::RELEASE_FADD32mr: + case X86::RELEASE_FADD64mr: + return EmitLoweredAtomicFP(MI, BB); + case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index edf942121f8..837fe8cfcf7 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1080,6 +1080,9 @@ namespace llvm { MachineBasicBlock *EmitLoweredSelect(MachineInstr *I, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const; diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 49dc3189f09..083e80de7e8 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -752,6 +752,8 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", /* The following multiclass tries to make sure that in code like * x.store (immediate op x.load(acquire), release) + * and + * x.store (register op x.load(acquire), release) * an operation directly on memory is generated instead of wasting a register. * It is not automatic as atomic_store/load are only lowered to MOV instructions * extremely late to prevent them from being accidentally reordered in the backend @@ -759,19 +761,31 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", */ multiclass RELEASE_BINOP_MI { def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_BINOP PSEUDO!", + "#BINOP "#NAME#"8mi PSEUDO!", [(atomic_store_8 addr:$dst, (!cast(op) (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src), + "#BINOP "#NAME#"8mr PSEUDO!", + [(atomic_store_8 addr:$dst, (!cast(op) + (atomic_load_8 addr:$dst), GR8:$src))]>; // NAME#16 is not generated as 16-bit arithmetic instructions are considered // costly and avoided as far as possible by this backend anyway def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_BINOP PSEUDO!", + "#BINOP "#NAME#"32mi PSEUDO!", [(atomic_store_32 addr:$dst, (!cast(op) (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, (!cast(op) + (atomic_load_32 addr:$dst), GR32:$src))]>; def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_BINOP PSEUDO!", + "#BINOP "#NAME#"64mi32 PSEUDO!", [(atomic_store_64 addr:$dst, (!cast(op) (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, (!cast(op) + (atomic_load_64 addr:$dst), GR64:$src))]>; } defm RELEASE_ADD : RELEASE_BINOP_MI<"add">; defm RELEASE_AND : RELEASE_BINOP_MI<"and">; @@ -780,18 +794,41 @@ defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">; // Note: we don't deal with sub, because substractions of constants are // optimized into additions before this code can run +// Same as above, but for floating-point. +// FIXME: imm version. 
+// FIXME: Version that doesn't clobber $src, using AVX's VADDSS. +// FIXME: This could also handle SIMD operations with *ps and *pd instructions. +let usesCustomInserter = 1 in { +multiclass RELEASE_FP_BINOP_MI { + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, + (i32 (bitconvert (!cast(op) + (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))), + FR32:$src))))]>, Requires<[HasSSE1]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, + (i64 (bitconvert (!cast(op) + (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))), + FR64:$src))))]>, Requires<[HasSSE2]>; +} +defm RELEASE_FADD : RELEASE_FP_BINOP_MI<"fadd">; +// FIXME: Add fsub, fmul, fdiv, ... +} + multiclass RELEASE_UNOP { def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"8m PSEUDO!", [(atomic_store_8 addr:$dst, dag8)]>; def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"16m PSEUDO!", [(atomic_store_16 addr:$dst, dag16)]>; def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"32m PSEUDO!", [(atomic_store_32 addr:$dst, dag32)]>; def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"64m PSEUDO!", [(atomic_store_64 addr:$dst, dag64)]>; } @@ -821,42 +858,42 @@ defm RELEASE_NOT : RELEASE_UNOP< */ def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV8mi PSEUDO!", [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV16mi PSEUDO!", [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV32mi PSEUDO!", [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV64mi32 PSEUDO!", [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV8mr PSEUDO!", [(atomic_store_8 addr:$dst, GR8 :$src)]>; def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV16mr PSEUDO!", [(atomic_store_16 addr:$dst, GR16:$src)]>; def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV32mr PSEUDO!", [(atomic_store_32 addr:$dst, GR32:$src)]>; def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV64mr PSEUDO!", [(atomic_store_64 addr:$dst, GR64:$src)]>; def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV8rm PSEUDO!", [(set GR8:$dst, (atomic_load_8 addr:$src))]>; def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV16rm PSEUDO!", [(set GR16:$dst, (atomic_load_16 addr:$src))]>; def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV32rm PSEUDO!", [(set GR32:$dst, (atomic_load_32 addr:$src))]>; def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), - "#ACQUIRE_MOV PSEUDO!", + 
"#ACQUIRE_MOV64rm PSEUDO!", [(set GR64:$dst, (atomic_load_64 addr:$src))]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 68977d38e15..2b30ce6d362 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -598,17 +598,29 @@ ReSimplify: case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify; case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify; case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify; + case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify; case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify; + case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify; case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify; + case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify; case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify; + case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify; case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify; + case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify; case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify; + case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify; case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify; + case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify; case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify; + case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify; case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify; + case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify; case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify; + case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify; case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify; + case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify; case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify; + case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify; case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify; case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify; case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify; diff --git a/test/CodeGen/X86/atomic_mi.ll b/test/CodeGen/X86/atomic_mi.ll index 7a6204fc893..8ea72fa5f27 100644 --- a/test/CodeGen/X86/atomic_mi.ll +++ b/test/CodeGen/X86/atomic_mi.ll @@ -14,7 +14,11 @@ ; The binary operations supported are currently add, and, or, xor. ; sub is not supported because they are translated by an addition of the ; negated immediate. -; Finally, we also check the same kind of pattern for inc/dec +; +; We also check the same patterns: +; - For inc/dec. +; - For register instead of immediate operands. +; - For floating point operations. ; seq_cst stores are left as (lock) xchgl, but we try to check every other ; attribute at least once. @@ -25,10 +29,10 @@ ; an implicit lock prefix, so making it explicit is not required. 
define void @store_atomic_imm_8(i8* %p) { -; X64-LABEL: store_atomic_imm_8 +; X64-LABEL: store_atomic_imm_8: ; X64: movb ; X64-NOT: movb -; X32-LABEL: store_atomic_imm_8 +; X32-LABEL: store_atomic_imm_8: ; X32: movb ; X32-NOT: movb store atomic i8 42, i8* %p release, align 1 @@ -36,10 +40,10 @@ define void @store_atomic_imm_8(i8* %p) { } define void @store_atomic_imm_16(i16* %p) { -; X64-LABEL: store_atomic_imm_16 +; X64-LABEL: store_atomic_imm_16: ; X64: movw ; X64-NOT: movw -; X32-LABEL: store_atomic_imm_16 +; X32-LABEL: store_atomic_imm_16: ; X32: movw ; X32-NOT: movw store atomic i16 42, i16* %p monotonic, align 2 @@ -47,12 +51,12 @@ define void @store_atomic_imm_16(i16* %p) { } define void @store_atomic_imm_32(i32* %p) { -; X64-LABEL: store_atomic_imm_32 +; X64-LABEL: store_atomic_imm_32: ; X64: movl ; X64-NOT: movl ; On 32 bits, there is an extra movl for each of those functions ; (probably for alignment reasons). -; X32-LABEL: store_atomic_imm_32 +; X32-LABEL: store_atomic_imm_32: ; X32: movl 4(%esp), %eax ; X32: movl ; X32-NOT: movl @@ -61,12 +65,12 @@ define void @store_atomic_imm_32(i32* %p) { } define void @store_atomic_imm_64(i64* %p) { -; X64-LABEL: store_atomic_imm_64 +; X64-LABEL: store_atomic_imm_64: ; X64: movq ; X64-NOT: movq ; These are implemented with a CAS loop on 32 bit architectures, and thus ; cannot be optimized in the same way as the others. -; X32-LABEL: store_atomic_imm_64 +; X32-LABEL: store_atomic_imm_64: ; X32: cmpxchg8b store atomic i64 42, i64* %p release, align 8 ret void @@ -75,7 +79,7 @@ define void @store_atomic_imm_64(i64* %p) { ; If an immediate is too big to fit in 32 bits, it cannot be store in one mov, ; even on X64, one must use movabsq that can only target a register. define void @store_atomic_imm_64_big(i64* %p) { -; X64-LABEL: store_atomic_imm_64_big +; X64-LABEL: store_atomic_imm_64_big: ; X64: movabsq ; X64: movq store atomic i64 100000000000, i64* %p monotonic, align 8 @@ -84,9 +88,9 @@ define void @store_atomic_imm_64_big(i64* %p) { ; It would be incorrect to replace a lock xchgl by a movl define void @store_atomic_imm_32_seq_cst(i32* %p) { -; X64-LABEL: store_atomic_imm_32_seq_cst +; X64-LABEL: store_atomic_imm_32_seq_cst: ; X64: xchgl -; X32-LABEL: store_atomic_imm_32_seq_cst +; X32-LABEL: store_atomic_imm_32_seq_cst: ; X32: xchgl store atomic i32 42, i32* %p seq_cst, align 4 ret void @@ -94,12 +98,12 @@ define void @store_atomic_imm_32_seq_cst(i32* %p) { ; ----- ADD ----- -define void @add_8(i8* %p) { -; X64-LABEL: add_8 +define void @add_8i(i8* %p) { +; X64-LABEL: add_8i: ; X64-NOT: lock ; X64: addb ; X64-NOT: movb -; X32-LABEL: add_8 +; X32-LABEL: add_8i: ; X32-NOT: lock ; X32: addb ; X32-NOT: movb @@ -109,12 +113,27 @@ define void @add_8(i8* %p) { ret void } -define void @add_16(i16* %p) { +define void @add_8r(i8* %p, i8 %v) { +; X64-LABEL: add_8r: +; X64-NOT: lock +; X64: addb +; X64-NOT: movb +; X32-LABEL: add_8r: +; X32-NOT: lock +; X32: addb +; X32-NOT: movb + %1 = load atomic i8, i8* %p seq_cst, align 1 + %2 = add i8 %1, %v + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @add_16i(i16* %p) { ; Currently the transformation is not done on 16 bit accesses, as the backend ; treat 16 bit arithmetic as expensive on X86/X86_64. 
-; X64-LABEL: add_16 +; X64-LABEL: add_16i: ; X64-NOT: addw -; X32-LABEL: add_16 +; X32-LABEL: add_16i: ; X32-NOT: addw %1 = load atomic i16, i16* %p acquire, align 2 %2 = add i16 %1, 2 @@ -122,12 +141,25 @@ define void @add_16(i16* %p) { ret void } -define void @add_32(i32* %p) { -; X64-LABEL: add_32 +define void @add_16r(i16* %p, i16 %v) { +; Currently the transformation is not done on 16 bit accesses, as the backend +; treat 16 bit arithmetic as expensive on X86/X86_64. +; X64-LABEL: add_16r: +; X64-NOT: addw +; X32-LABEL: add_16r: +; X32-NOT: addw [.*], ( + %1 = load atomic i16, i16* %p acquire, align 2 + %2 = add i16 %1, %v + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @add_32i(i32* %p) { +; X64-LABEL: add_32i: ; X64-NOT: lock ; X64: addl ; X64-NOT: movl -; X32-LABEL: add_32 +; X32-LABEL: add_32i: ; X32-NOT: lock ; X32: addl ; X32-NOT: movl @@ -137,23 +169,94 @@ define void @add_32(i32* %p) { ret void } -define void @add_64(i64* %p) { -; X64-LABEL: add_64 +define void @add_32r(i32* %p, i32 %v) { +; X64-LABEL: add_32r: +; X64-NOT: lock +; X64: addl +; X64-NOT: movl +; X32-LABEL: add_32r: +; X32-NOT: lock +; X32: addl +; X32-NOT: movl + %1 = load atomic i32, i32* %p acquire, align 4 + %2 = add i32 %1, %v + store atomic i32 %2, i32* %p monotonic, align 4 + ret void +} + +; The following is a corner case where the load is added to itself. The pattern +; matching should not fold this. We only test with 32-bit add, but the same +; applies to other sizes and operations. +define void @add_32r_self(i32* %p) { +; X64-LABEL: add_32r_self: +; X64-NOT: lock +; X64: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]] +; X64: addl %[[R]], %[[R]] +; X64: movl %[[R]], (%[[M]]) +; X32-LABEL: add_32r_self: +; X32-NOT: lock +; X32: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]] +; X32: addl %[[R]], %[[R]] +; X32: movl %[[R]], (%[[M]]) + %1 = load atomic i32, i32* %p acquire, align 4 + %2 = add i32 %1, %1 + store atomic i32 %2, i32* %p monotonic, align 4 + ret void +} + +; The following is a corner case where the load's result is returned. The +; optimizer isn't allowed to duplicate the load because it's atomic. +define i32 @add_32r_ret_load(i32* %p, i32 %v) { +; X64-LABEL: add_32r_ret_load: +; X64-NOT: lock +; X64: movl (%rdi), %eax +; X64-NEXT: leal (%rsi,%rax), %ecx +; X64-NEXT: movl %ecx, (%rdi) +; X64-NEXT: retq +; X32-LABEL: add_32r_ret_load: +; X32-NOT: lock +; X32: movl 4(%esp), %[[P:[a-z]+]] +; X32-NEXT: movl (%[[P]]), +; X32-NOT: %[[P]] +; More code here, we just don't want it to load from P. +; X32: movl %{{.*}}, (%[[P]]) +; X32-NEXT: retl + %1 = load atomic i32, i32* %p acquire, align 4 + %2 = add i32 %1, %v + store atomic i32 %2, i32* %p monotonic, align 4 + ret i32 %1 +} + +define void @add_64i(i64* %p) { +; X64-LABEL: add_64i: ; X64-NOT: lock ; X64: addq ; X64-NOT: movq ; We do not check X86-32 as it cannot do 'addq'. -; X32-LABEL: add_64 +; X32-LABEL: add_64i: %1 = load atomic i64, i64* %p acquire, align 8 %2 = add i64 %1, 2 store atomic i64 %2, i64* %p release, align 8 ret void } -define void @add_32_seq_cst(i32* %p) { -; X64-LABEL: add_32_seq_cst +define void @add_64r(i64* %p, i64 %v) { +; X64-LABEL: add_64r: +; X64-NOT: lock +; X64: addq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'addq'. 
+; X32-LABEL: add_64r: + %1 = load atomic i64, i64* %p acquire, align 8 + %2 = add i64 %1, %v + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @add_32i_seq_cst(i32* %p) { +; X64-LABEL: add_32i_seq_cst: ; X64: xchgl -; X32-LABEL: add_32_seq_cst +; X32-LABEL: add_32i_seq_cst: ; X32: xchgl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = add i32 %1, 2 @@ -161,14 +264,25 @@ define void @add_32_seq_cst(i32* %p) { ret void } +define void @add_32r_seq_cst(i32* %p, i32 %v) { +; X64-LABEL: add_32r_seq_cst: +; X64: xchgl +; X32-LABEL: add_32r_seq_cst: +; X32: xchgl + %1 = load atomic i32, i32* %p monotonic, align 4 + %2 = add i32 %1, %v + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + ; ----- AND ----- -define void @and_8(i8* %p) { -; X64-LABEL: and_8 +define void @and_8i(i8* %p) { +; X64-LABEL: and_8i: ; X64-NOT: lock ; X64: andb ; X64-NOT: movb -; X32-LABEL: and_8 +; X32-LABEL: and_8i: ; X32-NOT: lock ; X32: andb ; X32-NOT: movb @@ -178,12 +292,27 @@ define void @and_8(i8* %p) { ret void } -define void @and_16(i16* %p) { +define void @and_8r(i8* %p, i8 %v) { +; X64-LABEL: and_8r: +; X64-NOT: lock +; X64: andb +; X64-NOT: movb +; X32-LABEL: and_8r: +; X32-NOT: lock +; X32: andb +; X32-NOT: movb + %1 = load atomic i8, i8* %p monotonic, align 1 + %2 = and i8 %1, %v + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @and_16i(i16* %p) { ; Currently the transformation is not done on 16 bit accesses, as the backend ; treat 16 bit arithmetic as expensive on X86/X86_64. -; X64-LABEL: and_16 +; X64-LABEL: and_16i: ; X64-NOT: andw -; X32-LABEL: and_16 +; X32-LABEL: and_16i: ; X32-NOT: andw %1 = load atomic i16, i16* %p acquire, align 2 %2 = and i16 %1, 2 @@ -191,12 +320,25 @@ define void @and_16(i16* %p) { ret void } -define void @and_32(i32* %p) { -; X64-LABEL: and_32 +define void @and_16r(i16* %p, i16 %v) { +; Currently the transformation is not done on 16 bit accesses, as the backend +; treat 16 bit arithmetic as expensive on X86/X86_64. +; X64-LABEL: and_16r: +; X64-NOT: andw +; X32-LABEL: and_16r: +; X32-NOT: andw [.*], ( + %1 = load atomic i16, i16* %p acquire, align 2 + %2 = and i16 %1, %v + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @and_32i(i32* %p) { +; X64-LABEL: and_32i: ; X64-NOT: lock ; X64: andl ; X64-NOT: movl -; X32-LABEL: and_32 +; X32-LABEL: and_32i: ; X32-NOT: lock ; X32: andl ; X32-NOT: movl @@ -206,23 +348,51 @@ define void @and_32(i32* %p) { ret void } -define void @and_64(i64* %p) { -; X64-LABEL: and_64 +define void @and_32r(i32* %p, i32 %v) { +; X64-LABEL: and_32r: +; X64-NOT: lock +; X64: andl +; X64-NOT: movl +; X32-LABEL: and_32r: +; X32-NOT: lock +; X32: andl +; X32-NOT: movl + %1 = load atomic i32, i32* %p acquire, align 4 + %2 = and i32 %1, %v + store atomic i32 %2, i32* %p release, align 4 + ret void +} + +define void @and_64i(i64* %p) { +; X64-LABEL: and_64i: ; X64-NOT: lock ; X64: andq ; X64-NOT: movq ; We do not check X86-32 as it cannot do 'andq'. -; X32-LABEL: and_64 +; X32-LABEL: and_64i: %1 = load atomic i64, i64* %p acquire, align 8 %2 = and i64 %1, 2 store atomic i64 %2, i64* %p release, align 8 ret void } -define void @and_32_seq_cst(i32* %p) { -; X64-LABEL: and_32_seq_cst +define void @and_64r(i64* %p, i64 %v) { +; X64-LABEL: and_64r: +; X64-NOT: lock +; X64: andq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'andq'. 
+; X32-LABEL: and_64r: + %1 = load atomic i64, i64* %p acquire, align 8 + %2 = and i64 %1, %v + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @and_32i_seq_cst(i32* %p) { +; X64-LABEL: and_32i_seq_cst: ; X64: xchgl -; X32-LABEL: and_32_seq_cst +; X32-LABEL: and_32i_seq_cst: ; X32: xchgl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = and i32 %1, 2 @@ -230,14 +400,25 @@ define void @and_32_seq_cst(i32* %p) { ret void } +define void @and_32r_seq_cst(i32* %p, i32 %v) { +; X64-LABEL: and_32r_seq_cst: +; X64: xchgl +; X32-LABEL: and_32r_seq_cst: +; X32: xchgl + %1 = load atomic i32, i32* %p monotonic, align 4 + %2 = and i32 %1, %v + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + ; ----- OR ----- -define void @or_8(i8* %p) { -; X64-LABEL: or_8 +define void @or_8i(i8* %p) { +; X64-LABEL: or_8i: ; X64-NOT: lock ; X64: orb ; X64-NOT: movb -; X32-LABEL: or_8 +; X32-LABEL: or_8i: ; X32-NOT: lock ; X32: orb ; X32-NOT: movb @@ -247,10 +428,25 @@ define void @or_8(i8* %p) { ret void } -define void @or_16(i16* %p) { -; X64-LABEL: or_16 +define void @or_8r(i8* %p, i8 %v) { +; X64-LABEL: or_8r: +; X64-NOT: lock +; X64: orb +; X64-NOT: movb +; X32-LABEL: or_8r: +; X32-NOT: lock +; X32: orb +; X32-NOT: movb + %1 = load atomic i8, i8* %p acquire, align 1 + %2 = or i8 %1, %v + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @or_16i(i16* %p) { +; X64-LABEL: or_16i: ; X64-NOT: orw -; X32-LABEL: or_16 +; X32-LABEL: or_16i: ; X32-NOT: orw %1 = load atomic i16, i16* %p acquire, align 2 %2 = or i16 %1, 2 @@ -258,12 +454,23 @@ define void @or_16(i16* %p) { ret void } -define void @or_32(i32* %p) { -; X64-LABEL: or_32 +define void @or_16r(i16* %p, i16 %v) { +; X64-LABEL: or_16r: +; X64-NOT: orw +; X32-LABEL: or_16r: +; X32-NOT: orw [.*], ( + %1 = load atomic i16, i16* %p acquire, align 2 + %2 = or i16 %1, %v + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @or_32i(i32* %p) { +; X64-LABEL: or_32i: ; X64-NOT: lock ; X64: orl ; X64-NOT: movl -; X32-LABEL: or_32 +; X32-LABEL: or_32i: ; X32-NOT: lock ; X32: orl ; X32-NOT: movl @@ -273,23 +480,51 @@ define void @or_32(i32* %p) { ret void } -define void @or_64(i64* %p) { -; X64-LABEL: or_64 +define void @or_32r(i32* %p, i32 %v) { +; X64-LABEL: or_32r: +; X64-NOT: lock +; X64: orl +; X64-NOT: movl +; X32-LABEL: or_32r: +; X32-NOT: lock +; X32: orl +; X32-NOT: movl + %1 = load atomic i32, i32* %p acquire, align 4 + %2 = or i32 %1, %v + store atomic i32 %2, i32* %p release, align 4 + ret void +} + +define void @or_64i(i64* %p) { +; X64-LABEL: or_64i: ; X64-NOT: lock ; X64: orq ; X64-NOT: movq ; We do not check X86-32 as it cannot do 'orq'. -; X32-LABEL: or_64 +; X32-LABEL: or_64i: %1 = load atomic i64, i64* %p acquire, align 8 %2 = or i64 %1, 2 store atomic i64 %2, i64* %p release, align 8 ret void } -define void @or_32_seq_cst(i32* %p) { -; X64-LABEL: or_32_seq_cst +define void @or_64r(i64* %p, i64 %v) { +; X64-LABEL: or_64r: +; X64-NOT: lock +; X64: orq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'orq'. 
+; X32-LABEL: or_64r: + %1 = load atomic i64, i64* %p acquire, align 8 + %2 = or i64 %1, %v + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @or_32i_seq_cst(i32* %p) { +; X64-LABEL: or_32i_seq_cst: ; X64: xchgl -; X32-LABEL: or_32_seq_cst +; X32-LABEL: or_32i_seq_cst: ; X32: xchgl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = or i32 %1, 2 @@ -297,14 +532,25 @@ define void @or_32_seq_cst(i32* %p) { ret void } +define void @or_32r_seq_cst(i32* %p, i32 %v) { +; X64-LABEL: or_32r_seq_cst: +; X64: xchgl +; X32-LABEL: or_32r_seq_cst: +; X32: xchgl + %1 = load atomic i32, i32* %p monotonic, align 4 + %2 = or i32 %1, %v + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + ; ----- XOR ----- -define void @xor_8(i8* %p) { -; X64-LABEL: xor_8 +define void @xor_8i(i8* %p) { +; X64-LABEL: xor_8i: ; X64-NOT: lock ; X64: xorb ; X64-NOT: movb -; X32-LABEL: xor_8 +; X32-LABEL: xor_8i: ; X32-NOT: lock ; X32: xorb ; X32-NOT: movb @@ -314,10 +560,25 @@ define void @xor_8(i8* %p) { ret void } -define void @xor_16(i16* %p) { -; X64-LABEL: xor_16 +define void @xor_8r(i8* %p, i8 %v) { +; X64-LABEL: xor_8r: +; X64-NOT: lock +; X64: xorb +; X64-NOT: movb +; X32-LABEL: xor_8r: +; X32-NOT: lock +; X32: xorb +; X32-NOT: movb + %1 = load atomic i8, i8* %p acquire, align 1 + %2 = xor i8 %1, %v + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @xor_16i(i16* %p) { +; X64-LABEL: xor_16i: ; X64-NOT: xorw -; X32-LABEL: xor_16 +; X32-LABEL: xor_16i: ; X32-NOT: xorw %1 = load atomic i16, i16* %p acquire, align 2 %2 = xor i16 %1, 2 @@ -325,12 +586,23 @@ define void @xor_16(i16* %p) { ret void } -define void @xor_32(i32* %p) { -; X64-LABEL: xor_32 +define void @xor_16r(i16* %p, i16 %v) { +; X64-LABEL: xor_16r: +; X64-NOT: xorw +; X32-LABEL: xor_16r: +; X32-NOT: xorw [.*], ( + %1 = load atomic i16, i16* %p acquire, align 2 + %2 = xor i16 %1, %v + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @xor_32i(i32* %p) { +; X64-LABEL: xor_32i: ; X64-NOT: lock ; X64: xorl ; X64-NOT: movl -; X32-LABEL: xor_32 +; X32-LABEL: xor_32i: ; X32-NOT: lock ; X32: xorl ; X32-NOT: movl @@ -340,23 +612,51 @@ define void @xor_32(i32* %p) { ret void } -define void @xor_64(i64* %p) { -; X64-LABEL: xor_64 +define void @xor_32r(i32* %p, i32 %v) { +; X64-LABEL: xor_32r: +; X64-NOT: lock +; X64: xorl +; X64-NOT: movl +; X32-LABEL: xor_32r: +; X32-NOT: lock +; X32: xorl +; X32-NOT: movl + %1 = load atomic i32, i32* %p acquire, align 4 + %2 = xor i32 %1, %v + store atomic i32 %2, i32* %p release, align 4 + ret void +} + +define void @xor_64i(i64* %p) { +; X64-LABEL: xor_64i: ; X64-NOT: lock ; X64: xorq ; X64-NOT: movq ; We do not check X86-32 as it cannot do 'xorq'. -; X32-LABEL: xor_64 +; X32-LABEL: xor_64i: %1 = load atomic i64, i64* %p acquire, align 8 %2 = xor i64 %1, 2 store atomic i64 %2, i64* %p release, align 8 ret void } -define void @xor_32_seq_cst(i32* %p) { -; X64-LABEL: xor_32_seq_cst +define void @xor_64r(i64* %p, i64 %v) { +; X64-LABEL: xor_64r: +; X64-NOT: lock +; X64: xorq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'xorq'. 
+; X32-LABEL: xor_64r: + %1 = load atomic i64, i64* %p acquire, align 8 + %2 = xor i64 %1, %v + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @xor_32i_seq_cst(i32* %p) { +; X64-LABEL: xor_32i_seq_cst: ; X64: xchgl -; X32-LABEL: xor_32_seq_cst +; X32-LABEL: xor_32i_seq_cst: ; X32: xchgl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = xor i32 %1, 2 @@ -364,18 +664,29 @@ define void @xor_32_seq_cst(i32* %p) { ret void } +define void @xor_32r_seq_cst(i32* %p, i32 %v) { +; X64-LABEL: xor_32r_seq_cst: +; X64: xchgl +; X32-LABEL: xor_32r_seq_cst: +; X32: xchgl + %1 = load atomic i32, i32* %p monotonic, align 4 + %2 = xor i32 %1, %v + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + ; ----- INC ----- define void @inc_8(i8* %p) { -; X64-LABEL: inc_8 +; X64-LABEL: inc_8: ; X64-NOT: lock ; X64: incb ; X64-NOT: movb -; X32-LABEL: inc_8 +; X32-LABEL: inc_8: ; X32-NOT: lock ; X32: incb ; X32-NOT: movb -; SLOW_INC-LABEL: inc_8 +; SLOW_INC-LABEL: inc_8: ; SLOW_INC-NOT: incb ; SLOW_INC-NOT: movb %1 = load atomic i8, i8* %p seq_cst, align 1 @@ -387,11 +698,11 @@ define void @inc_8(i8* %p) { define void @inc_16(i16* %p) { ; Currently the transformation is not done on 16 bit accesses, as the backend ; treat 16 bit arithmetic as expensive on X86/X86_64. -; X64-LABEL: inc_16 +; X64-LABEL: inc_16: ; X64-NOT: incw -; X32-LABEL: inc_16 +; X32-LABEL: inc_16: ; X32-NOT: incw -; SLOW_INC-LABEL: inc_16 +; SLOW_INC-LABEL: inc_16: ; SLOW_INC-NOT: incw %1 = load atomic i16, i16* %p acquire, align 2 %2 = add i16 %1, 1 @@ -400,15 +711,15 @@ define void @inc_16(i16* %p) { } define void @inc_32(i32* %p) { -; X64-LABEL: inc_32 +; X64-LABEL: inc_32: ; X64-NOT: lock ; X64: incl ; X64-NOT: movl -; X32-LABEL: inc_32 +; X32-LABEL: inc_32: ; X32-NOT: lock ; X32: incl ; X32-NOT: movl -; SLOW_INC-LABEL: inc_32 +; SLOW_INC-LABEL: inc_32: ; SLOW_INC-NOT: incl ; SLOW_INC-NOT: movl %1 = load atomic i32, i32* %p acquire, align 4 @@ -418,13 +729,13 @@ define void @inc_32(i32* %p) { } define void @inc_64(i64* %p) { -; X64-LABEL: inc_64 +; X64-LABEL: inc_64: ; X64-NOT: lock ; X64: incq ; X64-NOT: movq ; We do not check X86-32 as it cannot do 'incq'. -; X32-LABEL: inc_64 -; SLOW_INC-LABEL: inc_64 +; X32-LABEL: inc_64: +; SLOW_INC-LABEL: inc_64: ; SLOW_INC-NOT: incq ; SLOW_INC-NOT: movq %1 = load atomic i64, i64* %p acquire, align 8 @@ -434,9 +745,9 @@ define void @inc_64(i64* %p) { } define void @inc_32_seq_cst(i32* %p) { -; X64-LABEL: inc_32_seq_cst +; X64-LABEL: inc_32_seq_cst: ; X64: xchgl -; X32-LABEL: inc_32_seq_cst +; X32-LABEL: inc_32_seq_cst: ; X32: xchgl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = add i32 %1, 1 @@ -447,15 +758,15 @@ define void @inc_32_seq_cst(i32* %p) { ; ----- DEC ----- define void @dec_8(i8* %p) { -; X64-LABEL: dec_8 +; X64-LABEL: dec_8: ; X64-NOT: lock ; X64: decb ; X64-NOT: movb -; X32-LABEL: dec_8 +; X32-LABEL: dec_8: ; X32-NOT: lock ; X32: decb ; X32-NOT: movb -; SLOW_INC-LABEL: dec_8 +; SLOW_INC-LABEL: dec_8: ; SLOW_INC-NOT: decb ; SLOW_INC-NOT: movb %1 = load atomic i8, i8* %p seq_cst, align 1 @@ -467,11 +778,11 @@ define void @dec_8(i8* %p) { define void @dec_16(i16* %p) { ; Currently the transformation is not done on 16 bit accesses, as the backend ; treat 16 bit arithmetic as expensive on X86/X86_64. 
-; X64-LABEL: dec_16 +; X64-LABEL: dec_16: ; X64-NOT: decw -; X32-LABEL: dec_16 +; X32-LABEL: dec_16: ; X32-NOT: decw -; SLOW_INC-LABEL: dec_16 +; SLOW_INC-LABEL: dec_16: ; SLOW_INC-NOT: decw %1 = load atomic i16, i16* %p acquire, align 2 %2 = sub i16 %1, 1 @@ -480,15 +791,15 @@ define void @dec_16(i16* %p) { } define void @dec_32(i32* %p) { -; X64-LABEL: dec_32 +; X64-LABEL: dec_32: ; X64-NOT: lock ; X64: decl ; X64-NOT: movl -; X32-LABEL: dec_32 +; X32-LABEL: dec_32: ; X32-NOT: lock ; X32: decl ; X32-NOT: movl -; SLOW_INC-LABEL: dec_32 +; SLOW_INC-LABEL: dec_32: ; SLOW_INC-NOT: decl ; SLOW_INC-NOT: movl %1 = load atomic i32, i32* %p acquire, align 4 @@ -498,13 +809,13 @@ define void @dec_32(i32* %p) { } define void @dec_64(i64* %p) { -; X64-LABEL: dec_64 +; X64-LABEL: dec_64: ; X64-NOT: lock ; X64: decq ; X64-NOT: movq ; We do not check X86-32 as it cannot do 'decq'. -; X32-LABEL: dec_64 -; SLOW_INC-LABEL: dec_64 +; X32-LABEL: dec_64: +; SLOW_INC-LABEL: dec_64: ; SLOW_INC-NOT: decq ; SLOW_INC-NOT: movq %1 = load atomic i64, i64* %p acquire, align 8 @@ -514,12 +825,49 @@ define void @dec_64(i64* %p) { } define void @dec_32_seq_cst(i32* %p) { -; X64-LABEL: dec_32_seq_cst +; X64-LABEL: dec_32_seq_cst: ; X64: xchgl -; X32-LABEL: dec_32_seq_cst +; X32-LABEL: dec_32_seq_cst: ; X32: xchgl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = sub i32 %1, 1 store atomic i32 %2, i32* %p seq_cst, align 4 ret void } + +; ----- FADD ----- + +define void @fadd_32r(float* %loc, float %val) { +; X64-LABEL: fadd_32r: +; X64-NOT: lock +; X64-NOT: mov +; X64: addss (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]] +; X64-NEXT: movss %[[XMM]], (%[[M]]) +; X32-LABEL: fadd_32r: +; Don't check x86-32. +; LLVM's SSE handling is conservative on x86-32 even without using atomics. + %floc = bitcast float* %loc to i32* + %1 = load atomic i32, i32* %floc seq_cst, align 4 + %2 = bitcast i32 %1 to float + %add = fadd float %2, %val + %3 = bitcast float %add to i32 + store atomic i32 %3, i32* %floc release, align 4 + ret void +} + +define void @fadd_64r(double* %loc, double %val) { +; X64-LABEL: fadd_64r: +; X64-NOT: lock +; X64-NOT: mov +; X64: addsd (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]] +; X64-NEXT: movsd %[[XMM]], (%[[M]]) +; X32-LABEL: fadd_64r: +; Don't check x86-32 (see comment above). + %floc = bitcast double* %loc to i64* + %1 = load atomic i64, i64* %floc seq_cst, align 8 + %2 = bitcast i64 %1 to double + %add = fadd double %2, %val + %3 = bitcast double %add to i64 + store atomic i64 %3, i64* %floc release, align 8 + ret void +}
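For reference, here is the source-level pattern this patch targets, written as a
minimal C++ sketch. It is not part of the commit; the function names, the use of
std::atomic, and the exact registers shown in the comments are illustrative
assumptions based on the commit message and the add_32r/fadd_32r tests above.

    #include <atomic>

    // Relaxed/acquire-release { load ; modify ; store } on an integer. With this
    // patch the backend should fold it into a single memory-operand instruction,
    // roughly:  addl %esi, (%rdi)  -- no lock prefix and no separate mov,
    // matching the add_32r test above.
    void add_int(std::atomic<int> *p, int v) {
      p->store(p->load(std::memory_order_acquire) + v, std::memory_order_release);
    }

    // The same shape on a float goes through the new RELEASE_FADD32mr pseudo,
    // which EmitLoweredAtomicFP expands to something like:
    //   addss (%rdi), %xmm0
    //   movss %xmm0, (%rdi)
    // matching the fadd_32r test above.
    void add_float(std::atomic<float> *p, float v) {
      p->store(p->load(std::memory_order_acquire) + v, std::memory_order_release);
    }

No lock prefix is expected here: the load and the store are each individually
atomic, but the combined update is not an atomic read-modify-write, so no
atomicity across the two accesses is required and a plain memory-operand
instruction (or an SSE load-op followed by a store in the FP case) suffices for
acquire/release and weaker orderings. seq_cst stores still lower to xchg, as
checked by the *_seq_cst tests.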