From ee629d83953a0a66d95ae3ed1cc88f2cb0e0d163 Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Tue, 22 Sep 2015 17:21:44 +0000 Subject: [PATCH] [AArch64] Emit clrex in the expanded cmpxchg fail block. In the comparison failure block of a cmpxchg expansion, the initial ldrex/ldxr will not be followed by a matching strex/stxr. On ARM/AArch64, this unnecessarily ties up the execution monitor, which might have a negative performance impact on some uarchs. Instead, release the monitor in the failure block. The clrex instruction was designed for this: use it. Also see ARMARM v8-A B2.10.2: "Exclusive access instructions and Shareable memory locations". Differential Revision: http://reviews.llvm.org/D13033 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@248291 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 8 +++ lib/CodeGen/AtomicExpandPass.cpp | 17 +++++- lib/Target/AArch64/AArch64ISelLowering.cpp | 7 +++ lib/Target/AArch64/AArch64ISelLowering.h | 2 + test/CodeGen/AArch64/arm64-atomic.ll | 70 +++++++++++++--------- test/CodeGen/AArch64/atomic-ops.ll | 16 +++-- 6 files changed, 84 insertions(+), 36 deletions(-) diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 47b1bb70b6a..caf28d2b118 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -1097,6 +1097,14 @@ public: } /// @} + // Emits code that executes when the comparison result in the ll/sc + // expansion of a cmpxchg instruction is such that the store-conditional will + // not execute. This makes it possible to balance out the load-linked with + // a dedicated instruction, if desired. + // E.g., on ARM, if ldrex isn't followed by strex, the exclusive monitor would + // be unnecessarily held, except if clrex, inserted by this hook, is executed. + virtual void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const {} + /// Returns true if the given (atomic) store should be expanded by the /// IR-level AtomicExpand pass into an "atomic xchg" which ignores its input. virtual bool shouldExpandAtomicStoreInIR(StoreInst *SI) const { diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index 73102ccfece..0e5b62a3e34 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -374,7 +374,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // %loaded = @load.linked(%addr) // %should_store = icmp eq %loaded, %desired // br i1 %should_store, label %cmpxchg.trystore, - // label %cmpxchg.failure + // label %cmpxchg.nostore // cmpxchg.trystore: // %stored = @store_conditional(%new, %addr) // %success = icmp eq i32 %stored, 0 @@ -382,6 +382,9 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // cmpxchg.success: // fence? // br label %cmpxchg.end + // cmpxchg.nostore: + // @load_linked_fail_balance()? + // br label %cmpxchg.failure // cmpxchg.failure: // fence? // br label %cmpxchg.end @@ -392,7 +395,8 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // [...] BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end"); auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB); - auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, FailureBB); + auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB); + auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB); auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB); auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB); @@ -416,7 +420,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // If the cmpxchg doesn't actually need any ordering when it fails, we can // jump straight past that fence instruction (if it exists). - Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB); + Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB); Builder.SetInsertPoint(TryStoreBB); Value *StoreSuccess = TLI->emitStoreConditional( @@ -432,6 +436,13 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { /*IsLoad=*/true); Builder.CreateBr(ExitBB); + Builder.SetInsertPoint(NoStoreBB); + // In the failing case, where we don't execute the store-conditional, the + // target might want to balance out the load-linked with a dedicated + // instruction (e.g., on ARM, clearing the exclusive monitor). + TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder); + Builder.CreateBr(FailureBB); + Builder.SetInsertPoint(FailureBB); TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true, /*IsLoad=*/true); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 3c1251e2695..cba7c5b5502 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9682,6 +9682,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, cast(Addr->getType())->getElementType()); } +void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall( + llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); +} + Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index a60c2a6315d..b815f55da6b 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -348,6 +348,8 @@ public: Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; + void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll index 0824bd881a9..5d8d60de5fc 100644 --- a/test/CodeGen/AArch64/arm64-atomic.ll +++ b/test/CodeGen/AArch64/arm64-atomic.ll @@ -2,13 +2,17 @@ define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 { ; CHECK-LABEL: val_compare_and_swap: -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: -; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0] +; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: +; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: cmp [[RESULT]], w1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x0] -; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] +; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -17,13 +21,16 @@ define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 { define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 { ; CHECK-LABEL: val_compare_and_swap_from_load: ; CHECK-NEXT: ldr [[NEW:w[0-9]+]], [x2] -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0] ; CHECK-NEXT: cmp [[RESULT]], w1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0] -; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %new = load i32, i32* %pnew %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 @@ -32,13 +39,17 @@ define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 { define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 { ; CHECK-LABEL: val_compare_and_swap_rel: -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: -; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0] +; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: +; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]] ; CHECK-NEXT: cmp [[RESULT]], w1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x0] -; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] +; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]] +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -47,13 +58,16 @@ define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 { define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 { ; CHECK-LABEL: val_compare_and_swap_64: ; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK-NEXT: ldxr [[RESULT:x[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: cmp [[RESULT]], x1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]] -; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val @@ -61,13 +75,13 @@ define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 { define i32 @fetch_and_nand(i32* %p) #0 { ; CHECK-LABEL: fetch_and_nand: -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0] ; CHECK: mvn [[TMP_REG:w[0-9]+]], w[[DEST_REG]] ; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], [[TMP_REG]], #0xfffffff8 ; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] ; CHECK: mov x0, x[[DEST_REG]] %val = atomicrmw nand i32* %p, i32 7 release ret i32 %val @@ -76,12 +90,12 @@ define i32 @fetch_and_nand(i32* %p) #0 { define i64 @fetch_and_nand_64(i64* %p) #0 { ; CHECK-LABEL: fetch_and_nand_64: ; CHECK: mov x[[ADDR:[0-9]+]], x0 -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldaxr x[[DEST_REG:[0-9]+]], [x[[ADDR]]] ; CHECK: mvn w[[TMP_REG:[0-9]+]], w[[DEST_REG]] ; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], x[[TMP_REG]], #0xfffffffffffffff8 ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] %val = atomicrmw nand i64* %p, i64 7 acq_rel ret i64 %val @@ -90,12 +104,12 @@ define i64 @fetch_and_nand_64(i64* %p) #0 { define i32 @fetch_and_or(i32* %p) #0 { ; CHECK-LABEL: fetch_and_or: ; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #0x5 -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0] ; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]] ; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] ; CHECK: mov x0, x[[DEST_REG]] %val = atomicrmw or i32* %p, i32 5 seq_cst ret i32 %val @@ -104,11 +118,11 @@ define i32 @fetch_and_or(i32* %p) #0 { define i64 @fetch_and_or_64(i64* %p) #0 { ; CHECK: fetch_and_or_64: ; CHECK: mov x[[ADDR:[0-9]+]], x0 -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]] ; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0x7 ; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] %val = atomicrmw or i64* %p, i64 7 monotonic ret i64 %val } diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll index cb90caeadc1..fa7c251a094 100644 --- a/test/CodeGen/AArch64/atomic-ops.ll +++ b/test/CodeGen/AArch64/atomic-ops.ll @@ -893,6 +893,8 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; CHECK: stxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb ; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]] @@ -916,6 +918,8 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; CHECK: stlxrh [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb ; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]] @@ -927,21 +931,21 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic %old = extractvalue { i32, i1 } %pair, 0 +; CHECK: mov {{[xw]}}[[WANTED:[0-9]+]], {{[xw]}}0 + ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 ; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]: ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] - ; w0 below is a reasonable guess but could change: it certainly comes into the - ; function there. -; CHECK-NEXT: cmp w[[OLD]], w0 +; CHECK-NEXT: cmp w[[OLD]], w[[WANTED]] ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb - -; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]] ret i32 %old } @@ -963,6 +967,8 @@ define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ; As above, w1 is a reasonable guess. ; CHECK: stxr [[STATUS:w[0-9]+]], x1, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb ; CHECK: str x[[OLD]], -- 2.34.1