From: Eli Friedman
Date: Wed, 31 Aug 2011 00:31:29 +0000 (+0000)
Subject: Some 64-bit atomic operations on ARM. 64-bit cmpxchg coming next.
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=2bdffe488203a08a2ca98548a157e0eaf39d4b2d;p=oota-llvm.git

Some 64-bit atomic operations on ARM. 64-bit cmpxchg coming next.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138845 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index f1e6b70453f..37e895650b1 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -254,6 +254,8 @@ private:
 
   SDNode *SelectConcatVector(SDNode *N);
 
+  SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+
   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
   /// inline asm expressions.
   virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
@@ -2309,6 +2311,21 @@ SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
   return PairDRegs(VT, N->getOperand(0), N->getOperand(1));
 }
 
+SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+  SDValue Chain = Node->getOperand(0);
+  SDValue In1 = Node->getOperand(1);
+  SDValue In2L = Node->getOperand(2);
+  SDValue In2H = Node->getOperand(3);
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+  const SDValue Ops[] = { In1, In2L, In2H, Chain};
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+                                           MVT::i32, MVT::i32, MVT::Other, Ops,
+                                           array_lengthof(Ops));
+  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+  return ResNode;
+}
+
 SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   DebugLoc dl = N->getDebugLoc();
 
@@ -3089,6 +3106,21 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
 
   case ISD::CONCAT_VECTORS:
     return SelectConcatVector(N);
+
+  case ARMISD::ATOMOR64_DAG:
+    return SelectAtomic64(N, ARM::ATOMOR6432);
+  case ARMISD::ATOMXOR64_DAG:
+    return SelectAtomic64(N, ARM::ATOMXOR6432);
+  case ARMISD::ATOMADD64_DAG:
+    return SelectAtomic64(N, ARM::ATOMADD6432);
+  case ARMISD::ATOMSUB64_DAG:
+    return SelectAtomic64(N, ARM::ATOMSUB6432);
+  case ARMISD::ATOMNAND64_DAG:
+    return SelectAtomic64(N, ARM::ATOMNAND6432);
+  case ARMISD::ATOMAND64_DAG:
+    return SelectAtomic64(N, ARM::ATOMAND6432);
+  case ARMISD::ATOMSWAP64_DAG:
+    return SelectAtomic64(N, ARM::ATOMSWAP6432);
   }
 
   return SelectCode(N);
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 52684663af2..550c8daaee5 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -611,6 +611,13 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     // normally.
     setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
     setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+    // Custom lowering for 64-bit ops
+    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
     // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
     setInsertFencesForAtomic(true);
   } else {
@@ -4846,6 +4853,29 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
                      Op.getOperand(1), Op.getOperand(2));
 }
 
+static void
+ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
+                        SelectionDAG &DAG, unsigned NewOp) {
+  EVT T = Node->getValueType(0);
+  DebugLoc dl = Node->getDebugLoc();
+  assert (T == MVT::i64 && "Only know how to expand i64 atomics");
+
+  SDValue Chain = Node->getOperand(0);
+  SDValue In1 = Node->getOperand(1);
+  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             Node->getOperand(2), DAG.getIntPtrConstant(0));
+  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             Node->getOperand(2), DAG.getIntPtrConstant(1));
+  SDValue Ops[] = { Chain, In1, In2L, In2H };
+  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+  SDValue Result =
+    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
+                            cast<MemSDNode>(Node)->getMemOperand());
+  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+  Results.push_back(Result.getValue(2));
+}
+
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Don't know how to custom lower this!");
@@ -4918,6 +4948,29 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::SRA:
     Res = Expand64BitShift(N, DAG, Subtarget);
     break;
+  case ISD::ATOMIC_LOAD_ADD:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_AND:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, ARMISD::ATOMAND64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_NAND:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_OR:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, ARMISD::ATOMOR64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_SUB:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_XOR:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG);
+    return;
+  case ISD::ATOMIC_SWAP:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG);
+    return;
+  //case ISD::ATOMIC_CMP_SWAP:
+  //  ReplaceATOMIC_CMPXCHG_64(N, Results, DAG);
   }
   if (Res.getNode())
     Results.push_back(Res);
@@ -5237,6 +5290,113 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   return BB;
 }
 
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
+                                      unsigned Op1, unsigned Op2,
+                                      bool NeedsCarry) const {
+  // This also handles ATOMIC_SWAP, indicated by Op1==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *MF = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned destlo = MI->getOperand(0).getReg();
+  unsigned desthi = MI->getOperand(1).getReg();
+  unsigned ptr = MI->getOperand(2).getReg();
+  unsigned vallo = MI->getOperand(3).getReg();
+  unsigned valhi = MI->getOperand(4).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  if (isThumb2) {
+    MRI.constrainRegClass(destlo, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(desthi, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+  }
+
+  unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD;
+  unsigned strOpc = isThumb2 ? ARM::t2STREXD : ARM::STREXD;
+
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  TargetRegisterClass *TRC =
+    isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+  unsigned storesuccess = MRI.createVirtualRegister(TRC);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldrexd r2, r3, ptr
+  //   <binop> r0, r2, incr
+  //   <binop> r1, r3, incr
+  //   strexd storesuccess, r0, r1, ptr
+  //   cmp storesuccess, #0
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  //
+  // Note that the registers are explicitly specified because there is not any
+  // way to force the register allocator to allocate a register pair.
+  //
+  // FIXME: The hardcoded registers are not necessary for Thumb2, but we
+  // need to properly enforce the restriction that the two output registers
+  // for ldrexd must be different.
+  BB = loopMBB;
+  // Load
+  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
+                 .addReg(ARM::R2, RegState::Define)
+                 .addReg(ARM::R3, RegState::Define).addReg(ptr));
+  // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
+  BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo).addReg(ARM::R2);
+  BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi).addReg(ARM::R3);
+  if (Op1) {
+    // Perform binary operation
+    AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), ARM::R0)
+                   .addReg(destlo).addReg(vallo))
+        .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
+    AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), ARM::R1)
+                   .addReg(desthi).addReg(valhi)).addReg(0);
+  } else {
+    // Copy to physregs for strexd
+    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(vallo);
+    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(valhi);
+  }
+
+  // Store
+  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
+                 .addReg(ARM::R0).addReg(ARM::R1).addReg(ptr));
+  // Cmp+jump
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(storesuccess).addImm(0));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
 static MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB,
                                     MachineBasicBlock *Succ) {
   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
@@ -5374,6 +5534,25 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
 
   case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
   case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
+
+  case ARM::ATOMADD6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr,
+                              isThumb2 ? ARM::t2ADCrr : ARM::ADCrr, true);
+  case ARM::ATOMSUB6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, true);
+  case ARM::ATOMOR6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr,
+                              isThumb2 ? ARM::t2ORRrr : ARM::ORRrr, false);
+  case ARM::ATOMXOR6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr,
+                              isThumb2 ? ARM::t2EORrr : ARM::EORrr, false);
+  case ARM::ATOMAND6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr,
+                              isThumb2 ? ARM::t2ANDrr : ARM::ANDrr, false);
+  case ARM::ATOMSWAP6432:
+    return EmitAtomicBinary64(MI, BB, 0, 0, false);
+
   case ARM::tMOVCCr_pseudo: {
     // To "insert" a SELECT_CC instruction, we actually have to insert the
     // diamond control-flow pattern.  The incoming instruction knows the
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 92164aef00b..3641d0dfe77 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -211,7 +211,17 @@ namespace llvm {
       VST4_UPD,
       VST2LN_UPD,
       VST3LN_UPD,
-      VST4LN_UPD
+      VST4LN_UPD,
+
+      // 64-bit atomic ops (value split into two registers)
+      ATOMADD64_DAG,
+      ATOMSUB64_DAG,
+      ATOMOR64_DAG,
+      ATOMXOR64_DAG,
+      ATOMAND64_DAG,
+      ATOMNAND64_DAG,
+      ATOMSWAP64_DAG,
+      ATOMCMPXCHG64_DAG
     };
   }
 
@@ -493,6 +503,11 @@ namespace llvm {
                                         MachineBasicBlock *BB,
                                         unsigned Size,
                                         unsigned BinOpcode) const;
+    MachineBasicBlock *EmitAtomicBinary64(MachineInstr *MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned Op1,
+                                          unsigned Op2,
+                                          bool NeedsCarry) const;
     MachineBasicBlock * EmitAtomicBinaryMinMax(MachineInstr *MI,
                                                MachineBasicBlock *BB,
                                                unsigned Size,
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 08b0954efa8..010f7c07dbf 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -69,6 +69,8 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
 
 def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTARMatomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
+                                              SDTCisPtrTy<2>, SDTCisInt<3>, SDTCisInt<4>]>;
 
 def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
                                             [SDTCisSameAs<0, 2>,
@@ -162,6 +164,28 @@ def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
 
 def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
 
+def ARMAtomAdd64 : SDNode<"ARMISD::ATOMADD64_DAG", SDTARMatomicBinary,
+                          [SDNPHasChain, SDNPMayStore,
+                           SDNPMayLoad, SDNPMemOperand]>;
+def ARMAtomSub64 : SDNode<"ARMISD::ATOMSUB64_DAG", SDTARMatomicBinary,
+                          [SDNPHasChain, SDNPMayStore,
+                           SDNPMayLoad, SDNPMemOperand]>;
+def ARMAtomOr64 : SDNode<"ARMISD::ATOMOR64_DAG", SDTARMatomicBinary,
+                         [SDNPHasChain, SDNPMayStore,
+                          SDNPMayLoad, SDNPMemOperand]>;
+def ARMAtomXor64 : SDNode<"ARMISD::ATOMXOR64_DAG", SDTARMatomicBinary,
+                          [SDNPHasChain, SDNPMayStore,
+                           SDNPMayLoad, SDNPMemOperand]>;
+def ARMAtomAnd64 : SDNode<"ARMISD::ATOMAND64_DAG", SDTARMatomicBinary,
+                          [SDNPHasChain, SDNPMayStore,
+                           SDNPMayLoad, SDNPMemOperand]>;
+def ARMAtomNand64 : SDNode<"ARMISD::ATOMNAND64_DAG", SDTARMatomicBinary,
+                           [SDNPHasChain, SDNPMayStore,
+                            SDNPMayLoad, SDNPMemOperand]>;
+def ARMAtomSwap64 : SDNode<"ARMISD::ATOMSWAP64_DAG", SDTARMatomicBinary,
+                           [SDNPHasChain, SDNPMayStore,
+                            SDNPMayLoad, SDNPMemOperand]>;
+
 //===----------------------------------------------------------------------===//
 // ARM Instruction Predicate Definitions.
 //
@@ -1609,6 +1633,32 @@ PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
                    [(ARMcallseq_start timm:$amt)]>;
 }
 
+// Atomic pseudo-insts which will be lowered to ldrexd/strexd loops.
+// (These pseudos use hand-written selection code.)
+let usesCustomInserter = 1, Uses = [CPSR] in {
+def ATOMOR6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                            (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                            NoItinerary, []>;
+def ATOMXOR6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                             (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                             NoItinerary, []>;
+def ATOMADD6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                             (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                             NoItinerary, []>;
+def ATOMSUB6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                             (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                             NoItinerary, []>;
+def ATOMNAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                             (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                             NoItinerary, []>;
+def ATOMSWAP6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+}
+
 def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "", []>,
           Requires<[IsARM, HasV6T2]> {
   let Inst{27-16} = 0b001100100000;
diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll
new file mode 100644
index 00000000000..186581ca8fb
--- /dev/null
+++ b/test/CodeGen/ARM/atomic-64bit.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s
+
+define i64 @test1(i64* %ptr, i64 %val) {
+; CHECK: test1
+; CHECK: dmb ish
+; CHECK: ldrexd r2, r3
+; CHECK: adds r0, r2
+; CHECK: adc r1, r3
+; CHECK: strexd {{r[0-9]+}}, r0, r1
+; CHECK: cmp
+; CHECK: bne
+; CHECK: dmb ish
+  %r = atomicrmw add i64* %ptr, i64 %val seq_cst
+  ret i64 %r
+}
+
+define i64 @test2(i64* %ptr, i64 %val) {
+; CHECK: test2
+; CHECK: dmb ish
+; CHECK: ldrexd r2, r3
+; CHECK: subs r0, r2
+; CHECK: sbc r1, r3
+; CHECK: strexd {{r[0-9]+}}, r0, r1
+; CHECK: cmp
+; CHECK: bne
+; CHECK: dmb ish
+  %r = atomicrmw sub i64* %ptr, i64 %val seq_cst
+  ret i64 %r
+}
+
+define i64 @test3(i64* %ptr, i64 %val) {
+; CHECK: test3
+; CHECK: dmb ish
+; CHECK: ldrexd r2, r3
+; CHECK: and r0, r2
+; CHECK: and r1, r3
+; CHECK: strexd {{r[0-9]+}}, r0, r1
+; CHECK: cmp
+; CHECK: bne
+; CHECK: dmb ish
+  %r = atomicrmw and i64* %ptr, i64 %val seq_cst
+  ret i64 %r
+}
+
+define i64 @test4(i64* %ptr, i64 %val) {
+; CHECK: test4
+; CHECK: dmb ish
+; CHECK: ldrexd r2, r3
+; CHECK: orr r0, r2
+; CHECK: orr r1, r3
+; CHECK: strexd {{r[0-9]+}}, r0, r1
+; CHECK: cmp
+; CHECK: bne
+; CHECK: dmb ish
+  %r = atomicrmw or i64* %ptr, i64 %val seq_cst
+  ret i64 %r
+}
+
+define i64 @test5(i64* %ptr, i64 %val) {
+; CHECK: test5
+; CHECK: dmb ish
+; CHECK: ldrexd r2, r3
+; CHECK: eor r0, r2
+; CHECK: eor r1, r3
+; CHECK: strexd {{r[0-9]+}}, r0, r1
+; CHECK: cmp
+; CHECK: bne
+; CHECK: dmb ish
+  %r = atomicrmw xor i64* %ptr, i64 %val seq_cst
+  ret i64 %r
+}
+
+define i64 @test6(i64* %ptr, i64 %val) {
+; CHECK: test6
+; CHECK: dmb ish
+; CHECK: ldrexd r2, r3
+; CHECK: strexd {{r[0-9]+}}, r0, r1
+; CHECK: cmp
+; CHECK: bne
+; CHECK: dmb ish
+  %r = atomicrmw xchg i64* %ptr, i64 %val seq_cst
+  ret i64 %r
+}
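
For reference, the loop shape this lowering aims to produce for "atomicrmw add i64 ... seq_cst" looks roughly like the following sketch, mirroring the block comment in EmitAtomicBinary64 and the CHECK lines in test1 above. Only r0-r3 are hardcoded by the pseudo expansion; the address register (r4 here), the incoming-value registers (r6/r7) and the strexd status register (r5) are illustrative choices, not guaranteed output:

        dmb     ish                  @ leading fence from setInsertFencesForAtomic
    .Lretry:
        ldrexd  r2, r3, [r4]         @ exclusive load of the 64-bit value (lo, hi)
        adds    r0, r2, r6           @ low word add, sets carry
        adc     r1, r3, r7           @ high word add, consumes carry
        strexd  r5, r0, r1, [r4]     @ try to store the pair; r5 == 0 on success
        cmp     r5, #0
        bne     .Lretry              @ reservation lost, retry
        dmb     ish                  @ trailing fence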