From: Peter Collingbourne Date: Fri, 5 Jun 2015 18:01:28 +0000 (+0000) Subject: Revert r238473, "Thumb2: Modify codegen for memcpy intrinsic to prefer LDM/STM." X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=f5c04a9da7fb273981764815905888006c53218b;p=oota-llvm.git Revert r238473, "Thumb2: Modify codegen for memcpy intrinsic to prefer LDM/STM." as it caused miscompilations and assertion failures (PR23768, http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20150601/280380.html). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239169 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 12dd4c1698e..47c8400a668 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1125,7 +1125,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; case ARMISD::VBSL: return "ARMISD::VBSL"; - case ARMISD::MCOPY: return "ARMISD::MCOPY"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; @@ -7676,59 +7675,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, } } -/// \brief Lowers MCOPY to either LDMIA/STMIA or LDMIA_UPD/STMID_UPD depending -/// on whether the result is used. This is done as a post-isel lowering instead -/// of as a custom inserter because we need the use list from the SDNode. -static void LowerMCOPY(const ARMSubtarget *Subtarget, MachineInstr *MI, - SDNode *Node) { - bool isThumb1 = Subtarget->isThumb1Only(); - bool isThumb2 = Subtarget->isThumb2(); - const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); - - DebugLoc dl = MI->getDebugLoc(); - MachineBasicBlock *BB = MI->getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - MachineInstrBuilder LD, ST; - if (isThumb1 || Node->hasAnyUseOfValue(1)) { - LD = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD - : isThumb1 ? ARM::tLDMIA_UPD - : ARM::LDMIA_UPD)) - .addOperand(MI->getOperand(1)); - } else { - LD = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA)); - } - - if (isThumb1 || Node->hasAnyUseOfValue(0)) { - ST = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD - : isThumb1 ? ARM::tSTMIA_UPD - : ARM::STMIA_UPD)) - .addOperand(MI->getOperand(0)); - } else { - ST = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA)); - } - - LD.addOperand(MI->getOperand(3)).addImm(ARMCC::AL).addReg(0); - ST.addOperand(MI->getOperand(2)).addImm(ARMCC::AL).addReg(0); - - for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) { - unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass - : &ARM::GPRRegClass); - LD.addReg(TmpReg, RegState::Define); - ST.addReg(TmpReg, RegState::Kill); - } - - MI->eraseFromParent(); -} - void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { - if (MI->getOpcode() == ARM::MCOPY) { - LowerMCOPY(Subtarget, MI, Node); - return; - } - const MCInstrDesc *MCID = &MI->getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 94b6ec62214..c0b329c5a1e 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -189,10 +189,6 @@ namespace llvm { // Vector bitwise select VBSL, - // Pseudo-instruction representing a memory copy using ldm/stm - // instructions. - MCOPY, - // Vector load N-element structure to all lanes: VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD3DUP, diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 5a8b3dae6df..b8cac135baf 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -73,10 +73,6 @@ def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; -def SDT_ARMMCOPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, - SDTCisVT<2, i32>, SDTCisVT<3, i32>, - SDTCisVT<4, i32>]>; - def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, @@ -183,10 +179,6 @@ def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>; def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>; -def ARMmcopy : SDNode<"ARMISD::MCOPY", SDT_ARMMCOPY, - [SDNPHasChain, SDNPInGlue, SDNPOutGlue, - SDNPMayStore, SDNPMayLoad]>; - //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. // @@ -4586,13 +4578,6 @@ let usesCustomInserter = 1 in { [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>; } -let hasPostISelHook = 1 in { - def MCOPY : PseudoInst< - (outs GPR:$newdst, GPR:$newsrc), (ins GPR:$dst, GPR:$src, i32imm:$nreg), - NoItinerary, - [(set GPR:$newdst, GPR:$newsrc, (ARMmcopy GPR:$dst, GPR:$src, imm:$nreg))]>; -} - def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i8; }]>; diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index beed0fb432f..a59cf985110 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -164,38 +164,41 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, unsigned VTSize = 4; unsigned i = 0; // Emit a maximum of 4 loads in Thumb1 since we have fewer registers - const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6; + const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6; SDValue TFOps[6]; SDValue Loads[6]; uint64_t SrcOff = 0, DstOff = 0; - // FIXME: We should invent a VMCOPY pseudo-instruction that lowers to - // VLDM/VSTM and make this code emit it when appropriate. This would reduce - // pressure on the general purpose registers. However this seems harder to map - // onto the register allocator's view of the world. - - // The number of MCOPY pseudo-instructions to emit. We use up to MaxLoadsInLDM - // registers per mcopy, which will get lowered into ldm/stm later on. This is - // a lower bound on the number of MCOPY operations we must emit. - unsigned NumMCOPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM; - - SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue); - - for (unsigned I = 0; I != NumMCOPYs; ++I) { - // Evenly distribute registers among MCOPY operations to reduce register - // pressure. - unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMCOPYs; - unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps; - - Dst = DAG.getNode(ARMISD::MCOPY, dl, VTs, Chain, Dst, Src, - DAG.getConstant(NumRegs, dl, MVT::i32)); - Src = Dst.getValue(1); - Chain = Dst.getValue(2); + // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the + // same number of stores. The loads and stores will get combined into + // ldm/stm later on. + while (EmittedNumMemOps < NumMemOps) { + for (i = 0; + i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { + Loads[i] = DAG.getLoad(VT, dl, Chain, + DAG.getNode(ISD::ADD, dl, MVT::i32, Src, + DAG.getConstant(SrcOff, dl, MVT::i32)), + SrcPtrInfo.getWithOffset(SrcOff), isVolatile, + false, false, 0); + TFOps[i] = Loads[i].getValue(1); + SrcOff += VTSize; + } + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + makeArrayRef(TFOps, i)); - DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize); - SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize); + for (i = 0; + i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { + TFOps[i] = DAG.getStore(Chain, dl, Loads[i], + DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, + DAG.getConstant(DstOff, dl, MVT::i32)), + DstPtrInfo.getWithOffset(DstOff), + isVolatile, false, 0); + DstOff += VTSize; + } + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + makeArrayRef(TFOps, i)); - EmittedNumMemOps = NextEmittedNumMemOps; + EmittedNumMemOps += i; } if (BytesLeft == 0) diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 188410a5123..f88ac30a91a 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -744,21 +744,10 @@ void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { O << "{"; - - // The backend may have given us a register list in non-ascending order. Sort - // it now. - std::vector RegOps(MI->size() - OpNum); - std::copy(MI->begin() + OpNum, MI->end(), RegOps.begin()); - std::sort(RegOps.begin(), RegOps.end(), - [this](const MCOperand &O1, const MCOperand &O2) -> bool { - return MRI.getEncodingValue(O1.getReg()) < - MRI.getEncodingValue(O2.getReg()); - }); - - for (unsigned i = 0, e = RegOps.size(); i != e; ++i) { - if (i != 0) + for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) { + if (i != OpNum) O << ", "; - printRegName(O, RegOps[i].getReg()); + printRegName(O, MI->getOperand(i).getReg()); } O << "}"; } diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index c2d9c17cf37..0ab1ff906c9 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -125,10 +125,7 @@ namespace { { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 }, - // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent. - // tSTMIA_UPD is a change in semantics which can only be used if the base - // register is killed. This difference is correctly handled elsewhere. - { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, + // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 } }; @@ -435,14 +432,6 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, isLdStMul = true; break; } - case ARM::t2STMIA: { - // If the base register is killed, we don't care what its value is after the - // instruction, so we can use an updating STMIA. - if (!MI->getOperand(0).isKill()) - return false; - - break; - } case ARM::t2LDMIA_RET: { unsigned BaseReg = MI->getOperand(1).getReg(); if (BaseReg != ARM::SP) @@ -500,12 +489,6 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // Add the 16-bit load / store instruction. DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc)); - - // tSTMIA_UPD takes a defining register operand. We've already checked that - // the register is killed, so mark it as dead here. - if (Entry.WideOpc == ARM::t2STMIA) - MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead); - if (!isLdStMul) { MIB.addOperand(MI->getOperand(0)); MIB.addOperand(MI->getOperand(1)); diff --git a/test/CodeGen/Thumb/ldm-stm-base-materialization.ll b/test/CodeGen/Thumb/ldm-stm-base-materialization.ll index 0be796eb8f8..916e5ea299a 100644 --- a/test/CodeGen/Thumb/ldm-stm-base-materialization.ll +++ b/test/CodeGen/Thumb/ldm-stm-base-materialization.ll @@ -6,89 +6,22 @@ target triple = "thumbv6m-none--eabi" @b = external global i32* ; Function Attrs: nounwind -define void @foo24() #0 { +define void @foo() #0 { entry: -; CHECK-LABEL: foo24: -; CHECK: ldr r[[LB:[0-9]]], .LCPI -; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4 -; CHECK: ldr r[[SB:[0-9]]], .LCPI -; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4 -; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} -; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} -; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} -; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} - %0 = load i32*, i32** @a, align 4 - %arrayidx = getelementptr inbounds i32, i32* %0, i32 1 - %1 = bitcast i32* %arrayidx to i8* - %2 = load i32*, i32** @b, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1 - %3 = bitcast i32* %arrayidx1 to i8* - tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 24, i32 4, i1 false) - ret void -} - -define void @foo28() #0 { -entry: -; CHECK-LABEL: foo28: -; CHECK: ldr r[[LB:[0-9]]], .LCPI -; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4 -; CHECK: ldr r[[SB:[0-9]]], .LCPI -; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4 -; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} -; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} -; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]} -; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]} - %0 = load i32*, i32** @a, align 4 - %arrayidx = getelementptr inbounds i32, i32* %0, i32 1 - %1 = bitcast i32* %arrayidx to i8* - %2 = load i32*, i32** @b, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1 - %3 = bitcast i32* %arrayidx1 to i8* - tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false) - ret void -} - -define void @foo32() #0 { -entry: -; CHECK-LABEL: foo32: -; CHECK: ldr r[[LB:[0-9]]], .LCPI -; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4 +; CHECK-LABEL: foo: ; CHECK: ldr r[[SB:[0-9]]], .LCPI -; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4 -; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]} -; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]} -; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]} -; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]} - %0 = load i32*, i32** @a, align 4 - %arrayidx = getelementptr inbounds i32, i32* %0, i32 1 - %1 = bitcast i32* %arrayidx to i8* - %2 = load i32*, i32** @b, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1 - %3 = bitcast i32* %arrayidx1 to i8* - tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 32, i32 4, i1 false) - ret void -} - -define void @foo36() #0 { -entry: -; CHECK-LABEL: foo36: ; CHECK: ldr r[[LB:[0-9]]], .LCPI ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4 -; CHECK: ldr r[[SB:[0-9]]], .LCPI +; CHECK-NEXT: ldm r[[NLB]], ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4 -; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} -; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} -; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} -; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} -; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} -; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} +; CHECK-NEXT: stm r[[NSB]] %0 = load i32*, i32** @a, align 4 %arrayidx = getelementptr inbounds i32, i32* %0, i32 1 %1 = bitcast i32* %arrayidx to i8* %2 = load i32*, i32** @b, align 4 %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1 %3 = bitcast i32* %arrayidx1 to i8* - tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 36, i32 4, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 24, i32 4, i1 false) ret void } diff --git a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll index c4d84341ea2..da2f3f09b28 100644 --- a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll +++ b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll @@ -7,8 +7,8 @@ define void @t1() #0 { entry: ; CHECK-LABEL: t1: ; CHECK: ldr r[[LB:[0-9]]], -; CHECK-NEXT: ldr r[[SB:[0-9]]], ; CHECK-NEXT: ldm r[[LB]]!, +; CHECK-NEXT: ldr r[[SB:[0-9]]], ; CHECK-NEXT: stm r[[SB]]!, ; CHECK-NEXT: ldrb {{.*}}, [r[[LB]]] ; CHECK-NEXT: strb {{.*}}, [r[[SB]]] @@ -21,8 +21,8 @@ define void @t2() #0 { entry: ; CHECK-LABEL: t2: ; CHECK: ldr r[[LB:[0-9]]], -; CHECK-NEXT: ldr r[[SB:[0-9]]], ; CHECK-NEXT: ldm r[[LB]]!, +; CHECK-NEXT: ldr r[[SB:[0-9]]], ; CHECK-NEXT: stm r[[SB]]!, ; CHECK-NEXT: ldrh {{.*}}, [r[[LB]]] ; CHECK-NEXT: ldrb {{.*}}, [r[[LB]], #2]