From 05a3f9120a394d095c031a2d1b6d19ff90f7e8ba Mon Sep 17 00:00:00 2001 From: Quentin Colombet Date: Fri, 6 Mar 2015 22:42:10 +0000 Subject: [PATCH] [AArch64][LoadStoreOptimizer] Generate LDP + SXTW instead of LD[U]R + LD[U]RSW. Teach the load store optimizer how to sign extend a result of a load pair when it helps create more pairs. The rationale is that loads are more expensive than sign extensions, so if we gather some in one instruction this is better! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@231527 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AArch64/AArch64LoadStoreOptimizer.cpp | 127 ++++++++++++++++-- test/CodeGen/AArch64/arm64-ldp.ll | 94 +++++++++++++ 2 files changed, 210 insertions(+), 11 deletions(-) diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 8463ce650bd..b1499e2c06d 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -63,16 +63,24 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // If a matching instruction is found, MergeForward is set to true if the // merge is to remove the first instruction and replace the second with // a pair-wise insn, and false if the reverse is true. + // \p SExtIdx[out] gives the index of the result of the load pair that + // must be extended. The value of SExtIdx assumes that the paired load + // produces the value in this order: (I, returned iterator), i.e., + // -1 means no value has to be extended, 0 means I, and 1 means the + // returned iterator. MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, + bool &MergeForward, int &SExtIdx, unsigned Limit); // Merge the two instructions indicated into a single pair-wise instruction. // If MergeForward is true, erase the first instruction and fold its // operation into the second. If false, the reverse. 
Return the instruction // following the first instruction (which may change during processing). + // \p SExtIdx index of the result that must be extended for a paired load. + // -1 means none, 0 means I, and 1 means Paired. MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, bool MergeForward); + MachineBasicBlock::iterator Paired, bool MergeForward, + int SExtIdx); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -181,6 +189,43 @@ int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { } } +static unsigned getMatchingNonSExtOpcode(unsigned Opc, + bool *IsValidLdStrOpc = nullptr) { + if (IsValidLdStrOpc) + *IsValidLdStrOpc = true; + switch (Opc) { + default: + if (IsValidLdStrOpc) + *IsValidLdStrOpc = false; + return UINT_MAX; + case AArch64::STRDui: + case AArch64::STURDi: + case AArch64::STRQui: + case AArch64::STURQi: + case AArch64::STRWui: + case AArch64::STURWi: + case AArch64::STRXui: + case AArch64::STURXi: + case AArch64::LDRDui: + case AArch64::LDURDi: + case AArch64::LDRQui: + case AArch64::LDURQi: + case AArch64::LDRWui: + case AArch64::LDURWi: + case AArch64::LDRXui: + case AArch64::LDURXi: + case AArch64::STRSui: + case AArch64::STURSi: + case AArch64::LDRSui: + case AArch64::LDURSi: + return Opc; + case AArch64::LDRSWui: + return AArch64::LDRWui; + case AArch64::LDURSWi: + return AArch64::LDURWi; + } +} + static unsigned getMatchingPairOpcode(unsigned Opc) { switch (Opc) { default: @@ -282,7 +327,7 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - bool MergeForward) { + bool MergeForward, int SExtIdx) { MachineBasicBlock::iterator NextI = I; ++NextI; // If NextI is the second of the two instructions to be merged, we need @@ -292,11 +337,13 @@ 
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (NextI == Paired) ++NextI; - bool IsUnscaled = isUnscaledLdst(I->getOpcode()); + unsigned Opc = + SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); + bool IsUnscaled = isUnscaledLdst(Opc); int OffsetStride = IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1; - unsigned NewOpc = getMatchingPairOpcode(I->getOpcode()); + unsigned NewOpc = getMatchingPairOpcode(Opc); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; @@ -311,6 +358,11 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, Paired->getOperand(2).getImm() + OffsetStride) { RtMI = Paired; Rt2MI = I; + // Here we swapped the assumption made for SExtIdx. + // I.e., we turn ldp I, Paired into ldp Paired, I. + // Update the index accordingly. + if (SExtIdx != -1) + SExtIdx = (SExtIdx + 1) % 2; } else { RtMI = I; Rt2MI = Paired; @@ -337,8 +389,47 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, DEBUG(dbgs() << " "); DEBUG(Paired->print(dbgs())); DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); + + if (SExtIdx != -1) { + // Generate the sign extension for the proper result of the ldp. + // I.e., with X1, that would be: + // %W1 = KILL %W1, %X1 + // %X1 = SBFMXri %X1, 0, 31 + MachineOperand &DstMO = MIB->getOperand(SExtIdx); + // Right now, DstMO has the extended register, since it comes from an + // extended opcode. + unsigned DstRegX = DstMO.getReg(); + // Get the W variant of that register. + unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); + // Update the result of LDP to use the W instead of the X variant. 
+ DstMO.setReg(DstRegW); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + // Make the machine verifier happy by providing a definition for + // the X register. + // Insert this definition right after the generated LDP, i.e., before + // InsertionPoint. + MachineInstrBuilder MIBKill = + BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(TargetOpcode::KILL), DstRegW) + .addReg(DstRegW) + .addReg(DstRegX, RegState::Define); + MIBKill->getOperand(2).setImplicit(); + // Create the sign extension. + MachineInstrBuilder MIBSXTW = + BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::SBFMXri), DstRegX) + .addReg(DstRegX) + .addImm(0) + .addImm(31); + (void)MIBSXTW; + DEBUG(dbgs() << " Extend operand:\n "); + DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); + DEBUG(dbgs() << "\n"); + } else { + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + } // Erase the old instructions. I->eraseFromParent(); @@ -396,7 +487,8 @@ static int alignTo(int Num, int PowOf2) { /// be combined with the current instruction into a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, unsigned Limit) { + bool &MergeForward, int &SExtIdx, + unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; MachineInstr *FirstMI = I; @@ -436,7 +528,19 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // Now that we know this is a real instruction, count it. ++Count; - if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) { + bool CanMergeOpc = Opc == MI->getOpcode(); + SExtIdx = -1; + if (!CanMergeOpc) { + bool IsValidLdStrOpc; + unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); + if (!IsValidLdStrOpc) + continue; + // Opc will be the first instruction in the pair. + SExtIdx = NonSExtOpc == (unsigned)Opc ? 
1 : 0; + CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); + } + + if (CanMergeOpc && MI->getOperand(2).isImm()) { // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just @@ -823,13 +927,14 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { } // Look ahead up to ScanLimit instructions for a pairable instruction. bool MergeForward = false; + int SExtIdx = -1; MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, MergeForward, ScanLimit); + findMatchingInsn(MBBI, MergeForward, SExtIdx, ScanLimit); if (Paired != E) { // Merge the loads into a pair. Keeping the iterator straight is a // pain, so we let the merge routine tell us what the next instruction // is after it's done mucking about. - MBBI = mergePairedInsns(MBBI, Paired, MergeForward); + MBBI = mergePairedInsns(MBBI, Paired, MergeForward, SExtIdx); Modified = true; ++NumPairCreated; diff --git a/test/CodeGen/AArch64/arm64-ldp.ll b/test/CodeGen/AArch64/arm64-ldp.ll index 8642e0d9742..a192eab112f 100644 --- a/test/CodeGen/AArch64/arm64-ldp.ll +++ b/test/CodeGen/AArch64/arm64-ldp.ll @@ -24,6 +24,33 @@ define i64 @ldp_sext_int(i32* %p) nounwind { ret i64 %add } +; CHECK-LABEL: ldp_half_sext_res0_int: +; CHECK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0] +; CHECK: sxtw x[[DST1]], w[[DST1]] +define i64 @ldp_half_sext_res0_int(i32* %p) nounwind { + %tmp = load i32, i32* %p, align 4 + %add.ptr = getelementptr inbounds i32, i32* %p, i64 1 + %tmp1 = load i32, i32* %add.ptr, align 4 + %sexttmp = sext i32 %tmp to i64 + %sexttmp1 = zext i32 %tmp1 to i64 + %add = add nsw i64 %sexttmp1, %sexttmp + ret i64 %add +} + +; CHECK-LABEL: ldp_half_sext_res1_int: +; CHECK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0] +; CHECK: sxtw x[[DST2]], w[[DST2]] +define i64 @ldp_half_sext_res1_int(i32* %p) nounwind { + %tmp = load i32, i32* 
%p, align 4 + %add.ptr = getelementptr inbounds i32, i32* %p, i64 1 + %tmp1 = load i32, i32* %add.ptr, align 4 + %sexttmp = zext i32 %tmp to i64 + %sexttmp1 = sext i32 %tmp1 to i64 + %add = add nsw i64 %sexttmp1, %sexttmp + ret i64 %add +} + + ; CHECK: ldp_long ; CHECK: ldp define i64 @ldp_long(i64* %p) nounwind { @@ -83,6 +110,39 @@ define i64 @ldur_sext_int(i32* %a) nounwind { ret i64 %tmp3 } +define i64 @ldur_half_sext_int_res0(i32* %a) nounwind { +; LDUR_CHK: ldur_half_sext_int_res0 +; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8] +; LDUR_CHK: sxtw x[[DST1]], w[[DST1]] +; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; LDUR_CHK-NEXT: ret + %p1 = getelementptr inbounds i32, i32* %a, i32 -1 + %tmp1 = load i32, i32* %p1, align 2 + %p2 = getelementptr inbounds i32, i32* %a, i32 -2 + %tmp2 = load i32, i32* %p2, align 2 + %sexttmp1 = zext i32 %tmp1 to i64 + %sexttmp2 = sext i32 %tmp2 to i64 + %tmp3 = add i64 %sexttmp1, %sexttmp2 + ret i64 %tmp3 +} + +define i64 @ldur_half_sext_int_res1(i32* %a) nounwind { +; LDUR_CHK: ldur_half_sext_int_res1 +; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8] +; LDUR_CHK: sxtw x[[DST2]], w[[DST2]] +; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; LDUR_CHK-NEXT: ret + %p1 = getelementptr inbounds i32, i32* %a, i32 -1 + %tmp1 = load i32, i32* %p1, align 2 + %p2 = getelementptr inbounds i32, i32* %a, i32 -2 + %tmp2 = load i32, i32* %p2, align 2 + %sexttmp1 = sext i32 %tmp1 to i64 + %sexttmp2 = zext i32 %tmp2 to i64 + %tmp3 = add i64 %sexttmp1, %sexttmp2 + ret i64 %tmp3 +} + + define i64 @ldur_long(i64* %a) nounwind ssp { ; LDUR_CHK: ldur_long ; LDUR_CHK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16] @@ -153,6 +213,40 @@ define i64 @pairUpBarelyInSext(i32* %a) nounwind ssp { ret i64 %tmp3 } +define i64 @pairUpBarelyInHalfSextRes0(i32* %a) nounwind ssp { +; LDUR_CHK: pairUpBarelyInHalfSextRes0 +; LDUR_CHK-NOT: ldur +; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256] +; LDUR_CHK: 
sxtw x[[DST1]], w[[DST1]] +; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; LDUR_CHK-NEXT: ret + %p1 = getelementptr inbounds i32, i32* %a, i64 -63 + %tmp1 = load i32, i32* %p1, align 2 + %p2 = getelementptr inbounds i32, i32* %a, i64 -64 + %tmp2 = load i32, i32* %p2, align 2 + %sexttmp1 = zext i32 %tmp1 to i64 + %sexttmp2 = sext i32 %tmp2 to i64 + %tmp3 = add i64 %sexttmp1, %sexttmp2 + ret i64 %tmp3 +} + +define i64 @pairUpBarelyInHalfSextRes1(i32* %a) nounwind ssp { +; LDUR_CHK: pairUpBarelyInHalfSextRes1 +; LDUR_CHK-NOT: ldur +; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256] +; LDUR_CHK: sxtw x[[DST2]], w[[DST2]] +; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; LDUR_CHK-NEXT: ret + %p1 = getelementptr inbounds i32, i32* %a, i64 -63 + %tmp1 = load i32, i32* %p1, align 2 + %p2 = getelementptr inbounds i32, i32* %a, i64 -64 + %tmp2 = load i32, i32* %p2, align 2 + %sexttmp1 = sext i32 %tmp1 to i64 + %sexttmp2 = zext i32 %tmp2 to i64 + %tmp3 = add i64 %sexttmp1, %sexttmp2 + ret i64 %tmp3 +} + define i64 @pairUpBarelyOut(i64* %a) nounwind ssp { ; LDUR_CHK: pairUpBarelyOut ; LDUR_CHK-NOT: ldp -- 2.34.1