From a43aca40330cd266decea50f173bcfa4f79c0650 Mon Sep 17 00:00:00 2001 From: Chad Rosier Date: Fri, 25 Sep 2015 17:48:17 +0000 Subject: [PATCH] [AArch64] Add support for generating pre- and post-index load/store pairs. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@248593 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AArch64/AArch64LoadStoreOptimizer.cpp | 216 ++++++++++++++---- test/CodeGen/AArch64/ldst-opt.ll | 126 ++++++++++ test/CodeGen/AArch64/tail-call.ll | 6 +- 3 files changed, 301 insertions(+), 47 deletions(-) diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 783a610ce0c..22a263e3ac5 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -117,6 +117,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { MachineBasicBlock::iterator findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); + // Find an instruction that updates the base register of the ld/st + // instruction. + bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, + unsigned BaseReg, int Offset); + // Merge a pre- or post-index base register update into a ld/st instruction. 
MachineBasicBlock::iterator mergeUpdateInsn(MachineBasicBlock::iterator I, @@ -303,6 +308,26 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::LDRXpre; case AArch64::LDRSWui: return AArch64::LDRSWpre; + case AArch64::LDPSi: + return AArch64::LDPSpre; + case AArch64::LDPDi: + return AArch64::LDPDpre; + case AArch64::LDPQi: + return AArch64::LDPQpre; + case AArch64::LDPWi: + return AArch64::LDPWpre; + case AArch64::LDPXi: + return AArch64::LDPXpre; + case AArch64::STPSi: + return AArch64::STPSpre; + case AArch64::STPDi: + return AArch64::STPDpre; + case AArch64::STPQi: + return AArch64::STPQpre; + case AArch64::STPWi: + return AArch64::STPWpre; + case AArch64::STPXi: + return AArch64::STPXpre; } } @@ -332,19 +357,62 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::LDRXpost; case AArch64::LDRSWui: return AArch64::LDRSWpost; + case AArch64::LDPSi: + return AArch64::LDPSpost; + case AArch64::LDPDi: + return AArch64::LDPDpost; + case AArch64::LDPQi: + return AArch64::LDPQpost; + case AArch64::LDPWi: + return AArch64::LDPWpost; + case AArch64::LDPXi: + return AArch64::LDPXpost; + case AArch64::STPSi: + return AArch64::STPSpost; + case AArch64::STPDi: + return AArch64::STPDpost; + case AArch64::STPQi: + return AArch64::STPQpost; + case AArch64::STPWi: + return AArch64::STPWpost; + case AArch64::STPXi: + return AArch64::STPXpost; + } +} + +static bool isPairedLdSt(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + return false; + case AArch64::LDPSi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: + return true; } } -static const MachineOperand &getLdStRegOp(const MachineInstr *MI) { - return MI->getOperand(0); +static const MachineOperand &getLdStRegOp(const MachineInstr *MI, + unsigned PairedRegOp = 0) { + assert(PairedRegOp < 2 && "Unexpected register 
operand idx."); + unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; + return MI->getOperand(Idx); } static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { - return MI->getOperand(1); + unsigned Idx = isPairedLdSt(MI) ? 2 : 1; + return MI->getOperand(Idx); } static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { - return MI->getOperand(2); + unsigned Idx = isPairedLdSt(MI) ? 3 : 2; + return MI->getOperand(Idx); } MachineBasicBlock::iterator @@ -704,12 +772,25 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) : getPostIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(Update)) - .addOperand(getLdStRegOp(I)) - .addOperand(getLdStBaseOp(I)) - .addImm(Value); + MachineInstrBuilder MIB; + if (!isPairedLdSt(I)) { + // Non-paired instruction. + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value); + } else { + // Paired instruction. 
+ const MachineFunction &MF = *I->getParent()->getParent(); + int Scale = TII->getRegClass(I->getDesc(), 0, TRI, MF)->getSize(); + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I, 0)) + .addOperand(getLdStRegOp(I, 1)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value / Scale); + } (void)MIB; if (IsPreIdx) @@ -731,8 +812,9 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, return NextI; } -static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, - int Offset) { +bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, + MachineInstr *MI, + unsigned BaseReg, int Offset) { switch (MI->getOpcode()) { default: break; @@ -748,17 +830,38 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, // Watch out for 1 << 12 shifted value. if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) break; - // If the instruction has the base register as source and dest and the - // immediate will fit in a signed 9-bit integer, then we have a match. - if (MI->getOperand(0).getReg() == BaseReg && - MI->getOperand(1).getReg() == BaseReg && - MI->getOperand(2).getImm() <= 255 && - MI->getOperand(2).getImm() >= -256) { - // If we have a non-zero Offset, we check that it matches the amount - // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) - return true; + + // The update instruction source and destination register must be the + // same as the load/store base register. + if (MI->getOperand(0).getReg() != BaseReg || + MI->getOperand(1).getReg() != BaseReg) + break; + + bool IsPairedInsn = isPairedLdSt(MemMI); + int UpdateOffset = MI->getOperand(2).getImm(); + // For non-paired load/store instructions, the immediate must fit in a + // signed 9-bit integer. 
+ if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) + break; + + // For paired load/store instructions, the immediate must be a multiple of + // the scaling factor. The scaled offset must also fit into a signed 7-bit + // integer, i.e. lie in the range [-64, 63]. + if (IsPairedInsn) { + const MachineFunction &MF = *MemMI->getParent()->getParent(); + int Scale = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + if (UpdateOffset % Scale != 0) + break; + + int ScaledOffset = UpdateOffset / Scale; + if (ScaledOffset > 63 || ScaledOffset < -64) + break; } + + // If we have a non-zero Offset, we check that it matches the amount + // we're adding to the register. + if (!Offset || Offset == MI->getOperand(2).getImm()) + return true; break; } return false; @@ -771,15 +874,18 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineBasicBlock::iterator MBBI = I; const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = getLdStRegOp(MemMI).getReg(); unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); int Offset = getLdStOffsetOp(MemMI).getImm() * TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); - // If the base register overlaps the destination register, we can't + // If the base register overlaps a destination register, we can't // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Scan forward looking for post-index opportunities. // Updating instructions can't be formed if the memory insn already @@ -804,7 +910,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( ++Count; // If we found a match, return it. 
- if (isMatchingUpdateInsn(MI, BaseReg, Value)) + if (isMatchingUpdateInsn(I, MI, BaseReg, Value)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -826,7 +932,6 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator MBBI = I; const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = getLdStRegOp(MemMI).getReg(); unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); int Offset = getLdStOffsetOp(MemMI).getImm(); unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); @@ -835,10 +940,14 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( // not any matching update. Ditto if the memory offset isn't zero. if (MBBI == B || Offset != 0) return E; - // If the base register overlaps the destination register, we can't + // If the base register overlaps a destination register, we can't // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -857,7 +966,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) + if (isMatchingUpdateInsn(I, MI, BaseReg, RegSize)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -897,6 +1006,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. 
case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -908,7 +1018,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDRXui: case AArch64::LDRWui: case AArch64::LDRSWui: - // do the unscaled versions as well + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -970,6 +1080,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -980,7 +1091,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: - // do the unscaled versions as well + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -990,13 +1101,28 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURDi: case AArch64::LDURQi: case AArch64::LDURWi: - case AArch64::LDURXi: { + case AArch64::LDURXi: + // Paired instructions. + case AArch64::LDPSi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: { // Make sure this is a reg+imm (as opposed to an address reloc). if (!getLdStOffsetOp(MI).isImm()) { ++MBBI; break; } - // Look ahead up to ScanLimit instructions for a mergable instruction. + // Look forward to try to form a post-index instruction. For example, + // ldr x0, [x20] + // add x20, x20, #32 + // merged into: + // ldr x0, [x20], #32 MachineBasicBlock::iterator Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); if (Update != E) { @@ -1026,19 +1152,23 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { ++NumPreFolded; break; } + // The immediate in the load/store is scaled by the size of the register + // being loaded. 
The immediate in the add we're looking for, + // however, is not, so adjust here. + int Value = getLdStOffsetOp(MI).getImm() * + TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) + ->getSize(); + + // FIXME: The immediate in the load/store should be scaled by the size of + // the memory operation, not the size of the register being loaded/stored. + // This works in general, but does not work for the LDPSW instruction, + // which defines two 64-bit registers, but loads 32-bit values. // Look forward to try to find a post-index instruction. For example, // ldr x1, [x0, #64] // add x0, x0, #64 // merged into: // ldr x1, [x0, #64]! - - // The immediate in the load/store is scaled by the size of the register - // being loaded. The immediate in the add we're looking for, - // however, is not, so adjust here. - int Value = MI->getOperand(2).getImm() * - TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) - ->getSize(); Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value); if (Update != E) { // Merge the update into the ld/st. diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll index b2c11c7517c..56f3caaa408 100644 --- a/test/CodeGen/AArch64/ldst-opt.ll +++ b/test/CodeGen/AArch64/ldst-opt.ll @@ -164,6 +164,48 @@ bar: ret void } +; Check the following transform: +; +; (ldp|stp) w1, w2, [x0, #32] +; ... +; add x0, x0, #32 +; -> +; (ldp|stp) w1, w2, [x0, #32]! +; + +define void @load-pair-pre-indexed-word(%struct.word* %ptr) nounwind { +; CHECK-LABEL: load-pair-pre-indexed-word +; CHECK: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! 
+; CHECK-NOT: add x0, x0, #32 +entry: + %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0 + %a1 = load i32, i32* %a, align 4 + %b = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 1 + %b1 = load i32, i32* %b, align 4 + %add = add i32 %a1, %b1 + br label %bar +bar: + %c = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1 + tail call void @bar_word(%s.word* %c, i32 %add) + ret void +} + +define void @store-pair-pre-indexed-word(%struct.word* %ptr, i32 %val) nounwind { +; CHECK-LABEL: store-pair-pre-indexed-word +; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! +; CHECK-NOT: add x0, x0, #32 +entry: + %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0 + store i32 %val, i32* %a, align 4 + %b = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 1 + store i32 %val, i32* %b, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1 + tail call void @bar_word(%s.word* %c, i32 %val) + ret void +} + ; Check the following transform: ; ; add x8, x8, #16 @@ -636,6 +678,90 @@ declare void @use-quadword(<2 x i64>) declare void @use-float(float) declare void @use-double(double) +; Check the following transform: +; +; stp w0, [x20] +; ... 
+; add x20, x20, #32 +; -> +; stp w0, [x20], #32 + +define void @store-pair-post-indexed-word() nounwind { +; CHECK-LABEL: store-pair-post-indexed-word +; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [sp], #16 +; CHECK: ret + %src = alloca { i32, i32 }, align 8 + %dst = alloca { i32, i32 }, align 8 + + %src.realp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %src, i32 0, i32 0 + %src.real = load i32, i32* %src.realp + %src.imagp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %src, i32 0, i32 1 + %src.imag = load i32, i32* %src.imagp + + %dst.realp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %dst, i32 0, i32 1 + store i32 %src.real, i32* %dst.realp + store i32 %src.imag, i32* %dst.imagp + ret void +} + +define void @store-pair-post-indexed-doubleword() nounwind { +; CHECK-LABEL: store-pair-post-indexed-doubleword +; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [sp], #32 +; CHECK: ret + %src = alloca { i64, i64 }, align 8 + %dst = alloca { i64, i64 }, align 8 + + %src.realp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %src, i32 0, i32 0 + %src.real = load i64, i64* %src.realp + %src.imagp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %src, i32 0, i32 1 + %src.imag = load i64, i64* %src.imagp + + %dst.realp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %dst, i32 0, i32 1 + store i64 %src.real, i64* %dst.realp + store i64 %src.imag, i64* %dst.imagp + ret void +} + +define void @store-pair-post-indexed-float() nounwind { +; CHECK-LABEL: store-pair-post-indexed-float +; CHECK: stp s{{[0-9]+}}, s{{[0-9]+}}, [sp], #16 +; CHECK: ret + %src = alloca { float, float }, align 8 + %dst = alloca { float, float }, align 8 + + %src.realp = getelementptr inbounds { float, float }, { float, float }* %src, i32 0, i32 0 + %src.real = load float, float* %src.realp + %src.imagp = getelementptr 
inbounds { float, float }, { float, float }* %src, i32 0, i32 1 + %src.imag = load float, float* %src.imagp + + %dst.realp = getelementptr inbounds { float, float }, { float, float }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { float, float }, { float, float }* %dst, i32 0, i32 1 + store float %src.real, float* %dst.realp + store float %src.imag, float* %dst.imagp + ret void +} + +define void @store-pair-post-indexed-double() nounwind { +; CHECK-LABEL: store-pair-post-indexed-double +; CHECK: stp d{{[0-9]+}}, d{{[0-9]+}}, [sp], #32 +; CHECK: ret + %src = alloca { double, double }, align 8 + %dst = alloca { double, double }, align 8 + + %src.realp = getelementptr inbounds { double, double }, { double, double }* %src, i32 0, i32 0 + %src.real = load double, double* %src.realp + %src.imagp = getelementptr inbounds { double, double }, { double, double }* %src, i32 0, i32 1 + %src.imag = load double, double* %src.imagp + + %dst.realp = getelementptr inbounds { double, double }, { double, double }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { double, double }, { double, double }* %dst, i32 0, i32 1 + store double %src.real, double* %dst.realp + store double %src.imag, double* %dst.imagp + ret void +} + ; Check the following transform: ; ; (ldr|str) X, [x20] diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll index e5766154bb4..fa5d8b943b6 100644 --- a/test/CodeGen/AArch64/tail-call.ll +++ b/test/CodeGen/AArch64/tail-call.ll @@ -59,8 +59,7 @@ define fastcc void @caller_to16_from8([8 x i32], i64 %a) { ; callee will not deallocate the space, even in fastcc. tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2) -; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK-NEXT: add sp, sp, #16 +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]! 
; CHECK-NEXT: b callee_stack16 ret void } @@ -89,8 +88,7 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { ret void ; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK-NEXT: add sp, sp, #16 +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]! ; CHECK-NEXT: b callee_stack16 } -- 2.34.1