struct AArch64LoadStoreOpt : public MachineFunctionPass {
static char ID;
- AArch64LoadStoreOpt() : MachineFunctionPass(ID), IsStrictAlign(false) {
+ AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
}
const AArch64InstrInfo *TII;
const TargetRegisterInfo *TRI;
- bool IsStrictAlign;
// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.
// Find and merge foldable ldr/str instructions.
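// For example (an illustrative sketch only; registers and offsets are
// arbitrary), two adjacent loads such as
//   ldr x0, [x2]
//   ldr x1, [x2, #8]
// can be combined into a single pair instruction:
//   ldp x0, x1, [x2]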
bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
- bool optimizeBlock(MachineBasicBlock &MBB);
+ // Decide whether converting two narrow loads into a single wider load
+ // with bitfield extracts should be enabled for this function.
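+ // For example (an illustrative sketch; exact register allocation differs),
+ // on a little-endian target two adjacent halfword loads
+ //   ldrh w0, [x2]
+ //   ldrh w1, [x2, #2]
+ // become a single word load whose halves are then extracted:
+ //   ldr w8, [x2]
+ //   and w0, w8, #0xffff
+ //   lsr w1, w8, #16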
+ bool enableNarrowLdMerge(MachineFunction &Fn);
+
+ bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
return false;
}
-bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
+ bool enableNarrowLdOpt) {
bool Modified = false;
// Three transformations to do here:
// 1) Find halfword loads that can be merged into a single 32-bit word load
// ldr x0, [x2], #4
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
- !IsStrictAlign && MBBI != E;) {
+ enableNarrowLdOpt && MBBI != E;) {
MachineInstr *MI = MBBI;
switch (MI->getOpcode()) {
default:
return Modified;
}
+bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
+ const AArch64Subtarget *SubTarget =
+ &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+ bool ProfitableArch = SubTarget->isCortexA57();
+ // FIXME: The benefit from converting narrow loads into a wider load could be
+ // microarchitectural as it assumes that a single load with two bitfield
+ // extracts is cheaper than two narrow loads. Currently, this conversion is
+ // enabled only in cortex-a57 on which performance benefits were verified.
+ return ProfitableArch && !SubTarget->requiresStrictAlign();
+}
+
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
TRI = Fn.getSubtarget().getRegisterInfo();
- IsStrictAlign = (static_cast<const AArch64Subtarget &>(Fn.getSubtarget()))
- .requiresStrictAlign();
bool Modified = false;
+ bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
for (auto &MBB : Fn)
- Modified |= optimizeBlock(MBB);
+ Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
return Modified;
}
ret i64 %add
}
-; CHECK-LABEL: Ldrh_merge
-; CHECK-NOT: ldrh
-; CHECK: ldr [[NEW_DEST:w[0-9]+]]
-; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
-; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
-
-define i16 @Ldrh_merge(i16* nocapture readonly %p) {
- %1 = load i16, i16* %p, align 2
- ;%conv = zext i16 %0 to i32
- %arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
- %2 = load i16, i16* %arrayidx2, align 2
- %add = add nuw nsw i16 %1, %2
- ret i16 %add
-}
-
-; CHECK-LABEL: Ldurh_merge
-; CHECK-NOT: ldurh
-; CHECK: ldur [[NEW_DEST:w[0-9]+]]
-; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
-; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
-define i16 @Ldurh_merge(i16* nocapture readonly %p) {
-entry:
- %arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
- %0 = load i16, i16* %arrayidx
- %arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
- %1 = load i16, i16* %arrayidx3
- %add = add nuw nsw i16 %0, %1
- ret i16 %add
-}
-
-; CHECK-LABEL: Ldrh_4_merge
-; CHECK-NOT: ldrh
-; CHECK: ldp [[NEW_DEST:w[0-9]+]]
-define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
- %arrayidx = getelementptr inbounds i16, i16* %P, i64 0
- %l0 = load i16, i16* %arrayidx
- %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
- %l1 = load i16, i16* %arrayidx2
- %arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
- %l2 = load i16, i16* %arrayidx7
- %arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
- %l3 = load i16, i16* %arrayidx12
- %add4 = add nuw nsw i16 %l1, %l0
- %add9 = add nuw nsw i16 %add4, %l2
- %add14 = add nuw nsw i16 %add9, %l3
-
- ret i16 %add14
-}
--- /dev/null
+; RUN: llc < %s -march=arm64 -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: Ldrh_merge
+; CHECK-NOT: ldrh
+; CHECK: ldr [[NEW_DEST:w[0-9]+]]
+; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
+; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
+define i16 @Ldrh_merge(i16* nocapture readonly %p) {
+ %1 = load i16, i16* %p, align 2
+ %arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
+ %2 = load i16, i16* %arrayidx2, align 2
+ %add = add nuw nsw i16 %1, %2
+ ret i16 %add
+}
+
+; CHECK-LABEL: Ldurh_merge
+; CHECK-NOT: ldurh
+; CHECK: ldur [[NEW_DEST:w[0-9]+]]
+; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
+; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
+define i16 @Ldurh_merge(i16* nocapture readonly %p) {
+entry:
+ %arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
+ %0 = load i16, i16* %arrayidx
+ %arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
+ %1 = load i16, i16* %arrayidx3
+ %add = add nuw nsw i16 %0, %1
+ ret i16 %add
+}
+
+; CHECK-LABEL: Ldrh_4_merge
+; CHECK-NOT: ldrh
+; CHECK: ldp [[NEW_DEST:w[0-9]+]]
+define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
+ %arrayidx = getelementptr inbounds i16, i16* %P, i64 0
+ %l0 = load i16, i16* %arrayidx
+ %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
+ %l1 = load i16, i16* %arrayidx2
+ %arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
+ %l2 = load i16, i16* %arrayidx7
+ %arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
+ %l3 = load i16, i16* %arrayidx12
+ %add4 = add nuw nsw i16 %l1, %l0
+ %add9 = add nuw nsw i16 %add4, %l2
+ %add14 = add nuw nsw i16 %add9, %l3
+ ret i16 %add14
+}