"Number of load/store from unscaled generated");
STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
+STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
cl::init(20), cl::Hidden);
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
LdStPairFlags &Flags,
unsigned Limit);
+
+ // Scan the instructions looking for a store that writes to the address from
+ // which the current load instruction reads. Return true if one is found.
+ bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
+ MachineBasicBlock::iterator &StoreI);
+
// Merge the two instructions indicated into a single pair-wise instruction.
// If MergeForward is true, erase the first instruction and fold its
// operation into the second. If false, the reverse. Return the instruction
MachineBasicBlock::iterator Paired,
const LdStPairFlags &Flags);
+ // Promote the load that reads directly from the address stored to.
+ MachineBasicBlock::iterator
+ promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+ MachineBasicBlock::iterator StoreI);
+
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan forwards.
// Find and merge foldable ldr/str instructions.
bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
+ // Find and promote load instructions which read directly from store.
+ bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
+
// Check if converting two narrow loads into a single wider load with
// bitfield extracts could be enabled.
bool enableNarrowLdMerge(MachineFunction &Fn);
}
}
+// Return true if StoreInst's opcode is one that LoadInst can be promoted from:
+// the store uses the same addressing form as the load (scaled "ui" or unscaled
+// "LDUR/STUR") and, going by the opcode names, is at least as wide as the load
+// (byte <= halfword <= word <= doubleword). Only opcodes are compared here;
+// base register and offsets are left to the caller.
+//
+// LoadInst must be one of the integer loads listed in the switch; any other
+// opcode is treated as a programming error.
+static bool isMatchingStore(MachineInstr *LoadInst, MachineInstr *StoreInst) {
+  unsigned LdOpc = LoadInst->getOpcode();
+  unsigned StOpc = StoreInst->getOpcode();
+  switch (LdOpc) {
+  default:
+    llvm_unreachable("Unsupported load instruction!");
+  // Byte loads.
+  case AArch64::LDRBBui:
+    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
+           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+  case AArch64::LDURBBi:
+    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
+           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+  // Halfword loads.
+  case AArch64::LDRHHui:
+    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
+           StOpc == AArch64::STRXui;
+  case AArch64::LDURHHi:
+    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
+           StOpc == AArch64::STURXi;
+  // Word loads.
+  case AArch64::LDRWui:
+    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+  case AArch64::LDURWi:
+    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+  // Doubleword loads.
+  case AArch64::LDRXui:
+    return StOpc == AArch64::STRXui;
+  case AArch64::LDURXi:
+    return StOpc == AArch64::STURXi;
+  }
+}
+
static unsigned getPreIndexedOpcode(unsigned Opc) {
switch (Opc) {
default:
return MI->getOperand(Idx);
}
+// Return true if every byte read by LoadInst lies inside the byte range
+// written by StoreInst. Both immediates are first converted to byte offsets
+// (a scaled immediate is multiplied by its access size). The pair must already
+// satisfy isMatchingStore(); base registers are not compared here.
+static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst,
+                                  MachineInstr *StoreInst) {
+  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
+
+  // Byte offset of an access whose memory size is Bytes.
+  auto ByteOffsetOf = [](MachineInstr *MemMI, int Bytes) -> int {
+    if (isUnscaledLdSt(MemMI))
+      return getLdStOffsetOp(MemMI).getImm();
+    return getLdStOffsetOp(MemMI).getImm() * Bytes;
+  };
+
+  const int LdBytes = getMemScale(LoadInst);
+  const int StBytes = getMemScale(StoreInst);
+  const int StBegin = ByteOffsetOf(StoreInst, StBytes);
+  const int LdBegin = ByteOffsetOf(LoadInst, LdBytes);
+
+  // The load may not start before the store ...
+  if (LdBegin < StBegin)
+    return false;
+  // ... and may not run past the store's last byte.
+  return LdBegin + LdBytes <= StBegin + StBytes;
+}
+
// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
MachineInstr *Op1) {
return NextI;
}
+// Replace the load at LoadI with a register operation on the value written by
+// the store at StoreI. The pair must already have been matched by
+// findMatchingStore(): same base register, and the loaded bytes lie within the
+// stored bytes.
+//   - A 32- or 64-bit load of the whole stored value becomes a register move
+//     (ORR with the zero register). A 64-bit load into the stored register
+//     itself is simply deleted.
+//   - Anything narrower becomes an AND mask when the load starts at the
+//     store's first byte, or a UBFM bitfield extract otherwise. This path is
+//     only taken on little-endian targets.
+// Returns the iterator following the original load. When the bitfield path
+// would be needed on a big-endian target the load is left untouched.
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+                                          MachineBasicBlock::iterator StoreI) {
+  MachineBasicBlock::iterator NextI = LoadI;
+  ++NextI;
+
+  int LoadSize = getMemScale(LoadI);
+  int StoreSize = getMemScale(StoreI);
+  unsigned LdRt = getLdStRegOp(LoadI).getReg();
+  unsigned StRt = getLdStRegOp(StoreI).getReg();
+  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
+
+  assert((IsStoreXReg ||
+          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
+         "Unexpected RegClass");
+
+  // A plain copy of the stored register reproduces the loaded value only when
+  // the whole register was stored. Byte and halfword loads zero-extend, so a
+  // same-size 8/16-bit pair still has to go through the masking path below.
+  bool IsWholeRegCopy =
+      LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8);
+
+  // FIXME: Currently we disable this transformation in big-endian targets as
+  // performance and correctness are verified only in little-endian.
+  if (!IsWholeRegCopy && !Subtarget->isLittleEndian())
+    return NextI;
+
+  // The stored register is read again at the load's position from now on, so
+  // it must not be marked killed by the store or by anything in between.
+  for (MachineBasicBlock::iterator MII = StoreI; MII != LoadI; ++MII)
+    MII->clearRegisterKills(StRt, TRI);
+
+  MachineInstr *BitExtMI;
+  if (IsWholeRegCopy) {
+    // Remove the load if it targets the very register that was stored. This is
+    // limited to 64-bit loads: a 32-bit load also clears the upper half of the
+    // X register, which the mov below preserves but a deletion would not.
+    if (StRt == LdRt && LoadSize == 8) {
+      DEBUG(dbgs() << "Remove load instruction:\n    ");
+      DEBUG(LoadI->print(dbgs()));
+      DEBUG(dbgs() << "\n");
+      LoadI->eraseFromParent();
+      return NextI;
+    }
+    // Replace the load with a mov if the load and store are in the same size.
+    BitExtMI =
+        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
+            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
+            .addReg(StRt)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+  } else {
+    bool IsUnscaled = isUnscaledLdSt(LoadI);
+    assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match");
+    assert(LoadSize <= StoreSize && "Invalid load size");
+    int UnscaledLdOffset = IsUnscaled
+                               ? getLdStOffsetOp(LoadI).getImm()
+                               : getLdStOffsetOp(LoadI).getImm() * LoadSize;
+    int UnscaledStOffset = IsUnscaled
+                               ? getLdStOffsetOp(StoreI).getImm()
+                               : getLdStOffsetOp(StoreI).getImm() * StoreSize;
+
+    assert(((UnscaledLdOffset) >= UnscaledStOffset &&
+            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
+           "Invalid offset");
+
+    // Bit range [Immr, Imms] of the stored register that the load reads.
+    int Width = LoadSize * 8;
+    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
+    int Imms = Immr + Width - 1;
+    // The bitfield instruction operates on the stored register's width, so
+    // write the X super-register of the load destination for an X store.
+    unsigned DestReg = IsStoreXReg
+                           ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
+                                                      &AArch64::GPR64RegClass)
+                           : LdRt;
+
+    if (UnscaledLdOffset == UnscaledStOffset) {
+      // The load reads the low bits: mask them with AND. Immr is 0 here.
+      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
+                                | ((Immr) << 6)               // immr
+                                | ((Imms) << 0)               // imms
+          ;
+
+      BitExtMI =
+          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
+                  DestReg)
+              .addReg(StRt)
+              .addImm(AndMaskEncoded);
+    } else {
+      // The load reads bits above bit 0: extract them with UBFM.
+      BitExtMI =
+          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
+                  DestReg)
+              .addReg(StRt)
+              .addImm(Immr)
+              .addImm(Imms);
+    }
+  }
+
+  DEBUG(dbgs() << "Promoting load by replacing :\n    ");
+  DEBUG(StoreI->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(LoadI->print(dbgs()));
+  DEBUG(dbgs() << "  with instructions:\n    ");
+  DEBUG(StoreI->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG((BitExtMI)->print(dbgs()));
+  DEBUG(dbgs() << "\n");
+  // BitExtMI is otherwise only read inside DEBUG().
+  (void)BitExtMI;
+
+  // Erase the old instructions.
+  LoadI->eraseFromParent();
+  return NextI;
+}
+
/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
return false;
}
+// Walk backwards from the load at I, looking at no more than Limit non-debug
+// instructions of the same block, for a store the load can be promoted from.
+// On success StoreI is set to that store and true is returned. The search
+// gives up (returns false) at a call, as soon as the load's base register has
+// been redefined, or at any store that may alias the load.
+bool AArch64LoadStoreOpt::findMatchingStore(
+    MachineBasicBlock::iterator I, unsigned Limit,
+    MachineBasicBlock::iterator &StoreI) {
+  MachineInstr *LoadMI = I;
+  unsigned LoadBase = getLdStBaseOp(LoadMI).getReg();
+  MachineBasicBlock::iterator BlockBegin = I->getParent()->begin();
+
+  // Registers written / read by the instructions visited so far, i.e. those
+  // sitting between the current candidate and the load.
+  BitVector ModifiedRegs, UsedRegs;
+  ModifiedRegs.resize(TRI->getNumRegs());
+  UsedRegs.resize(TRI->getNumRegs());
+
+  unsigned Visited = 0;
+  MachineBasicBlock::iterator Cursor = I;
+  while (Cursor != BlockBegin && Visited < Limit) {
+    --Cursor;
+    MachineInstr *Candidate = Cursor;
+
+    // DBG_VALUEs do not count towards the limit; otherwise the presence of
+    // debug info would change how far back we look.
+    if (Candidate->isDebugValue())
+      continue;
+    ++Visited;
+
+    // A store of a compatible kind to the same base register whose bytes
+    // cover the load is a hit, provided the stored register still holds the
+    // stored value. Pre-/post-indexed stores are not matched, so the store
+    // cannot itself have changed the base register.
+    if (Candidate->mayStore() && isMatchingStore(LoadMI, Candidate) &&
+        getLdStBaseOp(Candidate).getReg() == LoadBase &&
+        isLdOffsetInRangeOfSt(LoadMI, Candidate)) {
+      unsigned StoredReg = getLdStRegOp(Candidate).getReg();
+      if (!ModifiedRegs[StoredReg]) {
+        StoreI = Cursor;
+        return true;
+      }
+    }
+
+    // Never look past a call.
+    if (Candidate->isCall())
+      return false;
+
+    // Record what this instruction defines and uses.
+    trackRegDefsUses(Candidate, ModifiedRegs, UsedRegs, TRI);
+
+    // With the base register redefined, earlier addresses are unrelated.
+    if (ModifiedRegs[LoadBase])
+      return false;
+
+    // A possibly aliasing store may have overwritten the bytes we want.
+    if (Candidate->mayStore() && mayAlias(LoadMI, Candidate, TII))
+      return false;
+  }
+  return false;
+}
+
/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
return E;
}
+// Try to replace the load at MBBI with a register operation on the value of an
+// earlier store to the same address. Returns true and moves MBBI to the
+// instruction following the (removed) load when the load was promoted; returns
+// false and leaves MBBI untouched otherwise.
+bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
+    MachineBasicBlock::iterator &MBBI) {
+  MachineInstr *MI = MBBI;
+  // If this is a volatile load, don't mess with it.
+  if (MI->hasOrderedMemoryRef())
+    return false;
+
+  // Make sure this is a reg+imm.
+  // FIXME: It is possible to extend it to handle reg+reg cases.
+  if (!getLdStOffsetOp(MI).isImm())
+    return false;
+
+  // Look backward up to ScanLimit instructions.
+  MachineBasicBlock::iterator StoreI;
+  if (!findMatchingStore(MBBI, ScanLimit, StoreI))
+    return false;
+
+  // promoteLoadFromStore() leaves the load in place on big-endian targets when
+  // the load and store sizes differ. Decline here so that such a load is
+  // neither counted as promoted nor reported as a modification.
+  if (!Subtarget->isLittleEndian() && getMemScale(MI) != getMemScale(StoreI))
+    return false;
+
+  ++NumLoadsFromStoresPromoted;
+  // Promote the load. Keeping the iterator straight is a
+  // pain, so we let the merge routine tell us what the next instruction
+  // is after it's done mucking about.
+  MBBI = promoteLoadFromStore(MBBI, StoreI);
+  return true;
+}
+
bool AArch64LoadStoreOpt::tryToMergeLdStInst(
MachineBasicBlock::iterator &MBBI) {
MachineInstr *MI = MBBI;
bool enableNarrowLdOpt) {
bool Modified = false;
// Four transformations to do here:
- // 1) Find narrow loads that can be converted into a single wider load
+ // 1) Find loads that directly read from stores and promote them by
+ // replacing with mov instructions. If the store is wider than the load,
+ // the load will be replaced with a bitfield extract.
+ // e.g.,
+ // str w1, [x0, #4]
+ // ldrh w2, [x0, #6]
+ // ; becomes
+ // str w1, [x0, #4]
+ // lsr w2, w1, #16
+ // 2) Find narrow loads that can be converted into a single wider load
// with bitfield extract instructions.
// e.g.,
// ldrh w0, [x2]
// ldr w0, [x2]
// ubfx w1, w0, #16, #16
// and w0, w0, #ffff
- // 2) Find loads and stores that can be merged into a single load or store
+ // 3) Find loads and stores that can be merged into a single load or store
// pair instruction.
// e.g.,
// ldr x0, [x2]
// ldr x1, [x2, #8]
// ; becomes
// ldp x0, x1, [x2]
- // 3) Find base register updates that can be merged into the load or store
+ // 4) Find base register updates that can be merged into the load or store
// as a base-reg writeback.
// e.g.,
// ldr x0, [x2]
// ; becomes
// ldr x0, [x2], #4
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ switch (MI->getOpcode()) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ // Scaled instructions.
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRWui:
+ case AArch64::LDRXui:
+ // Unscaled instructions.
+ case AArch64::LDURBBi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi: {
+ if (tryToPromoteLoadFromStore(MBBI)) {
+ Modified = true;
+ break;
+ }
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
enableNarrowLdOpt && MBBI != E;) {
MachineInstr *MI = MBBI;
--- /dev/null
+; RUN: llc < %s -mtriple aarch64--none-eabi -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: Str64Ldr64
+; CHECK: mov x0, x1
+define i64 @Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i64*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 1
+ %1 = load i64, i64* %arrayidx1
+ ret i64 %1
+}
+
+; CHECK-LABEL: Str64Ldr32_0
+; CHECK: and x0, x1, #0xffffffff
+define i32 @Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i32*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 2
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Str64Ldr32_1
+; CHECK: lsr x0, x1, #32
+define i32 @Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i32*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 3
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_0
+; CHECK: and x0, x1, #0xffff
+define i16 @Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 4
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_1
+; CHECK: ubfx x0, x1, #16, #16
+define i16 @Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 5
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_2
+; CHECK: ubfx x0, x1, #32, #16
+define i16 @Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 6
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_3
+; CHECK: lsr x0, x1, #48
+define i16 @Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 7
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_0
+; CHECK: and x0, x1, #0xff
+define i8 @Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 8
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_1
+; CHECK: ubfx x0, x1, #8, #8
+define i8 @Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 9
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_2
+; CHECK: ubfx x0, x1, #16, #8
+define i8 @Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 10
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_3
+; CHECK: ubfx x0, x1, #24, #8
+define i8 @Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 11
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_4
+; CHECK: ubfx x0, x1, #32, #8
+define i8 @Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 12
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_5
+; CHECK: ubfx x0, x1, #40, #8
+define i8 @Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 13
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_6
+; CHECK: ubfx x0, x1, #48, #8
+define i8 @Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 14
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_7
+; CHECK: lsr x0, x1, #56
+define i8 @Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 15
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr32
+; CHECK: mov w0, w1
+define i32 @Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i32*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 1
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Str32Ldr16_0
+; CHECK: and w0, w1, #0xffff
+define i16 @Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str32Ldr16_1
+; CHECK: lsr w0, w1, #16
+define i16 @Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 3
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 4
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 5
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_2
+; CHECK: ubfx w0, w1, #16, #8
+define i8 @Str32Ldr8_2(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 6
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_3
+; CHECK: lsr w0, w1, #24
+define i8 @Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 7
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; 16-bit store followed by a 16-bit load of the same address (scaled form).
+; NOTE(review): ldrh zero-extends into w0, whereas "mov w0, w1" copies all 32
+; bits of w1, so the two differ whenever w1[31:16] is non-zero. Confirm that
+; the expectation below should not be a masking "and" instead.
+; CHECK-LABEL: Str16Ldr16
+; CHECK: mov w0, w1
+define i16 @Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+  %0 = bitcast i16* %P to i16*
+  %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1
+  store i16 %v, i16* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Str16Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+ %0 = bitcast i16* %P to i8*
+ %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1
+ store i16 %v, i16* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 2
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str16Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+ %0 = bitcast i16* %P to i8*
+ %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1
+ store i16 %v, i16* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 3
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+
+; CHECK-LABEL: Unscaled_Str64Ldr64
+; CHECK: mov x0, x1
+define i64 @Unscaled_Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i64*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 -1
+ %1 = load i64, i64* %arrayidx1
+ ret i64 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr32_0
+; CHECK: and x0, x1, #0xffffffff
+define i32 @Unscaled_Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i32*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -2
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr32_1
+; CHECK: lsr x0, x1, #32
+define i32 @Unscaled_Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i32*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_0
+; CHECK: and x0, x1, #0xffff
+define i16 @Unscaled_Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -4
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_1
+; CHECK: ubfx x0, x1, #16, #16
+define i16 @Unscaled_Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_2
+; CHECK: ubfx x0, x1, #32, #16
+define i16 @Unscaled_Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_3
+; CHECK: lsr x0, x1, #48
+define i16 @Unscaled_Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_0
+; CHECK: and x0, x1, #0xff
+define i8 @Unscaled_Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -8
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_1
+; CHECK: ubfx x0, x1, #8, #8
+define i8 @Unscaled_Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -7
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_2
+; CHECK: ubfx x0, x1, #16, #8
+define i8 @Unscaled_Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -6
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_3
+; CHECK: ubfx x0, x1, #24, #8
+define i8 @Unscaled_Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -5
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_4
+; CHECK: ubfx x0, x1, #32, #8
+define i8 @Unscaled_Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_5
+; CHECK: ubfx x0, x1, #40, #8
+define i8 @Unscaled_Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_6
+; CHECK: ubfx x0, x1, #48, #8
+define i8 @Unscaled_Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_7
+; CHECK: lsr x0, x1, #56
+define i8 @Unscaled_Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr32
+; CHECK: mov w0, w1
+define i32 @Unscaled_Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i32*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr16_0
+; CHECK: and w0, w1, #0xffff
+define i16 @Unscaled_Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr16_1
+; CHECK: lsr w0, w1, #16
+define i16 @Unscaled_Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Unscaled_Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Unscaled_Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_2
+; CHECK: ubfx w0, w1, #16, #8
+define i8 @Unscaled_Str32Ldr8_2(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_3
+; CHECK: lsr w0, w1, #24
+define i8 @Unscaled_Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; 16-bit store followed by a 16-bit load of the same address, at a negative
+; offset so that the unscaled instruction forms are selected.
+; NOTE(review): ldurh zero-extends into w0, whereas "mov w0, w1" copies all 32
+; bits of w1, so the two differ whenever w1[31:16] is non-zero. Confirm that
+; the expectation below should not be a masking "and" instead.
+; CHECK-LABEL: Unscaled_Str16Ldr16
+; CHECK: mov w0, w1
+define i16 @Unscaled_Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+  %0 = bitcast i16* %P to i16*
+  %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1
+  store i16 %v, i16* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str16Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Unscaled_Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+ %0 = bitcast i16* %P to i8*
+ %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1
+ store i16 %v, i16* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str16Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Unscaled_Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+ %0 = bitcast i16* %P to i8*
+ %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1
+ store i16 %v, i16* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; Negative test: the load reads bytes [4,6) of %P, inside the bytes [4,8)
+; written by the store, but it is volatile and therefore must stay a real load.
+; CHECK-LABEL: StrVolatileLdr
+; CHECK: ldrh
+define i16 @StrVolatileLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2
+  %1 = load volatile i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; Negative test: the store writes bytes [4,8) of %P while the load reads bytes
+; [2,4), so the loaded value does not come from the stored register.
+; CHECK-LABEL: StrNotInRangeLdr
+; CHECK: ldrh
+define i16 @StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; Negative test with negative offsets: the store writes bytes [-4,0) relative
+; to %P while the load reads bytes [-6,-4), so the load is outside the store.
+; CHECK-LABEL: Unscaled_StrNotInRangeLdr
+; CHECK: ldurh
+define i16 @Unscaled_StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; Negative test: a call sits between the store and the load. The load reads
+; bytes [4,6) of %P, inside the bytes [4,8) written by the store, so the call
+; is the only thing that keeps the load from being promoted.
+; CHECK-LABEL: StrCallLdr
+; CHECK: ldrh
+define i16 @StrCallLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %c = call i1 @test_dummy()
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+declare i1 @test_dummy()
+
+; Negative test: a second store, through the unrelated pointer %P2, sits
+; between the store to %P and the load. %P2 may alias %P, so the load (bytes
+; [4,6) of %P, inside the first store's [4,8)) must stay a real load.
+; CHECK-LABEL: StrStrLdr
+; CHECK: ldrh
+define i16 @StrStrLdr(i32 %v, i32* %P, i32* %P2, i32 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  store i32 %n, i32* %P2
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}