//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//
15 #include "AArch64InstrInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "MCTargetDesc/AArch64AddressingModes.h"
18 #include "llvm/ADT/BitVector.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/Debug.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Target/TargetInstrInfo.h"
30 #include "llvm/Target/TargetMachine.h"
31 #include "llvm/Target/TargetRegisterInfo.h"
#define DEBUG_TYPE "aarch64-ldst-opt"

/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.
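///
/// For example, two adjacent loads
///   ldr x0, [x2]
///   ldr x1, [x2, #8]
/// are rewritten as the single pair instruction
///   ldp x0, x1, [x2]
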
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled instructions");
STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);
}

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {
typedef struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
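  // For example, pairing "ldrsw x0, [x2]" with "ldr w1, [x2, #4]" loads both
  // halves as 32-bit values, and SExtIdx == 0 records that the first result
  // still needs the explicit sign extension emitted by mergePairedInsns.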
  int SExtIdx;

  LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }
} LdStPairFlags;

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);
  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int UnscaledOffset);
  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
                            unsigned BaseReg, int Offset);
  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge foldable ldr/str instructions.
  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);

  // Check if converting two narrow loads into a single wider load with
  // bitfield extracts could be enabled.
  bool enableNarrowLdMerge(MachineFunction &Fn);

  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return AARCH64_LOAD_STORE_OPT_NAME;
  }
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isUnscaledLdSt(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURBBi:
  case AArch64::STURHHi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::LDURSWi:
  case AArch64::LDURHHi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
  case AArch64::LDURSHWi:
    return true;
  }
}
static bool isUnscaledLdSt(MachineInstr *MI) {
  return isUnscaledLdSt(MI->getOpcode());
}

static unsigned getBitExtrOpcode(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode.");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return AArch64::UBFMWri;
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
    return AArch64::SBFMWri;
  }
}
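// For example, the high half of a merged 2 x 16-bit load is recovered with
// "ubfx wN, wM, #16, #16" (an alias of UBFMWri); SBFMWri is the sign-extending
// counterpart used for the ldrsh/ldrsb variants.
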
static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

static bool isNarrowStore(MachineInstr *MI) {
  return isNarrowStore(MI->getOpcode());
}

static bool isNarrowLoad(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
    return true;
  }
}

static bool isNarrowLoad(MachineInstr *MI) {
  return isNarrowLoad(MI->getOpcode());
}

// Scaling factor for unscaled load or store.
static int getMemScale(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
    return 16;
  }
}
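// For example, getMemScale returns 8 for LDRXui/LDURXi, so an unscaled byte
// offset of 24 corresponds to element offset 3 once a pair is formed.
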
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return UINT_MAX;
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSBWui:
    return AArch64::LDRBBui;
  case AArch64::LDRSHWui:
    return AArch64::LDRHHui;
  case AArch64::LDURSBWi:
    return AArch64::LDURBBi;
  case AArch64::LDURSHWi:
    return AArch64::LDURHHi;
  }
}
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRHHui:
  case AArch64::LDRSHWui:
    return AArch64::LDRWui;
  case AArch64::LDURHHi:
  case AArch64::LDURSHWi:
    return AArch64::LDURWi;
  case AArch64::LDRBBui:
  case AArch64::LDRSBWui:
    return AArch64::LDRHHui;
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
    return AArch64::LDURHHi;
  }
}
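// Note that for the narrow opcodes the "pair" is really a single wider
// load/store of the next size up, e.g., two LDRHHui become one LDRWui whose
// halves are later split back out with bitfield extracts.
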
static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}
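// For example, getPreIndexedOpcode(AArch64::LDRXui) == AArch64::LDRXpre, which
// lets "add x0, x0, #8; ldr x1, [x0]" become "ldr x1, [x0, #8]!".
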
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}
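// The post-indexed forms write the base register back after the access, e.g.,
// "ldr x0, [x20]; add x20, x20, #32" becomes "ldr x0, [x20], #32".
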
static bool isPairedLdSt(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}

static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI->getOperand(Idx);
}
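// For example, "ldp x0, x1, [x2, #8]" has operands (Rt, Rt2, Rn, imm), so its
// base register is operand 2 and its immediate offset operand 3; an unpaired
// "ldr x0, [x2, #8]" shifts both indices down by one.
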
// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                   MachineInstr *Op1) {
  assert(MI->memoperands_empty() && "expected a new machineinstr");
  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
                      (Op1->memoperands_end() - Op1->memoperands_begin());

  MachineFunction *MF = MI->getParent()->getParent();
  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
  MachineSDNode::mmo_iterator MemEnd =
      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
  MI->setMemRefs(MemBegin, MemEnd);
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(I) : 1;

  bool MergeForward = Flags.getMergeForward();
  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so the operand flags stay compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (getLdStOffsetOp(I).getImm() ==
      getLdStOffsetOp(Paired).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }

  int OffsetImm = getLdStOffsetOp(RtMI).getImm();
  if (isNarrowLoad(Opc)) {
    // Change the scaled offset from small to large type.
    if (!IsUnscaled) {
      assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
      OffsetImm /= 2;
    }
    MachineInstr *RtNewDest = MergeForward ? I : Paired;
    // When merging small (< 32 bit) loads for big-endian targets, the order of
    // the component parts gets swapped.
    if (!Subtarget->isLittleEndian())
      std::swap(RtMI, Rt2MI);
    // Construct the new load instruction.
    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
    NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                       TII->get(NewOpc))
                   .addOperand(getLdStRegOp(RtNewDest))
                   .addOperand(BaseRegOp)
                   .addImm(OffsetImm);

    // Copy MachineMemOperands from the original loads.
    concatenateMemOperands(NewMemMI, I, Paired);

    DEBUG(dbgs()
          << "Creating the new load and extract. Replacing instructions:\n    ");
    DEBUG(I->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG(Paired->print(dbgs()));
    DEBUG(dbgs() << "    with instructions:\n    ");
    DEBUG((NewMemMI)->print(dbgs()));

    int Width = getMemScale(I) == 1 ? 8 : 16;
    int LSBLow = 0;
    int LSBHigh = Width;
    int ImmsLow = LSBLow + Width - 1;
    int ImmsHigh = LSBHigh + Width - 1;
    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
    if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
      // Create the bitfield extract for high bits.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(getBitExtrOpcode(Rt2MI)))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(LSBHigh)
                      .addImm(ImmsHigh);
      // Create the bitfield extract for low bits.
      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
        // For unsigned, prefer to use AND for low bits.
        BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(AArch64::ANDWri))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(ImmsLow);
      } else {
        BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(getBitExtrOpcode(RtMI)))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(LSBLow)
                        .addImm(ImmsLow);
      }
    } else {
      // Create the bitfield extract for low bits.
      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
        // For unsigned, prefer to use AND for low bits.
        BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(AArch64::ANDWri))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(ImmsLow);
      } else {
        BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(getBitExtrOpcode(RtMI)))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(LSBLow)
                        .addImm(ImmsLow);
      }

      // Create the bitfield extract for high bits.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(getBitExtrOpcode(Rt2MI)))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(LSBHigh)
                      .addImm(ImmsHigh);
    }
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI1)->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI2)->print(dbgs()));
    DEBUG(dbgs() << "\n");

    // Erase the old instructions.
    I->eraseFromParent();
    Paired->eraseFromParent();
    return NextI;
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  if (isNarrowStore(Opc)) {
    // Change the scaled offset from small to large type.
    if (!IsUnscaled) {
      assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
      OffsetImm /= 2;
    }
    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                  TII->get(NewOpc))
              .addOperand(getLdStRegOp(I))
              .addOperand(BaseRegOp)
              .addImm(OffsetImm);
    // Copy MachineMemOperands from the original stores.
    concatenateMemOperands(MIB, I, Paired);
  } else {
    // Handle unscaled instructions: scale the byte offset down to an
    // element offset for the pair instruction.
    if (IsUnscaled)
      OffsetImm /= OffsetStride;
    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                  TII->get(NewOpc))
              .addOperand(getLdStRegOp(RtMI))
              .addOperand(getLdStRegOp(Rt2MI))
              .addOperand(BaseRegOp)
              .addImm(OffsetImm);
  }

  // FIXME: Do we need/want to copy the mem operands from the source
  // instructions? Probably. What uses them after this?

749 DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
750 DEBUG(I->print(dbgs()));
751 DEBUG(dbgs() << " ");
752 DEBUG(Paired->print(dbgs()));
753 DEBUG(dbgs() << " with instruction:\n ");
756 // Generate the sign extension for the proper result of the ldp.
757 // I.e., with X1, that would be:
758 // %W1<def> = KILL %W1, %X1<imp-def>
759 // %X1<def> = SBFMXri %X1<kill>, 0, 31
760 MachineOperand &DstMO = MIB->getOperand(SExtIdx);
761 // Right now, DstMO has the extended register, since it comes from an
763 unsigned DstRegX = DstMO.getReg();
764 // Get the W variant of that register.
765 unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
766 // Update the result of LDP to use the W instead of the X variant.
767 DstMO.setReg(DstRegW);
768 DEBUG(((MachineInstr *)MIB)->print(dbgs()));
769 DEBUG(dbgs() << "\n");
770 // Make the machine verifier happy by providing a definition for
772 // Insert this definition right after the generated LDP, i.e., before
774 MachineInstrBuilder MIBKill =
775 BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
776 TII->get(TargetOpcode::KILL), DstRegW)
778 .addReg(DstRegX, RegState::Define);
779 MIBKill->getOperand(2).setImplicit();
780 // Create the sign extension.
781 MachineInstrBuilder MIBSXTW =
782 BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
783 TII->get(AArch64::SBFMXri), DstRegX)
788 DEBUG(dbgs() << " Extend operand:\n ");
789 DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
790 DEBUG(dbgs() << "\n");
792 DEBUG(((MachineInstr *)MIB)->print(dbgs()));
793 DEBUG(dbgs() << "\n");
796 // Erase the old instructions.
797 I->eraseFromParent();
798 Paired->eraseFromParent();
/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (const MachineOperand &MO : MI->operands()) {
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (MO.isDef()) {
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled)
    Offset /= OffsetStride;

  return Offset <= 63 && Offset >= -64;
}
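// The pair instructions encode a signed 7-bit element offset, hence the
// [-64, 63] range above; e.g., for 8-byte unscaled accesses a byte offset of
// 504 scales to element offset 63, the largest that still fits.
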
// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
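// For example, alignTo(5, 4) == 8 and alignTo(8, 4) == 8. The pass uses this
// below to reject merges whose minimum offset is not expressible, i.e., when
// alignTo(MinOffset, OffsetStride) != MinOffset.
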
static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
                     const AArch64InstrInfo *TII) {
  // One of the instructions must modify memory.
  if (!MIa->mayStore() && !MIb->mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
    return false;

  return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
}

static bool mayAlias(MachineInstr *MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     const AArch64InstrInfo *TII) {
  for (auto &MIb : MemInsns)
    if (mayAlias(MIa, MIb, TII))
      return true;

  return false;
}
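// This is conservative: unless TII can prove the two accesses trivially
// disjoint (roughly, the same base register with non-overlapping offset
// ranges), we must assume they may alias.
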
/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  unsigned Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdSt(FirstMI);
  unsigned Reg = getLdStRegOp(FirstMI).getReg();
  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();
  bool IsNarrowStore = isNarrowStore(Opc);

  // For narrow stores, find only the case where the stored value is WZR.
  if (IsNarrowStore && Reg != AArch64::WZR)
    return E;

  // Early exit if the first instruction modifies the base register.
  // e.g., ldr x0, [x0]
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1.)
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
      !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    bool CanMergeOpc = Opc == MI->getOpcode();
    Flags.setSExtIdx(-1);
    if (!CanMergeOpc) {
      bool IsValidLdStrOpc;
      unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
      assert(IsValidLdStrOpc &&
             "Given Opc should be a Load or Store with an immediate");
      // Opc will be the first instruction in the pair.
      Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
      CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
    }

    if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
      assert(MI->mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop looking
        // as something is going on that we don't have enough information to
        // safely transform. Similarly, stop if we see a hint to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdSt(MI);
        bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
        if (!IsNarrowLoad &&
            !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        if (IsNarrowLoad || IsNarrowStore) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow
          // input, bail and keep looking.
          if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        } else {
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        }

        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        // For narrow stores, allow only when the stored value is the same
        // (i.e., WZR).
        if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
            (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
            !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
            !mayAlias(MI, MemInsns, TII)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
            !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
            !mayAlias(FirstMI, MemInsns, TII)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // If the instruction wasn't a matching load or store, stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI->isCall())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;

    // Update list of instructions that read/write memory.
    if (MI->mayLoadOrStore())
      MemInsns.push_back(MI);
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  if (!isPairedLdSt(I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value);
  } else {
    // Paired instruction.
    int Scale = getMemScale(I);
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I, 0))
              .addOperand(getLdStRegOp(I, 1))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value / Scale);
  }
  (void)MIB;

  if (IsPreIdx)
    DEBUG(dbgs() << "Creating pre-indexed load/store.");
  else
    DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "    with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
                                               MachineInstr *MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset = -Offset;
  // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI->getOperand(0).getReg() != BaseReg ||
        MI->getOperand(1).getReg() != BaseReg)
      break;

    bool IsPairedInsn = isPairedLdSt(MemMI);
    int UpdateOffset = MI->getOperand(2).getImm();
    // For non-paired load/store instructions, the immediate must fit in a
    // signed 9-bit integer.
    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
      break;

    // For paired load/store instructions, the immediate must be a multiple of
    // the scaling factor. The scaled offset must also fit into a signed 7-bit
    // integer.
    if (IsPairedInsn) {
      int Scale = getMemScale(MemMI);
      if (UpdateOffset % Scale != 0)
        break;

      int ScaledOffset = UpdateOffset / Scale;
      if (ScaledOffset > 64 || ScaledOffset < -64)
        break;
    }

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == MI->getOperand(2).getImm())
      return true;
  }
  return false;
}
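// For example, "ldp x0, x1, [x2]" followed by "add x2, x2, #16" can merge into
// "ldp x0, x1, [x2], #16": 16 is a multiple of the scale 8, and 16 / 8 == 2
// fits the signed 7-bit scaled-offset field.
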
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int Offset = getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  --MBBI;
  for (unsigned Count = 0; MBBI != B && Count < Limit; --MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

bool AArch64LoadStoreOpt::tryToMergeLdStInst(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = MBBI;
  MachineBasicBlock::iterator E = MI->getParent()->end();
  // If this is a volatile load/store, don't mess with it.
  if (MI->hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm (as opposed to an address reloc).
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Check if this load/store has a hint to avoid pair formation.
  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
  if (TII->isLdStPairSuppressed(MI))
    return false;

  // Look ahead up to ScanLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
  if (Paired != E) {
    if (isNarrowLoad(MI)) {
      ++NumNarrowLoadsPromoted;
    } else if (isNarrowStore(MI)) {
      ++NumZeroStoresPromoted;
    } else {
      ++NumPairCreated;
      if (isUnscaledLdSt(MI))
        ++NumUnscaledPairCreated;
    }

    // Merge the loads into a pair. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool enableNarrowLdOpt) {
  bool Modified = false;
  // Three transformations to do here:
  // 1) Find narrow loads that can be converted into a single wider load
  //    with bitfield extract instructions.
  //      e.g.,
  //        ldrh w0, [x2]
  //        ldrh w1, [x2, #2]
  //        ; becomes
  //        ldr w0, [x2]
  //        ubfx w1, w0, #16, #16
  //        and w0, w0, #ffff
  // 2) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 3) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       enableNarrowLdOpt && MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::LDRBBui:
    case AArch64::LDRHHui:
    case AArch64::LDRSBWui:
    case AArch64::LDRSHWui:
    case AArch64::STRBBui:
    case AArch64::STRHHui:
    // Unscaled instructions.
    case AArch64::LDURBBi:
    case AArch64::LDURHHi:
    case AArch64::LDURSBWi:
    case AArch64::LDURSHWi:
    case AArch64::STURBBi:
    case AArch64::STURHHi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRSWui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    case AArch64::LDURSWi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    unsigned Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::STRHHui:
    case AArch64::STRBBui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRHHui:
    case AArch64::LDRBBui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    // Paired instructions.
    case AArch64::LDPSi:
    case AArch64::LDPSWi:
    case AArch64::LDPDi:
    case AArch64::LDPQi:
    case AArch64::LDPWi:
    case AArch64::LDPXi:
    case AArch64::STPSi:
    case AArch64::STPDi:
    case AArch64::STPQi:
    case AArch64::STPWi:
    case AArch64::STPXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!getLdStOffsetOp(MI).isImm()) {
        ++MBBI;
        break;
      }
      // Look forward to try to form a post-index instruction. For example,
      // ldr x0, [x20]
      // add x20, x20, #32
      //   merged into:
      // ldr x0, [x20], #32
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Don't know how to handle pre/post-index versions, so move to the next
      // instruction.
      if (isUnscaledLdSt(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      //   merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }
      // The immediate in the load/store is scaled by the size of the memory
      // operation. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

      // Look forward to try to find a post-index instruction. For example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      //   merged into:
      // ldr x1, [x0, #64]!
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
  bool ProfitableArch = Subtarget->isCortexA57();
  // FIXME: The benefit from converting narrow loads into a wider load could be
  // microarchitectural as it assumes that a single load with two bitfield
  // extracts is cheaper than two narrow loads. Currently, this conversion is
  // enabled only in cortex-a57 on which performance benefits were verified.
  return ProfitableArch && !Subtarget->requiresStrictAlign();
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();

  bool Modified = false;
  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB, enableNarrowLdOpt);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}