lib/Transforms/Scalar/SROA.cpp

   1 //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 /// \file
  10 /// This transformation implements the well known scalar replacement of
  11 /// aggregates transformation. It tries to identify promotable elements of an
  12 /// aggregate alloca, and promote them to registers. It will also try to
  13 /// convert uses of an element (or set of elements) of an alloca into a vector
  14 /// or bitfield-style integer scalar if appropriate.
  15 ///
  16 /// It works to do this with minimal slicing of the alloca so that regions
  17 /// which are merely transferred in and out of external memory remain unchanged
  18 /// and are not decomposed to scalar code.
  19 ///
  20 /// Because this also performs alloca promotion, it can be thought of as also
  21 /// serving the purpose of SSA formation. The algorithm iterates on the
  22 /// function until all opportunities for promotion have been realized.
  23 ///
  24 //===----------------------------------------------------------------------===//
  25
  26 #define DEBUG_TYPE "sroa"
  27 #include "llvm/Transforms/Scalar.h"
  28 #include "llvm/Constants.h"
  29 #include "llvm/DIBuilder.h"
  30 #include "llvm/DebugInfo.h"
  31 #include "llvm/DerivedTypes.h"
  32 #include "llvm/Function.h"
  33 #include "llvm/GlobalVariable.h"
  34 #include "llvm/IRBuilder.h"
  35 #include "llvm/Instructions.h"
  36 #include "llvm/IntrinsicInst.h"
  37 #include "llvm/LLVMContext.h"
  38 #include "llvm/Module.h"
  39 #include "llvm/Operator.h"
  40 #include "llvm/Pass.h"
  41 #include "llvm/ADT/SetVector.h"
  42 #include "llvm/ADT/SmallVector.h"
  43 #include "llvm/ADT/Statistic.h"
  44 #include "llvm/ADT/STLExtras.h"
  45 #include "llvm/ADT/TinyPtrVector.h"
  46 #include "llvm/Analysis/Dominators.h"
  47 #include "llvm/Analysis/Loads.h"
  48 #include "llvm/Analysis/ValueTracking.h"
  49 #include "llvm/Support/CallSite.h"
  50 #include "llvm/Support/Debug.h"
  51 #include "llvm/Support/ErrorHandling.h"
  52 #include "llvm/Support/GetElementPtrTypeIterator.h"
  53 #include "llvm/Support/InstVisitor.h"
  54 #include "llvm/Support/MathExtras.h"
  55 #include "llvm/Support/ValueHandle.h"
  56 #include "llvm/Support/raw_ostream.h"
  57 #include "llvm/Target/TargetData.h"
  58 #include "llvm/Transforms/Utils/Local.h"
  59 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
  60 #include "llvm/Transforms/Utils/SSAUpdater.h"
  61 using namespace llvm;
  62
  63 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
  64 STATISTIC(NumNewAllocas,      "Number of new, smaller allocas introduced");
  65 STATISTIC(NumPromoted,        "Number of allocas promoted to SSA values");
  66 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
  67 STATISTIC(NumDeleted,         "Number of instructions deleted");
  68 STATISTIC(NumVectorized,      "Number of vectorized aggregates");
  69
  70 namespace {
  71 /// \brief Alloca partitioning representation.
  72 ///
  73 /// This class represents a partitioning of an alloca into slices, and
  74 /// information about the nature of uses of each slice of the alloca. The goal
  75 /// is that this information is sufficient to decide if and how to split the
  76 /// alloca apart and replace slices with scalars. It is also intended that this
  77 /// structure can capture the relevant information needed both to decide about
  78 /// and to enact these transformations.
  79 class AllocaPartitioning {
  80 public:
  81   /// \brief A common base class for representing a half-open byte range.
  82   struct ByteRange {
  83     /// \brief The beginning offset of the range.
  84     uint64_t BeginOffset;
  85
  86     /// \brief The ending offset, not included in the range.
  87     uint64_t EndOffset;
  88
  89     ByteRange() : BeginOffset(), EndOffset() {}
  90     ByteRange(uint64_t BeginOffset, uint64_t EndOffset)
  91         : BeginOffset(BeginOffset), EndOffset(EndOffset) {}
  92
  93     /// \brief Support for ordering ranges.
  94     ///
  95     /// This provides an ordering over ranges such that start offsets are
  96     /// always increasing, and within equal start offsets, the end offsets are
  97     /// decreasing. Thus the spanning range comes first in a cluster with the
  98     /// same start position.
  99     bool operator<(const ByteRange &RHS) const {
 100       if (BeginOffset < RHS.BeginOffset) return true;
 101       if (BeginOffset > RHS.BeginOffset) return false;
 102       if (EndOffset > RHS.EndOffset) return true;
 103       return false;
 104     }
 105
 106     /// \brief Support comparison with a single offset to allow binary searches.
 107     bool operator<(uint64_t RHSOffset) const {
 108       return BeginOffset < RHSOffset;
 109     }
 110
 111     bool operator==(const ByteRange &RHS) const {
 112       return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
 113     }
 114     bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); }
 115   };
 116
 117   /// \brief A partition of an alloca.
 118   ///
 119   /// This structure represents a contiguous partition of the alloca. These are
 120   /// formed by examining the uses of the alloca. During formation, they may
 121   /// overlap but once an AllocaPartitioning is built, the Partitions within it
 122   /// are all disjoint.
 123   struct Partition : public ByteRange {
 124     /// \brief Whether this partition is splittable into smaller partitions.
 125     ///
 126     /// We flag partitions as splittable when they are formed entirely due to
 127     /// accesses by trivially splittable operations such as memset and memcpy.
 128     ///
 129     /// FIXME: At some point we should consider loads and stores of FCAs to be
 130     /// splittable and eagerly split them into scalar values.
 131     bool IsSplittable;
 132
 133     Partition() : ByteRange(), IsSplittable() {}
 134     Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable)
 135         : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {}
 136   };
 137
 138   /// \brief A particular use of a partition of the alloca.
 139   ///
 140   /// This structure is used to associate uses of a partition with it. They
 141   /// mark the range of bytes which are referenced by a particular instruction,
 142   /// and includes a handle to the user itself and the pointer value in use.
 143   /// The bounds of these uses are determined by intersecting the bounds of the
 144   /// memory use itself with a particular partition. As a consequence there is
 145   /// intentionally overlap between various uses of the same partition.
 146   struct PartitionUse : public ByteRange {
 147     /// \brief The user of this range of the alloca.
 148     AssertingVH<Instruction> User;
 149
 150     /// \brief The particular pointer value derived from this alloca in use.
 151     AssertingVH<Instruction> Ptr;
 152
 153     PartitionUse() : ByteRange(), User(), Ptr() {}
 154     PartitionUse(uint64_t BeginOffset, uint64_t EndOffset,
 155                  Instruction *User, Instruction *Ptr)
 156         : ByteRange(BeginOffset, EndOffset), User(User), Ptr(Ptr) {}
 157   };
 158
 159   /// \brief Construct a partitioning of a particular alloca.
 160   ///
 161   /// Construction does most of the work for partitioning the alloca. This
 162   /// performs the necessary walks of users and builds a partitioning from it.
 163   AllocaPartitioning(const TargetData &TD, AllocaInst &AI);
 164
 165   /// \brief Test whether a pointer to the allocation escapes our analysis.
 166   ///
 167   /// If this is true, the partitioning is never fully built and should be
 168   /// ignored.
 169   bool isEscaped() const { return PointerEscapingInstr; }
 170
 171   /// \brief Support for iterating over the partitions.
 172   /// @{
 173   typedef SmallVectorImpl<Partition>::iterator iterator;
 174   iterator begin() { return Partitions.begin(); }
 175   iterator end() { return Partitions.end(); }
 176
 177   typedef SmallVectorImpl<Partition>::const_iterator const_iterator;
 178   const_iterator begin() const { return Partitions.begin(); }
 179   const_iterator end() const { return Partitions.end(); }
 180   /// @}
 181
 182   /// \brief Support for iterating over and manipulating a particular
 183   /// partition's uses.
 184   ///
 185   /// The iteration support provided for uses is more limited, but also
 186   /// includes some manipulation routines to support rewriting the uses of
 187   /// partitions during SROA.
 188   /// @{
 189   typedef SmallVectorImpl<PartitionUse>::iterator use_iterator;
 190   use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); }
 191   use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); }
 192   use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); }
 193   use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); }
 194   void use_insert(unsigned Idx, use_iterator UI, const PartitionUse &U) {
 195     Uses[Idx].insert(UI, U);
 196   }
 197   void use_insert(const_iterator I, use_iterator UI, const PartitionUse &U) {
 198     Uses[I - begin()].insert(UI, U);
 199   }
 200   void use_erase(unsigned Idx, use_iterator UI) { Uses[Idx].erase(UI); }
 201   void use_erase(const_iterator I, use_iterator UI) {
 202     Uses[I - begin()].erase(UI);
 203   }
 204
 205   typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator;
 206   const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); }
 207   const_use_iterator use_begin(const_iterator I) const {
 208     return Uses[I - begin()].begin();
 209   }
 210   const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); }
 211   const_use_iterator use_end(const_iterator I) const {
 212     return Uses[I - begin()].end();
 213   }
 214   /// @}
 215
 216   /// \brief Allow iterating the dead users for this alloca.
 217   ///
 218   /// These are instructions which will never actually use the alloca as they
 219   /// are outside the allocated range. They are safe to replace with undef and
 220   /// delete.
 221   /// @{
 222   typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator;
 223   dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); }
 224   dead_user_iterator dead_user_end() const { return DeadUsers.end(); }
 225   /// @}
 226
 227   /// \brief Allow iterating the dead expressions referring to this alloca.
 228   ///
 229   /// These are operands which have cannot actually be used to refer to the
 230   /// alloca as they are outside its range and the user doesn't correct for
 231   /// that. These mostly consist of PHI node inputs and the like which we just
 232   /// need to replace with undef.
 233   /// @{
 234   typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator;
 235   dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); }
 236   dead_op_iterator dead_op_end() const { return DeadOperands.end(); }
 237   /// @}
 238
 239   /// \brief MemTransferInst auxiliary data.
 240   /// This struct provides some auxiliary data about memory transfer
 241   /// intrinsics such as memcpy and memmove. These intrinsics can use two
 242   /// different ranges within the same alloca, and provide other challenges to
 243   /// correctly represent. We stash extra data to help us untangle this
 244   /// after the partitioning is complete.
 245   struct MemTransferOffsets {
 246     uint64_t DestBegin, DestEnd;
 247     uint64_t SourceBegin, SourceEnd;
 248     bool IsSplittable;
 249   };
 250   MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const {
 251     return MemTransferInstData.lookup(&II);
 252   }
 253
 254   /// \brief Map from a PHI or select operand back to a partition.
 255   ///
 256   /// When manipulating PHI nodes or selects, they can use more than one
 257   /// partition of an alloca. We store a special mapping to allow finding the
 258   /// partition referenced by each of these operands, if any.
 259   iterator findPartitionForPHIOrSelectOperand(Instruction &I, Value *Op) {
 260     SmallDenseMap<std::pair<Instruction *, Value *>,
 261                   std::pair<unsigned, unsigned> >::const_iterator MapIt
 262       = PHIOrSelectOpMap.find(std::make_pair(&I, Op));
 263     if (MapIt == PHIOrSelectOpMap.end())
 264       return end();
 265
 266     return begin() + MapIt->second.first;
 267   }
 268
 269   /// \brief Map from a PHI or select operand back to the specific use of
 270   /// a partition.
 271   ///
 272   /// Similar to mapping these operands back to the partitions, this maps
 273   /// directly to the use structure of that partition.
 274   use_iterator findPartitionUseForPHIOrSelectOperand(Instruction &I,
 275                                                      Value *Op) {
 276     SmallDenseMap<std::pair<Instruction *, Value *>,
 277                   std::pair<unsigned, unsigned> >::const_iterator MapIt
 278       = PHIOrSelectOpMap.find(std::make_pair(&I, Op));
 279     assert(MapIt != PHIOrSelectOpMap.end());
 280     return Uses[MapIt->second.first].begin() + MapIt->second.second;
 281   }
 282
 283   /// \brief Compute a common type among the uses of a particular partition.
 284   ///
 285   /// This routines walks all of the uses of a particular partition and tries
 286   /// to find a common type between them. Untyped operations such as memset and
 287   /// memcpy are ignored.
 288   Type *getCommonType(iterator I) const;
 289
 290 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 291   void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
 292   void printUsers(raw_ostream &OS, const_iterator I,
 293                   StringRef Indent = "  ") const;
 294   void print(raw_ostream &OS) const;
 295   void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const;
 296   void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const;
 297 #endif
 298
 299 private:
 300   template <typename DerivedT, typename RetT = void> class BuilderBase;
 301   class PartitionBuilder;
 302   friend class AllocaPartitioning::PartitionBuilder;
 303   class UseBuilder;
 304   friend class AllocaPartitioning::UseBuilder;
 305
 306   /// \brief Handle to alloca instruction to simplify method interfaces.
 307   AllocaInst &AI;
 308
 309   /// \brief The instruction responsible for this alloca having no partitioning.
 310   ///
 311   /// When an instruction (potentially) escapes the pointer to the alloca, we
 312   /// store a pointer to that here and abort trying to partition the alloca.
 313   /// This will be null if the alloca is partitioned successfully.
 314   Instruction *PointerEscapingInstr;
 315
 316   /// \brief The partitions of the alloca.
 317   ///
 318   /// We store a vector of the partitions over the alloca here. This vector is
 319   /// sorted by increasing begin offset, and then by decreasing end offset. See
 320   /// the Partition inner class for more details. Initially (during
 321   /// construction) there are overlaps, but we form a disjoint sequence of
 322   /// partitions while finishing construction and a fully constructed object is
 323   /// expected to always have this as a disjoint space.
 324   SmallVector<Partition, 8> Partitions;
 325
 326   /// \brief The uses of the partitions.
 327   ///
 328   /// This is essentially a mapping from each partition to a list of uses of
 329   /// that partition. The mapping is done with a Uses vector that has the exact
 330   /// same number of entries as the partition vector. Each entry is itself
 331   /// a vector of the uses.
 332   SmallVector<SmallVector<PartitionUse, 2>, 8> Uses;
 333
 334   /// \brief Instructions which will become dead if we rewrite the alloca.
 335   ///
 336   /// Note that these are not separated by partition. This is because we expect
 337   /// a partitioned alloca to be completely rewritten or not rewritten at all.
 338   /// If rewritten, all these instructions can simply be removed and replaced
 339   /// with undef as they come from outside of the allocated space.
 340   SmallVector<Instruction *, 8> DeadUsers;
 341
 342   /// \brief Operands which will become dead if we rewrite the alloca.
 343   ///
 344   /// These are operands that in their particular use can be replaced with
 345   /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
 346   /// to PHI nodes and the like. They aren't entirely dead (there might be
 347   /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
 348   /// want to swap this particular input for undef to simplify the use lists of
 349   /// the alloca.
 350   SmallVector<Use *, 8> DeadOperands;
 351
 352   /// \brief The underlying storage for auxiliary memcpy and memset info.
 353   SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData;
 354
 355   /// \brief A side datastructure used when building up the partitions and uses.
 356   ///
 357   /// This mapping is only really used during the initial building of the
 358   /// partitioning so that we can retain information about PHI and select nodes
 359   /// processed.
 360   SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes;
 361
 362   /// \brief Auxiliary information for particular PHI or select operands.
 363   SmallDenseMap<std::pair<Instruction *, Value *>,
 364                 std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap;
 365
 366   /// \brief A utility routine called from the constructor.
 367   ///
 368   /// This does what it says on the tin. It is the key of the alloca partition
 369   /// splitting and merging. After it is called we have the desired disjoint
 370   /// collection of partitions.
 371   void splitAndMergePartitions();
 372 };
 373 }
 374
 375 template <typename DerivedT, typename RetT>
 376 class AllocaPartitioning::BuilderBase
 377     : public InstVisitor<DerivedT, RetT> {
 378 public:
 379   BuilderBase(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
 380       : TD(TD),
 381         AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
 382         P(P) {
 383     enqueueUsers(AI, 0);
 384   }
 385
 386 protected:
 387   const TargetData &TD;
 388   const uint64_t AllocSize;
 389   AllocaPartitioning &P;
 390
 391   struct OffsetUse {
 392     Use *U;
 393     uint64_t Offset;
 394   };
 395   SmallVector<OffsetUse, 8> Queue;
 396
 397   // The active offset and use while visiting.
 398   Use *U;
 399   uint64_t Offset;
 400
 401   void enqueueUsers(Instruction &I, uint64_t UserOffset) {
 402     SmallPtrSet<User *, 8> UserSet;
 403     for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
 404          UI != UE; ++UI) {
 405       if (!UserSet.insert(*UI))
 406         continue;
 407
 408       OffsetUse OU = { &UI.getUse(), UserOffset };
 409       Queue.push_back(OU);
 410     }
 411   }
 412
 413   bool computeConstantGEPOffset(GetElementPtrInst &GEPI, uint64_t &GEPOffset) {
 414     GEPOffset = Offset;
 415     for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI);
 416          GTI != GTE; ++GTI) {
 417       ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
 418       if (!OpC)
 419         return false;
 420       if (OpC->isZero())
 421         continue;
 422
 423       // Handle a struct index, which adds its field offset to the pointer.
 424       if (StructType *STy = dyn_cast<StructType>(*GTI)) {
 425         unsigned ElementIdx = OpC->getZExtValue();
 426         const StructLayout *SL = TD.getStructLayout(STy);
 427         GEPOffset += SL->getElementOffset(ElementIdx);
 428         continue;
 429       }
 430
 431       GEPOffset
 432         += OpC->getZExtValue() * TD.getTypeAllocSize(GTI.getIndexedType());
 433     }
 434     return true;
 435   }
 436
 437   Value *foldSelectInst(SelectInst &SI) {
 438     // If the condition being selected on is a constant or the same value is
 439     // being selected between, fold the select. Yes this does (rarely) happen
 440     // early on.
 441     if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
 442       return SI.getOperand(1+CI->isZero());
 443     if (SI.getOperand(1) == SI.getOperand(2)) {
 444       assert(*U == SI.getOperand(1));
 445       return SI.getOperand(1);
 446     }
 447     return 0;
 448   }
 449 };
 450
 451 /// \brief Builder for the alloca partitioning.
 452 ///
 453 /// This class builds an alloca partitioning by recursively visiting the uses
 454 /// of an alloca and splitting the partitions for each load and store at each
 455 /// offset.
 456 class AllocaPartitioning::PartitionBuilder
 457     : public BuilderBase<PartitionBuilder, bool> {
 458   friend class InstVisitor<PartitionBuilder, bool>;
 459
 460   SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap;
 461
 462 public:
 463   PartitionBuilder(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
 464       : BuilderBase<PartitionBuilder, bool>(TD, AI, P) {}
 465
 466   /// \brief Run the builder over the allocation.
 467   bool operator()() {
 468     // Note that we have to re-evaluate size on each trip through the loop as
 469     // the queue grows at the tail.
 470     for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
 471       U = Queue[Idx].U;
 472       Offset = Queue[Idx].Offset;
 473       if (!visit(cast<Instruction>(U->getUser())))
 474         return false;
 475     }
 476     return true;
 477   }
 478
 479 private:
 480   bool markAsEscaping(Instruction &I) {
 481     P.PointerEscapingInstr = &I;
 482     return false;
 483   }
 484
 485   void insertUse(Instruction &I, uint64_t Size, bool IsSplittable = false) {
 486     uint64_t BeginOffset = Offset, EndOffset = Offset + Size;
 487
 488     // Completely skip uses which start outside of the allocation.
 489     if (BeginOffset >= AllocSize) {
 490       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
 491                    << " which starts past the end of the " << AllocSize
 492                    << " byte alloca:\n"
 493                    << "    alloca: " << P.AI << "\n"
 494                    << "       use: " << I << "\n");
 495       return;
 496     }
 497
 498     // Clamp the size to the allocation.
 499     if (EndOffset > AllocSize) {
 500       DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
 501                    << " to remain within the " << AllocSize << " byte alloca:\n"
 502                    << "    alloca: " << P.AI << "\n"
 503                    << "       use: " << I << "\n");
 504       EndOffset = AllocSize;
 505     }
 506
 507     // See if we can just add a user onto the last slot currently occupied.
 508     if (!P.Partitions.empty() &&
 509         P.Partitions.back().BeginOffset == BeginOffset &&
 510         P.Partitions.back().EndOffset == EndOffset) {
 511       P.Partitions.back().IsSplittable &= IsSplittable;
 512       return;
 513     }
 514
 515     Partition New(BeginOffset, EndOffset, IsSplittable);
 516     P.Partitions.push_back(New);
 517   }
 518
 519   bool handleLoadOrStore(Type *Ty, Instruction &I) {
 520     uint64_t Size = TD.getTypeStoreSize(Ty);
 521
 522     // If this memory access can be shown to *statically* extend outside the
 523     // bounds of of the allocation, it's behavior is undefined, so simply
 524     // ignore it. Note that this is more strict than the generic clamping
 525     // behavior of insertUse. We also try to handle cases which might run the
 526     // risk of overflow.
 527     // FIXME: We should instead consider the pointer to have escaped if this
 528     // function is being instrumented for addressing bugs or race conditions.
 529     if (Offset >= AllocSize || Size > AllocSize || Offset + Size > AllocSize) {
 530       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte "
 531                    << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset
 532                    << " which extends past the end of the " << AllocSize
 533                    << " byte alloca:\n"
 534                    << "    alloca: " << P.AI << "\n"
 535                    << "       use: " << I << "\n");
 536       return true;
 537     }
 538
 539     insertUse(I, Size);
 540     return true;
 541   }
 542
 543   bool visitBitCastInst(BitCastInst &BC) {
 544     enqueueUsers(BC, Offset);
 545     return true;
 546   }
 547
 548   bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
 549     uint64_t GEPOffset;
 550     if (!computeConstantGEPOffset(GEPI, GEPOffset))
 551       return markAsEscaping(GEPI);
 552
 553     enqueueUsers(GEPI, GEPOffset);
 554     return true;
 555   }
 556
 557   bool visitLoadInst(LoadInst &LI) {
 558     return handleLoadOrStore(LI.getType(), LI);
 559   }
 560
 561   bool visitStoreInst(StoreInst &SI) {
 562     if (SI.getOperand(0) == *U)
 563       return markAsEscaping(SI);
 564
 565     return handleLoadOrStore(SI.getOperand(0)->getType(), SI);
 566   }
 567
 568
 569   bool visitMemSetInst(MemSetInst &II) {
 570     assert(II.getRawDest() == *U && "Pointer use is not the destination?");
 571     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
 572     insertUse(II, Length ? Length->getZExtValue() : AllocSize - Offset, Length);
 573     return true;
 574   }
 575
 576   bool visitMemTransferInst(MemTransferInst &II) {
 577     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
 578     uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
 579     if (!Size)
 580       // Zero-length mem transfer intrinsics can be ignored entirely.
 581       return true;
 582
 583     MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
 584
 585     // Only intrinsics with a constant length can be split.
 586     Offsets.IsSplittable = Length;
 587
 588     if (*U != II.getRawDest()) {
 589       assert(*U == II.getRawSource());
 590       Offsets.SourceBegin = Offset;
 591       Offsets.SourceEnd = Offset + Size;
 592     } else {
 593       Offsets.DestBegin = Offset;
 594       Offsets.DestEnd = Offset + Size;
 595     }
 596
 597     insertUse(II, Size, Offsets.IsSplittable);
 598     unsigned NewIdx = P.Partitions.size() - 1;
 599
 600     SmallDenseMap<Instruction *, unsigned>::const_iterator PMI;
 601     bool Inserted = false;
 602     llvm::tie(PMI, Inserted)
 603       = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx));
 604     if (!Inserted && Offsets.IsSplittable) {
 605       // We've found a memory transfer intrinsic which refers to the alloca as
 606       // both a source and dest. We refuse to split these to simplify splitting
 607       // logic. If possible, SROA will still split them into separate allocas
 608       // and then re-analyze.
 609       Offsets.IsSplittable = false;
 610       P.Partitions[PMI->second].IsSplittable = false;
 611       P.Partitions[NewIdx].IsSplittable = false;
 612     }
 613
 614     return true;
 615   }
 616
 617   // Disable SRoA for any intrinsics except for lifetime invariants.
 618   bool visitIntrinsicInst(IntrinsicInst &II) {
 619     if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
 620         II.getIntrinsicID() == Intrinsic::lifetime_end) {
 621       ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
 622       uint64_t Size = std::min(AllocSize - Offset, Length->getLimitedValue());
 623       insertUse(II, Size, true);
 624       return true;
 625     }
 626
 627     return markAsEscaping(II);
 628   }
 629
 630   Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
 631     // We consider any PHI or select that results in a direct load or store of
 632     // the same offset to be a viable use for partitioning purposes. These uses
 633     // are considered unsplittable and the size is the maximum loaded or stored
 634     // size.
 635     SmallPtrSet<Instruction *, 4> Visited;
 636     SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
 637     Visited.insert(Root);
 638     Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
 639     do {
 640       Instruction *I, *UsedI;
 641       llvm::tie(UsedI, I) = Uses.pop_back_val();
 642
 643       if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
 644         Size = std::max(Size, TD.getTypeStoreSize(LI->getType()));
 645         continue;
 646       }
 647       if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
 648         Value *Op = SI->getOperand(0);
 649         if (Op == UsedI)
 650           return SI;
 651         Size = std::max(Size, TD.getTypeStoreSize(Op->getType()));
 652         continue;
 653       }
 654
 655       if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
 656         if (!GEP->hasAllZeroIndices())
 657           return GEP;
 658       } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
 659                  !isa<SelectInst>(I)) {
 660         return I;
 661       }
 662
 663       for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
 664            ++UI)
 665         if (Visited.insert(cast<Instruction>(*UI)))
 666           Uses.push_back(std::make_pair(I, cast<Instruction>(*UI)));
 667     } while (!Uses.empty());
 668
 669     return 0;
 670   }
 671
 672   bool visitPHINode(PHINode &PN) {
 673     // See if we already have computed info on this node.
 674     std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
 675     if (PHIInfo.first) {
 676       PHIInfo.second = true;
 677       insertUse(PN, PHIInfo.first);
 678       return true;
 679     }
 680
 681     // Check for an unsafe use of the PHI node.
 682     if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
 683       return markAsEscaping(*EscapingI);
 684
 685     insertUse(PN, PHIInfo.first);
 686     return true;
 687   }
 688
 689   bool visitSelectInst(SelectInst &SI) {
 690     if (Value *Result = foldSelectInst(SI)) {
 691       if (Result == *U)
 692         // If the result of the constant fold will be the pointer, recurse
 693         // through the select as if we had RAUW'ed it.
 694         enqueueUsers(SI, Offset);
 695
 696       return true;
 697     }
 698
 699     // See if we already have computed info on this node.
 700     std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
 701     if (SelectInfo.first) {
 702       SelectInfo.second = true;
 703       insertUse(SI, SelectInfo.first);
 704       return true;
 705     }
 706
 707     // Check for an unsafe use of the PHI node.
 708     if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
 709       return markAsEscaping(*EscapingI);
 710
 711     insertUse(SI, SelectInfo.first);
 712     return true;
 713   }
 714
 715   /// \brief Disable SROA entirely if there are unhandled users of the alloca.
 716   bool visitInstruction(Instruction &I) { return markAsEscaping(I); }
 717 };
 718
 719
 720 /// \brief Use adder for the alloca partitioning.
 721 ///
 722 /// This class adds the uses of an alloca to all of the partitions which they
 723 /// use. For splittable partitions, this can end up doing essentially a linear
 724 /// walk of the partitions, but the number of steps remains bounded by the
 725 /// total result instruction size:
 726 /// - The number of partitions is a result of the number of unsplittable
 727 ///   instructions using the alloca.
 728 /// - The number of users of each partition is at worst the total number of
 729 ///   splittable instructions using the alloca.
 730 /// Thus we will produce N * M instructions in the end, where N are the number
 731 /// of unsplittable uses and M are the number of splittable. This visitor does
 732 /// the exact same number of updates to the partitioning.
 733 ///
 734 /// In the more common case, this visitor will leverage the fact that the
 735 /// partition space is pre-sorted, and do a logarithmic search for the
 736 /// partition needed, making the total visit a classical ((N + M) * log(N))
 737 /// complexity operation.
 738 class AllocaPartitioning::UseBuilder : public BuilderBase<UseBuilder> {
 739   friend class InstVisitor<UseBuilder>;
 740
 741   /// \brief Set to de-duplicate dead instructions found in the use walk.
 742   SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
 743
 744 public:
 745   UseBuilder(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
 746       : BuilderBase<UseBuilder>(TD, AI, P) {}
 747
 748   /// \brief Run the builder over the allocation.
 749   void operator()() {
 750     // Note that we have to re-evaluate size on each trip through the loop as
 751     // the queue grows at the tail.
 752     for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
 753       U = Queue[Idx].U;
 754       Offset = Queue[Idx].Offset;
 755       this->visit(cast<Instruction>(U->getUser()));
 756     }
 757   }
 758
 759 private:
 760   void markAsDead(Instruction &I) {
 761     if (VisitedDeadInsts.insert(&I))
 762       P.DeadUsers.push_back(&I);
 763   }
 764
 765   void insertUse(uint64_t Size, Instruction &User) {
 766     uint64_t BeginOffset = Offset, EndOffset = Offset + Size;
 767
 768     // If the use extends outside of the allocation, record it as a dead use
 769     // for elimination later.
 770     if (BeginOffset >= AllocSize || Size == 0)
 771       return markAsDead(User);
 772
 773     // Bound the use by the size of the allocation.
 774     if (EndOffset > AllocSize)
 775       EndOffset = AllocSize;
 776
 777     // NB: This only works if we have zero overlapping partitions.
 778     iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset);
 779     if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset)
 780       B = llvm::prior(B);
 781     for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset;
 782          ++I) {
 783       PartitionUse NewUse(std::max(I->BeginOffset, BeginOffset),
 784                           std::min(I->EndOffset, EndOffset),
 785                           &User, cast<Instruction>(*U));
 786       P.Uses[I - P.begin()].push_back(NewUse);
 787       if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser()))
 788         P.PHIOrSelectOpMap[std::make_pair(&User, U->get())]
 789           = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1);
 790     }
 791   }
 792
 793   void handleLoadOrStore(Type *Ty, Instruction &I) {
 794     uint64_t Size = TD.getTypeStoreSize(Ty);
 795
 796     // If this memory access can be shown to *statically* extend outside the
 797     // bounds of of the allocation, it's behavior is undefined, so simply
 798     // ignore it. Note that this is more strict than the generic clamping
 799     // behavior of insertUse.
 800     if (Offset >= AllocSize || Size > AllocSize || Offset + Size > AllocSize)
 801       return markAsDead(I);
 802
 803     insertUse(Size, I);
 804   }
 805
 806   void visitBitCastInst(BitCastInst &BC) {
 807     if (BC.use_empty())
 808       return markAsDead(BC);
 809
 810     enqueueUsers(BC, Offset);
 811   }
 812
 813   void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
 814     if (GEPI.use_empty())
 815       return markAsDead(GEPI);
 816
 817     uint64_t GEPOffset;
 818     if (!computeConstantGEPOffset(GEPI, GEPOffset))
 819       llvm_unreachable("Unable to compute constant offset for use");
 820
 821     enqueueUsers(GEPI, GEPOffset);
 822   }
 823
 824   void visitLoadInst(LoadInst &LI) {
 825     handleLoadOrStore(LI.getType(), LI);
 826   }
 827
 828   void visitStoreInst(StoreInst &SI) {
 829     handleLoadOrStore(SI.getOperand(0)->getType(), SI);
 830   }
 831
 832   void visitMemSetInst(MemSetInst &II) {
 833     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
 834     insertUse(Length ? Length->getZExtValue() : AllocSize - Offset, II);
 835   }
 836
 837   void visitMemTransferInst(MemTransferInst &II) {
 838     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
 839     insertUse(Length ? Length->getZExtValue() : AllocSize - Offset, II);
 840   }
 841
 842   void visitIntrinsicInst(IntrinsicInst &II) {
 843     assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
 844            II.getIntrinsicID() == Intrinsic::lifetime_end);
 845
 846     ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
 847     insertUse(std::min(AllocSize - Offset, Length->getLimitedValue()), II);
 848   }
 849
 850   void insertPHIOrSelect(Instruction &User) {
 851     uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first;
 852
 853     // For PHI and select operands outside the alloca, we can't nuke the entire
 854     // phi or select -- the other side might still be relevant, so we special
 855     // case them here and use a separate structure to track the operands
 856     // themselves which should be replaced with undef.
 857     if (Offset >= AllocSize) {
 858       P.DeadOperands.push_back(U);
 859       return;
 860     }
 861
 862     insertUse(Size, User);
 863   }
 864   void visitPHINode(PHINode &PN) {
 865     if (PN.use_empty())
 866       return markAsDead(PN);
 867
 868     insertPHIOrSelect(PN);
 869   }
 870   void visitSelectInst(SelectInst &SI) {
 871     if (SI.use_empty())
 872       return markAsDead(SI);
 873
 874     if (Value *Result = foldSelectInst(SI)) {
 875       if (Result == *U)
 876         // If the result of the constant fold will be the pointer, recurse
 877         // through the select as if we had RAUW'ed it.
 878         enqueueUsers(SI, Offset);
 879
 880       return;
 881     }
 882
 883     insertPHIOrSelect(SI);
 884   }
 885
 886   /// \brief Unreachable, we've already visited the alloca once.
 887   void visitInstruction(Instruction &I) {
 888     llvm_unreachable("Unhandled instruction in use builder.");
 889   }
 890 };
 891
/// \brief Rewrite the partition list into a set of disjoint partitions.
///
/// Walks the partitions in index order while forming a candidate partition
/// `New`. Partitions overlapping the candidate are absorbed into it, except
/// that a splittable candidate stops at the first overlapping unsplittable
/// partition and is clipped where that partition begins. Entries that get
/// absorbed or replaced are not erased on the spot: their offsets are set to
/// UINT64_MAX and they are counted in NumDeadPartitions, while replacement
/// partitions are appended to the end of the vector. A final sort followed by
/// erasing the last NumDeadPartitions entries removes the dead ones.
///
/// The caller visible in this file (the AllocaPartitioning constructor) sorts
/// and de-duplicates Partitions before calling this.
void AllocaPartitioning::splitAndMergePartitions() {
  // Number of entries whose offsets have been overwritten with UINT64_MAX.
  size_t NumDeadPartitions = 0;

  // Track the range of splittable partitions that we pass when accumulating
  // overlapping unsplittable partitions.
  uint64_t SplitEndOffset = 0ull;

  // The partition currently being formed. An empty range (BeginOffset ==
  // EndOffset) means that nothing is carried over from the previous trip.
  Partition New(0ull, 0ull, false);

  // 'e' is captured once up front, so partitions appended during the walk are
  // not visited. 'i' is the partition being processed and 'j' runs ahead over
  // the partitions that overlap the candidate.
  for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) {
    ++j;

    // Either start a fresh candidate from partition 'i', or extend the
    // splittable candidate carried over from the previous trip.
    if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) {
      assert(New.BeginOffset == New.EndOffset);
      New = Partitions[i];
    } else {
      assert(New.IsSplittable);
      New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset);
    }
    assert(New.BeginOffset != New.EndOffset);

    // Scan the overlapping partitions.
    while (j != e && New.EndOffset > Partitions[j].BeginOffset) {
      // If the new partition we are forming is splittable, stop at the first
      // unsplittable partition.
      if (New.IsSplittable && !Partitions[j].IsSplittable)
        break;

      // Grow the new partition to include any equally splittable range. 'j' is
      // always equally splittable when New is splittable, but when New is not
      // splittable, we may subsume some (or part of some) splittable partition
      // without growing the new one.
      if (New.IsSplittable == Partitions[j].IsSplittable) {
        New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset);
      } else {
        assert(!New.IsSplittable);
        assert(Partitions[j].IsSplittable);
        SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset);
      }

      // Partition 'j' has been absorbed; mark it dead.
      Partitions[j].BeginOffset = Partitions[j].EndOffset = UINT64_MAX;
      ++NumDeadPartitions;
      ++j;
    }

    // If the new partition is splittable, chop off the end as soon as the
    // unsplittable subsequent partition starts and ensure we eventually cover
    // the splittable area.
    if (j != e && New.IsSplittable) {
      SplitEndOffset = std::max(SplitEndOffset, New.EndOffset);
      New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
    }

    // Add the new partition if it differs from the original one and is
    // non-empty. We can end up with an empty partition here if it was
    // splittable but there is an unsplittable one that starts at the same
    // offset.
    if (New != Partitions[i]) {
      if (New.BeginOffset != New.EndOffset)
        Partitions.push_back(New);
      // Mark the old one for removal.
      Partitions[i].BeginOffset = Partitions[i].EndOffset = UINT64_MAX;
      ++NumDeadPartitions;
    }

    // Prepare the candidate for the next trip: it starts out empty at the end
    // of the partition that was just finished.
    New.BeginOffset = New.EndOffset;
    if (!New.IsSplittable) {
      // After an unsplittable partition, form a splittable candidate covering
      // whatever splittable range was passed over while merging, clipped at
      // the start of the next partition if that one is unsplittable.
      New.EndOffset = std::max(New.EndOffset, SplitEndOffset);
      if (j != e && !Partitions[j].IsSplittable)
        New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
      New.IsSplittable = true;
      // If there is a trailing splittable partition which won't be fused into
      // the next splittable partition go ahead and add it onto the partitions
      // list.
      if (New.BeginOffset < New.EndOffset &&
          (j == e || !Partitions[j].IsSplittable ||
           New.EndOffset < Partitions[j].BeginOffset)) {
        Partitions.push_back(New);
        // Reset to an empty candidate so nothing is carried over.
        New.BeginOffset = New.EndOffset = 0ull;
      }
    }
  }

  // Re-sort the partitions now that they have been split and merged into
  // disjoint set of partitions. Also remove any of the dead partitions we've
  // replaced in the process.
  std::sort(Partitions.begin(), Partitions.end());
  if (NumDeadPartitions) {
    // The dead entries (offsets of UINT64_MAX) are expected to have sorted to
    // the very end, and there must be exactly NumDeadPartitions of them.
    assert(Partitions.back().BeginOffset == UINT64_MAX);
    assert(Partitions.back().EndOffset == UINT64_MAX);
    assert((ptrdiff_t)NumDeadPartitions ==
           std::count(Partitions.begin(), Partitions.end(), Partitions.back()));
  }
  Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end());
}
 987
 988 AllocaPartitioning::AllocaPartitioning(const TargetData &TD, AllocaInst &AI)
 989     : AI(AI), PointerEscapingInstr(0) {
 990   PartitionBuilder PB(TD, AI, *this);
 991   if (!PB())
 992     return;
 993
 994   if (Partitions.size() > 1) {
 995     // Sort the uses. This arranges for the offsets to be in ascending order,
 996     // and the sizes to be in descending order.
 997     std::sort(Partitions.begin(), Partitions.end());
 998
 999     // Intersect splittability for all partitions with equal offsets and sizes.
1000     // Then remove all but the first so that we have a sequence of non-equal but
1001     // potentially overlapping partitions.
1002     for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E;
1003          I = J) {
1004       ++J;
1005       while (J != E && *I == *J) {
1006         I->IsSplittable &= J->IsSplittable;
1007         ++J;
1008       }
1009     }
1010     Partitions.erase(std::unique(Partitions.begin(), Partitions.end()),
1011                      Partitions.end());
1012
1013     // Split splittable and merge unsplittable partitions into a disjoint set
1014     // of partitions over the used space of the allocation.
1015     splitAndMergePartitions();
1016   }
1017
1018   // Now build up the user lists for each of these disjoint partitions by
1019   // re-walking the recursive users of the alloca.
1020   Uses.resize(Partitions.size());
1021   UseBuilder UB(TD, AI, *this);
1022   UB();
1023   for (iterator I = Partitions.begin(), E = Partitions.end(); I != E; ++I)
1024     std::stable_sort(use_begin(I), use_end(I));
1025 }
1026
1027 Type *AllocaPartitioning::getCommonType(iterator I) const {
1028   Type *Ty = 0;
1029   for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
1030     if (isa<MemIntrinsic>(*UI->User))
1031       continue;
1032     if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
1033       break;
1034
1035     Type *UserTy = 0;
1036     if (LoadInst *LI = dyn_cast<LoadInst>(&*UI->User)) {
1037       UserTy = LI->getType();
1038     } else if (StoreInst *SI = dyn_cast<StoreInst>(&*UI->User)) {
1039       UserTy = SI->getValueOperand()->getType();
1040     } else if (SelectInst *SI = dyn_cast<SelectInst>(&*UI->User)) {
1041       if (PointerType *PtrTy = dyn_cast<PointerType>(SI->getType()))
1042         UserTy = PtrTy->getElementType();
1043     } else if (PHINode *PN = dyn_cast<PHINode>(&*UI->User)) {
1044       if (PointerType *PtrTy = dyn_cast<PointerType>(PN->getType()))
1045         UserTy = PtrTy->getElementType();
1046     }
1047
1048     if (Ty && Ty != UserTy)
1049       return 0;
1050
1051     Ty = UserTy;
1052   }
1053   return Ty;
1054 }
1055
1056 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1057
1058 void AllocaPartitioning::print(raw_ostream &OS, const_iterator I,
1059                                StringRef Indent) const {
1060   OS << Indent << "partition #" << (I - begin())
1061      << " [" << I->BeginOffset << "," << I->EndOffset << ")"
1062      << (I->IsSplittable ? " (splittable)" : "")
1063      << (Uses[I - begin()].empty() ? " (zero uses)" : "")
1064      << "\n";
1065 }
1066
1067 void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I,
1068                                     StringRef Indent) const {
1069   for (const_use_iterator UI = use_begin(I), UE = use_end(I);
1070        UI != UE; ++UI) {
1071     OS << Indent << "  [" << UI->BeginOffset << "," << UI->EndOffset << ") "
1072        << "used by: " << *UI->User << "\n";
1073     if (MemTransferInst *II = dyn_cast<MemTransferInst>(&*UI->User)) {
1074       const MemTransferOffsets &MTO = MemTransferInstData.lookup(II);
1075       bool IsDest;
1076       if (!MTO.IsSplittable)
1077         IsDest = UI->BeginOffset == MTO.DestBegin;
1078       else
1079         IsDest = MTO.DestBegin != 0u;
1080       OS << Indent << "    (original " << (IsDest ? "dest" : "source") << ": "
1081          << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin)
1082          << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n";
1083     }
1084   }
1085 }
1086
1087 void AllocaPartitioning::print(raw_ostream &OS) const {
1088   if (PointerEscapingInstr) {
1089     OS << "No partitioning for alloca: " << AI << "\n"
1090        << "  A pointer to this alloca escaped by:\n"
1091        << "  " << *PointerEscapingInstr << "\n";
1092     return;
1093   }
1094
1095   OS << "Partitioning of alloca: " << AI << "\n";
1096   unsigned Num = 0;
1097   for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) {
1098     print(OS, I);
1099     printUsers(OS, I);
1100   }
1101 }
1102
1103 void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); }
1104 void AllocaPartitioning::dump() const { print(dbgs()); }
1105
1106 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1107
1108
1109 namespace {
1110 /// \brief An optimization pass providing Scalar Replacement of Aggregates.
1111 ///
1112 /// This pass takes allocations which can be completely analyzed (that is, they
1113 /// don't escape) and tries to turn them into scalar SSA values. There are
1114 /// a few steps to this process.
1115 ///
1116 /// 1) It takes allocations of aggregates and analyzes the ways in which they
1117 ///    are used to try to split them into smaller allocations, ideally of
1118 ///    a single scalar data type. It will split up memcpy and memset accesses
1119 ///    as necessary and try to isolate invidual scalar accesses.
1120 /// 2) It will transform accesses into forms which are suitable for SSA value
1121 ///    promotion. This can be replacing a memset with a scalar store of an
1122 ///    integer value, or it can involve speculating operations on a PHI or
1123 ///    select to be a PHI or select of the results.
1124 /// 3) Finally, this will try to detect a pattern of accesses which map cleanly
1125 ///    onto insert and extract operations on a vector value, and convert them to
1126 ///    this form. By doing so, it will enable promotion of vector aggregates to
1127 ///    SSA vector values.
1128 class SROA : public FunctionPass {
1129   LLVMContext *C;
1130   const TargetData *TD;
1131   DominatorTree *DT;
1132
1133   /// \brief Worklist of alloca instructions to simplify.
1134   ///
1135   /// Each alloca in the function is added to this. Each new alloca formed gets
1136   /// added to it as well to recursively simplify unless that alloca can be
1137   /// directly promoted. Finally, each time we rewrite a use of an alloca other
1138   /// the one being actively rewritten, we add it back onto the list if not
1139   /// already present to ensure it is re-visited.
1140   SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist;
1141
1142   /// \brief A collection of instructions to delete.
1143   /// We try to batch deletions to simplify code and make things a bit more
1144   /// efficient.
1145   SmallVector<Instruction *, 8> DeadInsts;
1146
1147   /// \brief A set to prevent repeatedly marking an instruction split into many
1148   /// uses as dead. Only used to guard insertion into DeadInsts.
1149   SmallPtrSet<Instruction *, 4> DeadSplitInsts;
1150
1151   /// \brief A set of deleted alloca instructions.
1152   ///
1153   /// These pointers are *no longer valid* as they have been deleted. They are
1154   /// used to remove deleted allocas from the list of promotable allocas.
1155   SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
1156
1157   /// \brief A collection of alloca instructions we can directly promote.
1158   std::vector<AllocaInst *> PromotableAllocas;
1159
1160 public:
1161   SROA() : FunctionPass(ID), C(0), TD(0), DT(0) {
1162     initializeSROAPass(*PassRegistry::getPassRegistry());
1163   }
1164   bool runOnFunction(Function &F);
1165   void getAnalysisUsage(AnalysisUsage &AU) const;
1166
1167   const char *getPassName() const { return "SROA"; }
1168   static char ID;
1169
1170 private:
1171   friend class AllocaPartitionRewriter;
1172   friend class AllocaPartitionVectorRewriter;
1173
1174   bool rewriteAllocaPartition(AllocaInst &AI,
1175                               AllocaPartitioning &P,
1176                               AllocaPartitioning::iterator PI);
1177   bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P);
1178   bool runOnAlloca(AllocaInst &AI);
1179   void deleteDeadInstructions();
1180 };
1181 }
1182
// Out-of-class definition of the pass's static ID member, which the SROA
// constructor hands to the FunctionPass base class.
// NOTE(review): presumably the pass infrastructure identifies the pass by the
// address of this object rather than by its value -- confirm against Pass.h.
char SROA::ID = 0;
1184
1185 FunctionPass *llvm::createSROAPass() {
1186   return new SROA();
1187 }
1188
// Pass registration: these macros provide the initializeSROAPass function
// that the SROA constructor calls. The pass is registered under the argument
// string "sroa" with the description "Scalar Replacement Of Aggregates", and
// DominatorTree is declared as a dependency.
// NOTE(review): the two trailing 'false' arguments are presumably the
// CFG-only and is-analysis flags -- confirm against PassSupport.h.
INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
                    false, false)
1194
1195 /// \brief Accumulate the constant offsets in a GEP into a single APInt offset.
1196 ///
1197 /// If the provided GEP is all-constant, the total byte offset formed by the
1198 /// GEP is computed and Offset is set to it. If the GEP has any non-constant
1199 /// operands, the function returns false and the value of Offset is unmodified.
1200 static bool accumulateGEPOffsets(const TargetData &TD, GEPOperator &GEP,
1201                                  APInt &Offset) {
1202   APInt GEPOffset(Offset.getBitWidth(), 0);
1203   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1204        GTI != GTE; ++GTI) {
1205     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
1206     if (!OpC)
1207       return false;
1208     if (OpC->isZero()) continue;
1209
1210     // Handle a struct index, which adds its field offset to the pointer.
1211     if (StructType *STy = dyn_cast<StructType>(*GTI)) {
1212       unsigned ElementIdx = OpC->getZExtValue();
1213       const StructLayout *SL = TD.getStructLayout(STy);
1214       GEPOffset += APInt(Offset.getBitWidth(),
1215                          SL->getElementOffset(ElementIdx));
1216       continue;
1217     }
1218
1219     APInt TypeSize(Offset.getBitWidth(),
1220                    TD.getTypeAllocSize(GTI.getIndexedType()));
1221     if (VectorType *VTy = dyn_cast<VectorType>(*GTI)) {
1222       assert((VTy->getScalarSizeInBits() % 8) == 0 &&
1223              "vector element size is not a multiple of 8, cannot GEP over it");
1224       TypeSize = VTy->getScalarSizeInBits() / 8;
1225     }
1226
1227     GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize;
1228   }
1229   Offset = GEPOffset;
1230   return true;
1231 }
1232
1233 /// \brief Build a GEP out of a base pointer and indices.
1234 ///
1235 /// This will return the BasePtr if that is valid, or build a new GEP
1236 /// instruction using the IRBuilder if GEP-ing is needed.
1237 static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
1238                        SmallVectorImpl<Value *> &Indices,
1239                        const Twine &Prefix) {
1240   if (Indices.empty())
1241     return BasePtr;
1242
1243   // A single zero index is a no-op, so check for this and avoid building a GEP
1244   // in that case.
1245   if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
1246     return BasePtr;
1247
1248   return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx");
1249 }
1250
1251 /// \brief Get a natural GEP off of the BasePtr walking through Ty toward
1252 /// TargetTy without changing the offset of the pointer.
1253 ///
1254 /// This routine assumes we've already established a properly offset GEP with
1255 /// Indices, and arrived at the Ty type. The goal is to continue to GEP with
1256 /// zero-indices down through type layers until we find one the same as
1257 /// TargetTy. If we can't find one with the same type, we at least try to use
1258 /// one with the same size. If none of that works, we just produce the GEP as
1259 /// indicated by Indices to have the correct offset.
1260 static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const TargetData &TD,
1261                                     Value *BasePtr, Type *Ty, Type *TargetTy,
1262                                     SmallVectorImpl<Value *> &Indices,
1263                                     const Twine &Prefix) {
1264   if (Ty == TargetTy)
1265     return buildGEP(IRB, BasePtr, Indices, Prefix);
1266
1267   // See if we can descend into a struct and locate a field with the correct
1268   // type.
1269   unsigned NumLayers = 0;
1270   Type *ElementTy = Ty;
1271   do {
1272     if (ElementTy->isPointerTy())
1273       break;
1274     if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) {
1275       ElementTy = SeqTy->getElementType();
1276       Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(), 0)));
1277     } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
1278       ElementTy = *STy->element_begin();
1279       Indices.push_back(IRB.getInt32(0));
1280     } else {
1281       break;
1282     }
1283     ++NumLayers;
1284   } while (ElementTy != TargetTy);
1285   if (ElementTy != TargetTy)
1286     Indices.erase(Indices.end() - NumLayers, Indices.end());
1287
1288   return buildGEP(IRB, BasePtr, Indices, Prefix);
1289 }
1290
1291 /// \brief Recursively compute indices for a natural GEP.
1292 ///
1293 /// This is the recursive step for getNaturalGEPWithOffset that walks down the
1294 /// element types adding appropriate indices for the GEP.
1295 static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const TargetData &TD,
1296                                        Value *Ptr, Type *Ty, APInt &Offset,
1297                                        Type *TargetTy,
1298                                        SmallVectorImpl<Value *> &Indices,
1299                                        const Twine &Prefix) {
1300   if (Offset == 0)
1301     return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix);
1302
1303   // We can't recurse through pointer types.
1304   if (Ty->isPointerTy())
1305     return 0;
1306
1307   if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
1308     unsigned ElementSizeInBits = VecTy->getScalarSizeInBits();
1309     if (ElementSizeInBits % 8)
1310       return 0; // GEPs over multiple of 8 size vector elements are invalid.
1311     APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
1312     APInt NumSkippedElements = Offset.udiv(ElementSize);
1313     if (NumSkippedElements.ugt(VecTy->getNumElements()))
1314       return 0;
1315     Offset -= NumSkippedElements * ElementSize;
1316     Indices.push_back(IRB.getInt(NumSkippedElements));
1317     return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(),
1318                                     Offset, TargetTy, Indices, Prefix);
1319   }
1320
1321   if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
1322     Type *ElementTy = ArrTy->getElementType();
1323     APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
1324     APInt NumSkippedElements = Offset.udiv(ElementSize);
1325     if (NumSkippedElements.ugt(ArrTy->getNumElements()))
1326       return 0;
1327
1328     Offset -= NumSkippedElements * ElementSize;
1329     Indices.push_back(IRB.getInt(NumSkippedElements));
1330     return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
1331                                     Indices, Prefix);
1332   }
1333
1334   StructType *STy = dyn_cast<StructType>(Ty);
1335   if (!STy)
1336     return 0;
1337
1338   const StructLayout *SL = TD.getStructLayout(STy);
1339   uint64_t StructOffset = Offset.getZExtValue();
1340   if (StructOffset > SL->getSizeInBytes())
1341     return 0;
1342   unsigned Index = SL->getElementContainingOffset(StructOffset);
1343   Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
1344   Type *ElementTy = STy->getElementType(Index);
1345   if (Offset.uge(TD.getTypeAllocSize(ElementTy)))
1346     return 0; // The offset points into alignment padding.
1347
1348   Indices.push_back(IRB.getInt32(Index));
1349   return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
1350                                   Indices, Prefix);
1351 }
1352
1353 /// \brief Get a natural GEP from a base pointer to a particular offset and
1354 /// resulting in a particular type.
1355 ///
1356 /// The goal is to produce a "natural" looking GEP that works with the existing
1357 /// composite types to arrive at the appropriate offset and element type for
1358 /// a pointer. TargetTy is the element type the returned GEP should point-to if
1359 /// possible. We recurse by decreasing Offset, adding the appropriate index to
1360 /// Indices, and setting Ty to the result subtype.
1361 ///
1362 /// If no natural GEP can be constructed, this function returns null.
1363 static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const TargetData &TD,
1364                                       Value *Ptr, APInt Offset, Type *TargetTy,
1365                                       SmallVectorImpl<Value *> &Indices,
1366                                       const Twine &Prefix) {
1367   PointerType *Ty = cast<PointerType>(Ptr->getType());
1368
1369   // Don't consider any GEPs through an i8* as natural unless the TargetTy is
1370   // an i8.
1371   if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8))
1372     return 0;
1373
1374   Type *ElementTy = Ty->getElementType();
1375   APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
1376   if (ElementSize == 0)
1377     return 0; // Zero-length arrays can't help us build a natural GEP.
1378   APInt NumSkippedElements = Offset.udiv(ElementSize);
1379
1380   Offset -= NumSkippedElements * ElementSize;
1381   Indices.push_back(IRB.getInt(NumSkippedElements));
1382   return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
1383                                   Indices, Prefix);
1384 }
1385
1386 /// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
1387 /// resulting pointer has PointerTy.
1388 ///
1389 /// This tries very hard to compute a "natural" GEP which arrives at the offset
1390 /// and produces the pointer type desired. Where it cannot, it will try to use
1391 /// the natural GEP to arrive at the offset and bitcast to the type. Where that
1392 /// fails, it will try to use an existing i8* and GEP to the byte offset and
1393 /// bitcast to the type.
1394 ///
1395 /// The strategy for finding the more natural GEPs is to peel off layers of the
1396 /// pointer, walking back through bit casts and GEPs, searching for a base
1397 /// pointer from which we can compute a natural GEP with the desired
1398 /// properities. The algorithm tries to fold as many constant indices into
1399 /// a single GEP as possible, thus making each GEP more independent of the
1400 /// surrounding code.
1401 static Value *getAdjustedPtr(IRBuilder<> &IRB, const TargetData &TD,
1402                              Value *Ptr, APInt Offset, Type *PointerTy,
1403                              const Twine &Prefix) {
1404   // Even though we don't look through PHI nodes, we could be called on an
1405   // instruction in an unreachable block, which may be on a cycle.
1406   SmallPtrSet<Value *, 4> Visited;
1407   Visited.insert(Ptr);
1408   SmallVector<Value *, 4> Indices;
1409
1410   // We may end up computing an offset pointer that has the wrong type. If we
1411   // never are able to compute one directly that has the correct type, we'll
1412   // fall back to it, so keep it around here.
1413   Value *OffsetPtr = 0;
1414
1415   // Remember any i8 pointer we come across to re-use if we need to do a raw
1416   // byte offset.
1417   Value *Int8Ptr = 0;
1418   APInt Int8PtrOffset(Offset.getBitWidth(), 0);
1419
1420   Type *TargetTy = PointerTy->getPointerElementType();
1421
1422   do {
1423     // First fold any existing GEPs into the offset.
1424     while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
1425       APInt GEPOffset(Offset.getBitWidth(), 0);
1426       if (!accumulateGEPOffsets(TD, *GEP, GEPOffset))
1427         break;
1428       Offset += GEPOffset;
1429       Ptr = GEP->getPointerOperand();
1430       if (!Visited.insert(Ptr))
1431         break;
1432     }
1433
1434     // See if we can perform a natural GEP here.
1435     Indices.clear();
1436     if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy,
1437                                            Indices, Prefix)) {
1438       if (P->getType() == PointerTy) {
1439         // Zap any offset pointer that we ended up computing in previous rounds.
1440         if (OffsetPtr && OffsetPtr->use_empty())
1441           if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
1442             I->eraseFromParent();
1443         return P;
1444       }
1445       if (!OffsetPtr) {
1446         OffsetPtr = P;
1447       }
1448     }
1449
1450     // Stash this pointer if we've found an i8*.
1451     if (Ptr->getType()->isIntegerTy(8)) {
1452       Int8Ptr = Ptr;
1453       Int8PtrOffset = Offset;
1454     }
1455
1456     // Peel off a layer of the pointer and update the offset appropriately.
1457     if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
1458       Ptr = cast<Operator>(Ptr)->getOperand(0);
1459     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
1460       if (GA->mayBeOverridden())
1461         break;
1462       Ptr = GA->getAliasee();
1463     } else {
1464       break;
1465     }
1466     assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
1467   } while (Visited.insert(Ptr));
1468
1469   if (!OffsetPtr) {
1470     if (!Int8Ptr) {
1471       Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
1472                                   Prefix + ".raw_cast");
1473       Int8PtrOffset = Offset;
1474     }
1475
1476     OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
1477       IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
1478                             Prefix + ".raw_idx");
1479   }
1480   Ptr = OffsetPtr;
1481
1482   // On the off chance we were targeting i8*, guard the bitcast here.
1483   if (Ptr->getType() != PointerTy)
1484     Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast");
1485
1486   return Ptr;
1487 }
1488
1489 /// \brief Test whether the given alloca partition can be promoted to a vector.
1490 ///
1491 /// This is a quick test to check whether we can rewrite a particular alloca
1492 /// partition (and its newly formed alloca) into a vector alloca with only
1493 /// whole-vector loads and stores such that it could be promoted to a vector
1494 /// SSA value. We only can ensure this for a limited set of operations, and we
1495 /// don't want to do the rewrites unless we are confident that the result will
1496 /// be promotable, so we have an early test here.
1497 static bool isVectorPromotionViable(const TargetData &TD,
1498                                     Type *AllocaTy,
1499                                     AllocaPartitioning &P,
1500                                     uint64_t PartitionBeginOffset,
1501                                     uint64_t PartitionEndOffset,
1502                                     AllocaPartitioning::const_use_iterator I,
1503                                     AllocaPartitioning::const_use_iterator E) {
1504   VectorType *Ty = dyn_cast<VectorType>(AllocaTy);
1505   if (!Ty)
1506     return false;
1507
1508   uint64_t VecSize = TD.getTypeSizeInBits(Ty);
1509   uint64_t ElementSize = Ty->getScalarSizeInBits();
1510
1511   // While the definition of LLVM vectors is bitpacked, we don't support sizes
1512   // that aren't byte sized.
1513   if (ElementSize % 8)
1514     return false;
1515   assert((VecSize % 8) == 0 && "vector size not a multiple of element size?");
1516   VecSize /= 8;
1517   ElementSize /= 8;
1518
1519   for (; I != E; ++I) {
1520     uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset;
1521     uint64_t BeginIndex = BeginOffset / ElementSize;
1522     if (BeginIndex * ElementSize != BeginOffset ||
1523         BeginIndex >= Ty->getNumElements())
1524       return false;
1525     uint64_t EndOffset = I->EndOffset - PartitionBeginOffset;
1526     uint64_t EndIndex = EndOffset / ElementSize;
1527     if (EndIndex * ElementSize != EndOffset ||
1528         EndIndex > Ty->getNumElements())
1529       return false;
1530
1531     // FIXME: We should build shuffle vector instructions to handle
1532     // non-element-sized accesses.
1533     if ((EndOffset - BeginOffset) != ElementSize &&
1534         (EndOffset - BeginOffset) != VecSize)
1535       return false;
1536
1537     if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&*I->User)) {
1538       if (MI->isVolatile())
1539         return false;
1540       if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(&*I->User)) {
1541         const AllocaPartitioning::MemTransferOffsets &MTO
1542           = P.getMemTransferOffsets(*MTI);
1543         if (!MTO.IsSplittable)
1544           return false;
1545       }
1546     } else if (I->Ptr->getType()->getPointerElementType()->isStructTy()) {
1547       // Disable vector promotion when there are loads or stores of an FCA.
1548       return false;
1549     } else if (!isa<LoadInst>(*I->User) && !isa<StoreInst>(*I->User)) {
1550       return false;
1551     }
1552   }
1553   return true;
1554 }
1555
1556 namespace {
1557 /// \brief Visitor to rewrite instructions using a partition of an alloca to
1558 /// use a new alloca.
1559 ///
1560 /// Also implements the rewriting to vector-based accesses when the partition
1561 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
1562 /// lives here.
1563 class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
1564                                                    bool> {
1565   // Befriend the base class so it can delegate to private visit methods.
1566   friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>;
1567
1568   const TargetData &TD;
1569   AllocaPartitioning &P;
1570   SROA &Pass;
1571   AllocaInst &OldAI, &NewAI;
1572   const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
1573
1574   // If we are rewriting an alloca partition which can be written as pure
1575   // vector operations, we stash extra information here. When VecTy is
1576   // non-null, we have some strict guarantees about the rewritten alloca:
1577   //   - The new alloca is exactly the size of the vector type here.
1578   //   - The accesses all either map to the entire vector or to a single
1579   //     element.
1580   //   - The set of accessing instructions is only one of those handled above
1581   //     in isVectorPromotionViable. Generally these are the same access kinds
1582   //     which are promotable via mem2reg.
1583   VectorType *VecTy;
1584   Type *ElementTy;
1585   uint64_t ElementSize;
1586
1587   // The offset of the partition user currently being rewritten.
1588   uint64_t BeginOffset, EndOffset;
1589   Instruction *OldPtr;
1590
1591   // The name prefix to use when rewriting instructions for this alloca.
1592   std::string NamePrefix;
1593
1594 public:
1595   AllocaPartitionRewriter(const TargetData &TD, AllocaPartitioning &P,
1596                           AllocaPartitioning::iterator PI,
1597                           SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI,
1598                           uint64_t NewBeginOffset, uint64_t NewEndOffset)
1599     : TD(TD), P(P), Pass(Pass),
1600       OldAI(OldAI), NewAI(NewAI),
1601       NewAllocaBeginOffset(NewBeginOffset),
1602       NewAllocaEndOffset(NewEndOffset),
1603       VecTy(), ElementTy(), ElementSize(),
1604       BeginOffset(), EndOffset() {
1605   }
1606
1607   /// \brief Visit the users of the alloca partition and rewrite them.
1608   bool visitUsers(AllocaPartitioning::const_use_iterator I,
1609                   AllocaPartitioning::const_use_iterator E) {
1610     if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P,
1611                                 NewAllocaBeginOffset, NewAllocaEndOffset,
1612                                 I, E)) {
1613       ++NumVectorized;
1614       VecTy = cast<VectorType>(NewAI.getAllocatedType());
1615       ElementTy = VecTy->getElementType();
1616       assert((VecTy->getScalarSizeInBits() % 8) == 0 &&
1617              "Only multiple-of-8 sized vector elements are viable");
1618       ElementSize = VecTy->getScalarSizeInBits() / 8;
1619     }
1620     bool CanSROA = true;
1621     for (; I != E; ++I) {
1622       BeginOffset = I->BeginOffset;
1623       EndOffset = I->EndOffset;
1624       OldPtr = I->Ptr;
1625       NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str();
1626       CanSROA &= visit(I->User);
1627     }
1628     if (VecTy) {
1629       assert(CanSROA);
1630       VecTy = 0;
1631       ElementTy = 0;
1632       ElementSize = 0;
1633     }
1634     return CanSROA;
1635   }
1636
1637 private:
  // Fallback visitor for any instruction kind without a dedicated visit
  // method. Every instruction which can end up as a user must have a rewrite
  // rule, so reaching this is treated as unreachable; the offending
  // instruction is printed to the debug stream first.
  bool visitInstruction(Instruction &I) {
    DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
    llvm_unreachable("No rewrite rule for this instruction!");
  }
1643
1644   Twine getName(const Twine &Suffix) {
1645     return NamePrefix + Suffix;
1646   }
1647
1648   Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) {
1649     assert(BeginOffset >= NewAllocaBeginOffset);
1650     APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset);
1651     return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName(""));
1652   }
1653
1654   ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) {
1655     assert(VecTy && "Can only call getIndex when rewriting a vector");
1656     uint64_t RelOffset = Offset - NewAllocaBeginOffset;
1657     assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
1658     uint32_t Index = RelOffset / ElementSize;
1659     assert(Index * ElementSize == RelOffset);
1660     return IRB.getInt32(Index);
1661   }
1662
1663   void deleteIfTriviallyDead(Value *V) {
1664     Instruction *I = cast<Instruction>(V);
1665     if (isInstructionTriviallyDead(I))
1666       Pass.DeadInsts.push_back(I);
1667   }
1668
1669   Value *getValueCast(IRBuilder<> &IRB, Value *V, Type *Ty) {
1670     if (V->getType()->isIntegerTy() && Ty->isPointerTy())
1671       return IRB.CreateIntToPtr(V, Ty);
1672     if (V->getType()->isPointerTy() && Ty->isIntegerTy())
1673       return IRB.CreatePtrToInt(V, Ty);
1674
1675     return IRB.CreateBitCast(V, Ty);
1676   }
1677
1678   bool rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) {
1679     Value *Result;
1680     if (LI.getType() == VecTy->getElementType() ||
1681         BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
1682       Result
1683         = IRB.CreateExtractElement(IRB.CreateLoad(&NewAI, getName(".load")),
1684                                    getIndex(IRB, BeginOffset),
1685                                    getName(".extract"));
1686     } else {
1687       Result = IRB.CreateLoad(&NewAI, getName(".load"));
1688     }
1689     if (Result->getType() != LI.getType())
1690       Result = getValueCast(IRB, Result, LI.getType());
1691     LI.replaceAllUsesWith(Result);
1692     Pass.DeadInsts.push_back(&LI);
1693
1694     DEBUG(dbgs() << "          to: " << *Result << "\n");
1695     return true;
1696   }
1697
1698   bool visitLoadInst(LoadInst &LI) {
1699     DEBUG(dbgs() << "    original: " << LI << "\n");
1700     Value *OldOp = LI.getOperand(0);
1701     assert(OldOp == OldPtr);
1702     IRBuilder<> IRB(&LI);
1703
1704     if (VecTy)
1705       return rewriteVectorizedLoadInst(IRB, LI, OldOp);
1706
1707     Value *NewPtr = getAdjustedAllocaPtr(IRB,
1708                                          LI.getPointerOperand()->getType());
1709     LI.setOperand(0, NewPtr);
1710     DEBUG(dbgs() << "          to: " << LI << "\n");
1711
1712     deleteIfTriviallyDead(OldOp);
1713     return NewPtr == &NewAI && !LI.isVolatile();
1714   }
1715
1716   bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, StoreInst &SI,
1717                                   Value *OldOp) {
1718     Value *V = SI.getValueOperand();
1719     if (V->getType() == ElementTy ||
1720         BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
1721       if (V->getType() != ElementTy)
1722         V = getValueCast(IRB, V, ElementTy);
1723       V = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V,
1724                                   getIndex(IRB, BeginOffset),
1725                                   getName(".insert"));
1726     } else if (V->getType() != VecTy) {
1727       V = getValueCast(IRB, V, VecTy);
1728     }
1729     StoreInst *Store = IRB.CreateStore(V, &NewAI);
1730     Pass.DeadInsts.push_back(&SI);
1731
1732     (void)Store;
1733     DEBUG(dbgs() << "          to: " << *Store << "\n");
1734     return true;
1735   }
1736
1737   bool visitStoreInst(StoreInst &SI) {
1738     DEBUG(dbgs() << "    original: " << SI << "\n");
1739     Value *OldOp = SI.getOperand(1);
1740     assert(OldOp == OldPtr);
1741     IRBuilder<> IRB(&SI);
1742
1743     if (VecTy)
1744       return rewriteVectorizedStoreInst(IRB, SI, OldOp);
1745
1746     Value *NewPtr = getAdjustedAllocaPtr(IRB,
1747                                          SI.getPointerOperand()->getType());
1748     SI.setOperand(1, NewPtr);
1749     DEBUG(dbgs() << "          to: " << SI << "\n");
1750
1751     deleteIfTriviallyDead(OldOp);
1752     return NewPtr == &NewAI && !SI.isVolatile();
1753   }
1754
1755   bool visitMemSetInst(MemSetInst &II) {
1756     DEBUG(dbgs() << "    original: " << II << "\n");
1757     IRBuilder<> IRB(&II);
1758     assert(II.getRawDest() == OldPtr);
1759
1760     // If the memset has a variable size, it cannot be split, just adjust the
1761     // pointer to the new alloca.
1762     if (!isa<Constant>(II.getLength())) {
1763       II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
1764       deleteIfTriviallyDead(OldPtr);
1765       return false;
1766     }
1767
1768     // Record this instruction for deletion.
1769     if (Pass.DeadSplitInsts.insert(&II))
1770       Pass.DeadInsts.push_back(&II);
1771
1772     Type *AllocaTy = NewAI.getAllocatedType();
1773     Type *ScalarTy = AllocaTy->getScalarType();
1774
1775     // If this doesn't map cleanly onto the alloca type, and that type isn't
1776     // a single value type, just emit a memset.
1777     if (!VecTy && (BeginOffset != NewAllocaBeginOffset ||
1778                    EndOffset != NewAllocaEndOffset ||
1779                    !AllocaTy->isSingleValueType() ||
1780                    !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) {
1781       Type *SizeTy = II.getLength()->getType();
1782       Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
1783
1784       CallInst *New
1785         = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB,
1786                                                 II.getRawDest()->getType()),
1787                            II.getValue(), Size, II.getAlignment(),
1788                            II.isVolatile());
1789       (void)New;
1790       DEBUG(dbgs() << "          to: " << *New << "\n");
1791       return false;
1792     }
1793
1794     // If we can represent this as a simple value, we have to build the actual
1795     // value to store, which requires expanding the byte present in memset to
1796     // a sensible representation for the alloca type. This is essentially
1797     // splatting the byte to a sufficiently wide integer, bitcasting to the
1798     // desired scalar type, and splatting it across any desired vector type.
1799     Value *V = II.getValue();
1800     IntegerType *VTy = cast<IntegerType>(V->getType());
1801     Type *IntTy = Type::getIntNTy(VTy->getContext(),
1802                                   TD.getTypeSizeInBits(ScalarTy));
1803     if (TD.getTypeSizeInBits(ScalarTy) > VTy->getBitWidth())
1804       V = IRB.CreateMul(IRB.CreateZExt(V, IntTy, getName(".zext")),
1805                         ConstantExpr::getUDiv(
1806                           Constant::getAllOnesValue(IntTy),
1807                           ConstantExpr::getZExt(
1808                             Constant::getAllOnesValue(V->getType()),
1809                             IntTy)),
1810                         getName(".isplat"));
1811     if (V->getType() != ScalarTy) {
1812       if (ScalarTy->isPointerTy())
1813         V = IRB.CreateIntToPtr(V, ScalarTy);
1814       else if (ScalarTy->isPrimitiveType() || ScalarTy->isVectorTy())
1815         V = IRB.CreateBitCast(V, ScalarTy);
1816       else if (ScalarTy->isIntegerTy())
1817         llvm_unreachable("Computed different integer types with equal widths");
1818       else
1819         llvm_unreachable("Invalid scalar type");
1820     }
1821
1822     // If this is an element-wide memset of a vectorizable alloca, insert it.
1823     if (VecTy && (BeginOffset > NewAllocaBeginOffset ||
1824                   EndOffset < NewAllocaEndOffset)) {
1825       StoreInst *Store = IRB.CreateStore(
1826         IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V,
1827                                 getIndex(IRB, BeginOffset),
1828                                 getName(".insert")),
1829         &NewAI);
1830       (void)Store;
1831       DEBUG(dbgs() << "          to: " << *Store << "\n");
1832       return true;
1833     }
1834
1835     // Splat to a vector if needed.
1836     if (VectorType *VecTy = dyn_cast<VectorType>(AllocaTy)) {
1837       VectorType *SplatSourceTy = VectorType::get(V->getType(), 1);
1838       V = IRB.CreateShuffleVector(
1839         IRB.CreateInsertElement(UndefValue::get(SplatSourceTy), V,
1840                                 IRB.getInt32(0), getName(".vsplat.insert")),
1841         UndefValue::get(SplatSourceTy),
1842         ConstantVector::getSplat(VecTy->getNumElements(), IRB.getInt32(0)),
1843         getName(".vsplat.shuffle"));
1844       assert(V->getType() == VecTy);
1845     }
1846
1847     Value *New = IRB.CreateStore(V, &NewAI, II.isVolatile());
1848     (void)New;
1849     DEBUG(dbgs() << "          to: " << *New << "\n");
1850     return !II.isVolatile();
1851   }
1852
1853   bool visitMemTransferInst(MemTransferInst &II) {
1854     // Rewriting of memory transfer instructions can be a bit tricky. We break
1855     // them into two categories: split intrinsics and unsplit intrinsics.
1856
1857     DEBUG(dbgs() << "    original: " << II << "\n");
1858     IRBuilder<> IRB(&II);
1859
1860     assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
1861     bool IsDest = II.getRawDest() == OldPtr;
1862
1863     const AllocaPartitioning::MemTransferOffsets &MTO
1864       = P.getMemTransferOffsets(II);
1865
1866     // For unsplit intrinsics, we simply modify the source and destination
1867     // pointers in place. This isn't just an optimization, it is a matter of
1868     // correctness. With unsplit intrinsics we may be dealing with transfers
1869     // within a single alloca before SROA ran, or with transfers that have
1870     // a variable length. We may also be dealing with memmove instead of
1871     // memcpy, and so simply updating the pointers is the necessary for us to
1872     // update both source and dest of a single call.
1873     if (!MTO.IsSplittable) {
1874       Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource();
1875       if (IsDest)
1876         II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
1877       else
1878         II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType()));
1879
1880       DEBUG(dbgs() << "          to: " << II << "\n");
1881       deleteIfTriviallyDead(OldOp);
1882       return false;
1883     }
1884     // For split transfer intrinsics we have an incredibly useful assurance:
1885     // the source and destination do not reside within the same alloca, and at
1886     // least one of them does not escape. This means that we can replace
1887     // memmove with memcpy, and we don't need to worry about all manner of
1888     // downsides to splitting and transforming the operations.
1889
1890     // Compute the relative offset within the transfer.
1891     unsigned IntPtrWidth = TD.getPointerSizeInBits();
1892     APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
1893                                                        : MTO.SourceBegin));
1894
1895     // If this doesn't map cleanly onto the alloca type, and that type isn't
1896     // a single value type, just emit a memcpy.
1897     bool EmitMemCpy
1898       = !VecTy && (BeginOffset != NewAllocaBeginOffset ||
1899                    EndOffset != NewAllocaEndOffset ||
1900                    !NewAI.getAllocatedType()->isSingleValueType());
1901
1902     // If we're just going to emit a memcpy, the alloca hasn't changed, and the
1903     // size hasn't been shrunk based on analysis of the viable range, this is
1904     // a no-op.
1905     if (EmitMemCpy && &OldAI == &NewAI) {
1906       uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin;
1907       uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd;
1908       // Ensure the start lines up.
1909       assert(BeginOffset == OrigBegin);
1910
1911       // Rewrite the size as needed.
1912       if (EndOffset != OrigEnd)
1913         II.setLength(ConstantInt::get(II.getLength()->getType(),
1914                                       EndOffset - BeginOffset));
1915       return false;
1916     }
1917     // Record this instruction for deletion.
1918     if (Pass.DeadSplitInsts.insert(&II))
1919       Pass.DeadInsts.push_back(&II);
1920
1921     bool IsVectorElement = VecTy && (BeginOffset > NewAllocaBeginOffset ||
1922                                      EndOffset < NewAllocaEndOffset);
1923
1924     Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
1925                               : II.getRawDest()->getType();
1926     if (!EmitMemCpy)
1927       OtherPtrTy = IsVectorElement ? VecTy->getElementType()->getPointerTo()
1928                                    : NewAI.getType();
1929
1930     // Compute the other pointer, folding as much as possible to produce
1931     // a single, simple GEP in most cases.
1932     Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
1933     OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy,
1934                               getName("." + OtherPtr->getName()));
1935
1936     // Strip all inbounds GEPs and pointer casts to try to dig out any root
1937     // alloca that should be re-examined after rewriting this instruction.
1938     if (AllocaInst *AI
1939           = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets()))
1940       Pass.Worklist.insert(AI);
1941
1942     if (EmitMemCpy) {
1943       Value *OurPtr
1944         = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType()
1945                                            : II.getRawSource()->getType());
1946       Type *SizeTy = II.getLength()->getType();
1947       Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
1948
1949       CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
1950                                        IsDest ? OtherPtr : OurPtr,
1951                                        Size, II.getAlignment(),
1952                                        II.isVolatile());
1953       (void)New;
1954       DEBUG(dbgs() << "          to: " << *New << "\n");
1955       return false;
1956     }
1957
1958     Value *SrcPtr = OtherPtr;
1959     Value *DstPtr = &NewAI;
1960     if (!IsDest)
1961       std::swap(SrcPtr, DstPtr);
1962
1963     Value *Src;
1964     if (IsVectorElement && !IsDest) {
1965       // We have to extract rather than load.
1966       Src = IRB.CreateExtractElement(IRB.CreateLoad(SrcPtr,
1967                                                     getName(".copyload")),
1968                                      getIndex(IRB, BeginOffset),
1969                                      getName(".copyextract"));
1970     } else {
1971       Src = IRB.CreateLoad(SrcPtr, II.isVolatile(), getName(".copyload"));
1972     }
1973
1974     if (IsVectorElement && IsDest) {
1975       // We have to insert into a loaded copy before storing.
1976       Src = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")),
1977                                     Src, getIndex(IRB, BeginOffset),
1978                                     getName(".insert"));
1979     }
1980
1981     Value *Store = IRB.CreateStore(Src, DstPtr, II.isVolatile());
1982     (void)Store;
1983     DEBUG(dbgs() << "          to: " << *Store << "\n");
1984     return !II.isVolatile();
1985   }
1986
1987   bool visitIntrinsicInst(IntrinsicInst &II) {
1988     assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
1989            II.getIntrinsicID() == Intrinsic::lifetime_end);
1990     DEBUG(dbgs() << "    original: " << II << "\n");
1991     IRBuilder<> IRB(&II);
1992     assert(II.getArgOperand(1) == OldPtr);
1993
1994     // Record this instruction for deletion.
1995     if (Pass.DeadSplitInsts.insert(&II))
1996       Pass.DeadInsts.push_back(&II);
1997
1998     ConstantInt *Size
1999       = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
2000                          EndOffset - BeginOffset);
2001     Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType());
2002     Value *New;
2003     if (II.getIntrinsicID() == Intrinsic::lifetime_start)
2004       New = IRB.CreateLifetimeStart(Ptr, Size);
2005     else
2006       New = IRB.CreateLifetimeEnd(Ptr, Size);
2007
2008     DEBUG(dbgs() << "          to: " << *New << "\n");
2009     return true;
2010   }
2011
2012   /// PHI instructions that use an alloca and are subsequently loaded can be
2013   /// rewritten to load both input pointers in the pred blocks and then PHI the
2014   /// results, allowing the load of the alloca to be promoted.
2015   /// From this:
2016   ///   %P2 = phi [i32* %Alloca, i32* %Other]
2017   ///   %V = load i32* %P2
2018   /// to:
2019   ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
2020   ///   ...
2021   ///   %V2 = load i32* %Other
2022   ///   ...
2023   ///   %V = phi [i32 %V1, i32 %V2]
2024   ///
2025   /// We can do this to a select if its only uses are loads and if the operand
2026   /// to the select can be loaded unconditionally.
2027   ///
2028   /// FIXME: This should be hoisted into a generic utility, likely in
2029   /// Transforms/Util/Local.h
2030   bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
2031     // For now, we can only do this promotion if the load is in the same block
2032     // as the PHI, and if there are no stores between the phi and load.
2033     // TODO: Allow recursive phi users.
2034     // TODO: Allow stores.
2035     BasicBlock *BB = PN.getParent();
2036     unsigned MaxAlign = 0;
2037     for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
2038          UI != UE; ++UI) {
2039       LoadInst *LI = dyn_cast<LoadInst>(*UI);
2040       if (LI == 0 || !LI->isSimple()) return false;
2041
2042       // For now we only allow loads in the same block as the PHI.  This is
2043       // a common case that happens when instcombine merges two loads through
2044       // a PHI.
2045       if (LI->getParent() != BB) return false;
2046
2047       // Ensure that there are no instructions between the PHI and the load that
2048       // could store.
2049       for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
2050         if (BBI->mayWriteToMemory())
2051           return false;
2052
2053       MaxAlign = std::max(MaxAlign, LI->getAlignment());
2054       Loads.push_back(LI);
2055     }
2056
2057     // We can only transform this if it is safe to push the loads into the
2058     // predecessor blocks. The only thing to watch out for is that we can't put
2059     // a possibly trapping load in the predecessor if it is a critical edge.
2060     for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
2061          ++Idx) {
2062       TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
2063       Value *InVal = PN.getIncomingValue(Idx);
2064
2065       // If the value is produced by the terminator of the predecessor (an
2066       // invoke) or it has side-effects, there is no valid place to put a load
2067       // in the predecessor.
2068       if (TI == InVal || TI->mayHaveSideEffects())
2069         return false;
2070
2071       // If the predecessor has a single successor, then the edge isn't
2072       // critical.
2073       if (TI->getNumSuccessors() == 1)
2074         continue;
2075
2076       // If this pointer is always safe to load, or if we can prove that there
2077       // is already a load in the block, then we can move the load to the pred
2078       // block.
2079       if (InVal->isDereferenceablePointer() ||
2080           isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
2081         continue;
2082
2083       return false;
2084     }
2085
2086     return true;
2087   }
2088
2089   bool visitPHINode(PHINode &PN) {
2090     DEBUG(dbgs() << "    original: " << PN << "\n");
2091     // We would like to compute a new pointer in only one place, but have it be
2092     // as local as possible to the PHI. To do that, we re-use the location of
2093     // the old pointer, which necessarily must be in the right position to
2094     // dominate the PHI.
2095     IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr));
2096
2097     SmallVector<LoadInst *, 4> Loads;
2098     if (!isSafePHIToSpeculate(PN, Loads)) {
2099       Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
2100       // Replace the operands which were using the old pointer.
2101       User::op_iterator OI = PN.op_begin(), OE = PN.op_end();
2102       for (; OI != OE; ++OI)
2103         if (*OI == OldPtr)
2104           *OI = NewPtr;
2105
2106       DEBUG(dbgs() << "          to: " << PN << "\n");
2107       deleteIfTriviallyDead(OldPtr);
2108       return false;
2109     }
2110     assert(!Loads.empty());
2111
2112     Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
2113     IRBuilder<> PHIBuilder(&PN);
2114     PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues());
2115     NewPN->takeName(&PN);
2116
2117     // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
2118     // matter which one we get and if any differ, it doesn't matter.
2119     LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
2120     MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
2121     unsigned Align = SomeLoad->getAlignment();
2122     Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
2123
2124     // Rewrite all loads of the PN to use the new PHI.
2125     do {
2126       LoadInst *LI = Loads.pop_back_val();
2127       LI->replaceAllUsesWith(NewPN);
2128       Pass.DeadInsts.push_back(LI);
2129     } while (!Loads.empty());
2130
2131     // Inject loads into all of the pred blocks.
2132     for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
2133       BasicBlock *Pred = PN.getIncomingBlock(Idx);
2134       TerminatorInst *TI = Pred->getTerminator();
2135       Value *InVal = PN.getIncomingValue(Idx);
2136       IRBuilder<> PredBuilder(TI);
2137
2138       // Map the value to the new alloca pointer if this was the old alloca
2139       // pointer.
2140       bool ThisOperand = InVal == OldPtr;
2141       if (ThisOperand)
2142         InVal = NewPtr;
2143
2144       LoadInst *Load
2145         = PredBuilder.CreateLoad(InVal, getName(".sroa.speculate." +
2146                                                 Pred->getName()));
2147       ++NumLoadsSpeculated;
2148       Load->setAlignment(Align);
2149       if (TBAATag)
2150         Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
2151       NewPN->addIncoming(Load, Pred);
2152
2153       if (ThisOperand)
2154         continue;
2155       Instruction *OtherPtr = dyn_cast<Instruction>(InVal);
2156       if (!OtherPtr)
2157         // No uses to rewrite.
2158         continue;
2159
2160       // Try to lookup and rewrite any partition uses corresponding to this phi
2161       // input.
2162       AllocaPartitioning::iterator PI
2163         = P.findPartitionForPHIOrSelectOperand(PN, OtherPtr);
2164       if (PI != P.end()) {
2165         // If the other pointer is within the partitioning, replace the PHI in
2166         // its uses with the load we just speculated, or add another load for
2167         // it to rewrite if we've already replaced the PHI.
2168         AllocaPartitioning::use_iterator UI
2169           = P.findPartitionUseForPHIOrSelectOperand(PN, OtherPtr);
2170         if (isa<PHINode>(*UI->User))
2171           UI->User = Load;
2172         else {
2173           AllocaPartitioning::PartitionUse OtherUse = *UI;
2174           OtherUse.User = Load;
2175           P.use_insert(PI, std::upper_bound(UI, P.use_end(PI), OtherUse),
2176                        OtherUse);
2177         }
2178       }
2179     }
2180     DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
2181     return NewPtr == &NewAI;
2182   }
2183
2184   /// Select instructions that use an alloca and are subsequently loaded can be
2185   /// rewritten to load both input pointers and then select between the result,
2186   /// allowing the load of the alloca to be promoted.
2187   /// From this:
2188   ///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
2189   ///   %V = load i32* %P2
2190   /// to:
2191   ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
2192   ///   %V2 = load i32* %Other
2193   ///   %V = select i1 %cond, i32 %V1, i32 %V2
2194   ///
2195   /// We can do this to a select if its only uses are loads and if the operand
2196   /// to the select can be loaded unconditionally.
2197   bool isSafeSelectToSpeculate(SelectInst &SI,
2198                                SmallVectorImpl<LoadInst *> &Loads) {
2199     Value *TValue = SI.getTrueValue();
2200     Value *FValue = SI.getFalseValue();
2201     bool TDerefable = TValue->isDereferenceablePointer();
2202     bool FDerefable = FValue->isDereferenceablePointer();
2203
2204     for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end();
2205          UI != UE; ++UI) {
2206       LoadInst *LI = dyn_cast<LoadInst>(*UI);
2207       if (LI == 0 || !LI->isSimple()) return false;
2208
2209       // Both operands to the select need to be dereferencable, either
2210       // absolutely (e.g. allocas) or at this point because we can see other
2211       // accesses to it.
2212       if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI,
2213                                                       LI->getAlignment(), &TD))
2214         return false;
2215       if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI,
2216                                                       LI->getAlignment(), &TD))
2217         return false;
2218       Loads.push_back(LI);
2219     }
2220
2221     return true;
2222   }
2223
2224   bool visitSelectInst(SelectInst &SI) {
2225     DEBUG(dbgs() << "    original: " << SI << "\n");
2226     IRBuilder<> IRB(&SI);
2227
2228     // Find the operand we need to rewrite here.
2229     bool IsTrueVal = SI.getTrueValue() == OldPtr;
2230     if (IsTrueVal)
2231       assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!");
2232     else
2233       assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!");
2234     Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType());
2235
2236     // If the select isn't safe to speculate, just use simple logic to emit it.
2237     SmallVector<LoadInst *, 4> Loads;
2238     if (!isSafeSelectToSpeculate(SI, Loads)) {
2239       SI.setOperand(IsTrueVal ? 1 : 2, NewPtr);
2240       DEBUG(dbgs() << "          to: " << SI << "\n");
2241       deleteIfTriviallyDead(OldPtr);
2242       return false;
2243     }
2244
2245     Value *OtherPtr = IsTrueVal ? SI.getFalseValue() : SI.getTrueValue();
2246     AllocaPartitioning::iterator PI
2247       = P.findPartitionForPHIOrSelectOperand(SI, OtherPtr);
2248     AllocaPartitioning::PartitionUse OtherUse;
2249     if (PI != P.end()) {
2250       // If the other pointer is within the partitioning, remove the select
2251       // from its uses. We'll add in the new loads below.
2252       AllocaPartitioning::use_iterator UI
2253         = P.findPartitionUseForPHIOrSelectOperand(SI, OtherPtr);
2254       OtherUse = *UI;
2255       P.use_erase(PI, UI);
2256     }
2257
2258     Value *TV = IsTrueVal ? NewPtr : SI.getTrueValue();
2259     Value *FV = IsTrueVal ? SI.getFalseValue() : NewPtr;
2260     // Replace the loads of the select with a select of two loads.
2261     while (!Loads.empty()) {
2262       LoadInst *LI = Loads.pop_back_val();
2263
2264       IRB.SetInsertPoint(LI);
2265       LoadInst *TL =
2266         IRB.CreateLoad(TV, getName("." + LI->getName() + ".true"));
2267       LoadInst *FL =
2268         IRB.CreateLoad(FV, getName("." + LI->getName() + ".false"));
2269       NumLoadsSpeculated += 2;
2270       if (PI != P.end()) {
2271         LoadInst *OtherLoad = IsTrueVal ? FL : TL;
2272         assert(OtherUse.Ptr == OtherLoad->getOperand(0));
2273         OtherUse.User = OtherLoad;
2274         P.use_insert(PI, P.use_end(PI), OtherUse);
2275       }
2276
2277       // Transfer alignment and TBAA info if present.
2278       TL->setAlignment(LI->getAlignment());
2279       FL->setAlignment(LI->getAlignment());
2280       if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
2281         TL->setMetadata(LLVMContext::MD_tbaa, Tag);
2282         FL->setMetadata(LLVMContext::MD_tbaa, Tag);
2283       }
2284
2285       Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL);
2286       V->takeName(LI);
2287       DEBUG(dbgs() << "          speculated to: " << *V << "\n");
2288       LI->replaceAllUsesWith(V);
2289       Pass.DeadInsts.push_back(LI);
2290     }
2291     if (PI != P.end())
2292       std::stable_sort(P.use_begin(PI), P.use_end(PI));
2293
2294     deleteIfTriviallyDead(OldPtr);
2295     return NewPtr == &NewAI;
2296   }
2297
2298 };
2299 }
2300
2301 /// \brief Try to find a partition of the aggregate type passed in for a given
2302 /// offset and size.
2303 ///
2304 /// This recurses through the aggregate type and tries to compute a subtype
2305 /// based on the offset and size. When the offset and size span a sub-section
2306 /// of an array, it will even compute a new array type for that sub-section.
2307 static Type *getTypePartition(const TargetData &TD, Type *Ty,
2308                               uint64_t Offset, uint64_t Size) {
2309   if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
2310     return Ty;
2311
2312   if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
2313     // We can't partition pointers...
2314     if (SeqTy->isPointerTy())
2315       return 0;
2316
2317     Type *ElementTy = SeqTy->getElementType();
2318     uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
2319     uint64_t NumSkippedElements = Offset / ElementSize;
2320     if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy))
2321       if (NumSkippedElements >= ArrTy->getNumElements())
2322         return 0;
2323     if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy))
2324       if (NumSkippedElements >= VecTy->getNumElements())
2325         return 0;
2326     Offset -= NumSkippedElements * ElementSize;
2327
2328     // First check if we need to recurse.
2329     if (Offset > 0 || Size < ElementSize) {
2330       // Bail if the partition ends in a different array element.
2331       if ((Offset + Size) > ElementSize)
2332         return 0;
2333       // Recurse through the element type trying to peel off offset bytes.
2334       return getTypePartition(TD, ElementTy, Offset, Size);
2335     }
2336     assert(Offset == 0);
2337
2338     if (Size == ElementSize)
2339       return ElementTy;
2340     assert(Size > ElementSize);
2341     uint64_t NumElements = Size / ElementSize;
2342     if (NumElements * ElementSize != Size)
2343       return 0;
2344     return ArrayType::get(ElementTy, NumElements);
2345   }
2346
2347   StructType *STy = dyn_cast<StructType>(Ty);
2348   if (!STy)
2349     return 0;
2350
2351   const StructLayout *SL = TD.getStructLayout(STy);
2352   if (Offset > SL->getSizeInBytes())
2353     return 0;
2354   uint64_t EndOffset = Offset + Size;
2355   if (EndOffset > SL->getSizeInBytes())
2356     return 0;
2357
2358   unsigned Index = SL->getElementContainingOffset(Offset);
2359   if (SL->getElementOffset(Index) != Offset)
2360     return 0; // Inside of padding.
2361   Offset -= SL->getElementOffset(Index);
2362
2363   Type *ElementTy = STy->getElementType(Index);
2364   uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
2365   if (Offset >= ElementSize)
2366     return 0; // The offset points into alignment padding.
2367
2368   // See if any partition must be contained by the element.
2369   if (Offset > 0 || Size < ElementSize) {
2370     if ((Offset + Size) > ElementSize)
2371       return 0;
2372     // Bail if this is a poniter element, we can't recurse through them.
2373     if (ElementTy->isPointerTy())
2374       return 0;
2375     return getTypePartition(TD, ElementTy, Offset, Size);
2376   }
2377   assert(Offset == 0);
2378
2379   if (Size == ElementSize)
2380     return ElementTy;
2381
2382   StructType::element_iterator EI = STy->element_begin() + Index,
2383                                EE = STy->element_end();
2384   if (EndOffset < SL->getSizeInBytes()) {
2385     unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
2386     if (Index == EndIndex)
2387       return 0; // Within a single element and its padding.
2388     assert(Index < EndIndex);
2389     assert(Index + EndIndex <= STy->getNumElements());
2390     EE = STy->element_begin() + EndIndex;
2391   }
2392
2393   // Try to build up a sub-structure.
2394   SmallVector<Type *, 4> ElementTys;
2395   do {
2396     ElementTys.push_back(*EI++);
2397   } while (EI != EE);
2398   StructType *SubTy = StructType::get(STy->getContext(), ElementTys,
2399                                       STy->isPacked());
2400   const StructLayout *SubSL = TD.getStructLayout(SubTy);
2401   if (Size == SubSL->getSizeInBytes())
2402     return SubTy;
2403
2404   // FIXME: We could potentially recurse down through the last element in the
2405   // sub-struct to find a natural end point.
2406   return 0;
2407 }
2408
2409 /// \brief Rewrite an alloca partition's users.
2410 ///
2411 /// This routine drives both of the rewriting goals of the SROA pass. It tries
2412 /// to rewrite uses of an alloca partition to be conducive for SSA value
2413 /// promotion. If the partition needs a new, more refined alloca, this will
2414 /// build that new alloca, preserving as much type information as possible, and
2415 /// rewrite the uses of the old alloca to point at the new one and have the
2416 /// appropriate new offsets. It also evaluates how successful the rewrite was
2417 /// at enabling promotion and if it was successful queues the alloca to be
2418 /// promoted.
2419 bool SROA::rewriteAllocaPartition(AllocaInst &AI,
2420                                   AllocaPartitioning &P,
2421                                   AllocaPartitioning::iterator PI) {
2422   uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
2423   if (P.use_begin(PI) == P.use_end(PI))
2424     return false; // No live uses left of this partition.
2425
2426   // Try to compute a friendly type for this partition of the alloca. This
2427   // won't always succeed, in which case we fall back to a legal integer type
2428   // or an i8 array of an appropriate size.
2429   Type *AllocaTy = 0;
2430   if (Type *PartitionTy = P.getCommonType(PI))
2431     if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize)
2432       AllocaTy = PartitionTy;
2433   if (!AllocaTy)
2434     if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
2435                                              PI->BeginOffset, AllocaSize))
2436       AllocaTy = PartitionTy;
2437   if ((!AllocaTy ||
2438        (AllocaTy->isArrayTy() &&
2439         AllocaTy->getArrayElementType()->isIntegerTy())) &&
2440       TD->isLegalInteger(AllocaSize * 8))
2441     AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
2442   if (!AllocaTy)
2443     AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
2444   assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize);
2445
2446   // Check for the case where we're going to rewrite to a new alloca of the
2447   // exact same type as the original, and with the same access offsets. In that
2448   // case, re-use the existing alloca, but still run through the rewriter to
2449   // performe phi and select speculation.
2450   AllocaInst *NewAI;
2451   if (AllocaTy == AI.getAllocatedType()) {
2452     assert(PI->BeginOffset == 0 &&
2453            "Non-zero begin offset but same alloca type");
2454     assert(PI == P.begin() && "Begin offset is zero on later partition");
2455     NewAI = &AI;
2456   } else {
2457     // FIXME: The alignment here is overly conservative -- we could in many
2458     // cases get away with much weaker alignment constraints.
2459     NewAI = new AllocaInst(AllocaTy, 0, AI.getAlignment(),
2460                            AI.getName() + ".sroa." + Twine(PI - P.begin()),
2461                            &AI);
2462     ++NumNewAllocas;
2463   }
2464
2465   DEBUG(dbgs() << "Rewriting alloca partition "
2466                << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
2467                << *NewAI << "\n");
2468
2469   AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
2470                                    PI->BeginOffset, PI->EndOffset);
2471   DEBUG(dbgs() << "  rewriting ");
2472   DEBUG(P.print(dbgs(), PI, ""));
2473   if (Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI))) {
2474     DEBUG(dbgs() << "  and queuing for promotion\n");
2475     PromotableAllocas.push_back(NewAI);
2476   } else if (NewAI != &AI) {
2477     // If we can't promote the alloca, iterate on it to check for new
2478     // refinements exposed by splitting the current alloca. Don't iterate on an
2479     // alloca which didn't actually change and didn't get promoted.
2480     Worklist.insert(NewAI);
2481   }
2482   return true;
2483 }
2484
2485 /// \brief Walks the partitioning of an alloca rewriting uses of each partition.
2486 bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
2487   bool Changed = false;
2488   for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
2489        ++PI)
2490     Changed |= rewriteAllocaPartition(AI, P, PI);
2491
2492   return Changed;
2493 }
2494
2495 /// \brief Analyze an alloca for SROA.
2496 ///
2497 /// This analyzes the alloca to ensure we can reason about it, builds
2498 /// a partitioning of the alloca, and then hands it off to be split and
2499 /// rewritten as needed.
2500 bool SROA::runOnAlloca(AllocaInst &AI) {
2501   DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
2502   ++NumAllocasAnalyzed;
2503
2504   // Special case dead allocas, as they're trivial.
2505   if (AI.use_empty()) {
2506     AI.eraseFromParent();
2507     return true;
2508   }
2509
2510   // Skip alloca forms that this analysis can't handle.
2511   if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
2512       TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
2513     return false;
2514
2515   // First check if this is a non-aggregate type that we should simply promote.
2516   if (!AI.getAllocatedType()->isAggregateType() && isAllocaPromotable(&AI)) {
2517     DEBUG(dbgs() << "  Trivially scalar type, queuing for promotion...\n");
2518     PromotableAllocas.push_back(&AI);
2519     return false;
2520   }
2521
2522   // Build the partition set using a recursive instruction-visiting builder.
2523   AllocaPartitioning P(*TD, AI);
2524   DEBUG(P.print(dbgs()));
2525   if (P.isEscaped())
2526     return false;
2527
2528   // No partitions to split. Leave the dead alloca for a later pass to clean up.
2529   if (P.begin() == P.end())
2530     return false;
2531
2532   // Delete all the dead users of this alloca before splitting and rewriting it.
2533   bool Changed = false;
2534   for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
2535                                               DE = P.dead_user_end();
2536        DI != DE; ++DI) {
2537     Changed = true;
2538     (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
2539     DeadInsts.push_back(*DI);
2540   }
2541   for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(),
2542                                             DE = P.dead_op_end();
2543        DO != DE; ++DO) {
2544     Value *OldV = **DO;
2545     // Clobber the use with an undef value.
2546     **DO = UndefValue::get(OldV->getType());
2547     if (Instruction *OldI = dyn_cast<Instruction>(OldV))
2548       if (isInstructionTriviallyDead(OldI)) {
2549         Changed = true;
2550         DeadInsts.push_back(OldI);
2551       }
2552   }
2553
2554   return splitAlloca(AI, P) || Changed;
2555 }
2556
2557 void SROA::deleteDeadInstructions() {
2558   DeadSplitInsts.clear();
2559   while (!DeadInsts.empty()) {
2560     Instruction *I = DeadInsts.pop_back_val();
2561     DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
2562
2563     for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
2564       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
2565         // Zero out the operand and see if it becomes trivially dead.
2566         *OI = 0;
2567         if (isInstructionTriviallyDead(U))
2568           DeadInsts.push_back(U);
2569       }
2570
2571     if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
2572       DeletedAllocas.insert(AI);
2573
2574     ++NumDeleted;
2575     I->eraseFromParent();
2576   }
2577 }
2578
2579 namespace {
2580   /// \brief A predicate to test whether an alloca belongs to a set.
2581   class IsAllocaInSet {
2582     typedef SmallPtrSet<AllocaInst *, 4> SetType;
2583     const SetType &Set;
2584
2585   public:
2586     IsAllocaInSet(const SetType &Set) : Set(Set) {}
2587     bool operator()(AllocaInst *AI) { return Set.count(AI); }
2588   };
2589 }
2590
2591 bool SROA::runOnFunction(Function &F) {
2592   DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
2593   C = &F.getContext();
2594   TD = getAnalysisIfAvailable<TargetData>();
2595   if (!TD) {
2596     DEBUG(dbgs() << "  Skipping SROA -- no target data!\n");
2597     return false;
2598   }
2599   DT = &getAnalysis<DominatorTree>();
2600
2601   BasicBlock &EntryBB = F.getEntryBlock();
2602   for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
2603        I != E; ++I)
2604     if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
2605       Worklist.insert(AI);
2606
2607   bool Changed = false;
2608   while (!Worklist.empty()) {
2609     Changed |= runOnAlloca(*Worklist.pop_back_val());
2610     deleteDeadInstructions();
2611     if (!DeletedAllocas.empty()) {
2612       PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
2613                                              PromotableAllocas.end(),
2614                                              IsAllocaInSet(DeletedAllocas)),
2615                               PromotableAllocas.end());
2616       DeletedAllocas.clear();
2617     }
2618   }
2619
2620   if (!PromotableAllocas.empty()) {
2621     DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
2622     PromoteMemToReg(PromotableAllocas, *DT);
2623     Changed = true;
2624     NumPromoted += PromotableAllocas.size();
2625     PromotableAllocas.clear();
2626   }
2627
2628   return Changed;
2629 }
2630
2631 void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
2632   AU.addRequired<DominatorTree>();
2633   AU.setPreservesCFG();
2634 }