From: Jiangning Liu <jiangning.liu@arm.com>
Date: Thu, 15 May 2014 23:45:42 +0000 (+0000)
Subject: Implement global merge optimization for global variables.
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=d5db8765d61ca77a55867cf1f39aecb8cae3a6cd;p=oota-llvm.git

Implement global merge optimization for global variables.

This commit adds two command-line switches, -global-merge-on-external and
-global-merge-aligned. Both of them are false by default, so this
optimization is disabled by default for all targets.

For ARM64, some back-end behaviors still need to be tuned before this
optimization can be enabled further.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208934 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h
index 5aa42612562..7aa07149495 100644
--- a/include/llvm/IR/GlobalAlias.h
+++ b/include/llvm/IR/GlobalAlias.h
@@ -82,6 +82,10 @@ public:
   static inline bool classof(const Value *V) {
     return V->getValueID() == Value::GlobalAliasVal;
   }
+
+  // Return the constant byte offset accumulated from the getelementptr
+  // constant expressions found while walking this alias's aliasee chain.
+  uint64_t calculateOffset(const DataLayout &DL) const;
 };
 
 template <>
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index dc2c74a835b..bd81f8e4242 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -32,6 +32,7 @@
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Target/TargetCallingConv.h"
 #include "llvm/Target/TargetMachine.h"
@@ -883,6 +884,12 @@ public:
     return 0;
   }
 
+  /// Returns the alignment to apply to the merged global created by the
+  /// global merge pass. By default, the ABI alignment of the merged struct.
+  virtual unsigned getGlobalMergeAlignment(StructType *MergedTy) const {
+    return getDataLayout()->getABITypeAlignment(MergedTy);
+  }
+
   /// Returns true if a cast between SrcAS and DestAS is a noop.
   virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
     return false;
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 37a2c3220cb..0c4865f2f27 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -946,8 +946,11 @@ bool AsmPrinter::doFinalization(Module &M) {
       EmitVisibility(Name, Alias.getVisibility());
 
       // Emit the directives as assignments aka .set:
-      OutStreamer.EmitAssignment(Name,
-                                 MCSymbolRefExpr::Create(Target, OutContext));
+      const MCExpr *Expr = MCSymbolRefExpr::Create(Target, OutContext);
+      if (uint64_t Offset = Alias.calculateOffset(*TM.getDataLayout()))
+        Expr = MCBinaryExpr::CreateAdd(Expr,
+                 MCConstantExpr::Create(Offset, OutContext), OutContext);
+      OutStreamer.EmitAssignment(Name, Expr);
     }
   }
 
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 0ec54fe3c08..d64046a7bf9 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -15,6 +15,7 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -282,3 +283,27 @@ GlobalObject *GlobalAlias::getAliasedGlobal() {
       return cast<GlobalObject>(GV);
   }
 }
+
+uint64_t GlobalAlias::calculateOffset(const DataLayout &DL) const {
+  // Walk from this alias down to the aliased global, summing the constant
+  // byte offset contributed by each GEP constant expression on the way.
+  uint64_t Total = 0;
+  for (const Constant *Cur = this; Cur;) {
+    if (isa<GlobalAlias>(Cur)) {
+      Cur = cast<GlobalAlias>(Cur)->getAliasee();
+    } else if (const ConstantExpr *Expr = dyn_cast<ConstantExpr>(Cur)) {
+      // Only GEPs add to the offset; other expressions are looked through.
+      if (Expr->getOpcode() == Instruction::GetElementPtr) {
+        std::vector<Value*> Indices(Expr->op_begin() + 1, Expr->op_end());
+        Total += DL.getIndexedOffset(Expr->getOperand(0)->getType(), Indices);
+      }
+      Cur = Expr->getOperand(0);
+    } else if (isa<GlobalValue>(Cur)) {
+      return Total;
+    } else {
+      assert(0 && "Unexpected type in alias chain!");
+      return 0;
+    }
+  }
+  return Total;
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 852d324476a..a676600e5df 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5560,3 +5560,17 @@ unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
   return 4095;
 }
 
+/// getGlobalMergeAlignment - Use the allocation size of the merged global
+/// struct, rounded up to a power of 2 when it is not one already, as its
+/// alignment. This way, we can guarantee the merged global variable data
+/// structure doesn't cross a page boundary, because the OS usually allocates
+/// pages at 4096-byte aligned boundaries.
+unsigned AArch64TargetLowering::getGlobalMergeAlignment(
+                                  StructType *MergedTy) const {
+  const unsigned Size = getDataLayout()->getTypeAllocSize(MergedTy);
+  // Sizes with at most one bit set are returned unchanged.
+  if ((Size & (Size - 1)) == 0)
+    return Size;
+  return llvm::NextPowerOf2(Size);
+}
+
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 070db94808f..9818b7a2dcb 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -386,6 +386,10 @@ public:
   /// be used for loads / stores from the global.
   unsigned getMaximalGlobalOffset() const override;
 
+  /// getGlobalMergeAlignment - Return the allocation size of the merged
+  /// global struct, rounded up to a power of 2, to be used as its alignment.
+  unsigned getGlobalMergeAlignment(StructType *MergedTy) const override;
+
 protected:
   std::pair<const TargetRegisterClass*, uint8_t>
   findRepresentativeClass(MVT VT) const override;
diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp
index 118007fa70c..880df0c1a81 100644
--- a/lib/Target/ARM64/ARM64ISelLowering.cpp
+++ b/lib/Target/ARM64/ARM64ISelLowering.cpp
@@ -630,6 +630,20 @@ unsigned ARM64TargetLowering::getMaximalGlobalOffset() const {
   return 4095;
 }
 
+/// getGlobalMergeAlignment - Use the allocation size of the merged global
+/// struct, rounded up to a power of 2 when it is not one already, as its
+/// alignment. This way, we can guarantee the merged global variable data
+/// structure doesn't cross a page boundary, because the OS usually allocates
+/// pages at 4096-byte aligned boundaries.
+unsigned ARM64TargetLowering::getGlobalMergeAlignment(
+                                StructType *MergedTy) const {
+  const unsigned Size = getDataLayout()->getTypeAllocSize(MergedTy);
+  // Sizes with at most one bit set are returned unchanged.
+  if ((Size & (Size - 1)) == 0)
+    return Size;
+  return llvm::NextPowerOf2(Size);
+}
+
 FastISel *
 ARM64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                     const TargetLibraryInfo *libInfo) const {
diff --git a/lib/Target/ARM64/ARM64ISelLowering.h b/lib/Target/ARM64/ARM64ISelLowering.h
index 55792317dba..00b2710a35c 100644
--- a/lib/Target/ARM64/ARM64ISelLowering.h
+++ b/lib/Target/ARM64/ARM64ISelLowering.h
@@ -236,6 +236,10 @@ public:
   /// be used for loads / stores from the global.
   unsigned getMaximalGlobalOffset() const override;
 
+  /// getGlobalMergeAlignment - Return the allocation size of the merged
+  /// global struct, rounded up to a power of 2, to be used as its alignment.
+  unsigned getGlobalMergeAlignment(StructType *MergedTy) const override;
+
   /// Returns true if a cast between SrcAS and DestAS is a noop.
   bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
     // Addrspacecasts are always noops.
diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp
index dd9c3784cc2..98061f5b9a8 100644
--- a/lib/Transforms/Scalar/GlobalMerge.cpp
+++ b/lib/Transforms/Scalar/GlobalMerge.cpp
@@ -72,7 +72,7 @@ using namespace llvm;
 #define DEBUG_TYPE "global-merge"
 
 static cl::opt<bool>
-EnableGlobalMerge("global-merge", cl::Hidden,
+EnableGlobalMerge("enable-global-merge", cl::NotHidden,
                   cl::desc("Enable global merge pass"),
                   cl::init(true));
 
@@ -81,6 +81,16 @@ EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden,
                          cl::desc("Enable global merge pass on constants"),
                          cl::init(false));
 
+static cl::opt<bool>
+EnableGlobalMergeOnExternal("global-merge-on-external", cl::Hidden,
+                   cl::desc("Enable global merge pass on external linkage"),
+                   cl::init(false));
+
+static cl::opt<bool>
+EnableGlobalMergeAligned("global-merge-aligned", cl::Hidden,
+                   cl::desc("Set target specific alignment for global merge pass"),
+                   cl::init(false));
+
 STATISTIC(NumMerged      , "Number of globals merged");
 namespace {
   class GlobalMerge : public FunctionPass {
@@ -129,9 +139,21 @@ namespace {
 } // end anonymous namespace
 
 char GlobalMerge::ID = 0;
-INITIALIZE_PASS(GlobalMerge, "global-merge",
-                "Global Merge", false, false)
 
+static void *initializeGlobalMergePassOnce(PassRegistry &Registry) {
+  // Describe GlobalMerge with both a default and a TargetMachine constructor.
+  PassInfo::NormalCtor_t DefaultCtor = callDefaultCtor<GlobalMerge>;
+  PassInfo::TargetMachineCtor_t TMCtor = callTargetMachineCtor<GlobalMerge>;
+  PassInfo *Info =
+      new PassInfo("Merge global variables", "global-merge", &GlobalMerge::ID,
+                   DefaultCtor, false, false, TMCtor);
+  Registry.registerPass(*Info, true);
+  return Info;
+}
+
+void llvm::initializeGlobalMergePass(PassRegistry &Registry) {
+  CALL_ONCE_INITIALIZATION(initializeGlobalMergePassOnce)
+}
 
 bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
                           Module &M, bool isConst, unsigned AddrSpace) const {
@@ -154,11 +176,16 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
 
   Type *Int32Ty = Type::getInt32Ty(M.getContext());
 
+  assert (Globals.size() > 1);
+  
   for (size_t i = 0, e = Globals.size(); i != e; ) {
     size_t j = 0;
     uint64_t MergedSize = 0;
     std::vector<Type*> Tys;
     std::vector<Constant*> Inits;
+
+    bool HasExternal = false;
+    GlobalVariable *TheFirstExternal = 0;
     for (j = i; j != e; ++j) {
       Type *Ty = Globals[j]->getType()->getElementType();
       MergedSize += DL->getTypeAllocSize(Ty);
@@ -167,17 +194,45 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
       }
       Tys.push_back(Ty);
       Inits.push_back(Globals[j]->getInitializer());
+
+      if (Globals[j]->hasExternalLinkage() && !HasExternal) {
+        HasExternal = true;
+        TheFirstExternal = Globals[j];
+      }
     }
 
+    // If none of the merged variables has external linkage, we needn't expose
+    // the symbol after merging.
+    GlobalValue::LinkageTypes Linkage = HasExternal ?
+                                          GlobalValue::ExternalLinkage :
+                                          GlobalValue::InternalLinkage ;
+
+    // If merged variables have external linkage, we use the symbol name of the
+    // first external variable as a suffix to avoid link-time name conflicts.
+    // Built as std::string: a stored Twine would dangle past this statement.
+    std::string MergedGVName = HasExternal ?
+        ("_MergedGlobals_" + TheFirstExternal->getName()).str() :
+        std::string("_MergedGlobals");
+
     StructType *MergedTy = StructType::get(M.getContext(), Tys);
     Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);
+
     GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst,
-                                                  GlobalValue::InternalLinkage,
-                                                  MergedInit, "_MergedGlobals",
-                                                  nullptr,
-                                                  GlobalVariable::NotThreadLocal,
-                                                  AddrSpace);
+                                     Linkage, MergedInit, MergedGVName,
+                                     nullptr, GlobalVariable::NotThreadLocal,
+                                     AddrSpace);
+
+    if (EnableGlobalMergeAligned) {
+      unsigned Align = TLI->getGlobalMergeAlignment(MergedTy);
+      assert(((Align % DL->getABITypeAlignment(MergedTy)) == 0) &&
+        "Specified alignment doesn't meet natural alignment requirement.");
+      MergedGV->setAlignment(Align);
+    }
+
     for (size_t k = i; k < j; ++k) {
+      GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage();
+      std::string Name = Globals[k]->getName();
+
       Constant *Idx[2] = {
         ConstantInt::get(Int32Ty, 0),
         ConstantInt::get(Int32Ty, k-i)
@@ -185,6 +240,12 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
       Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx);
       Globals[k]->replaceAllUsesWith(GEP);
       Globals[k]->eraseFromParent();
+
+      if (Linkage != GlobalValue::InternalLinkage) {
+        // Generate a new alias...
+        new GlobalAlias(GEP->getType(), Linkage, Name, GEP, &M);
+      }
+
       NumMerged++;
     }
     i = j;
@@ -245,8 +306,12 @@ bool GlobalMerge::doInitialization(Module &M) {
   // Grab all non-const globals.
   for (Module::global_iterator I = M.global_begin(),
          E = M.global_end(); I != E; ++I) {
-    // Merge is safe for "normal" internal globals only
-    if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection())
+    // Merge is safe for "normal" internal or external globals only
+    if (I->isDeclaration() || I->isThreadLocal() || I->hasSection())
+      continue;
+
+    if (!(EnableGlobalMergeOnExternal && I->hasExternalLinkage())
+          && !I->hasInternalLinkage())
       continue;
 
     PointerType *PT = dyn_cast<PointerType>(I->getType());
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index f8f828c8405..2cca7254806 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -38,6 +38,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeDSEPass(Registry);
   initializeGVNPass(Registry);
   initializeEarlyCSEPass(Registry);
+  initializeGlobalMergePass(Registry);
   initializeIndVarSimplifyPass(Registry);
   initializeJumpThreadingPass(Registry);
   initializeLICMPass(Registry);
diff --git a/test/CodeGen/AArch64/global-merge.ll b/test/CodeGen/AArch64/global-merge.ll
new file mode 100644
index 00000000000..101a5b778ed
--- /dev/null
+++ b/test/CodeGen/AArch64/global-merge.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck --check-prefix=NO-MERGE %s
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -O0 -global-merge-on-external=true -global-merge-aligned=true | FileCheck --check-prefix=NO-MERGE %s
+
+; RUN: llc < %s -mtriple=arm64-apple-ios -O0 | FileCheck %s --check-prefix=CHECK-APPLE-IOS-NO-MERGE
+; RUN: llc < %s -mtriple=arm64-apple-ios -O0 -global-merge-on-external=true -global-merge-aligned=false | FileCheck %s --check-prefix=CHECK-APPLE-IOS-NO-MERGE
+
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -O1 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -O1 -global-merge-on-external=true -global-merge-aligned=true | FileCheck %s
+
+; RUN: llc < %s -mtriple=arm64-apple-ios -O1 | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc < %s -mtriple=arm64-apple-ios -O1 -global-merge-on-external=true -global-merge-aligned=false | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+
+@m = internal global i32 0, align 4
+@n = internal global i32 0, align 4
+
+define void @f1(i32 %a1, i32 %a2) {
+; CHECK-LABEL: f1:
+; CHECK: adrp x{{[0-9]+}}, _MergedGlobals
+; CHECK-NOT: adrp
+
+; CHECK-APPLE-IOS-LABEL: f1:
+; CHECK-APPLE-IOS: adrp x{{[0-9]+}}, __MergedGlobals
+; CHECK-APPLE-IOS-NOT: adrp
+  store i32 %a1, i32* @m, align 4
+  store i32 %a2, i32* @n, align 4
+  ret void
+}
+
+; CHECK:        .local _MergedGlobals
+; CHECK:        .comm  _MergedGlobals,8,8
+; NO-MERGE-NOT: .local _MergedGlobals
+
+; CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,3
+; CHECK-APPLE-IOS-NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,8,3
diff --git a/test/CodeGen/AArch64/global_merge_1.ll b/test/CodeGen/AArch64/global_merge_1.ll
deleted file mode 100644
index e0587d6b904..00000000000
--- a/test/CodeGen/AArch64/global_merge_1.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-
-@m = internal global i32 0, align 4
-@n = internal global i32 0, align 4
-
-define void @f1(i32 %a1, i32 %a2) {
-; CHECK-LABEL: f1:
-; CHECK: adrp x{{[0-9]+}}, _MergedGlobals
-; CHECK-NOT: adrp
-  store i32 %a1, i32* @m, align 4
-  store i32 %a2, i32* @n, align 4
-  ret void
-}
-
-; CHECK:        .local _MergedGlobals
-; CHECK:        .comm  _MergedGlobals,8,8
-
diff --git a/test/CodeGen/ARM/global-merge-1.ll b/test/CodeGen/ARM/global-merge-1.ll
new file mode 100644
index 00000000000..341597e6188
--- /dev/null
+++ b/test/CodeGen/ARM/global-merge-1.ll
@@ -0,0 +1,85 @@
+; RUN: llc %s -O0 -o - | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -enable-global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -enable-global-merge=true | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O1 -o - | FileCheck -check-prefix=MERGE %s
+; RUN: llc %s -O1 -o - -enable-global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O1 -o - -enable-global-merge=true | FileCheck -check-prefix=MERGE %s
+
+; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
+; MERGE: .zerofill __DATA,__bss,__MergedGlobals,60,4
+; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
+
+; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
+; NO-MERGE: .zerofill __DATA,__bss,_bar,20,2
+; NO-MERGE: .zerofill __DATA,__bss,_baz,20,2
+; NO-MERGE: .zerofill __DATA,__bss,_foo,20,2
+; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+@bar = internal global [5 x i32] zeroinitializer, align 4
+@baz = internal global [5 x i32] zeroinitializer, align 4
+@foo = internal global [5 x i32] zeroinitializer, align 4
+
+; Function Attrs: nounwind ssp
+define internal void @initialize() #0 {
+  %1 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %1, i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 0), align 4, !tbaa !1
+  %2 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %2, i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 0), align 4, !tbaa !1
+  %3 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %3, i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 1), align 4, !tbaa !1
+  %4 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %4, i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 1), align 4, !tbaa !1
+  %5 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %5, i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 2), align 4, !tbaa !1
+  %6 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %6, i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 2), align 4, !tbaa !1
+  %7 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %7, i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 3), align 4, !tbaa !1
+  %8 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %8, i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 3), align 4, !tbaa !1
+  %9 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %9, i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 4), align 4, !tbaa !1
+  %10 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %10, i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 4), align 4, !tbaa !1
+  ret void
+}
+
+declare i32 @calc(...) #1
+
+; Function Attrs: nounwind ssp
+define internal void @calculate() #0 {
+  %1 = load <4 x i32>* bitcast ([5 x i32]* @bar to <4 x i32>*), align 4
+  %2 = load <4 x i32>* bitcast ([5 x i32]* @baz to <4 x i32>*), align 4
+  %3 = mul <4 x i32> %2, %1
+  store <4 x i32> %3, <4 x i32>* bitcast ([5 x i32]* @foo to <4 x i32>*), align 4
+  %4 = load i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 4), align 4, !tbaa !1
+  %5 = load i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 4), align 4, !tbaa !1
+  %6 = mul nsw i32 %5, %4
+  store i32 %6, i32* getelementptr inbounds ([5 x i32]* @foo, i32 0, i32 4), align 4, !tbaa !1
+  ret void
+}
+
+; Function Attrs: nounwind readnone ssp
+define internal i32* @returnFoo() #2 {
+  ret i32* getelementptr inbounds ([5 x i32]* @foo, i32 0, i32 0)
+}
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"LLVM version 3.4 "}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/ARM64/global-merge.ll b/test/CodeGen/ARM64/global-merge.ll
new file mode 100644
index 00000000000..4715fd8de23
--- /dev/null
+++ b/test/CodeGen/ARM64/global-merge.ll
@@ -0,0 +1,88 @@
+; RUN: llc %s -O0 -o - | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -enable-global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -enable-global-merge=true | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O1 -o - | FileCheck -check-prefix=MERGE %s
+; RUN: llc %s -O1 -o - -enable-global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O1 -o - -enable-global-merge=true | FileCheck -check-prefix=MERGE %s
+
+; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
+; MERGE: .zerofill __DATA,__bss,__MergedGlobals,60,4
+; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
+
+; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
+; NO-MERGE: .zerofill __DATA,__bss,_bar,20,2
+; NO-MERGE: .zerofill __DATA,__bss,_baz,20,2
+; NO-MERGE: .zerofill __DATA,__bss,_foo,20,2
+; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+@bar = internal global [5 x i32] zeroinitializer, align 4
+@baz = internal global [5 x i32] zeroinitializer, align 4
+@foo = internal global [5 x i32] zeroinitializer, align 4
+
+; Function Attrs: nounwind ssp
+define internal void @initialize() #0 {
+  %1 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %1, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 0), align 4
+  %2 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %2, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 0), align 4
+  %3 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %3, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 1), align 4
+  %4 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %4, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 1), align 4
+  %5 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %5, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 2), align 4
+  %6 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %6, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 2), align 4
+  %7 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %7, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 3), align 4
+  %8 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %8, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 3), align 4
+  %9 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %9, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 4), align 4
+  %10 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %10, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 4), align 4
+  ret void
+}
+
+declare i32 @calc(...)
+
+; Function Attrs: nounwind ssp
+define internal void @calculate() #0 {
+  %1 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 0), align 4
+  %2 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 0), align 4
+  %3 = mul nsw i32 %2, %1
+  store i32 %3, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 0), align 4
+  %4 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 1), align 4
+  %5 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 1), align 4
+  %6 = mul nsw i32 %5, %4
+  store i32 %6, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 1), align 4
+  %7 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 2), align 4
+  %8 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 2), align 4
+  %9 = mul nsw i32 %8, %7
+  store i32 %9, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 2), align 4
+  %10 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 3), align 4
+  %11 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 3), align 4
+  %12 = mul nsw i32 %11, %10
+  store i32 %12, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 3), align 4
+  %13 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 4), align 4
+  %14 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 4), align 4
+  %15 = mul nsw i32 %14, %13
+  store i32 %15, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 4), align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone ssp
+define internal i32* @returnFoo() #1 {
+  ret i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 0)
+}
+
+attributes #0 = { nounwind ssp }
+attributes #1 = { nounwind readnone ssp }
+attributes #2 = { nounwind }
diff --git a/test/Transforms/GlobalMerge/AArch64/global-merge-1.ll b/test/Transforms/GlobalMerge/AArch64/global-merge-1.ll
new file mode 100644
index 00000000000..346f176ec71
--- /dev/null
+++ b/test/Transforms/GlobalMerge/AArch64/global-merge-1.ll
@@ -0,0 +1,22 @@
+; RUN: opt %s -mtriple=aarch64-none-linux-gnu -global-merge -S -o - | FileCheck %s
+; RUN: opt %s -mtriple=aarch64-none-linux-gnu -global-merge -global-merge-on-external -global-merge-aligned -S -o - | FileCheck %s
+
+; RUN: opt %s -mtriple=arm64-linux-gnuabi -global-merge -S -o - | FileCheck %s
+; RUN: opt %s -mtriple=arm64-linux-gnuabi -global-merge -global-merge-on-external -global-merge-aligned -S -o - | FileCheck %s
+
+; RUN: opt %s -mtriple=arm64-apple-ios -global-merge -S -o - | FileCheck %s
+; RUN: opt %s -mtriple=arm64-apple-ios -global-merge -global-merge-on-external -global-merge-aligned -S -o - | FileCheck %s
+
+@m = internal global i32 0, align 4
+@n = internal global i32 0, align 4
+
+; CHECK: @_MergedGlobals = internal global { i32, i32 } zeroinitializer
+
+define void @f1(i32 %a1, i32 %a2) {
+; CHECK-LABEL: @f1
+; CHECK: getelementptr inbounds ({ i32, i32 }* @_MergedGlobals, i32 0, i32 0)
+; CHECK: getelementptr inbounds ({ i32, i32 }* @_MergedGlobals, i32 0, i32 1)
+  store i32 %a1, i32* @m, align 4
+  store i32 %a2, i32* @n, align 4
+  ret void
+}
diff --git a/test/Transforms/GlobalMerge/AArch64/global-merge-2.ll b/test/Transforms/GlobalMerge/AArch64/global-merge-2.ll
new file mode 100644
index 00000000000..0445b237f5e
--- /dev/null
+++ b/test/Transforms/GlobalMerge/AArch64/global-merge-2.ll
@@ -0,0 +1,30 @@
+; RUN: opt %s -mtriple=aarch64-none-linux-gnu -global-merge -global-merge-on-external -global-merge-aligned -S -o - | FileCheck %s
+; RUN: opt %s -mtriple=arm64-linux-gnuabi -global-merge -global-merge-on-external -global-merge-aligned -S -o - | FileCheck %s
+; RUN: opt %s -mtriple=arm64-apple-ios -global-merge -global-merge-on-external -global-merge-aligned -S -o - | FileCheck %s
+
+@x = global i32 0, align 4
+@y = global i32 0, align 4
+@z = global i32 0, align 4
+
+; CHECK: @_MergedGlobals_x = global { i32, i32, i32 } zeroinitializer, align 16
+; CHECK: @x = alias getelementptr inbounds ({ i32, i32, i32 }* @_MergedGlobals_x, i32 0, i32 0)
+; CHECK: @y = alias getelementptr inbounds ({ i32, i32, i32 }* @_MergedGlobals_x, i32 0, i32 1)
+; CHECK: @z = alias getelementptr inbounds ({ i32, i32, i32 }* @_MergedGlobals_x, i32 0, i32 2)
+
+define void @f1(i32 %a1, i32 %a2) {
+; CHECK-LABEL: @f1
+; CHECK: getelementptr inbounds ({ i32, i32, i32 }* @_MergedGlobals_x, i32 0, i32 0)
+; CHECK: getelementptr inbounds ({ i32, i32, i32 }* @_MergedGlobals_x, i32 0, i32 1)
+  store i32 %a1, i32* @x, align 4
+  store i32 %a2, i32* @y, align 4
+  ret void
+}
+
+define void @g1(i32 %a1, i32 %a2) {
+; CHECK-LABEL: @g1
+; CHECK: getelementptr inbounds ({ i32, i32, i32 }* @_MergedGlobals_x, i32 0, i32 1)
+; CHECK: getelementptr inbounds ({ i32, i32, i32 }* @_MergedGlobals_x, i32 0, i32 2)
+  store i32 %a1, i32* @y, align 4
+  store i32 %a2, i32* @z, align 4
+  ret void
+}
diff --git a/test/Transforms/GlobalMerge/AArch64/global-merge-3.ll b/test/Transforms/GlobalMerge/AArch64/global-merge-3.ll
new file mode 100644
index 00000000000..05ab1ac836d
--- /dev/null
+++ b/test/Transforms/GlobalMerge/AArch64/global-merge-3.ll
@@ -0,0 +1,27 @@
+; RUN: opt %s -mtriple=aarch64-none-linux-gnu -global-merge -global-merge-on-external -global-merge-aligned -S -o - | FileCheck %s
+; RUN: opt %s -mtriple=arm64-linux-gnuabi -global-merge -global-merge-on-external -global-merge-aligned -S -o - | FileCheck %s
+; RUN: opt %s -mtriple=arm64-apple-ios -global-merge -global-merge-on-external -global-merge-aligned -S -o - | FileCheck %s
+
+@x = global [1000 x i32] zeroinitializer, align 1
+@y = global [1000 x i32] zeroinitializer, align 1
+@z = internal global i32 1, align 4
+
+; CHECK: @_MergedGlobals_x = global { i32, [1000 x i32] } { i32 1, [1000 x i32] zeroinitializer }, align 4096
+; CHECK: @_MergedGlobals_y = global { [1000 x i32] } zeroinitializer, align 4096
+
+; CHECK: @x = alias getelementptr inbounds ({ i32, [1000 x i32] }* @_MergedGlobals_x, i32 0, i32 1)
+; CHECK: @y = alias getelementptr inbounds ({ [1000 x i32] }* @_MergedGlobals_y, i32 0, i32 0)
+
+define void @f1(i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: @f1
+; CHECK: %x3 = getelementptr inbounds [1000 x i32]* getelementptr inbounds ({ i32, [1000 x i32] }* @_MergedGlobals_x, i32 0, i32 1), i32 0, i64 3
+; CHECK: %y3 = getelementptr inbounds [1000 x i32]* getelementptr inbounds ({ [1000 x i32] }* @_MergedGlobals_y, i32 0, i32 0), i32 0, i64 3
+; CHECK: store i32 %a3, i32* getelementptr inbounds ({ i32, [1000 x i32] }* @_MergedGlobals_x, i32 0, i32 0), align 4
+
+  %x3 = getelementptr inbounds [1000 x i32]* @x, i32 0, i64 3
+  %y3 = getelementptr inbounds [1000 x i32]* @y, i32 0, i64 3
+  store i32 %a1, i32* %x3, align 4
+  store i32 %a2, i32* %y3, align 4
+  store i32 %a3, i32* @z, align 4
+  ret void
+}
diff --git a/test/Transforms/GlobalMerge/AArch64/lit.local.cfg b/test/Transforms/GlobalMerge/AArch64/lit.local.cfg
new file mode 100644
index 00000000000..9a66a00189e
--- /dev/null
+++ b/test/Transforms/GlobalMerge/AArch64/lit.local.cfg
@@ -0,0 +1,4 @@
+targets = set(config.root.targets_to_build.split())  # whitespace-separated target names from the root config, as a set
+if not 'AArch64' in targets:  # true when AArch64 is absent from that set
+    config.unsupported = True  # mark this configuration as unsupported
+
diff --git a/test/Transforms/GlobalMerge/ARM/arm.ll b/test/Transforms/GlobalMerge/ARM/arm.ll
index 8c77de62ece..45062af1177 100644
--- a/test/Transforms/GlobalMerge/ARM/arm.ll
+++ b/test/Transforms/GlobalMerge/ARM/arm.ll
@@ -1,23 +1,4 @@
-; RUN: llc %s -O0 -o - | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O0 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O0 -o - -global-merge=true | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O1 -o - | FileCheck -check-prefix=MERGE %s
-; RUN: llc %s -O1 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O1 -o - -global-merge=true | FileCheck -check-prefix=MERGE %s
-
-; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
-; MERGE: .zerofill __DATA,__bss,__MergedGlobals,60,4
-; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
-
-; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
-; NO-MERGE: .zerofill __DATA,__bss,_bar,20,2
-; NO-MERGE: .zerofill __DATA,__bss,_baz,20,2
-; NO-MERGE: .zerofill __DATA,__bss,_foo,20,2
-; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
+; RUN: opt %s -mtriple=arm-linux-gnuabi -global-merge -S -o - | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios3.0.0"
@@ -26,6 +7,8 @@ target triple = "thumbv7-apple-ios3.0.0"
 @baz = internal global [5 x i32] zeroinitializer, align 4
 @foo = internal global [5 x i32] zeroinitializer, align 4
 
+; CHECK: @_MergedGlobals = internal global { [5 x i32], [5 x i32], [5 x i32] } zeroinitializer
+
 ; Function Attrs: nounwind ssp
 define internal void @initialize() #0 {
   %1 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
diff --git a/test/Transforms/GlobalMerge/ARM64/arm64.ll b/test/Transforms/GlobalMerge/ARM64/arm64.ll
index eea474a74f1..ebc362ea97d 100644
--- a/test/Transforms/GlobalMerge/ARM64/arm64.ll
+++ b/test/Transforms/GlobalMerge/ARM64/arm64.ll
@@ -1,23 +1,6 @@
-; RUN: llc %s -O0 -o - | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O0 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O0 -o - -global-merge=true | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O1 -o - | FileCheck -check-prefix=MERGE %s
-; RUN: llc %s -O1 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O1 -o - -global-merge=true | FileCheck -check-prefix=MERGE %s
+; RUN: opt %s -mtriple=arm64-linux-gnuabi -global-merge -S -o - | FileCheck %s
 
-; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
-; MERGE: .zerofill __DATA,__bss,__MergedGlobals,60,4
-; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
-
-; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
-; NO-MERGE: .zerofill __DATA,__bss,_bar,20,2
-; NO-MERGE: .zerofill __DATA,__bss,_baz,20,2
-; NO-MERGE: .zerofill __DATA,__bss,_foo,20,2
-; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
+; CHECK: @_MergedGlobals = internal global { [5 x i32], [5 x i32], [5 x i32] } zeroinitializer
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
 target triple = "arm64-apple-ios7.0.0"