#define DEBUG_TYPE "ctrloops"
#include "PPC.h"
#include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "llvm/Constants.h"
#include "llvm/PassSupport.h"
#include "llvm/ADT/DenseMap.h"
/// getCanonicalInductionVariable - Check to see if the loop has a canonical
/// induction variable.
/// Should be defined in MachineLoop. Based upon version in class Loop.
- MachineInstr *getCanonicalInductionVariable(MachineLoop *L,
- MachineInstr *&IOp) const;
+ void getCanonicalInductionVariable(MachineLoop *L,
+ SmallVector<MachineInstr *, 4> &IVars,
+ SmallVector<MachineInstr *, 4> &IOps) const;
/// getTripCount - Return a loop-invariant LLVM register indicating the
/// number of times the loop will be executed. If the trip-count cannot
/// be determined, this return null.
- CountValue *getTripCount(MachineLoop *L, bool &WordCmp,
+ CountValue *getTripCount(MachineLoop *L,
SmallVector<MachineInstr *, 2> &OldInsts) const;
/// isInductionOperation - Return true if the instruction matches the
/// isCompareEquals - Returns true if the instruction is a compare equals
/// instruction with an immediate operand.
-static bool isCompareEqualsImm(const MachineInstr *MI, bool &WordCmp) {
- if (MI->getOpcode() == PPC::CMPWI || MI->getOpcode() == PPC::CMPLWI) {
- WordCmp = true;
+static bool isCompareEqualsImm(const MachineInstr *MI, bool &SignedCmp) {
+ if (MI->getOpcode() == PPC::CMPWI || MI->getOpcode() == PPC::CMPDI) {
+ SignedCmp = true;
return true;
- } else if (MI->getOpcode() == PPC::CMPDI || MI->getOpcode() == PPC::CMPLDI) {
- WordCmp = false;
+ } else if (MI->getOpcode() == PPC::CMPLWI || MI->getOpcode() == PPC::CMPLDI) {
+ SignedCmp = false;
return true;
}
/// the machine.
/// This method assumes that the IndVarSimplify pass has been run by 'opt'.
///
-MachineInstr
-*PPCCTRLoops::getCanonicalInductionVariable(MachineLoop *L,
- MachineInstr *&IOp) const {
+void
+PPCCTRLoops::getCanonicalInductionVariable(MachineLoop *L,
+ SmallVector<MachineInstr *, 4> &IVars,
+ SmallVector<MachineInstr *, 4> &IOps) const {
MachineBasicBlock *TopMBB = L->getTopBlock();
MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
assert(PI != TopMBB->pred_end() &&
"Loop must have more than one incoming edge!");
MachineBasicBlock *Backedge = *PI++;
- if (PI == TopMBB->pred_end()) return 0; // dead loop
+ if (PI == TopMBB->pred_end()) return; // dead loop
MachineBasicBlock *Incoming = *PI++;
- if (PI != TopMBB->pred_end()) return 0; // multiple backedges?
+ if (PI != TopMBB->pred_end()) return; // multiple backedges?
// make sure there is one incoming and one backedge and determine which
// is which.
if (L->contains(Incoming)) {
if (L->contains(Backedge))
- return 0;
+ return;
std::swap(Incoming, Backedge);
} else if (!L->contains(Backedge))
- return 0;
+ return;
// Loop over all of the PHI nodes, looking for a canonical induction variable:
// - The PHI node is "reg1 = PHI reg2, BB1, reg3, BB2".
// Check if the definition is an induction operation.
MachineInstr *DI = MRI->getVRegDef(MPhi->getOperand(i).getReg());
if (isInductionOperation(DI, DefReg)) {
- IOp = DI;
- return MPhi;
+ IOps.push_back(DI);
+ IVars.push_back(MPhi);
}
}
}
}
- return 0;
+ return;
}
/// getTripCount - Return a loop-invariant LLVM value indicating the
///
/// Based upon getTripCount in LoopInfo.
///
-CountValue *PPCCTRLoops::getTripCount(MachineLoop *L, bool &WordCmp,
+CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
SmallVector<MachineInstr *, 2> &OldInsts) const {
+ MachineBasicBlock *LastMBB = L->getExitingBlock();
+ // Don't generate a CTR loop if the loop has more than one exit.
+ if (LastMBB == 0)
+ return 0;
+
+ MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+ if (LastI->getOpcode() != PPC::BCC)
+ return 0;
+
+ // We need to make sure that this compare is defining the condition
+ // register actually used by the terminating branch.
+
+ unsigned PredReg = LastI->getOperand(1).getReg();
+ DEBUG(dbgs() << "Examining loop with first terminator: " << *LastI);
+
+ unsigned PredCond = LastI->getOperand(0).getImm();
+ if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE)
+ return 0;
+
// Check that the loop has a induction variable.
- MachineInstr *IOp;
- MachineInstr *IV_Inst = getCanonicalInductionVariable(L, IOp);
- if (IV_Inst == 0) return 0;
-
- // Canonical loops will end with a 'cmpwi/cmpdi cr, IV, Imm',
- // if Imm is 0, get the count from the PHI opnd
- // if Imm is -M, than M is the count
- // Otherwise, Imm is the count
- MachineOperand *IV_Opnd;
- const MachineOperand *InitialValue;
- if (!L->contains(IV_Inst->getOperand(2).getMBB())) {
- InitialValue = &IV_Inst->getOperand(1);
- IV_Opnd = &IV_Inst->getOperand(3);
- } else {
- InitialValue = &IV_Inst->getOperand(3);
- IV_Opnd = &IV_Inst->getOperand(1);
- }
+ SmallVector<MachineInstr *, 4> IVars, IOps;
+ getCanonicalInductionVariable(L, IVars, IOps);
+ for (unsigned i = 0; i < IVars.size(); ++i) {
+ MachineInstr *IOp = IOps[i];
+ MachineInstr *IV_Inst = IVars[i];
+
+ // Canonical loops will end with a 'cmpwi/cmpdi cr, IV, Imm',
+ // if Imm is 0, get the count from the PHI opnd
+ // if Imm is -M, than M is the count
+ // Otherwise, Imm is the count
+ MachineOperand *IV_Opnd;
+ const MachineOperand *InitialValue;
+ if (!L->contains(IV_Inst->getOperand(2).getMBB())) {
+ InitialValue = &IV_Inst->getOperand(1);
+ IV_Opnd = &IV_Inst->getOperand(3);
+ } else {
+ InitialValue = &IV_Inst->getOperand(3);
+ IV_Opnd = &IV_Inst->getOperand(1);
+ }
- // Look for the cmp instruction to determine if we
- // can get a useful trip count. The trip count can
- // be either a register or an immediate. The location
- // of the value depends upon the type (reg or imm).
- while ((IV_Opnd = IV_Opnd->getNextOperandForReg())) {
- MachineInstr *MI = IV_Opnd->getParent();
- if (L->contains(MI) && isCompareEqualsImm(MI, WordCmp)) {
- OldInsts.push_back(MI);
- OldInsts.push_back(IOp);
-
- const MachineOperand &MO = MI->getOperand(2);
- assert(MO.isImm() && "IV Cmp Operand should be an immediate");
- int64_t ImmVal = MO.getImm();
-
- const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg());
- assert(L->contains(IV_DefInstr->getParent()) &&
- "IV definition should occurs in loop");
- int64_t iv_value = IV_DefInstr->getOperand(2).getImm();
-
- if (ImmVal == 0) {
- // Make sure the induction variable changes by one on each iteration.
- if (iv_value != 1 && iv_value != -1) {
- return 0;
- }
- return new CountValue(InitialValue->getReg(), iv_value > 0);
- } else {
+ DEBUG(dbgs() << "Considering:\n");
+ DEBUG(dbgs() << " induction operation: " << *IOp);
+ DEBUG(dbgs() << " induction variable: " << *IV_Inst);
+ DEBUG(dbgs() << " initial value: " << *InitialValue << "\n");
+
+ // Look for the cmp instruction to determine if we
+ // can get a useful trip count. The trip count can
+ // be either a register or an immediate. The location
+ // of the value depends upon the type (reg or imm).
+ while ((IV_Opnd = IV_Opnd->getNextOperandForReg())) {
+ bool SignedCmp;
+ MachineInstr *MI = IV_Opnd->getParent();
+ if (L->contains(MI) && isCompareEqualsImm(MI, SignedCmp) &&
+ MI->getOperand(0).getReg() == PredReg) {
+
+ OldInsts.push_back(MI);
+ OldInsts.push_back(IOp);
+
+ DEBUG(dbgs() << " compare: " << *MI);
+
+ const MachineOperand &MO = MI->getOperand(2);
+ assert(MO.isImm() && "IV Cmp Operand should be an immediate");
+
+ int64_t ImmVal;
+ if (SignedCmp)
+ ImmVal = (short) MO.getImm();
+ else
+ ImmVal = MO.getImm();
+
+ const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg());
+ assert(L->contains(IV_DefInstr->getParent()) &&
+ "IV definition should occurs in loop");
+ int64_t iv_value = (short) IV_DefInstr->getOperand(2).getImm();
+
assert(InitialValue->isReg() && "Expecting register for init value");
- const MachineInstr *DefInstr = MRI->getVRegDef(InitialValue->getReg());
-
+ unsigned InitialValueReg = InitialValue->getReg();
+
+ const MachineInstr *DefInstr = MRI->getVRegDef(InitialValueReg);
+
// Here we need to look for an immediate load (an li or lis/ori pair).
if (DefInstr && (DefInstr->getOpcode() == PPC::ORI8 ||
DefInstr->getOpcode() == PPC::ORI)) {
- int64_t start = DefInstr->getOperand(2).getImm();
+ int64_t start = (short) DefInstr->getOperand(2).getImm();
const MachineInstr *DefInstr2 =
MRI->getVRegDef(DefInstr->getOperand(0).getReg());
if (DefInstr2 && (DefInstr2->getOpcode() == PPC::LIS8 ||
DefInstr2->getOpcode() == PPC::LIS)) {
- start |= DefInstr2->getOperand(1).getImm() << 16;
+ DEBUG(dbgs() << " initial constant: " << *DefInstr);
+ DEBUG(dbgs() << " initial constant: " << *DefInstr2);
+ start |= int64_t(short(DefInstr2->getOperand(1).getImm())) << 16;
+
int64_t count = ImmVal - start;
if ((count % iv_value) != 0) {
return 0;
}
} else if (DefInstr && (DefInstr->getOpcode() == PPC::LI8 ||
DefInstr->getOpcode() == PPC::LI)) {
- int64_t count = ImmVal - DefInstr->getOperand(1).getImm();
+ DEBUG(dbgs() << " initial constant: " << *DefInstr);
+
+ int64_t count = ImmVal - int64_t(short(DefInstr->getOperand(1).getImm()));
if ((count % iv_value) != 0) {
return 0;
}
return new CountValue(count/iv_value);
+ } else if (iv_value == 1 || iv_value == -1) {
+ // We can't determine a constant starting value.
+ if (ImmVal == 0) {
+ return new CountValue(InitialValueReg, iv_value > 0);
+ }
+ // FIXME: handle non-zero end value.
}
+ // FIXME: handle non-unit increments (we might not want to introduce division
+ // but we can handle some 2^n cases with shifts).
+
}
}
}
return Changed;
}
- bool WordCmp;
SmallVector<MachineInstr *, 2> OldInsts;
// Are we able to determine the trip count for the loop?
- CountValue *TripCount = getTripCount(L, WordCmp, OldInsts);
+ CountValue *TripCount = getTripCount(L, OldInsts);
if (TripCount == 0) {
DEBUG(dbgs() << "failed to get trip count!\n");
return false;
const PPCSubtarget &Subtarget = MF->getTarget().getSubtarget<PPCSubtarget>();
bool isPPC64 = Subtarget.isPPC64();
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
+
unsigned CountReg;
if (TripCount->isReg()) {
// Create a copy of the loop count register.
- const TargetRegisterClass *RC =
+ const TargetRegisterClass *SrcRC =
MF->getRegInfo().getRegClass(TripCount->getReg());
CountReg = MF->getRegInfo().createVirtualRegister(RC);
+ unsigned CopyOp = (isPPC64 && SrcRC == GPRC) ?
+ (unsigned) PPC::EXTSW_32_64 :
+ (unsigned) TargetOpcode::COPY;
BuildMI(*Preheader, InsertPos, dl,
- TII->get(TargetOpcode::COPY), CountReg).addReg(TripCount->getReg());
+ TII->get(CopyOp), CountReg).addReg(TripCount->getReg());
if (TripCount->isNeg()) {
unsigned CountReg1 = CountReg;
CountReg = MF->getRegInfo().createVirtualRegister(RC);
TII->get(isPPC64 ? PPC::NEG8 : PPC::NEG),
CountReg).addReg(CountReg1);
}
-
- // On a 64-bit system, if the original comparison was only 32-bit, then
- // mask out the higher-order part of the count.
- if (isPPC64 && WordCmp) {
- unsigned CountReg1 = CountReg;
- CountReg = MF->getRegInfo().createVirtualRegister(RC);
- BuildMI(*Preheader, InsertPos, dl,
- TII->get(PPC::RLDICL), CountReg).addReg(CountReg1
- ).addImm(0).addImm(32);
- }
} else {
assert(TripCount->isImm() && "Expecting immedate vaule for trip count");
// Put the trip count in a register for transfer into the count register.
- const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
- const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
int64_t CountImm = TripCount->getImm();
- if (TripCount->isNeg())
- CountImm = -CountImm;
+ assert(!TripCount->isNeg() && "Constant trip count must be positive");
CountReg = MF->getRegInfo().createVirtualRegister(RC);
if (CountImm > 0xFFFF) {
(isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(BranchTarget);
// Conditional branch; just delete it.
+ DEBUG(dbgs() << "Removing old branch: " << *LastI);
LastMBB->erase(LastI);
delete TripCount;
--- /dev/null
+; ModuleID = 'tsc_s000.c'
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+
+@Y = common global [16000 x double] zeroinitializer, align 32
+@X = common global [16000 x double] zeroinitializer, align 32
+@Z = common global [16000 x double] zeroinitializer, align 32
+@U = common global [16000 x double] zeroinitializer, align 32
+@V = common global [16000 x double] zeroinitializer, align 32
+@aa = common global [256 x [256 x double]] zeroinitializer, align 32
+@bb = common global [256 x [256 x double]] zeroinitializer, align 32
+@cc = common global [256 x [256 x double]] zeroinitializer, align 32
+@array = common global [65536 x double] zeroinitializer, align 32
+@x = common global [16000 x double] zeroinitializer, align 32
+@temp = common global double 0.000000e+00, align 8
+@temp_int = common global i32 0, align 4
+@a = common global [16000 x double] zeroinitializer, align 32
+@b = common global [16000 x double] zeroinitializer, align 32
+@c = common global [16000 x double] zeroinitializer, align 32
+@d = common global [16000 x double] zeroinitializer, align 32
+@e = common global [16000 x double] zeroinitializer, align 32
+@tt = common global [256 x [256 x double]] zeroinitializer, align 32
+@indx = common global [16000 x i32] zeroinitializer, align 32
+@xx = common global double* null, align 8
+@yy = common global double* null, align 8
+
+define i32 @s000() nounwind {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.end, %entry
+ %nl.010 = phi i32 [ 0, %entry ], [ %inc7, %for.end ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next.15, %for.body3 ]
+ %arrayidx = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv
+ %0 = load double* %arrayidx, align 32, !tbaa !0
+ %add = fadd double %0, 1.000000e+00
+ %arrayidx5 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv
+ store double %add, double* %arrayidx5, align 32, !tbaa !0
+ %indvars.iv.next11 = or i64 %indvars.iv, 1
+ %arrayidx.1 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next11
+ %1 = load double* %arrayidx.1, align 8, !tbaa !0
+ %add.1 = fadd double %1, 1.000000e+00
+ %arrayidx5.1 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next11
+ store double %add.1, double* %arrayidx5.1, align 8, !tbaa !0
+ %indvars.iv.next.112 = or i64 %indvars.iv, 2
+ %arrayidx.2 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.112
+ %2 = load double* %arrayidx.2, align 16, !tbaa !0
+ %add.2 = fadd double %2, 1.000000e+00
+ %arrayidx5.2 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.112
+ store double %add.2, double* %arrayidx5.2, align 16, !tbaa !0
+ %indvars.iv.next.213 = or i64 %indvars.iv, 3
+ %arrayidx.3 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.213
+ %3 = load double* %arrayidx.3, align 8, !tbaa !0
+ %add.3 = fadd double %3, 1.000000e+00
+ %arrayidx5.3 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.213
+ store double %add.3, double* %arrayidx5.3, align 8, !tbaa !0
+ %indvars.iv.next.314 = or i64 %indvars.iv, 4
+ %arrayidx.4 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.314
+ %4 = load double* %arrayidx.4, align 32, !tbaa !0
+ %add.4 = fadd double %4, 1.000000e+00
+ %arrayidx5.4 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.314
+ store double %add.4, double* %arrayidx5.4, align 32, !tbaa !0
+ %indvars.iv.next.415 = or i64 %indvars.iv, 5
+ %arrayidx.5 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.415
+ %5 = load double* %arrayidx.5, align 8, !tbaa !0
+ %add.5 = fadd double %5, 1.000000e+00
+ %arrayidx5.5 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.415
+ store double %add.5, double* %arrayidx5.5, align 8, !tbaa !0
+ %indvars.iv.next.516 = or i64 %indvars.iv, 6
+ %arrayidx.6 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.516
+ %6 = load double* %arrayidx.6, align 16, !tbaa !0
+ %add.6 = fadd double %6, 1.000000e+00
+ %arrayidx5.6 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.516
+ store double %add.6, double* %arrayidx5.6, align 16, !tbaa !0
+ %indvars.iv.next.617 = or i64 %indvars.iv, 7
+ %arrayidx.7 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.617
+ %7 = load double* %arrayidx.7, align 8, !tbaa !0
+ %add.7 = fadd double %7, 1.000000e+00
+ %arrayidx5.7 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.617
+ store double %add.7, double* %arrayidx5.7, align 8, !tbaa !0
+ %indvars.iv.next.718 = or i64 %indvars.iv, 8
+ %arrayidx.8 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.718
+ %8 = load double* %arrayidx.8, align 32, !tbaa !0
+ %add.8 = fadd double %8, 1.000000e+00
+ %arrayidx5.8 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.718
+ store double %add.8, double* %arrayidx5.8, align 32, !tbaa !0
+ %indvars.iv.next.819 = or i64 %indvars.iv, 9
+ %arrayidx.9 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.819
+ %9 = load double* %arrayidx.9, align 8, !tbaa !0
+ %add.9 = fadd double %9, 1.000000e+00
+ %arrayidx5.9 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.819
+ store double %add.9, double* %arrayidx5.9, align 8, !tbaa !0
+ %indvars.iv.next.920 = or i64 %indvars.iv, 10
+ %arrayidx.10 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.920
+ %10 = load double* %arrayidx.10, align 16, !tbaa !0
+ %add.10 = fadd double %10, 1.000000e+00
+ %arrayidx5.10 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.920
+ store double %add.10, double* %arrayidx5.10, align 16, !tbaa !0
+ %indvars.iv.next.1021 = or i64 %indvars.iv, 11
+ %arrayidx.11 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1021
+ %11 = load double* %arrayidx.11, align 8, !tbaa !0
+ %add.11 = fadd double %11, 1.000000e+00
+ %arrayidx5.11 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1021
+ store double %add.11, double* %arrayidx5.11, align 8, !tbaa !0
+ %indvars.iv.next.1122 = or i64 %indvars.iv, 12
+ %arrayidx.12 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1122
+ %12 = load double* %arrayidx.12, align 32, !tbaa !0
+ %add.12 = fadd double %12, 1.000000e+00
+ %arrayidx5.12 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1122
+ store double %add.12, double* %arrayidx5.12, align 32, !tbaa !0
+ %indvars.iv.next.1223 = or i64 %indvars.iv, 13
+ %arrayidx.13 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1223
+ %13 = load double* %arrayidx.13, align 8, !tbaa !0
+ %add.13 = fadd double %13, 1.000000e+00
+ %arrayidx5.13 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1223
+ store double %add.13, double* %arrayidx5.13, align 8, !tbaa !0
+ %indvars.iv.next.1324 = or i64 %indvars.iv, 14
+ %arrayidx.14 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1324
+ %14 = load double* %arrayidx.14, align 16, !tbaa !0
+ %add.14 = fadd double %14, 1.000000e+00
+ %arrayidx5.14 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1324
+ store double %add.14, double* %arrayidx5.14, align 16, !tbaa !0
+ %indvars.iv.next.1425 = or i64 %indvars.iv, 15
+ %arrayidx.15 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1425
+ %15 = load double* %arrayidx.15, align 8, !tbaa !0
+ %add.15 = fadd double %15, 1.000000e+00
+ %arrayidx5.15 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1425
+ store double %add.15, double* %arrayidx5.15, align 8, !tbaa !0
+ %indvars.iv.next.15 = add i64 %indvars.iv, 16
+ %lftr.wideiv.15 = trunc i64 %indvars.iv.next.15 to i32
+ %exitcond.15 = icmp eq i32 %lftr.wideiv.15, 16000
+ br i1 %exitcond.15, label %for.end, label %for.body3
+
+for.end: ; preds = %for.body3
+ %call = tail call i32 @dummy(double* getelementptr inbounds ([16000 x double]* @X, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @Y, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @Z, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @U, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @V, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @aa, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @bb, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @cc, i64 0, i64 0), double 0.000000e+00) nounwind
+ %inc7 = add nsw i32 %nl.010, 1
+ %exitcond = icmp eq i32 %inc7, 400000
+ br i1 %exitcond, label %for.end8, label %for.cond1.preheader
+
+for.end8: ; preds = %for.end
+ ret i32 0
+
+; CHECK: @s000
+; CHECK: mtctr
+; CHECK: bdnz
+}
+
+declare i32 @dummy(double*, double*, double*, double*, double*, [256 x double]*, [256 x double]*, [256 x double]*, double)
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
--- /dev/null
+; ModuleID = 'SingleSource/Regression/C/sumarray2d.c'
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+
+@.str = private unnamed_addr constant [23 x i8] c"Sum(Array[%d,%d] = %d\0A\00", align 1
+
+define i32 @SumArray([100 x i32]* nocapture %Array, i32 %NumI, i32 %NumJ) nounwind readonly {
+entry:
+ %cmp12 = icmp eq i32 %NumI, 0
+ br i1 %cmp12, label %for.end8, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph: ; preds = %entry
+ %cmp29 = icmp eq i32 %NumJ, 0
+ br i1 %cmp29, label %for.inc6, label %for.body3.lr.ph.us
+
+for.inc6.us: ; preds = %for.body3.us
+ %indvars.iv.next17 = add i64 %indvars.iv16, 1
+ %lftr.wideiv18 = trunc i64 %indvars.iv.next17 to i32
+ %exitcond19 = icmp eq i32 %lftr.wideiv18, %NumI
+ br i1 %exitcond19, label %for.end8, label %for.body3.lr.ph.us
+
+for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us
+ %indvars.iv = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next, %for.body3.us ]
+ %Result.111.us = phi i32 [ %Result.014.us, %for.body3.lr.ph.us ], [ %add.us, %for.body3.us ]
+ %arrayidx5.us = getelementptr inbounds [100 x i32]* %Array, i64 %indvars.iv16, i64 %indvars.iv
+ %0 = load i32* %arrayidx5.us, align 4, !tbaa !0
+ %add.us = add nsw i32 %0, %Result.111.us
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %NumJ
+ br i1 %exitcond, label %for.inc6.us, label %for.body3.us
+
+for.body3.lr.ph.us: ; preds = %for.inc6.us, %for.cond1.preheader.lr.ph
+ %indvars.iv16 = phi i64 [ %indvars.iv.next17, %for.inc6.us ], [ 0, %for.cond1.preheader.lr.ph ]
+ %Result.014.us = phi i32 [ %add.us, %for.inc6.us ], [ 0, %for.cond1.preheader.lr.ph ]
+ br label %for.body3.us
+
+for.inc6: ; preds = %for.inc6, %for.cond1.preheader.lr.ph
+ %i.013 = phi i32 [ %inc7, %for.inc6 ], [ 0, %for.cond1.preheader.lr.ph ]
+ %inc7 = add i32 %i.013, 1
+ %exitcond20 = icmp eq i32 %inc7, %NumI
+ br i1 %exitcond20, label %for.end8, label %for.inc6
+
+for.end8: ; preds = %for.inc6.us, %for.inc6, %entry
+ %Result.0.lcssa = phi i32 [ 0, %entry ], [ %add.us, %for.inc6.us ], [ 0, %for.inc6 ]
+ ret i32 %Result.0.lcssa
+; CHECK: @SumArray
+; CHECK: mtctr
+; CHECK: bdnz
+}
+
+define i32 @main() nounwind {
+entry:
+ %Array = alloca [100 x [100 x i32]], align 4
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv33 = phi i64 [ 0, %entry ], [ %indvars.iv.next34, %for.body ]
+ %0 = trunc i64 %indvars.iv33 to i32
+ %sub = sub i32 0, %0
+ %arrayidx2 = getelementptr inbounds [100 x [100 x i32]]* %Array, i64 0, i64 %indvars.iv33, i64 %indvars.iv33
+ store i32 %sub, i32* %arrayidx2, align 4, !tbaa !0
+ %indvars.iv.next34 = add i64 %indvars.iv33, 1
+ %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+ %exitcond36 = icmp eq i32 %lftr.wideiv35, 100
+ br i1 %exitcond36, label %for.cond6.preheader, label %for.body
+
+for.cond6.preheader: ; preds = %for.body, %for.inc17
+ %indvars.iv29 = phi i64 [ %indvars.iv.next30, %for.inc17 ], [ 0, %for.body ]
+ br label %for.body8
+
+for.body8: ; preds = %for.inc14, %for.cond6.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond6.preheader ], [ %indvars.iv.next, %for.inc14 ]
+ %1 = trunc i64 %indvars.iv to i32
+ %2 = trunc i64 %indvars.iv29 to i32
+ %cmp9 = icmp eq i32 %1, %2
+ br i1 %cmp9, label %for.inc14, label %if.then
+
+if.then: ; preds = %for.body8
+ %3 = add i64 %indvars.iv, %indvars.iv29
+ %arrayidx13 = getelementptr inbounds [100 x [100 x i32]]* %Array, i64 0, i64 %indvars.iv29, i64 %indvars.iv
+ %4 = trunc i64 %3 to i32
+ store i32 %4, i32* %arrayidx13, align 4, !tbaa !0
+ br label %for.inc14
+
+for.inc14: ; preds = %for.body8, %if.then
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv27 = trunc i64 %indvars.iv.next to i32
+ %exitcond28 = icmp eq i32 %lftr.wideiv27, 100
+ br i1 %exitcond28, label %for.inc17, label %for.body8
+
+for.inc17: ; preds = %for.inc14
+ %indvars.iv.next30 = add i64 %indvars.iv29, 1
+ %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+ %exitcond32 = icmp eq i32 %lftr.wideiv31, 100
+ br i1 %exitcond32, label %for.body3.lr.ph.us.i, label %for.cond6.preheader
+
+for.inc6.us.i: ; preds = %for.body3.us.i
+ %indvars.iv.next17.i = add i64 %indvars.iv16.i, 1
+ %lftr.wideiv24 = trunc i64 %indvars.iv.next17.i to i32
+ %exitcond25 = icmp eq i32 %lftr.wideiv24, 100
+ br i1 %exitcond25, label %SumArray.exit, label %for.body3.lr.ph.us.i
+
+for.body3.us.i: ; preds = %for.body3.lr.ph.us.i, %for.body3.us.i
+ %indvars.iv.i = phi i64 [ 0, %for.body3.lr.ph.us.i ], [ %indvars.iv.next.i, %for.body3.us.i ]
+ %Result.111.us.i = phi i32 [ %Result.014.us.i, %for.body3.lr.ph.us.i ], [ %add.us.i, %for.body3.us.i ]
+ %arrayidx5.us.i = getelementptr inbounds [100 x [100 x i32]]* %Array, i64 0, i64 %indvars.iv16.i, i64 %indvars.iv.i
+ %5 = load i32* %arrayidx5.us.i, align 4, !tbaa !0
+ %add.us.i = add nsw i32 %5, %Result.111.us.i
+ %indvars.iv.next.i = add i64 %indvars.iv.i, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next.i to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 100
+ br i1 %exitcond, label %for.inc6.us.i, label %for.body3.us.i
+
+for.body3.lr.ph.us.i: ; preds = %for.inc17, %for.inc6.us.i
+ %indvars.iv16.i = phi i64 [ %indvars.iv.next17.i, %for.inc6.us.i ], [ 0, %for.inc17 ]
+ %Result.014.us.i = phi i32 [ %add.us.i, %for.inc6.us.i ], [ 0, %for.inc17 ]
+ br label %for.body3.us.i
+
+SumArray.exit: ; preds = %for.inc6.us.i
+ %call20 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([23 x i8]* @.str, i64 0, i64 0), i32 100, i32 100, i32 %add.us.i) nounwind
+ ret i32 0
+
+; CHECK: @main
+; CHECK: mtctr
+; CHECK: bdnz
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}