From: Bill Schmidt
Date: Fri, 30 Aug 2013 22:18:55 +0000 (+0000)
Subject: [PowerPC] Call support for fast-isel.
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=11addd2a2f584571ffcfd51711972b27aef0133f;p=oota-llvm.git

[PowerPC] Call support for fast-isel.

This patch adds fast-isel support for calls (but not intrinsic calls
or varargs calls).  It also removes a badly-formed assert.

There are some new tests just for calls, and also for folding loads
into arguments on calls to avoid extra extends.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189701 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index 9c937edee4c..e8e7f4c2d22 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -42,6 +42,17 @@ def RetCC_PPC : CallingConv<[
 // logic.  FIXME: See if the logic can be simplified with use of CCs.
 // This may require some extensions to current table generation.
 
+// Simple calling convention for 64-bit ELF PowerPC fast isel.
+// Only handle ints and floats.  All ints are promoted to i64.
+// Vector types and quadword ints are not handled.
+def CC_PPC64_ELF_FIS : CallingConv<[
+  CCIfType<[i8],  CCPromoteToType<i64>>,
+  CCIfType<[i16], CCPromoteToType<i64>>,
+  CCIfType<[i32], CCPromoteToType<i64>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>
+]>;
+
 // Simple return-value convention for 64-bit ELF PowerPC fast isel.
 // All small ints are promoted to i64.  Vector types, quadword ints,
 // and multiple register returns are "supported" to avoid compile
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 3bceed418a1..8a88e76e90d 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -114,6 +114,7 @@ class PPCFastISel : public FastISel {
     bool SelectIToFP(const Instruction *I, bool IsSigned);
     bool SelectFPToI(const Instruction *I, bool IsSigned);
     bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectCall(const Instruction *I);
     bool SelectRet(const Instruction *I);
     bool SelectIntExt(const Instruction *I);
 
@@ -145,6 +146,17 @@ class PPCFastISel : public FastISel {
 
   // Call handling routines.
   private:
+    bool processCallArgs(SmallVectorImpl<Value*> &Args,
+                         SmallVectorImpl<unsigned> &ArgRegs,
+                         SmallVectorImpl<MVT> &ArgVTs,
+                         SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                         SmallVectorImpl<unsigned> &RegArgs,
+                         CallingConv::ID CC,
+                         unsigned &NumBytes,
+                         bool IsVarArg);
+    void finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                    const Instruction *I, CallingConv::ID CC,
+                    unsigned &NumBytes, bool IsVarArg);
     CCAssignFn *usePPC32CCs(unsigned Flag);
 
   private:
@@ -1150,6 +1162,316 @@ bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
   return true;
 }
 
+// Handle arguments to a call that we're attempting to fast-select.
+// Return false if the arguments are too complex for us at the moment.
+bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool IsVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
+
+  // Bail out if we can't handle any of the arguments.
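+  // Specifically, vectors, integers wider than 64 bits, stack-resident
+  // arguments, custom-lowered locations, and bit-converted values are
+  // all left to SelectionDAG isel for now.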
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Skip vector arguments for now, as well as long double and
+    // uint128_t, and anything that isn't passed in a register.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 ||
+        !VA.isRegLoc() || VA.needsCustom())
+      return false;
+
+    // Skip bit-converted arguments for now.
+    if (VA.getLocInfo() == CCValAssign::BCvt)
+      return false;
+  }
+
+  // Get a count of how many bytes are to be pushed onto the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+          TII.get(TII.getCallFrameSetupOpcode()))
+    .addImm(NumBytes);
+
+  // Prepare to assign register arguments.  Every argument uses up a
+  // GPR protocol register even if it's passed in a floating-point
+  // register.
+  unsigned NextGPR = PPC::X3;
+  unsigned NextFPR = PPC::F1;
+
+  // Process arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Handle argument promotion and bitcasts.
+    switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::SExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false))
+          llvm_unreachable("Failed to emit a sext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::AExt:
+      case CCValAssign::ZExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true))
+          llvm_unreachable("Failed to emit a zext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::BCvt: {
+        // FIXME: Not yet handled.
+        llvm_unreachable("Should have bailed before getting here!");
+        break;
+      }
+    }
+
+    // Copy this argument to the appropriate register.
+    unsigned ArgReg;
+    if (ArgVT == MVT::f32 || ArgVT == MVT::f64) {
+      ArgReg = NextFPR++;
+      ++NextGPR;
+    } else
+      ArgReg = NextGPR++;
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ArgReg).addReg(Arg);
+    RegArgs.push_back(ArgReg);
+  }
+
+  return true;
+}
+
+// For a call that we've determined we can fast-select, finish the
+// call sequence and generate a copy to obtain the return value (if any).
+void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                             const Instruction *I, CallingConv::ID CC,
+                             unsigned &NumBytes, bool IsVarArg) {
+  // Issue CALLSEQ_END.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+          TII.get(TII.getCallFrameDestroyOpcode()))
+    .addImm(NumBytes).addImm(0);
+
+  // Next, generate a copy to obtain the return value.
+  // FIXME: No multi-register return values yet, though I don't foresee
+  // any real difficulties there.
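+  // Under this ABI a single return value arrives in X3 (integer) or F1
+  // (floating-point), as encoded by RetCC_PPC64_ELF_FIS; the copies below
+  // move it into a virtual register for later use.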
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
+    CCValAssign &VA = RVLocs[0];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    MVT DestVT = VA.getValVT();
+    MVT CopyVT = DestVT;
+
+    // Ints smaller than a register still arrive in a full 64-bit
+    // register, so make sure we recognize this.
+    if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32)
+      CopyVT = MVT::i64;
+
+    unsigned SourcePhysReg = VA.getLocReg();
+    unsigned ResultReg;
+
+    if (RetVT == CopyVT) {
+      const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT);
+      ResultReg = createResultReg(CpyRC);
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+
+    // If necessary, round the floating result to single precision.
+    } else if (CopyVT == MVT::f64) {
+      ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::FRSP),
+              ResultReg).addReg(SourcePhysReg);
+
+    // If only the low half of a general register is needed, generate
+    // a GPRC copy instead of a G8RC copy.  (EXTRACT_SUBREG can't be
+    // used along the fast-isel path (not lowered), and downstream logic
+    // also doesn't like a direct subreg copy on a physical reg.)
+    } else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) {
+      ResultReg = createResultReg(&PPC::GPRCRegClass);
+      // Convert physical register from G8RC to GPRC.
+      SourcePhysReg -= PPC::X0 - PPC::R0;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+    }
+
+    UsedRegs.push_back(SourcePhysReg);
+    UpdateValueMap(I, ResultReg);
+  }
+}
+
+// Attempt to fast-select a call instruction.
+bool PPCFastISel::SelectCall(const Instruction *I) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Can't handle inline asm.
+  if (isa<InlineAsm>(Callee))
+    return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (CI->isTailCall())
+    return false;
+
+  // Obtain calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  bool IsVarArg = FTy->isVarArg();
+
+  // Not ready for varargs yet.
+  if (IsVarArg)
+    return false;
+
+  // Handle simple calls for now, with legal return types and
+  // those that can be extended.
+  Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
+           RetVT != MVT::i8)
+    return false;
+
+  // FIXME: No multi-register return values yet.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i8 && RetVT != MVT::i16 &&
+      RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 &&
+      RetVT != MVT::f64) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    if (RVLocs.size() > 1)
+      return false;
+  }
+
+  // Bail early if more than 8 arguments, as we only currently
+  // handle arguments passed in registers.
+  unsigned NumArgs = CS.arg_size();
+  if (NumArgs > 8)
+    return false;
+
+  // Set up the argument vectors.
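+  // The four vectors below are kept in lockstep: element N holds the IR
+  // value, virtual register, value type, and ABI flags of argument N.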
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+
+  Args.reserve(NumArgs);
+  ArgRegs.reserve(NumArgs);
+  ArgVTs.reserve(NumArgs);
+  ArgFlags.reserve(NumArgs);
+
+  for (ImmutableCallSite::arg_iterator II = CS.arg_begin(), IE = CS.arg_end();
+       II != IE; ++II) {
+    // FIXME: ARM does something for intrinsic calls here, check into that.
+
+    unsigned AttrIdx = II - CS.arg_begin() + 1;
+
+    // Only handle easy calls for now.  It would be reasonably easy
+    // to handle <= 8-byte structures passed ByVal in registers, but we
+    // have to ensure they are right-justified in the register.
+    if (CS.paramHasAttr(AttrIdx, Attribute::InReg) ||
+        CS.paramHasAttr(AttrIdx, Attribute::StructRet) ||
+        CS.paramHasAttr(AttrIdx, Attribute::Nest) ||
+        CS.paramHasAttr(AttrIdx, Attribute::ByVal))
+      return false;
+
+    ISD::ArgFlagsTy Flags;
+    if (CS.paramHasAttr(AttrIdx, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrIdx, Attribute::ZExt))
+      Flags.setZExt();
+
+    Type *ArgTy = (*II)->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8)
+      return false;
+
+    if (ArgVT.isVector())
+      return false;
+
+    unsigned Arg = getRegForValue(*II);
+    if (Arg == 0)
+      return false;
+
+    unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(*II);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Process the arguments.
+  SmallVector<unsigned, 8> RegArgs;
+  unsigned NumBytes;
+
+  if (!processCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, IsVarArg))
+    return false;
+
+  // FIXME: No handling for function pointers yet.  This requires
+  // implementing the function descriptor (OPD) setup.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV)
+    return false;
+
+  // Build direct call with NOP for TOC restore.
+  // FIXME: We can and should optimize away the NOP for local calls.
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(PPC::BL8_NOP));
+  // Add callee.
+  MIB.addGlobalAddress(GV);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II)
+    MIB.addReg(RegArgs[II], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.  Proper
+  // defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  finishCall(RetVT, UsedRegs, I, CC, NumBytes, IsVarArg);
+
+  // Set all unused physregs defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
 // Attempt to fast-select a return instruction.
 bool PPCFastISel::SelectRet(const Instruction *I) {
 
@@ -1414,6 +1736,10 @@ bool PPCFastISel::TargetSelectInstruction(const Instruction *I) {
       return SelectBinaryIntOp(I, ISD::OR);
     case Instruction::Sub:
       return SelectBinaryIntOp(I, ISD::SUB);
+    case Instruction::Call:
+      if (dyn_cast<IntrinsicInst>(I))
+        return false;
+      return SelectCall(I);
     case Instruction::Ret:
      return SelectRet(I);
    case Instruction::ZExt:
@@ -1490,7 +1816,6 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
     // If GV is an alias, use the aliasee for determining thread-locality.
     if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
       GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
-    assert((GVar || isa<Function>(GV)) && "Unexpected GV subclass!");
   }
 
   // FIXME: We don't yet handle the complexity of TLS.
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 29c2270d4ab..244e00d4380 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1813,8 +1813,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
 // Function whose sole purpose is to kill compiler warnings
 // stemming from unused functions included from PPCGenCallingConv.inc.
 CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
-  /* One of these will be CC_PPC64_ELF_FIS in a future patch. */
-  return Flag ? RetCC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
+  return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
 }
 
 bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
diff --git a/test/CodeGen/PowerPC/fast-isel-call.ll b/test/CodeGen/PowerPC/fast-isel-call.ll
new file mode 100644
index 00000000000..33a8ba903e3
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-call.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define i32 @t1(i8 signext %a) nounwind {
+  %1 = sext i8 %a to i32
+  ret i32 %1
+}
+
+define i32 @t2(i8 zeroext %a) nounwind {
+  %1 = zext i8 %a to i32
+  ret i32 %1
+}
+
+define i32 @t3(i16 signext %a) nounwind {
+  %1 = sext i16 %a to i32
+  ret i32 %1
+}
+
+define i32 @t4(i16 zeroext %a) nounwind {
+  %1 = zext i16 %a to i32
+  ret i32 %1
+}
+
+define void @foo(i8 %a, i16 %b) nounwind {
+; ELF64: foo
+  %1 = call i32 @t1(i8 signext %a)
+; ELF64: extsb
+  %2 = call i32 @t2(i8 zeroext %a)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+  %3 = call i32 @t3(i16 signext %b)
+; ELF64: extsh
+  %4 = call i32 @t4(i16 zeroext %b)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+
+;; A few tests to check materialization
+  %5 = call i32 @t2(i8 zeroext 255)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+  %6 = call i32 @t4(i16 zeroext 65535)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+  ret void
+}
+
+define void @foo2() nounwind {
+  %1 = call signext i16 @t5()
+  %2 = call zeroext i16 @t6()
+  %3 = call signext i8 @t7()
+  %4 = call zeroext i8 @t8()
+  ret void
+}
+
+declare signext i16 @t5()
+declare zeroext i16 @t6()
+declare signext i8 @t7()
+declare zeroext i8 @t8()
+
+define i32 @t10(i32 %argc, i8** nocapture %argv) {
+entry:
+; ELF64: t10
+  %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70)
+; ELF64: li 3, 0
+; ELF64: li 4, 248
+; ELF64: li 5, 187
+; ELF64: li 6, 28
+; ELF64: li 7, 40
+; ELF64: li 8, 186
+; ELF64: rldicl 3, 3, 0, 56
+; ELF64: rldicl 4, 4, 0, 56
+; ELF64: rldicl 5, 5, 0, 56
+; ELF64: rldicl 6, 6, 0, 56
+; ELF64: rldicl 7, 7, 0, 56
+; ELF64: rldicl 8, 8, 0, 56
+  ret i32 0
+}
+
+declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext)
+
+define i32 @bar0(i32 %i) nounwind {
+  ret i32 0
+}
+
+; Function pointers are not yet implemented.
+;define void @foo3() uwtable {
+;  %fptr = alloca i32 (i32)*, align 8
+;  store i32 (i32)* @bar0, i32 (i32)** %fptr, align 8
+;  %1 = load i32 (i32)** %fptr, align 8
+;  %call = call i32 %1(i32 0)
+;  ret void
+;}
+
+; Intrinsic calls not yet implemented, and udiv isn't one for PPC anyway.
+;define i32 @LibCall(i32 %a, i32 %b) {
+;entry:
+;  %tmp1 = udiv i32 %a, %b        ; <i32> [#uses=1]
+;  ret i32 %tmp1
+;}
+
+declare void @float_foo(float %f) ssp
+
+define void @float_const() ssp {
+entry:
+; ELF64: float_const
+  call void @float_foo(float 0x401C666660000000)
+; ELF64: addis [[REG:[0-9]+]], 2, .LCPI[[SUF:[0-9_]+]]@toc@ha
+; ELF64: lfs 1, .LCPI[[SUF]]@toc@l([[REG]])
+  ret void
+}
+
+define void @float_reg(float %dummy, float %f) ssp {
+entry:
+; ELF64: float_reg
+  call void @float_foo(float %f)
+; ELF64: fmr 1, 2
+  ret void
+}
+
+declare void @double_foo(double %d) ssp
+
+define void @double_const() ssp {
+entry:
+; ELF64: double_const
+  call void @double_foo(double 0x1397723CCABD0000)
+; ELF64: addis [[REG2:[0-9]+]], 2, .LCPI[[SUF2:[0-9_]+]]@toc@ha
+; ELF64: lfd 1, .LCPI[[SUF2]]@toc@l([[REG2]])
+  ret void
+}
+
+define void @double_reg(double %dummy, double %d) ssp {
+entry:
+; ELF64: double_reg
+  call void @double_foo(double %d)
+; ELF64: fmr 1, 2
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-fold.ll b/test/CodeGen/PowerPC/fast-isel-fold.ll
index 21e691224df..4de345f309a 100644
--- a/test/CodeGen/PowerPC/fast-isel-fold.ll
+++ b/test/CodeGen/PowerPC/fast-isel-fold.ll
@@ -4,6 +4,40 @@
 @b = global i16 2, align 2
 @c = global i32 4, align 4
 
+define void @t1() nounwind uwtable ssp {
+; ELF64: t1
+  %1 = load i8* @a, align 1
+  call void @foo1(i8 zeroext %1)
+; ELF64: lbz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+  ret void
+}
+
+define void @t2() nounwind uwtable ssp {
+; ELF64: t2
+  %1 = load i16* @b, align 2
+  call void @foo2(i16 zeroext %1)
+; ELF64: lhz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+  ret void
+}
+
+define void @t2a() nounwind uwtable ssp {
+; ELF64: t2a
+  %1 = load i32* @c, align 4
+  call void @foo3(i32 zeroext %1)
+; ELF64: lwz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+  ret void
+}
+
+declare void @foo1(i8 zeroext)
+declare void @foo2(i16 zeroext)
+declare void @foo3(i32 zeroext)
+
 define i32 @t3() nounwind uwtable ssp {
 ; ELF64: t3
   %1 = load i8* @a, align 1