Mark most PPC register classes to avoid write-after-write.

[oota-llvm.git] / lib / Target / PowerPC / PPCISelLowering.cpp
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp

index 8357e8bbdb3ac95266448b9859dfaed9418827bc..48feb98be9bd61d9a99e26f7de298691ddf67d00 100644 (file)
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -51,9 +51,11 @@ static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
                                                ISD::ArgFlagsTy &ArgFlags,
                                                CCState &State);
  
-static cl::opt<bool> EnablePPCPreinc("enable-ppc-preinc",
-cl::desc("enable preincrement load/store generation on PPC (experimental)"),
-                                     cl::Hidden);
+static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
+cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
+cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
  
  static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
    if (TM.getSubtargetImpl()->isDarwin())
@@ -76,9 +78,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    setMinStackArgumentAlignment(TM.getSubtarget<PPCSubtarget>().isPPC64() ? 8:4);
  
    // Set up the register classes.
-  addRegisterClass(MVT::i32, PPC::GPRCRegisterClass);
-  addRegisterClass(MVT::f32, PPC::F4RCRegisterClass);
-  addRegisterClass(MVT::f64, PPC::F8RCRegisterClass);
+  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
+  addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+  addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
  
    // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
    setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -292,7 +294,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
  
    if (TM.getSubtarget<PPCSubtarget>().use64BitRegs()) {
      // 64-bit PowerPC implementations can support i64 types directly
-    addRegisterClass(MVT::i64, PPC::G8RCRegisterClass);
+    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
      // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
      setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
      // 64-bit PowerPC wants to expand i128 shifts itself.
@@ -370,10 +372,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
      setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
      setOperationAction(ISD::STORE , MVT::v4i32, Legal);
  
-    addRegisterClass(MVT::v4f32, PPC::VRRCRegisterClass);
-    addRegisterClass(MVT::v4i32, PPC::VRRCRegisterClass);
-    addRegisterClass(MVT::v8i16, PPC::VRRCRegisterClass);
-    addRegisterClass(MVT::v16i8, PPC::VRRCRegisterClass);
+    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);
  
      setOperationAction(ISD::MUL, MVT::v4f32, Legal);
      setOperationAction(ISD::MUL, MVT::v4i32, Custom);
@@ -389,6 +391,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
      setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    }
  
+  if (TM.getSubtarget<PPCSubtarget>().has64BitSupport())
+    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
+
    setOperationAction(ISD::ATOMIC_LOAD,  MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
  
@@ -443,7 +448,16 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
    // Darwin passes everything on 4 byte boundary.
    if (TM.getSubtarget<PPCSubtarget>().isDarwin())
      return 4;
-  // FIXME SVR4 TBD
+
+  // 16byte and wider vectors are passed on 16byte boundary.
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+    if (VTy->getBitWidth() >= 128)
+      return 16;
+
+  // The rest is 8 on PPC64 and 4 on PPC32 boundary.
+   if (PPCSubTarget.isPPC64())
+     return 8;
+
    return 4;
  }
  
@@ -848,14 +862,10 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
      APInt LHSKnownZero, LHSKnownOne;
      APInt RHSKnownZero, RHSKnownOne;
      DAG.ComputeMaskedBits(N.getOperand(0),
-                          APInt::getAllOnesValue(N.getOperand(0)
-                            .getValueSizeInBits()),
                            LHSKnownZero, LHSKnownOne);
  
      if (LHSKnownZero.getBoolValue()) {
        DAG.ComputeMaskedBits(N.getOperand(1),
-                            APInt::getAllOnesValue(N.getOperand(1)
-                              .getValueSizeInBits()),
                              RHSKnownZero, RHSKnownOne);
        // If all of the bits are known zero on the LHS or RHS, the add won't
        // carry.
@@ -894,10 +904,11 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
        return true; // [r+i]
      } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
        // Match LOAD (ADD (X, Lo(G))).
-     assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
               && "Cannot handle constant offsets yet!");
        Disp = N.getOperand(1).getOperand(0);  // The global address.
        assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+             Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
               Disp.getOpcode() == ISD::TargetConstantPool ||
               Disp.getOpcode() == ISD::TargetJumpTable);
        Base = N.getOperand(0);
@@ -910,10 +921,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
        // (for better address arithmetic) if the LHS and RHS of the OR are
        // provably disjoint.
        APInt LHSKnownZero, LHSKnownOne;
-      DAG.ComputeMaskedBits(N.getOperand(0),
-                            APInt::getAllOnesValue(N.getOperand(0)
-                                                   .getValueSizeInBits()),
-                            LHSKnownZero, LHSKnownOne);
+      DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
  
        if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
          // If all of the bits are known zero on the LHS or RHS, the add won't
@@ -1001,7 +1009,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
    if (N.getOpcode() == ISD::ADD) {
      short imm = 0;
      if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
-      Disp =  DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
+      Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
        if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
          Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
        } else {
@@ -1010,7 +1018,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
        return true; // [r+i]
      } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
        // Match LOAD (ADD (X, Lo(G))).
-     assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
               && "Cannot handle constant offsets yet!");
        Disp = N.getOperand(1).getOperand(0);  // The global address.
        assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
@@ -1026,10 +1034,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
        // (for better address arithmetic) if the LHS and RHS of the OR are
        // provably disjoint.
        APInt LHSKnownZero, LHSKnownOne;
-      DAG.ComputeMaskedBits(N.getOperand(0),
-                            APInt::getAllOnesValue(N.getOperand(0)
-                                                   .getValueSizeInBits()),
-                            LHSKnownZero, LHSKnownOne);
+      DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
        if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
          // If all of the bits are known zero on the LHS or RHS, the add won't
          // carry.
@@ -1082,8 +1087,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                    SDValue &Offset,
                                                    ISD::MemIndexedMode &AM,
                                                    SelectionDAG &DAG) const {
-  // Disabled by default for now.
-  if (!EnablePPCPreinc) return false;
+  if (DisablePPCPreinc) return false;
  
    SDValue Ptr;
    EVT VT;
@@ -1101,7 +1105,15 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
    if (VT.isVector())
      return false;
  
-  // TODO: Check reg+reg first.
+  if (SelectAddressRegReg(Ptr, Offset, Base, DAG)) {
+    if (isa<StoreSDNode>(N)) {
+      AM = ISD::PRE_INC;
+      return true;
+    }
+
+    // FIXME: reg+reg preinc loads
+    return false;
+  }
  
    // LDU/STU use reg+imm*4, others use reg+imm.
    if (VT != MVT::i64) {
@@ -1220,6 +1232,30 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
    return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
  }
  
+SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  DebugLoc dl = GA->getDebugLoc();
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy();
+  bool is64bit = PPCSubTarget.isPPC64();
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+  SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                             PPCII::MO_TPREL16_HA);
+  SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                             PPCII::MO_TPREL16_LO);
+
+  if (model != TLSModel::LocalExec)
+    llvm_unreachable("only local-exec TLS mode supported");
+  SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
+                                   is64bit ? MVT::i64 : MVT::i32);
+  SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
+  return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
+}
+
  SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
    EVT PtrVT = Op.getValueType();
@@ -1438,13 +1474,16 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
    Entry.Node = Nest; Args.push_back(Entry);
  
    // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
-  std::pair<SDValue, SDValue> CallResult =
-    LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
-                false, false, false, false, 0, CallingConv::C,
+  TargetLowering::CallLoweringInfo CLI(Chain,
+                                       Type::getVoidTy(*DAG.getContext()),
+                                       false, false, false, false, 0,
+                                       CallingConv::C,
                  /*isTailCall=*/false,
-                /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
+                                       /*doesNotRet=*/false,
+                                       /*isReturnValueUsed=*/true,
                  DAG.getExternalSymbol("__trampoline_setup", PtrVT),
                  Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  
    return CallResult.second;
  }
@@ -1700,7 +1739,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
    // Assign locations to all of the incoming arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                getTargetMachine(), ArgLocs, *DAG.getContext());
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
  
    // Reserve space for the linkage area on the stack.
    CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
@@ -1719,19 +1758,19 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
          default:
            llvm_unreachable("ValVT not supported by formal arguments Lowering");
          case MVT::i32:
-          RC = PPC::GPRCRegisterClass;
+          RC = &PPC::GPRCRegClass;
            break;
          case MVT::f32:
-          RC = PPC::F4RCRegisterClass;
+          RC = &PPC::F4RCRegClass;
            break;
          case MVT::f64:
-          RC = PPC::F8RCRegisterClass;
+          RC = &PPC::F8RCRegClass;
            break;
          case MVT::v16i8:
          case MVT::v8i16:
          case MVT::v4i32:
          case MVT::v4f32:
-          RC = PPC::VRRCRegisterClass;
+          RC = &PPC::VRRCRegClass;
            break;
        }
  
@@ -1761,7 +1800,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
    // caller's stack frame, right above the parameter list area.
    SmallVector<CCValAssign, 16> ByValArgLocs;
    CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                     getTargetMachine(), ByValArgLocs, *DAG.getContext());
+                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
  
    // Reserve stack space for the allocations in CCInfo.
    CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -2741,7 +2780,7 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
  
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                   getTargetMachine(), RVLocs, *DAG.getContext());
+                    getTargetMachine(), RVLocs, *DAG.getContext());
    CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
  
    // Copy all of the result registers out of their specified physreg.
@@ -2798,7 +2837,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
      if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
        SmallVector<CCValAssign, 16> RVLocs;
        CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs, *DAG.getContext());
+                     getTargetMachine(), RVLocs, *DAG.getContext());
        CCInfo.AnalyzeCallResult(Ins, RetCC_PPC);
        for (unsigned i = 0; i != RVLocs.size(); ++i)
          DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
@@ -2862,14 +2901,19 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
  }
  
  SDValue
-PPCTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
-                             CallingConv::ID CallConv, bool isVarArg,
-                             bool doesNotRet, bool &isTailCall,
-                             const SmallVectorImpl<ISD::OutputArg> &Outs,
-                             const SmallVectorImpl<SDValue> &OutVals,
-                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                             DebugLoc dl, SelectionDAG &DAG,
+PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                               SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
    if (isTailCall)
      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
                                                     Ins, DAG);
@@ -2919,7 +2963,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
    // Assign locations to all of the outgoing arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                getTargetMachine(), ArgLocs, *DAG.getContext());
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
  
    // Reserve space for the linkage area on the stack.
    CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
@@ -2959,7 +3003,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
    // Assign locations to all of the outgoing aggregate by value arguments.
    SmallVector<CCValAssign, 16> ByValArgLocs;
    CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                     getTargetMachine(), ByValArgLocs, *DAG.getContext());
+                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
  
    // Reserve stack space for the allocations in CCInfo.
    CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -3483,7 +3527,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
  
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                getTargetMachine(), RVLocs, *DAG.getContext());
+                 getTargetMachine(), RVLocs, *DAG.getContext());
    CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
  
    // If this is the first return lowered for this function, add the regs to the
@@ -4557,7 +4601,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
    case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
    case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
-  case ISD::GlobalTLSAddress:   llvm_unreachable("TLS not implemented for PPC");
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
    case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
    case ISD::SETCC:              return LowerSETCC(Op, DAG);
    case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
@@ -5505,12 +5549,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
  //===----------------------------------------------------------------------===//
  
  void PPCTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
-                                                       const APInt &Mask,
                                                         APInt &KnownZero,
                                                         APInt &KnownOne,
                                                         const SelectionDAG &DAG,
                                                         unsigned Depth) const {
-  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
    switch (Op.getOpcode()) {
    default: break;
    case PPCISD::LBRX: {
@@ -5611,18 +5654,18 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
      case 'b':   // R1-R31
      case 'r':   // R0-R31
        if (VT == MVT::i64 && PPCSubTarget.isPPC64())
-        return std::make_pair(0U, PPC::G8RCRegisterClass);
-      return std::make_pair(0U, PPC::GPRCRegisterClass);
+        return std::make_pair(0U, &PPC::G8RCRegClass);
+      return std::make_pair(0U, &PPC::GPRCRegClass);
      case 'f':
        if (VT == MVT::f32)
-        return std::make_pair(0U, PPC::F4RCRegisterClass);
-      else if (VT == MVT::f64)
-        return std::make_pair(0U, PPC::F8RCRegisterClass);
+        return std::make_pair(0U, &PPC::F4RCRegClass);
+      if (VT == MVT::f64)
+        return std::make_pair(0U, &PPC::F8RCRegClass);
        break;
      case 'v':
-      return std::make_pair(0U, PPC::VRRCRegisterClass);
+      return std::make_pair(0U, &PPC::VRRCRegClass);
      case 'y':   // crrc
-      return std::make_pair(0U, PPC::CRRCRegisterClass);
+      return std::make_pair(0U, &PPC::CRRCRegClass);
      }
    }
  
@@ -5837,3 +5880,11 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
      return MVT::i32;
    }
  }
+
+Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
+  if (DisableILPPref)
+    return TargetLowering::getSchedulingPreference(N);
+
+  return Sched::ILP;
+}
+