[PowerPC] Fix PR22711 - Misaligned .toc section

[oota-llvm.git] / lib / Target / PowerPC / PPCISelDAGToDAG.cpp
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp

index 2e1c1abf5b672a5e96cc8a3cdb61d288fb7b0df3..b10e85437ba74e37c41bb051149bf508a85bc198 100644 (file)
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -42,11 +42,15 @@ using namespace llvm;
  cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
  cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
  
-cl::opt<bool> UseBitPermRewriter("ppc-use-bit-perm-rewriter", cl::init(true),
-  cl::desc("use aggressive ppc isel for bit permutations"), cl::Hidden);
-cl::opt<bool> BPermRewriterNoMasking("ppc-bit-perm-rewriter-stress-rotates",
-  cl::desc("stress rotate selection in aggressive ppc isel for "
-           "bit permutations"), cl::Hidden);
+static cl::opt<bool>
+    UseBitPermRewriter("ppc-use-bit-perm-rewriter", cl::init(true),
+                       cl::desc("use aggressive ppc isel for bit permutations"),
+                       cl::Hidden);
+static cl::opt<bool> BPermRewriterNoMasking(
+    "ppc-bit-perm-rewriter-stress-rotates",
+    cl::desc("stress rotate selection in aggressive ppc isel for "
+             "bit permutations"),
+    cl::Hidden);
  
  namespace llvm {
    void initializePPCDAGToDAGISelPass(PassRegistry&);
@@ -59,22 +63,20 @@ namespace {
    ///
    class PPCDAGToDAGISel : public SelectionDAGISel {
      const PPCTargetMachine &TM;
-    const PPCTargetLowering *PPCLowering;
      const PPCSubtarget *PPCSubTarget;
+    const PPCTargetLowering *PPCLowering;
      unsigned GlobalBaseReg;
    public:
      explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
-        : SelectionDAGISel(tm), TM(tm),
-          PPCLowering(TM.getSubtargetImpl()->getTargetLowering()),
-          PPCSubTarget(TM.getSubtargetImpl()) {
+        : SelectionDAGISel(tm), TM(tm) {
        initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry());
      }
  
      bool runOnMachineFunction(MachineFunction &MF) override {
        // Make sure we re-emit a set of the global base reg if necessary
        GlobalBaseReg = 0;
-      PPCLowering = TM.getSubtargetImpl()->getTargetLowering();
-      PPCSubTarget = TM.getSubtargetImpl();
+      PPCSubTarget = &MF.getSubtarget<PPCSubtarget>();
+      PPCLowering = PPCSubTarget->getTargetLowering();
        SelectionDAGISel::runOnMachineFunction(MF);
  
        if (!PPCSubTarget->isSVR4ABI())
@@ -188,7 +190,7 @@ namespace {
                                        std::vector<SDValue> &OutOps) override {
        // We need to make sure that this one operand does not end up in r0
        // (because we might end up lowering this as 0(%op)).
-      const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
+      const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo();
        const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF, /*Kind=*/1);
        SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
        SDValue NewOp =
@@ -217,9 +219,12 @@ private:
      void PeepholeCROps();
  
      SDValue combineToCMPB(SDNode *N);
+    void foldBoolExts(SDValue &Res, SDNode *&N);
  
      bool AllUsersSelectZero(SDNode *N);
      void SwapAllSelectUsers(SDNode *N);
+
+    SDNode *transferMemOperands(SDNode *N, SDNode *Result);
    };
  }
  
@@ -257,7 +262,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
    unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
    unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
  
-  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
    MachineBasicBlock &EntryBB = *Fn.begin();
    DebugLoc dl;
    // Emit the following code into the entry block:
@@ -293,7 +298,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
  ///
  SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
    if (!GlobalBaseReg) {
-    const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+    const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
      // Insert the set of GlobalBaseReg into the first MBB of the function
      MachineBasicBlock &FirstMBB = MF->front();
      MachineBasicBlock::iterator MBBI = FirstMBB.begin();
@@ -306,12 +311,13 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
          if (M->getPICLevel() == PICLevel::Small) {
            BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MoveGOTtoLR));
            BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+          MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
          } else {
            BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
            BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
            unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
            BuildMI(FirstMBB, MBBI, dl,
-                  TII.get(PPC::UpdateGBR)).addReg(GlobalBaseReg)
+                  TII.get(PPC::UpdateGBR), GlobalBaseReg)
                    .addReg(TempReg, RegState::Define).addReg(GlobalBaseReg);
            MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
          }
@@ -602,16 +608,33 @@ static unsigned SelectInt64CountDirect(int64_t Imm) {
    return Result;
  }
  
+static uint64_t Rot64(uint64_t Imm, unsigned R) { // Rotate Imm left by R bits.
+  return (Imm << R) | (Imm >> (64 - R)); // Requires 0 < R < 64.
+}
+
  static unsigned SelectInt64Count(int64_t Imm) {
-  unsigned DirectCount = SelectInt64CountDirect(Imm);
+  unsigned Count = SelectInt64CountDirect(Imm); // cost with no rotation
+  if (Count == 1) // every alternative below is a direct count plus one
+    return Count;
+
+  for (unsigned r = 1; r < 63; ++r) { // try each left-rotation amount 1..62
+    uint64_t RImm = Rot64(Imm, r);
+    unsigned RCount = SelectInt64CountDirect(RImm) + 1; // +1: rotate back
+    Count = std::min(Count, RCount);
+
+    // See comments in SelectInt64 for an explanation of the logic below.
+    unsigned LS = findLastSet(RImm); // index of the highest set bit of RImm
+    if (LS != r-1) // the extra-ones variant only applies for this alignment
+      continue;
  
-  // If might be cheaper to materialize the bit-inverted constant, and then
-  // flip the bits (which takes one nor instruction).
-  unsigned NotDirectCount = SelectInt64CountDirect(~(uint64_t) Imm) + 1;
-  if (NotDirectCount < DirectCount)
-    return NotDirectCount;
+    uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); // all bits above LS
+    uint64_t RImmWithOnes = RImm | OnesMask;
  
-  return DirectCount;
+    RCount = SelectInt64CountDirect(RImmWithOnes) + 1; // +1: rotate-and-mask
+    Count = std::min(Count, RCount);
+  }
+
+  return Count; // smallest count seen over all candidates
  }
  
  // Select a 64-bit constant. For cost-modeling purposes, SelectInt64Count
@@ -691,19 +714,59 @@ static SDNode *SelectInt64Direct(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) {
  }
  
  static SDNode *SelectInt64(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) {
-  unsigned DirectCount = SelectInt64CountDirect(Imm);
-
-  // If might be cheaper to materialize the bit-inverted constant, and then
-  // flip the bits (which takes one nor instruction).
-  unsigned NotDirectCount = SelectInt64CountDirect(~(uint64_t) Imm) + 1;
-  if (NotDirectCount < DirectCount) {
-    SDValue NotDirectVal =
-      SDValue(SelectInt64Direct(CurDAG, dl, ~(uint64_t) Imm), 0);
-    return CurDAG->getMachineNode(PPC::NOR8, dl, MVT::i64, NotDirectVal,
-                                  NotDirectVal);
+  unsigned Count = SelectInt64CountDirect(Imm); // cost with no rotation
+  if (Count == 1) // every alternative below is a direct count plus one
+    return SelectInt64Direct(CurDAG, dl, Imm);
+
+  unsigned RMin = 0; // best rotation found; 0 means "use the direct form"
+
+  int64_t MatImm; // value to materialize; only set together with RMin
+  unsigned MaskEnd; // RLDICR mask-end operand; only set together with RMin
+
+  for (unsigned r = 1; r < 63; ++r) { // try each left-rotation amount 1..62
+    uint64_t RImm = Rot64(Imm, r);
+    unsigned RCount = SelectInt64CountDirect(RImm) + 1; // +1: the RLDICR
+    if (RCount < Count) {
+      Count = RCount;
+      RMin = r;
+      MatImm = RImm;
+      MaskEnd = 63; // plain rotate back, no bits masked off
+    }
+
+    // If the immediate to generate has many trailing zeros, it might be
+    // worthwhile to generate a rotated value with too many leading ones
+    // (because that's free with li/lis's sign-extension semantics), and then
+    // mask them off after rotation.
+
+    unsigned LS = findLastSet(RImm); // index of the highest set bit of RImm
+    // We're adding (63-LS) higher-order ones, and we expect to mask them off
+    // after performing the inverse rotation by (64-r). So we need that:
+    //   63-LS == 64-r => LS == r-1
+    if (LS != r-1)
+      continue;
+
+    uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); // all bits above LS
+    uint64_t RImmWithOnes = RImm | OnesMask;
+
+    RCount = SelectInt64CountDirect(RImmWithOnes) + 1; // +1: the RLDICR
+    if (RCount < Count) {
+      Count = RCount;
+      RMin = r;
+      MatImm = RImmWithOnes;
+      MaskEnd = LS; // the rotate must also clear the ones added above
+    }
    }
  
-  return SelectInt64Direct(CurDAG, dl, Imm);
+  if (!RMin) // no rotated candidate beat the direct sequence
+    return SelectInt64Direct(CurDAG, dl, Imm);
+
+  auto getI32Imm = [CurDAG](unsigned Imm) { // wrap Imm as an i32 target constant
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+  };
+
+  SDValue Val = SDValue(SelectInt64Direct(CurDAG, dl, MatImm), 0);
+  return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val, // rotate by
+                                getI32Imm(64 - RMin), getI32Imm(MaskEnd)); // 64-RMin to undo RMin
  }
  
  // Select a 64-bit constant.
@@ -2232,6 +2295,9 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
    // Altivec Vector compare instructions do not set any CR register by default and
    // vector compare operations return the same type as the operands.
    if (LHS.getValueType().isVector()) {
+    if (PPCSubTarget->hasQPX())
+      return nullptr;
+
      EVT VecVT = LHS.getValueType();
      bool Swap, Negate;
      unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC,
@@ -2278,6 +2344,14 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
    return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1));
  }
  
+SDNode *PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
+  // Copy N's memory operand onto the machine node Result, then return Result.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); // one slot
+  MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); // N must be a MemSDNode
+  cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); // Result: MachineSDNode
+  return Result;
+}
+
  
  // Select - Convert the specified operand from a target-independent to a
  // target-specific node if it hasn't already been changed.
@@ -2396,9 +2470,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
        SDValue Chain = LD->getChain();
        SDValue Base = LD->getBasePtr();
        SDValue Ops[] = { Offset, Base, Chain };
-      return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
-                                    PPCLowering->getPointerTy(),
-                                    MVT::Other, Ops);
+      return transferMemOperands(N, CurDAG->getMachineNode(Opcode, dl,
+                                      LD->getValueType(0),
+                                      PPCLowering->getPointerTy(),
+                                      MVT::Other, Ops));
      } else {
        unsigned Opcode;
        bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
@@ -2407,6 +2482,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
          assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
          switch (LoadedVT.getSimpleVT().SimpleTy) {
            default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX
+          case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX
            case MVT::f64: Opcode = PPC::LFDUX; break;
            case MVT::f32: Opcode = PPC::LFSUX; break;
            case MVT::i32: Opcode = PPC::LWZUX; break;
@@ -2431,9 +2508,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
        SDValue Chain = LD->getChain();
        SDValue Base = LD->getBasePtr();
        SDValue Ops[] = { Base, Offset, Chain };
-      return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
-                                    PPCLowering->getPointerTy(),
-                                    MVT::Other, Ops);
+      return transferMemOperands(N, CurDAG->getMachineNode(Opcode, dl,
+                                      LD->getValueType(0),
+                                      PPCLowering->getPointerTy(),
+                                      MVT::Other, Ops));
      }
    }
  
@@ -2462,7 +2540,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
      if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
          isMask_64(Imm64)) {
        SDValue Val = N->getOperand(0);
-      MB = 64 - CountTrailingOnes_64(Imm64);
+      MB = 64 - countTrailingOnes(Imm64);
        SH = 0;
  
        // If the operand is a logical right shift, we can fold it into this
@@ -2650,6 +2728,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
          SelectCCOp = PPC::SELECT_CC_VSFRC;
        else
          SelectCCOp = PPC::SELECT_CC_F8;
+    else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f64)
+      SelectCCOp = PPC::SELECT_CC_QFRC;
+    else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f32)
+      SelectCCOp = PPC::SELECT_CC_QSRC;
+    else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4i1)
+      SelectCCOp = PPC::SELECT_CC_QBRC;
      else if (N->getValueType(0) == MVT::v2f64 ||
               N->getValueType(0) == MVT::v2i64)
        SelectCCOp = PPC::SELECT_CC_VSRC;
@@ -2779,8 +2863,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
              "Only supported for 64-bit ABI and 32-bit SVR4");
      if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) {
        SDValue GA = N->getOperand(0);
-      return CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA,
-                                    N->getOperand(1));
+      return transferMemOperands(N, CurDAG->getMachineNode(PPC::LWZtoc, dl,
+                                      MVT::i32, GA, N->getOperand(1)));
      }
  
      // For medium and large code model, we generate two instructions as
@@ -2800,12 +2884,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
      SDValue GA = N->getOperand(0);
      SDValue TOCbase = N->getOperand(1);
      SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
-                                        TOCbase, GA);
+                                         TOCbase, GA);
  
      if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA) ||
          CModel == CodeModel::Large)
-      return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
-                                    SDValue(Tmp, 0));
+      return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl,
+                                      MVT::i64, GA, SDValue(Tmp, 0)));
  
      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
        const GlobalValue *GValue = G->getGlobal();
@@ -2813,8 +2897,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
             (GValue->isDeclaration() || GValue->isWeakForLinker())) ||
            GValue->isDeclaration() || GValue->hasCommonLinkage() ||
            GValue->hasAvailableExternallyLinkage())
-        return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
-                                      SDValue(Tmp, 0));
+        return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl,
+                                        MVT::i64, GA, SDValue(Tmp, 0)));
      }
  
      return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
@@ -3116,6 +3200,73 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
    return Res;
  }
  
+// When CR bit registers are enabled, an extension of an i1 variable to an i32
+// or i64 value is lowered in terms of a SELECT_I[48] operation, and thus
+// involves constant materialization of a 0 or a 1 or both. If the single user
+// of the extension constant folds with both of those values, and each folded
+// constant can be materialized using only one instruction, we fold the user
+// into the select and repeat with that user's own user. On success Res is
+// the replacement select and N is advanced to the node it stands in for.
+void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
+  if (!PPCSubTarget->useCRBits())
+    return;
+
+  if (N->getOpcode() != ISD::ZERO_EXTEND &&
+      N->getOpcode() != ISD::SIGN_EXTEND &&
+      N->getOpcode() != ISD::ANY_EXTEND)
+    return;
+
+  if (N->getOperand(0).getValueType() != MVT::i1) // only extensions of an i1
+    return;
+
+  if (!N->hasOneUse()) // the extension must feed exactly one user
+    return;
+
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Cond = N->getOperand(0);
+  SDValue ConstTrue = // -1 for a sign extension, otherwise 1
+    CurDAG->getConstant(N->getOpcode() == ISD::SIGN_EXTEND ? -1 : 1, VT);
+  SDValue ConstFalse = CurDAG->getConstant(0, VT);
+
+  do {
+    SDNode *User = *N->use_begin();
+    if (User->getNumOperands() != 2) // only two-operand users are handled
+      break;
+
+    auto TryFold = [this, N, User](SDValue Val) { // fold User with N := Val
+      SDValue UserO0 = User->getOperand(0), UserO1 = User->getOperand(1);
+      SDValue O0 = UserO0.getNode() == N ? Val : UserO0;
+      SDValue O1 = UserO1.getNode() == N ? Val : UserO1;
+
+      return CurDAG->FoldConstantArithmetic(User->getOpcode(),
+                                            User->getValueType(0),
+                                            O0.getNode(), O1.getNode());
+    };
+
+    SDValue TrueRes = TryFold(ConstTrue);
+    if (!TrueRes) // User did not fold with the "true" constant
+      break;
+    SDValue FalseRes = TryFold(ConstFalse);
+    if (!FalseRes) // User did not fold with the "false" constant
+      break;
+
+    // For us to materialize these using one instruction, we must be able to
+    // represent them as signed 16-bit integers.
+    uint64_t True  = cast<ConstantSDNode>(TrueRes)->getZExtValue(),
+             False = cast<ConstantSDNode>(FalseRes)->getZExtValue();
+    if (!isInt<16>(True) || !isInt<16>(False)) // NOTE(review): a zero-extended i32 -1 fails this; intended?
+      break;
+
+    // We can replace User with a new SELECT node, and try again to see if we
+    // can fold the select with its user.
+    Res = CurDAG->getSelect(dl, User->getValueType(0), Cond, TrueRes, FalseRes);
+    N = User; // continue the walk from the user just folded
+    ConstTrue = TrueRes;
+    ConstFalse = FalseRes;
+  } while (N->hasOneUse());
+}
+
  void PPCDAGToDAGISel::PreprocessISelDAG() {
    SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
    ++Position;
@@ -3134,6 +3285,9 @@ void PPCDAGToDAGISel::PreprocessISelDAG() {
        break;
      }
  
+    if (!Res)
+      foldBoolExts(Res, N);
+
      if (Res) {
        DEBUG(dbgs() << "PPC DAG preprocessing replacing:\nOld:    ");
        DEBUG(N->dump(CurDAG));
@@ -3275,6 +3429,9 @@ void PPCDAGToDAGISel::PeepholeCROps() {
        case PPC::SELECT_I8:
        case PPC::SELECT_F4:
        case PPC::SELECT_F8:
+      case PPC::SELECT_QFRC:
+      case PPC::SELECT_QSRC:
+      case PPC::SELECT_QBRC:
        case PPC::SELECT_VRRC:
        case PPC::SELECT_VSFRC:
        case PPC::SELECT_VSRC: {
@@ -3582,6 +3739,9 @@ void PPCDAGToDAGISel::PeepholeCROps() {
        case PPC::SELECT_I8:
        case PPC::SELECT_F4:
        case PPC::SELECT_F8:
+      case PPC::SELECT_QFRC:
+      case PPC::SELECT_QSRC:
+      case PPC::SELECT_QBRC:
        case PPC::SELECT_VRRC:
        case PPC::SELECT_VSFRC:
        case PPC::SELECT_VSRC:
@@ -3672,6 +3832,19 @@ static bool PeepholePPC64ZExtGather(SDValue Op32,
      return true;
    }
  
+  // LHBRX and LWBRX always clear the higher-order bits.
+  if (Op32.getMachineOpcode() == PPC::LHBRX ||
+      Op32.getMachineOpcode() == PPC::LWBRX) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // CNTLZW always produces a 64-bit value in [0,32], and so is zero extended.
+  if (Op32.getMachineOpcode() == PPC::CNTLZW) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
    // Next, check for those instructions we can look through.
  
    // Assuming the mask does not wrap around, then the higher-order bits are
@@ -3859,6 +4032,9 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
        case PPC::SRW:       NewOpcode = PPC::SRW8; break;
        case PPC::LI:        NewOpcode = PPC::LI8; break;
        case PPC::LIS:       NewOpcode = PPC::LIS8; break;
+      case PPC::LHBRX:     NewOpcode = PPC::LHBRX8; break;
+      case PPC::LWBRX:     NewOpcode = PPC::LWBRX8; break;
+      case PPC::CNTLZW:    NewOpcode = PPC::CNTLZW8; break;
        case PPC::RLWIMI:    NewOpcode = PPC::RLWIMI8; break;
        case PPC::OR:        NewOpcode = PPC::OR8; break;
        case PPC::SELECT_I4: NewOpcode = PPC::SELECT_I8; break;