lib/Target/PowerPC/PPCTargetTransformInfo.cpp

   1 //===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 /// \file
  10 /// This file implements a TargetTransformInfo analysis pass specific to the
  11 /// PPC target machine. It uses the target's detailed information to provide
  12 /// more precise answers to certain TTI queries, while letting the target
  13 /// independent and default TTI implementations handle the rest.
  14 ///
  15 //===----------------------------------------------------------------------===//
  16
  17 #include "PPC.h"
  18 #include "PPCTargetMachine.h"
  19 #include "llvm/Analysis/TargetTransformInfo.h"
  20 #include "llvm/Support/CommandLine.h"
  21 #include "llvm/Support/Debug.h"
  22 #include "llvm/Target/CostTable.h"
  23 #include "llvm/Target/TargetLowering.h"
  24 using namespace llvm;
  25
  26 #define DEBUG_TYPE "ppctti"
  27
  28 static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
  29 cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
  30
  31 // Declare the pass initialization routine locally as target-specific passes
  32 // don't have a target-wide initialization entry point, and so we rely on the
  33 // pass constructor initialization.
  34 namespace llvm {
  35 void initializePPCTTIPass(PassRegistry &);
  36 }
  37
  38 namespace {
  39
  40 class PPCTTI final : public ImmutablePass, public TargetTransformInfo {
  41   const PPCSubtarget *ST;
  42   const PPCTargetLowering *TLI;
  43
  44 public:
  45   PPCTTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
  46     llvm_unreachable("This pass cannot be directly constructed");
  47   }
  48
  49   PPCTTI(const PPCTargetMachine *TM)
  50       : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
  51         TLI(TM->getSubtargetImpl()->getTargetLowering()) {
  52     initializePPCTTIPass(*PassRegistry::getPassRegistry());
  53   }
  54
  55   void initializePass() override {
  56     pushTTIStack(this);
  57   }
  58
  59   void getAnalysisUsage(AnalysisUsage &AU) const override {
  60     TargetTransformInfo::getAnalysisUsage(AU);
  61   }
  62
  63   /// Pass identification.
  64   static char ID;
  65
  66   /// Provide necessary pointer adjustments for the two base classes.
  67   void *getAdjustedAnalysisPointer(const void *ID) override {
  68     if (ID == &TargetTransformInfo::ID)
  69       return (TargetTransformInfo*)this;
  70     return this;
  71   }
  72
  73   /// \name Scalar TTI Implementations
  74   /// @{
  75   unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
  76
  77   unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
  78                          Type *Ty) const override;
  79   unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
  80                          Type *Ty) const override;
  81
  82   PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
  83   void getUnrollingPreferences(
  84     Loop *L, UnrollingPreferences &UP) const override;
  85
  86   /// @}
  87
  88   /// \name Vector TTI Implementations
  89   /// @{
  90
  91   unsigned getNumberOfRegisters(bool Vector) const override;
  92   unsigned getRegisterBitWidth(bool Vector) const override;
  93   unsigned getMaximumUnrollFactor() const override;
  94   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
  95                                   OperandValueKind, OperandValueProperties,
  96                                   OperandValueProperties) const override;
  97   unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
  98                           int Index, Type *SubTp) const override;
  99   unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
 100                             Type *Src) const override;
 101   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 102                               Type *CondTy) const override;
 103   unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
 104                               unsigned Index) const override;
 105   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
 106                            unsigned AddressSpace) const override;
 107
 108   /// @}
 109 };
 110
 111 } // end anonymous namespace
 112
 113 INITIALIZE_AG_PASS(PPCTTI, TargetTransformInfo, "ppctti",
 114                    "PPC Target Transform Info", true, true, false)
 115 char PPCTTI::ID = 0;
 116
 117 ImmutablePass *
 118 llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) {
 119   return new PPCTTI(TM);
 120 }
 121
 122
 123 //===----------------------------------------------------------------------===//
 124 //
 125 // PPC cost model.
 126 //
 127 //===----------------------------------------------------------------------===//
 128
 129 PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
 130   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
 131   if (ST->hasPOPCNTD() && TyWidth <= 64)
 132     return PSK_FastHardware;
 133   return PSK_Software;
 134 }
 135
 136 unsigned PPCTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
 137   if (DisablePPCConstHoist)
 138     return TargetTransformInfo::getIntImmCost(Imm, Ty);
 139
 140   assert(Ty->isIntegerTy());
 141
 142   unsigned BitSize = Ty->getPrimitiveSizeInBits();
 143   if (BitSize == 0)
 144     return ~0U;
 145
 146   if (Imm == 0)
 147     return TCC_Free;
 148
 149   if (Imm.getBitWidth() <= 64) {
 150     if (isInt<16>(Imm.getSExtValue()))
 151       return TCC_Basic;
 152
 153     if (isInt<32>(Imm.getSExtValue())) {
 154       // A constant that can be materialized using lis.
 155       if ((Imm.getZExtValue() & 0xFFFF) == 0)
 156         return TCC_Basic;
 157
 158       return 2 * TCC_Basic;
 159     }
 160   }
 161
 162   return 4 * TCC_Basic;
 163 }
 164
 165 unsigned PPCTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
 166                                const APInt &Imm, Type *Ty) const {
 167   if (DisablePPCConstHoist)
 168     return TargetTransformInfo::getIntImmCost(IID, Idx, Imm, Ty);
 169
 170   assert(Ty->isIntegerTy());
 171
 172   unsigned BitSize = Ty->getPrimitiveSizeInBits();
 173   if (BitSize == 0)
 174     return ~0U;
 175
 176   switch (IID) {
 177   default: return TCC_Free;
 178   case Intrinsic::sadd_with_overflow:
 179   case Intrinsic::uadd_with_overflow:
 180   case Intrinsic::ssub_with_overflow:
 181   case Intrinsic::usub_with_overflow:
 182     if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
 183       return TCC_Free;
 184     break;
 185   }
 186   return PPCTTI::getIntImmCost(Imm, Ty);
 187 }
 188
 189 unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
 190                                Type *Ty) const {
 191   if (DisablePPCConstHoist)
 192     return TargetTransformInfo::getIntImmCost(Opcode, Idx, Imm, Ty);
 193
 194   assert(Ty->isIntegerTy());
 195
 196   unsigned BitSize = Ty->getPrimitiveSizeInBits();
 197   if (BitSize == 0)
 198     return ~0U;
 199
 200   unsigned ImmIdx = ~0U;
 201   bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
 202        ZeroFree = false;
 203   switch (Opcode) {
 204   default: return TCC_Free;
 205   case Instruction::GetElementPtr:
 206     // Always hoist the base address of a GetElementPtr. This prevents the
 207     // creation of new constants for every base constant that gets constant
 208     // folded with the offset.
 209     if (Idx == 0)
 210       return 2 * TCC_Basic;
 211     return TCC_Free;
 212   case Instruction::And:
 213     RunFree = true; // (for the rotate-and-mask instructions)
 214     // Fallthrough...
 215   case Instruction::Add:
 216   case Instruction::Or:
 217   case Instruction::Xor:
 218     ShiftedFree = true;
 219     // Fallthrough...
 220   case Instruction::Sub:
 221   case Instruction::Mul:
 222   case Instruction::Shl:
 223   case Instruction::LShr:
 224   case Instruction::AShr:
 225     ImmIdx = 1;
 226     break;
 227   case Instruction::ICmp:
 228     UnsignedFree = true;
 229     ImmIdx = 1;
 230     // Fallthrough... (zero comparisons can use record-form instructions)
 231   case Instruction::Select:
 232     ZeroFree = true;
 233     break;
 234   case Instruction::PHI:
 235   case Instruction::Call:
 236   case Instruction::Ret:
 237   case Instruction::Load:
 238   case Instruction::Store:
 239     break;
 240   }
 241
 242   if (ZeroFree && Imm == 0)
 243     return TCC_Free;
 244
 245   if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
 246     if (isInt<16>(Imm.getSExtValue()))
 247       return TCC_Free;
 248
 249     if (RunFree) {
 250       if (Imm.getBitWidth() <= 32 &&
 251           (isShiftedMask_32(Imm.getZExtValue()) ||
 252            isShiftedMask_32(~Imm.getZExtValue())))
 253         return TCC_Free;
 254
 255
 256       if (ST->isPPC64() &&
 257           (isShiftedMask_64(Imm.getZExtValue()) ||
 258            isShiftedMask_64(~Imm.getZExtValue())))
 259         return TCC_Free;
 260     }
 261
 262     if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
 263       return TCC_Free;
 264
 265     if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
 266       return TCC_Free;
 267   }
 268
 269   return PPCTTI::getIntImmCost(Imm, Ty);
 270 }
 271
 272 void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
 273   if (ST->getDarwinDirective() == PPC::DIR_A2) {
 274     // The A2 is in-order with a deep pipeline, and concatenation unrolling
 275     // helps expose latency-hiding opportunities to the instruction scheduler.
 276     UP.Partial = UP.Runtime = true;
 277   }
 278 }
 279
 280 unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
 281   if (Vector && !ST->hasAltivec())
 282     return 0;
 283   return ST->hasVSX() ? 64 : 32;
 284 }
 285
 286 unsigned PPCTTI::getRegisterBitWidth(bool Vector) const {
 287   if (Vector) {
 288     if (ST->hasAltivec()) return 128;
 289     return 0;
 290   }
 291
 292   if (ST->isPPC64())
 293     return 64;
 294   return 32;
 295
 296 }
 297
 298 unsigned PPCTTI::getMaximumUnrollFactor() const {
 299   unsigned Directive = ST->getDarwinDirective();
 300   // The 440 has no SIMD support, but floating-point instructions
 301   // have a 5-cycle latency, so unroll by 5x for latency hiding.
 302   if (Directive == PPC::DIR_440)
 303     return 5;
 304
 305   // The A2 has no SIMD support, but floating-point instructions
 306   // have a 6-cycle latency, so unroll by 6x for latency hiding.
 307   if (Directive == PPC::DIR_A2)
 308     return 6;
 309
 310   // FIXME: For lack of any better information, do no harm...
 311   if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
 312     return 1;
 313
 314   // For most things, modern systems have two execution units (and
 315   // out-of-order execution).
 316   return 2;
 317 }
 318
 319 unsigned PPCTTI::getArithmeticInstrCost(
 320     unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
 321     OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
 322     OperandValueProperties Opd2PropInfo) const {
 323   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 324
 325   // Fallback to the default implementation.
 326   return TargetTransformInfo::getArithmeticInstrCost(
 327       Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
 328 }
 329
 330 unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
 331                                 Type *SubTp) const {
 332   return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 333 }
 334
 335 unsigned PPCTTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
 336   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 337
 338   return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
 339 }
 340
 341 unsigned PPCTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 342                                     Type *CondTy) const {
 343   return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 344 }
 345
 346 unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
 347                                     unsigned Index) const {
 348   assert(Val->isVectorTy() && "This must be a vector type");
 349
 350   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 351   assert(ISD && "Invalid opcode");
 352
 353   if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
 354     // Double-precision scalars are already located in index #0.
 355     if (Index == 0)
 356       return 0;
 357
 358     return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
 359   }
 360
 361   // Estimated cost of a load-hit-store delay.  This was obtained
 362   // experimentally as a minimum needed to prevent unprofitable
 363   // vectorization for the paq8p benchmark.  It may need to be
 364   // raised further if other unprofitable cases remain.
 365   unsigned LHSPenalty = 2;
 366   if (ISD == ISD::INSERT_VECTOR_ELT)
 367     LHSPenalty += 7;
 368
 369   // Vector element insert/extract with Altivec is very expensive,
 370   // because they require store and reload with the attendant
 371   // processor stall for load-hit-store.  Until VSX is available,
 372   // these need to be estimated as very costly.
 373   if (ISD == ISD::EXTRACT_VECTOR_ELT ||
 374       ISD == ISD::INSERT_VECTOR_ELT)
 375     return LHSPenalty +
 376       TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
 377
 378   return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
 379 }
 380
 381 unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
 382                                  unsigned AddressSpace) const {
 383   // Legalize the type.
 384   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
 385   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
 386          "Invalid Opcode");
 387
 388   unsigned Cost =
 389     TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
 390
 391   // VSX loads/stores support unaligned access.
 392   if (ST->hasVSX()) {
 393     if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
 394       return Cost;
 395   }
 396
 397   bool UnalignedAltivec =
 398     Src->isVectorTy() &&
 399     Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
 400     LT.second.getSizeInBits() == 128 &&
 401     Opcode == Instruction::Load;
 402
 403   // PPC in general does not support unaligned loads and stores. They'll need
 404   // to be decomposed based on the alignment factor.
 405   unsigned SrcBytes = LT.second.getStoreSize();
 406   if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
 407     Cost += LT.first*(SrcBytes/Alignment-1);
 408
 409     // For a vector type, there is also scalarization overhead (only for
 410     // stores, loads are expanded using the vector-load + permutation sequence,
 411     // which is much less expensive).
 412     if (Src->isVectorTy() && Opcode == Instruction::Store)
 413       for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
 414         Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
 415   }
 416
 417   return Cost;
 418 }
 419