lib/Target/SparcV9/SparcV9TargetMachine.cpp

   1 // $Id$
   2 //***************************************************************************
   3 // File:
   4 //      Sparc.cpp
   5 //
   6 // Purpose:
   7 //
   8 // History:
   9 //      7/15/01  -  Vikram Adve  -  Created
  10 //**************************************************************************/
  11
  12
  13 #include "SparcInternals.h"
  14 #include "llvm/Target/Sparc.h"
  15 #include "llvm/CodeGen/InstrScheduling.h"
  16 #include "llvm/CodeGen/InstrSelection.h"
  17 #include "llvm/CodeGen/MachineCodeForInstruction.h"
  18 #include "llvm/CodeGen/MachineCodeForMethod.h"
  19 #include "llvm/CodeGen/RegisterAllocation.h"
  20 #include "llvm/Method.h"
  21 #include "llvm/PassManager.h"
  22 #include <iostream>
  23 using std::cerr;
  24
  25 // Build the MachineInstruction Description Array...
  26 const MachineInstrDescriptor SparcMachineInstrDesc[] = {
  27 #define I(ENUM, OPCODESTRING, NUMOPERANDS, RESULTPOS, MAXIMM, IMMSE, \
  28           NUMDELAYSLOTS, LATENCY, SCHEDCLASS, INSTFLAGS)             \
  29   { OPCODESTRING, NUMOPERANDS, RESULTPOS, MAXIMM, IMMSE,             \
  30           NUMDELAYSLOTS, LATENCY, SCHEDCLASS, INSTFLAGS },
  31 #include "SparcInstr.def"
  32 };
  33
  34 //----------------------------------------------------------------------------
  35 // allocateSparcTargetMachine - Allocate and return a subclass of TargetMachine
  36 // that implements the Sparc backend. (the llvm/CodeGen/Sparc.h interface)
  37 //----------------------------------------------------------------------------
  38 //
  39
  40 TargetMachine *allocateSparcTargetMachine() { return new UltraSparc(); }
  41
  42
  43 //---------------------------------------------------------------------------
  44 // class InsertPrologEpilogCode
  45 //
  46 // Insert SAVE/RESTORE instructions for the method
  47 //
  48 // Insert prolog code at the unique method entry point.
  49 // Insert epilog code at each method exit point.
  50 // InsertPrologEpilog invokes these only if the method is not compiled
  51 // with the leaf method optimization.
  52 //
  53 //---------------------------------------------------------------------------
  54 static MachineInstr* minstrVec[MAX_INSTR_PER_VMINSTR];
  55
  56 class InsertPrologEpilogCode : public MethodPass {
  57   TargetMachine &Target;
  58 public:
  59   inline InsertPrologEpilogCode(TargetMachine &T) : Target(T) {}
  60   bool runOnMethod(Method *M) {
  61     MachineCodeForMethod &mcodeInfo = MachineCodeForMethod::get(M);
  62     if (!mcodeInfo.isCompiledAsLeafMethod()) {
  63       InsertPrologCode(M);
  64       InsertEpilogCode(M);
  65     }
  66     return false;
  67   }
  68
  69   void InsertPrologCode(Method *M);
  70   void InsertEpilogCode(Method *M);
  71 };
  72
  73 void InsertPrologEpilogCode::InsertPrologCode(Method* method)
  74 {
  75   BasicBlock* entryBB = method->getEntryNode();
  76   unsigned N = GetInstructionsForProlog(entryBB, Target, minstrVec);
  77   assert(N <= MAX_INSTR_PER_VMINSTR);
  78   MachineCodeForBasicBlock& bbMvec = entryBB->getMachineInstrVec();
  79   bbMvec.insert(bbMvec.begin(), minstrVec, minstrVec+N);
  80 }
  81
  82
  83 void InsertPrologEpilogCode::InsertEpilogCode(Method* method)
  84 {
  85   for (Method::iterator I=method->begin(), E=method->end(); I != E; ++I)
  86     if ((*I)->getTerminator()->getOpcode() == Instruction::Ret)
  87       {
  88         BasicBlock* exitBB = *I;
  89         unsigned N = GetInstructionsForEpilog(exitBB, Target, minstrVec);
  90
  91         MachineCodeForBasicBlock& bbMvec = exitBB->getMachineInstrVec();
  92         MachineCodeForInstruction &termMvec =
  93           MachineCodeForInstruction::get(exitBB->getTerminator());
  94
  95         // Remove the NOPs in the delay slots of the return instruction
  96         const MachineInstrInfo &mii = Target.getInstrInfo();
  97         unsigned numNOPs = 0;
  98         while (termMvec.back()->getOpCode() == NOP)
  99           {
 100             assert( termMvec.back() == bbMvec.back());
 101             termMvec.pop_back();
 102             bbMvec.pop_back();
 103             ++numNOPs;
 104           }
 105         assert(termMvec.back() == bbMvec.back());
 106
 107         // Check that we found the right number of NOPs and have the right
 108         // number of instructions to replace them.
 109         unsigned ndelays = mii.getNumDelaySlots(termMvec.back()->getOpCode());
 110         assert(numNOPs == ndelays && "Missing NOPs in delay slots?");
 111         assert(N == ndelays && "Cannot use epilog code for delay slots?");
 112
 113         // Append the epilog code to the end of the basic block.
 114         bbMvec.push_back(minstrVec[0]);
 115       }
 116 }
 117
 118
 119 /*---------------------------------------------------------------------------
 120 Scheduling guidelines for SPARC IIi:
 121
 122 I-Cache alignment rules (pg 326)
  123 -- Align a branch target instruction so that its entire group is within
 124    the same cache line (may be 1-4 instructions).
 125 ** Don't let a branch that is predicted taken be the last instruction
 126    on an I-cache line: delay slot will need an entire line to be fetched
 127 -- Make a FP instruction or a branch be the 4th instruction in a group.
 128    For branches, there are tradeoffs in reordering to make this happen
 129    (see pg. 327).
 130 ** Don't put a branch in a group that crosses a 32-byte boundary!
 131    An artificial branch is inserted after every 32 bytes, and having
 132    another branch will force the group to be broken into 2 groups.
 133
 134 iTLB rules:
 135 -- Don't let a loop span two memory pages, if possible
 136
 137 Branch prediction performance:
 138 -- Don't make the branch in a delay slot the target of a branch
 139 -- Try not to have 2 predicted branches within a group of 4 instructions
 140    (because each such group has a single branch target field).
 141 -- Try to align branches in slots 0, 2, 4 or 6 of a cache line (to avoid
 142    the wrong prediction bits being used in some cases).
 143
 144 D-Cache timing constraints:
 145 -- Signed int loads of less than 64 bits have 3 cycle latency, not 2
 146 -- All other loads that hit in D-Cache have 2 cycle latency
 147 -- All loads are returned IN ORDER, so a D-Cache miss will delay a later hit
 148 -- Mis-aligned loads or stores cause a trap.  In particular, replace
 149    mis-aligned FP double precision l/s with 2 single-precision l/s.
 150 -- Simulations of integer codes show increase in avg. group size of
 151    33% when code (including esp. non-faulting loads) is moved across
 152    one branch, and 50% across 2 branches.
 153
 154 E-Cache timing constraints:
 155 -- Scheduling for E-cache (D-Cache misses) is effective (due to load buffering)
 156
 157 Store buffer timing constraints:
 158 -- Stores can be executed in same cycle as instruction producing the value
 159 -- Stores are buffered and have lower priority for E-cache until
 160    highwater mark is reached in the store buffer (5 stores)
 161
 162 Pipeline constraints:
 163 -- Shifts can only use IEU0.
 164 -- CC setting instructions can only use IEU1.
 165 -- Several other instructions must only use IEU1:
 166    EDGE(?), ARRAY(?), CALL, JMPL, BPr, PST, and FCMP.
 167 -- Two instructions cannot store to the same register file in a single cycle
 168    (single write port per file).
 169
 170 Issue and grouping constraints:
 171 -- FP and branch instructions must use slot 4.
 172 -- Shift instructions cannot be grouped with other IEU0-specific instructions.
 173 -- CC setting instructions cannot be grouped with other IEU1-specific instrs.
 174 -- Several instructions must be issued in a single-instruction group:
 175         MOVcc or MOVr, MULs/x and DIVs/x, SAVE/RESTORE, many others
 176 -- A CALL or JMPL breaks a group, ie, is not combined with subsequent instrs.
 177 --
 178 --
 179
 180 Branch delay slot scheduling rules:
 181 -- A CTI couple (two back-to-back CTI instructions in the dynamic stream)
 182    has a 9-instruction penalty: the entire pipeline is flushed when the
 183    second instruction reaches stage 9 (W-Writeback).
 184 -- Avoid putting multicycle instructions, and instructions that may cause
 185    load misses, in the delay slot of an annulling branch.
 186 -- Avoid putting WR, SAVE..., RESTORE and RETURN instructions in the
 187    delay slot of an annulling branch.
 188
 189  *--------------------------------------------------------------------------- */
 190
 191 //---------------------------------------------------------------------------
 192 // List of CPUResources for UltraSPARC IIi.
 193 //---------------------------------------------------------------------------
 194
 195 static const CPUResource  AllIssueSlots(   "All Instr Slots", 4);
 196 static const CPUResource  IntIssueSlots(   "Int Instr Slots", 3);
 197 static const CPUResource  First3IssueSlots("Instr Slots 0-3", 3);
 198 static const CPUResource  LSIssueSlots(    "Load-Store Instr Slot", 1);
 199 static const CPUResource  CTIIssueSlots(   "Ctrl Transfer Instr Slot", 1);
 200 static const CPUResource  FPAIssueSlots(   "Int Instr Slot 1", 1);
 201 static const CPUResource  FPMIssueSlots(   "Int Instr Slot 1", 1);
 202
 203 // IEUN instructions can use either Alu and should use IAluN.
 204 // IEU0 instructions must use Alu 1 and should use both IAluN and IAlu0.
 205 // IEU1 instructions must use Alu 2 and should use both IAluN and IAlu1.
 206 static const CPUResource  IAluN("Int ALU 1or2", 2);
 207 static const CPUResource  IAlu0("Int ALU 1",    1);
 208 static const CPUResource  IAlu1("Int ALU 2",    1);
 209
 210 static const CPUResource  LSAluC1("Load/Store Unit Addr Cycle", 1);
 211 static const CPUResource  LSAluC2("Load/Store Unit Issue Cycle", 1);
 212 static const CPUResource  LdReturn("Load Return Unit", 1);
 213
 214 static const CPUResource  FPMAluC1("FP Mul/Div Alu Cycle 1", 1);
 215 static const CPUResource  FPMAluC2("FP Mul/Div Alu Cycle 2", 1);
 216 static const CPUResource  FPMAluC3("FP Mul/Div Alu Cycle 3", 1);
 217
 218 static const CPUResource  FPAAluC1("FP Other Alu Cycle 1", 1);
 219 static const CPUResource  FPAAluC2("FP Other Alu Cycle 2", 1);
 220 static const CPUResource  FPAAluC3("FP Other Alu Cycle 3", 1);
 221
 222 static const CPUResource  IRegReadPorts("Int Reg ReadPorts", INT_MAX); // CHECK
 223 static const CPUResource  IRegWritePorts("Int Reg WritePorts", 2);     // CHECK
 224 static const CPUResource  FPRegReadPorts("FP Reg Read Ports", INT_MAX);// CHECK
 225 static const CPUResource  FPRegWritePorts("FP Reg Write Ports", 1);    // CHECK
 226
 227 static const CPUResource  CTIDelayCycle( "CTI  delay cycle", 1);
 228 static const CPUResource  FCMPDelayCycle("FCMP delay cycle", 1);
 229
 230
 231
 232 //---------------------------------------------------------------------------
 233 // const InstrClassRUsage SparcRUsageDesc[]
 234 //
 235 // Purpose:
 236 //   Resource usage information for instruction in each scheduling class.
 237 //   The InstrRUsage Objects for individual classes are specified first.
 238 //   Note that fetch and decode are decoupled from the execution pipelines
 239 //   via an instr buffer, so they are not included in the cycles below.
 240 //---------------------------------------------------------------------------
 241
 242 static const InstrClassRUsage NoneClassRUsage = {
 243   SPARC_NONE,
 244   /*totCycles*/ 7,
 245
 246   /* maxIssueNum */ 4,
 247   /* isSingleIssue */ false,
 248   /* breaksGroup */ false,
 249   /* numBubbles */ 0,
 250
 251   /*numSlots*/ 4,
 252   /* feasibleSlots[] */ { 0, 1, 2, 3 },
 253
 254   /*numEntries*/ 0,
 255   /* V[] */ {
 256     /*Cycle G */
 257     /*Ccle E */
 258     /*Cycle C */
 259     /*Cycle N1*/
 260     /*Cycle N1*/
 261     /*Cycle N1*/
 262     /*Cycle W */
 263   }
 264 };
 265
 266 static const InstrClassRUsage IEUNClassRUsage = {
 267   SPARC_IEUN,
 268   /*totCycles*/ 7,
 269
 270   /* maxIssueNum */ 3,
 271   /* isSingleIssue */ false,
 272   /* breaksGroup */ false,
 273   /* numBubbles */ 0,
 274
 275   /*numSlots*/ 3,
 276   /* feasibleSlots[] */ { 0, 1, 2 },
 277
 278   /*numEntries*/ 4,
 279   /* V[] */ {
 280     /*Cycle G */ { AllIssueSlots.rid, 0, 1 },
 281                  { IntIssueSlots.rid, 0, 1 },
 282     /*Cycle E */ { IAluN.rid, 1, 1 },
 283     /*Cycle C */
 284     /*Cycle N1*/
 285     /*Cycle N1*/
 286     /*Cycle N1*/
 287     /*Cycle W */ { IRegWritePorts.rid, 6, 1  }
 288   }
 289 };
 290
 291 static const InstrClassRUsage IEU0ClassRUsage = {
 292   SPARC_IEU0,
 293   /*totCycles*/ 7,
 294
 295   /* maxIssueNum */ 1,
 296   /* isSingleIssue */ false,
 297   /* breaksGroup */ false,
 298   /* numBubbles */ 0,
 299
 300   /*numSlots*/ 3,
 301   /* feasibleSlots[] */ { 0, 1, 2 },
 302
 303   /*numEntries*/ 5,
 304   /* V[] */ {
 305     /*Cycle G */ { AllIssueSlots.rid, 0, 1 },
 306                  { IntIssueSlots.rid, 0, 1 },
 307     /*Cycle E */ { IAluN.rid, 1, 1 },
 308                  { IAlu0.rid, 1, 1 },
 309     /*Cycle C */
 310     /*Cycle N1*/
 311     /*Cycle N1*/
 312     /*Cycle N1*/
 313     /*Cycle W */ { IRegWritePorts.rid, 6, 1 }
 314   }
 315 };
 316
 317 static const InstrClassRUsage IEU1ClassRUsage = {
 318   SPARC_IEU1,
 319   /*totCycles*/ 7,
 320
 321   /* maxIssueNum */ 1,
 322   /* isSingleIssue */ false,
 323   /* breaksGroup */ false,
 324   /* numBubbles */ 0,
 325
 326   /*numSlots*/ 3,
 327   /* feasibleSlots[] */ { 0, 1, 2 },
 328
 329   /*numEntries*/ 5,
 330   /* V[] */ {
 331     /*Cycle G */ { AllIssueSlots.rid, 0, 1 },
 332                { IntIssueSlots.rid, 0, 1 },
 333     /*Cycle E */ { IAluN.rid, 1, 1 },
 334                { IAlu1.rid, 1, 1 },
 335     /*Cycle C */
 336     /*Cycle N1*/
 337     /*Cycle N1*/
 338     /*Cycle N1*/
 339     /*Cycle W */ { IRegWritePorts.rid, 6, 1 }
 340   }
 341 };
 342
 343 static const InstrClassRUsage FPMClassRUsage = {
 344   SPARC_FPM,
 345   /*totCycles*/ 7,
 346
 347   /* maxIssueNum */ 1,
 348   /* isSingleIssue */ false,
 349   /* breaksGroup */ false,
 350   /* numBubbles */ 0,
 351
 352   /*numSlots*/ 4,
 353   /* feasibleSlots[] */ { 0, 1, 2, 3 },
 354
 355   /*numEntries*/ 7,
 356   /* V[] */ {
 357     /*Cycle G */ { AllIssueSlots.rid,   0, 1 },
 358                  { FPMIssueSlots.rid,   0, 1 },
 359     /*Cycle E */ { FPRegReadPorts.rid,  1, 1 },
 360     /*Cycle C */ { FPMAluC1.rid,        2, 1 },
 361     /*Cycle N1*/ { FPMAluC2.rid,        3, 1 },
 362     /*Cycle N1*/ { FPMAluC3.rid,        4, 1 },
 363     /*Cycle N1*/
 364     /*Cycle W */ { FPRegWritePorts.rid, 6, 1 }
 365   }
 366 };
 367
 368 static const InstrClassRUsage FPAClassRUsage = {
 369   SPARC_FPA,
 370   /*totCycles*/ 7,
 371
 372   /* maxIssueNum */ 1,
 373   /* isSingleIssue */ false,
 374   /* breaksGroup */ false,
 375   /* numBubbles */ 0,
 376
 377   /*numSlots*/ 4,
 378   /* feasibleSlots[] */ { 0, 1, 2, 3 },
 379
 380   /*numEntries*/ 7,
 381   /* V[] */ {
 382     /*Cycle G */ { AllIssueSlots.rid,   0, 1 },
 383                  { FPAIssueSlots.rid,   0, 1 },
 384     /*Cycle E */ { FPRegReadPorts.rid,  1, 1 },
 385     /*Cycle C */ { FPAAluC1.rid,        2, 1 },
 386     /*Cycle N1*/ { FPAAluC2.rid,        3, 1 },
 387     /*Cycle N1*/ { FPAAluC3.rid,        4, 1 },
 388     /*Cycle N1*/
 389     /*Cycle W */ { FPRegWritePorts.rid, 6, 1 }
 390   }
 391 };
 392
 393 static const InstrClassRUsage LDClassRUsage = {
 394   SPARC_LD,
 395   /*totCycles*/ 7,
 396
 397   /* maxIssueNum */ 1,
 398   /* isSingleIssue */ false,
 399   /* breaksGroup */ false,
 400   /* numBubbles */ 0,
 401
 402   /*numSlots*/ 3,
 403   /* feasibleSlots[] */ { 0, 1, 2, },
 404
 405   /*numEntries*/ 6,
 406   /* V[] */ {
 407     /*Cycle G */ { AllIssueSlots.rid,    0, 1 },
 408                  { First3IssueSlots.rid, 0, 1 },
 409                  { LSIssueSlots.rid,     0, 1 },
 410     /*Cycle E */ { LSAluC1.rid,          1, 1 },
 411     /*Cycle C */ { LSAluC2.rid,          2, 1 },
 412                  { LdReturn.rid,         2, 1 },
 413     /*Cycle N1*/
 414     /*Cycle N1*/
 415     /*Cycle N1*/
 416     /*Cycle W */ { IRegWritePorts.rid,   6, 1 }
 417   }
 418 };
 419
 420 static const InstrClassRUsage STClassRUsage = {
 421   SPARC_ST,
 422   /*totCycles*/ 7,
 423
 424   /* maxIssueNum */ 1,
 425   /* isSingleIssue */ false,
 426   /* breaksGroup */ false,
 427   /* numBubbles */ 0,
 428
 429   /*numSlots*/ 3,
 430   /* feasibleSlots[] */ { 0, 1, 2 },
 431
 432   /*numEntries*/ 4,
 433   /* V[] */ {
 434     /*Cycle G */ { AllIssueSlots.rid,    0, 1 },
 435                  { First3IssueSlots.rid, 0, 1 },
 436                  { LSIssueSlots.rid,     0, 1 },
 437     /*Cycle E */ { LSAluC1.rid,          1, 1 },
 438     /*Cycle C */ { LSAluC2.rid,          2, 1 }
 439     /*Cycle N1*/
 440     /*Cycle N1*/
 441     /*Cycle N1*/
 442     /*Cycle W */
 443   }
 444 };
 445
 446 static const InstrClassRUsage CTIClassRUsage = {
 447   SPARC_CTI,
 448   /*totCycles*/ 7,
 449
 450   /* maxIssueNum */ 1,
 451   /* isSingleIssue */ false,
 452   /* breaksGroup */ false,
 453   /* numBubbles */ 0,
 454
 455   /*numSlots*/ 4,
 456   /* feasibleSlots[] */ { 0, 1, 2, 3 },
 457
 458   /*numEntries*/ 4,
 459   /* V[] */ {
 460     /*Cycle G */ { AllIssueSlots.rid,    0, 1 },
 461                  { CTIIssueSlots.rid,    0, 1 },
 462     /*Cycle E */ { IAlu0.rid,            1, 1 },
 463     /*Cycles E-C */ { CTIDelayCycle.rid, 1, 2 }
 464     /*Cycle C */
 465     /*Cycle N1*/
 466     /*Cycle N1*/
 467     /*Cycle N1*/
 468     /*Cycle W */
 469   }
 470 };
 471
 472 static const InstrClassRUsage SingleClassRUsage = {
 473   SPARC_SINGLE,
 474   /*totCycles*/ 7,
 475
 476   /* maxIssueNum */ 1,
 477   /* isSingleIssue */ true,
 478   /* breaksGroup */ false,
 479   /* numBubbles */ 0,
 480
 481   /*numSlots*/ 1,
 482   /* feasibleSlots[] */ { 0 },
 483
 484   /*numEntries*/ 5,
 485   /* V[] */ {
 486     /*Cycle G */ { AllIssueSlots.rid,    0, 1 },
 487                  { AllIssueSlots.rid,    0, 1 },
 488                  { AllIssueSlots.rid,    0, 1 },
 489                  { AllIssueSlots.rid,    0, 1 },
 490     /*Cycle E */ { IAlu0.rid,            1, 1 }
 491     /*Cycle C */
 492     /*Cycle N1*/
 493     /*Cycle N1*/
 494     /*Cycle N1*/
 495     /*Cycle W */
 496   }
 497 };
 498
 499
 500 static const InstrClassRUsage SparcRUsageDesc[] = {
 501   NoneClassRUsage,
 502   IEUNClassRUsage,
 503   IEU0ClassRUsage,
 504   IEU1ClassRUsage,
 505   FPMClassRUsage,
 506   FPAClassRUsage,
 507   CTIClassRUsage,
 508   LDClassRUsage,
 509   STClassRUsage,
 510   SingleClassRUsage
 511 };
 512
 513
 514
 515 //---------------------------------------------------------------------------
 516 // const InstrIssueDelta  SparcInstrIssueDeltas[]
 517 //
 518 // Purpose:
 519 //   Changes to issue restrictions information in InstrClassRUsage for
 520 //   instructions that differ from other instructions in their class.
 521 //---------------------------------------------------------------------------
 522
 523 static const InstrIssueDelta  SparcInstrIssueDeltas[] = {
 524
 525   // opCode,  isSingleIssue,  breaksGroup,  numBubbles
 526
 527                                 // Special cases for single-issue only
 528                                 // Other single issue cases are below.
 529 //{ LDDA,       true,   true,   0 },
 530 //{ STDA,       true,   true,   0 },
 531 //{ LDDF,       true,   true,   0 },
 532 //{ LDDFA,      true,   true,   0 },
 533   { ADDC,       true,   true,   0 },
 534   { ADDCcc,     true,   true,   0 },
 535   { SUBC,       true,   true,   0 },
 536   { SUBCcc,     true,   true,   0 },
 537 //{ LDSTUB,     true,   true,   0 },
 538 //{ SWAP,       true,   true,   0 },
 539 //{ SWAPA,      true,   true,   0 },
 540 //{ CAS,        true,   true,   0 },
 541 //{ CASA,       true,   true,   0 },
 542 //{ CASX,       true,   true,   0 },
 543 //{ CASXA,      true,   true,   0 },
 544 //{ LDFSR,      true,   true,   0 },
 545 //{ LDFSRA,     true,   true,   0 },
 546 //{ LDXFSR,     true,   true,   0 },
 547 //{ LDXFSRA,    true,   true,   0 },
 548 //{ STFSR,      true,   true,   0 },
 549 //{ STFSRA,     true,   true,   0 },
 550 //{ STXFSR,     true,   true,   0 },
 551 //{ STXFSRA,    true,   true,   0 },
 552 //{ SAVED,      true,   true,   0 },
 553 //{ RESTORED,   true,   true,   0 },
 554 //{ FLUSH,      true,   true,   9 },
 555 //{ FLUSHW,     true,   true,   9 },
 556 //{ ALIGNADDR,  true,   true,   0 },
 557   { RETURN,     true,   true,   0 },
 558 //{ DONE,       true,   true,   0 },
 559 //{ RETRY,      true,   true,   0 },
 560 //{ TCC,        true,   true,   0 },
 561 //{ SHUTDOWN,   true,   true,   0 },
 562
 563                                 // Special cases for breaking group *before*
 564                                 // CURRENTLY NOT SUPPORTED!
 565   { CALL,       false,  false,  0 },
 566   { JMPLCALL,   false,  false,  0 },
 567   { JMPLRET,    false,  false,  0 },
 568
 569                                 // Special cases for breaking the group *after*
 570   { MULX,       true,   true,   (4+34)/2 },
 571   { FDIVS,      false,  true,   0 },
 572   { FDIVD,      false,  true,   0 },
 573   { FDIVQ,      false,  true,   0 },
 574   { FSQRTS,     false,  true,   0 },
 575   { FSQRTD,     false,  true,   0 },
 576   { FSQRTQ,     false,  true,   0 },
 577 //{ FCMP{LE,GT,NE,EQ}, false, true, 0 },
 578
 579                                 // Instructions that introduce bubbles
 580 //{ MULScc,     true,   true,   2 },
 581 //{ SMULcc,     true,   true,   (4+18)/2 },
 582 //{ UMULcc,     true,   true,   (4+19)/2 },
 583   { SDIVX,      true,   true,   68 },
 584   { UDIVX,      true,   true,   68 },
 585 //{ SDIVcc,     true,   true,   36 },
 586 //{ UDIVcc,     true,   true,   37 },
 587   { WRCCR,      true,   true,   4 },
 588 //{ WRPR,       true,   true,   4 },
 589 //{ RDCCR,      true,   true,   0 }, // no bubbles after, but see below
 590 //{ RDPR,       true,   true,   0 },
 591 };
 592
 593
 594
 595
 596 //---------------------------------------------------------------------------
 597 // const InstrRUsageDelta SparcInstrUsageDeltas[]
 598 //
 599 // Purpose:
 600 //   Changes to resource usage information in InstrClassRUsage for
 601 //   instructions that differ from other instructions in their class.
 602 //---------------------------------------------------------------------------
 603
 604 static const InstrRUsageDelta SparcInstrUsageDeltas[] = {
 605
 606   // MachineOpCode, Resource, Start cycle, Num cycles
 607
 608   //
 609   // JMPL counts as a load/store instruction for issue!
 610   //
 611   { JMPLCALL, LSIssueSlots.rid,  0,  1 },
 612   { JMPLRET,  LSIssueSlots.rid,  0,  1 },
 613
 614   //
 615   // Many instructions cannot issue for the next 2 cycles after an FCMP
 616   // We model that with a fake resource FCMPDelayCycle.
 617   //
 618   { FCMPS,    FCMPDelayCycle.rid, 1, 3 },
 619   { FCMPD,    FCMPDelayCycle.rid, 1, 3 },
 620   { FCMPQ,    FCMPDelayCycle.rid, 1, 3 },
 621
 622   { MULX,     FCMPDelayCycle.rid, 1, 1 },
 623   { SDIVX,    FCMPDelayCycle.rid, 1, 1 },
 624   { UDIVX,    FCMPDelayCycle.rid, 1, 1 },
 625 //{ SMULcc,   FCMPDelayCycle.rid, 1, 1 },
 626 //{ UMULcc,   FCMPDelayCycle.rid, 1, 1 },
 627 //{ SDIVcc,   FCMPDelayCycle.rid, 1, 1 },
 628 //{ UDIVcc,   FCMPDelayCycle.rid, 1, 1 },
 629   { STD,      FCMPDelayCycle.rid, 1, 1 },
 630   { FMOVRSZ,  FCMPDelayCycle.rid, 1, 1 },
 631   { FMOVRSLEZ,FCMPDelayCycle.rid, 1, 1 },
 632   { FMOVRSLZ, FCMPDelayCycle.rid, 1, 1 },
 633   { FMOVRSNZ, FCMPDelayCycle.rid, 1, 1 },
 634   { FMOVRSGZ, FCMPDelayCycle.rid, 1, 1 },
 635   { FMOVRSGEZ,FCMPDelayCycle.rid, 1, 1 },
 636
 637   //
 638   // Some instructions are stalled in the GROUP stage if a CTI is in
 639   // the E or C stage.  We model that with a fake resource CTIDelayCycle.
 640   //
 641   { LDD,      CTIDelayCycle.rid,  1, 1 },
 642 //{ LDDA,     CTIDelayCycle.rid,  1, 1 },
 643 //{ LDDSTUB,  CTIDelayCycle.rid,  1, 1 },
 644 //{ LDDSTUBA, CTIDelayCycle.rid,  1, 1 },
 645 //{ SWAP,     CTIDelayCycle.rid,  1, 1 },
 646 //{ SWAPA,    CTIDelayCycle.rid,  1, 1 },
 647 //{ CAS,      CTIDelayCycle.rid,  1, 1 },
 648 //{ CASA,     CTIDelayCycle.rid,  1, 1 },
 649 //{ CASX,     CTIDelayCycle.rid,  1, 1 },
 650 //{ CASXA,    CTIDelayCycle.rid,  1, 1 },
 651
 652   //
 653   // Signed int loads of less than dword size return data in cycle N1 (not C)
 654   // and put all loads in consecutive cycles into delayed load return mode.
 655   //
 656   { LDSB,    LdReturn.rid,  2, -1 },
 657   { LDSB,    LdReturn.rid,  3,  1 },
 658
 659   { LDSH,    LdReturn.rid,  2, -1 },
 660   { LDSH,    LdReturn.rid,  3,  1 },
 661
 662   { LDSW,    LdReturn.rid,  2, -1 },
 663   { LDSW,    LdReturn.rid,  3,  1 },
 664
 665   //
 666   // RDPR from certain registers and RD from any register are not dispatchable
 667   // until four clocks after they reach the head of the instr. buffer.
 668   // Together with their single-issue requirement, this means all four issue
 669   // slots are effectively blocked for those cycles, plus the issue cycle.
 670   // This does not increase the latency of the instruction itself.
 671   //
 672   { RDCCR,   AllIssueSlots.rid,     0,  5 },
 673   { RDCCR,   AllIssueSlots.rid,     0,  5 },
 674   { RDCCR,   AllIssueSlots.rid,     0,  5 },
 675   { RDCCR,   AllIssueSlots.rid,     0,  5 },
 676
 677 #undef EXPLICIT_BUBBLES_NEEDED
 678 #ifdef EXPLICIT_BUBBLES_NEEDED
 679   //
 680   // MULScc inserts one bubble.
 681   // This means it breaks the current group (captured in UltraSparcSchedInfo)
 682   // *and occupies all issue slots for the next cycle
 683   //
 684 //{ MULScc,  AllIssueSlots.rid, 2, 2-1 },
 685 //{ MULScc,  AllIssueSlots.rid, 2, 2-1 },
 686 //{ MULScc,  AllIssueSlots.rid, 2, 2-1 },
 687 //{ MULScc,  AllIssueSlots.rid,  2, 2-1 },
 688
 689   //
 690   // SMULcc inserts between 4 and 18 bubbles, depending on #leading 0s in rs1.
 691   // We just model this with a simple average.
 692   //
 693 //{ SMULcc,  AllIssueSlots.rid, 2, ((4+18)/2)-1 },
 694 //{ SMULcc,  AllIssueSlots.rid, 2, ((4+18)/2)-1 },
 695 //{ SMULcc,  AllIssueSlots.rid, 2, ((4+18)/2)-1 },
 696 //{ SMULcc,  AllIssueSlots.rid,  2, ((4+18)/2)-1 },
 697
 698   // SMULcc inserts between 4 and 19 bubbles, depending on #leading 0s in rs1.
 699 //{ UMULcc,  AllIssueSlots.rid, 2, ((4+19)/2)-1 },
 700 //{ UMULcc,  AllIssueSlots.rid, 2, ((4+19)/2)-1 },
 701 //{ UMULcc,  AllIssueSlots.rid, 2, ((4+19)/2)-1 },
 702 //{ UMULcc,  AllIssueSlots.rid,  2, ((4+19)/2)-1 },
 703
 704   //
 705   // MULX inserts between 4 and 34 bubbles, depending on #leading 0s in rs1.
 706   //
 707   { MULX,    AllIssueSlots.rid, 2, ((4+34)/2)-1 },
 708   { MULX,    AllIssueSlots.rid, 2, ((4+34)/2)-1 },
 709   { MULX,    AllIssueSlots.rid, 2, ((4+34)/2)-1 },
 710   { MULX,    AllIssueSlots.rid,  2, ((4+34)/2)-1 },
 711
 712   //
 713   // SDIVcc inserts 36 bubbles.
 714   //
 715 //{ SDIVcc,  AllIssueSlots.rid, 2, 36-1 },
 716 //{ SDIVcc,  AllIssueSlots.rid, 2, 36-1 },
 717 //{ SDIVcc,  AllIssueSlots.rid, 2, 36-1 },
 718 //{ SDIVcc,  AllIssueSlots.rid,  2, 36-1 },
 719
 720   // UDIVcc inserts 37 bubbles.
 721 //{ UDIVcc,  AllIssueSlots.rid, 2, 37-1 },
 722 //{ UDIVcc,  AllIssueSlots.rid, 2, 37-1 },
 723 //{ UDIVcc,  AllIssueSlots.rid, 2, 37-1 },
 724 //{ UDIVcc,  AllIssueSlots.rid,  2, 37-1 },
 725
 726   //
 727   // SDIVX inserts 68 bubbles.
 728   //
 729   { SDIVX,   AllIssueSlots.rid, 2, 68-1 },
 730   { SDIVX,   AllIssueSlots.rid, 2, 68-1 },
 731   { SDIVX,   AllIssueSlots.rid, 2, 68-1 },
 732   { SDIVX,   AllIssueSlots.rid,  2, 68-1 },
 733
 734   //
 735   // UDIVX inserts 68 bubbles.
 736   //
 737   { UDIVX,   AllIssueSlots.rid, 2, 68-1 },
 738   { UDIVX,   AllIssueSlots.rid, 2, 68-1 },
 739   { UDIVX,   AllIssueSlots.rid, 2, 68-1 },
 740   { UDIVX,   AllIssueSlots.rid,  2, 68-1 },
 741
 742   //
 743   // WR inserts 4 bubbles.
 744   //
 745 //{ WR,     AllIssueSlots.rid, 2, 68-1 },
 746 //{ WR,     AllIssueSlots.rid, 2, 68-1 },
 747 //{ WR,     AllIssueSlots.rid, 2, 68-1 },
 748 //{ WR,     AllIssueSlots.rid,  2, 68-1 },
 749
 750   //
 751   // WRPR inserts 4 bubbles.
 752   //
 753 //{ WRPR,   AllIssueSlots.rid, 2, 68-1 },
 754 //{ WRPR,   AllIssueSlots.rid, 2, 68-1 },
 755 //{ WRPR,   AllIssueSlots.rid, 2, 68-1 },
 756 //{ WRPR,   AllIssueSlots.rid,  2, 68-1 },
 757
 758   //
 759   // DONE inserts 9 bubbles.
 760   //
 761 //{ DONE,   AllIssueSlots.rid, 2, 9-1 },
 762 //{ DONE,   AllIssueSlots.rid, 2, 9-1 },
 763 //{ DONE,   AllIssueSlots.rid, 2, 9-1 },
 764 //{ DONE,   AllIssueSlots.rid, 2, 9-1 },
 765
 766   //
 767   // RETRY inserts 9 bubbles.
 768   //
 769 //{ RETRY,   AllIssueSlots.rid, 2, 9-1 },
 770 //{ RETRY,   AllIssueSlots.rid, 2, 9-1 },
 771 //{ RETRY,   AllIssueSlots.rid, 2, 9-1 },
 772 //{ RETRY,   AllIssueSlots.rid,  2, 9-1 },
 773
 774 #endif  /*EXPLICIT_BUBBLES_NEEDED */
 775 };
 776
 777 // Additional delays to be captured in code:
 778 // 1. RDPR from several state registers (page 349)
 779 // 2. RD   from *any* register (page 349)
 780 // 3. Writes to TICK, PSTATE, TL registers and FLUSH{W} instr (page 349)
 781 // 4. Integer store can be in same group as instr producing value to store.
 782 // 5. BICC and BPICC can be in the same group as instr producing CC (pg 350)
 783 // 6. FMOVr cannot be in the same or next group as an IEU instr (pg 351).
 784 // 7. The second instr. of a CTI group inserts 9 bubbles (pg 351)
  785 // 8. WR{PR}, SAVE, SAVED, RESTORE, RESTORED, RETURN, RETRY, and DONE that
 786 //    follow an annulling branch cannot be issued in the same group or in
 787 //    the 3 groups following the branch.
 788 // 9. A predicted annulled load does not stall dependent instructions.
 789 //    Other annulled delay slot instructions *do* stall dependents, so
 790 //    nothing special needs to be done for them during scheduling.
 791 //10. Do not put a load use that may be annulled in the same group as the
 792 //    branch.  The group will stall until the load returns.
 793 //11. Single-prec. FP loads lock 2 registers, for dependency checking.
 794 //
 795 //
 796 // Additional delays we cannot or will not capture:
 797 // 1. If DCTI is last word of cache line, it is delayed until next line can be
 798 //    fetched.  Also, other DCTI alignment-related delays (pg 352)
 799 // 2. Load-after-store is delayed by 7 extra cycles if load hits in D-Cache.
 800 //    Also, several other store-load and load-store conflicts (pg 358)
 801 // 3. MEMBAR, LD{X}FSR, LDD{A} and a bunch of other load stalls (pg 358)
 802 // 4. There can be at most 8 outstanding buffered store instructions
 803 //     (including some others like MEMBAR, LDSTUB, CAS{AX}, and FLUSH)
 804
 805
 806
 807 //---------------------------------------------------------------------------
 808 // class UltraSparcSchedInfo
 809 //
 810 // Purpose:
 811 //   Scheduling information for the UltraSPARC.
 812 //   Primarily just initializes machine-dependent parameters in
 813 //   class MachineSchedInfo.
 814 //---------------------------------------------------------------------------
 815
 816 /*ctor*/
 817 UltraSparcSchedInfo::UltraSparcSchedInfo(const TargetMachine& tgt)
 818   : MachineSchedInfo(tgt,
 819                      (unsigned int) SPARC_NUM_SCHED_CLASSES,
 820                      SparcRUsageDesc,
 821                      SparcInstrUsageDeltas,
 822                      SparcInstrIssueDeltas,
 823                      sizeof(SparcInstrUsageDeltas)/sizeof(InstrRUsageDelta),
 824                      sizeof(SparcInstrIssueDeltas)/sizeof(InstrIssueDelta))
 825 {
 826   maxNumIssueTotal = 4;
 827   longestIssueConflict = 0;             // computed from issuesGaps[]
 828
 829   branchMispredictPenalty = 4;          // 4 for SPARC IIi
 830   branchTargetUnknownPenalty = 2;       // 2 for SPARC IIi
 831   l1DCacheMissPenalty = 8;              // 7 or 9 for SPARC IIi
 832   l1ICacheMissPenalty = 8;              // ? for SPARC IIi
 833
 834   inOrderLoads = true;                  // true for SPARC IIi
 835   inOrderIssue = true;                  // true for SPARC IIi
 836   inOrderExec  = false;                 // false for most architectures
 837   inOrderRetire= true;                  // true for most architectures
 838
 839   // must be called after above parameters are initialized.
 840   this->initializeResources();
 841 }
 842
 843 void
 844 UltraSparcSchedInfo::initializeResources()
 845 {
 846   // Compute MachineSchedInfo::instrRUsages and MachineSchedInfo::issueGaps
 847   MachineSchedInfo::initializeResources();
 848
 849   // Machine-dependent fixups go here.  None for now.
 850 }
 851
 852
 853 //---------------------------------------------------------------------------
 854 // class UltraSparcFrameInfo
 855 //
 856 // Purpose:
 857 //   Interface to stack frame layout info for the UltraSPARC.
 858 //   Starting offsets for each area of the stack frame are aligned at
 859 //   a multiple of getStackFrameSizeAlignment().
 860 //---------------------------------------------------------------------------
 861
 862 int
 863 UltraSparcFrameInfo::getFirstAutomaticVarOffset(MachineCodeForMethod& ,
 864                                                 bool& pos) const
 865 {
 866   pos = false;                          // static stack area grows downwards
 867   return StaticAreaOffsetFromFP;
 868 }
 869
 870 int
 871 UltraSparcFrameInfo::getRegSpillAreaOffset(MachineCodeForMethod& mcInfo,
 872                                            bool& pos) const
 873 {
 874   pos = false;                          // static stack area grows downwards
 875   unsigned int autoVarsSize = mcInfo.getAutomaticVarsSize();
 876   if (int mod = autoVarsSize % getStackFrameSizeAlignment())
 877     autoVarsSize += (getStackFrameSizeAlignment() - mod);
 878   return StaticAreaOffsetFromFP - autoVarsSize;
 879 }
 880
 881 int
 882 UltraSparcFrameInfo::getTmpAreaOffset(MachineCodeForMethod& mcInfo,
 883                                       bool& pos) const
 884 {
 885   pos = false;                          // static stack area grows downwards
 886   unsigned int autoVarsSize = mcInfo.getAutomaticVarsSize();
 887   unsigned int spillAreaSize = mcInfo.getRegSpillsSize();
 888   int offset = autoVarsSize + spillAreaSize;
 889   if (int mod = offset % getStackFrameSizeAlignment())
 890     offset += (getStackFrameSizeAlignment() - mod);
 891   return StaticAreaOffsetFromFP - offset;
 892 }
 893
 894 int
 895 UltraSparcFrameInfo::getDynamicAreaOffset(MachineCodeForMethod& mcInfo,
 896                                           bool& pos) const
 897 {
 898   // dynamic stack area grows downwards starting at top of opt-args area
 899   unsigned int optArgsSize = mcInfo.getMaxOptionalArgsSize();
 900   int offset = optArgsSize + FirstOptionalOutgoingArgOffsetFromSP;
 901   assert(offset % getStackFrameSizeAlignment() == 0);
 902   return offset;
 903 }
 904
 905
 906 //---------------------------------------------------------------------------
// class UltraSparc
 908 //
 909 // Purpose:
 910 //   Primary interface to machine description for the UltraSPARC.
 911 //   Primarily just initializes machine-dependent parameters in
 912 //   class TargetMachine, and creates machine-dependent subclasses
 913 //   for classes such as MachineInstrInfo.
 914 //
 915 //---------------------------------------------------------------------------
 916
 917 UltraSparc::UltraSparc()
 918   : TargetMachine("UltraSparc-Native"),
 919     instrInfo(*this),
 920     schedInfo(*this),
 921     regInfo(*this),
 922     frameInfo(*this),
 923     cacheInfo(*this)
 924 {
 925   optSizeForSubWordData = 4;
 926   minMemOpWordSize = 8;
 927   maxAtomicMemOpWordSize = 8;
 928 }
 929
 930
 931
 932 //===---------------------------------------------------------------------===//
 933 // GenerateCodeForTarget Pass
 934 //
 935 // Native code generation for a specified target.
 936 //===---------------------------------------------------------------------===//
 937
 938 class ConstructMachineCodeForMethod : public MethodPass {
 939   TargetMachine &Target;
 940 public:
 941   inline ConstructMachineCodeForMethod(TargetMachine &T) : Target(T) {}
 942   bool runOnMethod(Method *M) {
 943     MachineCodeForMethod::construct(M, Target);
 944     return false;
 945   }
 946 };
 947
 948 class InstructionSelection : public MethodPass {
 949   TargetMachine &Target;
 950 public:
 951   inline InstructionSelection(TargetMachine &T) : Target(T) {}
 952   bool runOnMethod(Method *M) {
 953     if (SelectInstructionsForMethod(M, Target))
 954       cerr << "Instr selection failed for method " << M->getName() << "\n";
 955     return false;
 956   }
 957 };
 958
 959 class InstructionScheduling : public MethodPass {
 960   TargetMachine &Target;
 961 public:
 962   inline InstructionScheduling(TargetMachine &T) : Target(T) {}
 963   bool runOnMethod(Method *M) {
 964     if (ScheduleInstructionsWithSSA(M, Target))
 965       cerr << "Instr scheduling failed for method " << M->getName() << "\n\n";
 966     return false;
 967   }
 968 };
 969
 970 struct FreeMachineCodeForMethod : public MethodPass {
 971   static void freeMachineCode(Instruction *I) {
 972     MachineCodeForInstruction::destroy(I);
 973   }
 974
 975   bool runOnMethod(Method *M) {
 976     for_each(M->inst_begin(), M->inst_end(), freeMachineCode);
 977     // Don't destruct MachineCodeForMethod - The global printer needs it
 978     //MachineCodeForMethod::destruct(M);
 979     return false;
 980   }
 981 };
 982
 983
 984 void UltraSparc::addPassesToEmitAssembly(PassManager &PM, std::ostream &Out) {
 985   // Construct and initialize the MachineCodeForMethod object for this method.
 986   PM.add(new ConstructMachineCodeForMethod(*this));
 987
 988   PM.add(new InstructionSelection(*this));
 989
 990   //PM.add(new InstructionScheduling(*this));
 991
 992   PM.add(new RegisterAllocation(*this));
 993
 994   //PM.add(new OptimizeLeafProcedures());
 995   //PM.add(new DeleteFallThroughBranches());
 996   //PM.add(new RemoveChainedBranches());    // should be folded with previous
 997   //PM.add(new RemoveRedundantOps());       // operations with %g0, NOP, etc.
 998
 999   PM.add(new InsertPrologEpilogCode(*this));
1000
1001   // Output assembly language to the .s file.  Assembly emission is split into
1002   // two parts: Method output and Global value output.  This is because method
1003   // output is pipelined with all of the rest of code generation stuff,
1004   // allowing machine code representations for methods to be free'd after the
1005   // method has been emitted.
1006   //
1007   PM.add(getMethodAsmPrinterPass(PM, Out));
1008   PM.add(new FreeMachineCodeForMethod());  // Free stuff no longer needed
1009
1010   // Emit Module level assembly after all of the methods have been processed.
1011   PM.add(getModuleAsmPrinterPass(PM, Out));
1012 }