#include "AMDGPUInstrInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPURegisterInfo.h"
-#include "AMDILDevices.h"
#include "R600InstrInfo.h"
+#include "SIISelLowering.h"
#include "llvm/ADT/ValueMap.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Support/Compiler.h"
-#include "llvm/CodeGen/SelectionDAG.h"
#include <list>
#include <queue>
SDNode *Select(SDNode *N);
virtual const char *getPassName() const;
+ virtual void PostprocessISelDAG();
private:
inline SDValue getSmallIPtrImm(unsigned Imm);
+ bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
+ const R600InstrInfo *TII, std::vector<unsigned> Consts);
bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+ bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
// Complex pattern selectors
bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
static bool checkType(const Value *ptr, unsigned int addrspace);
- static const Value *getBasePointerValue(const Value *V);
static bool isGlobalStore(const StoreSDNode *N);
static bool isPrivateStore(const StoreSDNode *N);
static bool isLocalStore(const StoreSDNode *N);
static bool isRegionStore(const StoreSDNode *N);
- static bool isCPLoad(const LoadSDNode *N);
- static bool isConstantLoad(const LoadSDNode *N, int cbID);
- static bool isGlobalLoad(const LoadSDNode *N);
- static bool isParamLoad(const LoadSDNode *N);
- static bool isPrivateLoad(const LoadSDNode *N);
- static bool isLocalLoad(const LoadSDNode *N);
- static bool isRegionLoad(const LoadSDNode *N);
+ bool isCPLoad(const LoadSDNode *N) const;
+ bool isConstantLoad(const LoadSDNode *N, int cbID) const;
+ bool isGlobalLoad(const LoadSDNode *N) const;
+ bool isParamLoad(const LoadSDNode *N) const;
+ bool isPrivateLoad(const LoadSDNode *N) const;
+ bool isLocalLoad(const LoadSDNode *N) const;
+ bool isRegionLoad(const LoadSDNode *N) const;
bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
bool SelectGlobalValueVariableOffset(SDValue Addr,
SDValue &BaseReg, SDValue& Offset);
- bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset);
- bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset);
bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
+ bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
// Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
return new AMDGPUDAGToDAGISel(TM);
}
-AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM
- )
+AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
: SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) {
}
}
switch (Opc) {
default: break;
- case ISD::FrameIndex: {
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N)) {
- unsigned int FI = FIN->getIndex();
- EVT OpVT = N->getValueType(0);
- unsigned int NewOpc = AMDGPU::COPY;
- SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32);
- return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI);
+ case ISD::BUILD_VECTOR: {
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+ if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ break;
}
- break;
+ // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF plus 4 INSERT_SUBREG
+ // nodes, which ends up adding a 128-bit register copy when going through the
+ // TwoAddressInstructions pass. We want to avoid 128-bit copies as much as
+ // possible because they can't be bundled by our scheduler.
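+ // Instead we select straight to a REG_SEQUENCE; e.g. for a v4i32
+ // BUILD_VECTOR (a, b, c, d) we emit:
+ //   REG_SEQUENCE R600_Reg128, a, sub0, b, sub1, c, sub2, d, sub3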
+ SDValue RegSeqArgs[9] = {
+ CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
+ SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+ SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
+ SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
+ SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
+ };
+ bool IsRegSeq = true;
+ for (unsigned i = 0; i < N->getNumOperands(); i++) {
+ if (isa<RegisterSDNode>(N->getOperand(i))) {
+ IsRegSeq = false;
+ break;
+ }
+ RegSeqArgs[2 * i + 1] = N->getOperand(i);
+ }
+ if (!IsRegSeq)
+ break;
+ return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
+ RegSeqArgs, 2 * N->getNumOperands() + 1);
}
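+ // On SI, a BUILD_PAIR is selected to a REG_SEQUENCE that places the two
+ // halves into the sub-registers of a 64-bit or 128-bit scalar register.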
+ case ISD::BUILD_PAIR: {
+ SDValue RC, SubReg0, SubReg1;
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ break;
+ }
+ if (N->getValueType(0) == MVT::i128) {
+ RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
+ SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
+ SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
+ } else if (N->getValueType(0) == MVT::i64) {
+ RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32);
+ SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
+ SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
+ } else {
+ llvm_unreachable("Unhandled value type for BUILD_PAIR");
+ }
+ const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
+ N->getOperand(1), SubReg1 };
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
+ SDLoc(N), N->getValueType(0), Ops);
+ }
+
case ISD::ConstantFP:
case ISD::Constant: {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
// XXX: Custom immediate lowering not implemented yet. Instead we use
// pseudo instructions defined in SIInstructions.td
- if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+ if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
break;
}
const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
continue;
}
} else {
- if (!TII->isALUInstr(Use->getMachineOpcode())) {
+ if (!TII->isALUInstr(Use->getMachineOpcode()) ||
+ (TII->get(Use->getMachineOpcode()).TSFlags &
+ R600_InstFlag::VECTOR)) {
continue;
}
- int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM);
- assert(ImmIdx != -1);
+ int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
+ AMDGPU::OpName::literal);
+ if (ImmIdx == -1) {
+ continue;
+ }
- // subtract one from ImmIdx, because the DST operand is usually index
- // 0 for MachineInstrs, but we have no DST in the Ops vector.
- ImmIdx--;
+ if (TII->getOperandIdx(Use->getMachineOpcode(),
+ AMDGPU::OpName::dst) != -1) {
+ // subtract one from ImmIdx, because the DST operand is usually index
+ // 0 for MachineInstrs, but we have no DST in the Ops vector.
+ ImmIdx--;
+ }
// Check that we aren't already using an immediate.
// XXX: It's possible for an instruction to have more than one
// Fold operands of selected node
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
- if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
const R600InstrInfo *TII =
static_cast<const R600InstrInfo*>(TM.getInstrInfo());
- if (Result && TII->isALUInstr(Result->getMachineOpcode())) {
+ if (Result && Result->isMachineOpcode() &&
+     Result->getMachineOpcode() == AMDGPU::DOT_4) {
+ bool IsModified = false;
+ do {
+ std::vector<SDValue> Ops;
+ for (SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
+      I != E; ++I)
+ Ops.push_back(*I);
+ IsModified = FoldDotOperands(Result->getMachineOpcode(), TII, Ops);
+ if (IsModified) {
+ Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
+ }
+ } while (IsModified);
+
+ }
+ if (Result && Result->isMachineOpcode() &&
+     !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR) &&
+     TII->hasInstrModifiers(Result->getMachineOpcode())) {
+ // Fold FNEG/FABS/CONST_ADDRESS.
+ // TODO: ISel can generate multiple MachineInstrs; we need to recursively
+ // parse Result.
bool IsModified = false;
do {
std::vector<SDValue> Ops;
Ops.push_back(*I);
IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
if (IsModified) {
- Result = CurDAG->MorphNodeTo(Result, Result->getOpcode(),
- Result->getVTList(), Ops.data(), Ops.size());
+ Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
}
} while (IsModified);
+
+ // If the node has a single use which is CLAMP_R600, fold it.
+ if (Result->hasOneUse() && Result->isMachineOpcode()) {
+ SDNode *PotentialClamp = *Result->use_begin();
+ if (PotentialClamp->isMachineOpcode() &&
+ PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) {
+ unsigned ClampIdx =
+ TII->getOperandIdx(Result->getMachineOpcode(), AMDGPU::OpName::clamp);
+ std::vector<SDValue> Ops;
+ unsigned NumOp = Result->getNumOperands();
+ for (unsigned i = 0; i < NumOp; ++i) {
+ Ops.push_back(Result->getOperand(i));
+ }
+ Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32);
+ Result = CurDAG->SelectNodeTo(PotentialClamp,
+ Result->getMachineOpcode(), PotentialClamp->getVTList(),
+ Ops.data(), NumOp);
+ }
+ }
}
}
return Result;
}
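+// Try to fold one source operand of an R600 ALU instruction in place:
+//  * a CONST_ADDRESS with a constant offset becomes an ALU_CONST read through
+//    Sel, provided the constant-read limitations are still satisfied;
+//  * FNEG and FABS set the Neg/Abs modifier operands instead;
+//  * a BITCAST is simply looked through.
+// Returns true if Src (and the matching modifier operand) was rewritten.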
+bool AMDGPUDAGToDAGISel::FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg,
+ SDValue &Abs, const R600InstrInfo *TII,
+ std::vector<unsigned> Consts) {
+ switch (Src.getOpcode()) {
+ case AMDGPUISD::CONST_ADDRESS: {
+ SDValue CstOffset;
+ if (Src.getValueType().isVector() ||
+ !SelectGlobalValueConstantOffset(Src.getOperand(0), CstOffset))
+ return false;
+
+ ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
+ Consts.push_back(Cst->getZExtValue());
+ if (!TII->fitsConstReadLimitations(Consts))
+ return false;
+
+ Src = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
+ Sel = CstOffset;
+ return true;
+ }
+ case ISD::FNEG:
+ Src = Src.getOperand(0);
+ Neg = CurDAG->getTargetConstant(1, MVT::i32);
+ return true;
+ case ISD::FABS:
+ if (!Abs.getNode())
+ return false;
+ Src = Src.getOperand(0);
+ Abs = CurDAG->getTargetConstant(1, MVT::i32);
+ return true;
+ case ISD::BITCAST:
+ Src = Src.getOperand(0);
+ return true;
+ default:
+ return false;
+ }
+}
+
bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
int OperandIdx[] = {
- TII->getOperandIdx(Opcode, R600Operands::SRC0),
- TII->getOperandIdx(Opcode, R600Operands::SRC1),
- TII->getOperandIdx(Opcode, R600Operands::SRC2)
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
};
int SelIdx[] = {
- TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL),
- TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL),
- TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL)
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_sel)
+ };
+ int NegIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
+ };
+ int AbsIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
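+ // No instruction provides a src2_abs operand; only two-source (OP2)
+ // instructions have abs modifiers.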
+ -1
};
+
+ // Gather the constant values already read by this instruction.
+ std::vector<unsigned> Consts;
+ for (unsigned j = 0; j < 3; j++) {
+ int SrcIdx = OperandIdx[j];
+ if (SrcIdx < 0)
+ break;
+ if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
+ if (Reg->getReg() == AMDGPU::ALU_CONST) {
+ ConstantSDNode *Cst = cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
+ Consts.push_back(Cst->getZExtValue());
+ }
+ }
+ }
+
for (unsigned i = 0; i < 3; i++) {
if (OperandIdx[i] < 0)
return false;
- SDValue Operand = Ops[OperandIdx[i] - 1];
- switch (Operand.getOpcode()) {
- case AMDGPUISD::CONST_ADDRESS: {
- SDValue CstOffset;
- if (!Operand.getValueType().isVector() &&
- SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) {
- Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
- Ops[SelIdx[i] - 1] = CstOffset;
- return true;
- }
- }
- break;
- case ISD::BITCAST:
- Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
+ SDValue &Src = Ops[OperandIdx[i] - 1];
+ SDValue &Sel = Ops[SelIdx[i] - 1];
+ SDValue &Neg = Ops[NegIdx[i] - 1];
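+ // Instructions without an abs operand get a null FakeAbs slot here;
+ // FoldOperand then refuses to fold an FABS into them.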
+ SDValue FakeAbs;
+ SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
+ if (FoldOperand(Src, Sel, Neg, Abs, TII, Consts))
return true;
- default:
+ }
+ return false;
+}
+
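+// DOT_4 reads eight scalar sources (src0/src1 for each of the X, Y, Z and W
+// channels), each with its own sel/neg/abs operands; apply the same
+// per-source folding as FoldOperands to all of them.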
+bool AMDGPUDAGToDAGISel::FoldDotOperands(unsigned Opcode,
+ const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
+ int OperandIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+ };
+ int SelIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_W),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_W)
+ };
+ int NegIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
+ };
+ int AbsIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
+ };
+
+ // Gather the constant values already read by this instruction.
+ std::vector<unsigned> Consts;
+ for (unsigned j = 0; j < 8; j++) {
+ int SrcIdx = OperandIdx[j];
+ if (SrcIdx < 0)
break;
+ if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
+ if (Reg->getReg() == AMDGPU::ALU_CONST) {
+ ConstantSDNode *Cst = cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
+ Consts.push_back(Cst->getZExtValue());
+ }
}
}
+
+ for (unsigned i = 0; i < 8; i++) {
+ if (OperandIdx[i] < 0)
+ return false;
+ SDValue &Src = Ops[OperandIdx[i] - 1];
+ SDValue &Sel = Ops[SelIdx[i] - 1];
+ SDValue &Neg = Ops[NegIdx[i] - 1];
+ SDValue &Abs = Ops[AbsIdx[i] - 1];
+ if (FoldOperand(Src, Sel, Neg, Abs, TII, Consts))
+ return true;
+ }
return false;
}
return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace;
}
-const Value * AMDGPUDAGToDAGISel::getBasePointerValue(const Value *V) {
- if (!V) {
- return NULL;
- }
- const Value *ret = NULL;
- ValueMap<const Value *, bool> ValueBitMap;
- std::queue<const Value *, std::list<const Value *> > ValueQueue;
- ValueQueue.push(V);
- while (!ValueQueue.empty()) {
- V = ValueQueue.front();
- if (ValueBitMap.find(V) == ValueBitMap.end()) {
- ValueBitMap[V] = true;
- if (dyn_cast<Argument>(V) && dyn_cast<PointerType>(V->getType())) {
- ret = V;
- break;
- } else if (dyn_cast<GlobalVariable>(V)) {
- ret = V;
- break;
- } else if (dyn_cast<Constant>(V)) {
- const ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
- if (CE) {
- ValueQueue.push(CE->getOperand(0));
- }
- } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
- ret = AI;
- break;
- } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
- uint32_t numOps = I->getNumOperands();
- for (uint32_t x = 0; x < numOps; ++x) {
- ValueQueue.push(I->getOperand(x));
- }
- } else {
- assert(!"Found a Value that we didn't know how to handle!");
- }
- }
- ValueQueue.pop();
- }
- return ret;
-}
-
bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
}
return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
}
-bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
+bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) const {
if (checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)) {
return true;
}
+
+ const DataLayout *DL = TM.getDataLayout();
MachineMemOperand *MMO = N->getMemOperand();
const Value *V = MMO->getValue();
- const Value *BV = getBasePointerValue(V);
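+ // GetUnderlyingObject with MaxLookup == 0 strips casts and GEPs with no
+ // step limit, matching the unbounded walk getBasePointerValue used to do.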
+ const Value *BV = GetUnderlyingObject(V, DL, 0);
if (MMO
&& MMO->getValue()
&& ((V && dyn_cast<GlobalValue>(V))
|| (BV && dyn_cast<GlobalValue>(
- getBasePointerValue(MMO->getValue()))))) {
+ GetUnderlyingObject(MMO->getValue(), DL, 0))))) {
return checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS);
} else {
return false;
}
}
-bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) {
+bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
}
-bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) {
+bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const {
return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS);
}
-bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) {
+bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const {
return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
}
-bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) {
+bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const {
return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
}
-bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
+bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
MachineMemOperand *MMO = N->getMemOperand();
if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
if (MMO) {
return false;
}
-bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) {
+bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
// Check to make sure we are not a constant pool load or a constant load
// that is marked as a private load
return false;
}
-bool AMDGPUDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base,
- SDValue& Offset) {
- if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
- Addr.getOpcode() == ISD::TargetGlobalAddress) {
- return false;
- }
-
-
- if (Addr.getOpcode() == ISD::ADD) {
- bool Match = false;
-
- // Find the base ptr and the offset
- for (unsigned i = 0; i < Addr.getNumOperands(); i++) {
- SDValue Arg = Addr.getOperand(i);
- ConstantSDNode * OffsetNode = dyn_cast<ConstantSDNode>(Arg);
- // This arg isn't a constant so it must be the base PTR.
- if (!OffsetNode) {
- Base = Addr.getOperand(i);
- continue;
- }
- // Check if the constant argument fits in 8-bits. The offset is in bytes
- // so we need to convert it to dwords.
- if (isUInt<8>(OffsetNode->getZExtValue() >> 2)) {
- Match = true;
- Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue() >> 2,
- MVT::i32);
- }
- }
- return Match;
- }
-
- // Default case, no offset
- Base = Addr;
- Offset = CurDAG->getTargetConstant(0, MVT::i32);
- return true;
-}
-
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
SDValue &Offset) {
ConstantSDNode * IMMOffset;
} else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
&& isInt<16>(IMMOffset->getZExtValue())) {
Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
- CurDAG->getEntryNode().getDebugLoc(),
+ SDLoc(CurDAG->getEntryNode()),
AMDGPU::ZERO, MVT::i32);
Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
return true;
return true;
}
-bool AMDGPUDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base,
- SDValue& Offset) {
- if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
- Addr.getOpcode() == ISD::TargetGlobalAddress ||
- Addr.getOpcode() != ISD::ADD) {
- return false;
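+// Match the (base + constant offset) form used for indirect addressing: a
+// bare constant is an offset from INDIRECT_BASE_ADDR, an ADD/OR with a
+// constant RHS splits into (base, offset), and anything else is (Addr, 0).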
+bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ ConstantSDNode *C;
+
+ if ((C = dyn_cast<ConstantSDNode>(Addr))) {
+ Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+ } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+ Base = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+ } else {
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, MVT::i32);
}
- Base = Addr.getOperand(0);
- Offset = Addr.getOperand(1);
-
return true;
}
+
+void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
+
+ if (Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ return;
+ }
+
+ // Go over all selected nodes and try to fold them a bit more
+ const AMDGPUTargetLowering &Lowering =
+     *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ++I) {
+
+ SDNode *Node = I;
+ switch (Node->getOpcode()) {
+ // Fix the register class in CopyToReg nodes: ISel will always use SReg
+ // classes for 64-bit copies, but this is not always what we want.
+ case ISD::CopyToReg: {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ SDValue Val = Node->getOperand(2);
+ const TargetRegisterClass *RC = RegInfo->getRegClass(Reg);
+ if (RC != &AMDGPU::SReg_64RegClass) {
+ continue;
+ }
+
+ if (!Val.getNode()->isMachineOpcode() ||
+ Val.getNode()->getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
+ continue;
+ }
+
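+ // Re-class the virtual register to the register class of the defining
+ // instruction's result (operand 0 of its MCInstrDesc).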
+ const MCInstrDesc &Desc =
+     TM.getInstrInfo()->get(Val.getNode()->getMachineOpcode());
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ RegInfo->setRegClass(Reg, TRI->getRegClass(Desc.OpInfo[0].RegClass));
+ continue;
+ }
+ }
+
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+ if (!MachineNode)
+ continue;
+
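+ // Let the target lowering fold the selected machine node further and
+ // replace all uses if it produced a different node.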
+ SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
+ if (ResNode != Node) {
+ ReplaceUses(Node, ResNode);
+ }
+ }
+}