[x86] Fix an embarressing bug in the INSERTPS formation code. The mask

[oota-llvm.git] / lib / Target / R600 / SILowerControlFlow.cpp
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp

index 182f28ba7740fbcc2733e7ccf4f0cd6e7428755b..97c706b0655b080b469f099a07cb856367d304eb 100644 (file)
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -49,6 +49,7 @@
  //===----------------------------------------------------------------------===//
  
  #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
  #include "SIInstrInfo.h"
  #include "SIMachineFunctionInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
@@ -67,7 +68,7 @@ private:
    static const unsigned SkipThreshold = 12;
  
    static char ID;
-  const TargetRegisterInfo *TRI;
+  const SIRegisterInfo *TRI;
    const SIInstrInfo *TII;
  
    bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
@@ -86,17 +87,18 @@ private:
    void Kill(MachineInstr &MI);
    void Branch(MachineInstr &MI);
  
+  void InitM0ForLDS(MachineBasicBlock::iterator MI);
    void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
    void IndirectSrc(MachineInstr &MI);
    void IndirectDst(MachineInstr &MI);
  
  public:
    SILowerControlFlowPass(TargetMachine &tm) :
-    MachineFunctionPass(ID), TRI(0), TII(0) { }
+    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
  
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
  
-  const char *getPassName() const {
+  const char *getPassName() const override {
      return "SI Lower control flow instructions";
    }
  
@@ -146,7 +148,7 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
    MachineBasicBlock &MBB = *MI.getParent();
    DebugLoc DL = MI.getDebugLoc();
  
-  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType !=
+  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
        ShaderType::PIXEL ||
        !shouldSkip(&MBB, &MBB.getParent()->back()))
      return;
@@ -297,11 +299,13 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
    DebugLoc DL = MI.getDebugLoc();
    const MachineOperand &Op = MI.getOperand(0);
  
-  // Kill is only allowed in pixel / geometry shaders
-  assert(MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType ==
-         ShaderType::PIXEL ||
-         MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType ==
-         ShaderType::GEOMETRY);
+#ifndef NDEBUG
+  const SIMachineFunctionInfo *MFI
+    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+  // Kill is only allowed in pixel / geometry shaders.
+  assert(MFI->getShaderType() == ShaderType::PIXEL ||
+         MFI->getShaderType() == ShaderType::GEOMETRY);
+#endif
  
    // Clear this thread from the exec mask if the operand is negative
    if ((Op.isImm() || Op.isFPImm())) {
@@ -320,6 +324,14 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
    MI.eraseFromParent();
  }
  
+/// The m0 register stores the maximum allowable address for LDS reads and
+/// writes.  Its value must be at least the size in bytes of LDS allocated by
+/// the shader.  For simplicity, we set it to the maximum possible value.
+void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),  TII->get(AMDGPU::S_MOV_B32),
+            AMDGPU::M0).addImm(0xffffffff);
+}
+
  void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
  
    MachineBasicBlock &MBB = *MI.getParent();
@@ -333,52 +345,57 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
              .addReg(Idx);
      MBB.insert(I, MovRel);
-    MI.eraseFromParent();
-    return;
-  }
+  } else {
  
-  assert(AMDGPU::SReg_64RegClass.contains(Save));
-  assert(AMDGPU::VReg_32RegClass.contains(Idx));
+    assert(AMDGPU::SReg_64RegClass.contains(Save));
+    assert(AMDGPU::VReg_32RegClass.contains(Idx));
  
-  // Save the EXEC mask
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
-          .addReg(AMDGPU::EXEC);
+    // Save the EXEC mask
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+            .addReg(AMDGPU::EXEC);
  
-  // Read the next variant into VCC (lower 32 bits) <- also loop target
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
-          AMDGPU::VCC_LO)
-          .addReg(Idx);
+    // Read the next variant into VCC (lower 32 bits) <- also loop target
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+            AMDGPU::VCC_LO)
+            .addReg(Idx);
  
-  // Move index from VCC into M0
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-          .addReg(AMDGPU::VCC_LO);
+    // Move index from VCC into M0
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+            .addReg(AMDGPU::VCC_LO);
  
-  // Compare the just read M0 value to all possible Idx values
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
-          .addReg(AMDGPU::M0)
-          .addReg(Idx);
+    // Compare the just read M0 value to all possible Idx values
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
+            .addReg(AMDGPU::M0)
+            .addReg(Idx);
  
-  // Update EXEC, save the original EXEC value to VCC
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
-          .addReg(AMDGPU::VCC);
+    // Update EXEC, save the original EXEC value to VCC
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+            .addReg(AMDGPU::VCC);
  
-  // Do the actual move
-  MBB.insert(I, MovRel);
+    // Do the actual move
+    MBB.insert(I, MovRel);
  
-  // Update EXEC, switch all done bits to 0 and all todo bits to 1
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-          .addReg(AMDGPU::EXEC)
-          .addReg(AMDGPU::VCC);
+    // Update EXEC, switch all done bits to 0 and all todo bits to 1
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+            .addReg(AMDGPU::EXEC)
+            .addReg(AMDGPU::VCC);
  
-  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-          .addImm(-7)
-          .addReg(AMDGPU::EXEC);
+    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+            .addImm(-7)
+            .addReg(AMDGPU::EXEC);
  
-  // Restore EXEC
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-          .addReg(Save);
+    // Restore EXEC
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+            .addReg(Save);
  
+  }
+  // FIXME: Are there any values other than the LDS address clamp that need to
+  // be stored in the m0 register and may be live for more than a few
+  // instructions?  If so, we should save the m0 register at the beginning
+  // of this function and restore it here.
+  // FIXME: Add support for LDS direct loads.
+  InitM0ForLDS(&MI);
    MI.eraseFromParent();
  }
  
@@ -426,8 +443,9 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
  }
  
  bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
-  TRI = MF.getTarget().getRegisterInfo();
+  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TRI =
+      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  
    bool HaveKill = false;
@@ -439,10 +457,10 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
         BI != BE; ++BI) {
  
      MachineBasicBlock &MBB = *BI;
-    for (MachineBasicBlock::iterator I = MBB.begin(), Next = std::next(I);
-         I != MBB.end(); I = Next) {
-
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
        Next = std::next(I);
+
        MachineInstr &MI = *I;
        if (TII->isDS(MI.getOpcode())) {
          NeedM0 = true;
@@ -523,11 +541,10 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
      MachineBasicBlock &MBB = MF.front();
      // Initialize M0 to a value that won't cause LDS access to be discarded
      // due to offset clamping
-    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_MOV_B32),
-            AMDGPU::M0).addImm(0xffffffff);
+    InitM0ForLDS(MBB.getFirstNonPHI());
    }
  
-  if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) {
+  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
      MachineBasicBlock &MBB = MF.front();
      BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
              AMDGPU::EXEC).addReg(AMDGPU::EXEC);