From e67a4afb5da59c02338622eea68e096ba143113f Mon Sep 17 00:00:00 2001
From: Vincent Lejeune <vljn@ovi.com>
Date: Tue, 4 Jun 2013 23:17:15 +0000
Subject: [PATCH] R600: Const/Neg/Abs can be folded to dot4

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@183278 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/R600/AMDILISelDAGToDAG.cpp       | 186 +++++++++++++++-----
 lib/Target/R600/R600EmitClauseMarkers.cpp   |   6 +-
 lib/Target/R600/R600ExpandSpecialInstrs.cpp |   4 +-
 lib/Target/R600/R600InstrInfo.cpp           |  35 ++++
 lib/Target/R600/R600InstrInfo.h             |   2 +
 test/CodeGen/R600/dot4-folding.ll           |  27 +++
 6 files changed, 213 insertions(+), 47 deletions(-)
 create mode 100644 test/CodeGen/R600/dot4-folding.ll
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
index 959d6621fba..82f1c7e305a 100644
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -49,7 +49,10 @@ public:
 
 private:
   inline SDValue getSmallIPtrImm(unsigned Imm);
+  bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
+                   const R600InstrInfo *TII, std::vector<unsigned> Cst);
   bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+  bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
 
   // Complex pattern selectors
   bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
@@ -318,6 +321,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
     const R600InstrInfo *TII =
         static_cast<const R600InstrInfo*>(TM.getInstrInfo());
+    if (Result && Result->isMachineOpcode() && Result->getMachineOpcode() == AMDGPU::DOT_4) {
+      bool IsModified = false;
+      do {
+        std::vector<SDValue> Ops;
+        for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
+            I != E; ++I)
+          Ops.push_back(*I);
+        IsModified = FoldDotOperands(Result->getMachineOpcode(), TII, Ops);
+        if (IsModified) {
+          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
+        }
+      } while (IsModified);
+      
+    }
     if (Result && Result->isMachineOpcode() &&
         !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR)
         && TII->isALUInstr(Result->getMachineOpcode())) {
@@ -360,6 +377,60 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   return Result;
 }
 
+/// Try to fold the node that produces one source operand into the operand.
+///
+/// Exactly one of these rewrites is applied per successful call:
+///  - a scalar CONST_ADDRESS becomes an ALU_CONST register read, with the
+///    constant offset stored in \p Sel;
+///  - FNEG / FABS are dropped and the \p Neg / \p Abs modifier is set to 1;
+///  - BITCAST is looked through.
+///
+/// \p Consts holds the selectors of the constants the instruction already
+/// reads. It is taken by value: the candidate selector is appended to this
+/// private copy so that a rejected fold leaves the caller's list untouched.
+/// \p Abs may be a null SDValue, in which case FABS is never folded.
+///
+/// \returns true if \p Src (and possibly a modifier) was rewritten.
+bool AMDGPUDAGToDAGISel::FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg,
+                                     SDValue &Abs, const R600InstrInfo *TII,
+                                     std::vector<unsigned> Consts) {
+  switch (Src.getOpcode()) {
+  case AMDGPUISD::CONST_ADDRESS: {
+    SDValue CstOffset;
+    if (Src.getValueType().isVector() ||
+        !SelectGlobalValueConstantOffset(Src.getOperand(0), CstOffset))
+      return false;
+
+    // CstOffset is expected to be a constant node here; cast<> asserts that
+    // instead of dereferencing the null pointer a failed dyn_cast<> returns.
+    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
+    Consts.push_back(Cst->getZExtValue());
+    if (!TII->fitsConstReadLimitations(Consts))
+      return false;
+
+    Src = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
+    Sel = CstOffset;
+    return true;
+    }
+  case ISD::FNEG:
+    Src = Src.getOperand(0);
+    Neg = CurDAG->getTargetConstant(1, MVT::i32);
+    return true;
+  case ISD::FABS:
+    // A null Abs means this operand has no abs modifier to fold into.
+    if (!Abs.getNode())
+      return false;
+    Src = Src.getOperand(0);
+    Abs = CurDAG->getTargetConstant(1, MVT::i32);
+    return true;
+  case ISD::BITCAST:
+    Src = Src.getOperand(0);
+    return true;
+  default:
+    return false;
+  }
+}
+
 bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
     const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
   int OperandIdx[] = {
@@ -383,59 +437,113 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
     -1
   };
 
+  // Gather the selectors of the constants this instruction already reads
+  std::vector<unsigned> Consts;
+  for (unsigned j = 0; j < 3; j++) {
+    int SrcIdx = OperandIdx[j];
+    if (SrcIdx < 0)
+      break;
+    if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
+      if (Reg->getReg() == AMDGPU::ALU_CONST) {
+        // cast<> asserts on a non-constant sel instead of yielding null.
+        ConstantSDNode *Cst = cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
+        Consts.push_back(Cst->getZExtValue());
+      }
+    }
+  }
+
   for (unsigned i = 0; i < 3; i++) {
     if (OperandIdx[i] < 0)
       return false;
-    SDValue Operand = Ops[OperandIdx[i] - 1];
-    switch (Operand.getOpcode()) {
-    case AMDGPUISD::CONST_ADDRESS: {
-      SDValue CstOffset;
-      if (Operand.getValueType().isVector() ||
-          !SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset))
-        break;
-
-      // Gather others constants values
-      std::vector<unsigned> Consts;
-      for (unsigned j = 0; j < 3; j++) {
-        int SrcIdx = OperandIdx[j];
-        if (SrcIdx < 0)
-          break;
-        if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
-          if (Reg->getReg() == AMDGPU::ALU_CONST) {
-            ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
-            Consts.push_back(Cst->getZExtValue());
-          }
-        }
-      }
+    SDValue &Src = Ops[OperandIdx[i] - 1];
+    SDValue &Sel = Ops[SelIdx[i] - 1];
+    SDValue &Neg = Ops[NegIdx[i] - 1];
+    // FakeAbs stays null when this source has no abs modifier operand.
+    SDValue FakeAbs;
+    SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
+    if (FoldOperand(Src, Sel, Neg, Abs, TII, Consts))
+      return true;
+  }
+  return false;
+}
 
-      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
-      Consts.push_back(Cst->getZExtValue());
-      if (!TII->fitsConstReadLimitations(Consts))
-        break;
+/// Fold at most one source operand of an instruction that uses the
+/// per-channel (SRC0_X..W / SRC1_X..W) operand layout, such as DOT_4.
+///
+/// The selectors of the sources in \p Ops that already read ALU_CONST are
+/// gathered first. The eight sources are then visited in order and the first
+/// one FoldOperand can rewrite is updated in place, together with its
+/// sel/neg/abs operands.
+///
+/// \returns true if \p Ops was modified, false if nothing could be folded.
+bool AMDGPUDAGToDAGISel::FoldDotOperands(unsigned Opcode,
+    const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
+  int OperandIdx[] = {
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_X),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_Y),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_Z),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_W),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_X),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_Y),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_Z),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_W)
+  };
+  int SelIdx[] = {
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_X),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_Y),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_Z),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_W),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_X),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_Y),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_Z),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_W)
+  };
+  int NegIdx[] = {
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_X),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_Y),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_Z),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_W),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_X),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_Y),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_Z),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_W)
+  };
+  int AbsIdx[] = {
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_X),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_Y),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_Z),
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_W),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_X),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_Y),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_Z),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_W)
+  };
 
-      Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
-      Ops[SelIdx[i] - 1] = CstOffset;
-      return true;
-      }
-    case ISD::FNEG:
-      if (NegIdx[i] < 0)
-        break;
-      Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
-      Ops[NegIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
-      return true;
-    case ISD::FABS:
-      if (AbsIdx[i] < 0)
-        break;
-      Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
-      Ops[AbsIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
-      return true;
-    case ISD::BITCAST:
-      Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
-      return true;
-    default:
+  // Gather the selectors of the constants this instruction already reads
+  std::vector<unsigned> Consts;
+  for (unsigned j = 0; j < 8; j++) {
+    int SrcIdx = OperandIdx[j];
+    if (SrcIdx < 0)
       break;
+    if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
+      if (Reg->getReg() == AMDGPU::ALU_CONST) {
+        // cast<> asserts on a non-constant sel instead of yielding null.
+        ConstantSDNode *Cst = cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
+        Consts.push_back(Cst->getZExtValue());
+      }
     }
   }
+
+  for (unsigned i = 0; i < 8; i++) {
+    if (OperandIdx[i] < 0)
+      return false;
+    SDValue &Src = Ops[OperandIdx[i] - 1];
+    SDValue &Sel = Ops[SelIdx[i] - 1];
+    SDValue &Neg = Ops[NegIdx[i] - 1];
+    SDValue &Abs = Ops[AbsIdx[i] - 1];
+    if (FoldOperand(Src, Sel, Neg, Abs, TII, Consts))
+      return true;
+  }
   return false;
 }
 
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index ecfcfeb4e8f..c9d8ed1d00d 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -108,7 +108,8 @@ private:
     std::vector<std::pair<unsigned, unsigned> > UsedKCache;
     const SmallVector<std::pair<MachineOperand *, int64_t>, 3> &Consts =
         TII->getSrcs(MI);
-    assert(TII->isALUInstr(MI->getOpcode()) && "Can't assign Const");
+    assert((TII->isALUInstr(MI->getOpcode()) ||
+        MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const");
     for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
       if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
         continue;
@@ -183,6 +184,9 @@ private:
       if (TII->isALUInstr(I->getOpcode()) &&
           !SubstituteKCacheBank(I, KCacheBanks))
         break;
+      if (I->getOpcode() == AMDGPU::DOT_4 &&
+          !SubstituteKCacheBank(I, KCacheBanks))
+        break;
       AluInstCount += OccupiedDwords(I);
     }
     unsigned Opcode = PushBeforeModifier ?
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index b9d5303ce1a..072ae3a5dcc 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -214,7 +214,9 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
               .getReg();
           (void) Src0;
           (void) Src1;
-          assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
+          if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
+              (TRI.getEncodingValue(Src1) & 0xff) < 127)
+            assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
         }
         MI.eraseFromParent();
         continue;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 5f8486d1519..2a4a2459015 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -169,6 +169,31 @@ SmallVector<std::pair<MachineOperand *, int64_t>, 3>
 R600InstrInfo::getSrcs(MachineInstr *MI) const {
   SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
 
+  if (MI->getOpcode() == AMDGPU::DOT_4) {
+    static const R600Operands::VecOps OpTable[8][2] = {
+      {R600Operands::SRC0_X, R600Operands::SRC0_SEL_X},
+      {R600Operands::SRC0_Y, R600Operands::SRC0_SEL_Y},
+      {R600Operands::SRC0_Z, R600Operands::SRC0_SEL_Z},
+      {R600Operands::SRC0_W, R600Operands::SRC0_SEL_W},
+      {R600Operands::SRC1_X, R600Operands::SRC1_SEL_X},
+      {R600Operands::SRC1_Y, R600Operands::SRC1_SEL_Y},
+      {R600Operands::SRC1_Z, R600Operands::SRC1_SEL_Z},
+      {R600Operands::SRC1_W, R600Operands::SRC1_SEL_W},
+    };
+
+    for (unsigned j = 0; j < 8; j++) {
+      MachineOperand &MO = MI->getOperand(OpTable[j][0] + 1);
+      unsigned Reg = MO.getReg();
+      if (Reg == AMDGPU::ALU_CONST) {
+        unsigned Sel = MI->getOperand(OpTable[j][1] + 1).getImm();
+        Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
+        continue;
+      }
+      
+    }
+    return Result;
+  }
+
   static const R600Operands::Ops OpTable[3][2] = {
     {R600Operands::SRC0, R600Operands::SRC0_SEL},
     {R600Operands::SRC1, R600Operands::SRC1_SEL},
@@ -967,6 +992,14 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
   return getOperandIdx(MI.getOpcode(), Op);
 }
 
+/// VecOps counterpart of the MachineInstr overload: resolves \p Op using the
+/// opcode of \p MI.
+int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
+                                 R600Operands::VecOps Op) const {
+  const unsigned Opcode = MI.getOpcode();
+  return getOperandIdx(Opcode, Op);
+}
+
 int R600InstrInfo::getOperandIdx(unsigned Opcode,
                                  R600Operands::Ops Op) const {
   unsigned TargetFlags = get(Opcode).TSFlags;
@@ -997,6 +1027,17 @@ int R600InstrInfo::getOperandIdx(unsigned Opcode,
   return R600Operands::ALUOpTable[OpTableIdx][Op];
 }
 
+/// Operand index of a per-channel (VecOps) operand.
+///
+/// The index is simply the enumerator value plus one; \p Opcode is not
+/// consulted, so unlike the Ops overload this never returns -1.
+/// NOTE(review): presumably the +1 skips the instruction's result operand and
+/// VecOps is declared in operand order -- confirm against the enum.
+int R600InstrInfo::getOperandIdx(unsigned Opcode,
+                                 R600Operands::VecOps Op) const {
+  return static_cast<int>(Op) + 1;
+}
+
 void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
                                   int64_t Imm) const {
   int Idx = getOperandIdx(*MI, Op);
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index f9ccf4fbb9a..afc24e2d09f 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -212,11 +212,13 @@ namespace llvm {
   ///
   /// \returns -1 if the Instruction does not contain the specified \p Op.
   int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const;
+  int getOperandIdx(const MachineInstr &MI, R600Operands::VecOps Op) const;
 
   /// \brief Get the index of \p Op for the given Opcode.
   ///
   /// \returns -1 if the Instruction does not contain the specified \p Op.
   int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
+  int getOperandIdx(unsigned Opcode, R600Operands::VecOps Op) const;
 
   /// \brief Helper function for setting instruction flag values.
   void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
diff --git a/test/CodeGen/R600/dot4-folding.ll b/test/CodeGen/R600/dot4-folding.ll
new file mode 100644
index 00000000000..3e8330f9b3e
--- /dev/null
+++ b/test/CodeGen/R600/dot4-folding.ll
@@ -0,0 +1,25 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; Exactly one constant vector can be folded into dot4, which means exactly
+; 4 MOV instructions
+; CHECK: @main
+; CHECK: MOV
+; CHECK: MOV
+; CHECK: MOV
+; CHECK: MOV
+; One negative match is enough: it covers everything after the fourth MOV.
+; CHECK-NOT: MOV
+
+define void @main(float addrspace(1)* %out) {
+main_body:
+  %0 = load <4 x float> addrspace(8)* null
+  %1 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1)
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0)
+  ret void
+}
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+attributes #1 = { readnone }
-- 
2.34.1