R600/SI: Fix 64-bit bit ops that require the VALU.
author     Matt Arsenault <Matthew.Arsenault@amd.com>
           Mon, 24 Mar 2014 20:08:05 +0000 (20:08 +0000)
committer  Matt Arsenault <Matthew.Arsenault@amd.com>
           Mon, 24 Mar 2014 20:08:05 +0000 (20:08 +0000)
Try to match the scalar AND first, like the other instructions. Expand a
64-bit AND into a pair of 32-bit ANDs when it has to be moved to the VALU,
since 64-bit AND is not available there.
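
As a rough illustration (the operand names are made up for this sketch, not
taken from an actual compilation), a 64-bit scalar OR whose operands force it
onto the VALU is now split into two 32-bit halves plus a REG_SEQUENCE, and the
halves are then converted by the existing worklist processing:

    S_OR_B64     dst, a, b
      -->
    S_OR_B32     dst_lo, a.sub0, b.sub0   ; later becomes V_OR_B32_e32
    S_OR_B32     dst_hi, a.sub1, b.sub1   ; later becomes V_OR_B32_e32
    REG_SEQUENCE dst, dst_lo, sub0, dst_hi, sub1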

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@204660 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/R600/SIInstrInfo.cpp
lib/Target/R600/SIInstrInfo.h
lib/Target/R600/SIInstructions.td
test/CodeGen/R600/or.ll

diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index b0a0e9af4cc893e327665ad3fe8c5445de241f06..6cc4dee82712de3e217c243e74c8475b1132d817 100644
@@ -879,6 +879,30 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
       Inst->eraseFromParent();
       continue;
     }
+    case AMDGPU::S_AND_B64:
+      splitScalar64BitOp(Worklist, Inst, AMDGPU::S_AND_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_OR_B64:
+      splitScalar64BitOp(Worklist, Inst, AMDGPU::S_OR_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_XOR_B64:
+      splitScalar64BitOp(Worklist, Inst, AMDGPU::S_XOR_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NOT_B64:
+      splitScalar64BitOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_BFE_U64:
+    case AMDGPU::S_BFE_I64:
+    case AMDGPU::S_BFM_B64:
+      llvm_unreachable("Moving this op to VALU not implemented");
     }
 
     unsigned NewOpcode = getVALUOp(*Inst);
@@ -968,6 +992,58 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
   return &AMDGPU::VReg_32RegClass;
 }
 
+void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
+                                     MachineInstr *Inst,
+                                     unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst->getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // We shouldn't need to worry about immediate operands here.
+  MachineOperand &Dest = Inst->getOperand(0);
+  MachineOperand &Src0 = Inst->getOperand(1);
+  MachineOperand &Src1 = Inst->getOperand(2);
+  DebugLoc DL = Inst->getDebugLoc();
+
+  MachineBasicBlock::iterator MII = Inst;
+
+  const MCInstrDesc &InstDesc = get(Opcode);
+  const TargetRegisterClass *RC = MRI.getRegClass(Src0.getReg());
+  const TargetRegisterClass *SubRC = RI.getSubRegClass(RC, AMDGPU::sub0);
+  unsigned SrcReg0Sub0 = buildExtractSubReg(MII, MRI, Src0, RC,
+                                            AMDGPU::sub0, SubRC);
+  unsigned SrcReg1Sub0 = buildExtractSubReg(MII, MRI, Src1, RC,
+                                            AMDGPU::sub0, SubRC);
+
+  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+    .addReg(SrcReg0Sub0)
+    .addReg(SrcReg1Sub0);
+
+  unsigned SrcReg0Sub1 = buildExtractSubReg(MII, MRI, Src0, RC,
+                                            AMDGPU::sub1, SubRC);
+  unsigned SrcReg1Sub1 = buildExtractSubReg(MII, MRI, Src1, RC,
+                                            AMDGPU::sub1, SubRC);
+
+  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+    .addReg(SrcReg0Sub1)
+    .addReg(SrcReg1Sub1);
+
+  unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+    .addReg(DestSub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(DestSub1)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+  Worklist.push_back(LoHalf);
+  Worklist.push_back(HiHalf);
+}
+
 MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
                                    MachineBasicBlock *MBB,
                                    MachineBasicBlock::iterator I,
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 8c0fb6fbd5af5ce6b64b64c5c8ab735eb21d68ae..6eefd3ac98ca697f16ba03cd119f1fb3201f6b7c 100644
@@ -38,6 +38,10 @@ private:
                          const TargetRegisterClass *RC,
                          const MachineOperand &Op) const;
 
+  void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
+                          MachineInstr *Inst, unsigned Opcode) const;
+
+
 public:
   explicit SIInstrInfo(AMDGPUTargetMachine &tm);
 
@@ -92,6 +96,7 @@ public:
 
   bool isSALUInstr(const MachineInstr &MI) const;
   static unsigned getVALUOp(const MachineInstr &MI);
+
   bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
 
   /// \brief Return the correct register class for \p OpNo.  For target-specific
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 8ec29713e076800b16d691c1c5c5f04cdc30a97a..8e320929fb70f6f9e6d4a7017326ad963045362a 100644
@@ -1222,7 +1222,7 @@ def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32",
 >;
 
 def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64",
-  []
+  [(set i64:$dst, (or i64:$src0, i64:$src1))]
 >;
 
 def : Pat <
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index 35fc8b33e0be89a55d4de2d99a9f41b5c5744f11..05d1e0f0416dfd938071f9328578a922ac4b9e72 100644
@@ -56,15 +56,34 @@ define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b)
   ret void
 }
 
-; EG-CHECK-LABEL: @or_i64
+; EG-CHECK-LABEL: @scalar_or_i64
 ; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
 ; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; SI-CHECK-LABEL: @or_i64
+; SI-CHECK-LABEL: @scalar_or_i64
+; SI-CHECK: S_OR_B64
+define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+  %or = or i64 %a, %b
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-CHECK-LABEL: @vector_or_i64
 ; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
 ; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
-define void @or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
-entry:
-       %0 = or i64 %a, %b
-       store i64 %0, i64 addrspace(1)* %out
-       ret void
+define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 8
+  %loadb = load i64 addrspace(1)* %b, align 8
+  %or = or i64 %loada, %loadb
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-CHECK-LABEL: @scalar_vector_or_i64
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
+define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
+  %loada = load i64 addrspace(1)* %a
+  %or = or i64 %loada, %b
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
 }
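
As a rough sketch (register numbers are illustrative and not taken from an
actual llc run; only the two 32-bit VALU ors are actually checked), the new
vector_or_i64 case is expected to produce something like:

    V_OR_B32_e32 v0, v2, v0   ; low 32 bits
    V_OR_B32_e32 v1, v3, v1   ; high 32 bits

while scalar_or_i64, where both operands stay in SGPRs, should keep a single
S_OR_B64.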