N->setNodeId(-1);
return NULL; // Already selected.
}
+
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
switch (Opc) {
default: break;
// We are selecting i64 ADD here instead of custom lower it during
// DAG legalization, so we can fold some i64 ADDs used for address
// calculation into the LOAD and STORE instructions.
case ISD::ADD: {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (N->getValueType(0) != MVT::i64 ||
ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
}
case ISD::BUILD_VECTOR: {
unsigned RegClassID;
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
const AMDGPURegisterInfo *TRI =
static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
const SIRegisterInfo *SIRI =
}
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
break;
}
}
case AMDGPUISD::REGISTER_LOAD: {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
Ops);
}
case AMDGPUISD::REGISTER_STORE: {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
CurDAG->getVTList(MVT::Other),
Ops);
}
+
+ case AMDGPUISD::BFE_I32:
+ case AMDGPUISD::BFE_U32: {
+ if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ break;
+
+ // There is a scalar version available, but unlike the vector version, which
+ // has separate operands for the offset and width, the scalar version packs
+ // the width and offset into a single operand. Try to move to the scalar
+ // version if the offset and width are constant, so that we can try to keep
+ // extended loads of kernel arguments in SGPRs.
+
+ // TODO: Technically we could try to pattern match scalar bitshifts of
+ // dynamic values, but it's probably not useful.
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!Offset)
+ break;
+
+ ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!Width)
+ break;
+
+ bool Signed = Opc == AMDGPUISD::BFE_I32;
+
+ // Pack the offset and width of a BFE into the format expected by
+ // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0]
+ // contain the offset and bits [22:16] contain the width.
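+ // For example, offset 0 and width 1 pack to (1 << 16) | 0 = 0x10000.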
+
+ uint32_t OffsetVal = Offset->getZExtValue();
+ uint32_t WidthVal = Width->getZExtValue();
+
+ uint32_t PackedVal = OffsetVal | (WidthVal << 16);
+
+ SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32);
+ return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+ SDLoc(N),
+ MVT::i32,
+ N->getOperand(0),
+ PackedOffsetWidth);
+ }
}
return SelectCode(N);
}
case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
+ case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
+ case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
addDescImplicitUseDef(NewDesc, Inst);
+ if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+ const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
+ // If we need to move this to VGPRs, we need to unpack the second operand
+ // back into the two separate operands for bit offset and width.
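+ // For example, a packed value of 0x10000 unpacks to offset 0 and width 1.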
+ assert(OffsetWidthOp.isImm() &&
+ "Scalar BFE is only implemented for constant width and offset");
+ uint32_t Imm = OffsetWidthOp.getImm();
+
+ uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+ uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+ Inst->RemoveOperand(2); // Remove old immediate.
+ Inst->addOperand(MachineOperand::CreateImm(Offset));
+ Inst->addOperand(MachineOperand::CreateImm(BitWidth));
+
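+ // Default the remaining operands required by the VALU form to zero.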
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ }
+
// Update the destination register class.
const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
; FUNC-LABEL: @sext_in_reg_i1_i32
; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[ARG]], 0, 1
+; SI: S_BFE_I32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
+; SI: V_MOV_B32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
; SI: BUFFER_STORE_DWORD [[EXTRACT]],
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
-; XSI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; XSI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; XSI: S_BFE_I32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 0x80000
+; XSI: S_ASHR_I32 {{s[0-9]+}}, [[EXTRACT]], 31
; XSI: BUFFER_STORE_DWORD
; XEG: BFE_INT
; XEG: ASHR
; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: BUFFER_STORE_DWORDX2
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
}
; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: BUFFER_STORE_DWORDX4
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
ret void
}
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i8_to_v4i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 8
+define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+ %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i16_to_v4i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 16
+; SI: V_BFE_I32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 16
+define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
+ %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
; FIXME: The BFE should really be eliminated. I think it should happen
; when computeMaskedBitsForTargetNode is implemented for imax.