return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
}
+SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  ISD::LoadExtType ExtType = Load->getExtensionType();
+
+  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
+      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  EVT MemVT = Load->getMemoryVT();
+  unsigned Mask = 0;
+  if (MemVT == MVT::i8) {
+    Mask = 0xff;
+  } else if (MemVT == MVT::i16) {
+    Mask = 0xffff;
+  }
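+  // Extending loads of i8 and i16 from private memory are emulated with a
+  // 32-bit REGISTER_LOAD: read the dword that holds the value, then shift
+  // the addressed byte/halfword down to bit 0 and mask off the rest.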
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+                            DAG.getConstant(2, MVT::i32));
+  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                            Load->getChain(), Ptr,
+                            DAG.getTargetConstant(0, MVT::i32),
+                            Op.getOperand(2));
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                Load->getBasePtr(),
+                                DAG.getConstant(0x3, MVT::i32));
+  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                 DAG.getConstant(3, MVT::i32));
+  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+  Ret = DAG.getNode(ISD::AND, DL, MVT::i32, Ret,
+                    DAG.getConstant(Mask, MVT::i32));
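+  // For SEXTLOAD, shift the value into the high bits and arithmetic-shift it
+  // back down so the sign bit is replicated.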
+  if (ExtType == ISD::SEXTLOAD) {
+    SDValue SExtShift = DAG.getConstant(
+        VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
+    Ret = DAG.getNode(ISD::SHL, DL, MVT::i32, Ret, SExtShift);
+    Ret = DAG.getNode(ISD::SRA, DL, MVT::i32, Ret, SExtShift);
+  }
+
+  return Ret;
+}
+
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
if (Result.getNode()) {
return Result;
}
StoreSDNode *Store = cast<StoreSDNode>(Op);
+  SDValue Chain = Store->getChain();
if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
Store->getValue().getValueType().isVector()) {
return SplitVectorStore(Op, DAG);
}
+
+  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+      Store->getMemoryVT().bitsLT(MVT::i32)) {
+    unsigned Mask = 0;
+    if (Store->getMemoryVT() == MVT::i8) {
+      Mask = 0xff;
+    } else if (Store->getMemoryVT() == MVT::i16) {
+      Mask = 0xffff;
+    }
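+    // The store becomes a read-modify-write of the containing dword: load it,
+    // clear the destination byte/halfword, OR in the shifted value and write
+    // the dword back.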
+    SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
+    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+                              DAG.getConstant(2, MVT::i32));
+    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+                              Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
+    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, TruncPtr,
+                                  DAG.getConstant(0x3, MVT::i32));
+    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                   DAG.getConstant(3, MVT::i32));
+    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+                                    Store->getValue());
+    SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, SExtValue,
+                                      DAG.getConstant(Mask, MVT::i32));
+    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                       MaskedValue, ShiftAmt);
+    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                  DAG.getConstant(Mask, MVT::i32), ShiftAmt);
+    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+                          DAG.getConstant(0xffffffff, MVT::i32));
+    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                       Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
+  }
return SDValue();
}
/// \brief Split a vector load into multiple scalar loads.
SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
return SDValue();
}
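+  // Let the common AMDGPU lowering handle this store first; it splits vector
+  // stores and emulates i8/i16 stores to the private address space.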
+  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Ret.getNode()) {
+    return Ret;
+  }
// Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
SDValue Ptr = Op.getOperand(1);
SDValue LoweredLoad;
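+  // Sub-dword loads from the private address space are emulated by the
+  // common AMDGPU lowering; if it produced a value, merge it with the chain.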
+  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+  if (Ret.getNode()) {
+    SDValue Ops[2];
+    Ops[0] = Ret;
+    Ops[1] = Chain;
+    return DAG.getMergeValues(Ops, 2, DL);
+  }
+
if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
SDValue MergedValues[2] = {
SplitVectorLoad(Op, DAG),
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
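+  // i8 and i16 extending loads and truncating stores are custom lowered so
+  // they can be emulated for the private address space.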
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i128, MVT::i64, Expand);
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
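+  // The common AMDGPU code emulates i8 and i16 extloads from private memory;
+  // reuse its result here and just merge in the chain.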
+  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+  SDValue MergedValues[2];
+  MergedValues[1] = Load->getChain();
+  if (Ret.getNode()) {
+    MergedValues[0] = Ret;
+    return DAG.getMergeValues(MergedValues, 2, DL);
+  }
-  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
return SDValue();
+  }
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
DAG.getConstant(2, MVT::i32));
-  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
-                            Load->getChain(), Ptr,
-                            DAG.getTargetConstant(0, MVT::i32),
-                            Op.getOperand(2));
-  SDValue MergedValues[2] = {
-    Ret,
-    Load->getChain()
-  };
+  Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                    Load->getChain(), Ptr,
+                    DAG.getTargetConstant(0, MVT::i32),
+                    Op.getOperand(2));
+  MergedValues[0] = Ret;
return DAG.getMergeValues(MergedValues, 2, DL);
}
SDValue Chain = Store->getChain();
SmallVector<SDValue, 8> Values;
-  if (VT == MVT::i64) {
+  if (Store->isTruncatingStore()) {
+    unsigned Mask = 0;
+    if (Store->getMemoryVT() == MVT::i8) {
+      Mask = 0xff;
+    } else if (Store->getMemoryVT() == MVT::i16) {
+      Mask = 0xffff;
+    }
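+    // Emulate the sub-dword store as a read-modify-write of the containing
+    // dword, mirroring the emulation in AMDGPUTargetLowering::LowerSTORE.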
+    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+                              Chain, Store->getBasePtr(),
+                              DAG.getConstant(0, MVT::i32));
+    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(),
+                                  DAG.getConstant(0x3, MVT::i32));
+    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                   DAG.getConstant(3, MVT::i32));
+    SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                      Store->getValue(),
+                                      DAG.getConstant(Mask, MVT::i32));
+    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                       MaskedValue, ShiftAmt);
+    SDValue RotrAmt = DAG.getNode(ISD::SUB, DL, MVT::i32,
+                                  DAG.getConstant(32, MVT::i32), ShiftAmt);
+    // Rotate the inverted mask into place so the AND below clears only the
+    // byte/halfword being replaced.
+    SDValue DstMask = DAG.getNode(ISD::ROTR, DL, MVT::i32,
+                                  DAG.getConstant(Mask ^ 0xffffffff, MVT::i32),
+                                  RotrAmt);
+    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+    Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+
+    Values.push_back(Dst);
+  } else if (VT == MVT::i64) {
for (unsigned i = 0; i < 2; ++i) {
Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
Store->getValue(), DAG.getConstant(i, MVT::i32)));
; EG-LABEL: @anyext_load_i8:
; EG: AND_INT
-; EG-NEXT: 255
+; EG: 255
define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
%load = load i32 addrspace(1)* %cast, align 1
; EG-LABEL: @anyext_load_i16:
; EG: AND_INT
-; EG: LSHL
-; EG: 65535
+; EG: AND_INT
+; EG-DAG: 65535
+; EG-DAG: -65536
define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
%load = load i32 addrspace(1)* %cast, align 1
; EG-LABEL: @anyext_load_lds_i8:
; EG: AND_INT
-; EG-NEXT: 255
+; EG: 255
define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
%load = load i32 addrspace(3)* %cast, align 1
; EG-LABEL: @anyext_load_lds_i16:
; EG: AND_INT
-; EG: LSHL
-; EG: 65535
+; EG: AND_INT
+; EG-DAG: 65535
+; EG-DAG: -65536
define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
%load = load i32 addrspace(3)* %cast, align 1
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
; This test checks that uses and defs of the AR register happen in the same
; instruction clause.
-; R600-CHECK-LABEL: @mova_same_clause
+; FUNC-LABEL: @mova_same_clause
+
; R600-CHECK: MOVA_INT
; R600-CHECK-NOT: ALU clause
; R600-CHECK: 0 + AR.x
; R600-CHECK-NOT: ALU clause
; R600-CHECK: 0 + AR.x
-; SI-CHECK-LABEL: @mova_same_clause
; SI-CHECK: V_READFIRSTLANE
; SI-CHECK: V_MOVRELD
; SI-CHECK: S_CBRANCH
; XXX: This generated code has unnecessary MOVs, we should be able to optimize
; this.
-; R600-CHECK-LABEL: @multiple_structs
+; FUNC-LABEL: @multiple_structs
; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-LABEL: @multiple_structs
; SI-CHECK-NOT: V_MOVREL
%struct.point = type { i32, i32 }
; loads and stores should be lowered to copies, so there shouldn't be any
; MOVA instructions.
-; R600-CHECK-LABEL: @direct_loop
+; FUNC-LABEL: @direct_loop
; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-LABEL: @direct_loop
; SI-CHECK-NOT: V_MOVREL
define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
store i32 %value, i32 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: @short_array
+
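+; The two i16 stores pack into one dword: (1 << 16) | 0 = 65536.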
+; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal
+; R600-CHECK: 65536
+; R600-CHECK: MOVA_INT
+
+; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 65536
+; SI-CHECK: V_MOVRELS_B32_e32
+define void @short_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [2 x i16]
+ %1 = getelementptr [2 x i16]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i16]* %0, i32 0, i32 1
+ store i16 0, i16* %1
+ store i16 1, i16* %2
+ %3 = getelementptr [2 x i16]* %0, i32 0, i32 %index
+ %4 = load i16* %3
+ %5 = sext i16 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @char_array
+
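+; The two i8 stores pack into one dword: (1 << 8) | 0 = 256.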
+; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal
+; R600-CHECK: 256
+; R600-CHECK: MOVA_INT
+
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 256
+; SI-CHECK: V_MOVRELS_B32_e32
+define void @char_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [2 x i8]
+ %1 = getelementptr [2 x i8]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i8]* %0, i32 0, i32 1
+ store i8 0, i8* %1
+ store i8 1, i8* %2
+ %3 = getelementptr [2 x i8]* %0, i32 0, i32 %index
+ %4 = load i8* %3
+ %5 = sext i8 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}