bool isLocalLoad(const LoadSDNode *N) const;
bool isRegionLoad(const LoadSDNode *N) const;
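+  // If N is a local (LDS) load, store, or atomic, morph it to take a glue
+  // operand tied to an "s_mov_b32 m0, -1"; otherwise return N unchanged.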
+ SDNode *glueCopyToM0(SDNode *N) const;
+
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
return true;
}
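+// Emit "s_mov_b32 m0, -1" and glue it to local (LDS) memory nodes so that
+// instruction selection and scheduling cannot separate the m0 write from the
+// DS instruction that depends on it.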
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+ !checkType(cast<MemSDNode>(N)->getMemOperand()->getValue(),
+ AMDGPUAS::LOCAL_ADDRESS))
+ return N;
+
+  const SITargetLowering &Lowering =
+    *static_cast<const SITargetLowering*>(getTargetLowering());
+
+  // Write the maximum value to m0 before each local memory operation.  On SI,
+  // m0 bounds LDS addressing, so -1 makes the entire aperture accessible.
+
+ SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
+ CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+
+ SDValue Glue = M0.getValue(1);
+
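+  // Rebuild N's operand list with the glue appended; MorphNodeTo mutates the
+  // node in place, so existing uses of N remain valid.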
+  SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ Ops.push_back(N->getOperand(i));
+ }
+ Ops.push_back(Glue);
+ CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
+
+ return N;
+}
+
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
return nullptr; // Already selected.
}
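+  // Atomics on local memory also need m0 initialized; glueCopyToM0 glues the
+  // m0 write to them and leaves other address spaces untouched.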
+ if (isa<AtomicSDNode>(N))
+ N = glueCopyToM0(N);
+
switch (Opc) {
default: break;
  // We are selecting i64 ADD here instead of custom lowering it during
}
case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
+ N = glueCopyToM0(N);
+ break;
+ }
+
    // To simplify the TableGen patterns, we replace all i64 loads with
    // v2i32 loads.  Alternatively, we could promote i64 loads to v2i32
    // during DAG legalization; however, some places in the DAG legalizer
    // (e.g. ExpandUnalignedLoad) make assumptions based on i64 being a
    // legal type, so doing this promotion early can cause problems.
- EVT VT = N->getValueType(0);
- LoadSDNode *LD = cast<LoadSDNode>(N);
- if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
- break;
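+    // In DAG terms, the rewrite below is:
+    //   (i64 (load addr))  -->  (i64 (bitcast (v2i32 (load addr))))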
SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
- LD->getBasePtr(), LD->getMemOperand());
- SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+ LD->getBasePtr(), LD->getMemOperand());
+ SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
MVT::i64, NewLoad);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
- SelectCode(NewLoad.getNode());
+ SDNode *Load = glueCopyToM0(NewLoad.getNode());
+ SelectCode(Load);
N = BitCast.getNode();
break;
}
// Handle i64 stores here for the same reason mentioned above for loads.
StoreSDNode *ST = cast<StoreSDNode>(N);
SDValue Value = ST->getValue();
- if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
- break;
+ if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
+ SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+ MVT::v2i32, Value);
+ SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
+ ST->getBasePtr(), ST->getMemOperand());
- SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
- MVT::v2i32, Value);
- SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
- ST->getBasePtr(), ST->getMemOperand());
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
+ if (NewValue.getOpcode() == ISD::BITCAST) {
+ Select(NewStore.getNode());
+ return SelectCode(NewValue.getNode());
+ }
- if (NewValue.getOpcode() == ISD::BITCAST) {
- Select(NewStore.getNode());
- return SelectCode(NewValue.getNode());
+ // getNode() may fold the bitcast if its input was another bitcast. If that
+ // happens we should only select the new store.
+ N = NewStore.getNode();
}
- // getNode() may fold the bitcast if its input was another bitcast. If that
- // happens we should only select the new store.
- N = NewStore.getNode();
+ N = glueCopyToM0(N);
break;
}
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
-def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+class AZExtLoadBase <SDPatternOperator ld_node> : PatFrag<(ops node:$ptr),
+ (ld_node node:$ptr), [{
LoadSDNode *L = cast<LoadSDNode>(N);
return L->getExtensionType() == ISD::ZEXTLOAD ||
L->getExtensionType() == ISD::EXTLOAD;
}]>;
+def az_extload : AZExtLoadBase <unindexedload>;
+
def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
+multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
-def atomic_cmp_swap_32_local :
- PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
- (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
- AtomicSDNode *AN = cast<AtomicSDNode>(N);
- return AN->getMemoryVT() == MVT::i32 &&
- AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
+ def _32_local : PatFrag <
+ (ops node:$ptr, node:$cmp, node:$swap),
+ (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ return AN->getMemoryVT() == MVT::i32 &&
+ AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ }]>;
-def atomic_cmp_swap_64_local :
- PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
- (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
- AtomicSDNode *AN = cast<AtomicSDNode>(N);
- return AN->getMemoryVT() == MVT::i64 &&
- AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
+ def _64_local : PatFrag<
+ (ops node:$ptr, node:$cmp, node:$swap),
+ (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ return AN->getMemoryVT() == MVT::i64 &&
+ AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ }]>;
+}
+
+defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>;
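+// This expands to atomic_cmp_swap_32_local and atomic_cmp_swap_64_local,
+// equivalent to the standalone PatFrags removed above.
+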
def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return isFlatLoad(dyn_cast<LoadSDNode>(N));
let LGKM_CNT = 1;
let DS = 1;
let UseNamedOperandTable = 1;
- let DisableEncoding = "$m0";
+ let Uses = [M0];
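+  // m0 is now an implicit use (Uses above), initialized by the selector via
+  // glueCopyToM0, rather than an explicit $m0 operand on every DS instruction.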
  // Most instructions load and store data, so set this as the default.
let mayLoad = 1;
"AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
>;
+//===----------------------------------------------------------------------===//
+// SDNodes and PatFrags for local loads and stores, which allow the
+// "s_mov_b32 m0, -1" write to be glued to the memory instructions.
+//===----------------------------------------------------------------------===//
+
+def SIld_local : SDNode <"ISD::LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
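+// SIld_local is ISD::LOAD with the SDNPInGlue property added, so the node can
+// consume the glue produced by glueCopyToM0; si_ld_local then restricts it to
+// local-address-space loads.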
+def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{
+ return isLocalLoad(cast<LoadSDNode>(N));
+}]>;
+
+def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED &&
+ cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+}]>;
+
+def si_load_local_align8 : Aligned8Bytes <
+ (ops node:$ptr), (si_load_local node:$ptr)
+>;
+
+def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
+ return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
+}]>;
+
+def si_az_extload_local : AZExtLoadBase <si_ld_local>;
+
+multiclass SIExtLoadLocal <PatFrag ld_node> {
+
+ def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
+ [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;}]
+ >;
+
+ def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
+ [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;}]
+ >;
+}
+
+defm si_sextload_local : SIExtLoadLocal <si_sextload_local>;
+defm si_az_extload_local : SIExtLoadLocal <si_az_extload_local>;
+
+def SIst_local : SDNode <"ISD::STORE", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
+>;
+
+def si_st_local : PatFrag <
+ (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{
+ return isLocalStore(cast<StoreSDNode>(N));
+}]>;
+
+def si_store_local : PatFrag <
+ (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED &&
+ !cast<StoreSDNode>(N)->isTruncatingStore();
+}]>;
+
+def si_store_local_align8 : Aligned8Bytes <
+ (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr)
+>;
+
+def si_truncstore_local : PatFrag <
+ (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->isTruncatingStore();
+}]>;
+
+def si_truncstore_local_i8 : PatFrag <
+ (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def si_truncstore_local_i16 : PatFrag <
+ (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+multiclass SIAtomicM0Glue2 <string op_name> {
+
+ def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+ >;
+
+ def _local : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+}
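+
+// For example, SIAtomicM0Glue2 <"LOAD_ADD"> defines si_atomic_load_add_glue
+// (an ISD::ATOMIC_LOAD_ADD node with SDNPInGlue) and si_atomic_load_add_local
+// (its local-address-space PatFrag wrapper).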
+
+defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
+defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
+defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
+defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
+defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">;
+defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
+defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
+defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
+defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
+defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">;
+
+def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
+defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>;
+
// Transformation function: extract the lower 32 bits of a 64-bit immediate
def LO32 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N),
multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds, M0Reg:$m0),
+ dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
string asm = opName#" $vdst, $addr"#"$offset$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs rc:$vdst),
dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
- gds01:$gds, M0Reg:$m0),
+ gds01:$gds),
string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs),
- dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds,
- M0Reg:$m0),
+ dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
string asm = opName#" $addr, $data0"#"$offset$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>,
multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs),
dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
- ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds, M0Reg:$m0),
+ ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds),
string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
string noRetOp = "",
dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds,
- M0Reg:$m0),
+ dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>,
string noRetOp = "", RegisterClass src = rc> :
DS_1A2D_RET_m <op, asm, rc, noRetOp,
(ins VGPR_32:$addr, src:$data0, src:$data1,
- ds_offset:$offset, gds:$gds, M0Reg:$m0)
+ ds_offset:$offset, gds:$gds)
>;
multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
string noRetOp = opName,
dag outs = (outs),
dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
- ds_offset:$offset, gds:$gds, M0Reg:$m0),
+ ds_offset:$offset, gds:$gds),
string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>,
multiclass DS_0A_RET <bits<8> op, string opName,
dag outs = (outs VGPR_32:$vdst),
- dag ins = (ins ds_offset:$offset, gds:$gds, M0Reg:$m0),
+ dag ins = (ins ds_offset:$offset, gds:$gds),
string asm = opName#" $vdst"#"$offset"#"$gds"> {
let mayLoad = 1, mayStore = 1 in {
multiclass DS_1A_RET_GDS <bits<8> op, string opName,
dag outs = (outs VGPR_32:$vdst),
- dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset, M0Reg:$m0),
+ dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset),
string asm = opName#" $vdst, $addr"#"$offset gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
multiclass DS_1A_GDS <bits<8> op, string opName,
dag outs = (outs),
- dag ins = (ins VGPR_32:$addr, M0Reg:$m0),
+ dag ins = (ins VGPR_32:$addr),
string asm = opName#" $addr gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
multiclass DS_1A <bits<8> op, string opName,
dag outs = (outs),
- dag ins = (ins VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0, gds:$gds),
+ dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
string asm = opName#" $addr"#"$offset"#"$gds"> {
let mayLoad = 1, mayStore = 1 in {
class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
(vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
- (inst $ptr, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
+ (inst $ptr, (as_i16imm $offset), (i1 0))
>;
-def : DSReadPat <DS_READ_I8, i32, sextloadi8_local>;
-def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>;
-def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
-def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
-def : DSReadPat <DS_READ_B32, i32, local_load>;
+def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>;
+def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>;
+def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
+def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>;
+def : DSReadPat <DS_READ_B32, i32, si_load_local>;
let AddedComplexity = 100 in {
-def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>;
+def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>;
} // End AddedComplexity = 100
def : Pat <
- (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+ (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
i8:$offset1))),
- (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0), (S_MOV_B32 -1))
+ (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
>;
class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
(frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
- (inst $ptr, $value, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
+ (inst $ptr, $value, (as_i16imm $offset), (i1 0))
>;
-def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
-def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
-def : DSWritePat <DS_WRITE_B32, i32, local_store>;
+def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>;
+def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>;
+def : DSWritePat <DS_WRITE_B32, i32, si_store_local>;
let AddedComplexity = 100 in {
-def : DSWritePat <DS_WRITE_B64, v2i32, local_store_aligned8bytes>;
+def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>;
} // End AddedComplexity = 100
def : Pat <
- (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
- i8:$offset1)),
+ (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+ i8:$offset1)),
(DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0),
(EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
- (i1 0), (S_MOV_B32 -1))
+ (i1 0))
>;
class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
- (inst $ptr, $value, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
+ (inst $ptr, $value, (as_i16imm $offset), (i1 0))
>;
// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
class DSAtomicIncRetPat<DS inst, ValueType vt,
Instruction LoadImm, PatFrag frag> : Pat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
- (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
+ (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0))
>;
class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
- (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
+ (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
>;
// 32-bit atomics.
def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
- S_MOV_B32, atomic_load_add_local>;
+ S_MOV_B32, si_atomic_load_add_local>;
def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
- S_MOV_B32, atomic_load_sub_local>;
+ S_MOV_B32, si_atomic_load_sub_local>;
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>;
-def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>;
-def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>;
-def : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>;
-def : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>;
-def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>;
-def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
// 64-bit atomics.
def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
- S_MOV_B64, atomic_load_add_local>;
+ S_MOV_B64, si_atomic_load_add_local>;
def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
- S_MOV_B64, atomic_load_sub_local>;
-
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
-def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>;
-def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>;
-def : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>;
-def : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>;
-def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>;
-
-def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
+ S_MOV_B64, si_atomic_load_sub_local>;
+
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>;
+
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>;
//===----------------------------------------------------------------------===//
// Be careful, since the addresses could be subregisters themselves in weird
// cases, like vectors of pointers.
const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
- const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
unsigned DestReg1
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
.addImm(0) // gds
- .addOperand(*M0Reg) // M0
.addMemOperand(*I->memoperands_begin())
.addMemOperand(*Paired->memoperands_begin());
- LIS->InsertMachineInstrInMaps(Read2);
-
unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);
LIS->RemoveMachineInstrFromMaps(I);
- LIS->RemoveMachineInstrFromMaps(Paired);
+ // Replacing Paired in the maps with Read2 allows us to avoid updating the
+ // live range for the m0 register.
+ LIS->ReplaceMachineInstrInMaps(Paired, Read2);
I->eraseFromParent();
Paired->eraseFromParent();
LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
LIS->shrinkToUses(&AddrRegLI);
- LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg());
- LIS->shrinkToUses(&M0RegLI);
-
- // Currently m0 is treated as a register class with one member instead of an
- // implicit physical register. We are using the virtual register for the first
- // one, but we still need to update the live range of the now unused second m0
- // virtual register to avoid verifier errors.
- const MachineOperand *PairedM0Reg
- = TII->getNamedOperand(*Paired, AMDGPU::OpName::m0);
- LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg());
- LIS->shrinkToUses(&PairedM0RegLI);
-
LIS->getInterval(DestReg); // Create new LI
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
// sure we preserve the subregister index and any register flags set on them.
const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
- const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
const MachineOperand *Data1
= TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = I->getDebugLoc();
+  // repairIntervalsInRange() doesn't handle physical registers, so we have
+  // to update the M0 range manually.
+ SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
+ LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
+ LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
+ bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
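+  // If the m0 segment ends exactly at Paired, it must later be extended to
+  // cover Write2, which takes over Paired's implicit m0 use.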
+
MachineInstrBuilder Write2
= BuildMI(*MBB, I, DL, Write2Desc)
.addOperand(*Addr) // addr
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
.addImm(0) // gds
- .addOperand(*M0Reg) // m0
.addMemOperand(*I->memoperands_begin())
.addMemOperand(*Paired->memoperands_begin());
// XXX - How do we express subregisters here?
- unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(),
- M0Reg->getReg()};
+ unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
LIS->RemoveMachineInstrFromMaps(I);
LIS->RemoveMachineInstrFromMaps(Paired);
I->eraseFromParent();
Paired->eraseFromParent();
+  // As noted above, repairIntervalsInRange() does not handle physical
+  // registers like m0.
LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
+ if (UpdateM0Range) {
+ SlotIndex Write2Index = LIS->getInstructionIndex(Write2);
+ M0Segment->end = Write2Index.getRegSlot();
+ }
+
DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
return Write2.getInstr();
}
; SI-LABEL: @simple_read2st64_f32_over_max_offset
; SI-NOT: ds_read2st64_b32
-; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
+; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
; SI: s_endpgm
define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
; pointer can be used with an offset into the second one.
; SI-LABEL: {{^}}load_shl_base_lds_2:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: s_mov_b32 m0, -1
-; SI-NEXT: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
; SI: s_endpgm
define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {