return true;
}
-bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
- int64_t Offset1, int64_t Offset2,
- unsigned NumLoads) const {
- assert(Offset2 > Offset1
- && "Second offset should be larger than first offset!");
- // If we have less than 16 loads in a row, and the offsets are within 16,
- // then schedule together.
- // TODO: Make the loads schedule near if it fits in a cacheline
- return (NumLoads < 16 && (Offset2 - Offset1) < 16);
+// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really split into two batches of 16
+// stores.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to decide here whether loads from
+// different address spaces should be clustered, and whether to cluster loads
+// that might cause bank conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+ int64_t Offset0, int64_t Offset1,
+ unsigned NumLoads) const {
+ assert(Offset1 > Offset0 &&
+ "Second offset should be larger than first offset!");
+  // If we have at most 16 loads in a row and the offsets are within 64 bytes,
+  // then schedule together.
+
+ // A cacheline is 64 bytes (for global memory).
+ return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}
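
For illustration, a minimal, self-contained sketch of the new heuristic follows. The 16-load cap and the 64-byte cacheline distance come straight from the patch; the shouldCluster helper and the sample offsets are hypothetical stand-ins (the real hook also receives the two load SDNodes and is invoked by the scheduler, not called directly like this).

// Standalone sketch of the clustering predicate added above. The constants
// mirror the patch; everything else here is illustrative only.
#include <cassert>
#include <cstdint>
#include <cstdio>

static bool shouldCluster(int64_t Offset0, int64_t Offset1, unsigned NumLoads) {
  assert(Offset1 > Offset0 && "Second offset should be larger than first offset!");
  // Cluster while there are at most 16 loads in a row and the two offsets are
  // within one 64-byte (global memory) cacheline's distance of each other.
  return NumLoads <= 16 && (Offset1 - Offset0) < 64;
}

int main() {
  std::printf("%d\n", shouldCluster(0, 32, 4));   // 1: nearby offsets, few loads
  std::printf("%d\n", shouldCluster(0, 128, 4));  // 0: offsets more than 64 bytes apart
  std::printf("%d\n", shouldCluster(0, 32, 17));  // 0: too many loads in a row
  return 0;
}
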
bool
; SI: DS_WRITE_B8
; SI: DS_READ_U8
; SI: DS_WRITE_B8
+
; SI: DS_READ_U8
; SI: DS_WRITE_B8
; SI: DS_READ_U8
; SI: DS_WRITE_B8
; SI: DS_READ_U8
; SI: DS_WRITE_B8
-
; SI: DS_READ_U8
; SI: DS_WRITE_B8
; SI: DS_READ_U8
; SI: DS_WRITE_B8
+
; SI: DS_READ_U8
; SI: DS_WRITE_B8
; SI: DS_READ_U8
; SI: DS_READ_U8
; SI: DS_WRITE_B8
; SI: DS_READ_U8
-; SI: DS_WRITE_B8
; SI: DS_READ_U8
-; SI: DS_WRITE_B8
+
; SI: DS_READ_U8
; SI: DS_READ_U8
; SI: DS_READ_U8
; SI: DS_READ_U8
; SI: DS_READ_U8
+
; SI: DS_READ_U8
; SI: DS_READ_U8
; SI: DS_READ_U8
; SI: DS_WRITE_B8
; SI: DS_WRITE_B8
; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+
; SI: DS_WRITE_B8
; SI: DS_WRITE_B8
; SI: DS_WRITE_B8
; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align2
; SI: DS_READ_U16
-; SI: DS_WRITE_B16
; SI: DS_READ_U16
-; SI: DS_WRITE_B16
; SI: DS_READ_U16
-; SI: DS_WRITE_B16
; SI: DS_READ_U16
-; SI: DS_WRITE_B16
; SI: DS_READ_U16
-; SI: DS_WRITE_B16
; SI: DS_READ_U16
-; SI: DS_WRITE_B16
; SI: DS_READ_U16
-; SI: DS_WRITE_B16
; SI: DS_READ_U16
-; SI: DS_WRITE_B16
; SI: DS_READ_U16
; SI: DS_READ_U16
; SI: DS_WRITE_B16
; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+
; SI: S_ENDPGM
define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align2
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
-
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
-; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_LOAD_USHORT
+
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
+; SI-DAG: BUFFER_STORE_SHORT
; SI-DAG: BUFFER_STORE_SHORT
; SI: S_ENDPGM
; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align4
; SI: BUFFER_LOAD_DWORDX4
-; SI: BUFFER_STORE_DWORDX4
; SI: BUFFER_LOAD_DWORDX4
; SI: BUFFER_STORE_DWORDX4
+; SI: BUFFER_STORE_DWORDX4
; SI: S_ENDPGM
define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align8
; SI: BUFFER_LOAD_DWORDX4
-; SI: BUFFER_STORE_DWORDX4
; SI: BUFFER_LOAD_DWORDX4
; SI: BUFFER_STORE_DWORDX4
+; SI: BUFFER_STORE_DWORDX4
; SI: S_ENDPGM
define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align16
; SI: BUFFER_LOAD_DWORDX4
-; SI: BUFFER_STORE_DWORDX4
; SI: BUFFER_LOAD_DWORDX4
; SI: BUFFER_STORE_DWORDX4
+; SI: BUFFER_STORE_DWORDX4
; SI: S_ENDPGM
define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*