//===----------------------------------------------------------------------===//
#include "NVPTXISelDAGToDAG.h"
+#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
}
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
-                          unsigned codeAddrSpace, const DataLayout &DL) {
-  if (!Subtarget.hasLDG() || codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL) {
+                          unsigned CodeAddrSpace, MachineFunction *F) {
+  // To use non-coherent caching, the load has to be from global
+  // memory and we have to prove that the memory area is not written
+  // to anywhere for the duration of the kernel call, not even after
+  // the load.
+  //
+  // To ensure that there are no writes to the memory, we require the
+  // underlying pointer to be a noalias (__restrict) kernel parameter
+  // that is never used for a write. We can only do this for kernel
+  // functions since from within a device function, we cannot know if
+  // there were or will be writes to the memory from the caller - or we
+  // could, but then we would have to do inter-procedural analysis.
+  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL ||
+      !isKernelFunction(*F->getFunction())) {
    return false;
  }
-  // Check whether load operates on a readonly argument.
-  bool canUseLDG = false;
-  if (const Argument *A = dyn_cast<const Argument>(
-          GetUnderlyingObject(N->getMemOperand()->getValue(), DL)))
-    canUseLDG = A->onlyReadsMemory() && A->hasNoAliasAttr();
+  // We use GetUnderlyingObjects() here instead of
+  // GetUnderlyingObject() mainly because the former looks through phi
+  // nodes while the latter does not. We need to look through phi
+  // nodes to handle pointer induction variables.
+  SmallVector<Value *, 8> Objs;
+  GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
+                       Objs, F->getDataLayout());
+  for (Value *Obj : Objs) {
+    auto *A = dyn_cast<const Argument>(Obj);
+    if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr())
+      return false;
+  }
-  return canUseLDG;
+  return true;
}
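+
+// For reference, the noalias readonly arguments accepted above typically
+// originate from a CUDA kernel parameter declared as const float *__restrict__:
+// __restrict__ becomes the noalias attribute, and readonly is either written
+// by the frontend or inferred later for arguments that are only loaded from.
+// A minimal sketch of the IR shape this check accepts (the names @k, %in and
+// %out are illustrative):
+//
+//   define void @k(float* noalias readonly %in, float* %out)
+//
+// together with an nvvm.annotations entry that marks @k as a kernel, as in
+// the test changes below.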
SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
  // Address Space Setting
  unsigned int codeAddrSpace = getCodeAddrSpace(LD);
-  if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, CurDAG->getDataLayout())) {
+  if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) {
    return SelectLDGLDU(N);
  }
SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
-  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, CurDAG->getDataLayout())) {
+  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return SelectLDGLDU(N);
  }
  ret void
}
-!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18}
+; Test that we can infer a cached load for a pointer induction variable.
+; SM20-LABEL: .visible .entry foo19(
+; SM20: ld.global.f32
+; SM35-LABEL: .visible .entry foo19(
+; SM35: ld.global.nc.f32
+define void @foo19(float * noalias readonly %from, float * %to, i32 %n) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %nexti, %loop ]
+  %sum = phi float [ 0.0, %entry ], [ %nextsum, %loop ]
+  %ptr = getelementptr inbounds float, float * %from, i32 %i
+  %value = load float, float * %ptr, align 4
+  %nextsum = fadd float %value, %sum
+  %nexti = add nsw i32 %i, 1
+  %exitcond = icmp eq i32 %nexti, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  store float %nextsum, float * %to
+  ret void
+}
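+
+; Note on @foo19: the IR above indexes %from with an integer induction
+; variable, but by the time instruction selection runs the address
+; computation has typically been rewritten (e.g. by loop strength reduction)
+; into a pointer induction variable, roughly of the shape below (an
+; illustrative sketch, not checked by this test):
+;
+;   loop:
+;     %p = phi float* [ %from, %entry ], [ %p.next, %loop ]
+;     %value = load float, float* %p, align 4
+;     %p.next = getelementptr inbounds float, float* %p, i32 1
+;
+; GetUnderlyingObjects() looks through the pointer phi and still finds the
+; noalias readonly argument %from, whereas GetUnderlyingObject() stops at the
+; phi; this is why the test needs the switch to GetUnderlyingObjects() to get
+; ld.global.nc.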
+
+; This test captures the case of a non-kernel function. In a
+; non-kernel function, without interprocedural analysis, we do not
+; know that the parameter points to global memory. We also do not
+; know that the pointed-to memory is never written to (for the
+; duration of the kernel). For both reasons, we cannot use a cached
+; load here.
+; SM20-LABEL: notkernel(
+; SM20: ld.f32
+; SM35-LABEL: notkernel(
+; SM35: ld.f32
+define void @notkernel(float * noalias readonly %from, float * %to) {
+  %1 = load float, float * %from
+  store float %1, float * %to
+  ret void
+}
+
+; Like @notkernel, but with the parameter explicitly marked as pointing to
+; global memory. We still do not know that the pointed-to memory is never
+; written to (for the duration of the kernel). This case does not currently
+; come up normally, since we do not infer that pointers are global
+; interprocedurally as of 2015-08-05.
+; SM20-LABEL: notkernel2(
+; SM20: ld.global.f32
+; SM35-LABEL: notkernel2(
+; SM35: ld.global.f32
+define void @notkernel2(float addrspace(1) * noalias readonly %from, float * %to) {
+  %1 = load float, float addrspace(1) * %from
+  store float %1, float * %to
+  ret void
+}
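+
+; Not exercised here, but note that canLowerToLDG() requires every underlying
+; object of the address to be a noalias readonly kernel argument. A
+; hypothetical kernel that selects between a qualifying and a non-qualifying
+; pointer, e.g. (illustrative sketch only):
+;
+;   %p = select i1 %c, float* %from, float* %to
+;   %v = load float, float* %p
+;
+; would therefore not have its load turned into ld.global.nc.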
+
+!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18, !19}
!1 = !{void (float *, float *)* @foo1, !"kernel", i32 1}
!2 = !{void (double *, double *)* @foo2, !"kernel", i32 1}
!3 = !{void (i16 *, i16 *)* @foo3, !"kernel", i32 1}
!16 = !{void (<4 x float> *, <4 x float> *)* @foo16, !"kernel", i32 1}
!17 = !{void (<4 x double> *, <4 x double> *)* @foo17, !"kernel", i32 1}
!18 = !{void (float **, float **)* @foo18, !"kernel", i32 1}
+!19 = !{void (float *, float *, i32)* @foo19, !"kernel", i32 1}