sz = 8;
SmallVector<EVT, 4> LoadRetVTs;
- if (sz < 16) {
+ EVT TheLoadType = VTs[i];
+ if (retTy->isIntegerTy() &&
+ TD->getTypeAllocSizeInBits(retTy) < 32) {
+ // This is for integer types only, and specifically not for
+ // aggregates.
+ LoadRetVTs.push_back(MVT::i32);
+ TheLoadType = MVT::i32;
+ } else if (sz < 16) {
// If loading i1/i8 result, generate
// load i8 (-> i16)
// trunc i16 to i1/i8
SDValue retval = DAG.getMemIntrinsicNode(
NVPTXISD::LoadParam, dl,
DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0],
- LoadRetOps.size(), VTs[i], MachinePointerInfo());
+ LoadRetOps.size(), TheLoadType, MachinePointerInfo());
Chain = retval.getValue(1);
InFlag = retval.getValue(2);
SDValue Ret0 = retval.getValue(0);
SDLoc dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const Function *F = MF.getFunction();
- const Type *RetTy = F->getReturnType();
+ Type *RetTy = F->getReturnType();
const DataLayout *TD = getDataLayout();
bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
if (!isABI)
return Chain;
- if (const VectorType *VTy = dyn_cast<const VectorType>(RetTy)) {
+ if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
// If we have a vector type, the OutVals array will be the scalarized
// components and we have combine them into 1 or more vector stores.
unsigned NumElts = VTy->getNumElements();
assert(NumElts == Outs.size() && "Bad scalarization of return value");
// const_cast can be removed in later LLVM versions
- EVT EltVT = getValueType(const_cast<Type *>(RetTy)).getVectorElementType();
+ EVT EltVT = getValueType(RetTy).getVectorElementType();
bool NeedExtend = false;
if (EltVT.getSizeInBits() < 16)
NeedExtend = true;
SmallVector<EVT, 16> ValVTs;
// const_cast is necessary since we are still using an LLVM version from
// before the type system re-write.
- ComputePTXValueVTs(*this, const_cast<Type *>(RetTy), ValVTs);
+ ComputePTXValueVTs(*this, RetTy, ValVTs);
assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
- unsigned sizesofar = 0;
+ unsigned SizeSoFar = 0;
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
SDValue theVal = OutVals[i];
- EVT theValType = theVal.getValueType();
+ EVT TheValType = theVal.getValueType();
unsigned numElems = 1;
- if (theValType.isVector())
- numElems = theValType.getVectorNumElements();
+ if (TheValType.isVector())
+ numElems = TheValType.getVectorNumElements();
for (unsigned j = 0, je = numElems; j != je; ++j) {
- SDValue tmpval = theVal;
- if (theValType.isVector())
- tmpval = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- theValType.getVectorElementType(), tmpval,
+ SDValue TmpVal = theVal;
+ if (TheValType.isVector())
+ TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ TheValType.getVectorElementType(), TmpVal,
DAG.getIntPtrConstant(j));
- EVT theStoreType = tmpval.getValueType();
- if (theStoreType.getSizeInBits() < 8)
- tmpval = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, tmpval);
- SDValue Ops[] = { Chain, DAG.getConstant(sizesofar, MVT::i32), tmpval };
+ EVT TheStoreType = ValVTs[i];
+ if (RetTy->isIntegerTy() &&
+ TD->getTypeAllocSizeInBits(RetTy) < 32) {
+ // The following zero-extension is for integer types only, and
+ // specifically not for aggregates.
+ TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
+ TheStoreType = MVT::i32;
+ }
+ else if (TmpVal.getValueType().getSizeInBits() < 16)
+ TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
+
+ SDValue Ops[] = { Chain, DAG.getConstant(SizeSoFar, MVT::i32), TmpVal };
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
- DAG.getVTList(MVT::Other), &Ops[0], 3,
- ValVTs[i], MachinePointerInfo());
- if (theValType.isVector())
- sizesofar +=
- ValVTs[i].getVectorElementType().getStoreSizeInBits() / 8;
+ DAG.getVTList(MVT::Other), &Ops[0],
+ 3, TheStoreType,
+ MachinePointerInfo());
+ if(TheValType.isVector())
+ SizeSoFar +=
+ TheStoreType.getVectorElementType().getStoreSizeInBits() / 8;
else
- sizesofar += ValVTs[i].getStoreSizeInBits() / 8;
+ SizeSoFar += TheStoreType.getStoreSizeInBits()/8;
}
}
}
define i16 @icmp_eq_i16(i16 %a, i16 %b) {
; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp eq i16 %a, %b
%ret = zext i1 %cmp to i16
define i16 @icmp_ne_i16(i16 %a, i16 %b) {
; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ne i16 %a, %b
%ret = zext i1 %cmp to i16
define i16 @icmp_ugt_i16(i16 %a, i16 %b) {
; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ugt i16 %a, %b
%ret = zext i1 %cmp to i16
define i16 @icmp_uge_i16(i16 %a, i16 %b) {
; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp uge i16 %a, %b
%ret = zext i1 %cmp to i16
define i16 @icmp_ult_i16(i16 %a, i16 %b) {
; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ult i16 %a, %b
%ret = zext i1 %cmp to i16
define i16 @icmp_ule_i16(i16 %a, i16 %b) {
; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ule i16 %a, %b
%ret = zext i1 %cmp to i16
define i16 @icmp_sgt_i16(i16 %a, i16 %b) {
; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp sgt i16 %a, %b
%ret = zext i1 %cmp to i16
define i16 @icmp_sge_i16(i16 %a, i16 %b) {
; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp sge i16 %a, %b
%ret = zext i1 %cmp to i16
define i16 @icmp_slt_i16(i16 %a, i16 %b) {
; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp slt i16 %a, %b
%ret = zext i1 %cmp to i16
define i16 @icmp_sle_i16(i16 %a, i16 %b) {
; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp sle i16 %a, %b
%ret = zext i1 %cmp to i16
define i8 @icmp_eq_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp eq i8 %a, %b
%ret = zext i1 %cmp to i8
define i8 @icmp_ne_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ne i8 %a, %b
%ret = zext i1 %cmp to i8
define i8 @icmp_ugt_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ugt i8 %a, %b
%ret = zext i1 %cmp to i8
define i8 @icmp_uge_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp uge i8 %a, %b
%ret = zext i1 %cmp to i8
define i8 @icmp_ult_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ult i8 %a, %b
%ret = zext i1 %cmp to i8
define i8 @icmp_ule_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ule i8 %a, %b
%ret = zext i1 %cmp to i8
define i8 @icmp_sgt_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp sgt i8 %a, %b
%ret = zext i1 %cmp to i8
define i8 @icmp_sge_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp sge i8 %a, %b
%ret = zext i1 %cmp to i8
define i8 @icmp_slt_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp slt i8 %a, %b
%ret = zext i1 %cmp to i8
define i8 @icmp_sle_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp sle i8 %a, %b
%ret = zext i1 %cmp to i8
;; i8
define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {
-; PTX32: ld.global.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.global.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.global.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
; PTX64: ret
%a = load i8 addrspace(1)* %ptr
ret i8 %a
}
define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) {
-; PTX32: ld.shared.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.shared.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.shared.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
; PTX64: ret
%a = load i8 addrspace(3)* %ptr
ret i8 %a
}
define i8 @ld_local_i8(i8 addrspace(5)* %ptr) {
-; PTX32: ld.local.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.local.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.local.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
; PTX64: ret
%a = load i8 addrspace(5)* %ptr
ret i8 %a
;; i16
define i16 @ld_global_i16(i16 addrspace(1)* %ptr) {
-; PTX32: ld.global.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.global.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.global.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
; PTX64: ret
%a = load i16 addrspace(1)* %ptr
ret i16 %a
}
define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) {
-; PTX32: ld.shared.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.shared.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.shared.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
; PTX64: ret
%a = load i16 addrspace(3)* %ptr
ret i16 %a
}
define i16 @ld_local_i16(i16 addrspace(5)* %ptr) {
-; PTX32: ld.local.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.local.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.local.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
; PTX64: ret
%a = load i16 addrspace(5)* %ptr
ret i16 %a