From 6e3a667705ac8f405e91cc378a299548834c11aa Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 11 Jan 2016 22:01:48 +0000 Subject: [PATCH] AMDGPU: Implement {{s|u}}int_to_fp i64 -> f32 The old lowering for uint_to_fp failed OpenCL conformance. It might be OK for fast-math mode, but I'm not sure. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@257393 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 117 +++++++++++++++++++---- lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + test/CodeGen/AMDGPU/sint_to_fp.i64.ll | 62 ++++++++++++ test/CodeGen/AMDGPU/uint_to_fp.i64.ll | 57 +++++++++++ test/CodeGen/AMDGPU/uint_to_fp.ll | 16 ++-- 5 files changed, 227 insertions(+), 26 deletions(-) create mode 100644 test/CodeGen/AMDGPU/sint_to_fp.i64.ll create mode 100644 test/CodeGen/AMDGPU/uint_to_fp.i64.ll diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 8f63fd61571..2b8032e38a5 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2223,6 +2223,91 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); } +SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + // Unsigned + // cul2f(ulong u) + //{ + // uint lz = clz(u); + // uint e = (u != 0) ? 127U + 63U - lz : 0; + // u = (u << lz) & 0x7fffffffffffffffUL; + // ulong t = u & 0xffffffffffUL; + // uint v = (e << 23) | (uint)(u >> 40); + // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U); + // return as_float(v + r); + //} + // Signed + // cl2f(long l) + //{ + // long s = l >> 63; + // float r = cul2f((l + s) ^ s); + // return s ? 
-r : r; + //} + + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + SDValue L = Src; + + SDValue S; + if (Signed) { + const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64); + S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit); + + SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S); + L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S); + } + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::f32); + + + SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32); + SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64); + SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L); + LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ); + + SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32); + SDValue E = DAG.getSelect(SL, MVT::i32, + DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE), + DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ), + ZeroI32); + + SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64, + DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ), + DAG.getConstant((-1ULL) >> 1, SL, MVT::i64)); + + SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U, + DAG.getConstant(0xffffffffffULL, SL, MVT::i64)); + + SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64, + U, DAG.getConstant(40, SL, MVT::i64)); + + SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32, + DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)), + DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl)); + + SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64); + SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT); + SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ); + + SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One); + + SDValue R = DAG.getSelect(SL, MVT::i32, + RCmp, + One, + DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32)); + R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R); + R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R); + + if (!Signed) + return R; + + SDValue RNeg = 
DAG.getNode(ISD::FNEG, SL, MVT::f32, R); + return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R); +} + SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const { SDLoc SL(Op); @@ -2248,35 +2333,29 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue S0 = Op.getOperand(0); - if (S0.getValueType() != MVT::i64) - return SDValue(); + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); EVT DestVT = Op.getValueType(); if (DestVT == MVT::f64) return LowerINT_TO_FP64(Op, DAG, false); - assert(DestVT == MVT::f32); + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, false); - SDLoc DL(Op); - - // f32 uint_to_fp i64 - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(0, DL, MVT::i32)); - SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(1, DL, MVT::i32)); - SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); - // TODO: Should this propagate fast-math-flags? 
- FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, - DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 - return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); + return SDValue(); } SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); + + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, true); + + if (DestVT == MVT::f64) return LowerINT_TO_FP64(Op, DAG, true); return SDValue(); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 952fd4cc502..65e4a0a7186 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -56,6 +56,7 @@ private: SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; diff --git a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll new file mode 100644 index 00000000000..138b93b16d8 --- /dev/null +++ b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -0,0 +1,62 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s + +; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600 + +; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f32: +define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { + %result = sitofp i64 %in to float + store 
float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_sint_to_fp_i64_to_f32: +; GCN: {{buffer|flat}}_load_dwordx2 + +; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 63 +; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\]}}, 63, {{v\[[0-9]+:[0-9]+\]}} +; GCN: v_xor_b32 + +; GCN: v_ffbh_u32 +; GCN: v_ffbh_u32 +; GCN: v_cndmask +; GCN: v_cndmask + +; GCN-DAG: v_cmp_eq_i64 +; GCN-DAG: v_cmp_lt_u64 + +; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 [[SIGN_SEL:v[0-9]+]], +; GCN: {{buffer|flat}}_store_dword [[SIGN_SEL]] +define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %in.gep + %result = sitofp i64 %val to float + store float %result, float addrspace(1)* %out.gep + ret void +} + +; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64: +define void @s_sint_to_fp_v2i64(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ + %result = sitofp <2 x i64> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64: +define void @v_sint_to_fp_v4i64(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid + %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid + %value = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep + %result = sitofp <4 x i64> %value to <4 x float> + store <4 x float> %result, <4 x float> addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll new file mode 
100644 index 00000000000..3ab11442d5c --- /dev/null +++ b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s + +; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600 + +; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32: +define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { + %result = uitofp i64 %in to float + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_uint_to_fp_i64_to_f32: +; GCN: {{buffer|flat}}_load_dwordx2 + +; GCN: v_ffbh_u32 +; GCN: v_ffbh_u32 +; GCN: v_cndmask +; GCN: v_cndmask + +; GCN-DAG: v_cmp_eq_i64 +; GCN-DAG: v_cmp_lt_u64 + +; GCN: v_add_i32_e32 [[VR:v[0-9]+]] +; GCN: {{buffer|flat}}_store_dword [[VR]] +define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %in.gep + %result = uitofp i64 %val to float + store float %result, float addrspace(1)* %out.gep + ret void +} + +; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64: +define void @s_uint_to_fp_v2i64(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ + %result = uitofp <2 x i64> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64: +define void @v_uint_to_fp_v4i64(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid + %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid + 
%value = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep + %result = uitofp <4 x i64> %value to <4 x float> + store <4 x float> %result, <4 x float> addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/uint_to_fp.ll b/test/CodeGen/AMDGPU/uint_to_fp.ll index c12db507ca1..a3343d1e2d9 100644 --- a/test/CodeGen/AMDGPU/uint_to_fp.ll +++ b/test/CodeGen/AMDGPU/uint_to_fp.ll @@ -115,15 +115,17 @@ define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* ret void } +; FIXME: Repeated here to test r600 ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32: -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000 -; SI: s_endpgm +; R600: FFBH_UINT +; R600: FFBH_UINT +; R600: CNDE_INT +; R600: CNDE_INT + +; R600-DAG: SETGT_UINT +; R600-DAG: SETGT_UINT +; R600-DAG: SETE_INT -; R600: UINT_TO_FLT -; R600: UINT_TO_FLT -; R600: MULADD_IEEE define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { entry: %cvt = uitofp i64 %in to float -- 2.34.1