From f71720231f6de9b2b7fe28edd179ae217a105329 Mon Sep 17 00:00:00 2001 From: Che-Liang Chiou Date: Mon, 28 Feb 2011 06:34:09 +0000 Subject: [PATCH] Add preliminary support for .f32 in the PTX backend. - Add appropriate TableGen patterns for fadd, fsub, fmul. - Add .f32 as the PTX type for the LLVM float type. - Allow parameters, return values, and global variable declarations to accept the float type. - Add appropriate test cases. Patch by Justin Holewinski git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@126636 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PTX/PTXAsmPrinter.cpp | 29 +++++++++- lib/Target/PTX/PTXISelLowering.cpp | 22 ++++++-- lib/Target/PTX/PTXInstrInfo.cpp | 8 +-- lib/Target/PTX/PTXInstrInfo.td | 42 +++++++++++++++ lib/Target/PTX/PTXRegisterInfo.td | 40 ++++++++++++++ test/CodeGen/PTX/add.ll | 14 +++++ test/CodeGen/PTX/ld_float.ll | 86 ++++++++++++++++++++++++++++++ test/CodeGen/PTX/mov.ll | 12 +++++ test/CodeGen/PTX/mul.ll | 25 +++++++++ test/CodeGen/PTX/st_float.ll | 78 +++++++++++++++++++++++++++ test/CodeGen/PTX/sub.ll | 14 +++++ 11 files changed, 360 insertions(+), 10 deletions(-) create mode 100644 test/CodeGen/PTX/ld_float.ll create mode 100644 test/CodeGen/PTX/mul.ll create mode 100644 test/CodeGen/PTX/st_float.ll diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp index a6059974ab3..25f26fa4c41 100644 --- a/lib/Target/PTX/PTXAsmPrinter.cpp +++ b/lib/Target/PTX/PTXAsmPrinter.cpp @@ -84,6 +84,7 @@ static const char PARAM_PREFIX[] = "__param_"; static const char *getRegisterTypeName(unsigned RegNo) { #define TEST_REGCLS(cls, clsstr) \ if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr; + TEST_REGCLS(RRegf32, f32); TEST_REGCLS(RRegs32, s32); TEST_REGCLS(Preds, pred); #undef TEST_REGCLS @@ -115,6 +116,21 @@ static const char *getStateSpaceName(unsigned addressSpace) { return NULL; } +static const char *getTypeName(const Type* type) { + while (true) { + switch (type->getTypeID()) { + 
default: llvm_unreachable("Unknown type"); + case Type::FloatTyID: return ".f32"; + case Type::IntegerTyID: return ".s32"; // TODO: Handle 64-bit types. + case Type::ArrayTyID: + case Type::PointerTyID: + type = dyn_cast<const SequentialType>(type)->getElementType(); + break; + } + } + return NULL; +} + bool PTXAsmPrinter::doFinalization(Module &M) { // XXX Temproarily remove global variables so that doFinalization() will not // emit them again (global variables are emitted at beginning). @@ -218,6 +234,15 @@ void PTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, case MachineOperand::MO_Register: OS << getRegisterName(MO.getReg()); break; + case MachineOperand::MO_FPImmediate: + APInt constFP = MO.getFPImm()->getValueAPF().bitcastToAPInt(); + if (constFP.getZExtValue() > 0) { + OS << "0F" << constFP.toString(16, false); + } + else { + OS << "0F00000000"; + } + break; } } @@ -265,8 +290,8 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) { decl += " "; } - // TODO: add types - decl += ".s32 "; + decl += getTypeName(gv->getType()); + decl += " "; decl += gvsym->getName(); diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp index e6d44907ed3..d30c9ecbe49 100644 --- a/lib/Target/PTX/PTXISelLowering.cpp +++ b/lib/Target/PTX/PTXISelLowering.cpp @@ -28,9 +28,12 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM) // Set up the register classes.
addRegisterClass(MVT::i1, PTX::PredsRegisterClass); addRegisterClass(MVT::i32, PTX::RRegs32RegisterClass); - + addRegisterClass(MVT::f32, PTX::RRegf32RegisterClass); + setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + // Customize translation of memory addresses setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); @@ -87,7 +90,8 @@ struct argmap_entry { bool operator==(MVT::SimpleValueType _VT) const { return VT == _VT; } } argmap[] = { argmap_entry(MVT::i1, PTX::PredsRegisterClass), - argmap_entry(MVT::i32, PTX::RRegs32RegisterClass) + argmap_entry(MVT::i32, PTX::RRegs32RegisterClass), + argmap_entry(MVT::f32, PTX::RRegf32RegisterClass) }; } // end anonymous namespace @@ -185,10 +189,18 @@ SDValue PTXTargetLowering:: if (Outs.size() == 0) return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain); - assert(Outs[0].VT == MVT::i32 && "Can return only basic types"); - SDValue Flag; - unsigned reg = PTX::R0; + unsigned reg; + + if (Outs[0].VT == MVT::i32) { + reg = PTX::R0; + } + else if (Outs[0].VT == MVT::f32) { + reg = PTX::F0; + } + else { + assert(false && "Can return only basic types"); + } MachineFunction &MF = DAG.getMachineFunction(); PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>(); diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp index 805759bcab1..f2e5e4c1102 100644 --- a/lib/Target/PTX/PTXInstrInfo.cpp +++ b/lib/Target/PTX/PTXInstrInfo.cpp @@ -28,6 +28,7 @@ static const struct map_entry { const int opcode; } map[] = { { &PTX::RRegs32RegClass, PTX::MOVrr }, + { &PTX::RRegf32RegClass, PTX::MOVrr }, { &PTX::PredsRegClass, PTX::MOVpp } }; @@ -35,12 +36,13 @@ void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DstReg, unsigned SrcReg, bool KillSrc) const { - for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i) - if (PTX::RRegs32RegClass.contains(DstReg, SrcReg)) { + for (int i = 0, e =
sizeof(map)/sizeof(map[0]); i != e; ++ i) { + if (map[i].cls->contains(DstReg, SrcReg)) { BuildMI(MBB, I, DL, - get(PTX::MOVrr), DstReg).addReg(SrcReg, getKillRegState(KillSrc)); + get(map[i].opcode), DstReg).addReg(SrcReg, getKillRegState(KillSrc)); return; } + } llvm_unreachable("Impossible reg-to-reg copy"); } diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td index 9a747788f6a..9d962b0e252 100644 --- a/lib/Target/PTX/PTXInstrInfo.td +++ b/lib/Target/PTX/PTXInstrInfo.td @@ -143,6 +143,18 @@ def PTXret // Instruction Class Templates //===----------------------------------------------------------------------===// +// Three-operand f32 instruction template +multiclass FLOAT3<string opcstr, SDNode opnode> { + def rr : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, RRegf32:$b), + !strconcat(opcstr, ".%type\t$d, $a, $b"), + [(set RRegf32:$d, (opnode RRegf32:$a, RRegf32:$b))]>; + def ri : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, f32imm:$b), + !strconcat(opcstr, ".%type\t$d, $a, $b"), + [(set RRegf32:$d, (opnode RRegf32:$a, fpimm:$b))]>; +} + multiclass INT3<string opcstr, SDNode opnode> { def rr : InstPTX<(outs RRegs32:$d), (ins RRegs32:$a, RRegs32:$b), @@ -204,6 +216,12 @@ multiclass PTX_ST { // Instructions //===----------------------------------------------------------------------===// +///===- Floating-Point Arithmetic Instructions ----------------------------===// + +defm FADD : FLOAT3<"add", fadd>; +defm FSUB : FLOAT3<"sub", fsub>; +defm FMUL : FLOAT3<"mul", fmul>; + ///===- Integer Arithmetic Instructions -----------------------------------===// defm ADD : INT3<"add", add>; @@ -223,6 +241,8 @@ let neverHasSideEffects = 1 in { : InstPTX<(outs Preds:$d), (ins Preds:$a), "mov.pred\t$d, $a", []>; def MOVrr : InstPTX<(outs RRegs32:$d), (ins RRegs32:$a), "mov.%type\t$d, $a", []>; + def FMOVrr + : InstPTX<(outs RRegf32:$d), (ins RRegf32:$a), "mov.f32\t$d, $a", []>; } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { @@ -232,8 +252,12 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def
MOVri : InstPTX<(outs RRegs32:$d), (ins i32imm:$a), "mov.s32\t$d, $a", [(set RRegs32:$d, imm:$a)]>; + def FMOVri + : InstPTX<(outs RRegf32:$d), (ins f32imm:$a), "mov.f32\t$d, $a", + [(set RRegf32:$d, fpimm:$a)]>; } +// Integer loads defm LDg : PTX_LD<"ld.global", RRegs32, load_global>; defm LDc : PTX_LD<"ld.const", RRegs32, load_constant>; defm LDl : PTX_LD<"ld.local", RRegs32, load_local>; @@ -243,12 +267,30 @@ defm LDs : PTX_LD<"ld.shared", RRegs32, load_shared>; def LDpi : InstPTX<(outs RRegs32:$d), (ins MEMpi:$a), "ld.param.%type\t$d, [$a]", []>; +// Floating-point loads +defm FLDg : PTX_LD<"ld.global", RRegf32, load_global>; +defm FLDc : PTX_LD<"ld.const", RRegf32, load_constant>; +defm FLDl : PTX_LD<"ld.local", RRegf32, load_local>; +defm FLDp : PTX_LD<"ld.param", RRegf32, load_parameter>; +defm FLDs : PTX_LD<"ld.shared", RRegf32, load_shared>; + +def FLDpi : InstPTX<(outs RRegf32:$d), (ins MEMpi:$a), + "ld.param.%type\t$d, [$a]", []>; + +// Integer stores defm STg : PTX_ST<"st.global", RRegs32, store_global>; defm STl : PTX_ST<"st.local", RRegs32, store_local>; // Store to parameter state space requires PTX 2.0 or higher? // defm STp : PTX_ST<"st.param", RRegs32, store_parameter>; defm STs : PTX_ST<"st.shared", RRegs32, store_shared>; +// Floating-point stores +defm FSTg : PTX_ST<"st.global", RRegf32, store_global>; +defm FSTl : PTX_ST<"st.local", RRegf32, store_local>; +// Store to parameter state space requires PTX 2.0 or higher? 
+// defm FSTp : PTX_ST<"st.param", RRegf32, store_parameter>; +defm FSTs : PTX_ST<"st.shared", RRegf32, store_shared>; + ///===- Control Flow Instructions -----------------------------------------===// let isReturn = 1, isTerminator = 1, isBarrier = 1 in { diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td index 22e2b343a0e..9158f0d31c7 100644 --- a/lib/Target/PTX/PTXRegisterInfo.td +++ b/lib/Target/PTX/PTXRegisterInfo.td @@ -85,6 +85,40 @@ def R29 : PTXReg<"r29">; def R30 : PTXReg<"r30">; def R31 : PTXReg<"r31">; +def F0 : PTXReg<"f0">; +def F1 : PTXReg<"f1">; +def F2 : PTXReg<"f2">; +def F3 : PTXReg<"f3">; +def F4 : PTXReg<"f4">; +def F5 : PTXReg<"f5">; +def F6 : PTXReg<"f6">; +def F7 : PTXReg<"f7">; +def F8 : PTXReg<"f8">; +def F9 : PTXReg<"f9">; +def F10 : PTXReg<"f10">; +def F11 : PTXReg<"f11">; +def F12 : PTXReg<"f12">; +def F13 : PTXReg<"f13">; +def F14 : PTXReg<"f14">; +def F15 : PTXReg<"f15">; +def F16 : PTXReg<"f16">; +def F17 : PTXReg<"f17">; +def F18 : PTXReg<"f18">; +def F19 : PTXReg<"f19">; +def F20 : PTXReg<"f20">; +def F21 : PTXReg<"f21">; +def F22 : PTXReg<"f22">; +def F23 : PTXReg<"f23">; +def F24 : PTXReg<"f24">; +def F25 : PTXReg<"f25">; +def F26 : PTXReg<"f26">; +def F27 : PTXReg<"f27">; +def F28 : PTXReg<"f28">; +def F29 : PTXReg<"f29">; +def F30 : PTXReg<"f30">; +def F31 : PTXReg<"f31">; + + //===----------------------------------------------------------------------===// // Register classes //===----------------------------------------------------------------------===// @@ -100,3 +134,9 @@ def RRegs32 : RegisterClass<"PTX", [i32], 32, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31]>; + +def RRegf32 : RegisterClass<"PTX", [f32], 32, + [F0, F1, F2, F3, F4, F5, F6, F7, + F8, F9, F10, F11, F12, F13, F14, F15, + F16, F17, F18, F19, F20, F21, F22, F23, + F24, F25, F26, F27, F28, F29, F30, F31]>; diff --git a/test/CodeGen/PTX/add.ll 
b/test/CodeGen/PTX/add.ll index 1259d03e96c..9e777ae30cb 100644 --- a/test/CodeGen/PTX/add.ll +++ b/test/CodeGen/PTX/add.ll @@ -13,3 +13,17 @@ define ptx_device i32 @t2(i32 %x) { ; CHECK: ret; ret i32 %z } + +define ptx_device float @t3(float %x, float %y) { +; CHECK: add.f32 f0, f1, f2 +; CHECK-NEXT: ret; + %z = fadd float %x, %y + ret float %z +} + +define ptx_device float @t4(float %x) { +; CHECK: add.f32 f0, f1, 0F3F800000; +; CHECK-NEXT: ret; + %z = fadd float %x, 1.0 + ret float %z +} diff --git a/test/CodeGen/PTX/ld_float.ll b/test/CodeGen/PTX/ld_float.ll new file mode 100644 index 00000000000..62d2c36e64a --- /dev/null +++ b/test/CodeGen/PTX/ld_float.ll @@ -0,0 +1,86 @@ +; RUN: llc < %s -march=ptx | FileCheck %s + +;CHECK: .extern .global .f32 array[]; +@array = external global [10 x float] + +;CHECK: .extern .const .f32 array_constant[]; +@array_constant = external addrspace(1) constant [10 x float] + +;CHECK: .extern .local .f32 array_local[]; +@array_local = external addrspace(2) global [10 x float] + +;CHECK: .extern .shared .f32 array_shared[]; +@array_shared = external addrspace(4) global [10 x float] + +define ptx_device float @t1(float* %p) { +entry: +;CHECK: ld.global.f32 f0, [r1]; +;CHECK-NEXT: ret; + %x = load float* %p + ret float %x +} + +define ptx_device float @t2(float* %p) { +entry: +;CHECK: ld.global.f32 f0, [r1+4]; +;CHECK-NEXT: ret; + %i = getelementptr float* %p, i32 1 + %x = load float* %i + ret float %x +} + +define ptx_device float @t3(float* %p, i32 %q) { +entry: +;CHECK: shl.b32 r0, r2, 2; +;CHECK-NEXT: add.s32 r0, r1, r0; +;CHECK-NEXT: ld.global.f32 f0, [r0]; +;CHECK-NEXT: ret; + %i = getelementptr float* %p, i32 %q + %x = load float* %i + ret float %x +} + +define ptx_device float @t4_global() { +entry: +;CHECK: ld.global.f32 f0, [array]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float]* @array, i32 0, i32 0 + %x = load float* %i + ret float %x +} + +define ptx_device float @t4_const() { +entry: +;CHECK: ld.const.f32 f0, 
[array_constant]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float] addrspace(1)* @array_constant, i32 0, i32 0 + %x = load float addrspace(1)* %i + ret float %x +} + +define ptx_device float @t4_local() { +entry: +;CHECK: ld.local.f32 f0, [array_local]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float] addrspace(2)* @array_local, i32 0, i32 0 + %x = load float addrspace(2)* %i + ret float %x +} + +define ptx_device float @t4_shared() { +entry: +;CHECK: ld.shared.f32 f0, [array_shared]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float] addrspace(4)* @array_shared, i32 0, i32 0 + %x = load float addrspace(4)* %i + ret float %x +} + +define ptx_device float @t5() { +entry: +;CHECK: ld.global.f32 f0, [array+4]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float]* @array, i32 0, i32 1 + %x = load float* %i + ret float %x +} diff --git a/test/CodeGen/PTX/mov.ll b/test/CodeGen/PTX/mov.ll index c365e9beb89..d201a7867aa 100644 --- a/test/CodeGen/PTX/mov.ll +++ b/test/CodeGen/PTX/mov.ll @@ -11,3 +11,15 @@ define ptx_device i32 @t2(i32 %x) { ; CHECK: ret; ret i32 %x } + +define ptx_device float @t3() { +; CHECK: mov.f32 f0, 0F00000000; +; CHECK-NEXT: ret; + ret float 0.0 +} + +define ptx_device float @t4(float %x) { +; CHECK: mov.f32 f0, f1; +; CHECK-NEXT: ret; + ret float %x +} diff --git a/test/CodeGen/PTX/mul.ll b/test/CodeGen/PTX/mul.ll new file mode 100644 index 00000000000..01871da4893 --- /dev/null +++ b/test/CodeGen/PTX/mul.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=ptx | FileCheck %s + +;define ptx_device i32 @t1(i32 %x, i32 %y) { +; %z = mul i32 %x, %y +; ret i32 %z +;} + +;define ptx_device i32 @t2(i32 %x) { +; %z = mul i32 %x, 1 +; ret i32 %z +;} + +define ptx_device float @t3(float %x, float %y) { +; CHECK: mul.f32 f0, f1, f2 +; CHECK-NEXT: ret; + %z = fmul float %x, %y + ret float %z +} + +define ptx_device float @t4(float %x) { +; CHECK: mul.f32 f0, f1, 0F40A00000; +; CHECK-NEXT: ret; + %z = fmul float %x, 5.0 + ret float %z +} diff --git 
a/test/CodeGen/PTX/st_float.ll b/test/CodeGen/PTX/st_float.ll new file mode 100644 index 00000000000..f0e00105f93 --- /dev/null +++ b/test/CodeGen/PTX/st_float.ll @@ -0,0 +1,78 @@ +; RUN: llc < %s -march=ptx | FileCheck %s + +;CHECK: .extern .global .f32 array[]; +@array = external global [10 x float] + +;CHECK: .extern .const .f32 array_constant[]; +@array_constant = external addrspace(1) constant [10 x float] + +;CHECK: .extern .local .f32 array_local[]; +@array_local = external addrspace(2) global [10 x float] + +;CHECK: .extern .shared .f32 array_shared[]; +@array_shared = external addrspace(4) global [10 x float] + +define ptx_device void @t1(float* %p, float %x) { +entry: +;CHECK: st.global.f32 [r1], f1; +;CHECK-NEXT: ret; + store float %x, float* %p + ret void +} + +define ptx_device void @t2(float* %p, float %x) { +entry: +;CHECK: st.global.f32 [r1+4], f1; +;CHECK-NEXT: ret; + %i = getelementptr float* %p, i32 1 + store float %x, float* %i + ret void +} + +define ptx_device void @t3(float* %p, i32 %q, float %x) { +;CHECK: .reg .s32 r0; +entry: +;CHECK: shl.b32 r0, r2, 2; +;CHECK-NEXT: add.s32 r0, r1, r0; +;CHECK-NEXT: st.global.f32 [r0], f1; +;CHECK-NEXT: ret; + %i = getelementptr float* %p, i32 %q + store float %x, float* %i + ret void +} + +define ptx_device void @t4_global(float %x) { +entry: +;CHECK: st.global.f32 [array], f1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float]* @array, i32 0, i32 0 + store float %x, float* %i + ret void +} + +define ptx_device void @t4_local(float %x) { +entry: +;CHECK: st.local.f32 [array_local], f1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float] addrspace(2)* @array_local, i32 0, i32 0 + store float %x, float addrspace(2)* %i + ret void +} + +define ptx_device void @t4_shared(float %x) { +entry: +;CHECK: st.shared.f32 [array_shared], f1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float] addrspace(4)* @array_shared, i32 0, i32 0 + store float %x, float addrspace(4)* %i + ret void +} + +define ptx_device void 
@t5(float %x) { +entry: +;CHECK: st.global.f32 [array+4], f1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float]* @array, i32 0, i32 1 + store float %x, float* %i + ret void +} diff --git a/test/CodeGen/PTX/sub.ll b/test/CodeGen/PTX/sub.ll index aab3fdadad1..e11decaf5cf 100644 --- a/test/CodeGen/PTX/sub.ll +++ b/test/CodeGen/PTX/sub.ll @@ -13,3 +13,17 @@ define ptx_device i32 @t2(i32 %x) { ;CHECK: ret; ret i32 %z } + +define ptx_device float @t3(float %x, float %y) { +; CHECK: sub.f32 f0, f1, f2 +; CHECK-NEXT: ret; + %z = fsub float %x, %y + ret float %z +} + +define ptx_device float @t4(float %x) { +; CHECK: add.f32 f0, f1, 0FBF800000; +; CHECK-NEXT: ret; + %z = fsub float %x, 1.0 + ret float %z +} -- 2.34.1