From 9a70efafc7bb8892541999d417a45f1ef3ad4af2 Mon Sep 17 00:00:00 2001 From: Chad Rosier Date: Wed, 7 Oct 2015 17:39:18 +0000 Subject: [PATCH] [AArch64] Fold a floating-point multiply by power of two into fp conversion. Part of http://reviews.llvm.org/D13442 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@249576 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelLowering.cpp | 70 ++++++++++ test/CodeGen/AArch64/fcvt_combine.ll | 154 +++++++++++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 test/CodeGen/AArch64/fcvt_combine.ll diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index b4a7352a046..6af2d9d9d06 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -478,6 +478,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::FP_TO_UINT); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::ANY_EXTEND); @@ -7529,6 +7532,70 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Fold a floating-point multiply by power of two into floating-point to +/// fixed-point conversion. 
+///
+/// Recognizes a vector FP_TO_SINT/FP_TO_UINT whose operand is an FMUL by a
+/// constant splat of 2^C and replaces the pair with the NEON
+/// aarch64_neon_vcvtfp2fxs / aarch64_neon_vcvtfp2fxu intrinsic, passing C as
+/// the intrinsic's immediate operand.
+///
+/// \param N         The FP_TO_SINT or FP_TO_UINT node being combined.
+/// \param DAG       The DAG used to build the replacement nodes.
+/// \param Subtarget Queried for NEON support; the fold is NEON-only.
+/// \returns The replacement value, or an empty SDValue if N is left alone.
+static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  // getSimpleValueType() is only valid on simple (MVT) types, so bail out on
+  // an extended result type before it is queried below.
+  if (!N->getValueType(0).isSimple())
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+      Op.getOpcode() != ISD::FMUL)
+    return SDValue();
+
+  // The multiplier has to be a build_vector so it can be inspected as a splat.
+  SDValue ConstVec = Op->getOperand(1);
+  if (!isa<BuildVectorSDNode>(ConstVec))
+    return SDValue();
+
+  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+  uint32_t FloatBits = FloatTy.getSizeInBits();
+  if (FloatBits != 32 && FloatBits != 64)
+    return SDValue();
+
+  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  uint32_t IntBits = IntTy.getSizeInBits();
+  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
+    return SDValue();
+
+  // Avoid conversions where iN is larger than the float (e.g., float -> i64).
+  if (IntBits > FloatBits)
+    return SDValue();
+
+  BitVector UndefElements;
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+  // Results narrower than 64 bits are limited to a 32-bit immediate range.
+  int32_t Bits = IntBits == 64 ? 64 : 32;
+  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
+  // NOTE(review): -1 is presumably the "not a power-of-two splat" result of
+  // the helper -- confirm against its definition.  A log2 of 0 or one larger
+  // than Bits is rejected as well.
+  if (C == -1 || C == 0 || C > Bits)
+    return SDValue();
+
+  // Pick the integer vector type the intrinsic produces: same lane count and
+  // element width as the floating-point source.
+  MVT ResTy;
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  switch (NumLanes) {
+  default:
+    return SDValue();
+  case 2:
+    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+    break;
+  case 4:
+    // Only four 32-bit lanes match v4i32.  A four-lane f64 source would get a
+    // result whose element width disagrees with the source, and for an i64
+    // result no truncate is added below, so the replacement would not even
+    // have N's type.
+    if (FloatBits != 32)
+      return SDValue();
+    ResTy = MVT::v4i32;
+    break;
+  }
+
+  SDLoc DL(N);
+  bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
+                                      : Intrinsic::aarch64_neon_vcvtfp2fxu;
+  SDValue FixConv =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
+                  DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
+                  Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
+  // We can handle smaller integers by generating an extra trunc.
+  if (IntBits < FloatBits)
+    FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
+
+  return FixConv;
+}
+
 /// An EXTR instruction is made up of two shifts, ORed together.
This helper
 /// searches for and classifies those shifts.
 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
@@ -9400,6 +9467,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
     return performIntToFpCombine(N, DAG, Subtarget);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return performFpToIntCombine(N, DAG, Subtarget);
   case ISD::OR:
     return performORCombine(N, DCI, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
diff --git a/test/CodeGen/AArch64/fcvt_combine.ll b/test/CodeGen/AArch64/fcvt_combine.ll
new file mode 100644
index 00000000000..093ce4a4cd8
--- /dev/null
+++ b/test/CodeGen/AArch64/fcvt_combine.ll
@@ -0,0 +1,154 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-neon-syntax=apple -verify-machineinstrs -o - %s | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK-NOT: fmul.2s
+; CHECK: fcvtzs.2s v0, v0, #4
+; CHECK: ret
+define <2 x i32> @test1(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 1.600000e+01, float 1.600000e+01>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; CHECK-LABEL: test2
+; CHECK-NOT: fmul.4s
+; CHECK: fcvtzs.4s v0, v0, #3
+; CHECK: ret
+define <4 x i32> @test2(<4 x float> %f) {
+  %mul.i = fmul <4 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+  %vcvt.i = fptosi <4 x float> %mul.i to <4 x i32>
+  ret <4 x i32> %vcvt.i
+}
+
+; CHECK-LABEL: test3
+; CHECK-NOT: fmul.2d
+; CHECK: fcvtzs.2d v0, v0, #5
+; CHECK: ret
+define <2 x i64> @test3(<2 x double> %d) {
+  %mul.i = fmul <2 x double> %d, <double 3.200000e+01, double 3.200000e+01>
+  %vcvt.i = fptosi <2 x double> %mul.i to <2 x i64>
+  ret <2 x i64> %vcvt.i
+}
+
+; Truncate double to i32
+; CHECK-LABEL: test4
+; CHECK-NOT: fmul.2d v0, v0, #4
+; CHECK: fcvtzs.2d v0, v0
+; CHECK: xtn.2s
+; CHECK: ret
+define <2 x i32> @test4(<2 x double> %d) {
+  %mul.i = fmul <2 x double> %d, <double 1.600000e+01, double 1.600000e+01>
+  %vcvt.i = fptosi <2 x double> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Truncate float to i16
+; CHECK-LABEL: test5
+; CHECK-NOT: fmul.2s
+; CHECK: fcvtzs.2s v0, v0, #4
+; CHECK: ret
+define <2 x i16> @test5(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 1.600000e+01, float 1.600000e+01>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i16>
+  ret <2 x i16> %vcvt.i
+}
+
+; Don't convert float to i64
+; CHECK-LABEL: test6
+; CHECK: fmov.2s v1, #16.00000000
+; CHECK: fmul.2s v0, v0, v1
+; CHECK: fcvtl v0.2d, v0.2s
+; CHECK: fcvtzs.2d v0, v0
+; CHECK: ret
+define <2 x i64> @test6(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 1.600000e+01, float 1.600000e+01>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i64>
+  ret <2 x i64> %vcvt.i
+}
+
+; Check unsigned conversion.
+; CHECK-LABEL: test7
+; CHECK-NOT: fmul.2s
+; CHECK: fcvtzu.2s v0, v0, #4
+; CHECK: ret
+define <2 x i32> @test7(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 1.600000e+01, float 1.600000e+01>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Test which should not fold due to non-power of 2.
+; CHECK-LABEL: test8
+; CHECK: fmov.2s v1, #17.00000000
+; CHECK: fmul.2s v0, v0, v1
+; CHECK: fcvtzu.2s v0, v0
+; CHECK: ret
+define <2 x i32> @test8(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 1.700000e+01, float 1.700000e+01>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Test which should not fold due to non-matching power of 2.
+; CHECK-LABEL: test9
+; CHECK: fmul.2s v0, v0, v1
+; CHECK: fcvtzu.2s v0, v0
+; CHECK: ret
+define <2 x i32> @test9(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 1.600000e+01, float 8.000000e+00>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Don't combine all undefs.
+; CHECK-LABEL: test10
+; CHECK: fmul.2s v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: fcvtzu.2s v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: ret
+define <2 x i32> @test10(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float undef, float undef>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Combine if mix of undef and pow2.
+; CHECK-LABEL: test11
+; CHECK: fcvtzu.2s v0, v0, #3
+; CHECK: ret
+define <2 x i32> @test11(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float undef, float 8.000000e+00>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Don't combine when multiplied by 0.0.
+; CHECK-LABEL: test12
+; CHECK: fmul.2s v0, v0, v1
+; CHECK: fcvtzs.2s v0, v0
+; CHECK: ret
+define <2 x i32> @test12(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 0.000000e+00, float 0.000000e+00>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Test which should not fold due to power of 2 out of range (i.e., 2^33).
+; CHECK-LABEL: test13
+; CHECK: fmul.2s v0, v0, v1
+; CHECK: fcvtzs.2s v0, v0
+; CHECK: ret
+define <2 x i32> @test13(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 0x4200000000000000, float 0x4200000000000000>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Test case where const is max power of 2 (i.e., 2^32).
+; CHECK-LABEL: test14
+; CHECK: fcvtzs.2s v0, v0, #32
+; CHECK: ret
+define <2 x i32> @test14(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 0x41F0000000000000, float 0x41F0000000000000>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
-- 
2.34.1