return false;
}
- /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
- /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
- /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
- /// is expanded to mul + add.
- virtual bool isFMAFasterThanMulAndAdd(EVT) const {
+ /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+ /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+ /// expanded to FMAs when this method returns true, otherwise fmuladd is
+ /// expanded to fmul + fadd.
+ ///
+ /// NOTE: This may be called before legalization on types for which FMAs are
+ /// not legal, but should return true if those types will eventually legalize
+ /// to types that support FMAs. After legalization, it will only be called on
+ /// types that support FMAs (via Legal or Custom actions)
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT) const {
return false;
}
// FADD -> FMA combines:
if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
DAG.getTarget().Options.UnsafeFPMath) &&
- DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
- TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+ DAG.getTarget().getTargetLowering()->isFMAFasterThanFMulAndFAdd(VT) &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
// fold (fadd (fmul x, y), z) -> (fma x, y, z)
if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse())
// FSUB -> FMA combines:
if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
DAG.getTarget().Options.UnsafeFPMath) &&
- DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
- TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+ DAG.getTarget().getTargetLowering()->isFMAFasterThanFMulAndFAdd(VT) &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
// fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse())
case Intrinsic::fmuladd: {
EVT VT = TLI->getValueType(I.getType());
if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
- TLI->isFMAFasterThanMulAndAdd(VT)) {
+ TLI->isFMAFasterThanFMulAndFAdd(VT)) {
setValue(&I, DAG.getNode(ISD::FMA, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),
return SDValue();
}
+bool
+AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+ VT = VT.getScalarType();
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f16:
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ case MVT::f128:
+ return false;
+ default:
+ break;
+ }
+
+ return false;
+}
+
AArch64TargetLowering::ConstraintType
AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
if (Constraint.size() == 1) {
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
- /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
- /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
- /// is expanded to mul + add.
- virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+ /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+ /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+ /// expanded to FMAs when this method returns true, otherwise fmuladd is
+ /// expanded to fmul + fadd.
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
ConstraintType getConstraintType(const std::string &Constraint) const;
return true;
}
-/// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
-/// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
-/// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
-/// is expanded to mul + add.
-bool PPCTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const {
+bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+ VT = VT.getScalarType();
+
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
case MVT::f64:
- case MVT::v4f32:
return true;
default:
break;
/// relative to software emulation.
virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast = 0) const;
- /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
- /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
- /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
- /// is expanded to mul + add.
- virtual bool isFMAFasterThanMulAndAdd(EVT VT) const;
+ /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+ /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+ /// expanded to FMAs when this method returns true, otherwise fmuladd is
+ /// expanded to fmul + fadd.
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
private:
SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
MaxStoresPerMemsetOptSize = 0;
}
+bool
+SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+ VT = VT.getScalarType();
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ case MVT::f128:
+ return false;
+ default:
+ break;
+ }
+
+ return false;
+}
+
bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
return Imm.isZero() || Imm.isNegZero();
virtual EVT getSetCCResultType(LLVMContext &, EVT) const {
return MVT::i32;
}
- virtual bool isFMAFasterThanMulAndAdd(EVT) const LLVM_OVERRIDE {
- return true;
- }
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const LLVM_OVERRIDE;
virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
virtual const char *getTargetNodeName(unsigned Opcode) const LLVM_OVERRIDE;
return false;
}
+bool
+X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+ if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
+ return false;
+
+ VT = VT.getScalarType();
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
// i16 instructions are longer (0x66 prefix) and potentially slower.
return !(VT1 == MVT::i32 && VT2 == MVT::i16);
virtual bool isZExtFree(EVT VT1, EVT VT2) const;
virtual bool isZExtFree(SDValue Val, EVT VT2) const;
- /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
- /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
- /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
- /// is expanded to mul + add.
- virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+ /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+ /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+ /// expanded to FMAs when this method returns true, otherwise fmuladd is
+ /// expanded to fmul + fadd.
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
/// isNarrowingProfitable - Return true if it's profitable to narrow
/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -check-prefix=CHECK-NOFAST
declare float @llvm.fma.f32(float, float, float)
declare double @llvm.fma.f64(double, double, double)
define float @test_fmadd(float %a, float %b, float %c) {
; CHECK: test_fmadd:
+; CHECK-NOFAST: test_fmadd:
%val = call float @llvm.fma.f32(float %a, float %b, float %c)
; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret float %val
}
define float @test_fmsub(float %a, float %b, float %c) {
; CHECK: test_fmsub:
+; CHECK-NOFAST: test_fmsub:
%nega = fsub float -0.0, %a
%val = call float @llvm.fma.f32(float %nega, float %b, float %c)
; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret float %val
}
define float @test_fnmadd(float %a, float %b, float %c) {
; CHECK: test_fnmadd:
+; CHECK-NOFAST: test_fnmadd:
%negc = fsub float -0.0, %c
%val = call float @llvm.fma.f32(float %a, float %b, float %negc)
; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret float %val
}
define float @test_fnmsub(float %a, float %b, float %c) {
; CHECK: test_fnmsub:
+; CHECK-NOFAST: test_fnmsub:
%nega = fsub float -0.0, %a
%negc = fsub float -0.0, %c
%val = call float @llvm.fma.f32(float %nega, float %b, float %negc)
; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret float %val
}
define double @testd_fmadd(double %a, double %b, double %c) {
; CHECK: testd_fmadd:
+; CHECK-NOFAST: testd_fmadd:
%val = call double @llvm.fma.f64(double %a, double %b, double %c)
; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NOFAST: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret double %val
}
define double @testd_fmsub(double %a, double %b, double %c) {
; CHECK: testd_fmsub:
+; CHECK-NOFAST: testd_fmsub:
%nega = fsub double -0.0, %a
%val = call double @llvm.fma.f64(double %nega, double %b, double %c)
; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NOFAST: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret double %val
}
define double @testd_fnmadd(double %a, double %b, double %c) {
; CHECK: testd_fnmadd:
+; CHECK-NOFAST: testd_fnmadd:
%negc = fsub double -0.0, %c
%val = call double @llvm.fma.f64(double %a, double %b, double %negc)
; CHECK: fnmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NOFAST: fnmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret double %val
}
define double @testd_fnmsub(double %a, double %b, double %c) {
; CHECK: testd_fnmsub:
+; CHECK-NOFAST: testd_fnmsub:
%nega = fsub double -0.0, %a
%negc = fsub double -0.0, %c
%val = call double @llvm.fma.f64(double %nega, double %b, double %negc)
; CHECK: fnmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NOFAST: fnmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret double %val
}
define float @test_fmadd_unfused(float %a, float %b, float %c) {
; CHECK: test_fmadd_unfused:
+; CHECK-NOFAST: test_fmadd_unfused:
%prod = fmul float %b, %c
%sum = fadd float %a, %prod
; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-NOT: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret float %sum
}
define float @test_fmsub_unfused(float %a, float %b, float %c) {
; CHECK: test_fmsub_unfused:
+; CHECK-NOFAST: test_fmsub_unfused:
%prod = fmul float %b, %c
%diff = fsub float %a, %prod
; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-NOT: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret float %diff
}
define float @test_fnmadd_unfused(float %a, float %b, float %c) {
; CHECK: test_fnmadd_unfused:
+; CHECK-NOFAST: test_fnmadd_unfused:
%nega = fsub float -0.0, %a
%prod = fmul float %b, %c
%sum = fadd float %nega, %prod
; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-NOT: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret float %sum
}
define float @test_fnmsub_unfused(float %a, float %b, float %c) {
; CHECK: test_fnmsub_unfused:
+; CHECK-NOFAST: test_fnmsub_unfused:
%nega = fsub float -0.0, %a
%prod = fmul float %b, %c
%diff = fsub float %nega, %prod
; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-NOT: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fneg {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret float %diff
}
ret void
}
+
+declare fp128 @llvm.fma.f128(fp128, fp128, fp128)
+
+define void @test_fma(fp128 %fp128) {
+; CHECK: test_fma:
+
+ %fmafp128 = call fp128 @llvm.fma.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128)
+ store fp128 %fmafp128, fp128* @varfp128
+; CHECK: bl fmal
+
+ ret void
+}
+
+declare fp128 @llvm.fmuladd.f128(fp128, fp128, fp128)
+
+define void @test_fmuladd(fp128 %fp128) {
+; CHECK: test_fmuladd:
+
+ %fmuladdfp128 = call fp128 @llvm.fmuladd.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128)
+ store fp128 %fmuladdfp128, fp128* @varfp128
+; CHECK-NOT: bl fmal
+; CHECK: bl __multf3
+; CHECK: bl __addtf3
+
+ ret void
+}
--- /dev/null
+; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float> %val, <2 x float>, <2 x float>)
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float> %val, <4 x float>, <4 x float>)
+declare <8 x float> @llvm.fmuladd.v8f32(<8 x float> %val, <8 x float>, <8 x float>)
+declare <2 x double> @llvm.fmuladd.v2f64(<2 x double> %val, <2 x double>, <2 x double>)
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double> %val, <4 x double>, <4 x double>)
+
+define <2 x float> @v2f32_fmuladd(<2 x float> %x) nounwind readnone {
+entry:
+ %fmuladd = call <2 x float> @llvm.fmuladd.v2f32 (<2 x float> %x, <2 x float> %x, <2 x float> %x)
+ ret <2 x float> %fmuladd
+}
+; fmuladd (<2 x float>) is promoted to fmuladd (<4 x float>)
+; CHECK: v2f32_fmuladd:
+; CHECK: vmaddfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+define <4 x float> @v4f32_fmuladd(<4 x float> %x) nounwind readnone {
+entry:
+ %fmuladd = call <4 x float> @llvm.fmuladd.v4f32 (<4 x float> %x, <4 x float> %x, <4 x float> %x)
+ ret <4 x float> %fmuladd
+}
+; CHECK: v4f32_fmuladd:
+; CHECK: vmaddfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+define <8 x float> @v8f32_fmuladd(<8 x float> %x) nounwind readnone {
+entry:
+ %fmuladd = call <8 x float> @llvm.fmuladd.v8f32 (<8 x float> %x, <8 x float> %x, <8 x float> %x)
+ ret <8 x float> %fmuladd
+}
+; CHECK: v8f32_fmuladd:
+; CHECK: vmaddfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vmaddfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+define <2 x double> @v2f64_fmuladd(<2 x double> %x) nounwind readnone {
+entry:
+ %fmuladd = call <2 x double> @llvm.fmuladd.v2f64 (<2 x double> %x, <2 x double> %x, <2 x double> %x)
+ ret <2 x double> %fmuladd
+}
+; CHECK: v2f64_fmuladd:
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+define <4 x double> @v4f64_fmuladd(<4 x double> %x) nounwind readnone {
+entry:
+ %fmuladd = call <4 x double> @llvm.fmuladd.v4f64 (<4 x double> %x, <4 x double> %x, <4 x double> %x)
+ ret <4 x double> %fmuladd
+}
+; CHECK: v4f64_fmuladd:
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
--- /dev/null
+; RUN: llc -march=x86 -mattr=+fma4 -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+; RUN: llc -march=x86 -mtriple=x86_64-apple-darwin < %s | FileCheck %s --check-prefix=CHECK-NOFMA
+
+; CHECK: fmafunc
+define <3 x float> @fmafunc(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+
+; CHECK-NOT: vmulps
+; CHECK-NOT: vaddps
+; CHECK: vfmaddps
+; CHECK-NOT: vmulps
+; CHECK-NOT: vaddps
+
+; CHECK-NOFMA-NOT: calll
+; CHECK-NOFMA: vmulps
+; CHECK-NOFMA: vaddps
+; CHECK-NOFMA-NOT: calll
+
+ %ret = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c)
+ ret <3 x float> %ret
+}
+
+declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) nounwind readnone
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
+
+; CHECK: test_x86_fmadd_ps_y_wide
+; CHECK: vfmadd213ps
+; CHECK: vfmadd213ps
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_y_wide
+; CHECK_FMA4: vfmaddps
+; CHECK_FMA4: vfmaddps
+; CHECK_FMA4: ret
+define <16 x float> @test_x86_fmadd_ps_y_wide(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ %x = fmul <16 x float> %a0, %a1
+ %res = fadd <16 x float> %x, %a2
+ ret <16 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps_y_wide
+; CHECK: vfmsub213ps
+; CHECK: vfmsub213ps
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_y_wide
+; CHECK_FMA4: vfmsubps
+; CHECK_FMA4: vfmsubps
+; CHECK_FMA4: ret
+define <16 x float> @test_x86_fmsub_ps_y_wide(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ %x = fmul <16 x float> %a0, %a1
+ %res = fsub <16 x float> %x, %a2
+ ret <16 x float> %res
+}
+
+; CHECK: test_x86_fnmadd_ps_y_wide
+; CHECK: vfnmadd213ps
+; CHECK: vfnmadd213ps
+; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps_y_wide
+; CHECK_FMA4: vfnmaddps
+; CHECK_FMA4: vfnmaddps
+; CHECK_FMA4: ret
+define <16 x float> @test_x86_fnmadd_ps_y_wide(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ %x = fmul <16 x float> %a0, %a1
+ %res = fsub <16 x float> %a2, %x
+ ret <16 x float> %res
+}
+
+; CHECK: test_x86_fnmsub_ps_y_wide
+; CHECK: vfnmsub213ps
+; CHECK: vfnmsub213ps
+; CHECK: ret
+define <16 x float> @test_x86_fnmsub_ps_y_wide(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ %x = fmul <16 x float> %a0, %a1
+ %y = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
+ %res = fsub <16 x float> %y, %a2
+ ret <16 x float> %res
+}
+
+; CHECK: test_x86_fmadd_pd_y_wide
+; CHECK: vfmadd213pd
+; CHECK: vfmadd213pd
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_pd_y_wide
+; CHECK_FMA4: vfmaddpd
+; CHECK_FMA4: vfmaddpd
+; CHECK_FMA4: ret
+define <8 x double> @test_x86_fmadd_pd_y_wide(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+ %x = fmul <8 x double> %a0, %a1
+ %res = fadd <8 x double> %x, %a2
+ ret <8 x double> %res
+}
+
+; CHECK: test_x86_fmsub_pd_y_wide
+; CHECK: vfmsub213pd
+; CHECK: vfmsub213pd
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd_y_wide
+; CHECK_FMA4: vfmsubpd
+; CHECK_FMA4: vfmsubpd
+; CHECK_FMA4: ret
+define <8 x double> @test_x86_fmsub_pd_y_wide(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+ %x = fmul <8 x double> %a0, %a1
+ %res = fsub <8 x double> %x, %a2
+ ret <8 x double> %res
+}
; RUN: llc -march=x86 -mattr=+fma4 -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+; RUN: llc -march=x86 -mtriple=x86_64-apple-darwin < %s | FileCheck %s --check-prefix=CHECK-NOFMA
; CHECK: fmafunc
define <16 x float> @fmafunc(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
+
; CHECK-NOT: vmulps
; CHECK-NOT: vaddps
; CHECK: vfmaddps
; CHECK: vfmaddps
; CHECK-NOT: vmulps
; CHECK-NOT: vaddps
+
+; CHECK-NOFMA-NOT: calll
+; CHECK-NOFMA: vmulps
+; CHECK-NOFMA: vaddps
+; CHECK-NOFMA-NOT: calll
+; CHECK-NOFMA: vmulps
+; CHECK-NOFMA: vaddps
+; CHECK-NOFMA-NOT: calll
+
%ret = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c)
ret <16 x float> %ret
}
declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
-
-
-