SDValue visitFADDForFMACombine(SDNode *N);
SDValue visitFSUBForFMACombine(SDNode *N);
+ SDValue visitFMULForFMACombine(SDNode *N);
SDValue XformToShuffleWithZero(SDNode *N);
SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS);
return SDValue();
}
+/// Try to perform FMA combining on a given FMUL node.
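+/// The folds below distribute a multiply over an fadd/fsub of +/-1.0, e.g.
+/// (fmul (fadd x, 1.0), y) -> (fma x, y, y), using FMA or FMAD as available.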
+SDValue DAGCombiner::visitFMULForFMACombine(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+
+ assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
+
+ const TargetOptions &Options = DAG.getTarget().Options;
+ bool AllowFusion =
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+
+ // Floating-point multiply-add with intermediate rounding.
+ bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+
+ // Floating-point multiply-add without intermediate rounding.
+ bool HasFMA =
+ AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
+
+ // No valid opcode, do not combine.
+ if (!HasFMAD && !HasFMA)
+ return SDValue();
+
+ // Always prefer FMAD to FMA for precision.
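+ // (FMAD keeps the intermediate rounding step, so its result stays closer
+ // to what the original separately rounded operations would produce.)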
+ unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+ bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+
+ // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y)
+ // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y))
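+ // (Both rely on distributing the multiply: (x +/- 1.0) * y == x*y +/- y.)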
+ auto FuseFADD = [&](SDValue X, SDValue Y) {
+ if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
+ auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
+ if (XC1 && XC1->isExactlyValue(+1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
+ if (XC1 && XC1->isExactlyValue(-1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+ DAG.getNode(ISD::FNEG, SL, VT, Y));
+ }
+ return SDValue();
+ };
+
+ if (SDValue FMA = FuseFADD(N0, N1))
+ return FMA;
+ if (SDValue FMA = FuseFADD(N1, N0))
+ return FMA;
+
+ // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y)
+ // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y))
+ // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y))
+ // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y)
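+ // (Likewise by distribution, e.g. (1.0 - x) * y == y - x*y == fma(-x, y, y).)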
+ auto FuseFSUB = [&](SDValue X, SDValue Y) {
+ if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
+ auto XC0 = isConstOrConstSplatFP(X.getOperand(0));
+ if (XC0 && XC0->isExactlyValue(+1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+ Y);
+ if (XC0 && XC0->isExactlyValue(-1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+ DAG.getNode(ISD::FNEG, SL, VT, Y));
+
+ auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
+ if (XC1 && XC1->isExactlyValue(+1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+ DAG.getNode(ISD::FNEG, SL, VT, Y));
+ if (XC1 && XC1->isExactlyValue(-1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
+ }
+ return SDValue();
+ };
+
+ if (SDValue FMA = FuseFSUB(N0, N1))
+ return FMA;
+ if (SDValue FMA = FuseFSUB(N1, N0))
+ return FMA;
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitFADD(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
}
}
+ // FMUL -> FMA combines:
+ if (SDValue Fused = visitFMULForFMACombine(N)) {
+ AddToWorklist(Fused.getNode());
+ return Fused;
+ }
+
return SDValue();
}
ret void
}
+;
+; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
+;
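+; Note: v_mac_f32 dst, src0, src1 computes dst = src0 * src1 + dst, so
+; fma(x, y, y) can keep y in the destination register, while v_mad_f32
+; takes an explicit (optionally negated) third operand for the fneg cases.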
+
+; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
+; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
+define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %a = fadd float %x, 1.0
+ %m = fmul float %a, %y
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
+; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
+define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %a = fadd float %x, 1.0
+ %m = fmul float %y, %a
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
+; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %a = fadd float %x, -1.0
+ %m = fmul float %a, %y
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
+; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %a = fadd float %x, -1.0
+ %m = fmul float %y, %a
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
+; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]]
+define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %s = fsub float 1.0, %x
+ %m = fmul float %s, %y
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
+; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]]
+define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %s = fsub float 1.0, %x
+ %m = fmul float %y, %s
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
+; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %s = fsub float -1.0, %x
+ %m = fmul float %s, %y
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
+; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %s = fsub float -1.0, %x
+ %m = fmul float %y, %s
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
+; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %s = fsub float %x, 1.0
+ %m = fmul float %s, %y
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
+; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %s = fsub float %x, 1.0
+ %m = fmul float %y, %s
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
+; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
+define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %s = fsub float %x, -1.0
+ %m = fmul float %s, %y
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
+; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
+define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %s = fsub float %x, -1.0
+ %m = fmul float %y, %s
+ store float %m, float addrspace(1)* %out
+ ret void
+}
+
+;
+; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
+;
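+; The (1.0 - t) * y term is matched first as fma(-t, y, y); the x * t term
+; is then accumulated on top of that result.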
+
+; FUNC-LABEL: {{^}}test_f32_interp:
+; SI: v_mad_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
+; SI: v_mac_f32_e32 [[VR]], [[VT]], [[VX:v[0-9]]]
+define void @test_f32_interp(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2,
+ float addrspace(1)* %in3) {
+ %x = load float, float addrspace(1)* %in1
+ %y = load float, float addrspace(1)* %in2
+ %t = load float, float addrspace(1)* %in3
+ %t1 = fsub float 1.0, %t
+ %tx = fmul float %x, %t
+ %ty = fmul float %y, %t1
+ %r = fadd float %tx, %ty
+ store float %r, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_f64_interp:
+; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
+; SI: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
+define void @test_f64_interp(double addrspace(1)* %out,
+ double addrspace(1)* %in1,
+ double addrspace(1)* %in2,
+ double addrspace(1)* %in3) {
+ %x = load double, double addrspace(1)* %in1
+ %y = load double, double addrspace(1)* %in2
+ %t = load double, double addrspace(1)* %in3
+ %t1 = fsub double 1.0, %t
+ %tx = fmul double %x, %t
+ %ty = fmul double %y, %t1
+ %r = fadd double %tx, %ty
+ store double %r, double addrspace(1)* %out
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone
; FUNC-LABEL: {{^}}test_lrp:
-; SI: v_sub_f32
+; SI: v_mad_f32
; SI: v_mac_f32_e32
define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind {
%mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
+;
+; Patterns (+ fneg variants): add(mul(x,y),z), sub(mul(x,y),z)
+;
+
define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: test_x86_fmadd_ps:
; CHECK: # BB#0:
ret <4 x float> %res
}
+;
+; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
+;
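+; Note: in the 213 form, vfmadd213ps %xmm1, %xmm1, %xmm0 computes
+; xmm0 = (xmm0 * xmm1) + xmm1, i.e. x*y + y with x in xmm0 and y in xmm1.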
+
+define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_add_x_one_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_add_x_one_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+ %m = fmul <4 x float> %a, %y
+ ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_add_x_one:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_add_x_one:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+ %m = fmul <4 x float> %y, %a
+ ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_add_x_negone_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_add_x_negone_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+ %m = fmul <4 x float> %a, %y
+ ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_add_x_negone:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_add_x_negone:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+ %m = fmul <4 x float> %y, %a
+ ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_one_x_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_one_x_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+ %m = fmul <4 x float> %s, %y
+ ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_one_x:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_one_x:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+ %m = fmul <4 x float> %y, %s
+ ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_negone_x_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_negone_x_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
+ %m = fmul <4 x float> %s, %y
+ ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_negone_x:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_negone_x:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
+ %m = fmul <4 x float> %y, %s
+ ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_x_one_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_x_one_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+ %m = fmul <4 x float> %s, %y
+ ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_x_one:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_x_one:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+ %m = fmul <4 x float> %y, %s
+ ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_x_negone_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_x_negone_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+ %m = fmul <4 x float> %s, %y
+ ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_x_negone:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_x_negone:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+ %m = fmul <4 x float> %y, %s
+ ret <4 x float> %m
+}
+
+;
+; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
+;
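+; (1.0 - t)*y is matched first as fnmadd(t, y, y) = y - t*y; the x*t
+; term is then fused on top with a second fma.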
+
+define float @test_f32_interp(float %x, float %y, float %t) {
+; CHECK-LABEL: test_f32_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vfmadd213ss %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_f32_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddss %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT: vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %t1 = fsub float 1.0, %t
+ %tx = fmul float %x, %t
+ %ty = fmul float %y, %t1
+ %r = fadd float %tx, %ty
+ ret float %r
+}
+
+define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
+; CHECK-LABEL: test_v4f32_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213ps %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddps %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT: vfmaddps %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %t1 = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %t
+ %tx = fmul <4 x float> %x, %t
+ %ty = fmul <4 x float> %y, %t1
+ %r = fadd <4 x float> %tx, %ty
+ ret <4 x float> %r
+}
+
+define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
+; CHECK-LABEL: test_v8f32_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213ps %ymm1, %ymm2, %ymm1
+; CHECK-NEXT: vfmadd213ps %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v8f32_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddps %ymm1, %ymm1, %ymm2, %ymm1
+; CHECK_FMA4-NEXT: vfmaddps %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK_FMA4-NEXT: retq
+ %t1 = fsub <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
+ %tx = fmul <8 x float> %x, %t
+ %ty = fmul <8 x float> %y, %t1
+ %r = fadd <8 x float> %tx, %ty
+ ret <8 x float> %r
+}
+
+define double @test_f64_interp(double %x, double %y, double %t) {
+; CHECK-LABEL: test_f64_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vfmadd213sd %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_f64_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddsd %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT: vfmaddsd %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %t1 = fsub double 1.0, %t
+ %tx = fmul double %x, %t
+ %ty = fmul double %y, %t1
+ %r = fadd double %tx, %ty
+ ret double %r
+}
+
+define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
+; CHECK-LABEL: test_v2f64_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213pd %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vfmadd213pd %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v2f64_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddpd %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT: vfmaddpd %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %t1 = fsub <2 x double> <double 1.0, double 1.0>, %t
+ %tx = fmul <2 x double> %x, %t
+ %ty = fmul <2 x double> %y, %t1
+ %r = fadd <2 x double> %tx, %ty
+ ret <2 x double> %r
+}
+
+define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
+; CHECK-LABEL: test_v4f64_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213pd %ymm1, %ymm2, %ymm1
+; CHECK-NEXT: vfmadd213pd %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f64_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddpd %ymm1, %ymm1, %ymm2, %ymm1
+; CHECK_FMA4-NEXT: vfmaddpd %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK_FMA4-NEXT: retq
+ %t1 = fsub <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %t
+ %tx = fmul <4 x double> %x, %t
+ %ty = fmul <4 x double> %y, %t1
+ %r = fadd <4 x double> %tx, %ty
+ ret <4 x double> %r
+}