Combine fmul vector FP constants when unsafe math is allowed.

author Sanjay Patel <spatel@rotateright.com>

Thu, 11 Sep 2014 15:45:27 +0000 (15:45 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Thu, 11 Sep 2014 15:45:27 +0000 (15:45 +0000)
author Sanjay Patel <spatel@rotateright.com>
Thu, 11 Sep 2014 15:45:27 +0000 (15:45 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Thu, 11 Sep 2014 15:45:27 +0000 (15:45 +0000)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index 156d0a369305108cd7072f257be4dbccfe4c87a4..c29200a549ecdc37523cd06d5dd866a8d49e2ca5 100644 (file)
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6820,8 +6820,16 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
  
    // fold vector ops
    if (VT.isVector()) {
+    // This just handles C1 * C2 for vectors. Other vector folds are below.
      SDValue FoldedVOp = SimplifyVBinOp(N);
-    if (FoldedVOp.getNode()) return FoldedVOp;
+    if (FoldedVOp.getNode())
+      return FoldedVOp;
+    // Canonicalize vector constant to RHS.
+    if (N0.getOpcode() == ISD::BUILD_VECTOR &&
+        N1.getOpcode() != ISD::BUILD_VECTOR)
+      if (auto *BV0 = dyn_cast<BuildVectorSDNode>(N0))
+        if (BV0->isConstant())
+          return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
    }
  
    // fold (fmul c1, c2) -> c1*c2
@@ -6842,11 +6850,19 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
        return N1;
  
      // fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2))
-    if (N1CFP && N0.getOpcode() == ISD::FMUL &&
-        N0.getNode()->hasOneUse() && isConstOrConstSplatFP(N0.getOperand(1))) {
-      SDLoc SL(N);
-      SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(1), N1);
-      return DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(0), MulConsts);
+    if (N0.getOpcode() == ISD::FMUL) {
+      // Fold scalars or any vector constants (not just splats).
+      // This fold is done in general by InstCombine, but extra fmul insts
+      // may have been generated during lowering.
+      SDValue N01 = N0.getOperand(1);
+      auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
+      auto *BV01 = dyn_cast<BuildVectorSDNode>(N01);
+      if ((N1CFP && isConstOrConstSplatFP(N01)) ||
+          (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) {
+        SDLoc SL(N);
+        SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, N01, N1);
+        return DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(0), MulConsts);
+      }
      }
  
      // fold (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c))
diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll

index a0122356720bc9415881e0233725244ccc608512..be041073b06b52e6bc2704c081b45a221be9a89f 100644 (file)
--- a/test/CodeGen/X86/fmul-combines.ll
+++ b/test/CodeGen/X86/fmul-combines.ll
@@ -55,6 +55,54 @@ define <4 x float> @fmul_c3_c4_v4f32(<4 x float> %x) #0 {
    ret <4 x float> %z
  }
  
+; We should be able to pre-multiply the two constant vectors.
+; CHECK: ## float 5.000000e+00
+; CHECK: ## float 1.200000e+01
+; CHECK: ## float 2.100000e+01
+; CHECK: ## float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>
+  ret <4 x float> %z
+}
+
+; Same as above, but reverse operands to make sure non-canonical form is also handled.
+; CHECK: ## float 5.000000e+00
+; CHECK: ## float 1.200000e+01
+; CHECK: ## float 2.100000e+01
+; CHECK: ## float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_non_canonical:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_non_canonical(<4 x float> %x) #0 {
+  %y = fmul <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
+  %z = fmul <4 x float> <float 5.0, float 6.0, float 7.0, float 8.0>, %y
+  ret <4 x float> %z
+}
+
+; More than one use of the intermediate constant multiply should not inhibit the optimization.
+; Instead of a chain of 2 dependent multiplies, this test should produce 2 independent multiplies.
+; CHECK: ## float 5.000000e+00
+; CHECK: ## float 1.200000e+01
+; CHECK: ## float 2.100000e+01
+; CHECK: ## float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use:
+; CHECK: mulps
+; CHECK: mulps
+; CHECK: addps
+; CHECK: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>
+  %a = fadd <4 x float> %y, %z
+  ret <4 x float> %a
+}
+
  ; CHECK-LABEL: fmul_c2_c4_f32:
  ; CHECK-NOT: addss
  ; CHECK: mulss
author	Sanjay Patel <spatel@rotateright.com>
	Thu, 11 Sep 2014 15:45:27 +0000 (15:45 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Thu, 11 Sep 2014 15:45:27 +0000 (15:45 +0000)
lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch | blob | history
test/CodeGen/X86/fmul-combines.ll		patch | blob | history