From 4b3fcc21ec0561d09105862a63a9e8a66696ec8c Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Tue, 10 Dec 2013 15:22:48 +0000
Subject: [PATCH] Ensure that the backend no longer emits unnecessary vector
 insert instructions immediately after SSE scalar fp instructions like addss
 or mulss.

Added patterns to select SSE scalar fp arithmetic instructions from a scalar
fp operation followed by a blend.

For example, given the following code:
  __m128 foo(__m128 A, __m128 B) {
    A[0] += B[0];
    return A;
  }

previously we generated:
  addss %xmm0, %xmm1
  movss %xmm1, %xmm0

now we generate:
  addss %xmm1, %xmm0


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@196925 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrSSE.td           | 125 ++++++++++
 test/CodeGen/X86/sse-scalar-fp-arith.ll | 310 ++++++++++++++++++++++++
 2 files changed, 435 insertions(+)
 create mode 100644 test/CodeGen/X86/sse-scalar-fp-arith.ll

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 8c0076dfd40..08b547cf6d7 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3017,6 +3017,131 @@ let isCodeGenOnly = 1 in {
              basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
 }
 
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// a scalar fp operation followed by a blend.
+//
+// These patterns know, for example, how to select an ADDSS from a
+// float add plus vector insert.
+//
+// The effect is that the backend no longer emits unnecessary vector
+// insert instructions immediately after SSE scalar fp instructions
+// like addss or mulss.
+//
+// For example, given the following code:
+//   __m128 foo(__m128 A, __m128 B) {
+//     A[0] += B[0];
+//     return A;
+//   }
+//
+// previously we generated:
+//   addss %xmm0, %xmm1
+//   movss %xmm1, %xmm0
+// 
+// we now generate:
+//   addss %xmm1, %xmm0
+
+def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
+                    (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))))),
+          (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
+                    (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))))),
+          (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
+                    (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))))),
+          (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
+                    (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))))),
+          (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+let Predicates = [HasSSE2] in {
+  // SSE2 patterns to select scalar double-precision fp arithmetic instructions
+
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+}
+
+let Predicates = [UseSSE41] in {
+  // If the subtarget has SSE4.1 but not AVX, the vector insert
+  // instruction is lowered into a X86insrtps rather than a X86Movss.
+  // When selecting SSE scalar single-precision fp arithmetic instructions,
+  // make sure that we correctly match the X86insrtps.
+
+  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                  (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))), (iPTR 0))),
+            (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                  (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))), (iPTR 0))),
+            (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                  (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))), (iPTR 0))),
+            (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                  (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))), (iPTR 0))),
+            (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+}
+
+let AddedComplexity = 20, Predicates = [HasAVX] in {
+  // The following patterns select AVX Scalar single/double precision fp
+  // arithmetic instructions.
+  // The 'AddedComplexity' is required to give them higher priority over
+  // the equivalent SSE/SSE2 patterns.
+
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                 (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                       FR32:$src))), (iPTR 0))),
+            (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                 (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                       FR32:$src))), (iPTR 0))),
+            (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                 (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                       FR32:$src))), (iPTR 0))),
+            (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                 (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                       FR32:$src))), (iPTR 0))),
+            (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+}
+
 /// Unop Arithmetic
 /// In addition, we also have a special variant of the scalar form here to
 /// represent the associated intrinsic operation.  This form is unlike the
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll
new file mode 100644
index 00000000000..3949a835e67
--- /dev/null
+++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -0,0 +1,310 @@
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
+
+; Ensure that the backend no longer emits unnecessary vector insert
+; instructions immediately after SSE scalar fp instructions
+; like addss or mulss.
+
+
+define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %add = fadd float %2, %1
+  %3 = insertelement <4 x float> %a, float %add, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_add_ss
+; SSE2: addss   %xmm1, %xmm0
+; AVX: vaddss   %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %sub = fsub float %2, %1
+  %3 = insertelement <4 x float> %a, float %sub, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_sub_ss
+; SSE2: subss   %xmm1, %xmm0
+; AVX: vsubss   %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %mul = fmul float %2, %1
+  %3 = insertelement <4 x float> %a, float %mul, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_mul_ss
+; SSE2: mulss   %xmm1, %xmm0
+; AVX: vmulss   %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %div = fdiv float %2, %1
+  %3 = insertelement <4 x float> %a, float %div, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_div_ss
+; SSE2: divss   %xmm1, %xmm0
+; AVX: vdivss   %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %b, i32 0
+  %2 = extractelement <2 x double> %a, i32 0
+  %add = fadd double %2, %1
+  %3 = insertelement <2 x double> %a, double %add, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_add_sd
+; SSE2: addsd   %xmm1, %xmm0
+; AVX: vaddsd   %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %b, i32 0
+  %2 = extractelement <2 x double> %a, i32 0
+  %sub = fsub double %2, %1
+  %3 = insertelement <2 x double> %a, double %sub, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_sub_sd
+; SSE2: subsd   %xmm1, %xmm0
+; AVX: vsubsd   %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %b, i32 0
+  %2 = extractelement <2 x double> %a, i32 0
+  %mul = fmul double %2, %1
+  %3 = insertelement <2 x double> %a, double %mul, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_mul_sd
+; SSE2: mulsd   %xmm1, %xmm0
+; AVX: vmulsd   %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %b, i32 0
+  %2 = extractelement <2 x double> %a, i32 0
+  %div = fdiv double %2, %1
+  %3 = insertelement <2 x double> %a, double %div, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_div_sd
+; SSE2: divsd   %xmm1, %xmm0
+; AVX: vdivsd   %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %a, i32 0
+  %2 = extractelement <4 x float> %b, i32 0
+  %add = fadd float %1, %2
+  %3 = insertelement <4 x float> %b, float %add, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_add_ss
+; SSE2: addss   %xmm0, %xmm1
+; AVX: vaddss   %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %a, i32 0
+  %2 = extractelement <4 x float> %b, i32 0
+  %sub = fsub float %2, %1
+  %3 = insertelement <4 x float> %b, float %sub, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_sub_ss
+; SSE2: subss   %xmm0, %xmm1
+; AVX: vsubss   %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %a, i32 0
+  %2 = extractelement <4 x float> %b, i32 0
+  %mul = fmul float %1, %2
+  %3 = insertelement <4 x float> %b, float %mul, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_mul_ss
+; SSE2: mulss   %xmm0, %xmm1
+; AVX: vmulss   %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %a, i32 0
+  %2 = extractelement <4 x float> %b, i32 0
+  %div = fdiv float %2, %1
+  %3 = insertelement <4 x float> %b, float %div, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_div_ss
+; SSE2: divss   %xmm0, %xmm1
+; AVX: vdivss   %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %b, i32 0
+  %add = fadd double %1, %2
+  %3 = insertelement <2 x double> %b, double %add, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_add_sd
+; SSE2: addsd   %xmm0, %xmm1
+; AVX: vaddsd   %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %b, i32 0
+  %sub = fsub double %2, %1
+  %3 = insertelement <2 x double> %b, double %sub, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_sub_sd
+; SSE2: subsd   %xmm0, %xmm1
+; AVX: vsubsd   %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %b, i32 0
+  %mul = fmul double %1, %2
+  %3 = insertelement <2 x double> %b, double %mul, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_mul_sd
+; SSE2: mulsd   %xmm0, %xmm1
+; AVX: vmulsd   %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %b, i32 0
+  %div = fdiv double %2, %1
+  %3 = insertelement <2 x double> %b, double %div, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_div_sd
+; SSE2: divsd   %xmm0, %xmm1
+; AVX: vdivsd   %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %add = fadd float %2, %1
+  %add2 = fadd float %2, %add
+  %3 = insertelement <4 x float> %a, float %add2, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_add_ss
+; CHECK: addss
+; CHECK: addss
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %sub = fsub float %2, %1
+  %sub2 = fsub float %2, %sub
+  %3 = insertelement <4 x float> %a, float %sub2, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_sub_ss
+; CHECK: subss
+; CHECK: subss
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %mul = fmul float %2, %1
+  %mul2 = fmul float %2, %mul
+  %3 = insertelement <4 x float> %a, float %mul2, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_mul_ss
+; CHECK: mulss
+; CHECK: mulss
+; CHECK-NOT: movss
+; CHECK: ret
+
+define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %div = fdiv float %2, %1
+  %div2 = fdiv float %2, %div
+  %3 = insertelement <4 x float> %a, float %div2, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_div_ss
+; CHECK: divss
+; CHECK: divss
+; CHECK-NOT: movss
+; CHECK: ret
+
-- 
2.34.1