From: Manman Ren <mren@apple.com>
Date: Tue, 30 Oct 2012 23:53:59 +0000 (+0000)
Subject: X86 SSE: update rsqrtss and rcpss to use two source operands and
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=dfd0b9b460686ca9491e49dd3647beec5e748a1a;p=oota-llvm.git

X86 SSE: update rsqrtss and rcpss to use two source operands and
the first source operand is tied to the destination operand.

This is to accurately model the corresponding instructions where the upper
bits are unmodified.

rdar://12558838
PR14221


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167064 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 3fcc0dc4149..dff2d4ea1cf 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3293,17 +3293,52 @@ defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss,
              sse2_fp_unop_p<0x51, "sqrt",  fsqrt, SSE_SQRTS>,
              sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd, SSE_SQRTS>;
 
+/// sse1_fp_unop_rw - SSE1 unops where vector form has a read-write operand.
+multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               Intrinsic F32Int, OpndItins itins> {
+  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode FR32:$src))]>;
+  // For scalar unary operations, fold a load into the operation
+  // only in OptForSize mode. It eliminates an instruction, but it also
+  // eliminates a whole-register clobber (the load), so it introduces a
+  // partial register update condition.
+  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
+            Requires<[UseSSE1, OptForSize]>;
+  let Constraints = "$src1 = $dst" in {
+    def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src2),
+                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                      [], itins.rr>;
+    def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, ssmem:$src2),
+                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                      [], itins.rm>;
+  }
+}
+
 // Reciprocal approximations. Note that these typically require refinement
 // in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss,
-                            SSE_SQRTS>,
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss,
+                             SSE_SQRTS>,
              sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
              sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
                             SSE_SQRTS>;
-defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss,
-                            SSE_RCPS>,
+let Predicates = [UseSSE1] in {
+  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
+            (RSQRTSSr_Int VR128:$src, VR128:$src)>;
+}
+
+defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss,
+                             SSE_RCPS>,
              sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPS>,
              sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, SSE_RCPS>;
+let Predicates = [UseSSE1] in {
+  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
+            (RCPSSr_Int VR128:$src, VR128:$src)>;
+}
 
 // There is no f64 version of the reciprocal approximation instructions.
 
diff --git a/test/CodeGen/X86/sse_partial_update.ll b/test/CodeGen/X86/sse_partial_update.ll
new file mode 100644
index 00000000000..655f75800cf
--- /dev/null
+++ b/test/CodeGen/X86/sse_partial_update.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
+
+; rdar://12558838
+; PR14221
+; There is a mismatch between the intrinsic and the actual instruction.
+; The actual instruction has a partial update of dest, while the intrinsic
+; passes through the upper FP values. Here, we make sure the source and
+; destination of rsqrtss (t1) and rcpss (t2) are the same.
+define void @t1(<4 x float> %a) nounwind uwtable ssp {
+entry:
+; CHECK: t1:
+; CHECK: rsqrtss %xmm0, %xmm0
+  %0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
+  %a.addr.0.extract = extractelement <4 x float> %0, i32 0
+  %conv = fpext float %a.addr.0.extract to double
+  %a.addr.4.extract = extractelement <4 x float> %0, i32 1
+  %conv3 = fpext float %a.addr.4.extract to double
+  tail call void @callee(double %conv, double %conv3) nounwind
+  ret void
+}
+declare void @callee(double, double)
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define void @t2(<4 x float> %a) nounwind uwtable ssp {
+entry:
+; CHECK: t2:
+; CHECK: rcpss %xmm0, %xmm0
+  %0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
+  %a.addr.0.extract = extractelement <4 x float> %0, i32 0
+  %conv = fpext float %a.addr.0.extract to double
+  %a.addr.4.extract = extractelement <4 x float> %0, i32 1
+  %conv3 = fpext float %a.addr.4.extract to double
+  tail call void @callee(double %conv, double %conv3) nounwind
+  ret void
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone