From c97ac423205ec29d4beff0001d630838d8db2e79 Mon Sep 17 00:00:00 2001
From: Ahmed Bougacha <ahmed.bougacha@gmail.com>
Date: Thu, 1 Oct 2015 00:11:07 +0000
Subject: [PATCH] [X86] Don't custom-lower vNi32 uint_to_fp when
 unsafe-fp-math.

The custom code produces incorrect results if later reassociated.

Since r221657, on x86, vNi32 uitofp is lowered using an optimized
sequence:

  movdqa LCPI0_0(%rip), %xmm1 ## xmm1 = [65535, ...]
  pand %xmm0, %xmm1
  por LCPI0_1(%rip), %xmm1 ## [0x4b000000, ...]
  psrld $16, %xmm0
  por LCPI0_2(%rip), %xmm0 ## [0x53000000, ...]
  addps LCPI0_3(%rip), %xmm0 ## [float -5.497642e+11, ...]
  addps %xmm1, %xmm0

Since r240361, the machine combiner opportunistically reassociates
2-instruction sequences (with -ffast-math). In the new code sequence,
the two ADDPS instructions are eligible. In isolation, for simple
examples (without reassociable users), this makes no performance
difference (the goal being to enable reassociation of longer chains).

In the trivial example (just one uitofp), the reassociation doesn't
happen, because (I think) it would require the emission of a separate
movaps for a constantpool load (instead of folding it into addps).

However, when we have multiple uitofp sequences, and the constantpool
loads are CSE'd earlier, the machine combiner can do the reassociation.

When the two ADDPS instructions are reassociated, the resulting sequence
isn't correct anymore, as we'd be adding large (2**39) constants to
comparatively smaller values (~2**23). Given that two of the three
inputs are powers of 2 larger than 2**16, and that
ulp(2**39) == 2**(39-24) == 2**15, the reassociated chain will produce 0
for any input in [0, 2**14).
In my testing, it also produces wrong results for 99.5% of [0, 2**32).

Avoid this by disabling the new lowering when -ffast-math is enabled.
It does mean that we'll get slower code than with the optimized
lowering, but at least we won't get egregiously incorrect code.

One might argue that, considering -ffast-math is all but meaningless,
uitofp producing wrong results isn't a compiler bug. But it really is.

Fixes PR24512.

...though this is really more of a workaround.
Ideally, we'd have some sort of Machine FMF, but that's a problem
that's not worth tackling until we do more with machine IR.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@248965 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp          |   9 ++
 test/CodeGen/X86/vec_uint_to_fp-fastmath.ll | 130 ++++++++++++++++++++
 2 files changed, 139 insertions(+)
 create mode 100644 test/CodeGen/X86/vec_uint_to_fp-fastmath.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0ed196a4acb..6d8aa4a10d3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12490,6 +12490,15 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
   //     return (float4) lo + fhi;
 
+  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
+  // reassociate the two FADDs, and if we do that, the algorithm fails
+  // spectacularly (PR24512).
+  // FIXME: If we ever have some kind of Machine FMF, this should be marked
+  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
+  // there's also the MachineCombiner reassociations happening on Machine IR.
+  if (DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
   SDLoc DL(Op);
   SDValue V = Op->getOperand(0);
   EVT VecIntVT = V.getValueType();
diff --git a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
new file mode 100644
index 00000000000..0d67ac4bc25
--- /dev/null
+++ b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
@@ -0,0 +1,130 @@
+; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math \
+; RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=CST
+; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+sse4.1 \
+; RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=CST
+; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx \
+; RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=CST
+; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx2 \
+; RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+
+; CST: [[MASKCSTADDR:.LCPI[0-9_]+]]:
+; CST-NEXT: .long 65535 # 0xffff
+; CST-NEXT: .long 65535 # 0xffff
+; CST-NEXT: .long 65535 # 0xffff
+; CST-NEXT: .long 65535 # 0xffff
+
+; CST: [[FPMASKCSTADDR:.LCPI[0-9_]+]]:
+; CST-NEXT: .long 1199570944 # float 6.553600e+04
+; CST-NEXT: .long 1199570944 # float 6.553600e+04
+; CST-NEXT: .long 1199570944 # float 6.553600e+04
+; CST-NEXT: .long 1199570944 # float 6.553600e+04
+
+; AVX2: [[FPMASKCSTADDR:.LCPI[0-9_]+]]:
+; AVX2-NEXT: .long 1199570944 # float 65536
+
+; AVX2: [[MASKCSTADDR:.LCPI[0-9_]+]]:
+; AVX2-NEXT: .long 65535 # 0xffff
+
+define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
+; SSE-LABEL: test_uitofp_v4i32_to_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE-NEXT:    andps %xmm0, %xmm1
+; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT:    mulps [[FPMASKCSTADDR]](%rip), %xmm0
+; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_uitofp_v4i32_to_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vandps [[MASKCSTADDR]](%rip), %xmm0, %xmm1
+; AVX-NEXT:    vcvtdq2ps %xmm1, %xmm1
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    vmulps [[FPMASKCSTADDR]](%rip), %xmm0, %xmm0
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_uitofp_v4i32_to_v4f32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vcvtdq2ps %xmm1, %xmm1
+; AVX2-NEXT:    vbroadcastss [[FPMASKCSTADDR]](%rip), %xmm2
+; AVX2-NEXT:    vmulps %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastd [[MASKCSTADDR]](%rip), %xmm2
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+  %tmp = uitofp <4 x i32> %arg to <4 x float>
+  ret <4 x float> %tmp
+}
+
+; AVX: [[MASKCSTADDR_v8:.LCPI[0-9_]+]]:
+; AVX-NEXT: .long 65535 # 0xffff
+; AVX-NEXT: .long 65535 # 0xffff
+; AVX-NEXT: .long 65535 # 0xffff
+; AVX-NEXT: .long 65535 # 0xffff
+
+; AVX: [[FPMASKCSTADDR_v8:.LCPI[0-9_]+]]:
+; AVX-NEXT: .long 1199570944 # float 6.553600e+04
+; AVX-NEXT: .long 1199570944 # float 6.553600e+04
+; AVX-NEXT: .long 1199570944 # float 6.553600e+04
+; AVX-NEXT: .long 1199570944 # float 6.553600e+04
+
+; AVX2: [[FPMASKCSTADDR_v8:.LCPI[0-9_]+]]:
+; AVX2-NEXT: .long 1199570944 # float 65536
+
+; AVX2: [[MASKCSTADDR_v8:.LCPI[0-9_]+]]:
+; AVX2-NEXT: .long 65535 # 0xffff
+
+define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
+; SSE-LABEL: test_uitofp_v8i32_to_v8f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    psrld $16, %xmm2
+; SSE-NEXT:    cvtdq2ps %xmm2, %xmm2
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [6.553600e+04,6.553600e+04,6.553600e+04,6.553600e+04]
+; SSE-NEXT:    mulps %xmm3, %xmm2
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT:    addps %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrld $16, %xmm2
+; SSE-NEXT:    cvtdq2ps %xmm2, %xmm2
+; SSE-NEXT:    mulps %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE-NEXT:    addps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_uitofp_v8i32_to_v8f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vandps [[MASKCSTADDR_v8]](%rip), %ymm0, %ymm1
+; AVX-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT:    vmulps [[FPMASKCSTADDR_v8]](%rip), %ymm0, %ymm0
+; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_uitofp_v8i32_to_v8f32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm1
+; AVX2-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcastss [[FPMASKCSTADDR_v8]](%rip), %ymm2
+; AVX2-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastd [[MASKCSTADDR_v8]](%rip), %ymm2
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %tmp = uitofp <8 x i32> %arg to <8 x float>
+  ret <8 x float> %tmp
+}
-- 
2.34.1