From a825d75f1dab354135a6608fb78640b8f844df20 Mon Sep 17 00:00:00 2001
From: Chris Lattner
Date: Thu, 10 Apr 2008 05:37:47 +0000
Subject: [PATCH] move the x86-32 part of PR2108 here.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@49465 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/README-SSE.txt | 48 +++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 4d7224514cb..bbd93bf3ed4 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -428,6 +428,54 @@ entry:
 
 //===---------------------------------------------------------------------===//
 
+Consider (PR2108):
+
+#include <emmintrin.h>
+__m128i doload64(unsigned long long x) { return _mm_loadl_epi64((__m128i*)&x); }
+__m128i doload64_2(unsigned long long *x) { return _mm_loadl_epi64((__m128i*)x); }
+
+These are very similar routines, but we generate significantly worse code for
+the first one on x86-32:
+
+_doload64:
+	subl	$12, %esp
+	movl	20(%esp), %eax
+	movl	%eax, 4(%esp)
+	movl	16(%esp), %eax
+	movl	%eax, (%esp)
+	movsd	(%esp), %xmm0
+	addl	$12, %esp
+	ret
+_doload64_2:
+	movl	4(%esp), %eax
+	movsd	(%eax), %xmm0
+	ret
+
+The problem is that the argument lowering logic splits the i64 argument into
+2x i32 pieces early, so the i64-to-f64 bitcast fails to match.  Reduced testcase:
+
+define fastcc double @doload64(i64 %x) nounwind {
+entry:
+	%tmp717 = bitcast i64 %x to double	; <double> [#uses=1]
+	ret double %tmp717
+}
+
+compiles to:
+
+_doload64:
+	subl	$12, %esp
+	movl	20(%esp), %eax
+	movl	%eax, 4(%esp)
+	movl	16(%esp), %eax
+	movl	%eax, (%esp)
+	movsd	(%esp), %xmm0
+	addl	$12, %esp
+	ret
+
+instead of a single movsd directly from the incoming argument slot.
+
+//===---------------------------------------------------------------------===//
+
 __m128d test1( __m128d A, __m128d B) {
   return _mm_shuffle_pd(A, B, 0x3);
 }
-- 
2.34.1
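For reference, the lowering the note above is asking for would presumably be a
single movsd from the incoming argument slot.  This is a sketch, not verified
compiler output; it assumes the i64 argument sits at 4(%esp) on entry, which is
what the 16(%esp)/20(%esp) loads after the subl $12 in the bad code imply:

_doload64:
	movsd	4(%esp), %xmm0	# load the whole i64 as one f64, straight from the arg slot
	ret

This avoids the round trip through a freshly allocated stack slot entirely.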