// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//
-- Consider eliminating the unaligned SSE load intrinsics, replacing them with
- unaligned LLVM load instructions.
+//===---------------------------------------------------------------------===//
+
+SSE Variable shift can be custom lowered to something like this, which uses a
+small table + unaligned load + shuffle instead of going through memory.
+
+__m128i_shift_right:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+
+...
+__m128i shift_right(__m128i value, unsigned long offset) {
+ return _mm_shuffle_epi8(value,
+ _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset)));
+}
+
+//===---------------------------------------------------------------------===//
+
+SSE has instructions for doing operations on complex numbers, we should pattern
+match them. For example, this should turn into a horizontal add:
+
+typedef float __attribute__((vector_size(16))) v4f32;
+float f32(v4f32 A) {
+ return A[0]+A[1]+A[2]+A[3];
+}
+
+Instead we get this:
+
+_f32: ## @f32
+ pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0]
+ addss %xmm0, %xmm1
+ pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0]
+ movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1]
+ movaps %xmm0, %xmm3
+ addss %xmm1, %xmm3
+ movdqa %xmm2, %xmm0
+ addss %xmm3, %xmm0
+ ret
+
+Also, there are cases where some simple local SLP would improve codegen a bit.
+compiling this:
+
+_Complex float f32(_Complex float A, _Complex float B) {
+ return A+B;
+}
+
+into:
+
+_f32: ## @f32
+ movdqa %xmm0, %xmm2
+ addss %xmm1, %xmm2
+ pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0]
+ pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0]
+ addss %xmm1, %xmm3
+ movaps %xmm2, %xmm0
+ unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+ ret
+
+seems silly when it could just be one addps.
+
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-It's not clear whether we should use pxor or xorps / xorpd to clear XMM
-registers. The choice may depend on subtarget information. We should do some
-more experiments on different x86 machines.
-
-//===---------------------------------------------------------------------===//
-
Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.
//===---------------------------------------------------------------------===//
-How to decide when to use the "floating point version" of logical ops? Here are
-some code fragments:
-
- movaps LCPI5_5, %xmm2
- divps %xmm1, %xmm2
- mulps %xmm2, %xmm3
- mulps 8656(%ecx), %xmm3
- addps 8672(%ecx), %xmm3
- andps LCPI5_6, %xmm2
- andps LCPI5_1, %xmm3
- por %xmm2, %xmm3
- movdqa %xmm3, (%edi)
-
- movaps LCPI5_5, %xmm1
- divps %xmm0, %xmm1
- mulps %xmm1, %xmm3
- mulps 8656(%ecx), %xmm3
- addps 8672(%ecx), %xmm3
- andps LCPI5_6, %xmm1
- andps LCPI5_1, %xmm3
- orps %xmm1, %xmm3
- movaps %xmm3, 112(%esp)
- movaps %xmm3, (%ebx)
-
-Due to some minor source change, the later case ended up using orps and movaps
-instead of por and movdqa. Does it matter?
-
-//===---------------------------------------------------------------------===//
-
-X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
-to choose between movaps, movapd, and movdqa based on types of source and
-destination?
-
-How about andps, andpd, and pand? Do we really care about the type of the packed
-elements? If not, why not always use the "ps" variants which are likely to be
-shorter.
-
-//===---------------------------------------------------------------------===//
-
External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:
//===---------------------------------------------------------------------===//
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
-
-LLVM is producing bad code.
-
-LBB_main_4: # cond_true44
- addps %xmm1, %xmm2
- subps %xmm3, %xmm2
- movaps (%ecx), %xmm4
- movaps %xmm2, %xmm1
- addps %xmm4, %xmm1
- addl $16, %ecx
- incl %edx
- cmpl $262144, %edx
- movaps %xmm3, %xmm2
- movaps %xmm4, %xmm3
- jne LBB_main_4 # cond_true44
-
-There are two problems. 1) No need to two loop induction variables. We can
-compare against 262144 * 16. 2) Known register coalescer issue. We should
-be able eliminate one of the movaps:
-
- addps %xmm2, %xmm1 <=== Commute!
- subps %xmm3, %xmm1
- movaps (%ecx), %xmm4
- movaps %xmm1, %xmm1 <=== Eliminate!
- addps %xmm4, %xmm1
- addl $16, %ecx
- incl %edx
- cmpl $262144, %edx
- movaps %xmm3, %xmm2
- movaps %xmm4, %xmm3
- jne LBB_main_4 # cond_true44
-
-//===---------------------------------------------------------------------===//
-
Consider:
__m128 test(float a) {
//===---------------------------------------------------------------------===//
-__m128d test1( __m128d A, __m128d B) {
- return _mm_shuffle_pd(A, B, 0x3);
-}
-
-compiles to
-
-shufpd $3, %xmm1, %xmm0
-
-Perhaps it's better to use unpckhpd instead?
-
-unpckhpd %xmm1, %xmm0
-
-Don't know if unpckhpd is faster. But it is shorter.
-
-//===---------------------------------------------------------------------===//
-
This code generates ugly code, probably due to costs being off or something:
define void @test(float* %P, <4 x float>* %P2 ) {
%tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
ret i64 %tmp20
}
+declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
This currently compiles to:
//===---------------------------------------------------------------------===//
-SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
-sitting between the truncate and the extract.
-
-//===---------------------------------------------------------------------===//
-
INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 simultaneously. Currently we only use it for simple
insertions.
//===---------------------------------------------------------------------===//
-'select' on vectors and scalars could be a whole lot better. We currently
-lower them to conditional branches. On x86-64 for example, we compile this:
-
-double test(double a, double b, double c, double d) { return a<b ? c : d; }
-
-to:
-
-_test:
- ucomisd %xmm0, %xmm1
- ja LBB1_2 # entry
-LBB1_1: # entry
- movapd %xmm3, %xmm2
-LBB1_2: # entry
- movapd %xmm2, %xmm0
- ret
-
-instead of:
-
-_test:
- cmpltsd %xmm1, %xmm0
- andpd %xmm0, %xmm2
- andnpd %xmm3, %xmm0
- orpd %xmm2, %xmm0
- ret
-
-For unpredictable branches, the later is much more efficient. This should
-just be a matter of having scalar sse map to SELECT_CC and custom expanding
-or iseling it.
-
-//===---------------------------------------------------------------------===//
-
LLVM currently generates stack realignment code, when it is not necessary
needed. The problem is that we need to know about stack alignment too early,
before RA runs.
This IR (from PR6194):
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"
%0 = type { double, double }
doing a shuffle from v[1] to v[0] then a float store.
//===---------------------------------------------------------------------===//
+
+On SSE4 machines, we compile this code:
+
+define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
+ <2 x float> *%P) nounwind {
+ %Z = fadd <2 x float> %Q, %R
+
+ store <2 x float> %Z, <2 x float> *%P
+ ret <2 x float> %Z
+}
+
+into:
+
+_test2: ## @test2
+## BB#0:
+ insertps $0, %xmm2, %xmm2
+ insertps $16, %xmm3, %xmm2
+ insertps $0, %xmm0, %xmm3
+ insertps $16, %xmm1, %xmm3
+ addps %xmm2, %xmm3
+ movq %xmm3, (%rdi)
+ movaps %xmm3, %xmm0
+ pshufd $1, %xmm3, %xmm1
+ ## kill: XMM1<def> XMM1<kill>
+ ret
+
+The insertps's of $0 are pointless complex copies.
+
+//===---------------------------------------------------------------------===//
+
+[UNSAFE FP]
+
+void foo(double, double, double);
+void norm(double x, double y, double z) {
+ double scale = __builtin_sqrt(x*x + y*y + z*z);
+ foo(x/scale, y/scale, z/scale);
+}
+
+We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is
+slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
+and emit 3 mulsd in place of the divs. This can be done as a target-independent
+transform.
+
+If we're dealing with floats instead of doubles we could even replace the sqrtss
+and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
+cost of reduced accuracy.
+
+//===---------------------------------------------------------------------===//
+
+This function should be matched to haddpd when the appropriate CPU is enabled:
+
+#include <x86intrin.h>
+double f (__m128d p) {
+ return p[0] + p[1];
+}
+
+similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
+turn into hsubpd also.
+
+//===---------------------------------------------------------------------===//
+
+define <2 x i32> @foo(<2 x double> %in) {
+ %x = fptosi <2 x double> %in to <2 x i32>
+ ret <2 x i32> %x
+}
+
+Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
+
+//===---------------------------------------------------------------------===//