1 //===---------------------------------------------------------------------===//
2 // Random ideas for the X86 backend: SSE-specific stuff.
3 //===---------------------------------------------------------------------===//
5 - Consider eliminating the unaligned SSE load intrinsics, replacing them with
6 unaligned LLVM load instructions.
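A minimal sketch of the idea (the helper name is illustrative): the unaligned
load intrinsic could instead be represented as an ordinary LLVM load of
<4 x float> with alignment 1, which generic optimizations (CSE, DSE, alias
analysis) can see through like any other load.

#include <xmmintrin.h>

/* Illustrative only: _mm_loadu_ps currently turns into an unaligned-load
   intrinsic call in IR; the proposal is to emit the equivalent
   "load <4 x float>, align 1" instead. */
__m128 load_unaligned(const float *p) {
  return _mm_loadu_ps(p);
}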
8 //===---------------------------------------------------------------------===//
10 Expand libm rounding functions inline: Significant speedups possible.
11 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
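As a hedged sketch of the kind of inline expansion meant here (not the exact
sequence the gcc patch uses), floor() can be open-coded as a truncating
convert plus a correction; a real expansion must also handle |x| >= 2^52 and
NaN, which this ignores:

/* floor via cvttsd2si/cvtsi2sd plus a fix-up, instead of a libm call */
static double floor_inline(double x) {
  double t = (double)(long long)x;   /* cvttsd2si + cvtsi2sd */
  return (t > x) ? t - 1.0 : t;      /* truncation rounded toward zero */
}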
13 //===---------------------------------------------------------------------===//
When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
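A minimal sketch, in source form, of what such a prologue could do (the exact
code the backend would emit is an assumption; 0x0040 is the architectural DAZ
bit of MXCSR):

#include <xmmintrin.h>

static void enable_fast_sse_modes(void) {
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);   /* FTZ: flush denormal results */
  _mm_setcsr(_mm_getcsr() | 0x0040);            /* DAZ: treat denormal inputs as zero */
}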
18 //===---------------------------------------------------------------------===//
20 Think about doing i64 math in SSE regs on x86-32.
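A minimal sketch of the idea (the helper is hypothetical): a pair of i64
additions done as one paddq in an XMM register instead of add/adc sequences
in 32-bit GPRs.

#include <emmintrin.h>

void add_two_i64(const long long *a, const long long *b, long long *out) {
  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);
  _mm_storeu_si128((__m128i *)out, _mm_add_epi64(va, vb));
}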
22 //===---------------------------------------------------------------------===//
This testcase should have no SSE instructions in it, and only one load from
a constant pool:
double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}
32 Currently, the select is being lowered, which prevents the dag combiner from
33 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
35 The pattern isel got this one right.
37 //===---------------------------------------------------------------------===//
39 SSE should implement 'select_cc' using 'emulated conditional moves' that use
40 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
double %X(double %Y, double %Z, double %A, double %B) {
        %C = setlt double %A, %B
        %z = fadd double %Z, 0.0    ;; select operand is not a load
        %D = select bool %C, double %Y, double %z
        ret double %D
}
We currently emit a compare and a conditional branch; an excerpt of the
current code:

	ucomisd 40(%esp), %xmm1
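A minimal intrinsics sketch of the emulated conditional move (the helper is
hypothetical; shown for packed doubles, but the scalar case is the same idea):
the compare produces an all-ones/all-zeros mask, and and/andnot/or select
without branching.

#include <emmintrin.h>

static __m128d select_lt(__m128d A, __m128d B, __m128d Y, __m128d Z) {
  __m128d mask = _mm_cmplt_pd(A, B);              /* cmpltpd: mask where A < B */
  return _mm_or_pd(_mm_and_pd(mask, Y),           /* andpd                     */
                   _mm_andnot_pd(mask, Z));       /* andnpd + orpd             */
}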
67 //===---------------------------------------------------------------------===//
Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.
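A minimal sketch of what the lowering could emit for a small fixed-size case
(the helper, the 64-byte size, and the unaligned stores are illustrative
assumptions):

#include <emmintrin.h>

static void memset64(void *p, char c) {
  __m128i v = _mm_set1_epi8(c);
  __m128i *q = (__m128i *)p;
  _mm_storeu_si128(q + 0, v);
  _mm_storeu_si128(q + 1, v);
  _mm_storeu_si128(q + 2, v);
  _mm_storeu_si128(q + 3, v);
}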
72 //===---------------------------------------------------------------------===//
Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (x^y & mask)
when using SSE.
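A minimal C sketch of the intended transform (the helper is hypothetical,
assuming IEEE-754 doubles): xor the bit patterns and test the sign bit instead
of materializing and comparing copysign(1.0, ...) values.

#include <string.h>

static int same_sign(double x, double y) {
  unsigned long long xi, yi;
  memcpy(&xi, &x, sizeof xi);
  memcpy(&yi, &y, sizeof yi);
  return ((xi ^ yi) >> 63) == 0;   /* signs equal iff xor clears the sign bit */
}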
80 //===---------------------------------------------------------------------===//
Use movhps to update the upper 64 bits of a v4sf value. Also movlps on the
lower half of a v4sf value.
85 //===---------------------------------------------------------------------===//
Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?
90 //===---------------------------------------------------------------------===//
X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
to choose between movaps, movapd, and movdqa based on the types of the source
and destination?

How about andps, andpd, and pand? Do we really care about the type of the packed
elements? If not, why not always use the "ps" variants, which are likely to be
shorter?
100 //===---------------------------------------------------------------------===//
External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
generates:
106 movaps (%edx), %xmm2 #59.21
107 movaps (%edx), %xmm5 #60.21
108 movaps (%edx), %xmm4 #61.21
109 movaps (%edx), %xmm3 #62.21
110 movl 40(%ecx), %ebp #69.49
111 shufps $0, %xmm2, %xmm5 #60.21
112 movl 100(%esp), %ebx #69.20
113 movl (%ebx), %edi #69.20
114 imull %ebp, %edi #69.49
115 addl (%eax), %edi #70.33
116 shufps $85, %xmm2, %xmm4 #61.21
117 shufps $170, %xmm2, %xmm3 #62.21
118 shufps $255, %xmm2, %xmm2 #63.21
119 lea (%ebp,%ebp,2), %ebx #69.49
121 lea -3(%edi,%ebx), %ebx #70.33
123 addl 32(%ecx), %ebx #68.37
124 testb $15, %bl #91.13
125 jne L_B1.24 # Prob 5% #91.13
127 This is the llvm code after instruction scheduling:
129 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
130 %reg1078 = MOV32ri -3
131 %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
132 %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
133 %reg1080 = IMUL32rr %reg1079, %reg1037
134 %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
135 %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
136 %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
137 %reg1082 = SHL32ri %reg1038, 4
138 %reg1039 = ADD32rr %reg1036, %reg1082
139 %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
140 %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
141 %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
142 %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
143 %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
144 %reg1040 = MOV32rr %reg1039
145 %reg1084 = AND32ri8 %reg1039, 15
147 JE mbb<cond_next204,0xa914d30>
149 Still ok. After register allocation:
151 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
153 %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
154 ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
155 %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
156 %EDX = MOV32rm %EDX, 1, %NOREG, 40
157 IMUL32rr %EAX<def&use>, %EDX
158 %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
159 %ESI = MOV32rm %ESI, 1, %NOREG, 0
160 MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
161 %EAX = LEA32r %ESI, 1, %EAX, -3
162 %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
163 %ESI = MOV32rm %ESI, 1, %NOREG, 32
165 SHL32ri %EDI<def&use>, 4
166 ADD32rr %EDI<def&use>, %ESI
167 %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
168 %XMM1 = MOVAPSrr %XMM0
169 SHUFPSrr %XMM1<def&use>, %XMM1, 170
170 %XMM2 = MOVAPSrr %XMM0
171 SHUFPSrr %XMM2<def&use>, %XMM2, 0
172 %XMM3 = MOVAPSrr %XMM0
173 SHUFPSrr %XMM3<def&use>, %XMM3, 255
174 SHUFPSrr %XMM0<def&use>, %XMM0, 85
176 AND32ri8 %EBX<def&use>, 15
178 JE mbb<cond_next204,0xa914d30>
This looks really bad. The problem is that shufps is a destructive opcode:
since the same value appears as operand two of more than one shufps, a number
of copies are needed. Note that icc suffers from the same problem. Either the
instruction selector should select pshufd instead, or the register allocator
could perform the two-address to three-address transformation.
186 It also exposes some other problems. See MOV32ri -3 and the spills.
188 //===---------------------------------------------------------------------===//
__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}
Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd. We could compile this to:

	movss	4(%esp), %xmm0
	mulss	%xmm0, %xmm0
	ret
211 //===---------------------------------------------------------------------===//
213 Here's a sick and twisted idea. Consider code like this:
__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}
221 This might compile to this code:
223 movaps c(%esp), %xmm1
Now consider if the ... code caused xmm1 to get spilled. This might produce:
231 movaps c(%esp), %xmm1
232 movaps %xmm1, c2(%esp)
236 movaps c2(%esp), %xmm1
240 However, since the reload is only used by these instructions, we could
241 "fold" it into the uses, producing something like this:
243 movaps c(%esp), %xmm1
244 movaps %xmm1, c2(%esp)
247 movss c2(%esp), %xmm0
250 ... saving two instructions.
The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros and the one used element instead of all
four elements. This can be used to simplify a variety of shuffle operations,
where some of the elements are known zeros.
257 //===---------------------------------------------------------------------===//
259 This code generates ugly code, probably due to costs being off or something:
261 define void @test(float* %P, <4 x float>* %P2 ) {
262 %xFloat0.688 = load float* %P
263 %tmp = load <4 x float>* %P2
264 %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
265 store <4 x float> %inFloat3.713, <4 x float>* %P2
276 shufps $50, %xmm1, %xmm2
277 shufps $132, %xmm2, %xmm0
281 Would it be better to generate:
287 pinsrw $6, %eax, %xmm0
288 pinsrw $7, %eax, %xmm0
294 //===---------------------------------------------------------------------===//
296 Some useful information in the Apple Altivec / SSE Migration Guide:
298 http://developer.apple.com/documentation/Performance/Conceptual/
299 Accelerate_sse_migration/index.html
301 e.g. SSE select using and, andnot, or. Various SSE compare translations.
303 //===---------------------------------------------------------------------===//
305 Add hooks to commute some CMPP operations.
307 //===---------------------------------------------------------------------===//
Apply the same transformation that merges four scalar float loads into a single
128-bit load to loads from the constant pool.
312 //===---------------------------------------------------------------------===//
314 Floating point max / min are commutable when -enable-unsafe-fp-path is
315 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
316 nodes which are selected to max / min instructions that are marked commutable.
318 //===---------------------------------------------------------------------===//
We should materialize vector constants like "all ones" and "signbit" with
code like:

     cmpeqps xmm1, xmm1   ; xmm1 = all-ones

and:

     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
     pslld   xmm1, 31     ; xmm1 = 0x80000000 in each element (the signbit mask)

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.
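The same materializations written with intrinsics (the helper is hypothetical,
and uses the integer pcmpeqd/pslld forms rather than the ps forms, but the idea
is identical): no constant-pool load is needed for either mask.

#include <emmintrin.h>

static void make_masks(__m128i *all_ones, __m128i *sign_bits) {
  __m128i x = _mm_setzero_si128();        /* any register will do          */
  __m128i ones = _mm_cmpeq_epi32(x, x);   /* 0xFFFFFFFF in every lane      */
  *all_ones = ones;
  *sign_bits = _mm_slli_epi32(ones, 31);  /* 0x80000000 in every lane      */
}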
332 //===---------------------------------------------------------------------===//
#include <emmintrin.h>

__m128i a;

void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}
345 compile to ( -O3 -static -fomit-frame-pointer):
360 "y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
361 like movd would be sufficient in both cases as the value is already zero
362 extended in the 32-bit stack slot IIRC. For signed short, it should also be
363 save, as a really-signed value would be undefined for pslld.
366 //===---------------------------------------------------------------------===//
369 int t1(double d) { return signbit(d); }
371 This currently compiles to:
373 movsd 16(%esp), %xmm0
380 We should use movmskp{s|d} instead.
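A minimal sketch of the movmskpd-based approach (the helper is hypothetical):
the sign bits go straight into a GPR, with no store to the stack and reload of
the high word.

#include <emmintrin.h>

static int signbit_sse2(double d) {
  return _mm_movemask_pd(_mm_set_sd(d)) & 1;   /* bit 0 = sign of lane 0 */
}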
382 //===---------------------------------------------------------------------===//
384 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
385 (aligned) vector load. This functionality has a couple of problems.
387 1. The code to infer alignment from loads of globals is in the X86 backend,
388 not the dag combiner. This is because dagcombine2 needs to be able to see
389 through the X86ISD::Wrapper node, which DAGCombine can't really do.
390 2. The code for turning 4 x load into a single vector load is target
391 independent and should be moved to the dag combiner.
392 3. The code for turning 4 x load into a vector load can only handle a direct
393 load from a global or a direct load from the stack. It should be generalized
394 to handle any load from P, P+4, P+8, P+12, where P can be anything.
395 4. The alignment inference code cannot handle loads from globals in non-static
396 mode because it doesn't look through the extra dyld stub load. If you try
397 vec_align.ll without -relocation-model=static, you'll see what I mean.
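As a source-level illustration of the merging described above (the helper is
hypothetical), these are the four adjacent loads from P, P+4, P+8, P+12 that
item 3 would like handled in general:

#include <xmmintrin.h>

static __m128 load4(const float *P) {
  return _mm_set_ps(P[3], P[2], P[1], P[0]);   /* ideally one movaps/movups */
}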
399 //===---------------------------------------------------------------------===//
401 We should lower store(fneg(load p), q) into an integer load+xor+store, which
402 eliminates a constant pool load. For example, consider:
define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
 %tmp6 = fsub float -0.000000e+00, %z.1         ; <float> [#uses=1]
 %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
 ret i64 %tmp20
}

declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
412 This currently compiles to:
414 LCPI1_0: # <4 x float>
415 .long 2147483648 # float -0
416 .long 2147483648 # float -0
417 .long 2147483648 # float -0
418 .long 2147483648 # float -0
421 movss 16(%esp), %xmm0
423 movss 20(%esp), %xmm0
430 Note the load into xmm0, then xor (to negate), then store. In PIC mode,
431 this code computes the pic base and does two loads to do the constant pool
432 load, so the improvement is much bigger.
434 The tricky part about this xform is that the argument load/store isn't exposed
435 until post-legalize, and at that point, the fneg has been custom expanded into
436 an X86 fxor. This means that we need to handle this case in the x86 backend
437 instead of in target independent code.
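A minimal C sketch of the target pattern (the helper is hypothetical): the
negation becomes an integer load, xor of the sign bit, and integer store, with
no XMM register or constant-pool mask involved.

#include <string.h>

static void store_fneg(const float *p, float *q) {
  unsigned int bits;
  memcpy(&bits, p, sizeof bits);   /* integer load  */
  bits ^= 0x80000000u;             /* flip sign bit */
  memcpy(q, &bits, sizeof bits);   /* integer store */
}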
439 //===---------------------------------------------------------------------===//
441 Non-SSE4 insert into 16 x i8 is atrociously bad.
443 //===---------------------------------------------------------------------===//
445 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
448 //===---------------------------------------------------------------------===//
450 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
451 sitting between the truncate and the extract.
453 //===---------------------------------------------------------------------===//
INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 operands simultaneously. Currently we only use it for simple
insertions.
459 See comments in LowerINSERT_VECTOR_ELT_SSE4.
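A minimal intrinsics sketch of what a single INSERTPS can express (the helper
and the lane choices are arbitrary, for illustration): bits 7:6 of the
immediate pick the source lane, bits 5:4 the destination lane, and bits 3:0
zero additional lanes at the same time.

#include <smmintrin.h>

static __m128 insert_and_zero(__m128 a, __m128 b) {
  /* put b[2] into a[0] and zero a[3] in one instruction */
  return _mm_insert_ps(a, b, (2 << 6) | (0 << 4) | 0x8);
}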
461 //===---------------------------------------------------------------------===//
On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.
467 Note: this is not a code quality issue; the custom lowered code happens to be
468 right, but we shouldn't have to custom lower anything. This is probably related
469 to <2 x i64> ops being so bad.
471 //===---------------------------------------------------------------------===//
473 'select' on vectors and scalars could be a whole lot better. We currently
474 lower them to conditional branches. On x86-64 for example, we compile this:
476 double test(double a, double b, double c, double d) { return a<b ? c : d; }
For unpredictable branches, the latter is much more efficient. This should
just be a matter of having scalar SSE map to SELECT_CC and custom expanding
or iseling it.
502 //===---------------------------------------------------------------------===//
LLVM currently generates stack realignment code when it is not actually
needed. The problem is that we need to know about stack alignment too early,
before RA runs.

At that point we don't know whether there will be vector spills or not.
The stack realignment logic is overly conservative here, but otherwise we can
produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
515 #include <emmintrin.h>
517 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
                          - 22725, - 12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}
527 Generated code (x86-32, linux):
532 movaps .LCPI1_0, %xmm1
538 //===---------------------------------------------------------------------===//
#include <emmintrin.h>
__m128 foo2 (float x) {
 return _mm_set_ps (0, 0, x, 0);
}
546 In x86-32 mode, we generate this spiffy code:
550 pshufd $81, %xmm0, %xmm0
553 in x86-64 mode, we generate this code, which could be better:
558 pshufd $81, %xmm1, %xmm0
561 In sse4 mode, we could use insertps to make both better.
563 Here's another testcase that could use insertps [mem]:
#include <xmmintrin.h>

extern float x2, x3;

__m128 foo1 (float x1, float x4) {
 return _mm_set_ps (x2, x1, x3, x4);
}
571 gcc mainline compiles it to:
574 insertps $0x10, x2(%rip), %xmm0
575 insertps $0x10, x3(%rip), %xmm1
581 //===---------------------------------------------------------------------===//
583 We compile vector multiply-by-constant into poor code:
585 define <4 x i32> @f(<4 x i32> %i) nounwind {
586 %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
590 On targets without SSE4.1, this compiles into:
592 LCPI1_0: ## <4 x i32>
601 pshufd $3, %xmm0, %xmm1
603 imull LCPI1_0+12, %eax
605 pshufd $1, %xmm0, %xmm2
607 imull LCPI1_0+4, %eax
609 punpckldq %xmm1, %xmm2
615 imull LCPI1_0+8, %eax
617 punpckldq %xmm0, %xmm1
619 punpckldq %xmm2, %xmm0
622 It would be better to synthesize integer vector multiplication by constants
623 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
624 simple cases such as multiplication by powers of two would be better as
625 vector shifts than as multiplications.
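A minimal sketch of the shift-and-add synthesis for this particular constant
(the helper is hypothetical): v*10 = (v << 3) + (v << 1), i.e. two pslld and
one paddd, with no constant-pool load and no scalarized imuls.

#include <emmintrin.h>

static __m128i mul_by_10(__m128i v) {
  return _mm_add_epi32(_mm_slli_epi32(v, 3), _mm_slli_epi32(v, 1));
}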
627 //===---------------------------------------------------------------------===//
634 return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
640 pinsrw $2, %eax, %xmm0
642 pinsrw $3, %eax, %xmm0
644 pinsrw $7, %eax, %xmm0
650 movzbl 16(%esp), %eax
652 pinsrw $3, %eax, %xmm0
667 With SSE4, it should be
668 movdqa .LC0(%rip), %xmm0
669 pinsrb $6, %edi, %xmm0
671 //===---------------------------------------------------------------------===//
We should transform a shuffle of two vectors of constants into a single vector
of constants. Insertelement of a constant into a vector of constants should
likewise result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
677 We compiled it to something horrible:
681 .long 1065353216 ## float 1
685 LCPI1_0: ## <4 x float>
687 .long 1065353216 ## float 1
689 .long 1065353216 ## float 1
695 movhps LCPI1_0, %xmm0
698 shufps $2, %xmm1, %xmm2
699 shufps $132, %xmm2, %xmm0
702 //===---------------------------------------------------------------------===//
float foo(unsigned char x) {
  return x;
}
711 compiles to (x86-32):
define float @foo(i8 zeroext %x) nounwind {
	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
	ret float %tmp12
}
729 We should be able to use:
	cvtsi2ss 8(%esp), %xmm0
731 since we know the stack slot is already zext'd.
733 //===---------------------------------------------------------------------===//
Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.
739 //===---------------------------------------------------------------------===//
We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE. SciMark montecarlo contains code like this:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples)* 4.0;
}
749 In fpstack mode, this compiles into:
752 .long 1082130432 ## float 4.000000e+00
753 _MonteCarlo_num_flops:
762 in SSE mode, it compiles into significantly slower code:
764 _MonteCarlo_num_flops:
766 cvtsi2sd 16(%esp), %xmm0
There are also other cases in scimark where using fpstack is better: it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done on the fp stack, etc.
777 //===---------------------------------------------------------------------===//
779 The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
780 "cmpsd". For example, this code:
782 double d1(double x) { return x == x ? x : x + x; }
794 Also, the 'ret's should be shared. This is PR6032.
796 //===---------------------------------------------------------------------===//
These should compile into the same code (PR6214). Perhaps instcombine should
canonicalize the former into the latter?
define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}
814 //===---------------------------------------------------------------------===//
816 This IR (from PR6194):
818 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
819 target triple = "x86_64-apple-darwin10.0.0"
821 %0 = type { double, double }
822 %struct.float3 = type { float, float, float }
define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
 %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
827 %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1]
828 %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1]
829 %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1]
830 %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1]
831 %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1]
832 %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
 store float %tmp12, float* %tmp5
 ret void
}
845 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
846 doing a shuffle from v[1] to v[0] then a float store.
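A minimal intrinsics sketch of the suggested approach (the helper is
hypothetical): shuffle lane 1 down to lane 0 and store a single float, never
leaving the SSE unit.

#include <xmmintrin.h>

static void store_lane1(__m128 v, float *out) {
  _mm_store_ss(out, _mm_shuffle_ps(v, v, 1));   /* shufps imm 1: lane 1 -> lane 0 */
}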
848 //===---------------------------------------------------------------------===//
850 On SSE4 machines, we compile this code:
852 define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
853 <2 x float> *%P) nounwind {
854 %Z = fadd <2 x float> %Q, %R
856 store <2 x float> %Z, <2 x float> *%P
864 insertps $0, %xmm2, %xmm2
865 insertps $16, %xmm3, %xmm2
866 insertps $0, %xmm0, %xmm3
867 insertps $16, %xmm1, %xmm3
871 pshufd $1, %xmm3, %xmm1
872 ## kill: XMM1<def> XMM1<kill>
875 The insertps's of $0 are pointless complex copies.
877 //===---------------------------------------------------------------------===//