//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

There are serious issues folding loads into "scalar sse" intrinsics. For
example, this:

float minss4( float x, float *y ) {
  return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x),_mm_set_ss(*y)));
}

compiles to code containing:

***     movss 8(%esp), %xmm0
***     movss (%eax), %xmm1
***     minss %xmm1, %xmm0

Each operand of the minss is a load. At least one should be folded!

//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: Significant speedups possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
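
As a rough sketch of the flavor of expansion (illustrative only: a real
expansion must also consider NaNs, values that don't fit in an integer, and
the rounding-mode state), floor can be open-coded around a truncating
float->int conversion:

/* Hypothetical illustration: floor(x) via cvttsd2si-style truncation.
   Assumes x is not NaN and fits in a 64-bit integer. */
double inline_floor(double x) {
  double t = (double)(long long)x; /* truncates toward zero */
  return t > x ? t - 1.0 : t;      /* step down for negative non-integers */
}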

//===---------------------------------------------------------------------===//

When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
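
A minimal sketch of what such prologue code could do, assuming the standard
MXCSR bit layout (FTZ is bit 15, DAZ is bit 6; DAZ also needs hardware
support):

#include <xmmintrin.h>

/* Sketch: turn on FTZ (bit 15) and DAZ (bit 6) in MXCSR. Exactly which
   "fast" bits unsafemath should enable is a policy decision. */
static void enable_fast_sse_modes(void) {
  _mm_setcsr(_mm_getcsr() | 0x8040);
}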

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs.
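
As a sketch of the payoff (not a proposed implementation): an i64 add kept in
an XMM register is a single SSE2 paddq, where 32-bit scalar code needs an
add/adc pair per i64.

#include <emmintrin.h>

/* Two i64 adds in one paddq instruction. */
__m128i add_two_i64(__m128i a, __m128i b) {
  return _mm_add_epi64(a, b);
}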

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'

The pattern isel got this one right.

//===---------------------------------------------------------------------===//

SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction
like this:
  X += y

and the register allocator decides to spill X, it is cheaper to emit this as:

Y += [xmm slot]
X = Y

than:

X = [xmm slot]
X += Y

..and this uses one fewer register (so this should be done at load folding
time, not at spiller time). *Note* however that this can only be done
if Y is dead. Here's a testcase:

%.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
implementation   ; Functions:
declare void %printf(int, ...)

void %main() {
build_tree.exit:
        br label %no_exit.i7

no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
        %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]            ; <double> [#uses=1]
        %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]           ; <double> [#uses=1]
        %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
        %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
        br bool false, label %Compute_Tree.exit23, label %no_exit.i7

Compute_Tree.exit23:            ; preds = %no_exit.i7
        tail call void (int, ...)* %printf( int 0 )
        store double %tmp.34.i18, double* null
        ret void
}

We currently emit (excerpt):

        ...
***     movsd %XMM2, QWORD PTR [%ESP + 8]
***     addsd %XMM2, %XMM1
***     movsd QWORD PTR [%ESP + 8], %XMM2
        jmp .BBmain_1   # no_exit.i7

This is a bugpoint reduced testcase, which is why the testcase doesn't make
much sense (e.g. it's an infinite loop). :)

//===---------------------------------------------------------------------===//

SSE should implement 'select_cc' using 'emulated conditional moves' that use
pcmp/pand/pandn/por to do a selection instead of a conditional branch:

double %X(double %Y, double %Z, double %A, double %B) {
        %C = setlt double %A, %B
        %z = add double %Z, 0.0         ;; select operand is not a load
        %D = select bool %C, double %Y, double %z
        ret double %D
}

We currently emit branchy code like this (excerpt):

        ...
        addsd 24(%esp), %xmm0
        movsd 32(%esp), %xmm1
        movsd 16(%esp), %xmm2
        ucomisd 40(%esp), %xmm1
        ...
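
In intrinsics form, the emulated conditional move is just a compare plus
and/andnot/or; a minimal sketch (the helper name is made up):

#include <xmmintrin.h>

/* sel = (mask & y) | (~mask & z), where mask is all-ones in lanes where
   a < b and all-zeros elsewhere. No branches involved. */
__m128 select_lt(__m128 a, __m128 b, __m128 y, __m128 z) {
  __m128 mask = _mm_cmplt_ps(a, b);
  return _mm_or_ps(_mm_and_ps(mask, y), _mm_andnot_ps(mask, z));
}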

//===---------------------------------------------------------------------===//

It's not clear whether we should use pxor or xorps / xorpd to clear XMM
registers. The choice may depend on subtarget information. We should do some
more experiments on different x86 machines.

//===---------------------------------------------------------------------===//

Currently the x86 codegen isn't very good at mixing SSE and FPStack
code:

unsigned int foo(double x) { return x; }

currently compiles to a mix of both (excerpt):

        movsd 24(%esp), %xmm0
        ...

This will be solved when we go to a dynamic programming based isel.

//===---------------------------------------------------------------------===//

Should generate min/max for stuff like:

void minf(float a, float b, float *X) {
  *X = (a < b) ? a : b;
}

Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
and ISD::FMAX node types?

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.
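
A sketch of the shape of the expansion for memset, assuming a 16-byte-aligned
destination and a size that is a multiple of 16 (the function name is made
up):

#include <emmintrin.h>
#include <stddef.h>

/* Sketch: memset lowered to a run of 128-bit stores. */
void memset_sse(void *dst, char c, size_t n) {
  __m128i v = _mm_set1_epi8(c);
  for (size_t i = 0; i != n; i += 16)
    _mm_store_si128((__m128i *)((char *)dst + i), v);
}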

//===---------------------------------------------------------------------===//

Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
the reg-reg copy in this example:

float foo(int *x, float *y, unsigned c) {
  float res = 0.0;
  unsigned i;
  for (i = 0; i < c; i++) {
    float xx = (float)x[i];
    xx = xx * y[i];
    xx += res;
    res = xx;
  }
  return res;
}

LBB_foo_3:      # no_exit
        cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
        mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
        ...
****    movaps %XMM1, %XMM0
        jb LBB_foo_3    # no_exit

//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (x^y & mask)
when using SSE.
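
A scalar sketch of the identity being exploited: the copysign results compare
equal exactly when the sign bits of x and y agree, so the test reduces to an
xor of the sign bits (assuming IEEE doubles; the helper is hypothetical):

#include <stdint.h>
#include <string.h>

/* copysign(1.0, x) == copysign(1.0, y) is just "sign bits agree". */
int same_sign(double x, double y) {
  uint64_t xb, yb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  return ((xb ^ yb) >> 63) == 0;
}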

//===---------------------------------------------------------------------===//

Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
of a v4sf value.
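
In intrinsics form these are _mm_loadh_pi / _mm_loadl_pi; a small sketch (the
wrapper names are made up):

#include <xmmintrin.h>

/* movhps: replace the upper two floats of v with the two floats at *p,
   leaving the lower half untouched. */
__m128 update_high(__m128 v, const __m64 *p) {
  return _mm_loadh_pi(v, p);
}

/* movlps: the same for the lower two floats. */
__m128 update_low(__m128 v, const __m64 *p) {
  return _mm_loadl_pi(v, p);
}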

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?
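
For example, { x, 0, 0, 0 } can be built by clearing a register and moving the
scalar in; a sketch in intrinsics (the helper name is made up):

#include <xmmintrin.h>

/* xorps to zero a register, then movss the low element in:
   result = { x0, 0, 0, 0 }. */
__m128 low_elt_zero_rest(__m128 x) {
  return _mm_move_ss(_mm_setzero_ps(), x);
}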

//===---------------------------------------------------------------------===//

Compile this:

void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }

For the latter we generate (excerpt):

        ...
        unpcklps %xmm1, %xmm2
        ...
        unpcklps %xmm0, %xmm1
        unpcklps %xmm2, %xmm1
        ...

This seems like it should use shufps, one for each of a & b.
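
A sketch of the intended shape in intrinsics for the { a, 0.0, 0.0, b } case
(the helper name is made up):

#include <xmmintrin.h>

/* Each _mm_set_ss zero-fills the upper lanes; one shufps then picks lanes
   {av0, av1, bv1, bv0} = { a, 0, 0, b }. */
__m128 make_a00b(float a, float b) {
  __m128 av = _mm_set_ss(a); /* { a, 0, 0, 0 } */
  __m128 bv = _mm_set_ss(b); /* { b, 0, 0, 0 } */
  return _mm_shuffle_ps(av, bv, _MM_SHUFFLE(0, 1, 1, 0));
}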

//===---------------------------------------------------------------------===//

How to decide when to use the "floating point version" of logical ops? Here
are two code fragments (excerpts):

        movaps LCPI5_5, %xmm2
        ...
        mulps 8656(%ecx), %xmm3
        addps 8672(%ecx), %xmm3
        ...

        movaps LCPI5_5, %xmm1
        ...
        mulps 8656(%ecx), %xmm3
        addps 8672(%ecx), %xmm3
        ...
        movaps %xmm3, 112(%esp)
        ...

Due to some minor source change, the latter case ended up using orps and movaps
instead of por and movdqa. Does it matter?

//===---------------------------------------------------------------------===//

X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
to choose between movaps, movapd, and movdqa based on types of source and
destination?

How about andps, andpd, and pand? Do we really care about the type of the packed
elements? If not, why not always use the "ps" variants, which are likely to be
shorter?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
generates (excerpt):

        movaps (%edx), %xmm2                            #59.21
        movaps (%edx), %xmm5                            #60.21
        movaps (%edx), %xmm4                            #61.21
        movaps (%edx), %xmm3                            #62.21
        movl 40(%ecx), %ebp                             #69.49
        shufps $0, %xmm2, %xmm5                         #60.21
        movl 100(%esp), %ebx                            #69.20
        movl (%ebx), %edi                               #69.20
        imull %ebp, %edi                                #69.49
        addl (%eax), %edi                               #70.33
        shufps $85, %xmm2, %xmm4                        #61.21
        shufps $170, %xmm2, %xmm3                       #62.21
        shufps $255, %xmm2, %xmm2                       #63.21
        lea (%ebp,%ebp,2), %ebx                         #69.49
        ...
        lea -3(%edi,%ebx), %ebx                         #70.33
        ...
        addl 32(%ecx), %ebx                             #68.37
        testb $15, %bl                                  #91.13
        jne L_B1.24     # Prob 5%                       #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
        %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %EAX = MOV32ri -3
        %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
        ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
        %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
        %EDX = MOV32rm %EDX, 1, %NOREG, 40
        IMUL32rr %EAX<def&use>, %EDX
        %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 0
        MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
        %EAX = LEA32r %ESI, 1, %EAX, -3
        %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 32
        %EDI = MOV32rr %EAX
        SHL32ri %EDI<def&use>, 4
        ADD32rr %EDI<def&use>, %ESI
        %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
        %XMM1 = MOVAPSrr %XMM0
        SHUFPSrr %XMM1<def&use>, %XMM1, 170
        %XMM2 = MOVAPSrr %XMM0
        SHUFPSrr %XMM2<def&use>, %XMM2, 0
        %XMM3 = MOVAPSrr %XMM0
        SHUFPSrr %XMM3<def&use>, %XMM3, 255
        SHUFPSrr %XMM0<def&use>, %XMM0, 85
        %EBX = MOV32rr %EDI
        AND32ri8 %EBX<def&use>, 15
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
since the same value appears as operand two of more than one shufps, a number
of copies are needed. Note that icc suffers from the same problem. Either the
instruction selector should select pshufd, or the register allocator could
perform the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500

LLVM is producing bad code.

LBB_main_4:     # cond_true44
        ...
        jne LBB_main_4  # cond_true44

There are two problems. 1) There is no need for two loop induction variables;
we can compare against 262144 * 16. 2) A known register coalescer issue: we
should be able to eliminate one of the movaps:

        addps %xmm2, %xmm1    <=== Commute!
        ...
        movaps %xmm1, %xmm1   <=== Eliminate!
        ...
        jne LBB_main_4  # cond_true44

//===---------------------------------------------------------------------===//

Compile this:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This currently compiles into (approximately):

        movss 4(%esp), %xmm1
        mulss %xmm1, %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zeroed (movss from memory zeros them). We could compile
this to:

        movss 4(%esp), %xmm0
        mulss %xmm0, %xmm0
        ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea. Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps c(%esp), %xmm1
        ...
        movaps %xmm1, %xmm0
        ret

Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        movaps c2(%esp), %xmm1
        movaps %xmm1, %xmm0
        ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        movss c2(%esp), %xmm0
        ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros and the one element instead of all 4 elements.
This can be used to simplify a variety of shuffle operations, where the
elements are fixed zeros.

//===---------------------------------------------------------------------===//

This code:

#include <emmintrin.h>
void test(__m128d *r, __m128d *A, double B) {
  *r = _mm_loadl_pd(*A, &B);
}

currently compiles to code that spills B to the stack and reloads it,
beginning with:

        movsd 24(%esp), %xmm0
        ...

icc generates:

        movl 4(%esp), %edx                              #3.6
        movl 8(%esp), %eax                              #3.6
        movapd (%eax), %xmm0                            #4.22
        movlpd 12(%esp), %xmm0                          #4.8
        movapd %xmm0, (%edx)                            #4.3
        ret

So icc is smart enough to know that B is already in memory, so it doesn't load
it and store it back to the stack.

//===---------------------------------------------------------------------===//

__m128d test1( __m128d A, __m128d B) {
  return _mm_shuffle_pd(A, B, 0x3);
}

compiles to:

        shufpd $3, %xmm1, %xmm0

Perhaps it's better to use unpckhpd instead?

        unpckhpd %xmm1, %xmm0

Don't know if unpckhpd is faster. But it is shorter.

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

void %test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %loadVector37.712 = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates (excerpt):

        ...
        movd %xmm0, %eax        ;; EAX = 0!
        ...
        pinsrw $6, %eax, %xmm0
        shrl $16, %eax          ;; EAX = 0 again!
        pinsrw $7, %eax, %xmm0
        ...

It would be better to generate:

        ...
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0
        ...

or use pxor (to make a zero vector) and shuffle (to insert it).

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.
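
For example (a sketch of the identity involved): SSE has cmpltps but no
cmpgtps, so a greater-than compare is a cmpltps with the operands swapped:

#include <xmmintrin.h>

/* a > b lane-wise, expressed as b < a so it maps onto cmpltps. */
__m128 cmp_gt_ps(__m128 a, __m128 b) {
  return _mm_cmplt_ps(b, a);
}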

//===---------------------------------------------------------------------===//

Implement some missing insert/extract element operations without going through
the stack. Testcase here:
CodeGen/X86/vec_ins_extract.ll

corresponds to this C code:

typedef float vectorfloat __attribute__((vector_size(16)));
void test(vectorfloat *F, float f) {
  vectorfloat G = *F + *F;
  ((float*)&G)[0] = f;
  *F = G;
}
void test2(vectorfloat *F, float f) {
  vectorfloat G = *F + *F;
  ((float*)&G)[2] = f;
  *F = G;
}
void test3(vectorfloat *F, float *f) {
  vectorfloat G = *F + *F;
  *f = ((float*)&G)[2];
}
void test4(vectorfloat *F, float *f) {
  vectorfloat G = *F + *F;
  *f = ((float*)&G)[0];
}

//===---------------------------------------------------------------------===//

Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.
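
A sketch of the pattern in C, assuming the four scalars are adjacent and
suitably aligned (the function name is made up):

#include <xmmintrin.h>

/* Four adjacent scalar loads that should become one 128-bit load
   (movaps/movups) instead of four movss + shuffles. */
__m128 load4(const float *p) {
  return _mm_set_ps(p[3], p[2], p[1], p[0]);
}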