1 //===---------------------------------------------------------------------===//
2 // Random ideas for the X86 backend: SSE-specific stuff.
3 //===---------------------------------------------------------------------===//
5 //===---------------------------------------------------------------------===//
There are serious issues folding loads into "scalar sse" intrinsics. For
example:
float minss4( float x, float *y ) {
  return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x),_mm_set_ss(*y)));
}
19 *** movss 8(%esp), %xmm0
20 *** movss (%eax), %xmm1
21 *** minss %xmm1, %xmm0
27 Each operand of the minss is a load. At least one should be folded!
29 //===---------------------------------------------------------------------===//
When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
34 //===---------------------------------------------------------------------===//
36 Think about doing i64 math in SSE regs.
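One place this could pay off is plain 64-bit integer arithmetic on x86-32.
The sketch below is illustrative only (the function name is made up, not from
the test suite):

  /* On x86-32 this is an add/adc pair in integer register pairs today;
     with SSE2, paddq could do the 64-bit add in an XMM register
     (modulo the cost of moving the values in and out). */
  unsigned long long add64(unsigned long long a, unsigned long long b) {
    return a + b;
  }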
38 //===---------------------------------------------------------------------===//
This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
    %C = select bool %B, double 123.412, double 523.01123123
    ret double %C
}
48 Currently, the select is being lowered, which prevents the dag combiner from
49 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
51 The pattern isel got this one right.
53 //===---------------------------------------------------------------------===//
SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction
like "X += Y" and the register allocator decides to spill X, it is cheaper to
fold the spill slot into the operation (add from [xslot] into Y, then store Y
back to [xslot]) than to reload X into a temporary, add Y into it, and store
the temporary back. That form uses one fewer register (so this should be done
at load folding time, not at spiller time). *Note* however that this can only
be done if Y is dead. Here's a testcase:
%.str_3 = external global [15 x sbyte] ; <[15 x sbyte]*> [#uses=0]

implementation ; Functions:

declare void %printf(int, ...)

void %main() {
build_tree.exit:
    br label %no_exit.i7

no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit
    %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ] ; <double> [#uses=1]
    %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] ; <double> [#uses=1]
    %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
    %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
    br bool false, label %Compute_Tree.exit23, label %no_exit.i7

Compute_Tree.exit23: ; preds = %no_exit.i7
    tail call void (int, ...)* %printf( int 0 )
    store double %tmp.34.i18, double* null
    ret void
}

We currently emit:
98 *** movsd %XMM2, QWORD PTR [%ESP + 8]
99 *** addsd %XMM2, %XMM1
100 *** movsd QWORD PTR [%ESP + 8], %XMM2
101 jmp .BBmain_1 # no_exit.i7
This is a bugpoint-reduced testcase, which is why it doesn't make much sense
(e.g. it's an infinite loop). :)
106 //===---------------------------------------------------------------------===//
108 SSE should implement 'select_cc' using 'emulated conditional moves' that use
109 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
double %X(double %Y, double %Z, double %A, double %B) {
    %C = setlt double %A, %B
    %z = add double %Z, 0.0    ;; select operand is not a load
    %D = select bool %C, double %Y, double %z
    ret double %D
}
123 addsd 24(%esp), %xmm0
124 movsd 32(%esp), %xmm1
125 movsd 16(%esp), %xmm2
126 ucomisd 40(%esp), %xmm1
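For reference, a source-level sketch of the branchless pattern with SSE2
intrinsics (illustrative only; the actual lowering details are up to the
backend):

  #include <emmintrin.h>

  /* D = (A < B) ? Y : z, computed with a mask instead of a branch. */
  __m128d select_lt(__m128d Y, __m128d z, __m128d A, __m128d B) {
    __m128d mask = _mm_cmplt_sd(A, B);          /* low lane: all-ones or all-zeros */
    return _mm_or_pd(_mm_and_pd(mask, Y),       /* Y where the mask is set   */
                     _mm_andnot_pd(mask, z));   /* z where the mask is clear */
  }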
136 //===---------------------------------------------------------------------===//
138 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
139 registers. The choice may depend on subtarget information. We should do some
140 more experiments on different x86 machines.
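At the source level all three zeroing idioms are reachable through the
standard intrinsics; which instruction they should become is exactly the
subtarget question above (sketch):

  #include <emmintrin.h>

  __m128  zfp(void)  { return _mm_setzero_ps();    }  /* usually xorps */
  __m128d zdp(void)  { return _mm_setzero_pd();    }  /* usually xorpd */
  __m128i zint(void) { return _mm_setzero_si128(); }  /* usually pxor  */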
142 //===---------------------------------------------------------------------===//
Currently the x86 codegen isn't very good at mixing SSE and FPStack
code:
147 unsigned int foo(double x) { return x; }
151 movsd 24(%esp), %xmm0
159 This will be solved when we go to a dynamic programming based isel.
161 //===---------------------------------------------------------------------===//
163 Should generate min/max for stuff like:
void minf(float a, float b, float *X) {
  *X = a <= b ? a : b;
}
169 Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
170 and ISD::FMAX node types?
172 //===---------------------------------------------------------------------===//
Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.
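A rough sketch of the expansion for the easy case (16-byte aligned pointers,
size a known multiple of 16; alignment and tail handling are the real work):

  #include <emmintrin.h>

  void copy_aligned(void *dst, const void *src, unsigned n) {
    __m128i *d = (__m128i *)dst;
    const __m128i *s = (const __m128i *)src;
    unsigned i;
    for (i = 0; i < n / 16; ++i)   /* one 128-bit load + store per 16 bytes */
      _mm_store_si128(d + i, _mm_load_si128(s + i));
  }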
177 //===---------------------------------------------------------------------===//
179 Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
180 the reg-reg copy in this example:
182 float foo(int *x, float *y, unsigned c) {
185 for (i = 0; i < c; i++) {
186 float xx = (float)x[i];
195 cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
196 mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
200 **** movaps %XMM1, %XMM0
201 jb LBB_foo_3 # no_exit
203 //===---------------------------------------------------------------------===//
Codegen:

  if (copysign(1.0, x) == copysign(1.0, y))

into:

  if (x^y & mask)

when using SSE.
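A scalar C sketch of the equivalence being exploited (the helper name and the
memcpy-based bit copy are just for illustration):

  #include <stdint.h>
  #include <string.h>

  /* copysign(1.0, x) == copysign(1.0, y) holds exactly when x and y have
     the same sign bit, so the test reduces to bit 63 of x ^ y. */
  int same_sign(double x, double y) {
    uint64_t xb, yb;
    memcpy(&xb, &x, sizeof xb);
    memcpy(&yb, &y, sizeof yb);
    return ((xb ^ yb) & 0x8000000000000000ULL) == 0;
  }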
211 //===---------------------------------------------------------------------===//
Use movhps to update the upper 64 bits of a v4sf value. Also use movlps on
the lower half of a v4sf value.
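The corresponding intrinsics, for reference (sketch):

  #include <xmmintrin.h>

  /* movhps: replace the upper two floats of v with the two floats at p. */
  __m128 update_high(__m128 v, const float *p) {
    return _mm_loadh_pi(v, (const __m64 *)p);
  }

  /* movlps: replace the lower two floats of v with the two floats at p. */
  __m128 update_low(__m128 v, const float *p) {
    return _mm_loadl_pi(v, (const __m64 *)p);
  }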
216 //===---------------------------------------------------------------------===//
Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?
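For the { x, 0, 0, 0 } case, the clear-then-insert idea at the source level
(illustrative):

  #include <xmmintrin.h>

  /* Given v = { v0, v1, v2, v3 }, build { v0, 0, 0, 0 }: clear a register
     (pxor / xorps) and then move the low element across (movss). */
  __m128 keep_low(__m128 v) {
    return _mm_move_ss(_mm_setzero_ps(), v);
  }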
221 //===---------------------------------------------------------------------===//
225 void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
226 void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }
For the latter we generate:
234 unpcklps %xmm1, %xmm2
236 unpcklps %xmm0, %xmm1
237 unpcklps %xmm2, %xmm1
242 This seems like it should use shufps, one for each of a & b.
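An intrinsics sketch of the shuffle-based construction; for the first form a
single shufps suffices, and for the second a single unpcklps (illustrative
only):

  #include <xmmintrin.h>

  __m128 make_a00b(float a, float b) {           /* { a, 0, 0, b } */
    __m128 va = _mm_set_ss(a);                   /* { a, 0, 0, 0 } */
    __m128 vb = _mm_set_ss(b);                   /* { b, 0, 0, 0 } */
    return _mm_shuffle_ps(va, vb, _MM_SHUFFLE(0, 1, 1, 0));
  }

  __m128 make_ab00(float a, float b) {           /* { a, b, 0, 0 } */
    return _mm_unpacklo_ps(_mm_set_ss(a), _mm_set_ss(b));
  }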
244 //===---------------------------------------------------------------------===//
How to decide when to use the "floating point version" of logical ops? Here
are some code fragments:
249 movaps LCPI5_5, %xmm2
252 mulps 8656(%ecx), %xmm3
253 addps 8672(%ecx), %xmm3
259 movaps LCPI5_5, %xmm1
262 mulps 8656(%ecx), %xmm3
263 addps 8672(%ecx), %xmm3
267 movaps %xmm3, 112(%esp)
Due to some minor source change, the latter case ended up using orps and
movaps instead of por and movdqa. Does it matter?
273 //===---------------------------------------------------------------------===//
X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
to choose between movaps, movapd, and movdqa based on the types of the source
and destination?

How about andps, andpd, and pand? Do we really care about the type of the packed
elements? If not, why not always use the "ps" variants, which are likely to be
shorter?
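The same 128-bit AND can be spelled three ways; the bits computed are
identical, and only the execution domain of the chosen instruction differs
(sketch):

  #include <emmintrin.h>

  __m128  and_ps(__m128 a, __m128 b)   { return _mm_and_ps(a, b);    }  /* andps */
  __m128d and_pd(__m128d a, __m128d b) { return _mm_and_pd(a, b);    }  /* andpd */
  __m128i and_pi(__m128i a, __m128i b) { return _mm_and_si128(a, b); }  /* pand  */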
283 //===---------------------------------------------------------------------===//
285 External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
generates:
289 movaps (%edx), %xmm2 #59.21
290 movaps (%edx), %xmm5 #60.21
291 movaps (%edx), %xmm4 #61.21
292 movaps (%edx), %xmm3 #62.21
293 movl 40(%ecx), %ebp #69.49
294 shufps $0, %xmm2, %xmm5 #60.21
295 movl 100(%esp), %ebx #69.20
296 movl (%ebx), %edi #69.20
297 imull %ebp, %edi #69.49
298 addl (%eax), %edi #70.33
299 shufps $85, %xmm2, %xmm4 #61.21
300 shufps $170, %xmm2, %xmm3 #62.21
301 shufps $255, %xmm2, %xmm2 #63.21
302 lea (%ebp,%ebp,2), %ebx #69.49
304 lea -3(%edi,%ebx), %ebx #70.33
306 addl 32(%ecx), %ebx #68.37
307 testb $15, %bl #91.13
308 jne L_B1.24 # Prob 5% #91.13
310 This is the llvm code after instruction scheduling:
312 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
313 %reg1078 = MOV32ri -3
314 %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
315 %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
316 %reg1080 = IMUL32rr %reg1079, %reg1037
317 %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
318 %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
319 %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
320 %reg1082 = SHL32ri %reg1038, 4
321 %reg1039 = ADD32rr %reg1036, %reg1082
322 %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
323 %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
324 %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
325 %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
326 %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
327 %reg1040 = MOV32rr %reg1039
328 %reg1084 = AND32ri8 %reg1039, 15
330 JE mbb<cond_next204,0xa914d30>
332 Still ok. After register allocation:
334 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
336 %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
337 ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
338 %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
339 %EDX = MOV32rm %EDX, 1, %NOREG, 40
340 IMUL32rr %EAX<def&use>, %EDX
341 %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
342 %ESI = MOV32rm %ESI, 1, %NOREG, 0
343 MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
344 %EAX = LEA32r %ESI, 1, %EAX, -3
345 %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
346 %ESI = MOV32rm %ESI, 1, %NOREG, 32
348 SHL32ri %EDI<def&use>, 4
349 ADD32rr %EDI<def&use>, %ESI
350 %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
351 %XMM1 = MOVAPSrr %XMM0
352 SHUFPSrr %XMM1<def&use>, %XMM1, 170
353 %XMM2 = MOVAPSrr %XMM0
354 SHUFPSrr %XMM2<def&use>, %XMM2, 0
355 %XMM3 = MOVAPSrr %XMM0
356 SHUFPSrr %XMM3<def&use>, %XMM3, 255
357 SHUFPSrr %XMM0<def&use>, %XMM0, 85
359 AND32ri8 %EBX<def&use>, 15
361 JE mbb<cond_next204,0xa914d30>
This looks really bad. The problem is that shufps is a destructive opcode:
the same value appears as the second operand of more than one shufps, which
forces a number of copies. Note that icc suffers from the same problem.
Either the instruction selector should select pshufd, or the register
allocator should perform the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.
371 //===---------------------------------------------------------------------===//
373 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
375 LLVM is producing bad code.
377 LBB_main_4: # cond_true44
388 jne LBB_main_4 # cond_true44
There are two problems. 1) There is no need for two loop induction variables;
we can compare against 262144 * 16. 2) A known register coalescer issue. We
should be able to eliminate one of the movaps:
394 addps %xmm2, %xmm1 <=== Commute!
397 movaps %xmm1, %xmm1 <=== Eliminate!
404 jne LBB_main_4 # cond_true44
406 //===---------------------------------------------------------------------===//
__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}
422 Because mulss doesn't modify the top 3 elements, the top elements of
423 xmm1 are already zero'd. We could compile this to:
429 //===---------------------------------------------------------------------===//
431 Here's a sick and twisted idea. Consider code like this:
__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}
439 This might compile to this code:
441 movaps c(%esp), %xmm1
Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:
449 movaps c(%esp), %xmm1
450 movaps %xmm1, c2(%esp)
454 movaps c2(%esp), %xmm1
458 However, since the reload is only used by these instructions, we could
459 "fold" it into the uses, producing something like this:
461 movaps c(%esp), %xmm1
462 movaps %xmm1, c2(%esp)
465 movss c2(%esp), %xmm0
468 ... saving two instructions.
The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in that one element plus three zeros instead of all four
elements. This can be used to simplify a variety of shuffle operations where
the remaining elements are known zeros.
475 //===---------------------------------------------------------------------===//
479 #include <emmintrin.h>
void test(__m128d *r, __m128d *A, double B) {
  *r = _mm_loadl_pd(*A, &B);
}
487 movsd 24(%esp), %xmm0
499 movl 4(%esp), %edx #3.6
500 movl 8(%esp), %eax #3.6
501 movapd (%eax), %xmm0 #4.22
502 movlpd 12(%esp), %xmm0 #4.8
503 movapd %xmm0, (%edx) #4.3
So icc is smart enough to know that B is already in memory, so it doesn't
load it and store it back to the stack.
509 //===---------------------------------------------------------------------===//
__m128d test1( __m128d A, __m128d B) {
  return _mm_shuffle_pd(A, B, 0x3);
}
517 shufpd $3, %xmm1, %xmm0
519 Perhaps it's better to use unpckhpd instead?
521 unpckhpd %xmm1, %xmm0
523 Don't know if unpckhpd is faster. But it is shorter.
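Both forms pick element 1 of each source, so they are interchangeable here
(sketch):

  #include <emmintrin.h>

  __m128d via_shufpd(__m128d A, __m128d B)   { return _mm_shuffle_pd(A, B, 0x3); }  /* { A[1], B[1] } */
  __m128d via_unpckhpd(__m128d A, __m128d B) { return _mm_unpackhi_pd(A, B);     }  /* { A[1], B[1] } */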
525 //===---------------------------------------------------------------------===//
527 This code generates ugly code, probably due to costs being off or something:
void %test(float* %P, <4 x float>* %P2 ) {
    %xFloat0.688 = load float* %P
    %loadVector37.712 = load <4 x float>* %P2
    %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
    store <4 x float> %inFloat3.713, <4 x float>* %P2
    ret void
}
541 movd %xmm0, %eax ;; EAX = 0!
544 pinsrw $6, %eax, %xmm0
545 shrl $16, %eax ;; EAX = 0 again!
546 pinsrw $7, %eax, %xmm0
550 It would be better to generate:
556 pinsrw $6, %eax, %xmm0
557 pinsrw $7, %eax, %xmm0
561 or use pxor (to make a zero vector) and shuffle (to insert it).
563 //===---------------------------------------------------------------------===//
565 Some useful information in the Apple Altivec / SSE Migration Guide:
567 http://developer.apple.com/documentation/Performance/Conceptual/
568 Accelerate_sse_migration/index.html
570 e.g. SSE select using and, andnot, or. Various SSE compare translations.
572 //===---------------------------------------------------------------------===//
574 Add hooks to commute some CMPP operations.
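For reference, the source-level reason commuting helps: the CMPP immediates
only encode eq/lt/le/unord and their negations, so a "greater than" compare is
obtained by swapping the operands of the lt form (sketch; gt_via_lt is an
illustrative name):

  #include <xmmintrin.h>

  /* There is no direct "gt" predicate for cmpps; commuting the operands of
     the "lt" comparison produces the same mask. */
  __m128 gt_via_lt(__m128 a, __m128 b) {
    return _mm_cmplt_ps(b, a);    /* a > b  <=>  b < a */
  }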
576 //===---------------------------------------------------------------------===//
578 Implement some missing insert/extract element operations without going through
579 the stack. Testcase here:
580 CodeGen/X86/vec_ins_extract.ll
581 corresponds to this C code:
typedef float vectorfloat __attribute__((vector_size(16)));
void test(vectorfloat *F, float f) {
  vectorfloat G = *F + *F;
  ((float*)&G)[0] = f;
  *F = G;
}
void test2(vectorfloat *F, float f) {
  vectorfloat G = *F + *F;
  ((float*)&G)[2] = f;
  *F = G;
}
void test3(vectorfloat *F, float *f) {
  vectorfloat G = *F + *F;
  *f = ((float*)&G)[2];
}
void test4(vectorfloat *F, float *f) {
  vectorfloat G = *F + *F;
  *f = ((float*)&G)[0];
}
603 //===---------------------------------------------------------------------===//
Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.
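A sketch of the target pattern (illustrative): a vector built from four scalar
constants should become a single 128-bit load from one merged constant-pool
entry rather than four scalar loads.

  #include <xmmintrin.h>

  __m128 four_constants(void) {
    /* Ideally one movaps from a single 16-byte constant-pool entry. */
    return _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  }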