//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
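
As a sketch of what this could mean at runtime: the relevant bits are the MXCSR
FTZ/DAZ flags, and the standard intrinsics below (from xmmintrin.h / pmmintrin.h)
show the effect such a prologue would have (the helper name here is made up):

#include <xmmintrin.h>   /* _MM_SET_FLUSH_ZERO_MODE */
#include <pmmintrin.h>   /* _MM_SET_DENORMALS_ZERO_MODE */

/* Hypothetical prologue main() could run under unsafe-math: flush denormal
   results to zero (FTZ) and treat denormal inputs as zero (DAZ). */
static void enable_fast_sse_modes(void) {
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
}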

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs.
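
For example (an illustrative function, not from this note), a plain 64-bit add
on 32-bit x86 currently becomes an add/adc pair in integer registers; with SSE2
it could be a single paddq in an XMM register:

/* i64 math that could live in SSE registers: a + b is one paddq with SSE2. */
unsigned long long add64(unsigned long long a, unsigned long long b) {
  return a + b;
}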

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.

//===---------------------------------------------------------------------===//

SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
like this:

  X += Y

and the register allocator decides to spill X, it is cheaper to emit this as:

        Y += [xslot]
        store Y -> [xslot]

than:

        tmp = [xslot]
        tmp += Y
        store tmp -> [xslot]

..and this uses one fewer register (so this should be done at load folding
time, not at spiller time).  *Note* however that this can only be done
if Y is dead.  Here's a testcase:

%.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]

implementation   ; Functions:

declare void %printf(int, ...)

void %main() {
build_tree.exit:
        br label %no_exit.i7

no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
        %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]    ; <double> [#uses=1]
        %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]   ; <double> [#uses=1]
        %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
        %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
        br bool false, label %Compute_Tree.exit23, label %no_exit.i7

Compute_Tree.exit23:            ; preds = %no_exit.i7
        tail call void (int, ...)* %printf( int 0 )
        store double %tmp.34.i18, double* null
        ret void
}

We currently emit:

        ...
***     movsd %XMM2, QWORD PTR [%ESP + 8]
***     addsd %XMM2, %XMM1
***     movsd QWORD PTR [%ESP + 8], %XMM2
        jmp .BBmain_1   # no_exit.i7

This is a bugpoint reduced testcase, which is why the testcase doesn't make
much sense (e.g. it's an infinite loop). :)

//===---------------------------------------------------------------------===//

SSE should implement 'select_cc' using 'emulated conditional moves' that use
pcmp/pand/pandn/por to do a selection instead of a conditional branch:

double %X(double %Y, double %Z, double %A, double %B) {
        %C = setlt double %A, %B
        %z = add double %Z, 0.0    ;; select operand is not a load
        %D = select bool %C, double %Y, double %z
        ret double %D
}

We currently emit:

        movsd 32(%esp), %xmm1
        movsd 16(%esp), %xmm2
        ucomisd 40(%esp), %xmm1
        ...
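
For reference, here is roughly what the emulated conditional move looks like
when written with intrinsics (an illustrative sketch, not compiler output; the
function name is invented).  cmpltps/andps/andnps/orps are the "ps" counterparts
of the pcmp/pand/pandn/por sequence mentioned above:

#include <xmmintrin.h>

/* Branch-free select: mask is all-ones in lanes where A < B, so the result
   takes Y in those lanes and Z elsewhere. */
__m128 select_lt(__m128 A, __m128 B, __m128 Y, __m128 Z) {
  __m128 mask = _mm_cmplt_ps(A, B);
  return _mm_or_ps(_mm_and_ps(mask, Y), _mm_andnot_ps(mask, Z));
}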

//===---------------------------------------------------------------------===//

It's not clear whether we should use pxor or xorps / xorpd to clear XMM
registers.  The choice may depend on subtarget information.  We should do some
more experiments on different x86 machines.

//===---------------------------------------------------------------------===//

Currently the x86 codegen isn't very good at mixing SSE and FPStack
code:

unsigned int foo(double x) { return x; }

        ...
        movsd 24(%esp), %xmm0
        ...

This will be solved when we go to a dynamic programming based isel.

//===---------------------------------------------------------------------===//

Should generate min/max for stuff like:

void minf(float a, float b, float *X) {
  *X = a <= b ? a : b;
}

Make use of floating point min / max instructions.  Perhaps introduce ISD::FMIN
and ISD::FMAX node types?

//===---------------------------------------------------------------------===//

The first BB of this code:

        %V = call bool %foo()
        br bool %V, label %T, label %F

currently compiles to code that xors the return value and then tests it.  It
would be better to emit "cmp %al, 1" than a xor and test.

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.
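
For example (illustrative only; the struct and function are made up), a small
fixed-size copy with known alignment could become four 16-byte SSE load/store
pairs (movaps/movups) instead of a libcall or a rep-movs sequence:

#include <string.h>

struct Blob { char bytes[64]; };

/* memcpy of a known 64-byte size: four 128-bit loads + four 128-bit stores. */
void copy_blob(struct Blob *dst, const struct Blob *src) {
  memcpy(dst, src, sizeof(struct Blob));
}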

//===---------------------------------------------------------------------===//

Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
the reg-reg copy in this example:

float foo(int *x, float *y, unsigned c) {
  float res = 0.0;
  unsigned i;
  for (i = 0; i < c; i++) {
    float xx = (float)x[i];
    xx = xx * y[i];
    xx += res;
    res = xx;
  }
  return res;
}

The inner loop currently looks like this:

        cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
        mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
        ...
****    movaps %XMM1, %XMM0
        jb LBB_foo_3    # no_exit

//===---------------------------------------------------------------------===//

  if (copysign(1.0, x) == copysign(1.0, y))

//===---------------------------------------------------------------------===//

Use movhps to update upper 64-bits of a v4sf value.  Also movlps on the
lower half.
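
In intrinsics terms (a sketch for illustration; the wrapper names are
invented), these are the operations behind _mm_loadh_pi and _mm_loadl_pi:

#include <xmmintrin.h>

/* movhps loads two floats into the upper 64 bits of an XMM register,
   movlps into the lower 64 bits; the other half is left untouched. */
__m128 set_high_half(__m128 v, const float *p) {
  return _mm_loadh_pi(v, (const __m64 *)p);   /* movhps (mem), %xmm */
}

__m128 set_low_half(__m128 v, const float *p) {
  return _mm_loadl_pi(v, (const __m64 *)p);   /* movlps (mem), %xmm */
}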

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?
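
One way to express the desired { x, 0, 0, 0 } pattern with intrinsics (an
illustrative sketch; the function name is made up) is to build a zero vector
and move the scalar into its low element:

#include <xmmintrin.h>

/* { x[0], 0, 0, 0 }: clear a register (xorps/pxor), then movss the low lane. */
__m128 low_into_zero(__m128 x) {
  return _mm_move_ss(_mm_setzero_ps(), x);
}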

//===---------------------------------------------------------------------===//

void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }

For the latter we generate:

        ...
        unpcklps %xmm1, %xmm2
        ...
        unpcklps %xmm0, %xmm1
        unpcklps %xmm2, %xmm1
        ...

This seems like it should use shufps, one for each of a & b.

//===---------------------------------------------------------------------===//

How to decide when to use the "floating point version" of logical ops?  Here are
two code fragments:

        movaps LCPI5_5, %xmm2
        ...
        mulps 8656(%ecx), %xmm3
        addps 8672(%ecx), %xmm3
        ...

        movaps LCPI5_5, %xmm1
        ...
        mulps 8656(%ecx), %xmm3
        addps 8672(%ecx), %xmm3
        ...
        movaps %xmm3, 112(%esp)
        ...

Due to some minor source change, the latter case ended up using orps and movaps
instead of por and movdqa.  Does it matter?

//===---------------------------------------------------------------------===//

Use movddup to splat a v2f64 directly from a memory source. e.g.

#include <emmintrin.h>

void test(__m128d *r, double A) {
  *r = _mm_set1_pd(A);
}

We currently splat via unpcklpd:

        ...
        unpcklpd %xmm0, %xmm0
        ...

instead of using a single

        movddup 8(%esp), %xmm0

//===---------------------------------------------------------------------===//

X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128.  Is it possible
to choose between movaps, movapd, and movdqa based on types of source and
destination?

How about andps, andpd, and pand?  Do we really care about the type of the packed
elements?  If not, why not always use the "ps" variants, which are likely to be
shorter?

//===---------------------------------------------------------------------===//

We are emitting bad code for this:

float %test(float* %V, int %I, int %D, float %V) {
entry:
        %tmp = seteq int %D, 0
        br bool %tmp, label %cond_true, label %cond_false23

cond_true:
        %tmp3 = getelementptr float* %V, int %I
        %tmp = load float* %tmp3
        %tmp5 = setgt float %tmp, %V
        %tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V )
        %tmp7 = or bool %tmp5, %tmp6
        br bool %tmp7, label %UnifiedReturnBlock, label %cond_next

cond_next:
        %tmp10 = add int %I, 1
        %tmp12 = getelementptr float* %V, int %tmp10
        %tmp13 = load float* %tmp12
        %tmp15 = setle float %tmp13, %V
        %tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V )
        %tmp17 = or bool %tmp15, %tmp16
        %retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00
        ret float %retval

cond_false23:
        %tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V )
        ret float %tmp28

UnifiedReturnBlock:             ; preds = %cond_true
        ret float 0.000000e+00
}

declare bool %llvm.isunordered.f32(float, float)

declare float %foo(float*, int, int, float)

It exposes a known load folding problem:

        movss (%edx,%ecx,4), %xmm1
        ...

LBB_test_2:     # cond_next
        ...
        jbe LBB_test_6  # cond_next
LBB_test_5:     # cond_next
        ...
LBB_test_6:     # cond_next
        movss %xmm3, 40(%esp)
        ...

Clearly it's unnecessary to clear %xmm3.  It's also not clear why we are emitting
three moves (movss, movaps, movss).

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems.  Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140.  This is what icc
generates:

        movaps (%edx), %xmm2                            #59.21
        movaps (%edx), %xmm5                            #60.21
        movaps (%edx), %xmm4                            #61.21
        movaps (%edx), %xmm3                            #62.21
        movl 40(%ecx), %ebp                             #69.49
        shufps $0, %xmm2, %xmm5                         #60.21
        movl 100(%esp), %ebx                            #69.20
        movl (%ebx), %edi                               #69.20
        imull %ebp, %edi                                #69.49
        addl (%eax), %edi                               #70.33
        shufps $85, %xmm2, %xmm4                        #61.21
        shufps $170, %xmm2, %xmm3                       #62.21
        shufps $255, %xmm2, %xmm2                       #63.21
        lea (%ebp,%ebp,2), %ebx                         #69.49
        lea -3(%edi,%ebx), %ebx                         #70.33
        addl 32(%ecx), %ebx                             #68.37
        testb $15, %bl                                  #91.13
        jne L_B1.24       # Prob 5%                     #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
        %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        JE mbb<cond_next204,0xa914d30>

Still ok.  After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
        ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
        %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
        %EDX = MOV32rm %EDX, 1, %NOREG, 40
        IMUL32rr %EAX<def&use>, %EDX
        %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 0
        MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
        %EAX = LEA32r %ESI, 1, %EAX, -3
        %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 32
        SHL32ri %EDI<def&use>, 4
        ADD32rr %EDI<def&use>, %ESI
        %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
        %XMM1 = MOVAPSrr %XMM0
        SHUFPSrr %XMM1<def&use>, %XMM1, 170
        %XMM2 = MOVAPSrr %XMM0
        SHUFPSrr %XMM2<def&use>, %XMM2, 0
        %XMM3 = MOVAPSrr %XMM0
        SHUFPSrr %XMM3<def&use>, %XMM3, 255
        SHUFPSrr %XMM0<def&use>, %XMM0, 85
        AND32ri8 %EBX<def&use>, 15
        JE mbb<cond_next204,0xa914d30>

This looks really bad.  The problem is that shufps is a destructive (two-address)
opcode: because the same source register feeds more than one shufps, a number of
copies are needed to preserve it.  Note that icc suffers from the same problem.
Either the instruction selector should select pshufd instead, or the register
allocator could perform the two-address to three-address transformation.

It also exposes some other problems.  See the MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500

LLVM is producing bad code.

LBB_main_4:     # cond_true44
        ...
        jne LBB_main_4  # cond_true44

There are two problems: 1) we don't need two loop induction variables; we can
compare against 262144 * 16 instead.  2) a known register coalescer issue: we
should be able to eliminate one of the movaps:

        addps %xmm2, %xmm1    <=== Commute!
        ...
        movaps %xmm1, %xmm1   <=== Eliminate!
        ...
        jne LBB_main_4  # cond_true44

//===---------------------------------------------------------------------===//

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This currently compiles into:

        movss 4(%esp), %xmm1
        mulss %xmm1, %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd.  We could compile this to:

        movss 4(%esp), %xmm0
        mulss %xmm0, %xmm0
        ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea.  Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps c(%esp), %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Now consider if the ... code caused xmm1 to get spilled.  This might produce
this code:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        xorps %xmm0, %xmm0
        movaps c2(%esp), %xmm1
        movss %xmm1, %xmm0
        ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        movss c2(%esp), %xmm0
        ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros and the one element instead of 4 elements.
This can be used to simplify a variety of shuffle operations, where the
elements are fixed zeros.

//===---------------------------------------------------------------------===//

#include <emmintrin.h>

void test(__m128d *r, __m128d *A, double B) {
  *r = _mm_loadl_pd(*A, &B);
}

We currently generate:

        movsd 24(%esp), %xmm0
        ...

icc generates:

        movl 4(%esp), %edx                              #3.6
        movl 8(%esp), %eax                              #3.6
        movapd (%eax), %xmm0                            #4.22
        movlpd 12(%esp), %xmm0                          #4.8
        movapd %xmm0, (%edx)                            #4.3

So icc is smart enough to know that B is in memory, so it doesn't load it and
store it back to the stack.

//===---------------------------------------------------------------------===//

__m128d test1( __m128d A, __m128d B) {
  return _mm_shuffle_pd(A, B, 0x3);
}

compiles to:

        shufpd $3, %xmm1, %xmm0

Perhaps it's better to use unpckhpd instead?

        unpckhpd %xmm1, %xmm0

Don't know if unpckhpd is faster.  But it is shorter.

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

void %test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %loadVector37.712 = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates this:

        ...
        movd %xmm0, %eax                ;; EAX = 0!
        ...
        pinsrw $6, %eax, %xmm0
        shrl $16, %eax                  ;; EAX = 0 again!
        pinsrw $7, %eax, %xmm0
        ...

It would be better to generate:

        ...
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0
        ...

or use pxor (to make a zero vector) and shuffle (to insert it).

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Implement some missing insert/extract element operations without going through
the stack.  Testcase here:
CodeGen/X86/vec_ins_extract.ll
corresponds to this C code:

typedef float vectorfloat __attribute__((vector_size(16)));

void test(vectorfloat *F, float f) {
  vectorfloat G = *F + *F;
  ((float*)&G)[0] = f;
  *F = G;
}

void test2(vectorfloat *F, float f) {
  vectorfloat G = *F + *F;
  ((float*)&G)[2] = f;
  *F = G;
}

void test3(vectorfloat *F, float *f) {
  vectorfloat G = *F + *F;
  *f = ((float*)&G)[2];
}

void test4(vectorfloat *F, float *f) {
  vectorfloat G = *F + *F;
  *f = ((float*)&G)[0];
}

//===---------------------------------------------------------------------===//

Apply the same transformation that merged four float loads into a single 128-bit
load to loads from the constant pool.
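
For instance (an illustrative example, not from the original note), a vector
built out of four scalar FP constants would ideally become a single 128-bit
load from one constant-pool entry rather than four scalar loads plus shuffles:

#include <xmmintrin.h>

/* Ideally a single movaps from one 16-byte constant-pool entry. */
__m128 constant_vector(void) {
  return _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
}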