//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: Significant speedups possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
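
One possible inline expansion of floor(), for example, is sketched below. This
is only an illustration of the idea: the fast_floor name and the restriction to
inputs whose magnitude fits in a 64-bit integer are assumptions, not anything
taken from this note or the gcc patch.

/* Inline floor() for doubles that fit in an int64_t: truncate with
   cvttsd2si and correct downward when the input was negative and
   non-integral.  Out-of-range and NaN inputs still need the libm call. */
static double fast_floor(double x) {
  double t = (double)(long long)x;   /* cvttsd2si + cvtsi2sd */
  return (t > x) ? t - 1.0 : t;
}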

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
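
A front end or runtime could set these modes through the MXCSR intrinsics; a
minimal sketch, assuming SSE3-era headers that provide the DAZ macros
(enable_fast_sse_modes is just an illustrative name):

#include <xmmintrin.h>
#include <pmmintrin.h>

/* Flush denormal results to zero (FTZ) and treat denormal inputs as zero (DAZ). */
static void enable_fast_sse_modes(void) {
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);          /* FTZ bit in MXCSR */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);  /* DAZ bit in MXCSR */
}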

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'

The pattern isel got this one right.

//===---------------------------------------------------------------------===//

SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction

and the register allocator decides to spill X, it is cheaper to emit this as:

..and this uses one fewer register (so this should be done at load folding
time, not at spiller time).  *Note* however that this can only be done
if Y is dead.  Here's a testcase:

%.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]

implementation   ; Functions:

declare void %printf(int, ...)

no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
        %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]     ; <double> [#uses=1]
        %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]    ; <double> [#uses=1]
        %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
        %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
        br bool false, label %Compute_Tree.exit23, label %no_exit.i7

Compute_Tree.exit23:            ; preds = %no_exit.i7
        tail call void (int, ...)* %printf( int 0 )
        store double %tmp.34.i18, double* null

***     movsd %XMM2, QWORD PTR [%ESP + 8]
***     addsd %XMM2, %XMM1
***     movsd QWORD PTR [%ESP + 8], %XMM2
        jmp .BBmain_1   # no_exit.i7

This is a bugpoint reduced testcase, which is why the testcase doesn't make
much sense (e.g. it's an infinite loop). :)

//===---------------------------------------------------------------------===//

SSE should implement 'select_cc' using 'emulated conditional moves' that use
pcmp/pand/pandn/por to do a selection instead of a conditional branch:

double %X(double %Y, double %Z, double %A, double %B) {
        %C = setlt double %A, %B
        %z = add double %Z, 0.0    ;; select operand is not a load
        %D = select bool %C, double %Y, double %z
        ret double %D
}

We currently emit:

        addsd 24(%esp), %xmm0
        movsd 32(%esp), %xmm1
        movsd 16(%esp), %xmm2
        ucomisd 40(%esp), %xmm1
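
In source form, the cmp/and/andnot/or idiom the note is referring to looks
roughly like the sketch below (select_lt is just an illustrative name; the
scalar wrapping via _mm_set_sd is an assumption for the example):

#include <emmintrin.h>

/* D = (A < B) ? Y : Z, branch-free: the compare produces an all-ones or
   all-zeros mask in the low lane, which then picks one of the two values. */
static double select_lt(double A, double B, double Y, double Z) {
  __m128d a = _mm_set_sd(A), b = _mm_set_sd(B);
  __m128d y = _mm_set_sd(Y), z = _mm_set_sd(Z);
  __m128d mask = _mm_cmplt_sd(a, b);                 /* cmpltsd */
  __m128d res  = _mm_or_pd(_mm_and_pd(mask, y),      /* mask ? Y : 0 */
                           _mm_andnot_pd(mask, z));  /* !mask ? Z : 0 */
  return _mm_cvtsd_f64(res);
}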

//===---------------------------------------------------------------------===//

It's not clear whether we should use pxor or xorps / xorpd to clear XMM
registers. The choice may depend on subtarget information. We should do some
more experiments on different x86 machines.

//===---------------------------------------------------------------------===//

Currently the x86 codegen isn't very good at mixing SSE and FPStack
code:

unsigned int foo(double x) { return x; }

        movsd 24(%esp), %xmm0

This will be solved when we go to a dynamic programming based isel.

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.
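
For illustration, a fixed-size 64-byte copy could be expanded to four 128-bit
moves along the lines of this sketch (the copy64 name and the choice of
unaligned loads/stores are assumptions, not part of this note):

#include <emmintrin.h>

/* Copy 64 bytes as four 16-byte SSE moves instead of a call to memcpy. */
static void copy64(void *dst, const void *src) {
  const __m128i *s = (const __m128i *)src;
  __m128i *d = (__m128i *)dst;
  for (int i = 0; i < 4; ++i)
    _mm_storeu_si128(d + i, _mm_loadu_si128(s + i));
}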

//===---------------------------------------------------------------------===//

Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
the reg-reg copy in this example:

float foo(int *x, float *y, unsigned c) {
  for (i = 0; i < c; i++) {
    float xx = (float)x[i];

        cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
        mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
****    movaps %XMM1, %XMM0
        jb LBB_foo_3    # no_exit

//===---------------------------------------------------------------------===//

Codegen:

  if (copysign(1.0, x) == copysign(1.0, y))

into a test of the sign bit of x^y (the sign bit of x^y is clear exactly when
the two signs match) when using SSE.
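
In C terms the transformation amounts to comparing only the sign bits; a
hedged sketch (same_sign is an illustrative name, and the bit reinterpretation
via memcpy is just one way to express it):

#include <stdint.h>
#include <string.h>

/* copysign(1.0, x) == copysign(1.0, y) holds exactly when x and y have the
   same sign bit, i.e. when the sign bit of (xbits ^ ybits) is zero. */
static int same_sign(double x, double y) {
  uint64_t xb, yb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  return ((xb ^ yb) & 0x8000000000000000ULL) == 0;
}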

//===---------------------------------------------------------------------===//

Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
of a v4sf.
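
With intrinsics the two partial updates look like this sketch (update_halves
and the memory operands are illustrative only):

#include <xmmintrin.h>

/* movhps: replace elements 2-3 from memory; movlps: replace elements 0-1. */
static __m128 update_halves(__m128 v, float hi[2], float lo[2]) {
  v = _mm_loadh_pi(v, (__m64 *)hi);  /* movhps */
  v = _mm_loadl_pi(v, (__m64 *)lo);  /* movlps */
  return v;
}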

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
Perhaps use pxor / xorp* to clear an XMM register first?
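
For the { x, 0, 0, 0 } case, the clear-then-insert idea corresponds to
something like this sketch (zero_upper is just an illustrative name):

#include <xmmintrin.h>

/* xorps to make a zero vector, then movss to drop element 0 of v into it. */
static __m128 zero_upper(__m128 v) {
  return _mm_move_ss(_mm_setzero_ps(), v);   /* { v[0], 0, 0, 0 } */
}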

//===---------------------------------------------------------------------===//

How to decide when to use the "floating point version" of logical ops? Here are
some code fragments:

        movaps LCPI5_5, %xmm2
        mulps 8656(%ecx), %xmm3
        addps 8672(%ecx), %xmm3

        movaps LCPI5_5, %xmm1
        mulps 8656(%ecx), %xmm3
        addps 8672(%ecx), %xmm3
        movaps %xmm3, 112(%esp)

Due to some minor source change, the latter case ended up using orps and movaps
instead of por and movdqa. Does it matter?

//===---------------------------------------------------------------------===//

X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
to choose between movaps, movapd, and movdqa based on types of source and
destination?

How about andps, andpd, and pand? Do we really care about the type of the packed
elements? If not, why not always use the "ps" variants, which are likely to be
shorter to encode?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
generates:

        movaps (%edx), %xmm2            #59.21
        movaps (%edx), %xmm5            #60.21
        movaps (%edx), %xmm4            #61.21
        movaps (%edx), %xmm3            #62.21
        movl 40(%ecx), %ebp             #69.49
        shufps $0, %xmm2, %xmm5         #60.21
        movl 100(%esp), %ebx            #69.20
        movl (%ebx), %edi               #69.20
        imull %ebp, %edi                #69.49
        addl (%eax), %edi               #70.33
        shufps $85, %xmm2, %xmm4        #61.21
        shufps $170, %xmm2, %xmm3       #62.21
        shufps $255, %xmm2, %xmm2       #63.21
        lea (%ebp,%ebp,2), %ebx         #69.49
        lea -3(%edi,%ebx), %ebx         #70.33
        addl 32(%ecx), %ebx             #68.37
        testb $15, %bl                  #91.13
        jne L_B1.24     # Prob 5%       #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
        %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
        ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
        %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
        %EDX = MOV32rm %EDX, 1, %NOREG, 40
        IMUL32rr %EAX<def&use>, %EDX
        %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 0
        MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
        %EAX = LEA32r %ESI, 1, %EAX, -3
        %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 32
        SHL32ri %EDI<def&use>, 4
        ADD32rr %EDI<def&use>, %ESI
        %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
        %XMM1 = MOVAPSrr %XMM0
        SHUFPSrr %XMM1<def&use>, %XMM1, 170
        %XMM2 = MOVAPSrr %XMM0
        SHUFPSrr %XMM2<def&use>, %XMM2, 0
        %XMM3 = MOVAPSrr %XMM0
        SHUFPSrr %XMM3<def&use>, %XMM3, 255
        SHUFPSrr %XMM0<def&use>, %XMM0, 85
        AND32ri8 %EBX<def&use>, 15
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
because the same value appears as operand two in more than one shufps op, a
number of copies are needed. Note that icc suffers from the same problem.
Either the instruction selector should select pshufd, or the register
allocator could perform the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500

LLVM is producing bad code.

LBB_main_4:     # cond_true44
        jne LBB_main_4  # cond_true44

There are two problems. 1) There is no need for two loop induction variables;
we can compare against 262144 * 16. 2) A known register coalescer issue: we
should be able to eliminate one of the movaps:

        addps %xmm2, %xmm1    <=== Commute!
        movaps %xmm1, %xmm1   <=== Eliminate!
        jne LBB_main_4  # cond_true44

//===---------------------------------------------------------------------===//

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd. We could compile this to:

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea.  Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps c(%esp), %xmm1

Now consider if the ... code caused xmm1 to get spilled.  This might produce
this code:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)

        movaps c2(%esp), %xmm1

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)

        movss c2(%esp), %xmm0

...saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros and the one element instead of all 4 elements.
This can be used to simplify a variety of shuffle operations, where the
elements are fixed zeros.

//===---------------------------------------------------------------------===//

#include <emmintrin.h>
void test(__m128d *r, __m128d *A, double B) {
  *r = _mm_loadl_pd(*A, &B);
}

We currently generate:

        movsd 24(%esp), %xmm0

icc generates:

        movl 4(%esp), %edx              #3.6
        movl 8(%esp), %eax              #3.6
        movapd (%eax), %xmm0            #4.22
        movlpd 12(%esp), %xmm0          #4.8
        movapd %xmm0, (%edx)            #4.3

So icc is smart enough to know that B is in memory, so it doesn't load it and
store it back to the stack.

//===---------------------------------------------------------------------===//

__m128d test1( __m128d A, __m128d B) {
  return _mm_shuffle_pd(A, B, 0x3);
}

compiles to:

        shufpd $3, %xmm1, %xmm0

Perhaps it's better to use unpckhpd instead?

        unpckhpd %xmm1, %xmm0

Don't know if unpckhpd is faster. But it is shorter.
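
For reference, both forms pick the high element of each source, so the
replacement is value-preserving; a small sketch stating the equivalence
(test1_alt is just an illustrative name):

#include <emmintrin.h>

/* _mm_shuffle_pd(A, B, 0x3) selects { A[1], B[1] }, which is exactly what
   _mm_unpackhi_pd(A, B) produces, so the two are interchangeable here. */
static __m128d test1_alt(__m128d A, __m128d B) {
  return _mm_unpackhi_pd(A, B);
}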

//===---------------------------------------------------------------------===//

This testcase generates ugly code, probably due to costs being off or something:

void %test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %loadVector37.712 = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

        movd %xmm0, %eax                ;; EAX = 0!
        pinsrw $6, %eax, %xmm0
        shrl $16, %eax                  ;; EAX = 0 again!
        pinsrw $7, %eax, %xmm0

It would be better to generate:

        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0

or use pxor (to make a zero vector) and shuffle (to insert it).

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the same transformation that merged four floats into a single 128-bit
load to loads from the constant pool.

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-path is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.
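
They are not commutable in general because maxsd / minsd handle NaNs (and
zeros of opposite sign) asymmetrically: when either input is a NaN, the
instruction returns its second source operand. A small sketch of the
difference (show_maxsd_asymmetry is just an illustrative name):

#include <emmintrin.h>
#include <math.h>

/* maxsd returns the second source operand when either input is NaN, so
   swapping the operands changes the result unless unsafe FP math lets us
   ignore NaN ordering. */
static void show_maxsd_asymmetry(void) {
  __m128d x = _mm_set_sd(1.0);
  __m128d q = _mm_set_sd(NAN);
  double a = _mm_cvtsd_f64(_mm_max_sd(x, q));  /* NaN */
  double b = _mm_cvtsd_f64(_mm_max_sd(q, x));  /* 1.0 */
  (void)a; (void)b;
}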

//===---------------------------------------------------------------------===//

We should compile this:

#include <xmmintrin.h>

typedef union {
  float f[4];
  __m128 v;
} vector4_t;

void swizzle (const void *a, vector4_t * b, vector4_t * c) {
  b->v = _mm_loadl_pi (b->v, (__m64 *) a);
  c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
}

to:

        movlps 8(%eax), %xmm0

not:

        movlps 8(%ecx), %xmm0

//===---------------------------------------------------------------------===//

We should compile this:

#include <xmmintrin.h>

void foo(__m128i *A, __m128i *B) {
  *A = _mm_sll_epi16 (*A, *B);
}

to a psllw of the two vectors (plus the loads and store), instead of a long
scalar expansion like the current output:

        pinsrw $2, %eax, %xmm0
        pinsrw $3, %ecx, %xmm0
        pinsrw $4, %eax, %xmm0
        pinsrw $5, %ecx, %xmm0
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %ecx, %xmm0