//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.

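For reference, here is the desired combine sketched at the source level (the
helper name and the standalone constants are hypothetical; the point is that
the select moves to the addresses, leaving a single load):

static const double CPI1 = 123.412, CPI2 = 523.01123123;

double test3_combined(int B) {
  const double *p = B ? &CPI1 : &CPI2;  /* select between the two addresses */
  return *p;                            /* ...then do one load              */
}
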
//===---------------------------------------------------------------------===//

SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
like this:

  X += y

and the register allocator decides to spill X, it is cheaper to emit this as:

Y += [xslot]
store Y -> [xslot]

than as:

tmp = [xslot]
tmp += y
store tmp -> [xslot]

..and this uses one fewer register (so this should be done at load folding
time, not at spiller time).  *Note* however that this can only be done
if Y is dead.  Here's a testcase:

%.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
implementation   ; Functions:
declare void %printf(int, ...)

void %main() {
build_tree.exit:
        br label %no_exit.i7
no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
        %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]            ; <double> [#uses=1]
        %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]           ; <double> [#uses=1]
        %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
        %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
        br bool false, label %Compute_Tree.exit23, label %no_exit.i7
Compute_Tree.exit23:            ; preds = %no_exit.i7
        tail call void (int, ...)* %printf( int 0 )
        store double %tmp.34.i18, double* null
        ret void
}
The loop body we currently emit contains:

***     movsd %XMM2, QWORD PTR [%ESP + 8]
***     addsd %XMM2, %XMM1
***     movsd QWORD PTR [%ESP + 8], %XMM2
        jmp .BBmain_1   # no_exit.i7

This is a bugpoint reduced testcase, which is why the testcase doesn't make
much sense (e.g. it's an infinite loop). :)

//===---------------------------------------------------------------------===//

SSE should implement 'select_cc' using 'emulated conditional moves' that use
pcmp/pand/pandn/por to do a selection instead of a conditional branch:

double %X(double %Y, double %Z, double %A, double %B) {
        %C = setlt double %A, %B
        %z = add double %Z, 0.0    ;; select operand is not a load
        %D = select bool %C, double %Y, double %z
        ret double %D
}

We currently emit (excerpted):

        movsd 32(%esp), %xmm1
        movsd 16(%esp), %xmm2
        ucomisd 40(%esp), %xmm1
        ...

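As a sketch of the cmp/and/andn/or idiom (written with intrinsics; this is an
illustration of the selection idea, not the code the backend would emit):

#include <emmintrin.h>

__m128d select_lt(__m128d A, __m128d B, __m128d Y, __m128d Z) {
  __m128d mask = _mm_cmplt_pd(A, B);        /* all-ones where A < B (cmppd) */
  return _mm_or_pd(_mm_and_pd(mask, Y),     /* keep Y where the mask is set */
                   _mm_andnot_pd(mask, Z)); /* keep Z elsewhere (andnpd)    */
}
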
//===---------------------------------------------------------------------===//

It's not clear whether we should use pxor or xorps / xorpd to clear XMM
registers. The choice may depend on subtarget information. We should do some
more experiments on different x86 machines.

//===---------------------------------------------------------------------===//

Currently the x86 codegen isn't very good at mixing SSE and FPStack
code:

unsigned int foo(double x) { return x; }

The double argument is loaded into an SSE register:

        movsd 24(%esp), %xmm0
        ...

and the conversion then has to bounce the value through memory to get it onto
the FP stack.

This will be solved when we go to a dynamic programming based isel.

//===---------------------------------------------------------------------===//

Should generate min/max for stuff like:

void minf(float a, float b, float *X) {
  *X = a <= b ? a : b;
}

Make use of floating point min / max instructions.  Perhaps introduce ISD::FMIN
and ISD::FMAX node types?

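For reference, a minimal sketch of the minss-based form, written with
intrinsics rather than as backend output (minss's NaN and signed-zero behavior
differs slightly from the C expression above):

#include <xmmintrin.h>

void minf_sse(float a, float b, float *X) {
  _mm_store_ss(X, _mm_min_ss(_mm_set_ss(a), _mm_set_ss(b)));   /* minss */
}
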
//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.

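A minimal sketch of the kind of expansion meant here, assuming a 16-byte
aligned destination and a length that is a multiple of 16 (the helper name is
hypothetical):

#include <emmintrin.h>
#include <stddef.h>

static void memset_sse(void *dst, unsigned char value, size_t len) {
  __m128i v = _mm_set1_epi8((char)value);
  __m128i *p = (__m128i *)dst;
  for (size_t i = 0; i != len / 16; ++i)
    _mm_store_si128(p + i, v);       /* one aligned 128-bit store per chunk */
}
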
//===---------------------------------------------------------------------===//

Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
the reg-reg copy in this example:

float foo(int *x, float *y, unsigned c) {
  float res = 0.0;
  unsigned i;
  for (i = 0; i < c; i++) {
    float xx = (float)x[i];
    xx = xx * y[i];
    xx += res;
    res = xx;
  }
  return res;
}

The loop body currently contains (excerpted):

        cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
        mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
        ...
****    movaps %XMM1, %XMM0
        jb LBB_foo_3    # no_exit

//===---------------------------------------------------------------------===//

Codegen:

  if (copysign(1.0, x) == copysign(1.0, y))

into:

  if (x^y & mask)

when using SSE.

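A sketch of that sign-bit test in portable C (the helper is hypothetical; the
mask in question is just the sign bit of the bit pattern):

#include <stdint.h>
#include <string.h>

static int same_sign(double x, double y) {
  uint64_t xb, yb;
  memcpy(&xb, &x, sizeof xb);        /* reinterpret the bit patterns        */
  memcpy(&yb, &y, sizeof yb);
  return ((xb ^ yb) >> 63) == 0;     /* sign bits equal <=> x^y sign clear  */
}
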
//===---------------------------------------------------------------------===//

Use movhps to update upper 64-bits of a v4sf value. Also movlps on the lower
half of a v4sf value.

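A small intrinsics sketch of the two halves being updated independently (an
illustration of the instructions, not proposed backend output):

#include <xmmintrin.h>

__m128 update_halves(__m128 v, const __m64 *hi, const __m64 *lo) {
  v = _mm_loadh_pi(v, hi);   /* movhps: replace the upper 64 bits from memory */
  v = _mm_loadl_pi(v, lo);   /* movlps: replace the lower 64 bits from memory */
  return v;
}
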
//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
Perhaps use pxor / xorp* to clear an XMM register first?

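For the { x, 0, x, 0 } case, a sketch of the clear-then-shuffle idea in
intrinsics (hypothetical helper, not generated code):

#include <xmmintrin.h>

__m128 build_x0x0(float x) {
  __m128 v = _mm_set_ss(x);                              /* xorps + movss: { x, 0, 0, 0 } */
  return _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 1, 0));  /* { x, 0, x, 0 }                */
}
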
//===---------------------------------------------------------------------===//

Compare the codegen for these two functions:

void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }

For the latter we generate (excerpted):

        unpcklps %xmm1, %xmm2
        ...
        unpcklps %xmm0, %xmm1
        unpcklps %xmm2, %xmm1
        ...

This seems like it should use shufps, one for each of a & b.

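As a sketch of the shufps idea (intrinsics, hypothetical helper): the two
scalars can each be zero-extended and then combined with shufps, since shufps
takes its low two result lanes from one source and its high two lanes from the
other. For { a, 0, 0, b } a single shufps of the zero-extended scalars works:

#include <xmmintrin.h>

__m128 build_a00b(float a, float b) {
  __m128 va = _mm_set_ss(a);                               /* { a, 0, 0, 0 } */
  __m128 vb = _mm_set_ss(b);                               /* { b, 0, 0, 0 } */
  return _mm_shuffle_ps(va, vb, _MM_SHUFFLE(0, 1, 1, 0));  /* { a, 0, 0, b } */
}
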
//===---------------------------------------------------------------------===//

How to decide when to use the "floating point version" of logical ops? Here are
two excerpts from nearly identical code:

        movaps LCPI5_5, %xmm2
        ...
        mulps 8656(%ecx), %xmm3
        addps 8672(%ecx), %xmm3
        ...

and:

        movaps LCPI5_5, %xmm1
        ...
        mulps 8656(%ecx), %xmm3
        addps 8672(%ecx), %xmm3
        ...
        movaps %xmm3, 112(%esp)
        ...

Due to some minor source change, the latter case ended up using orps and movaps
instead of por and movdqa.  Does it matter?

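For concreteness, the two spellings of the same bitwise operation, written as
intrinsics. The results are bit-identical; the question is which encoding (and,
on some implementations, which execution unit) to prefer:

#include <emmintrin.h>

__m128  or_float(__m128 a, __m128 b)  { return _mm_or_ps(a, b); }     /* orps */
__m128i or_int(__m128i a, __m128i b)  { return _mm_or_si128(a, b); }  /* por  */
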
//===---------------------------------------------------------------------===//

X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
to choose between movaps, movapd, and movdqa based on the types of the source
and destination?

How about andps, andpd, and pand? Do we really care about the type of the packed
elements? If not, why not always use the "ps" variants, which are likely to be
encoded more compactly?

//===---------------------------------------------------------------------===//

The external test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
generates:

        movaps (%edx), %xmm2                    #59.21
        movaps (%edx), %xmm5                    #60.21
        movaps (%edx), %xmm4                    #61.21
        movaps (%edx), %xmm3                    #62.21
        movl 40(%ecx), %ebp                     #69.49
        shufps $0, %xmm2, %xmm5                 #60.21
        movl 100(%esp), %ebx                    #69.20
        movl (%ebx), %edi                       #69.20
        imull %ebp, %edi                        #69.49
        addl (%eax), %edi                       #70.33
        shufps $85, %xmm2, %xmm4                #61.21
        shufps $170, %xmm2, %xmm3               #62.21
        shufps $255, %xmm2, %xmm2               #63.21
        lea (%ebp,%ebp,2), %ebx                 #69.49
        ...
        lea -3(%edi,%ebx), %ebx                 #70.33
        ...
        addl 32(%ecx), %ebx                     #68.37
        testb $15, %bl                          #91.13
        jne L_B1.24       # Prob 5%             #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
        %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        ...
        JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        ...
        %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
        ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
        %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
        %EDX = MOV32rm %EDX, 1, %NOREG, 40
        IMUL32rr %EAX<def&use>, %EDX
        %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 0
        MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
        %EAX = LEA32r %ESI, 1, %EAX, -3
        %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 32
        ...
        SHL32ri %EDI<def&use>, 4
        ADD32rr %EDI<def&use>, %ESI
        %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
        %XMM1 = MOVAPSrr %XMM0
        SHUFPSrr %XMM1<def&use>, %XMM1, 170
        %XMM2 = MOVAPSrr %XMM0
        SHUFPSrr %XMM2<def&use>, %XMM2, 0
        %XMM3 = MOVAPSrr %XMM0
        SHUFPSrr %XMM3<def&use>, %XMM3, 255
        SHUFPSrr %XMM0<def&use>, %XMM0, 85
        ...
        AND32ri8 %EBX<def&use>, 15
        ...
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
because the same value appears as operand two of more than one shufps, we end
up with a number of register copies. Note that icc suffers from the same
problem. Either the instruction selector should select pshufd, or the register
allocator could make the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.

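To illustrate why pshufd avoids the copies, here is a sketch in intrinsics
using the same shuffle immediates as the dump above (an illustration, not a
drop-in fix for the selector):

#include <emmintrin.h>

void splat4(const __m128i *src, __m128i out[4]) {
  __m128i v = _mm_load_si128(src);     /* one load of the source vector     */
  out[0] = _mm_shuffle_epi32(v, 0);    /* pshufd $0: separate destination,  */
  out[1] = _mm_shuffle_epi32(v, 85);   /* pshufd $85: so no copy of v is    */
  out[2] = _mm_shuffle_epi32(v, 170);  /* pshufd $170: needed before each   */
  out[3] = _mm_shuffle_epi32(v, 255);  /* pshufd $255: shuffle              */
}
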
//===---------------------------------------------------------------------===//

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500

LLVM is producing bad code for this loop (excerpted):

LBB_main_4:     # cond_true44
        ...
        jne LBB_main_4  # cond_true44

There are two problems: 1) There is no need for two loop induction variables;
we can compare against 262144 * 16. 2) A known register coalescer issue: we
should be able to eliminate one of the movaps:

        addps %xmm2, %xmm1    <=== Commute!
        ...
        movaps %xmm1, %xmm1   <=== Eliminate!
        ...
        jne LBB_main_4  # cond_true44

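A hypothetical sketch of point (1): run a single byte-offset induction
variable up to the bound 262144 * 16 instead of maintaining a separate element
counter. The function, types, and alignment assumptions are made up for
illustration, not taken from the PR's testcase:

#include <xmmintrin.h>

void add_arrays(float *a, const float *b) {      /* both 16-byte aligned */
  for (long i = 0; i != 262144 * 16; i += 16) {  /* one IV, byte-scaled  */
    __m128 va = _mm_load_ps((const float *)((const char *)a + i));
    __m128 vb = _mm_load_ps((const float *)((const char *)b + i));
    _mm_store_ps((float *)((char *)a + i), _mm_add_ps(va, vb));
  }
}
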
//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This currently compiles into something like:

        movss 4(%esp), %xmm1
        mulss %xmm1, %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd. We could compile this to:

        movss 4(%esp), %xmm0
        mulss %xmm0, %xmm0
        ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea.  Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps c(%esp), %xmm1
        ...

Now consider if the ... code caused xmm1 to get spilled.  This might produce
this code:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...
        movaps c2(%esp), %xmm1
        ...

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...
        movss c2(%esp), %xmm0
        ...

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros plus the one element instead of all 4 elements.
This can be used to simplify a variety of shuffle operations, where the
elements are fixed zeros.

//===---------------------------------------------------------------------===//

Consider:

#include <emmintrin.h>
void test(__m128d *r, __m128d *A, double B) {
  *r = _mm_loadl_pd(*A, &B);
}

We currently generate (excerpted):

        movsd 24(%esp), %xmm0
        ...

icc generates:

        movl 4(%esp), %edx              #3.6
        movl 8(%esp), %eax              #3.6
        movapd (%eax), %xmm0            #4.22
        movlpd 12(%esp), %xmm0          #4.8
        movapd %xmm0, (%edx)            #4.3

So icc is smart enough to know that B is in memory, so it doesn't load it and
store it back to the stack.

//===---------------------------------------------------------------------===//

__m128d test1( __m128d A, __m128d B) {
  return _mm_shuffle_pd(A, B, 0x3);
}

compiles to:

        shufpd $3, %xmm1, %xmm0

Perhaps it's better to use unpckhpd instead?

        unpckhpd %xmm1, %xmm0

Don't know if unpckhpd is faster.  But it is shorter.

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

void %test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %loadVector37.712 = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates (excerpted):

        movd %xmm0, %eax                ;; EAX = 0!
        ...
        pinsrw $6, %eax, %xmm0
        shrl $16, %eax                  ;; EAX = 0 again!
        pinsrw $7, %eax, %xmm0
        ...

It would be better to generate:

        ...
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0
        ...

or use pxor (to make a zero vector) and shuffle (to insert it).

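One possible spelling of that pxor+shuffle suggestion, sketched with
intrinsics (two shuffles plus the xor; not necessarily the best sequence, just
an illustration of avoiding the integer-register round trip):

#include <xmmintrin.h>

__m128 zero_elt3(__m128 v) {
  __m128 z = _mm_setzero_ps();                           /* xorps            */
  __m128 t = _mm_unpackhi_ps(v, z);                      /* { v2, 0, v3, 0 } */
  return _mm_shuffle_ps(v, t, _MM_SHUFFLE(1, 0, 1, 0));  /* { v0, v1, v2, 0 }*/
}
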
//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Implement some missing insert/extract element operations without going through
the stack.  Testcase here:
CodeGen/X86/vec_ins_extract.ll
corresponds to this C code:

typedef float vectorfloat __attribute__((vector_size(16)));
void test(vectorfloat *F, float f) {
  vectorfloat G = *F + *F;
  ((float*)&G)[0] = f;
  *F = G;
}
void test2(vectorfloat *F, float f) {
  vectorfloat G = *F + *F;
  ((float*)&G)[2] = f;
  *F = G;
}
void test3(vectorfloat *F, float *f) {
  vectorfloat G = *F + *F;
  *f = ((float*)&G)[2];
}
void test4(vectorfloat *F, float *f) {
  vectorfloat G = *F + *F;
  *f = ((float*)&G)[0];
}

//===---------------------------------------------------------------------===//

Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.

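Sketched in C (the names and constant values are hypothetical), the end state
the transformation aims for:

#include <xmmintrin.h>

/* Four adjacent constants laid out as one 16-byte constant-pool entry... */
static const float kConsts[4] __attribute__((aligned(16))) =
    { 1.0f, 2.0f, 3.0f, 4.0f };

/* ...fetched with a single 128-bit load instead of four scalar loads. */
__m128 load_consts(void) {
  return _mm_load_ps(kConsts);
}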