lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * lmw/stmw pass a la arm load store optimizer for prolog/epilog
   5
   6 ===-------------------------------------------------------------------------===
   7
   8 This code:
   9
  10 unsigned add32carry(unsigned sum, unsigned x) {
  11  unsigned z = sum + x;
  12  if (sum + x < x)
  13      z++;
  14  return z;
  15 }
  16
  17 Should compile to something like:
  18
  19         addc r3,r3,r4
  20         addze r3,r3
  21
  22 instead we get:
  23
  24         add r3, r4, r3
  25         cmplw cr7, r3, r4
  26         mfcr r4 ; 1
  27         rlwinm r4, r4, 29, 31, 31
  28         add r3, r3, r4
  29
  30 Ick.
  31
  32 ===-------------------------------------------------------------------------===
  33
  34 Support 'update' load/store instructions.  These are cracked on the G5, but are
  35 still a codesize win.
  36
  37 With preinc enabled, this:
  38
  39 long *%test4(long *%X, long *%dest) {
  40         %Y = getelementptr long* %X, int 4
  41         %A = load long* %Y
  42         store long %A, long* %dest
  43         ret long* %Y
  44 }
  45
  46 compiles to:
  47
  48 _test4:
  49         mr r2, r3
  50         lwzu r5, 32(r2)
  51         lwz r3, 36(r3)
  52         stw r5, 0(r4)
  53         stw r3, 4(r4)
  54         mr r3, r2
  55         blr
  56
  57 with -sched=list-burr, I get:
  58
  59 _test4:
  60         lwz r2, 36(r3)
  61         lwzu r5, 32(r3)
  62         stw r2, 4(r4)
  63         stw r5, 0(r4)
  64         blr
  65
  66 ===-------------------------------------------------------------------------===
  67
  68 We compile the hottest inner loop of viterbi to:
  69
  70         li r6, 0
  71         b LBB1_84       ;bb432.i
  72 LBB1_83:        ;bb420.i
  73         lbzx r8, r5, r7
  74         addi r6, r7, 1
  75         stbx r8, r4, r7
  76 LBB1_84:        ;bb432.i
  77         mr r7, r6
  78         cmplwi cr0, r7, 143
  79         bne cr0, LBB1_83        ;bb420.i
  80
  81 The CBE manages to produce:
  82
  83         li r0, 143
  84         mtctr r0
  85 loop:
  86         lbzx r2, r2, r11
  87         stbx r0, r2, r9
  88         addi r2, r2, 1
  89         bdz later
  90         b loop
  91
  92 This could be much better (bdnz instead of bdz) but it still beats us.  If we
  93 produced this with bdnz, the loop would be a single dispatch group.
  94
  95 ===-------------------------------------------------------------------------===
  96
  97 Lump the constant pool for each function into ONE pic object, and reference
  98 pieces of it as offsets from the start.  For functions like this (contrived
  99 to have lots of constants obviously):
 100
 101 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
 102
 103 We generate:
 104
 105 _X:
 106         lis r2, ha16(.CPI_X_0)
 107         lfd f0, lo16(.CPI_X_0)(r2)
 108         lis r2, ha16(.CPI_X_1)
 109         lfd f2, lo16(.CPI_X_1)(r2)
 110         fmadd f0, f1, f0, f2
 111         lis r2, ha16(.CPI_X_2)
 112         lfd f1, lo16(.CPI_X_2)(r2)
 113         lis r2, ha16(.CPI_X_3)
 114         lfd f2, lo16(.CPI_X_3)(r2)
 115         fmadd f1, f0, f1, f2
 116         blr
 117
 118 It would be better to materialize .CPI_X into a register, then use immediates
 119 off of the register to avoid the lis's.  This is even more important in PIC
 120 mode.
 121
 122 Note that this (and the static variable version) is discussed here for GCC:
 123 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 124
 125 Here's another example (the sgn function):
 126 double testf(double a) {
 127        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 128 }
 129
 130 it produces a BB like this:
 131 LBB1_1: ; cond_true
 132         lis r2, ha16(LCPI1_0)
 133         lfs f0, lo16(LCPI1_0)(r2)
 134         lis r2, ha16(LCPI1_1)
 135         lis r3, ha16(LCPI1_2)
 136         lfs f2, lo16(LCPI1_2)(r3)
 137         lfs f3, lo16(LCPI1_1)(r2)
 138         fsub f0, f0, f1
 139         fsel f1, f0, f2, f3
 140         blr
 141
 142 ===-------------------------------------------------------------------------===
 143
 144 PIC Code Gen IPO optimization:
 145
 146 Squish small scalar globals together into a single global struct, allowing the
 147 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 148 of the GOT on targets with one).
 149
 150 Note that this is discussed here for GCC:
 151 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 152
 153 ===-------------------------------------------------------------------------===
 154
 155 No loads or stores of the constants should be needed:
 156
 157 struct foo { double X, Y; };
 158 void xxx(struct foo F);
 159 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 160
 161 ===-------------------------------------------------------------------------===
 162
 163 Darwin Stub removal:
 164
 165 We still generate calls to foo$stub, and stubs, on Darwin.  This is not
 166 necessary when building with the Leopard (10.5) or later linker, as stubs are
 167 generated by ld when necessary.  Parameterizing this based on the deployment
 168 target (-mmacosx-version-min) is probably enough.  x86-32 does this right, see
 169 its logic.
 170
 171 ===-------------------------------------------------------------------------===
 172
 173 Darwin Stub LICM optimization:
 174
 175 Loops like this:
 176
 177   for (...)  bar();
 178
 179 Have to go through an indirect stub if bar is external or linkonce.  It would
 180 be better to compile it as:
 181
 182      fp = &bar;
 183      for (...)  fp();
 184
 185 which only computes the address of bar once (instead of each time through the
 186 stub).  This is Darwin specific and would have to be done in the code generator.
 187 Probably not a win on x86.
 188
 189 ===-------------------------------------------------------------------------===
 190
 191 Simple IPO for argument passing, change:
 192   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 193
 194 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 195 of arguments get assigned to r3 through r10. That is, if you have a function
 196 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 197 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 198 order for functions we can internalize so that the maximum number of
 199 integers/pointers get passed in regs before you see any of the fp arguments.
 200
 201 Instead of implementing this, it would actually probably be easier to just
 202 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 203 including having this work sanely.
 204
 205 ===-------------------------------------------------------------------------===
 206
 207 Fix Darwin FP-In-Integer Registers ABI
 208
 209 Darwin passes doubles in structures in integer registers, which is very very
 210 bad.  Add something like a BITCAST to LLVM, then do an interprocedural
 211 transformation that percolates these things out of functions.
 212
 213 Check out how horrible this is:
 214 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 215
 216 This is an extension of "interprocedural CC unmunging" that can't be done with
 217 just fastcc.
 218
 219 ===-------------------------------------------------------------------------===
 220
 221 Compile this:
 222
 223 int foo(int a) {
 224   int b = (a < 8);
 225   if (b) {
 226     return b * 3;     // ignore the fact that this is always 3.
 227   } else {
 228     return 2;
 229   }
 230 }
 231
 232 into something other than this:
 233
 234 _foo:
 235 1)      cmpwi cr7, r3, 8
 236         mfcr r2, 1
 237         rlwinm r2, r2, 29, 31, 31
 238 1)      cmpwi cr0, r3, 7
 239         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 240 LBB1_1: ; then
 241         rlwinm r2, r2, 0, 31, 31
 242         mulli r3, r2, 3
 243         blr
 244 LBB1_2: ; UnifiedReturnBlock
 245         li r3, 2
 246         blr
 247
 248 In particular, the two compares (marked 1) could be shared by reversing one.
 249 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 250 same operands (but backwards) exists.  In this case, this wouldn't save us
 251 anything though, because the compares still wouldn't be shared.
 252
 253 ===-------------------------------------------------------------------------===
 254
 255 Fold add and sub with constant into non-extern, non-weak addresses so this:
 256
 257 static int a;
 258 void bar(int b) { a = b; }
 259 void foo(unsigned char *c) {
 260   *c = a;
 261 }
 262
 263 So that
 264
 265 _foo:
 266         lis r2, ha16(_a)
 267         la r2, lo16(_a)(r2)
 268         lbz r2, 3(r2)
 269         stb r2, 0(r3)
 270         blr
 271
 272 Becomes
 273
 274 _foo:
 275         lis r2, ha16(_a+3)
 276         lbz r2, lo16(_a+3)(r2)
 277         stb r2, 0(r3)
 278         blr
 279
 280 ===-------------------------------------------------------------------------===
 281
 282 We generate really bad code for this:
 283
 284 int f(signed char *a, _Bool b, _Bool c) {
 285    signed char t = 0;
 286   if (b)  t = *a;
 287   if (c)  *a = t;
 288 }
 289
 290 ===-------------------------------------------------------------------------===
 291
 292 This:
 293 int test(unsigned *P) { return *P >> 24; }
 294
 295 Should compile to:
 296
 297 _test:
 298         lbz r3,0(r3)
 299         blr
 300
 301 not:
 302
 303 _test:
 304         lwz r2, 0(r3)
 305         srwi r3, r2, 24
 306         blr
 307
 308 ===-------------------------------------------------------------------------===
 309
 310 On the G5, logical CR operations are more expensive in their three
 311 address form: ops that read/write the same register are half as expensive as
 312 those that read from two registers that are different from their destination.
 313
 314 We should model this with two separate instructions.  The isel should generate
 315 the "two address" form of the instructions.  When the register allocator
 316 detects that it needs to insert a copy due to the two-addressness of the CR
 317 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 318 we can convert to the "three address" instruction, to save code space.
 319
 320 This only matters when we start generating cr logical ops.
 321
 322 ===-------------------------------------------------------------------------===
 323
 324 We should compile these two functions to the same thing:
 325
 326 #include <stdlib.h>
 327 void f(int a, int b, int *P) {
 328   *P = (a-b)>=0?(a-b):(b-a);
 329 }
 330 void g(int a, int b, int *P) {
 331   *P = abs(a-b);
 332 }
 333
 334 Further, they should compile to something better than:
 335
 336 _g:
 337         subf r2, r4, r3
 338         subfic r3, r2, 0
 339         cmpwi cr0, r2, -1
 340         bgt cr0, LBB2_2 ; entry
 341 LBB2_1: ; entry
 342         mr r2, r3
 343 LBB2_2: ; entry
 344         stw r2, 0(r5)
 345         blr
 346
 347 GCC produces:
 348
 349 _g:
 350         subf r4,r4,r3
 351         srawi r2,r4,31
 352         xor r0,r2,r4
 353         subf r0,r2,r0
 354         stw r0,0(r5)
 355         blr
 356
 357 ... which is much nicer.
 358
 359 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 360
 361 ===-------------------------------------------------------------------------===
 362
 363 PR5945: This:
 364 define i32 @clamp0g(i32 %a) {
 365 entry:
 366         %cmp = icmp slt i32 %a, 0
 367         %sel = select i1 %cmp, i32 0, i32 %a
 368         ret i32 %sel
 369 }
 370
 371 Is compiled to this with the PowerPC (32-bit) backend:
 372
 373 _clamp0g:
 374         cmpwi cr0, r3, 0
 375         li r2, 0
 376         blt cr0, LBB1_2
 377 ; BB#1:                                                     ; %entry
 378         mr r2, r3
 379 LBB1_2:                                                     ; %entry
 380         mr r3, r2
 381         blr
 382
 383 This could be reduced to the much simpler:
 384
 385 _clamp0g:
 386         srawi r2, r3, 31
 387         andc r3, r3, r2
 388         blr
 389
 390 ===-------------------------------------------------------------------------===
 391
 392 int foo(int N, int ***W, int **TK, int X) {
 393   int t, i;
 394
 395   for (t = 0; t < N; ++t)
 396     for (i = 0; i < 4; ++i)
 397       W[t / X][i][t % X] = TK[i][t];
 398
 399   return 5;
 400 }
 401
 402 We generate relatively atrocious code for this loop compared to gcc.
 403
 404 We could also strength reduce the rem and the div:
 405 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 406
 407 ===-------------------------------------------------------------------------===
 408
 409 float foo(float X) { return (int)(X); }
 410
 411 Currently produces:
 412
 413 _foo:
 414         fctiwz f0, f1
 415         stfd f0, -8(r1)
 416         lwz r2, -4(r1)
 417         extsw r2, r2
 418         std r2, -16(r1)
 419         lfd f0, -16(r1)
 420         fcfid f0, f0
 421         frsp f1, f0
 422         blr
 423
 424 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 425 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 426 win only.
 427
 428 ===-------------------------------------------------------------------------===
 429
 430 We generate ugly code for this:
 431
 432 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 433   unsigned code = 0;
 434   if(dx < -dw) code |= 1;
 435   if(dx > dw)  code |= 2;
 436   if(dy < -dw) code |= 4;
 437   if(dy > dw)  code |= 8;
 438   if(dz < -dw) code |= 16;
 439   if(dz > dw)  code |= 32;
 440   *ret = code;
 441 }
 442
 443 ===-------------------------------------------------------------------------===
 444
 445 %struct.B = type { i8, [3 x i8] }
 446
 447 define void @bar(%struct.B* %b) {
 448 entry:
 449         %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
 450         %tmp = load i32* %tmp          ; <uint> [#uses=1]
 451         %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
 452         %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
 453         %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
 454         %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
 455         %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
 456         %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 457         %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
 458         %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 459         %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
 460         %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
 461         store i32 %tmp13, i32* %tmp8
 462         ret void
 463 }
 464
 465 We emit:
 466
 467 _foo:
 468         lwz r2, 0(r3)
 469         slwi r4, r2, 1
 470         or r4, r4, r2
 471         rlwimi r2, r4, 0, 0, 0
 472         stw r2, 0(r3)
 473         blr
 474
 475 We could collapse a bunch of those ORs and ANDs and generate the following
 476 equivalent code:
 477
 478 _foo:
 479         lwz r2, 0(r3)
 480         rlwinm r4, r2, 1, 0, 0
 481         or r2, r2, r4
 482         stw r2, 0(r3)
 483         blr
 484
 485 ===-------------------------------------------------------------------------===
 486
 487 Consider a function like this:
 488
 489 float foo(float X) { return X + 1234.4123f; }
 490
 491 The FP constant ends up in the constant pool, so we need to get the LR register.
 492  This ends up producing code like this:
 493
 494 _foo:
 495 .LBB_foo_0:     ; entry
 496         mflr r11
 497 ***     stw r11, 8(r1)
 498         bl "L00000$pb"
 499 "L00000$pb":
 500         mflr r2
 501         addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
 502         lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
 503         fadds f1, f1, f0
 504 ***     lwz r11, 8(r1)
 505         mtlr r11
 506         blr
 507
 508 This is functional, but there is no reason to spill the LR register all the way
 509 to the stack (the two marked instrs): spilling it to a GPR is quite enough.
 510
 511 Implementing this will require some codegen improvements.  Nate writes:
 512
 513 "So basically what we need to support the "no stack frame save and restore" is a
 514 generalization of the LR optimization to "callee-save regs".
 515
 516 Currently, we have LR marked as a callee-save reg.  The register allocator sees
 517 that it's callee save, and spills it directly to the stack.
 518
 519 Ideally, something like this would happen:
 520
 521 LR would be in a separate register class from the GPRs. The class of LR would be
 522 marked "unspillable".  When the register allocator came across an unspillable
 523 reg, it would ask "what is the best class to copy this into that I *can* spill"
 524 If it gets a class back, which it will in this case (the gprs), it grabs a free
 525 register of that class.  If it is then later necessary to spill that reg, so be
 526 it."
 527
 528 ===-------------------------------------------------------------------------===
 529
 530 We compile this:
 531 int test(_Bool X) {
 532   return X ? 524288 : 0;
 533 }
 534
 535 to:
 536 _test:
 537         cmplwi cr0, r3, 0
 538         lis r2, 8
 539         li r3, 0
 540         beq cr0, LBB1_2 ;entry
 541 LBB1_1: ;entry
 542         mr r3, r2
 543 LBB1_2: ;entry
 544         blr
 545
 546 instead of:
 547 _test:
 548         addic r2,r3,-1
 549         subfe r0,r2,r3
 550         slwi r3,r0,19
 551         blr
 552
 553 This sort of thing occurs a lot due to globalopt.
 554
 555 ===-------------------------------------------------------------------------===
 556
 557 We compile:
 558
 559 define i32 @bar(i32 %x) nounwind readnone ssp {
 560 entry:
 561   %0 = icmp eq i32 %x, 0                          ; <i1> [#uses=1]
 562   %neg = sext i1 %0 to i32              ; <i32> [#uses=1]
 563   ret i32 %neg
 564 }
 565
 566 to:
 567
 568 _bar:
 569         cntlzw r2, r3
 570         slwi r2, r2, 26
 571         srawi r3, r2, 31
 572         blr
 573
 574 it would be better to produce:
 575
 576 _bar:
 577         addic r3,r3,-1
 578         subfe r3,r3,r3
 579         blr
 580
 581 ===-------------------------------------------------------------------------===
 582
 583 test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
 584
 585 __ZNK4llvm5APInt17countLeadingZerosEv:
 586         ld r2, 0(r3)
 587         cntlzd r2, r2
 588         or r2, r2, r2     <<-- silly.
 589         addi r3, r2, -64
 590         blr
 591
 592 The dead or is a 'truncate' from 64- to 32-bits.
 593
 594 ===-------------------------------------------------------------------------===
 595
 596 We generate horrible ppc code for this:
 597
 598 #define N  2000000
 599 double   a[N],c[N];
 600 void simpleloop() {
 601    int j;
 602    for (j=0; j<N; j++)
 603      c[j] = a[j];
 604 }
 605
 606 LBB1_1: ;bb
 607         lfdx f0, r3, r4
 608         addi r5, r5, 1                 ;; Extra IV for the exit value compare.
 609         stfdx f0, r2, r4
 610         addi r4, r4, 8
 611
 612         xoris r6, r5, 30               ;; This is due to a large immediate.
 613         cmplwi cr0, r6, 33920
 614         bne cr0, LBB1_1
 615
 616 //===---------------------------------------------------------------------===//
 617
 618 This:
 619         #include <algorithm>
 620         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 621         { return std::make_pair(a + b, a + b < a); }
 622         bool no_overflow(unsigned a, unsigned b)
 623         { return !full_add(a, b).second; }
 624
 625 Should compile to:
 626
 627 __Z11no_overflowjj:
 628         add r4,r3,r4
 629         subfc r3,r3,r4
 630         li r3,0
 631         adde r3,r3,r3
 632         blr
 633
 634 (or better) not:
 635
 636 __Z11no_overflowjj:
 637         add r2, r4, r3
 638         cmplw cr7, r2, r3
 639         mfcr r2
 640         rlwinm r2, r2, 29, 31, 31
 641         xori r3, r2, 1
 642         blr
 643
 644 //===---------------------------------------------------------------------===//
 645
 646 We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
 647 example:
 648 #include <math.h>
 649 int test(double x, double y) { return islessequal(x, y);}
 650 int test2(double x, double y) {  return islessgreater(x, y);}
 651 int test3(double x, double y) {  return !islessequal(x, y);}
 652
 653 Compiles into (all three are similar, but the bits differ):
 654
 655 _test:
 656         fcmpu cr7, f1, f2
 657         mfcr r2
 658         rlwinm r3, r2, 29, 31, 31
 659         rlwinm r2, r2, 31, 31, 31
 660         or r3, r2, r3
 661         blr
 662
 663 GCC compiles this into:
 664
 665  _test:
 666         fcmpu cr7,f1,f2
 667         cror 30,28,30
 668         mfcr r3
 669         rlwinm r3,r3,31,1
 670         blr
 671
 672 which is more efficient and can use mfocr.  See PR642 for some more context.
 673
 674 //===---------------------------------------------------------------------===//
 675
 676 void foo(float *data, float d) {
 677    long i;
 678    for (i = 0; i < 8000; i++)
 679       data[i] = d;
 680 }
 681 void foo2(float *data, float d) {
 682    long i;
 683    data--;
 684    for (i = 0; i < 8000; i++) {
 685       data[1] = d;
 686       data++;
 687    }
 688 }
 689
 690 These compile to:
 691
 692 _foo:
 693         li r2, 0
 694 LBB1_1: ; bb
 695         addi r4, r2, 4
 696         stfsx f1, r3, r2
 697         cmplwi cr0, r4, 32000
 698         mr r2, r4
 699         bne cr0, LBB1_1 ; bb
 700         blr
 701 _foo2:
 702         li r2, 0
 703 LBB2_1: ; bb
 704         addi r4, r2, 4
 705         stfsx f1, r3, r2
 706         cmplwi cr0, r4, 32000
 707         mr r2, r4
 708         bne cr0, LBB2_1 ; bb
 709         blr
 710
 711 The 'mr' could be eliminated by folding the add into the cmp better.
 712
 713 //===---------------------------------------------------------------------===//
 714 Codegen for the following (low-probability) case deteriorated considerably
 715 when the correctness fixes for unordered comparisons went in (PR 642, 58871).
 716 It should be possible to recover the code quality described in the comments.
 717
 718 ; RUN: llvm-as < %s | llc -march=ppc32  | grep or | count 3
 719 ; This should produce one 'or' or 'cror' instruction per function.
 720
 721 ; RUN: llvm-as < %s | llc -march=ppc32  | grep mfcr | count 3
 722 ; PR2964
 723
 724 define i32 @test(double %x, double %y) nounwind  {
 725 entry:
 726         %tmp3 = fcmp ole double %x, %y          ; <i1> [#uses=1]
 727         %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
 728         ret i32 %tmp345
 729 }
 730
 731 define i32 @test2(double %x, double %y) nounwind  {
 732 entry:
 733         %tmp3 = fcmp one double %x, %y          ; <i1> [#uses=1]
 734         %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
 735         ret i32 %tmp345
 736 }
 737
 738 define i32 @test3(double %x, double %y) nounwind  {
 739 entry:
 740         %tmp3 = fcmp ugt double %x, %y          ; <i1> [#uses=1]
 741         %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
 742         ret i32 %tmp34
 743 }
 744 //===----------------------------------------------------------------------===//
 745 ; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
 746
 747 ; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
 748 ; should not be generated except with -enable-finite-only-fp-math or the like).
 749 ; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
 750 ; recognize a more elaborate tree than a simple SETxx.
 751
 752 define double @test_FNEG_sel(double %A, double %B, double %C) {
 753         %D = fsub double -0.000000e+00, %A               ; <double> [#uses=1]
 754         %Cond = fcmp ugt double %D, -0.000000e+00               ; <i1> [#uses=1]
 755         %E = select i1 %Cond, double %B, double %C              ; <double> [#uses=1]
 756         ret double %E
 757 }
 758
 759 //===----------------------------------------------------------------------===//
 760 The save/restore sequence for CR in prolog/epilog is terrible:
 761 - Each CR subreg is saved individually, rather than doing one save as a unit.
 762 - On Darwin, the save is done after the decrement of SP, which means the offset
 763 from SP of the save slot can be too big for a store instruction, which means we
 764 need an additional register (currently hacked in 96015+96020; the solution there
 765 is correct, but poor).
 766 - On SVR4 the same thing can happen, and I don't think saving before the SP
 767 decrement is safe on that target, as there is no red zone.  This is currently
 768 broken AFAIK, although it's not a target I can exercise.
 769 The following demonstrates the problem:
 770 extern void bar(char *p);
 771 void foo() {
 772   char x[100000];
 773   bar(x);
 774   __asm__("" ::: "cr2");
 775 }