//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
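On the bdnz item: a counted loop whose trip count is known before entry can keep the count in CTR via mtctr and close the loop with bdnz (decrement CTR, branch while nonzero), replacing the add/compare/branch sequence. A C-level sketch of the kind of loop the transform targets (the asm in the comments is the intent, not current output):

```c
#include <assert.h>

/* Candidate for the do-loop -> bdnz transform: the trip count (n) is known
   before the loop, so instead of incrementing i and comparing against n on
   every iteration, the backend can do "mtctr n" once and end the loop with
   "bdnz", which decrements CTR and branches while it is nonzero -- one
   instruction in place of the add/compare/branch sequence.  Note a
   bdnz-style loop body executes at least once, so the transform must guard
   or prove n > 0. */
int sum_first_n(int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i)   /* mtctr n; ...body...; bdnz loop */
    sum += i;
  return sum;
}
```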
===-------------------------------------------------------------------------===
Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
TargetConstantVec's if it's one of the many forms that are algorithmically
computable using the spiffy altivec instructions.
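For example, splat constants with small elements are computable in one instruction: vspltisw replicates a signed 5-bit immediate into all four word lanes, with no constant-pool load. A minimal sketch of the legality check (helper name is hypothetical):

```c
#include <stdint.h>

/* vspltisw splats a signed 5-bit immediate (-16..15) into all four 32-bit
   lanes of a vector register.  This hypothetical helper checks whether a
   ConstantVec that splats `val` into every element could be materialized
   that way instead of being loaded from the constant pool. */
int is_vspltisw_immediate(int32_t val) {
  return val >= -16 && val <= 15;
}
```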

===-------------------------------------------------------------------------===

Compile this:
int foo(int a) {
===-------------------------------------------------------------------------===
Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
registers, to generate better spill code.

===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
int t, i;
===-------------------------------------------------------------------------===
Altivec support.  The first should be a single lvx from the constant pool, the
second should be a xor/stvx:

void foo(void) {
  int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 1, 1, 1, 1, 1 };
  bar (x);
}

#include <string.h>
void foo(void) {
  int x[8] __attribute__((aligned(128)));
  memset (x, 0, sizeof (x));
  bar (x);
}

===-------------------------------------------------------------------------===

Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763

We need to codegen -0.0 vector efficiently (no constant pool load).

When -ffast-math is on, we can use 0.0.
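The reason -0.0 is required: under round-to-nearest, x + (-0.0) == x for every x, but (-0.0) + (+0.0) == +0.0, so lowering a vector multiply as FMADD(a, b, +0.0) loses the sign of a -0.0 product. A small scalar demonstration (helper name is ours):

```c
#include <assert.h>
#include <math.h>

/* Why FMADD(a, b, c) needs c = -0.0 to implement a plain multiply: when the
   product a*b is -0.0, adding +0.0 yields +0.0 (sign lost), while adding
   -0.0 preserves it.  With -ffast-math, signed zeros don't matter, so +0.0
   is acceptable there. */
int signbit_after_add(double product, double addend) {
  return signbit(product + addend);
}
```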

===-------------------------------------------------------------------------===

float foo(float X) { return (int)(X); }
Currently produces:
_foo:
fctiwz f0, f1
stfd f0, -8(r1)
lwz r2, -4(r1)
extsw r2, r2
std r2, -16(r1)
lfd f0, -16(r1)
fcfid f0, f0
frsp f1, f0
blr
We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
win only.
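The lwz/extsw pair is just a sign-extending 32-bit load, which is what lwa performs in a single (cracked) instruction. In C terms, the pattern the combine would match (a sketch; the function name is ours):

```c
#include <stdint.h>

/* lwz loads 32 bits zero-extended; extsw then sign-extends bit 31 into the
   upper word.  lwa is a sign-extending word load, so when the lwz result is
   only used by the extsw, the pair collapses to one instruction. */
int64_t sign_extending_load(const int32_t *p) {
  return (int64_t)*p;   /* lwz + extsw today; could be a single lwa */
}
```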
===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw)  code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw)  code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw)  code |= 32;
  *ret = code;
}
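The ugliness comes from six separate compare-and-branch sequences. A branch-free formulation of the same outcode computation (a C sketch of the lowering we would like, not what we currently emit):

```c
/* Each comparison is evaluated to 0/1 and merged with shifts and ors, so no
   control flow is needed.  Semantically identical to func() above. */
void func_branchless(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  code |= (unsigned)(dx < -dw);
  code |= (unsigned)(dx >  dw) << 1;
  code |= (unsigned)(dy < -dw) << 2;
  code |= (unsigned)(dy >  dw) << 3;
  code |= (unsigned)(dz < -dw) << 4;
  code |= (unsigned)(dz >  dw) << 5;
  *ret = code;
}
```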

===-------------------------------------------------------------------------===