//===---------------------------------------------------------------------===//
+Expand libm rounding functions inline: Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
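+
+A hypothetical example of code that would benefit: each call below goes
+through libm today, and expanding it inline (e.g. as a short SSE sequence)
+avoids the call overhead in hot loops.
+
+#include <math.h>
+
+double round_three_ways(double x) {
+  /* floor, ceil and rint are all candidates for inline expansion. */
+  return floor(x) + ceil(x) + rint(x);
+}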
+
+//===---------------------------------------------------------------------===//
+
When compiled with unsafe math enabled (e.g. -enable-unsafe-fp-math), "main"
should enable SSE DAZ mode and other fast SSE modes.
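+
+A minimal sketch (an assumption, not existing startup code) of what "main" or
+a startup hook could do, setting the FTZ and DAZ bits in MXCSR:
+
+#include <xmmintrin.h>
+
+static void enable_fast_sse_modes(void) {
+  /* Bit 15 = FTZ (flush-to-zero), bit 6 = DAZ (denormals-are-zero).
+     DAZ is not available on all SSE implementations, so a real version
+     would have to check for support first. */
+  _mm_setcsr(_mm_getcsr() | 0x8040);
+}
+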
//===---------------------------------------------------------------------===//
-Should generate min/max for stuff like:
-
-void minf(float a, float b, float *X) {
- *X = a <= b ? a : b;
-}
-
-Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
-and ISD::FMAX node types?
-
-//===---------------------------------------------------------------------===//
-
-The first BB of this code:
-
-declare bool %foo()
-int %bar() {
- %V = call bool %foo()
- br bool %V, label %T, label %F
-T:
- ret int 1
-F:
- call bool %foo()
- ret int 12
-}
-
-compiles to:
-
-_bar:
- subl $12, %esp
- call L_foo$stub
- xorb $1, %al
- testb %al, %al
- jne LBB_bar_2 # F
-
-It would be better to emit "cmpb $1, %al" than a xor and test.
-
-//===---------------------------------------------------------------------===//
-
Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.
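+
+A hypothetical example: a small, fixed-size copy like this one could be
+lowered to a short sequence of 128-bit SSE moves (movaps/movups/movdqa)
+instead of a call to memcpy:
+
+#include <string.h>
+
+struct blob { float data[16]; };   /* 64 bytes */
+
+void copy_blob(struct blob *dst, const struct blob *src) {
+  memcpy(dst, src, sizeof *dst);
+}
+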
//===---------------------------------------------------------------------===//
-Better codegen for:
-
-void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
-void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }
-
-For the latter we generate:
-
-_f:
- pxor %xmm0, %xmm0
- movss 8(%esp), %xmm1
- movaps %xmm0, %xmm2
- unpcklps %xmm1, %xmm2
- movss 4(%esp), %xmm1
- unpcklps %xmm0, %xmm1
- unpcklps %xmm2, %xmm1
- movl 12(%esp), %eax
- movaps %xmm1, (%eax)
- ret
-
-This seems like it should use shufps, one for each of a & b.
-
-//===---------------------------------------------------------------------===//
-
How to decide when to use the "floating point version" of logical ops? Here are
some code fragments:
//===---------------------------------------------------------------------===//
-Use movddup to splat a v2f64 directly from a memory source. e.g.
-
-#include <emmintrin.h>
-
-void test(__m128d *r, double A) {
- *r = _mm_set1_pd(A);
-}
-
-llc:
-
-_test:
- movsd 8(%esp), %xmm0
- unpcklpd %xmm0, %xmm0
- movl 4(%esp), %eax
- movapd %xmm0, (%eax)
- ret
-
-icc:
-
-_test:
- movl 4(%esp), %eax
- movddup 8(%esp), %xmm0
- movapd %xmm0, (%eax)
- ret
-
-//===---------------------------------------------------------------------===//
-
X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
to choose between movaps, movapd, and movdqa based on types of source and
destination?
//===---------------------------------------------------------------------===//
-We are emitting bad code for this:
-
-float %test(float* %V, int %I, int %D, float %V) {
-entry:
- %tmp = seteq int %D, 0
- br bool %tmp, label %cond_true, label %cond_false23
-
-cond_true:
- %tmp3 = getelementptr float* %V, int %I
- %tmp = load float* %tmp3
- %tmp5 = setgt float %tmp, %V
- %tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V )
- %tmp7 = or bool %tmp5, %tmp6
- br bool %tmp7, label %UnifiedReturnBlock, label %cond_next
-
-cond_next:
- %tmp10 = add int %I, 1
- %tmp12 = getelementptr float* %V, int %tmp10
- %tmp13 = load float* %tmp12
- %tmp15 = setle float %tmp13, %V
- %tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V )
- %tmp17 = or bool %tmp15, %tmp16
- %retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00
- ret float %retval
-
-cond_false23:
- %tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V )
- ret float %tmp28
-
-UnifiedReturnBlock: ; preds = %cond_true
- ret float 0.000000e+00
-}
-
-declare bool %llvm.isunordered.f32(float, float)
-
-declare float %foo(float*, int, int, float)
-
-
-It exposes a known load folding problem:
-
- movss (%edx,%ecx,4), %xmm1
- ucomiss %xmm1, %xmm0
-
-As well as this:
-
-LBB_test_2: # cond_next
- movss LCPI1_0, %xmm2
- pxor %xmm3, %xmm3
- ucomiss %xmm0, %xmm1
- jbe LBB_test_6 # cond_next
-LBB_test_5: # cond_next
- movaps %xmm2, %xmm3
-LBB_test_6: # cond_next
- movss %xmm3, 40(%esp)
- flds 40(%esp)
- addl $44, %esp
- ret
-
-Clearly it's unnecessary to clear %xmm3. It's also not clear why we are emitting
-three moves (movss, movaps, movss).
-
-//===---------------------------------------------------------------------===//
-
External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:
//===---------------------------------------------------------------------===//
-Implement some missing insert/extract element operations without going through
-the stack. Testcase here:
-CodeGen/X86/vec_ins_extract.ll
-corresponds to this C code:
+Apply the same transformation that merged four scalar float loads into a
+single 128-bit load to loads from the constant pool.
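+
+A hypothetical illustration: without the transformation, each element below
+may be loaded from its own constant-pool entry and shuffled together; with
+it, one 16-byte constant is loaded with a single movaps.
+
+#include <xmmintrin.h>
+
+__m128 four_constants(void) {
+  return _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
+}
+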
-typedef float vectorfloat __attribute__((vector_size(16)));
-void test(vectorfloat *F, float f) {
- vectorfloat G = *F + *F;
- *((float*)&G) = f;
- *F = G + G;
-}
-void test2(vectorfloat *F, float f) {
- vectorfloat G = *F + *F;
- ((float*)&G)[2] = f;
- *F = G + G;
-}
-void test3(vectorfloat *F, float *f) {
- vectorfloat G = *F + *F;
- *f = ((float*)&G)[2];
-}
-void test4(vectorfloat *F, float *f) {
- vectorfloat G = *F + *F;
- *f = *((float*)&G);
+//===---------------------------------------------------------------------===//
+
+Floating point max / min are commutable when -enable-unsafe-fp-math is
+specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
+nodes which are selected to max / min instructions that are marked commutable.
+
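+The reason they are not already commutable: for maxss/maxps (and min), when
+an operand is a NaN, or when comparing +0.0 and -0.0, the result is taken
+from the second (source) operand, so the two operand orders can differ.
+Under unsafe FP math that distinction may be ignored, letting isel swap
+operands, e.g. to fold a load into either side. A hypothetical illustration:
+
+#include <xmmintrin.h>
+
+__m128 pick_larger(__m128 a, __m128 b) {
+  /* _mm_max_ps(a, b) and _mm_max_ps(b, a) can differ when an element is
+     NaN, which is what normally prevents marking the instruction
+     commutable. */
+  return _mm_max_ps(a, b);
+}
+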
+//===---------------------------------------------------------------------===//
+
+We should compile this:
+#include <xmmintrin.h>
+typedef union {
+ int i[4];
+ float f[4];
+ __m128 v;
+} vector4_t;
+void swizzle (const void *a, vector4_t * b, vector4_t * c) {
+ b->v = _mm_loadl_pi (b->v, (__m64 *) a);
+ c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
}
+to:
+
+_swizzle:
+ movl 4(%esp), %eax
+ movl 8(%esp), %edx
+ movl 12(%esp), %ecx
+ movlps (%eax), %xmm0
+ movlps %xmm0, (%edx)
+ movlps 8(%eax), %xmm0
+ movlps %xmm0, (%ecx)
+ ret
+
+not:
+
+swizzle:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ movl 4(%esp), %ecx
+ movlps (%ecx), %xmm0
+ movaps %xmm0, (%eax)
+ movl 12(%esp), %eax
+ movaps (%eax), %xmm0
+ movlps 8(%ecx), %xmm0
+ movaps %xmm0, (%eax)
+ ret
+
//===---------------------------------------------------------------------===//