From f47d167c3be808a008105e74516f33e8d215d71c Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Mon, 11 Sep 2006 05:35:17 +0000 Subject: [PATCH] Updates. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@30245 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/README-SSE.txt | 116 ---------------------------------- lib/Target/X86/README.txt | 38 ----------- 2 files changed, 154 deletions(-) diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index b68d43e4658..e019afa966f 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -147,32 +147,6 @@ and ISD::FMAX node types? //===---------------------------------------------------------------------===// -The first BB of this code: - -declare bool %foo() -int %bar() { - %V = call bool %foo() - br bool %V, label %T, label %F -T: - ret int 1 -F: - call bool %foo() - ret int 12 -} - -compiles to: - -_bar: - subl $12, %esp - call L_foo$stub - xorb $1, %al - testb %al, %al - jne LBB_bar_2 # F - -It would be better to emit "cmp %al, 1" than a xor and test. - -//===---------------------------------------------------------------------===// - Lower memcpy / memset to a series of SSE 128 bit move instructions when it's feasible. @@ -274,33 +248,6 @@ instead of por and movdqa. Does it matter? //===---------------------------------------------------------------------===// -Use movddup to splat a v2f64 directly from a memory source. e.g. - -#include <emmintrin.h> - -void test(__m128d *r, double A) { - *r = _mm_set1_pd(A); -} - -llc: - -_test: - movsd 8(%esp), %xmm0 - unpcklpd %xmm0, %xmm0 - movl 4(%esp), %eax - movapd %xmm0, (%eax) - ret - -icc: - -_test: - movl 4(%esp), %eax - movddup 8(%esp), %xmm0 - movapd %xmm0, (%eax) - ret - -//===---------------------------------------------------------------------===// - X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible to choose between movaps, movapd, and movdqa based on types of source and destination? 
@@ -311,69 +258,6 @@ shorter. //===---------------------------------------------------------------------===// -We are emitting bad code for this: - -float %test(float* %V, int %I, int %D, float %V) { -entry: - %tmp = seteq int %D, 0 - br bool %tmp, label %cond_true, label %cond_false23 - -cond_true: - %tmp3 = getelementptr float* %V, int %I - %tmp = load float* %tmp3 - %tmp5 = setgt float %tmp, %V - %tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V ) - %tmp7 = or bool %tmp5, %tmp6 - br bool %tmp7, label %UnifiedReturnBlock, label %cond_next - -cond_next: - %tmp10 = add int %I, 1 - %tmp12 = getelementptr float* %V, int %tmp10 - %tmp13 = load float* %tmp12 - %tmp15 = setle float %tmp13, %V - %tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V ) - %tmp17 = or bool %tmp15, %tmp16 - %retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00 - ret float %retval - -cond_false23: - %tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V ) - ret float %tmp28 - -UnifiedReturnBlock: ; preds = %cond_true - ret float 0.000000e+00 -} - -declare bool %llvm.isunordered.f32(float, float) - -declare float %foo(float*, int, int, float) - - -It exposes a known load folding problem: - - movss (%edx,%ecx,4), %xmm1 - ucomiss %xmm1, %xmm0 - -As well as this: - -LBB_test_2: # cond_next - movss LCPI1_0, %xmm2 - pxor %xmm3, %xmm3 - ucomiss %xmm0, %xmm1 - jbe LBB_test_6 # cond_next -LBB_test_5: # cond_next - movaps %xmm2, %xmm3 -LBB_test_6: # cond_next - movss %xmm3, 40(%esp) - flds 40(%esp) - addl $44, %esp - ret - -Clearly it's unnecessary to clear %xmm3. It's also not clear why we are emitting -three moves (movss, movaps, movss). - -//===---------------------------------------------------------------------===// - External test Nurbs exposed some problems. Look for __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. 
This is what icc emits: diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 94c8ea13b39..fdfc9fb0269 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -390,44 +390,6 @@ require a copy to be inserted (in X86InstrInfo::convertToThreeAddress). //===---------------------------------------------------------------------===// -This code generates ugly code, probably due to costs being off or something: - -void %test(float* %P, <4 x float>* %P2 ) { - %xFloat0.688 = load float* %P - %loadVector37.712 = load <4 x float>* %P2 - %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3 - store <4 x float> %inFloat3.713, <4 x float>* %P2 - ret void -} - -Generates: - -_test: - pxor %xmm0, %xmm0 - movd %xmm0, %eax ;; EAX = 0! - movl 8(%esp), %ecx - movaps (%ecx), %xmm0 - pinsrw $6, %eax, %xmm0 - shrl $16, %eax ;; EAX = 0 again! - pinsrw $7, %eax, %xmm0 - movaps %xmm0, (%ecx) - ret - -It would be better to generate: - -_test: - movl 8(%esp), %ecx - movaps (%ecx), %xmm0 - xor %eax, %eax - pinsrw $6, %eax, %xmm0 - pinsrw $7, %eax, %xmm0 - movaps %xmm0, (%ecx) - ret - -or use pxor (to make a zero vector) and shuffle (to insert it). - -//===---------------------------------------------------------------------===// - Bad codegen: char foo(int x) { return x; } -- 2.34.1