CodeGen peephole: fold redundant phys reg copies

[oota-llvm.git] / test / CodeGen / X86 / sse41.ll
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll

index ca13392ffe8a233ba7d6391c7760922b9ffb0b1b..0a83a9753b81a0939e482508d1341cfbb57f8614 100644 (file)
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -31,49 +31,6 @@ define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
    ret <16 x i8> %tmp1
  }
  
-define <2 x i64> @pmovsxbd_1(i32* %p) nounwind {
-; X32-LABEL: pmovsxbd_1:
-; X32:       ## BB#0: ## %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    pmovsxbd (%eax), %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: pmovsxbd_1:
-; X64:       ## BB#0: ## %entry
-; X64-NEXT:    pmovsxbd (%rdi), %xmm0
-; X64-NEXT:    retq
-entry:
-       %0 = load i32, i32* %p, align 4
-       %1 = insertelement <4 x i32> undef, i32 %0, i32 0
-       %2 = insertelement <4 x i32> %1, i32 0, i32 1
-       %3 = insertelement <4 x i32> %2, i32 0, i32 2
-       %4 = insertelement <4 x i32> %3, i32 0, i32 3
-       %5 = bitcast <4 x i32> %4 to <16 x i8>
-       %6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone
-       %7 = bitcast <4 x i32> %6 to <2 x i64>
-       ret <2 x i64> %7
-}
-
-define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly {
-; X32-LABEL: pmovsxwd_1:
-; X32:       ## BB#0: ## %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    pmovsxwd (%eax), %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: pmovsxwd_1:
-; X64:       ## BB#0: ## %entry
-; X64-NEXT:    pmovsxwd (%rdi), %xmm0
-; X64-NEXT:    retq
-entry:
-       %0 = load i64, i64* %p          ; <i64> [#uses=1]
-       %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0          ; <<2 x i64>> [#uses=1]
-       %1 = bitcast <2 x i64> %tmp2 to <8 x i16>               ; <<8 x i16>> [#uses=1]
-       %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone               ; <<4 x i32>> [#uses=1]
-       %3 = bitcast <4 x i32> %2 to <2 x i64>          ; <<2 x i64>> [#uses=1]
-       ret <2 x i64> %3
-}
-
  define <2 x i64> @pmovzxbq_1() nounwind {
  ; X32-LABEL: pmovzxbq_1:
  ; X32:       ## BB#0: ## %entry
@@ -94,8 +51,6 @@ entry:
         ret <2 x i64> %3
  }
  
-declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
-declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
  declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
  
  define i32 @extractps_1(<4 x float> %v) nounwind {
@@ -137,7 +92,7 @@ define float @ext_1(<4 x float> %v) nounwind {
  ; X32:       ## BB#0:
  ; X32-NEXT:    pushl %eax
  ; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; X32-NEXT:    addss LCPI7_0, %xmm0
+; X32-NEXT:    addss LCPI5_0, %xmm0
  ; X32-NEXT:    movss %xmm0, (%esp)
  ; X32-NEXT:    flds (%esp)
  ; X32-NEXT:    popl %eax
@@ -204,7 +159,7 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) noun
  define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
  ; X32-LABEL: blendps_not_insertps_1:
  ; X32:       ## BB#0:
-; X32-NEXT:    movss   {{.*#+}} xmm1
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
  ; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
  ; X32-NEXT:    retl
  ;
@@ -839,12 +794,12 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap
  ; X32-LABEL: insertps_from_vector_load:
  ; X32:       ## BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: insertps_from_vector_load:
  ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
  ; X64-NEXT:    retq
    %1 = load <4 x float>, <4 x float>* %pb, align 16
    %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -857,12 +812,12 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>
  ; X32-LABEL: insertps_from_vector_load_offset:
  ; X32:       ## BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X32-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: insertps_from_vector_load_offset:
  ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X64-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
  ; X64-NEXT:    retq
    %1 = load <4 x float>, <4 x float>* %pb, align 16
    %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -876,13 +831,13 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
  ; X32-NEXT:    shll $4, %ecx
-; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X32-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: insertps_from_vector_load_offset_2:
  ; X64:       ## BB#0:
  ; X64-NEXT:    shlq $4, %rsi
-; X64-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X64-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
  ; X64-NEXT:    retq
    %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
    %2 = load <4 x float>, <4 x float>* %1, align 16
@@ -1013,12 +968,12 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
  ; X32-LABEL: pr20087:
  ; X32:       ## BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: pr20087:
  ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
  ; X64-NEXT:    retq
    %load = load <4 x float> , <4 x float> *%ptr
    %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
@@ -1026,27 +981,22 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
  }
  
  ; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
-define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
+define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
  ; X32-LABEL: insertps_pr20411:
  ; X32:       ## BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
-; X32-NEXT:    pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
-; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
  ; X32-NEXT:    movdqu %xmm1, (%eax)
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: insertps_pr20411:
  ; X64:       ## BB#0:
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
-; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
  ; X64-NEXT:    movdqu %xmm1, (%rdi)
  ; X64-NEXT:    retq
-  %gather_load = shufflevector <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle109 = shufflevector <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; 4 5 6 7
-  %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; 3 x x x
-  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 4, i32 3, i32 undef, i32 undef> ; 3 7 x x
+  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
    %ptrcast = bitcast i32* %RET to <4 x i32>*
    store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
    ret void