+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
ret <8 x float> %shuffle
}
;; Test zero mask generation.
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
;; Prefer xor+vblendpd over vperm2f128 because the xor+blend sequence has better performance.
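;; For reference, the vperm2f128 imm8 encoding: bits [1:0] pick the 128-bit
;; source lane for the result's low half, bits [5:4] pick it for the high
;; half, and bits 3 and 7 zero the low/high half respectively. A blend can
;; only replace the shuffle when each surviving lane keeps its position;
;; cross-lane moves (as in the 0x08 and 0x28 cases below) still need vperm2f128.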
define <4 x double> @vperm2z_0x08(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x08:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x double> %s
}
define <4 x double> @vperm2z_0x18(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x18:
-; ALL: # BB#0:
+; ALL: ## BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
ret <4 x double> %s
}
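;; In vperm2z_0x18 above, the surviving lane (%a's high half) stays in place,
;; so a zeroing xor plus an in-lane blend suffices; in vperm2z_0x08 the low
;; half of %a moves to the high half of the result, a cross-lane move that
;; still requires vperm2f128.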
define <4 x double> @vperm2z_0x28(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x28:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x double> %s
}
define <4 x double> @vperm2z_0x38(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x38:
-; ALL: # BB#0:
+; ALL: ## BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
ret <4 x double> %s
}
define <4 x double> @vperm2z_0x80(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x80:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0
+; ALL: ## BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x double> %s
}
define <4 x double> @vperm2z_0x81(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x81:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x double> %s
}
define <4 x double> @vperm2z_0x82(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x82:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0
+; ALL: ## BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x double> %s
}
define <4 x double> @vperm2z_0x83(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x83:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
ret <4 x double> %s
}
;; With AVX2, select the integer version of the instruction. Use an add to force integer-domain selection.
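;; AVX1 has no 256-bit integer arithmetic, so the vpaddq below is expected to
;; be split into two 128-bit halves (vextractf128/vpaddq/vinsertf128), while
;; AVX2 can use a single 256-bit vpaddq and the integer-domain vperm2i128.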
define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
-; ALL-LABEL: vperm2z_int_0x83:
-; ALL: # BB#0:
-; AVX1: vperm2f128 $129, %ymm0, %ymm0, %ymm0
-; AVX2: vperm2i128 $129, %ymm0, %ymm0, %ymm0
+; AVX1-LABEL: vperm2z_int_0x83:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vperm2z_int_0x83:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
%s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
%c = add <4 x i64> %b, %s
ret <4 x i64> %c
}