From 29c353b9c3d8094b9ac7cbbc23cfc8d23a010db4 Mon Sep 17 00:00:00 2001
From: Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
Date: Wed, 1 Sep 2010 22:33:20 +0000
Subject: [PATCH] Using target-specific nodes for shuffle nodes makes the mask
 check stricter, breaking some cases not checked in the testsuite, but it also
 exposes some foldings not done before, as in this example:

  movaps  (%rdi), %xmm0
  movaps  (%rax), %xmm1
  movaps  %xmm0, %xmm2
  movss %xmm1, %xmm2
  shufps  $36, %xmm2, %xmm0

is now generated as:

  movaps  (%rdi), %xmm0
  movaps  %xmm0, %xmm1
  movlps  (%rax), %xmm1
  shufps  $36, %xmm1, %xmm0



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@112753 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrSSE.td      |  3 +++
 test/CodeGen/X86/vec_shuffle-37.ll | 14 ++++++++++++++
 2 files changed, 17 insertions(+)
 create mode 100644 test/CodeGen/X86/vec_shuffle-37.ll
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 8a1786272fd..9dfcbc48365 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5909,6 +5909,9 @@ def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
 def : Pat<(X86Movlps VR128:$src1,
                     (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
           (MOVLPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86Movlps VR128:$src1,  // match X86Movlps whose first operand is a VR128 register
+                    (bc_v4i32 (v2i64 (load addr:$src2)))),  // second operand: a v2i64 load bitcast to v4i32
+          (MOVLPSrm VR128:$src1, addr:$src2)>;  // emit MOVLPSrm, using the load's address as the memory operand
 
 // Shuffle with MOVLPD
 def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll
new file mode 100644
index 00000000000..1ed858de64e
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuffle-37.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp {  ; %a0 points to the first shuffle operand
+entry:
+; CHECK: movaps  (%rdi), %xmm0
+; CHECK-NEXT: movaps  %xmm0, %xmm1
+; CHECK-NEXT: movlps  (%rax), %xmm1
+; CHECK-NEXT: shufps  $36, %xmm1, %xmm0
+  %0 = load <4 x i32>* undef, align 16  ; second shuffle operand, loaded through an undef pointer
+  %1 = load <4 x i32>* %a0, align 16  ; first shuffle operand, loaded from %a0
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>  ; elements 0-2 of %1 followed by element 0 of %0
+  ret <4 x i32> %2
+}
+
-- 
2.34.1