From: Jim Grosbach <grosbach@apple.com>
Date: Wed, 9 Apr 2014 23:39:25 +0000 (+0000)
Subject: Add support for load folding of avx1 logical instructions
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=afb4ef3549c25bb3d4808be484aa0163bd829492;p=oota-llvm.git

Add support for load folding of avx1 logical instructions

AVX supports logical operations using an operand from memory. Unfortunately,
because integer operations were not added until AVX2, the AVX1 logical
operations' types were preventing isel from folding the loads. In a limited
number of cases the peephole optimizer would fold the loads, but most were
missed. This patch adds explicit patterns with appropriate casts in order for
these loads to be folded.

The included test cases run on reduced examples and disable the peephole
optimizer to ensure the folds are being pattern matched.

Patch by Louis Gerbarg <lgg@apple.com>

rdar://16355124

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205938 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f2f39679447..72a18e7d794 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2973,6 +2973,19 @@ defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
 let isCommutable = 0 in
   defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
 
+// AVX1 lacks 256-bit integer logic ops: match the integer node under a v8f32
+// bitcast to the packed-single form so the v4i64 load folds into the operand.
+let Predicates = [HasAVX1Only] in {
+  def : Pat<(bc_v8f32 (and VR256:$lhs, (loadv4i64 addr:$mem))),
+            (VANDPSYrm VR256:$lhs, addr:$mem)>;
+  def : Pat<(bc_v8f32 (or VR256:$lhs, (loadv4i64 addr:$mem))),
+            (VORPSYrm VR256:$lhs, addr:$mem)>;
+  def : Pat<(bc_v8f32 (xor VR256:$lhs, (loadv4i64 addr:$mem))),
+            (VXORPSYrm VR256:$lhs, addr:$mem)>;
+  def : Pat<(bc_v8f32 (X86andnp VR256:$lhs, (loadv4i64 addr:$mem))),
+            (VANDNPSYrm VR256:$lhs, addr:$mem)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Arithmetic Instructions
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/avx1-logical-load-folding.ll b/test/CodeGen/X86/avx1-logical-load-folding.ll
new file mode 100644
index 00000000000..56ef55309b9
--- /dev/null
+++ b/test/CodeGen/X86/avx1-logical-load-folding.ll
@@ -0,0 +1,60 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
+; Pin an AVX1-only CPU explicitly rather than relying on llc's default CPU.
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; test1: 'and' with a constant mask; the mask load must fold into vandps.
+define void @test1(float* %A, float* %C) #0 {
+  %vecptr = bitcast float* %A to <8 x float>*
+  %fvec = load <8 x float>* %vecptr, align 32
+  %ivec = bitcast <8 x float> %fvec to <8 x i32>
+  %masked = and <8 x i32> %ivec, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+  %fres = bitcast <8 x i32> %masked to <8 x float>
+  %lane0 = extractelement <8 x float> %fres, i32 0
+  store float %lane0, float* %C
+  ret void
+
+  ; CHECK: vandps LCPI0_0(%rip), %ymm0, %ymm0
+}
+
+; test2: 'or' with a constant vector; the constant load must fold into vorps.
+define void @test2(float* %A, float* %C) #0 {
+  %vecptr = bitcast float* %A to <8 x float>*
+  %fvec = load <8 x float>* %vecptr, align 32
+  %ivec = bitcast <8 x float> %fvec to <8 x i32>
+  %ored = or <8 x i32> %ivec, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+  %fres = bitcast <8 x i32> %ored to <8 x float>
+  %lane0 = extractelement <8 x float> %fres, i32 0
+  store float %lane0, float* %C
+  ret void
+
+  ; CHECK: vorps LCPI1_0(%rip), %ymm0, %ymm0
+}
+
+; test3: 'xor' with a constant vector; the constant load must fold into vxorps.
+define void @test3(float* %A, float* %C) #0 {
+  %vecptr = bitcast float* %A to <8 x float>*
+  %fvec = load <8 x float>* %vecptr, align 32
+  %ivec = bitcast <8 x float> %fvec to <8 x i32>
+  %xored = xor <8 x i32> %ivec, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+  %fres = bitcast <8 x i32> %xored to <8 x float>
+  %lane0 = extractelement <8 x float> %fres, i32 0
+  store float %lane0, float* %C
+  ret void
+
+  ; CHECK: vxorps LCPI2_0(%rip), %ymm0, %ymm0
+}
+
+define void @test4(float* %A, float* %C) #0 {
+  %vecptr = bitcast float* %A to <8 x float>*
+  %fvec = load <8 x float>* %vecptr, align 32
+  %ivec = bitcast <8 x float> %fvec to <8 x i32>
+  %inverted = xor <8 x i32> %ivec, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %masked = and <8 x i32> %inverted, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+  %fres = bitcast <8 x i32> %masked to <8 x float>
+  %lane0 = extractelement <8 x float> %fres, i32 0
+  store float %lane0, float* %C
+  ret void
+  ; not-then-and must select vandnps with the mask load folded in.
+  ; CHECK: vandnps LCPI3_0(%rip), %ymm0, %ymm0
+}