[x86] Combine x86mmx/i64 to v2i64 conversion to use scalar_to_vector

author Bruno Cardoso Lopes <bruno.cardoso@gmail.com>

Fri, 23 Jan 2015 22:44:16 +0000 (22:44 +0000)

committer Bruno Cardoso Lopes <bruno.cardoso@gmail.com>

Fri, 23 Jan 2015 22:44:16 +0000 (22:44 +0000)
author Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
Fri, 23 Jan 2015 22:44:16 +0000 (22:44 +0000)
committer Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
Fri, 23 Jan 2015 22:44:16 +0000 (22:44 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index e92a099753b37e91f5e4a8adb75b4a7c12f2a4fe..37026ce0f12ed5e93be19f5e72315c390ca69cb5 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -24757,6 +24757,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
    LoadSDNode *Ld = cast<LoadSDNode>(N);
    EVT RegVT = Ld->getValueType(0);
    EVT MemVT = Ld->getMemoryVT();
+  SDValue Ptr   = Ld->getBasePtr();
+  SDValue Chain = Ld->getChain();
    SDLoc dl(Ld);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  
@@ -24795,6 +24797,33 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      return DCI.CombineTo(N, NewVec, TF, true);
    }
  
+  // Conversion from x86mmx/i64 to v2i64 types is often done via stack
+  // store/load. Under certain conditions we can bypass the memory access and
+  // combine this load to use a scalar_to_vector instead. This reduces stack
+  // use and the redundant emission of shuffles, and creates isel matching
+  // candidates for movq2dq instructions.
+  if (RegVT == MVT::v2i64 && Subtarget->hasSSE2() && Ext == ISD::EXTLOAD &&
+      !Ld->isVolatile() && ISD::isNON_TRUNCStore(Chain.getNode())) {
+
+    // If this load reads back what the chained store wrote, get that value.
+    StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
+    EVT SrcTy = PrevST->getValue().getValueType();
+    if (PrevST->getBasePtr() != Ptr ||
+        !(SrcTy == MVT::i64 || SrcTy == MVT::x86mmx))
+      return SDValue();
+    SDValue SrcVal = Chain.getOperand(1);
+
+    // On 32-bit systems we can't save 64-bit integers, so use f64 instead.
+    bool Usef64 = TLI.isTypeLegal(MVT::f64) && !Subtarget->is64Bit();
+    if (Usef64)
+      SrcVal = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SrcVal);
+    SrcVal = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, Usef64 ? MVT::v2f64 : RegVT,
+                              SrcVal);
+
+    return DCI.CombineTo(N, Usef64 ?
+        DAG.getNode(ISD::BITCAST, dl, RegVT, SrcVal) : SrcVal, Chain);
+  }
+
    return SDValue();
  }
  
diff --git a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll

index 9eb59e41ef7de29486a8cd3795dadc9f4d8b956a..efba66be97e8a13ee55844c1cb7330f03a8a3fc3 100644 (file)
--- a/test/CodeGen/X86/2012-01-18-vbitcast.ll
+++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll
@@ -1,14 +1,15 @@
  ; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win32 | FileCheck %s
  
-;CHECK-LABEL: vcast:
+; CHECK-LABEL: vcast:
  define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
-;CHECK: pmovzxdq
-;CHECK: pmovzxdq
+; CHECK-NOT: pmovzxdq
+; CHECK-NOT: pmovzxdq
+; CHECK: movdqa (%{{.*}}),  %[[R0:xmm[0-9]+]]
    %af = bitcast <2 x float> %a to <2 x i32>
    %bf = bitcast <2 x float> %b to <2 x i32>
+; CHECK-NEXT: psubq (%{{.*}}), %[[R0]]
    %x = sub <2 x i32> %af, %bf
-;CHECK: psubq
+; CHECK: ret
    ret <2 x i32> %x
-;CHECK: ret
  }
  
diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll

index edb8433ec30ca831ea63e5fd1988c275ce726c55..5fad82497b997aaef6bcffe8d5a12711272cec6b 100644 (file)
--- a/test/CodeGen/X86/lower-bitcast.ll
+++ b/test/CodeGen/X86/lower-bitcast.ll
@@ -68,12 +68,13 @@ define i64 @test4(i64 %A) {
    %2 = bitcast <2 x i32> %add to i64
    ret i64 %2
  }
-; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd.
+; FIXME: At the moment we still produce the sequence paddd+pshufd.
  ; Ideally, we should fold that sequence into a single paddd. This is fixed with
  ; the widening legalization.
  ;
  ; CHECK-LABEL: test4
-; CHECK: pshufd
+; CHECK: movd
+; CHECK-NOT: pshufd
  ; CHECK-NEXT: paddd
  ; CHECK-NEXT: pshufd
  ; CHECK: ret
diff --git a/test/CodeGen/X86/mmx-movq2dq.ll b/test/CodeGen/X86/mmx-movq2dq.ll

new file mode 100644 (file)

index 0000000..9f46da5
--- /dev/null
+++ b/test/CodeGen/X86/mmx-movq2dq.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-64
+
+; X86-32-LABEL: test0
+; X86-64-LABEL: test0
+define i32 @test0(<1 x i64>* %v4) {
+  %v5 = load <1 x i64>* %v4, align 8
+  %v12 = bitcast <1 x i64> %v5 to <4 x i16>
+  %v13 = bitcast <4 x i16> %v12 to x86_mmx
+  ; X86-32: pshufw  $238
+  ; X86-32-NOT: movq
+  ; X86-32-NOT: movsd
+  ; X86-32: movq2dq
+  ; X86-64: pshufw  $238
+  ; X86-64-NOT: movq
+  ; X86-64-NOT: pshufd
+  ; X86-64: movq2dq
+  ; X86-64-NEXT: movd
+  %v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18)
+  %v15 = bitcast x86_mmx %v14 to <4 x i16>
+  %v16 = bitcast <4 x i16> %v15 to <1 x i64>
+  %v17 = extractelement <1 x i64> %v16, i32 0
+  %v18 = bitcast i64 %v17 to <2 x i32>
+  %v19 = extractelement <2 x i32> %v18, i32 0
+  %v20 = add i32 %v19, 32
+  ret i32 %v20
+}
+
+declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll

index c6bd96421d751a4743588b14117707b9ae198488..0d5380eb3ac13e0f5f095a4645182432eeb17e62 100644 (file)
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -78,8 +78,7 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
  ; CHECK-NEXT:    paddd    %[[R0]], %[[R1]]
  ; CHECK-NEXT:    pextrw   $4, %[[R1]], 4(%{{.*}})
  ; CHECK-NEXT:    pshufb   {{.*}}, %[[R1]]
-; CHECK-NEXT:    pmovzxdq %[[R1]], %[[R0]]
-; CHECK-NEXT:    movd     %[[R0]], (%{{.*}})
+; CHECK-NEXT:    movd     %[[R1]], (%{{.*}})
         %a = load %i16vec3* %ap, align 16
         %b = load %i16vec3* %bp, align 16
         %x = add %i16vec3 %a, %b
author	Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
	Fri, 23 Jan 2015 22:44:16 +0000 (22:44 +0000)
committer	Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
	Fri, 23 Jan 2015 22:44:16 +0000 (22:44 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/2012-01-18-vbitcast.ll		patch \| blob \| history
test/CodeGen/X86/lower-bitcast.ll		patch \| blob \| history
test/CodeGen/X86/mmx-movq2dq.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/widen_load-2.ll		patch \| blob \| history