From 95d397cf33ff6940bfdc0d82fc2472387abc5b3d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 9 Jan 2016 20:59:39 +0000
Subject: [PATCH] [X86][AVX] Match broadcast loads through a bitcast

AVX1 v8i32/v4i64 shuffles are bitcast to v8f32/v4f64; this patch peeks
through any bitcast to check for a load node, allowing broadcasts to occur.

This is a re-commit of r257055 after r257264 fixed 32-bit broadcast loads of
i64 scalars.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@257266 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        |  9 +++++++--
 test/CodeGen/X86/2012-01-12-extract-sv.ll |  4 +---
 test/CodeGen/X86/avx-vbroadcast.ll        | 20 ++++----------------
 3 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7b97e79de5d..17ac3da53f5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -8175,6 +8175,11 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
 
   MVT BroadcastVT = VT;
 
+  // Peek through any bitcast (only useful for loads).
+  SDValue BC = V;
+  while (BC.getOpcode() == ISD::BITCAST)
+    BC = BC.getOperand(0);
+
   // Also check the simpler case, where we can directly reuse the scalar.
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
@@ -8184,14 +8189,14 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
     // Only AVX2 has register broadcasts.
     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
       return SDValue();
-  } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
+  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
     // 32-bit targets need to load i64 as a f64 and then bitcast the result.
     if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64)
       BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
 
     // If we are broadcasting a load that is only used by the shuffle
     // then we can reduce the vector load to the broadcasted scalar load.
-    LoadSDNode *Ld = cast<LoadSDNode>(V);
+    LoadSDNode *Ld = cast<LoadSDNode>(BC);
     SDValue BaseAddr = Ld->getOperand(1);
     EVT AddrVT = BaseAddr.getValueType();
     EVT SVT = BroadcastVT.getScalarType();
diff --git a/test/CodeGen/X86/2012-01-12-extract-sv.ll b/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 92ec107a007..6950641a08a 100644
--- a/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -3,9 +3,7 @@
 define void @endless_loop() {
 ; CHECK-LABEL: endless_loop:
 ; CHECK-NEXT:  # BB#0:
-; CHECK-NEXT:    vmovaps (%eax), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT:    vbroadcastss (%eax), %ymm0
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index a98a86b733e..0c92f4884fb 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -192,18 +192,12 @@ define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtabl
 ; X32-LABEL: load_splat_8i32_8i32_55555555:
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovaps (%eax), %ymm0
-; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    vbroadcastss 20(%eax), %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: load_splat_8i32_8i32_55555555:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    vmovaps (%rdi), %ymm0
-; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
 ; X64-NEXT:    retq
 entry:
   %ld = load <8 x i32>, <8 x i32>* %ptr
@@ -304,18 +298,12 @@ define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable re
 ; X32-LABEL: load_splat_4i64_4i64_2222:
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovapd (%eax), %ymm0
-; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    vbroadcastsd 16(%eax), %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: load_splat_4i64_4i64_2222:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    vmovapd (%rdi), %ymm0
-; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
 ; X64-NEXT:    retq
 entry:
   %ld = load <4 x i64>, <4 x i64>* %ptr
-- 
2.34.1
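
For context, the pattern this change targets is a one-lane splat of a loaded
integer vector on AVX1, where legalization routes the shuffle through a
float bitcast. A minimal IR sketch of that shape follows (illustrative only;
the function name is made up here, and the lane and byte offset mirror the
load_splat_8i32_8i32_55555555 test above):

  define <8 x i32> @splat_lane_5(<8 x i32>* %ptr) nounwind {
  entry:
    ; Whole-vector load whose only use is the splat shuffle below.
    %ld = load <8 x i32>, <8 x i32>* %ptr
    ; Splat lane 5 (byte offset 20) across all eight lanes.
    %splat = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
    ret <8 x i32> %splat
  }

With this patch the AVX1 lowering folds the whole sequence into a single
vbroadcastss 20(%rdi), %ymm0 (as the updated CHECK lines show), instead of a
full vector load followed by an extract/permilps/insert sequence.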