From fd350586f5047c093ea5c7861ea0d575fc539b28 Mon Sep 17 00:00:00 2001
From: Michael Kuperstein
Date: Wed, 17 Dec 2014 12:32:17 +0000
Subject: [PATCH] [DAGCombine] Slightly improve lowering of BUILD_VECTOR into
 a shuffle.

This handles the case of a BUILD_VECTOR being constructed out of elements
extracted from a vector twice the size of the result vector. Previously,
this was always scalarized. Now, we try to construct a shuffle node that
feeds on extract_subvectors.

This fixes PR15872 and provides a partial fix for PR21711.

Differential Revision: http://reviews.llvm.org/D6678

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224429 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetLowering.h         |  9 +++
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp     | 33 +++++---
 lib/Target/X86/X86ISelLowering.cpp           |  8 ++
 lib/Target/X86/X86ISelLowering.h             |  4 +
 test/CodeGen/X86/vec_extract-avx.ll          | 82 ++++++++++++++++++++
 test/CodeGen/X86/vector-shuffle-combining.ll | 31 ++++++++
 6 files changed, 156 insertions(+), 11 deletions(-)
 create mode 100644 test/CodeGen/X86/vec_extract-avx.ll

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 1f5c1fd7157..9c018978224 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1531,6 +1531,15 @@ public:
                                  Type *Ty) const {
     return false;
   }
+
+  /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+  /// with this index. This is needed because EXTRACT_SUBVECTOR usually
+  /// has custom lowering that depends on the index of the first element,
+  /// and only the target knows which lowering is cheap.
+  virtual bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   // Runtime Library hooks
   //
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ed3d06cc920..8297e841469 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10783,9 +10783,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
       SDValue ExtVal = Extract.getOperand(1);
       unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
       if (Extract.getOperand(0) == VecIn1) {
-        if (ExtIndex > VT.getVectorNumElements())
-          return SDValue();
-
         Mask.push_back(ExtIndex);
         continue;
       }
@@ -10805,20 +10802,34 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
     if (VecIn2.getNode())
       return SDValue();

-    // We only support widening of vectors which are half the size of the
-    // output registers. For example XMM->YMM widening on X86 with AVX.
-    if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits())
-      return SDValue();
-
     // If the input vector type has a different base type to the output
     // vector type, bail out.
     if (VecIn1.getValueType().getVectorElementType() !=
         VT.getVectorElementType())
       return SDValue();

-    // Widen the input vector by adding undef values.
-    VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
-                         VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
+    // If the input vector is too small, widen it.
+    // We only support widening of vectors which are half the size of the
+    // output registers. For example XMM->YMM widening on X86 with AVX.
+    EVT VecInT = VecIn1.getValueType();
+    if (VecInT.getSizeInBits() * 2 == VT.getSizeInBits()) {
+      // Widen the input vector by adding undef values.
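+      // E.g. with AVX, a 128-bit v4f32 input is concatenated with undef to
+      // form the 256-bit v8f32 the shuffle expects; the mask entries
+      // collected above index only the low half, so they remain valid.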
+      VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                           VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
+    } else if (VecInT.getSizeInBits() == VT.getSizeInBits() * 2) {
+      // If the input vector is too large, try to split it.
+      if (!TLI.isExtractSubvectorCheap(VT, VT.getVectorNumElements()))
+        return SDValue();
+
+      // Try to replace VecIn1 with two extract_subvectors.
+      // No need to update the masks; they should still be correct.
+      VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+        DAG.getConstant(VT.getVectorNumElements(), TLI.getVectorIdxTy()));
+      VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+        DAG.getConstant(0, TLI.getVectorIdxTy()));
+      UsesZeroVector = false;
+    } else
+      return SDValue();
   }

   if (UsesZeroVector)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2678ca645fd..6a6b20e81d9 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3851,6 +3851,14 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   return true;
 }

+bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
+                                                unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
 /// the specified range (L, H].
 static bool isUndefOrInRange(int Val, int Low, int Hi) {
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index b793171e2c5..c5e3e14c545 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -791,6 +791,10 @@ namespace llvm {
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                            Type *Ty) const override;

+    /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+    /// with this index.
+    bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+
     /// Intel processors have a unified instruction and data cache
     const char * getClearCacheBuiltinName() const override {
       return nullptr; // nothing to do, move along.
diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll
new file mode 100644
index 00000000000..e5e79375fff
--- /dev/null
+++ b/test/CodeGen/X86/vec_extract-avx.ll
@@ -0,0 +1,82 @@
+target triple = "x86_64-unknown-unknown"
+
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
+
+; When extracting multiple consecutive elements from a larger
+; vector into a smaller one, do it efficiently. We should use
+; an EXTRACT_SUBVECTOR node internally rather than a bunch of
+; single element extractions.
+
+; Extracting the low elements only requires using the right kind of store.
+define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
+  %ext0 = extractelement <8 x float> %v, i32 0
+  %ext1 = extractelement <8 x float> %v, i32 1
+  %ext2 = extractelement <8 x float> %v, i32 2
+  %ext3 = extractelement <8 x float> %v, i32 3
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+  store <4 x float> %ins3, <4 x float>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: low_v8f32_to_v4f32
+; CHECK: vmovaps
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Extracting the high elements requires just one AVX instruction.
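+; (vextractf128 with an immediate of 1 copies the upper 128-bit lane of the
+; YMM source into an XMM register, so no per-element extracts are needed.)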
+define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
+  %ext0 = extractelement <8 x float> %v, i32 4
+  %ext1 = extractelement <8 x float> %v, i32 5
+  %ext2 = extractelement <8 x float> %v, i32 6
+  %ext3 = extractelement <8 x float> %v, i32 7
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+  store <4 x float> %ins3, <4 x float>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v8f32_to_v4f32
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Make sure the element type doesn't alter the codegen. Note that
+; if we were actually using the vector in this function and
+; had AVX2, we should generate vextracti128 (the integer version).
+define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
+  %ext0 = extractelement <8 x i32> %v, i32 4
+  %ext1 = extractelement <8 x i32> %v, i32 5
+  %ext2 = extractelement <8 x i32> %v, i32 6
+  %ext3 = extractelement <8 x i32> %v, i32 7
+  %ins0 = insertelement <4 x i32> undef, i32 %ext0, i32 0
+  %ins1 = insertelement <4 x i32> %ins0, i32 %ext1, i32 1
+  %ins2 = insertelement <4 x i32> %ins1, i32 %ext2, i32 2
+  %ins3 = insertelement <4 x i32> %ins2, i32 %ext3, i32 3
+  store <4 x i32> %ins3, <4 x i32>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v8i32_to_v4i32
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Make sure that element size doesn't alter the codegen.
+define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
+  %ext0 = extractelement <4 x double> %v, i32 2
+  %ext1 = extractelement <4 x double> %v, i32 3
+  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+  store <2 x double> %ins1, <2 x double>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v4f64_to_v2f64
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 5f30891365d..e7bae3415bf 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1552,6 +1552,37 @@ define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
   ret <4 x i32> %2
 }

+define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
+; SSE-LABEL: combine_test21:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    movdqa %xmm2,
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_test21:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    movdqa %xmm2,
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test21:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-NEXT:    movdqa %xmm2,
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  store <4 x i32> %1, <4 x i32>* %ptr, align 16
+  ret <4 x i32> %2
+}
+
 ; Check some negative cases.
 ; FIXME: Do any of these really make sense? Are they redundant with the above tests?
-- 
2.34.1
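A note for other targets: the base TargetLowering hook returns false, so this
combine stays disabled until a target opts in. Below is a minimal sketch of an
override, assuming a hypothetical MyTargetLowering subclass for a target whose
subvector extracts are only free at the half-vector boundaries; it simply
mirrors the X86 override added above and is not part of this patch.

// Hypothetical sketch; MyTargetLowering is a placeholder, not part of this patch.
bool MyTargetLowering::isExtractSubvectorCheap(EVT ResVT,
                                               unsigned Index) const {
  // An extract that does not lower to something legal or custom for this
  // result type cannot be cheap.
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  // Only the low half (Index == 0) and the high half
  // (Index == ResVT.getVectorNumElements()) map to cheap lane accesses.
  return Index == 0 || Index == ResVT.getVectorNumElements();
}

With such an override in place, the new DAGCombiner path can split a
double-width input into two EXTRACT_SUBVECTORs and emit a single shuffle
instead of scalarizing the BUILD_VECTOR.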