From 5b93ab6cde250b3c6470cf49daa28e54848a86c5 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Thu, 2 Apr 2015 20:21:52 +0000
Subject: [PATCH] [AVX] Improve insertion of i8 or i16 into low element of 256-bit zero vector

Without this patch, we split the 256-bit vector into halves and produced something like:
    movzwl (%rdi), %eax
    vmovd %eax, %xmm0
    vxorps %xmm1, %xmm1, %xmm1
    vblendps $15, %ymm0, %ymm1, %ymm0 ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]

Now, we eliminate the xor and blend because those zeros are free with the vmovd:
    movzwl (%rdi), %eax
    vmovd %eax, %xmm0

This should be the final fix needed to resolve PR22685:
https://llvm.org/bugs/show_bug.cgi?id=22685

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@233941 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 16 +++++++++++++---
 test/CodeGen/X86/vector-shuffle-256-v16.ll | 12 ++++++++++++
 test/CodeGen/X86/vector-shuffle-256-v32.ll |  2 --
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 966aec0df96..de81949e0cc 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5679,14 +5679,24 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
       }
 
+      // We can't directly insert an i8 or i16 into a vector, so zero extend
+      // it to i32 first.
       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
-        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
         if (VT.is256BitVector()) {
-          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
-          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+          if (Subtarget->hasAVX()) {
+            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item);
+            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+          } else {
+            // Without AVX, we need to extend to a 128-bit vector and then
+            // insert into the 256-bit vector.
+            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+            SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
+            Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+          }
         } else {
           assert(VT.is128BitVector() && "Expected an SSE value type!");
+          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
         }
         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index aad37022d27..df4994da693 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3249,3 +3249,15 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
   ret <16 x i16> %shuffle
 }
+
+define <16 x i16> @insert_v16i16_0elt_into_zero_vector(i16* %ptr) {
+; ALL-LABEL: insert_v16i16_0elt_into_zero_vector:
+; ALL: # BB#0:
+; ALL-NEXT: movzwl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: retq
+  %val = load i16, i16* %ptr
+  %i0 = insertelement <16 x i16> zeroinitializer, i16 %val, i32 0
+  ret <16 x i16> %i0
+}
+
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index f9f4b96be3c..a0f43de7563 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -656,8 +656,6 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT: movl $15, %eax
 ; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpblendd $15, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
-- 
2.34.1