I suggested this change in D7898 (http://llvm.org/viewvc/llvm-project?view=revision&revision=231354)
It improves the v4i64 case, although not optimally. This AVX codegen:
vmovq {{.*#+}} xmm0 = mem[0],zero
vxorpd %ymm1, %ymm1, %ymm1
vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
Becomes:
vmovsd {{.*#+}} xmm0 = mem[0],zero
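The IR pattern in question inserts a scalar into element 0 and zeroes the remaining elements, as in the insert_mem_and_zero_v4i64 test updated below:

define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
  %a = load i64, i64* %ptr
  %v = insertelement <4 x i64> undef, i64 %a, i64 0
  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i64> %shuffle
}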
Unfortunately, this doesn't completely solve PR22685. There are still at least two underlying problems:
1. We're not handling v32i8 / v16i16 (see the sketch after this list).
2. We're not getting the FP / int domains right for instruction selection (e.g., the AVX2 v4i64 case below now uses the FP-domain vmovsd where an integer-domain vmovq load would do).
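As an illustration of the first problem, a reduced case along these lines (hypothetical; the function name and exact mask are mine, not taken from the test suite) still misses the single-instruction lowering for the integer word type:

define <16 x i16> @insert_and_zero_v16i16(i16 %t) {
  ; Insert %t into element 0; all other elements are zero.
  %v = insertelement <16 x i16> undef, i16 %t, i32 0
  %shuffle = shufflevector <16 x i16> %v, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %shuffle
}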
But since this patch alone appears to do no harm, reduces code duplication, and helps the v4i64 case, I'm submitting it ahead of fixing the problems above.
Differential Revision: http://reviews.llvm.org/D8341
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@233704 91177308-0d34-0410-b5e6-96231b3b80d8
  if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
-  // If we have a single input to the zero element, insert that into V1 if we
-  // can do so cheaply.
-  int NumV2Elements =
-      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
-  if (NumV2Elements == 1 && Mask[0] >= 4)
-    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
-      return Insertion;
-
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
-  // If we have a single input to the zero element, insert that into V1 if we
-  // can do so cheaply.
-  int NumV2Elements =
-      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 8; });
-  if (NumV2Elements == 1 && Mask[0] >= 8)
-    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
-      return Insertion;
-
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
+  // If we have a single input to the zero element, insert that into V1 if we
+  // can do so cheaply.
+  int NumElts = VT.getVectorNumElements();
+  int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
+    return M >= NumElts;
+  });
+
+  if (NumV2Elements == 1 && Mask[0] >= NumElts)
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            DL, VT, V1, V2, Mask, Subtarget, DAG))
+      return Insertion;
+
  // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
  // check for those subtargets here and avoid much of the subtarget querying in
  // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
target triple = "i686-pc-win32"
-;CHECK-LABEL: bad_cast:
+; CHECK-LABEL: bad_cast:
define void @bad_cast() {
entry:
%vext.i = shufflevector <2 x i64> undef, <2 x i64> undef, <3 x i32> <i32 0, i32 1, i32 undef>
%vecinit8.i = shufflevector <3 x i64> zeroinitializer, <3 x i64> %vext.i, <3 x i32> <i32 0, i32 3, i32 4>
store <3 x i64> %vecinit8.i, <3 x i64>* undef, align 32
-;CHECK: ret
+; CHECK: ret
ret void
}
-;CHECK-LABEL: bad_insert:
+; CHECK-LABEL: bad_insert:
define void @bad_insert(i32 %t) {
entry:
-;CHECK: vxorps %ymm1, %ymm1, %ymm1
-;CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; CHECK: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovaps %ymm0
+; CHECK: ret
+
%v2 = insertelement <8 x i32> zeroinitializer, i32 %t, i32 0
store <8 x i32> %v2, <8 x i32> addrspace(1)* undef, align 32
-;CHECK: ret
ret void
}
; AVX1-LABEL: insert_reg_and_zero_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovq %rdi, %xmm0
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_reg_and_zero_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovq %rdi, %xmm0
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-NEXT: retq
%v = insertelement <4 x i64> undef, i64 %a, i64 0
%shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
; AVX1-LABEL: insert_mem_and_zero_v4i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_mem_and_zero_v4i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <4 x i64> undef, i64 %a, i64 0
; AVX2: # BB#0:
; AVX2-NEXT: movl $7, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; AVX2-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
; AVX2: # BB#0:
; AVX2-NEXT: movl $7, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; AVX2-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>