const unsigned Scale = V0EltSize / EltSize;
const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
- // If we're extracting non-least-significant bits, this isn't a truncation.
- if (BroadcastIdx % Scale)
- return SDValue();
-
if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
V0Opc != ISD::BUILD_VECTOR)
return SDValue();
SDValue Scalar = V0.getOperand(V0BroadcastIdx);
+
+ // If we're extracting non-least-significant bits, shift so we can truncate.
+ // Hopefully, we can fold away the trunc/srl/load into the broadcast.
+ // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
+ // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
+ if (const int OffsetIdx = BroadcastIdx % Scale)
+ Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
+ DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
+
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
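
For reference, a minimal standalone sketch of the index arithmetic the hunk above relies on. This is not LLVM code; the constants are illustrative (broadcasting byte 1 of an i32 source element, i.e. V0EltSize = 32, EltSize = 8), and it performs the srl/trunc on a plain integer to show why shifting by OffsetIdx * EltSize recovers the requested lane:

#include <cassert>
#include <cstdint>

int main() {
  // Illustrative values: broadcast element 1 of a <16 x i8> that was
  // bitcast from a <4 x i32>.
  const unsigned V0EltSize = 32, EltSize = 8;
  const unsigned BroadcastIdx = 1;

  const unsigned Scale = V0EltSize / EltSize;           // 4 i8 lanes per i32
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale; // source i32 lane: 0
  const unsigned OffsetIdx = BroadcastIdx % Scale;      // byte within it: 1

  // The srl + truncate emitted above, done on a scalar integer.
  const uint32_t Scalar = 0xDDCCBBAA;                   // the loaded i32
  const uint8_t Splat = uint8_t(Scalar >> (OffsetIdx * EltSize));

  assert(V0BroadcastIdx == 0 && OffsetIdx == 1);
  assert(Splat == 0xBB);                                // byte 1 of the i32
  return 0;
}

When the scalar comes straight from memory, the srl/trunc/load typically folds into a narrower load, which is why the AVX2 checks below turn into a single vpbroadcastb from an offset address; in the sext cases the load cannot be narrowed, so the shift stays in a GPR before the vmovd + vpbroadcast.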
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_dup_elt1_mem_v16i8_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_dup_elt1_mem_v16i8_i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt1_mem_v16i8_i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb 1(%rdi), %xmm0
+; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_dup_elt2_mem_v16i8_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_dup_elt2_mem_v16i8_i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt2_mem_v16i8_i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb 2(%rdi), %xmm0
+; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
-; AVX: # BB#0:
-; AVX-NEXT: movsbl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: movsbl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: movsbl (%rdi), %eax
+; AVX2-NEXT: shrl $8, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
-; AVX: # BB#0:
-; AVX-NEXT: movsbl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: movsbl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: movsbl (%rdi), %eax
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_dup_elt1_mem_v8i16_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt1_mem_v8i16_i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movzwl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_dup_elt3_mem_v8i16_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt3_mem_v8i16_i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movzwl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
-; AVX: # BB#0:
-; AVX-NEXT: movswl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
%tmp1 = sext i16 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
-; AVX: # BB#0:
-; AVX-NEXT: movswl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
%tmp1 = sext i16 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 1
;
; AVX2-LABEL: insert_dup_elt1_mem_v16i16_i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: movzwl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
;
; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: movzwl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1
;
; AVX2-LABEL: insert_dup_elt1_mem_v32i8_i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb 1(%rdi), %ymm0
; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
;
; AVX2-LABEL: insert_dup_elt3_mem_v32i8_i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb 3(%rdi), %ymm0
; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
; AVX2-LABEL: insert_dup_elt1_mem_v32i8_sext_i8:
; AVX2: # BB#0:
; AVX2-NEXT: movsbl (%rdi), %eax
+; AVX2-NEXT: shrl $8, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32