Fixes PR23464: one way to use the broadcast intrinsics is:
_mm256_broadcastw_epi16(_mm_cvtsi32_si128(*(int*)src));
We don't currently fold this, but now that we use native IR for
the intrinsics (r245605), we can look through one bitcast to find
the broadcast scalar.
Differential Revision: http://reviews.llvm.org/D10557
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245613
91177308-0d34-0410-b5e6-
96231b3b80d8
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
+ // First, look through bitcast: if the original value has a larger element
+ // type than the shuffle, the broadcast element is in essence truncated.
+ // Make that explicit to ease folding.
+ if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) {
+ EVT EltVT = VT.getVectorElementType();
+ SDValue V0 = V.getOperand(0);
+ EVT V0VT = V0.getValueType();
+
+ if (V0VT.isInteger() && V0VT.getVectorElementType().bitsGT(EltVT) &&
+ ((V0.getOpcode() == ISD::BUILD_VECTOR ||
+ (V0.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)))) {
+ V = DAG.getNode(ISD::TRUNCATE, DL, EltVT, V0.getOperand(BroadcastIdx));
+ BroadcastIdx = 0;
+ }
+ }
+
+ // Also check the simpler case, where we can directly reuse the scalar.
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, v2i64, v4i64>;
let Predicates = [HasAVX2] in {
+ // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+ // This means we'll encounter truncated i32 loads; match that here.
+ def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWYrm addr:$src)>;
+
// Provide aliases for broadcast from the same register class that
// automatically does the extract.
def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
;
; AVX2-LABEL: insert_dup_mem_v16i8_i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
;
; AVX2-LABEL: insert_dup_mem_v16i8_sext_i8:
; AVX2: # BB#0:
-; AVX2-NEXT: movsbl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32
;
; AVX2-LABEL: insert_dup_mem_v8i16_i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
;
; AVX2-LABEL: insert_dup_mem_v16i16_i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
;
; AVX2-LABEL: insert_dup_mem_v32i8_i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX2-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
;
; AVX2-LABEL: insert_dup_mem_v32i8_sext_i8:
; AVX2: # BB#0:
-; AVX2-NEXT: movsbl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX2-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32