return Zeroable;
}
-/// \brief Try to lower a vector shuffle as a zero extension.
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
-/// This tries to use the SSE4.1 PMOVZX instruction family to lower a vector
-/// shuffle throuh a zero extension. It doesn't check for the availability or
-/// profitability of this lowering though, it tries to aggressively match this
-/// pattern. It handles both blends with all-zero inputs to explicitly
-/// zero-extend and undef-lanes (sometimes undef due to masking out later).
-static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+/// This routine will try to do everything in its power to cleverly lower
+/// a shuffle which happens to match the pattern of a zero extend. It doesn't
+/// check for the profitability of this lowering, it tries to aggressively
+/// match this pattern. It will use all of the micro-architectural details it
+/// can to emit an efficient lowering. It handles both blends with all-zero
+/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
+/// masking out later).
+///
+/// The reason we have dedicated lowering for zext-style shuffles is that they
+/// are both incredibly common and often quite performance sensitive.
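+///
+/// For example, with a zero vector as V2 the v4i32 mask <0, 4, 1, 5> selects
+/// <V1[0], 0, V1[1], 0>, which bitcast to v2i64 is exactly
+/// 'zext <2 x i32> to <2 x i64>'.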
+static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Bits = VT.getSizeInBits();
  // Define a helper to try lowering with a particular stride, returning the
  // lowered value if valid.
auto LowerWithStride = [&](int Stride) -> SDValue {
SDValue InputV;
+ bool AnyExt = true;
for (int i = 0; i < NumElements; ++i) {
if (Mask[i] == -1)
continue; // Valid anywhere but doesn't tell us anything.
// Each of the extend elements needs to be zeroable.
if (!Zeroable[i])
return SDValue();
- else
- continue;
+
+ // We are no longer in the anyext case.
+ AnyExt = false;
+ continue;
}
    // Each of the base elements needs to be consecutive indices into the
    // same input vector.
if (!InputV)
return SDValue();
- // Found a valid lowering! Compute all the types and the operation. We force
- // everything to integer types here as that's the only way zext makes sense.
- MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
- MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Stride),
- NumElements / Stride);
-
- InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+ // Found a valid zext mask! Try various lowering strategies based on the
+ // input type and available ISA extensions.
+ if (Subtarget->hasSSE41()) {
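+ // With SSE4.1, the PMOVZX* family (modeled here as X86ISD::VZEXT) performs
+ // the whole zero-extension in a single instruction.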
+ MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Stride),
+ NumElements / Stride);
+ InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+ }
+
+ // For any-extends we can cheat for larger element sizes and use shuffle
+ // instructions that can fold with a load and/or copy.
+ if (AnyExt && EltBits == 32) {
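+ // PSHUFD selects a source dword for each destination dword, so the mask
+ // {0, -1, 1, -1} places dword 0 in lane 0 and dword 1 in lane 2, leaving
+ // the odd lanes undef: exactly a 32->64-bit any-extend.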
+ int PSHUFDMask[4] = {0, -1, 1, -1};
+ return DAG.getNode(
+ ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+ }
+ if (AnyExt && EltBits == 16 && Stride > 2) {
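+ // Broadcast the low dword so elements 0 and 1 appear in both halves, then
+ // use PSHUFHW to pull element 1 down into word 4: word 0 holds element 0
+ // and word 4 holds element 1, a 16->64-bit any-extend in two shuffles.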
+ int PSHUFDMask[4] = {0, -1, 0, -1};
+ InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
+ int PSHUFHWMask[4] = {1, -1, -1, -1};
+ return DAG.getNode(
+ ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
+ }
+
+ // If this would require more than 2 unpack instructions to expand, use
+ // pshufb when available. We can only use more than 2 unpack instructions
+ // when zero-extending i8 elements, which also makes it easier to use
+ // pshufb.
+ if (Stride > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
+ assert(NumElements == 16 && "Unexpected byte vector width!");
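+ // PSHUFB zeroes any output byte whose mask byte has the high bit set
+ // (0x80), so byte i of the result is source byte i/Stride when i is a
+ // multiple of Stride and zero otherwise.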
+ SDValue PSHUFBMask[16];
+ for (int i = 0; i < 16; ++i)
+ PSHUFBMask[i] =
+ DAG.getConstant((i % Stride == 0) ? i / Stride : 0x80, MVT::i8);
+ InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+ DAG.getNode(ISD::BUILD_VECTOR, DL,
+ MVT::v16i8, PSHUFBMask)));
+ }
+
+ // Otherwise emit a sequence of unpacks.
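+ // Each unpcklbw/wd/dq against a zero (or undef, for an any-extend) vector
+ // interleaves the low elements with zeros, doubling the element width each
+ // time, so log2(Stride) unpacks reach the target width.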
+ do {
+ MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+ SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
+ : getZeroVector(InputVT, Subtarget, DAG, DL);
+ InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+ InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
+ Stride /= 2;
+ EltBits *= 2;
+ NumElements /= 2;
+ } while (Stride > 1);
+ return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
};
// The widest stride possible for zero extending is to a 64-bit integer.
return V;
}
- // No viable zext lowering found.
+ // No viable ext lowering found.
return SDValue();
}
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
- if (Subtarget->hasSSE41())
- if (SDValue ZExt =
- lowerVectorShuffleAsZeroExtend(DL, MVT::v4i32, V1, V2, Mask, DAG))
- return ZExt;
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
- if (Subtarget->hasSSE41())
- if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v8i16, V1, V2,
- OrigMask, DAG))
- return ZExt;
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
+ return ZExt;
auto isV1 = [](int M) { return M >= 0 && M < 8; };
auto isV2 = [](int M) { return M >= 8; };
return Rotate;
// Try to use a zext lowering.
- if (Subtarget->hasSSE41())
- if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v16i8, V1, V2,
- OrigMask, DAG))
- return ZExt;
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ return ZExt;
int MaskStorage[16] = {
OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
; SSE2-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},1,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
}
define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
+; SSE2-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3],xmm0[4],[[X]][4],xmm0[5],[[X]][5],xmm0[6],[[X]][6],xmm0[7],[[X]][7]
+; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
+; SSE2-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSSE3-NEXT: pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSSE3-NEXT: pshufb {{.*}} # [[X2]] = zero,[[X2]][2,4,6],zero,[[X2]][10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: por %xmm0, %[[X2]]
-; SSSE3-NEXT: punpcklbw {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3],[[X2]][4],[[X1]][4],[[X2]][5],[[X1]][5],[[X2]][6],[[X1]][6],[[X2]][7],[[X1]][7]
-; SSSE3-NEXT: movdqa %[[X2]], %xmm0
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
; SSE2-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
; SSE2: # BB#0:
-; SSE2-NEXT: pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
-; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSE2-NEXT: pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3],xmm0[4],[[X]][4],xmm0[5],[[X]][5],xmm0[6],[[X]][6],xmm0[7],[[X]][7]
+; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %[[X1:xmm[0-9]+]], %[[X1]]
; SSSE3-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
-; SSSE3-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
; SSE2-LABEL: @shuffle_v4i32_0z1z
; SSE2: # BB#0:
-; SSE2-NEXT: xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: @shuffle_v4i32_0z1z
; SSE3: # BB#0:
-; SSE3-NEXT: xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE3-NEXT: pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE3-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v4i32_0z1z
; SSSE3: # BB#0:
-; SSSE3-NEXT: xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSSE3-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v4i32_0z1z
; SSE2-LABEL: @shuffle_v8i16_0zzz1zzz
; SSE2: # BB#0:
; SSE2-NEXT: pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSE2-NEXT: pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSE2-NEXT: punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*}} # xmm0 = [[X2]][0,3,2,1]
-; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
+; SSE2-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_0zzz1zzz
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSSE3-NEXT: pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSSE3-NEXT: punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
-; SSSE3-NEXT: pshufb {{.*}} # [[X2]] = [[X2]][2,3,8,9,6,7,0,1,8,9,6,7,4,5,6,7]
-; SSSE3-NEXT: punpcklwd {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3]
-; SSSE3-NEXT: movdqa %[[X2]], %xmm0
+; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
+; SSSE3-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v8i16_0zzz1zzz
; CHECK: movd (%{{.*}}), %[[X:xmm[0-9]+]]
; CHECK-NEXT: pxor %[[Z:xmm[0-9]+]], %[[Z]]
; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
-; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
+; CHECK-NEXT: punpcklwd %[[Z]], %[[X]]
; CHECK-NEXT: ret
%val = load <4 x i8>* %ptr