- case MVT::v4i64:
- case MVT::v8i32:
- assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
- // FALLTHROUGH
- case MVT::v2i64:
- case MVT::v4i32:
- // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
- // that instruction.
- if (Subtarget->hasAVX2()) {
- // Scale the blend by the number of 32-bit dwords per element.
- int Scale = VT.getScalarSizeInBits() / 32;
- BlendMask = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] >= Size)
- for (int j = 0; j < Scale; ++j)
- BlendMask |= 1u << (i * Scale + j);
-
- MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
- V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8)));
- }
- // FALLTHROUGH
- case MVT::v8i16: {
- // For integer shuffles we need to expand the mask and cast the inputs to
- // v8i16s prior to blending.
- int Scale = 8 / VT.getVectorNumElements();
- BlendMask = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] >= Size)
- for (int j = 0; j < Scale; ++j)
- BlendMask |= 1u << (i * Scale + j);
-
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8)));
- }
-
- case MVT::v16i16: {
- assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
- SmallVector<int, 8> RepeatedMask;
- if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
- // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
- assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
- BlendMask = 0;
- for (int i = 0; i < 8; ++i)
- if (RepeatedMask[i] >= 16)
- BlendMask |= 1u << i;
- return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8));
- }
- }
- // FALLTHROUGH
- case MVT::v32i8: {
- assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
- // Scale the blend by the number of bytes per element.
- int Scale = VT.getScalarSizeInBits() / 8;
- assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
-
- // Compute the VSELECT mask. Note that VSELECT is really confusing in the
- // mix of LLVM's code generator and the x86 backend. We tell the code
- // generator that boolean values in the elements of an x86 vector register
- // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
- // mapping a select to operand #1, and 'false' mapping to operand #2. The
- // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
- // of the element (the remaining are ignored) and 0 in that high bit would
- // mean operand #1 while 1 in the high bit would mean operand #2. So while
- // the LLVM model for boolean values in vector elements gets the relevant
- // bit set, it is set backwards and over constrained relative to x86's
- // actual model.
- SDValue VSELECTMask[32];
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- for (int j = 0; j < Scale; ++j)
- VSELECTMask[Scale * i + j] =
- Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
- : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
-
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
- return DAG.getNode(
- ISD::BITCAST, DL, VT,
- DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
- V1, V2));
- }