// First fix the masks for all the inputs that are staying in their
// original halves. This will then dictate the targets of the cross-half
// shuffles.
- auto fixInPlaceInputs = [&PSHUFDMask](
- ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask,
- MutableArrayRef<int> HalfMask, int HalfOffset) {
+ auto fixInPlaceInputs =
+ [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
+ MutableArrayRef<int> SourceHalfMask,
+ MutableArrayRef<int> HalfMask, int HalfOffset) {
if (InPlaceInputs.empty())
return;
if (InPlaceInputs.size() == 1) {
PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
return;
}
+ if (IncomingInputs.empty()) {
+ // Just fix all of the in place inputs.
+ for (int Input : InPlaceInputs) {
+ SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
+ PSHUFDMask[Input / 2] = Input / 2;
+ }
+ return;
+ }
assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
};
- if (!HToLInputs.empty())
- fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0);
- if (!LToHInputs.empty())
- fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4);
+ fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
+ fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
// Now gather the cross-half inputs and place them into a free dword of
// their target half.
auto moveInputsToRightHalf = [&PSHUFDMask](
MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
- int SourceOffset, int DestOffset) {
+ MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
+ int DestOffset) {
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
};
} else if (IncomingInputs.size() == 2) {
if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
- int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2;
- assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) &&
- "Not all dwords can be clobbered!");
- SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset;
- SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset;
+ // We have two non-adjacent or clobbered inputs we need to extract from
+ // the source half. To do this, we need to map them into some adjacent
+ // dword slot in the source mask.
+ int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
+ IncomingInputs[1] - SourceOffset};
+
+ // If there is a free slot in the source half mask adjacent to one of
+ // the inputs, place the other input in it. We use (Index XOR 1) to
+ // compute an adjacent index.
+ if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
+ SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
+ SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
+ SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+ InputsFixed[1] = InputsFixed[0] ^ 1;
+ } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
+ SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
+ SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
+ SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
+ InputsFixed[0] = InputsFixed[1] ^ 1;
+ } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
+ // The two inputs are in the same DWord but it is clobbered and the
+ // adjacent DWord isn't used at all. Move both inputs to the free
+ // slot.
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
+ InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
+ InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
+ } else {
+ // The only way we hit this point is if there is no clobbering
+ // (because there are no off-half inputs to this half) and there is no
+ // free slot adjacent to one of the inputs. In this case, we have to
+ // swap an input with a non-input.
+ for (int i = 0; i < 4; ++i)
+ assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
+ "We can't handle any clobbers here!");
+ assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
+ "Cannot have adjacent inputs here!");
+
+ SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+ SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
+
+ // We also have to update the final source mask in this case because
+ // it may need to undo the above swap.
+ for (int &M : FinalSourceHalfMask)
+ if (M == (InputsFixed[0] ^ 1))
+ M = InputsFixed[1];
+ else if (M == InputsFixed[1])
+ M = InputsFixed[0] ^ 1;
+
+ InputsFixed[1] = InputsFixed[0] ^ 1;
+ }
+
+ // Point everything at the fixed inputs.
for (int &M : HalfMask)
if (M == IncomingInputs[0])
- M = SourceDWordBase + SourceOffset;
+ M = InputsFixed[0] + SourceOffset;
else if (M == IncomingInputs[1])
- M = SourceDWordBase + 1 + SourceOffset;
- IncomingInputs[0] = SourceDWordBase + SourceOffset;
- IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset;
+ M = InputsFixed[1] + SourceOffset;
+
+ IncomingInputs[0] = InputsFixed[0] + SourceOffset;
+ IncomingInputs[1] = InputsFixed[1] + SourceOffset;
}
} else {
llvm_unreachable("Unhandled input size!");
if (M == Input)
M = FreeDWord * 2 + Input % 2;
};
- moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask,
+ moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
/*SourceOffset*/ 4, /*DestOffset*/ 0);
- moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask,
+ moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
/*SourceOffset*/ 0, /*DestOffset*/ 4);
// Now enact all the shuffles we've computed to move the inputs into their
// Other-half shuffles are no-ops.
continue;
-
- case X86ISD::PSHUFD: {
- // We can only handle pshufd if the half we are combining either stays in
- // its half, or switches to the other half. Bail if one of these isn't
- // true.
- SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
- int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2;
- if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) ||
- (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2)))
- return false;
-
- // Map the mask through the pshufd and keep walking up the chain.
- for (int i = 0; i < 4; ++i)
- Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2;
-
- // Switch halves if the pshufd does.
- CombineOpcode =
- VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
- continue;
- }
}
// Break out of the loop if we break out of the switch.
break;
; SSE2-LABEL: @shuffle_v8i16_26405173
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
; SSE2-NEXT: retq
;
; SSE2-LABEL: @shuffle_v8i16_20645173
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
; SSE2-NEXT: retq
;
; SSE2-LABEL: @shuffle_v8i16_26401375
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
-; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_26401375
; SSE2-LABEL: @shuffle_v8i16_45630127
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,1,3]
-; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,3,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_45630127
define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_XXX1X579
; SSE2: # BB#0:
-; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,7,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,3,2,4,5,6,7]
; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
;
; SSSE3-LABEL: @shuffle_v8i16_XXX1X579
; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+}},2,3,10,11,14,15,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},4,5,{{[0-9]+,[0-9]+}},8,9,12,13,6,7]
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},4,5,{{[0-9]+,[0-9]+}},8,9,12,13,6,7]
; SSSE3-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
-; FIXME-SSE2-LABEL: @shuffle_v8i16_XX4X8acX
-; FIXME-SSE2: # BB#0:
-; FIXME-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
-; FIXME-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; FIXME-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
-; FIXME-SSE2-NEXT: punpcklwd {{.*}} # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; FIXME-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,1,0,2,4,5,6,7]
-; FIXME-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
-; FIXME-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; FIXME-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,7]
-; FIXME-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v8i16_XX4X8acX
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*}} # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,1,2,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,4,7]
+; SSE2-NEXT: retq
;
-; FIXME-SSSE3-LABEL: @shuffle_v8i16_XX4X8acX
-; FIXME-SSSE3: # BB#0:
-; FIXME-SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
-; FIXME-SSSE3-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,5,7,6,7]
-; FIXME-SSSE3-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
-; FIXME-SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; FIXME-SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+}},0,1,{{[0-9]+,[0-9]+}},2,3,6,7,10,11{{[0-9]+,[0-9]+}}]
-; FIXME-SSSE3-NEXT: retq
+; SSSE3-LABEL: @shuffle_v8i16_XX4X8acX
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*}} # [[X:xmm[0-9]+]] = xmm0[2,1,2,3]
+; SSSE3-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+}},2,3,{{[0-9]+,[0-9]+}},0,1,4,5,8,9,{{[0-9]+,[0-9]+}}]
+; SSSE3-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
ret <8 x i16> %shuffle
}