/// Match "(X shl/srl V1) & V2" where V2 may not be present.
static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
if (Op.getOpcode() == ISD::AND) {
- if (isa<ConstantSDNode>(Op.getOperand(1))) {
+ if (isConstOrConstSplat(Op.getOperand(1))) {
Mask = Op.getOperand(1);
Op = Op.getOperand(0);
} else {
return false;
}
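With isConstOrConstSplat the mask operand may be a splatted vector constant rather than only a scalar ConstantSDNode, which is what lets the rotate combine fire on the vector tests further down. As a rough standalone illustration of the shape being matched (not code from the patch; the function name is made up), a "shift half" is just a shift whose result may carry an extra AND:

  #include <cstdint>

  // Illustrative only: (srl X, V1) & V2 with V1 == 24 and V2 == 0xff.
  // MatchRotateHalf would hand back the srl in Shift and the 0xff constant in Mask.
  uint32_t masked_shift_half(uint32_t x) {
    return (x >> 24) & 0xffu;
  }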
// Return true if we can prove that, whenever Neg and Pos are both in the
-// range [0, OpSize), Neg == (Pos == 0 ? 0 : OpSize - Pos). This means that
+// range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
// for two opposing shifts shift1 and shift2 and a value X with OpBits bits:
//
// (or (shift1 X, Neg), (shift2 X, Pos))
//
// reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
-// in direction shift1 by Neg. The range [0, OpSize) means that we only need
+// in direction shift1 by Neg. The range [0, EltSize) means that we only need
// to consider shift amounts with defined behavior.
-static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned OpSize) {
- // If OpSize is a power of 2 then:
+static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
+ // If EltSize is a power of 2 then:
//
- // (a) (Pos == 0 ? 0 : OpSize - Pos) == (OpSize - Pos) & (OpSize - 1)
- // (b) Neg == Neg & (OpSize - 1) whenever Neg is in [0, OpSize).
+ // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
+ // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
//
- // So if OpSize is a power of 2 and Neg is (and Neg', OpSize-1), we check
+ // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check
// for the stronger condition:
//
- // Neg & (OpSize - 1) == (OpSize - Pos) & (OpSize - 1) [A]
+ // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A]
//
- // for all Neg and Pos. Since Neg & (OpSize - 1) == Neg' & (OpSize - 1)
+ // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1)
// we can just replace Neg with Neg' for the rest of the function.
//
// In other cases we check for the even stronger condition:
//
- // Neg == OpSize - Pos [B]
+ // Neg == EltSize - Pos [B]
//
// for all Neg and Pos. Note that the (or ...) then invokes undefined
- // behavior if Pos == 0 (and consequently Neg == OpSize).
+ // behavior if Pos == 0 (and consequently Neg == EltSize).
//
- // We could actually use [A] whenever OpSize is a power of 2, but the
+ // We could actually use [A] whenever EltSize is a power of 2, but the
// only extra cases that it would match are those uninteresting ones
// where Neg and Pos are never in range at the same time. E.g. for
- // OpSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
+ // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
// as well as (sub 32, Pos), but:
//
// (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
//
// always invokes undefined behavior for 32-bit X.
//
- // Below, Mask == OpSize - 1 when using [A] and is all-ones otherwise.
+ // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
unsigned MaskLoBits = 0;
- if (Neg.getOpcode() == ISD::AND &&
- isPowerOf2_64(OpSize) &&
- Neg.getOperand(1).getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(Neg.getOperand(1))->getAPIntValue() == OpSize - 1) {
- Neg = Neg.getOperand(0);
- MaskLoBits = Log2_64(OpSize);
+ if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
+ if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
+ if (NegC->getAPIntValue() == EltSize - 1) {
+ Neg = Neg.getOperand(0);
+ MaskLoBits = Log2_64(EltSize);
+ }
+ }
}
// Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1.
if (Neg.getOpcode() != ISD::SUB)
return 0;
- ConstantSDNode *NegC = dyn_cast<ConstantSDNode>(Neg.getOperand(0));
+ ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
if (!NegC)
return 0;
SDValue NegOp1 = Neg.getOperand(1);
- // On the RHS of [A], if Pos is Pos' & (OpSize - 1), just replace Pos with
+ // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
// Pos'. The truncation is redundant for the purpose of the equality.
- if (MaskLoBits &&
- Pos.getOpcode() == ISD::AND &&
- Pos.getOperand(1).getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(Pos.getOperand(1))->getAPIntValue() == OpSize - 1)
- Pos = Pos.getOperand(0);
+ if (MaskLoBits && Pos.getOpcode() == ISD::AND)
+ if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
+ if (PosC->getAPIntValue() == EltSize - 1)
+ Pos = Pos.getOperand(0);
// The condition we need is now:
//
- // (NegC - NegOp1) & Mask == (OpSize - Pos) & Mask
+ // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
//
// If NegOp1 == Pos then we need:
//
- // OpSize & Mask == NegC & Mask
+ // EltSize & Mask == NegC & Mask
//
// (because "x & Mask" is a truncation and distributes through subtraction).
APInt Width;
if (Pos == NegOp1)
Width = NegC->getAPIntValue();
+
// Check for cases where Pos has the form (add NegOp1, PosC) for some PosC.
// Then the condition we want to prove becomes:
//
- // (NegC - NegOp1) & Mask == (OpSize - (NegOp1 + PosC)) & Mask
+ // (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask
//
// which, again because "x & Mask" is a truncation, becomes:
//
- // NegC & Mask == (OpSize - PosC) & Mask
- // OpSize & Mask == (NegC + PosC) & Mask
- else if (Pos.getOpcode() == ISD::ADD &&
- Pos.getOperand(0) == NegOp1 &&
- Pos.getOperand(1).getOpcode() == ISD::Constant)
- Width = (cast<ConstantSDNode>(Pos.getOperand(1))->getAPIntValue() +
- NegC->getAPIntValue());
- else
+ // NegC & Mask == (EltSize - PosC) & Mask
+ // EltSize & Mask == (NegC + PosC) & Mask
+ else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) {
+ if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
+ Width = PosC->getAPIntValue() + NegC->getAPIntValue();
+ else
+ return false;
+ } else
return false;
- // Now we just need to check that OpSize & Mask == Width & Mask.
+ // Now we just need to check that EltSize & Mask == Width & Mask.
if (MaskLoBits)
- // Opsize & Mask is 0 since Mask is Opsize - 1.
+ // EltSize & Mask is 0 since Mask is EltSize - 1.
return Width.getLoBits(MaskLoBits) == 0;
- return Width == OpSize;
+ return Width == EltSize;
}
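To make the masked condition concrete, take EltSize == 32 and a Neg of the form (and (sub 32, Pos), 31). A minimal standalone sketch (not part of the patch) checking that this form satisfies Neg == (Pos == 0 ? 0 : 32 - Pos) for every Pos in [0, 32), which is exactly the property that [A] establishes:

  #include <cassert>

  int main() {
    const unsigned EltSize = 32;
    for (unsigned Pos = 0; Pos < EltSize; ++Pos) {
      // The (and (sub EltSize, Pos), EltSize - 1) form of the negated amount.
      unsigned Neg = (EltSize - Pos) & (EltSize - 1);
      assert(Neg == (Pos == 0 ? 0u : EltSize - Pos));
    }
    return 0;
  }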
// A subroutine of MatchRotate used once we have found an OR of two opposite shifts of Shifted.
// (srl x, (*ext y))) ->
// (rotr x, y) or (rotl x, (sub 32, y))
EVT VT = Shifted.getValueType();
- if (matchRotateSub(InnerPos, InnerNeg, VT.getSizeInBits())) {
+ if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits())) {
bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
HasPos ? Pos : Neg).getNode();
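Choosing between PosOpcode and NegOpcode is valid because a rotate by Pos in one direction equals a rotate by EltSize - Pos in the other. A minimal scalar sketch of that equivalence for 32-bit values (illustrative only; the helper names are not from the patch):

  #include <cstdint>

  // Rotate right by r, for r in [0, 32).
  uint32_t rotr32(uint32_t x, unsigned r) {
    return (x >> r) | (x << ((32 - r) & 31));
  }

  // The same rotation written as a rotate left by (32 - r) & 31.
  uint32_t rotr32_via_rotl(uint32_t x, unsigned r) {
    unsigned l = (32 - r) & 31;
    return (x << l) | (x >> ((32 - l) & 31));
  }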
if (RHSShift.getOpcode() == ISD::SHL) {
std::swap(LHS, RHS);
std::swap(LHSShift, RHSShift);
- std::swap(LHSMask , RHSMask );
+ std::swap(LHSMask, RHSMask);
}
- unsigned OpSizeInBits = VT.getSizeInBits();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue LHSShiftArg = LHSShift.getOperand(0);
SDValue LHSShiftAmt = LHSShift.getOperand(1);
SDValue RHSShiftArg = RHSShift.getOperand(0);
// fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
// fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
- if (LHSShiftAmt.getOpcode() == ISD::Constant &&
- RHSShiftAmt.getOpcode() == ISD::Constant) {
- uint64_t LShVal = cast<ConstantSDNode>(LHSShiftAmt)->getZExtValue();
- uint64_t RShVal = cast<ConstantSDNode>(RHSShiftAmt)->getZExtValue();
- if ((LShVal + RShVal) != OpSizeInBits)
+ if (isConstOrConstSplat(LHSShiftAmt) && isConstOrConstSplat(RHSShiftAmt)) {
+ uint64_t LShVal = isConstOrConstSplat(LHSShiftAmt)->getZExtValue();
+ uint64_t RShVal = isConstOrConstSplat(RHSShiftAmt)->getZExtValue();
+ if ((LShVal + RShVal) != EltSizeInBits)
return nullptr;
SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
// If there is an AND of either shifted operand, apply it to the result.
if (LHSMask.getNode() || RHSMask.getNode()) {
- APInt Mask = APInt::getAllOnesValue(OpSizeInBits);
+ APInt Mask = APInt::getAllOnesValue(EltSizeInBits);
if (LHSMask.getNode()) {
- APInt RHSBits = APInt::getLowBitsSet(OpSizeInBits, LShVal);
- Mask &= cast<ConstantSDNode>(LHSMask)->getAPIntValue() | RHSBits;
+ APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal);
+ Mask &= isConstOrConstSplat(LHSMask)->getAPIntValue() | RHSBits;
}
if (RHSMask.getNode()) {
- APInt LHSBits = APInt::getHighBitsSet(OpSizeInBits, RShVal);
- Mask &= cast<ConstantSDNode>(RHSMask)->getAPIntValue() | LHSBits;
+ APInt LHSBits = APInt::getHighBitsSet(EltSizeInBits, RShVal);
+ Mask &= isConstOrConstSplat(RHSMask)->getAPIntValue() | LHSBits;
}
Rot = DAG.getNode(ISD::AND, DL, VT, Rot, DAG.getConstant(Mask, DL, VT));
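The constant-amount fold relies on the identity that a shift-left by C1 OR'd with a shift-right by C2 is a rotate whenever C1 + C2 equals the element width; any mask taken from either half is then re-applied to the rotated result. A small standalone check of that identity for 64-bit scalars, using the 14/50 split from the splatconstant v2i64 test below (not code from the patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t x = 0x0123456789abcdefULL;
    // Reference rotate-left by 14, one bit at a time.
    uint64_t ref = x;
    for (int i = 0; i < 14; ++i)
      ref = (ref << 1) | (ref >> 63);
    // The or-of-shifts form: 14 + 50 == 64, so this is the same rotate.
    assert(((x << 14) | (x >> 50)) == ref);
    return 0;
  }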
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_rotate_v2i64:
-; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_rotate_v2i64:
-; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
-; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1
-; XOPAVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: var_rotate_v2i64:
+; XOP: # BB#0:
+; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v2i64:
; X32-SSE: # BB#0:
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_rotate_v4i32:
-; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32]
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_rotate_v4i32:
-; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
-; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm1
-; XOPAVX2-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: var_rotate_v4i32:
+; XOP: # BB#0:
+; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v4i32:
; X32-SSE: # BB#0:
;
; XOP-LABEL: var_rotate_v8i16:
; XOP: # BB#0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
-; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm2
-; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm1
-; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v8i16:
;
; XOP-LABEL: var_rotate_v16i8:
; XOP: # BB#0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm2
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm1
-; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpsubb %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v16i8:
;
; XOP-LABEL: splatconstant_rotate_v2i64:
; XOP: # BB#0:
-; XOP-NEXT: vpsllq $14, %xmm0, %xmm1
-; XOP-NEXT: vpsrlq $50, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vprotq $14, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v2i64:
;
; XOP-LABEL: splatconstant_rotate_v4i32:
; XOP: # BB#0:
-; XOP-NEXT: vpslld $4, %xmm0, %xmm1
-; XOP-NEXT: vpsrld $28, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vprotd $4, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v4i32:
;
; XOP-LABEL: splatconstant_rotate_v8i16:
; XOP: # BB#0:
-; XOP-NEXT: vpsllw $7, %xmm0, %xmm1
-; XOP-NEXT: vpsrlw $9, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vprotw $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v8i16:
;
; XOP-LABEL: splatconstant_rotate_v16i8:
; XOP: # BB#0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpsubb %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v16i8:
;
; XOP-LABEL: splatconstant_rotate_mask_v2i64:
; XOP: # BB#0:
-; XOP-NEXT: vpsllq $15, %xmm0, %xmm1
-; XOP-NEXT: vpsrlq $49, %xmm0, %xmm0
+; XOP-NEXT: vprotq $15, %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64:
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i32:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm1
-; XOPAVX1-NEXT: vpsrld $28, %xmm0, %xmm0
+; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i32:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpslld $4, %xmm0, %xmm1
-; XOPAVX2-NEXT: vpsrld $28, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
-; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32:
;
; XOP-LABEL: splatconstant_rotate_mask_v8i16:
; XOP: # BB#0:
-; XOP-NEXT: vpsllw $5, %xmm0, %xmm1
-; XOP-NEXT: vpsrlw $11, %xmm0, %xmm0
+; XOP-NEXT: vprotw $5, %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16:
;
; XOP-LABEL: splatconstant_rotate_mask_v16i8:
; XOP: # BB#0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpsubb %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm1
-; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8:
;
; XOPAVX1-LABEL: var_rotate_v4i64:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT: vpshlq %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3
-; XOPAVX1-NEXT: vpshlq %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v4i64:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
-; XOPAVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
%shl = shl <4 x i64> %a, %b
;
; XOPAVX1-LABEL: var_rotate_v8i32:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32]
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3
-; XOPAVX1-NEXT: vpshld %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v8i32:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; XOPAVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
%shl = shl <8 x i32> %a, %b
;
; XOPAVX1-LABEL: var_rotate_v16i16:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
-; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm3
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT: vpsubw %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT: vpshlw %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshlw %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm3
-; XOPAVX1-NEXT: vpshlw %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v16i16:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; XOPAVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm2
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm1
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX2-NEXT: vpsubw %xmm3, %xmm5, %xmm3
-; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm2
-; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
%shl = shl <16 x i16> %a, %b
;
; XOPAVX1-LABEL: var_rotate_v32i8:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT: vpsubb %xmm3, %xmm4, %xmm3
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v32i8:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX2-NEXT: vpsubb %ymm1, %ymm2, %ymm2
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; XOPAVX2-NEXT: vpshlb %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm1
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX2-NEXT: vpsubb %xmm3, %xmm5, %xmm3
-; XOPAVX2-NEXT: vpshlb %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
%shl = shl <32 x i8> %a, %b
;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpsllq $14, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpsllq $14, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrlq $50, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrlq $50, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllq $14, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlq $50, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
%lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpslld $4, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrld $28, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrld $28, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpslld $4, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrld $28, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpsllw $7, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpsllw $7, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrlw $9, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrlw $9, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllw $7, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlw $9, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
%lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm3
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
+; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpsllq $15, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpsllq $15, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrlq $49, %xmm2, %xmm2
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vprotq $15, %xmm1, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllq $15, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
+; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
-; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
%lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpslld $4, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrld $28, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrld $28, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpslld $4, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrld $28, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; XOPAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpsllw $5, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpsllw $5, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrlw $11, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrlw $11, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllw $5, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlw $11, %ymm0, %ymm0
+; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
%lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm3
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm1
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>