else if (Mask[i] - i != AlignVal)
return SDValue();
}
- return DAG.getNode(X86ISD::VALIGN, DL, VT, V1, V2,
+ // Vector source operands should be swapped
+ return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1,
DAG.getConstant(AlignVal, DL, MVT::i8));
}
Src1,Src2),
Mask, PassThru, Subtarget, DAG);
}
+ case INTR_TYPE_3OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(6);
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
case FMA_OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(1));
- case Intrinsic::x86_avx512_mask_valign_q_512:
- case Intrinsic::x86_avx512_mask_valign_d_512:
- // Vector source operands are swapped.
- return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
- Op.getValueType(), Op.getOperand(2),
- Op.getOperand(1),
- Op.getOperand(3)),
- Op.getOperand(5), Op.getOperand(4),
- Subtarget, DAG);
-
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
(loadv8i64 addr:$src2), (i8 imm:$imm))),
(VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
-multiclass avx512_valign<X86VectorVTInfo _> {
- defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
- "valign"##_.Suffix,
- "$src3, $src2, $src1", "$src1, $src2, $src3",
- (_.VT (X86VAlign _.RC:$src2, _.RC:$src1,
- (i8 imm:$src3)))>,
- AVX512AIi8Base, EVEX_4V;
-
- // Also match valign of packed floats.
- def : Pat<(_.FloatVT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
- (!cast<Instruction>(NAME##rri) _.RC:$src2, _.RC:$src1, imm:$imm)>;
-
- let mayLoad = 1 in
- def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
- !strconcat("valign"##_.Suffix,
- "\t{$src3, $src2, $src1, $dst|"
- "$dst, $src1, $src2, $src3}"),
- []>, EVEX_4V;
-}
-defm VALIGND : avx512_valign<v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VALIGNQ : avx512_valign<v8i64_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
// Helper fragments to match sext vXi1 to vXiY.
def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
let Predicates = [prd] in {
defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
- avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
EVEX_V512;
}
}
}
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
+ bits<8> opc, SDNode OpNode>{
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ }
+}
+
multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
let Predicates = [prd] in {
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
+ AVX512VLVectorVTInfo VTInfo_FP>{
+ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
+ AVX512AIi8Base, EVEX_4V;
+ let isCodeGenOnly = 1 in {
+ defm NAME#_FP: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0x03, X86VAlign>,
+ AVX512AIi8Base, EVEX_4V;
+ }
+}
+
+defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>,
+ EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>,
+ EVEX_CD8<64, CD8VF>, VEX_W;
{ X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
{ X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
{ X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
- { X86::VALIGNQrri, X86::VALIGNQrmi, 0 },
- { X86::VALIGNDrri, X86::VALIGNDrmi, 0 },
+ { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
+ { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
{ X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
{ X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
{ X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
- INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK,
+ INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK,
+ INTR_TYPE_3OP_MASK, FMA_OP_MASK,
INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
EXPAND_FROM_MEM, BLEND
};
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK-SKX
; CHECK-LABEL: test1:
; CHECK: vpermps
%c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 1, i32 10, i32 2, i32 undef, i32 5, i32 15, i32 undef>
ret <8 x double> %c
}
+
+define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind {
+; CHECK-LABEL: test_align_v16i32_rr:
+; CHECK: ## BB#0:
+; CHECK-NEXT: valignd $3, %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+ ret <16 x i32> %c
+}
+
+define <16 x i32> @test_align_v16i32_rm(<16 x i32>* %a.ptr, <16 x i32> %b) nounwind {
+; CHECK-LABEL: test_align_v16i32_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: valignd $3, (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %a = load <16 x i32>, <16 x i32>* %a.ptr
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+ ret <16 x i32> %c
+}
+
+define <16 x i32> @test_align_v16i32_rm_mask(<16 x i32>* %a.ptr, <16 x i32> %b, <16 x i1> %mask) nounwind {
+; CHECK-LABEL: test_align_v16i32_rm_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1
+; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+;
+; CHECK-SKX-LABEL: test_align_v16i32_rm_mask:
+; CHECK-SKX: ## BB#0:
+; CHECK-SKX-NEXT: vpmovb2m %xmm1, %k1
+; CHECK-SKX-NEXT: vmovdqa32 (%rdi), %zmm1
+; CHECK-SKX-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-SKX-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-SKX-NEXT: retq
+ %a = load <16 x i32>, <16 x i32>* %a.ptr
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+ %res = select <16 x i1> %mask,<16 x i32> %c, <16 x i32> %a
+ ret <16 x i32> %res
+}
+
+define <8 x double> @test_align_v8f64_rr(<8 x double> %a, <8 x double> %b) nounwind {
+; CHECK-LABEL: test_align_v8f64_rr:
+; CHECK: ## BB#0:
+; CHECK-NEXT: valignq $3, %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x double> %c
+}
+
+define <8 x double> @test_align_v18f64_rm(<8 x double>* %a.ptr, <8 x double> %b) nounwind {
+; CHECK-LABEL: test_align_v18f64_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %a = load <8 x double>, <8 x double>* %a.ptr
+ %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x double> %c
+}
+
+define <8 x double> @test_align_v18f64_rm_mask(<8 x double>* %a.ptr, <8 x double> %b, <8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_align_v18f64_rm_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxwq %xmm1, %zmm1
+; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
+; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+;
+; CHECK-SKX-LABEL: test_align_v18f64_rm_mask:
+; CHECK-SKX: ## BB#0:
+; CHECK-SKX-NEXT: vpmovw2m %xmm1, %k1
+; CHECK-SKX-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-SKX-NEXT: retq
+ %a = load <8 x double>, <8 x double>* %a.ptr
+ %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ %res = select <8 x i1> %mask,<8 x double> %c, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
// CHECK: encoding: [0x62,0xf3,0xfd,0x49,0x03,0xcb,0x03]
valignq $3, %zmm3, %zmm0, %zmm1 {%k1}
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xe7,0xab]
+ valignq $0xab, %zmm23, %zmm4, %zmm28
+
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28 {%k3}
+// CHECK: encoding: [0x62,0x23,0xdd,0x4b,0x03,0xe7,0xab]
+ valignq $0xab, %zmm23, %zmm4, %zmm28 {%k3}
+
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28 {%k3} {z}
+// CHECK: encoding: [0x62,0x23,0xdd,0xcb,0x03,0xe7,0xab]
+ valignq $0xab, %zmm23, %zmm4, %zmm28 {%k3} {z}
+
+// CHECK: valignq $123, %zmm23, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xe7,0x7b]
+ valignq $0x7b, %zmm23, %zmm4, %zmm28
+
+// CHECK: valignq $123, (%rcx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x21,0x7b]
+ valignq $0x7b, (%rcx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xa4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ valignq $0x7b, 291(%rax,%r14,8), %zmm4, %zmm28
+
+// CHECK: valignq $123, (%rcx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x21,0x7b]
+ valignq $0x7b, (%rcx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, 8128(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x62,0x7f,0x7b]
+ valignq $0x7b, 8128(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 8192(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0xa2,0x00,0x20,0x00,0x00,0x7b]
+ valignq $0x7b, 8192(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, -8192(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x62,0x80,0x7b]
+ valignq $0x7b, -8192(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, -8256(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0xa2,0xc0,0xdf,0xff,0xff,0x7b]
+ valignq $0x7b, -8256(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 1016(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x62,0x7f,0x7b]
+ valignq $0x7b, 1016(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, 1024(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0xa2,0x00,0x04,0x00,0x00,0x7b]
+ valignq $0x7b, 1024(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, -1024(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x62,0x80,0x7b]
+ valignq $0x7b, -1024(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, -1032(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0xa2,0xf8,0xfb,0xff,0xff,0x7b]
+ valignq $0x7b, -1032(%rdx){1to8}, %zmm4, %zmm28
+
// CHECK: vextractf32x4 $3
// CHECK: encoding: [0x62,0xf3,0x7d,0x49,0x19,0xd9,0x03]
vextractf32x4 $3, %zmm3, %xmm1 {%k1}
// CHECK: encoding: [0x62,0x63,0xad,0x30,0x43,0x8a,0xf8,0xfb,0xff,0xff,0x7b]
vshufi64x2 $0x7b, -1032(%rdx){1to4}, %ymm26, %ymm25
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0x83,0xed,0x00,0x03,0xd8,0xab]
+ valignq $0xab, %xmm24, %xmm18, %xmm19
+
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19 {%k5}
+// CHECK: encoding: [0x62,0x83,0xed,0x05,0x03,0xd8,0xab]
+ valignq $0xab, %xmm24, %xmm18, %xmm19 {%k5}
+
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19 {%k5} {z}
+// CHECK: encoding: [0x62,0x83,0xed,0x85,0x03,0xd8,0xab]
+ valignq $0xab, %xmm24, %xmm18, %xmm19 {%k5} {z}
+
+// CHECK: valignq $123, %xmm24, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0x83,0xed,0x00,0x03,0xd8,0x7b]
+ valignq $0x7b, %xmm24, %xmm18, %xmm19
+
+// CHECK: valignq $123, (%rcx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x19,0x7b]
+ valignq $0x7b, (%rcx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xa3,0xed,0x00,0x03,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ valignq $0x7b, 291(%rax,%r14,8), %xmm18, %xmm19
+
+// CHECK: valignq $123, (%rcx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x19,0x7b]
+ valignq $0x7b, (%rcx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, 2032(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x5a,0x7f,0x7b]
+ valignq $0x7b, 2032(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 2048(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x9a,0x00,0x08,0x00,0x00,0x7b]
+ valignq $0x7b, 2048(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, -2048(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x5a,0x80,0x7b]
+ valignq $0x7b, -2048(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, -2064(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x9a,0xf0,0xf7,0xff,0xff,0x7b]
+ valignq $0x7b, -2064(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 1016(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x5a,0x7f,0x7b]
+ valignq $0x7b, 1016(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, 1024(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x9a,0x00,0x04,0x00,0x00,0x7b]
+ valignq $0x7b, 1024(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, -1024(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x5a,0x80,0x7b]
+ valignq $0x7b, -1024(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, -1032(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x9a,0xf8,0xfb,0xff,0xff,0x7b]
+ valignq $0x7b, -1032(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x03,0xbd,0x20,0x03,0xca,0xab]
+ valignq $0xab, %ymm26, %ymm24, %ymm25
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25 {%k2}
+// CHECK: encoding: [0x62,0x03,0xbd,0x22,0x03,0xca,0xab]
+ valignq $0xab, %ymm26, %ymm24, %ymm25 {%k2}
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25 {%k2} {z}
+// CHECK: encoding: [0x62,0x03,0xbd,0xa2,0x03,0xca,0xab]
+ valignq $0xab, %ymm26, %ymm24, %ymm25 {%k2} {z}
+
+// CHECK: valignq $123, %ymm26, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x03,0xbd,0x20,0x03,0xca,0x7b]
+ valignq $0x7b, %ymm26, %ymm24, %ymm25
+
+// CHECK: valignq $123, (%rcx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x09,0x7b]
+ valignq $0x7b, (%rcx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x23,0xbd,0x20,0x03,0x8c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ valignq $0x7b, 291(%rax,%r14,8), %ymm24, %ymm25
+
+// CHECK: valignq $123, (%rcx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x09,0x7b]
+ valignq $0x7b, (%rcx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, 4064(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x4a,0x7f,0x7b]
+ valignq $0x7b, 4064(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 4096(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x8a,0x00,0x10,0x00,0x00,0x7b]
+ valignq $0x7b, 4096(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, -4096(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x4a,0x80,0x7b]
+ valignq $0x7b, -4096(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, -4128(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x8a,0xe0,0xef,0xff,0xff,0x7b]
+ valignq $0x7b, -4128(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 1016(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x4a,0x7f,0x7b]
+ valignq $0x7b, 1016(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, 1024(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x8a,0x00,0x04,0x00,0x00,0x7b]
+ valignq $0x7b, 1024(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, -1024(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x4a,0x80,0x7b]
+ valignq $0x7b, -1024(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, -1032(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x8a,0xf8,0xfb,0xff,0xff,0x7b]
+ valignq $0x7b, -1032(%rdx){1to4}, %ymm24, %ymm25