From 17e24879cb51d1137f2269258c8c0f41218bbe0c Mon Sep 17 00:00:00 2001
From: Igor Breger
Date: Mon, 8 Jun 2015 14:03:17 +0000
Subject: [PATCH] AVX-512: Implemented 256/128bit VALIGND/Q instructions for
 SKX and KNL

Implemented DAG lowering for all these forms.
Added tests for DAG lowering and encoding.

Differential Revision: http://reviews.llvm.org/D10310

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239300 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  37 ++++++---
 lib/Target/X86/X86InstrAVX512.td   |  52 +++++++------
 lib/Target/X86/X86InstrInfo.cpp    |   4 +-
 lib/Target/X86/X86IntrinsicsInfo.h |   5 +-
 test/CodeGen/X86/avx512-shuffle.ll |  84 ++++++++++++++++++++
 test/MC/X86/avx512-encodings.s     |  60 +++++++++++++++
 test/MC/X86/x86-64-avx512f_vl.s    | 119 +++++++++++++++++++++++++++++
 7 files changed, 322 insertions(+), 39 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 040779e5ed2..229795cbe4b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10132,7 +10132,8 @@ static SDValue lowerVectorShuffleWithVALIGN(SDLoc DL, MVT VT,
     else if (Mask[i] - i != AlignVal)
       return SDValue();
   }
-  return DAG.getNode(X86ISD::VALIGN, DL, VT, V1, V2,
+  // Vector source operands should be swapped
+  return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1,
                      DAG.getConstant(AlignVal, DL, MVT::i8));
 }
 
@@ -15167,6 +15168,30 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                               Src1,Src2),
                                   Mask, PassThru, Subtarget, DAG);
     }
+    case INTR_TYPE_3OP_MASK: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue PassThru = Op.getOperand(4);
+      SDValue Mask = Op.getOperand(5);
+      // We specify 2 possible opcodes for intrinsics with rounding modes.
+      // First, we check if the intrinsic may have non-default rounding mode,
+      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(6);
+        unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+        if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                                  dl, Op.getValueType(),
+                                                  Src1, Src2, Src3, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+        }
+      }
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              Src1, Src2, Src3),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
     case FMA_OP_MASK: {
       SDValue Src1 = Op.getOperand(1);
       SDValue Src2 = Op.getOperand(2);
@@ -15309,16 +15334,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                         Op.getOperand(2), Op.getOperand(1));
 
-  case Intrinsic::x86_avx512_mask_valign_q_512:
-  case Intrinsic::x86_avx512_mask_valign_d_512:
-    // Vector source operands are swapped.
-    return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
-                                            Op.getValueType(), Op.getOperand(2),
-                                            Op.getOperand(1),
-                                            Op.getOperand(3)),
-                                Op.getOperand(5), Op.getOperand(4),
-                                Subtarget, DAG);
-
   // ptest and testp intrinsics. The intrinsic these come from are designed to
   // return an integer value, not just an instruction so lower it to the ptest
   // or testp pattern and a setcc for the result.
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 7d16c22909d..c1d0aef0711 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5611,30 +5611,6 @@ def : Pat<(v8i64 (X86Shufp VR512:$src1,
                   (loadv8i64 addr:$src2), (i8 imm:$imm))),
           (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
 
-multiclass avx512_valign<X86VectorVTInfo _> {
-  defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst),
-                     (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
-                     "valign"##_.Suffix,
-                     "$src3, $src2, $src1", "$src1, $src2, $src3",
-                     (_.VT (X86VAlign _.RC:$src2, _.RC:$src1,
-                                      (i8 imm:$src3)))>,
-             AVX512AIi8Base, EVEX_4V;
-
-  // Also match valign of packed floats.
-  def : Pat<(_.FloatVT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
-            (!cast<Instruction>(NAME##rri) _.RC:$src2, _.RC:$src1, imm:$imm)>;
-
-  let mayLoad = 1 in
-  def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst),
-                       (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
-                       !strconcat("valign"##_.Suffix,
-                       "\t{$src3, $src2, $src1, $dst|"
-                           "$dst, $src1, $src2, $src3}"),
-                       []>, EVEX_4V;
-}
-defm VALIGND : avx512_valign<v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VALIGNQ : avx512_valign<v8i64_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
 // Helper fragments to match sext vXi1 to vXiY.
 def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
 def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
@@ -6121,7 +6097,7 @@ multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
                   AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
   let Predicates = [prd] in {
     defm Z    : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
-                           avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+                avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
                            EVEX_V512;
   }
 
@@ -6133,6 +6109,17 @@ multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
   }
 }
 
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
+                                bits<8> opc, SDNode OpNode>{
+  let Predicates = [HasAVX512] in {
+    defm Z    : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+  }
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+    defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+  }
+}
+
 multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
                   X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
   let Predicates = [prd] in {
@@ -6189,3 +6176,18 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>,
       AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
 defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
       AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
+                         AVX512VLVectorVTInfo VTInfo_FP>{
+  defm NAME:       avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
+                           AVX512AIi8Base, EVEX_4V;
+  let isCodeGenOnly = 1 in {
+    defm NAME#_FP: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0x03, X86VAlign>,
+                           AVX512AIi8Base, EVEX_4V;
+  }
+}
+
+defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>,
+              EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>,
+              EVEX_CD8<64, CD8VF>, VEX_W;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 44068f9190b..4d61b1f6eb7 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1696,8 +1696,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSUBQZrr,         X86::VPSUBQZrm,           0 },
     { X86::VSHUFPDZrri,       X86::VSHUFPDZrmi,         0 },
     { X86::VSHUFPSZrri,       X86::VSHUFPSZrmi,         0 },
-    { X86::VALIGNQrri,        X86::VALIGNQrmi,          0 },
-    { X86::VALIGNDrri,        X86::VALIGNDrmi,          0 },
+    { X86::VALIGNQZrri,       X86::VALIGNQZrmi,         0 },
+    { X86::VALIGNDZrri,       X86::VALIGNDZrmi,         0 },
     { X86::VPMULUDQZrr,       X86::VPMULUDQZrm,         0 },
     { X86::VBROADCASTSSZrkz,  X86::VBROADCASTSSZmkz,    TB_NO_REVERSE },
     { X86::VBROADCASTSDZrkz,  X86::VBROADCASTSDZmkz,    TB_NO_REVERSE },
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 8bf0d445453..0268066c2ba 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -21,7 +21,8 @@ enum IntrinsicType {
   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
   INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
   CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
-  INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK,
+  INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK,
+  INTR_TYPE_3OP_MASK, FMA_OP_MASK,
   INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
   EXPAND_FROM_MEM, BLEND
 };
@@ -603,6 +604,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
+  X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
   X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
   X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
   X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index 8411fee1502..2683d6fe238 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK-SKX
 
 ; CHECK-LABEL: test1:
 ; CHECK: vpermps
@@ -250,3 +251,86 @@ define <8 x double> @test32(<8 x double> %a, <8 x double> %b) nounwind {
   %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
   ret <8 x double> %c
 }
+
+define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind {
+; CHECK-LABEL: test_align_v16i32_rr:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    valignd $3, %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+  ret <16 x i32> %c
+}
+
+define <16 x i32> @test_align_v16i32_rm(<16 x i32>* %a.ptr, <16 x i32> %b) nounwind {
+; CHECK-LABEL: test_align_v16i32_rm:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    valignd $3, (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %a = load <16 x i32>, <16 x i32>* %a.ptr
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+  ret <16 x i32> %c
+}
+
+define <16 x i32> @test_align_v16i32_rm_mask(<16 x i32>* %a.ptr, <16 x i32> %b, <16 x i1> %mask) nounwind {
+; CHECK-LABEL: test_align_v16i32_rm_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovsxbd %xmm1, %zmm1
+; CHECK-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT:    valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+;
+; CHECK-SKX-LABEL: test_align_v16i32_rm_mask:
+; CHECK-SKX:       ## BB#0:
+; CHECK-SKX-NEXT:    vpmovb2m %xmm1, %k1
+; CHECK-SKX-NEXT:    vmovdqa32 (%rdi), %zmm1
+; CHECK-SKX-NEXT:    valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-SKX-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-SKX-NEXT:    retq
+  %a = load <16 x i32>, <16 x i32>* %a.ptr
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+  %res = select <16 x i1> %mask,<16 x i32> %c, <16 x i32> %a
+  ret <16 x i32> %res
+}
+
+define <8 x double> @test_align_v8f64_rr(<8 x double> %a, <8 x double> %b) nounwind {
+; CHECK-LABEL: test_align_v8f64_rr:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    valignq $3, %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x double> %c
+}
+
+define <8 x double> @test_align_v18f64_rm(<8 x double>* %a.ptr, <8 x double> %b) nounwind {
+; CHECK-LABEL: test_align_v18f64_rm:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    valignq $3, (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %a = load <8 x double>, <8 x double>* %a.ptr
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x double> %c
+}
+
+define <8 x double> @test_align_v18f64_rm_mask(<8 x double>* %a.ptr, <8 x double> %b, <8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_align_v18f64_rm_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovsxwq %xmm1, %zmm1
+; CHECK-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; CHECK-NEXT:    valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+;
+; CHECK-SKX-LABEL: test_align_v18f64_rm_mask:
+; CHECK-SKX:       ## BB#0:
+; CHECK-SKX-NEXT:    vpmovw2m %xmm1, %k1
+; CHECK-SKX-NEXT:    valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-SKX-NEXT:    retq
+  %a = load <8 x double>, <8 x double>* %a.ptr
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  %res = select <8 x i1> %mask,<8 x double> %c, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index ba467afe153..ca0fccb2e3e 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s
@@ -6084,6 +6084,66 @@ valignq $2, 0x100(%rsp), %zmm0, %zmm1
 // CHECK: encoding: [0x62,0xf3,0xfd,0x49,0x03,0xcb,0x03]
          valignq $3, %zmm3, %zmm0, %zmm1 {%k1}
 
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xe7,0xab]
+         valignq $0xab, %zmm23, %zmm4, %zmm28
+
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28 {%k3}
+// CHECK: encoding: [0x62,0x23,0xdd,0x4b,0x03,0xe7,0xab]
+         valignq $0xab, %zmm23, %zmm4, %zmm28 {%k3}
+
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28 {%k3} {z}
+// CHECK: encoding: [0x62,0x23,0xdd,0xcb,0x03,0xe7,0xab]
+         valignq $0xab, %zmm23, %zmm4, %zmm28 {%k3} {z}
+
+// CHECK: valignq $123, %zmm23, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xe7,0x7b]
+         valignq $0x7b, %zmm23, %zmm4, %zmm28
+
+// CHECK: valignq $123, (%rcx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x21,0x7b]
+         valignq $0x7b, (%rcx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xa4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+         valignq $0x7b, 291(%rax,%r14,8), %zmm4, %zmm28
+
+// CHECK: valignq $123, (%rcx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x21,0x7b]
+         valignq $0x7b, (%rcx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, 8128(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x62,0x7f,0x7b]
+         valignq $0x7b, 8128(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 8192(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0xa2,0x00,0x20,0x00,0x00,0x7b]
+         valignq $0x7b, 8192(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, -8192(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x62,0x80,0x7b]
+         valignq $0x7b, -8192(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, -8256(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0xa2,0xc0,0xdf,0xff,0xff,0x7b]
+         valignq $0x7b, -8256(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 1016(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x62,0x7f,0x7b]
+         valignq $0x7b, 1016(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, 1024(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0xa2,0x00,0x04,0x00,0x00,0x7b]
+         valignq $0x7b, 1024(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, -1024(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x62,0x80,0x7b]
+         valignq $0x7b, -1024(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, -1032(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0xa2,0xf8,0xfb,0xff,0xff,0x7b]
+         valignq $0x7b, -1032(%rdx){1to8}, %zmm4, %zmm28
+
 // CHECK: vextractf32x4 $3
 // CHECK: encoding: [0x62,0xf3,0x7d,0x49,0x19,0xd9,0x03]
          vextractf32x4 $3, %zmm3, %xmm1 {%k1}
diff --git a/test/MC/X86/x86-64-avx512f_vl.s b/test/MC/X86/x86-64-avx512f_vl.s
index 983e87912ed..f521b3e42d4 100644
--- a/test/MC/X86/x86-64-avx512f_vl.s
+++ b/test/MC/X86/x86-64-avx512f_vl.s
@@ -11013,3 +11013,122 @@ vaddpd {rz-sae}, %zmm2, %zmm1, %zmm1
 // CHECK: encoding: [0x62,0x63,0xad,0x30,0x43,0x8a,0xf8,0xfb,0xff,0xff,0x7b]
          vshufi64x2 $0x7b, -1032(%rdx){1to4}, %ymm26, %ymm25
 
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0x83,0xed,0x00,0x03,0xd8,0xab]
+         valignq $0xab, %xmm24, %xmm18, %xmm19
+
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19 {%k5}
+// CHECK: encoding: [0x62,0x83,0xed,0x05,0x03,0xd8,0xab]
+         valignq $0xab, %xmm24, %xmm18, %xmm19 {%k5}
+
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19 {%k5} {z}
+// CHECK: encoding: [0x62,0x83,0xed,0x85,0x03,0xd8,0xab]
+         valignq $0xab, %xmm24, %xmm18, %xmm19 {%k5} {z}
+
+// CHECK: valignq $123, %xmm24, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0x83,0xed,0x00,0x03,0xd8,0x7b]
+         valignq $0x7b, %xmm24, %xmm18, %xmm19
+
+// CHECK: valignq $123, (%rcx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x19,0x7b]
+         valignq $0x7b, (%rcx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xa3,0xed,0x00,0x03,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+         valignq $0x7b, 291(%rax,%r14,8), %xmm18, %xmm19
+
+// CHECK: valignq $123, (%rcx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x19,0x7b]
+         valignq $0x7b, (%rcx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, 2032(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x5a,0x7f,0x7b]
+         valignq $0x7b, 2032(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 2048(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x9a,0x00,0x08,0x00,0x00,0x7b]
+         valignq $0x7b, 2048(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, -2048(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x5a,0x80,0x7b]
+         valignq $0x7b, -2048(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, -2064(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x9a,0xf0,0xf7,0xff,0xff,0x7b]
+         valignq $0x7b, -2064(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 1016(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x5a,0x7f,0x7b]
+         valignq $0x7b, 1016(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, 1024(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x9a,0x00,0x04,0x00,0x00,0x7b]
+         valignq $0x7b, 1024(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, -1024(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x5a,0x80,0x7b]
+         valignq $0x7b, -1024(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, -1032(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x9a,0xf8,0xfb,0xff,0xff,0x7b]
+         valignq $0x7b, -1032(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x03,0xbd,0x20,0x03,0xca,0xab]
+         valignq $0xab, %ymm26, %ymm24, %ymm25
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25 {%k2}
+// CHECK: encoding: [0x62,0x03,0xbd,0x22,0x03,0xca,0xab]
+         valignq $0xab, %ymm26, %ymm24, %ymm25 {%k2}
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25 {%k2} {z}
+// CHECK: encoding: [0x62,0x03,0xbd,0xa2,0x03,0xca,0xab]
+         valignq $0xab, %ymm26, %ymm24, %ymm25 {%k2} {z}
+
+// CHECK: valignq $123, %ymm26, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x03,0xbd,0x20,0x03,0xca,0x7b]
+         valignq $0x7b, %ymm26, %ymm24, %ymm25
+
+// CHECK: valignq $123, (%rcx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x09,0x7b]
+         valignq $0x7b, (%rcx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x23,0xbd,0x20,0x03,0x8c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+         valignq $0x7b, 291(%rax,%r14,8), %ymm24, %ymm25
+
+// CHECK: valignq $123, (%rcx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x09,0x7b]
+         valignq $0x7b, (%rcx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, 4064(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x4a,0x7f,0x7b]
+         valignq $0x7b, 4064(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 4096(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x8a,0x00,0x10,0x00,0x00,0x7b]
+         valignq $0x7b, 4096(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, -4096(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x4a,0x80,0x7b]
+         valignq $0x7b, -4096(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, -4128(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x8a,0xe0,0xef,0xff,0xff,0x7b]
+         valignq $0x7b, -4128(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 1016(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x4a,0x7f,0x7b]
+         valignq $0x7b, 1016(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, 1024(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x8a,0x00,0x04,0x00,0x00,0x7b]
+         valignq $0x7b, 1024(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, -1024(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x4a,0x80,0x7b]
+         valignq $0x7b, -1024(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, -1032(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x8a,0xf8,0xfb,0xff,0xff,0x7b]
+         valignq $0x7b, -1032(%rdx){1to4}, %ymm24, %ymm25
-- 
2.34.1
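
Note on the instruction semantics the lowering above relies on: VALIGND/VALIGNQ concatenate the two source vectors (the second assembly source in the low half, the first in the high half), shift the concatenation right by imm8 elements, and write the low half of the result to the destination. The stand-alone C++ sketch below is not part of the patch; it is a hypothetical reference model (the helper name and the imm8 < 16 restriction are assumptions) that illustrates why a shuffle mask of the form <AlignVal, AlignVal+1, ...> over concat(V1, V2) maps to VALIGN with its vector operands swapped, as the new comment in lowerVectorShuffleWithVALIGN states.

// Hypothetical reference model of the 512-bit VALIGND operation (not from the
// patch). dst, src1 and src2 each hold 16 dwords; imm8 is assumed to be
// smaller than 16, which is the only range the shuffle lowering produces.
#include <cstdint>

void valignd512_ref(uint32_t dst[16], const uint32_t src1[16],
                    const uint32_t src2[16], unsigned imm8) {
  uint32_t concat[32];
  for (unsigned i = 0; i < 16; ++i) {
    concat[i] = src2[i];       // src2 forms the low 512 bits of the pair
    concat[i + 16] = src1[i];  // src1 forms the high 512 bits
  }
  // Shift the 1024-bit pair right by imm8 dword positions and keep the low
  // 512 bits; with imm8 < 16 every selected element comes from concat[].
  for (unsigned i = 0; i < 16; ++i)
    dst[i] = concat[i + imm8];
}

A shufflevector whose mask is <AlignVal, AlignVal+1, ..., AlignVal+15> reads from concat(V1, V2) with V1 in the low half, so matching it onto this model requires src2 = V1 and src1 = V2, which is why the DAG node is now built as VALIGN(V2, V1, AlignVal).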