From: Asaf Badouh Date: Wed, 2 Sep 2015 14:21:54 +0000 (+0000) Subject: [X86][AVX512VLBW] add support in byte shift and SAD X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=05859c7cbb636ed173d4397eceac61bbc0462d08;p=oota-llvm.git [X86][AVX512VLBW] add support in byte shift and SAD add byte shift left/right add SAD - compute sum of absolute differences Differential Revision: http://reviews.llvm.org/D12479 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246654 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 12165341130..c9d584bd4ae 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -2356,6 +2356,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_psll_dq_512 : GCCBuiltin<"__builtin_ia32_pslldq512">, + Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_psrl_dq_512 : GCCBuiltin<"__builtin_ia32_psrldq512">, + Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i32_ty], + [IntrNoMem]>; } // Gather ops @@ -4981,6 +4987,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; +def int_x86_avx512_psad_bw_512 : GCCBuiltin<"__builtin_ia32_psadbw512">, + Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], + [IntrNoMem]>; } // FP logical ops let TargetPrefix = "x86" in { diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 560e8934ce6..b715cd5931c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -15576,6 +15576,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget case INTR_TYPE_2OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case INTR_TYPE_2OP_IMM8: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2))); case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index ebc11dc4330..d93deaec55d 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -6822,3 +6822,71 @@ multiclass avx512_shufp, PS; defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - Byte shift Left/Right +//===----------------------------------------------------------------------===// + +multiclass avx512_shift_packed opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, X86VectorVTInfo _>{ + def rr : AVX512; + let mayLoad = 1 in + def rm : AVX512; +} + +multiclass avx512_shift_packed_all opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, Predicate prd>{ + let Predicates = [prd] in + defm Z512 : avx512_shift_packed, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_packed, EVEX_V256; + defm Z128 : avx512_shift_packed, EVEX_V128; + } +} +defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; +defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; + + +multiclass avx512_psadbw_packed opc, SDNode OpNode, + string OpcodeStr, X86VectorVTInfo _src>{ + def rr : AVX512BI; + let mayLoad = 1 in + def rm : AVX512BI; +} + +multiclass avx512_psadbw_packed_all opc, SDNode OpNode, + string OpcodeStr, Predicate prd> { + let Predicates = [prd] in + defm Z512 : avx512_psadbw_packed, + EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_psadbw_packed, + EVEX_V256; + defm Z128 : avx512_psadbw_packed, + EVEX_V128; + } +} + +defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", + HasBWI>, EVEX_4V; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index b4907074605..c42d789c203 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4137,8 +4137,10 @@ defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX] -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] , + Predicates = [HasAVX, NoVLX_Or_NoBWI]in { // 128-bit logical shifts. def VPSLLDQri : PDIi8<0x73, MRM7r, (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), @@ -4153,8 +4155,7 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>, VEX_4V; // PSRADQri doesn't exist in SSE[1-3]. -} -} // Predicates = [HasAVX] +} // Predicates = [HasAVX, NoVLX_Or_NoBWI] let Predicates = [HasAVX2, NoVLX] in { defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, @@ -4183,8 +4184,10 @@ defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// Predicates = [HasAVX2] -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 , + Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { // 256-bit logical shifts. def VPSLLDQYri : PDIi8<0x73, MRM7r, (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), @@ -4199,8 +4202,7 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>, VEX_4V, VEX_L; // PSRADQYri doesn't exist in SSE[1-3]. -} -} // Predicates = [HasAVX2] +} // Predicates = [HasAVX2, NoVLX_Or_NoBWI] let Constraints = "$src1 = $dst" in { defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 49219926719..409ba59b1dc 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -19,7 +19,7 @@ namespace llvm { enum IntrinsicType { INTR_NO_TYPE, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, - INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, @@ -1426,6 +1426,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ, X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(avx512_psll_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSHLDQ, 0), + X86_INTRINSIC_DATA(avx512_psrl_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSRLDQ, 0), X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), diff --git a/test/CodeGen/X86/avx-isa-check.ll b/test/CodeGen/X86/avx-isa-check.ll index d551e2331d4..4d8db7df8b0 100644 --- a/test/CodeGen/X86/avx-isa-check.ll +++ b/test/CodeGen/X86/avx-isa-check.ll @@ -261,4 +261,9 @@ define <2 x double> @shuffle_v2f64_bitcast_1z(<2 x double> %a) { %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> %bitcast64 = bitcast <4 x float> %shuffle32 to <2 x double> ret <2 x double> %bitcast64 -} \ No newline at end of file +} + +define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) { + %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> + ret <16 x i16> %shuffle +} diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index ab724da6927..6376657cf16 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1221,3 +1221,41 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %res4 = add <32 x i16> %res3, %res2 ret <32 x i16> %res4 } + +declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32) + +; CHECK-LABEL: @test_int_x86_avx512_mask_psll_dq_512 +; CHECK-NOT: call +; CHECK: vpslldq +; CHECK: vpslldq +define <8 x i64>@test_int_x86_avx512_mask_psll_dq_512(<8 x i64> %x0) { + %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8) + %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32) + +; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_dq_512 +; CHECK-NOT: call +; CHECK: vpsrldq +; CHECK: vpsrldq +define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) { + %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8) + %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} +declare <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) + +; CHECK-LABEL: @test_int_x86_avx512_mask_psadb_w_512 +; CHECK-NOT: call +; CHECK: vpsadbw %zmm1 +; CHECK: vpsadbw %zmm2 +define <64 x i8>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){ + %res = call <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1) + %res1 = call <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2) + %res2 = add <64 x i8> %res, %res1 + ret <64 x i8> %res2 +} diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 3397c770d42..bb8a1f51cb0 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -4275,3 +4275,4 @@ define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %res4 = add <16 x i16> %res3, %res2 ret <16 x i16> %res4 } + diff --git a/test/MC/X86/x86-64-avx512bw.s b/test/MC/X86/x86-64-avx512bw.s index a13c49aaac6..c557da13ea8 100644 --- a/test/MC/X86/x86-64-avx512bw.s +++ b/test/MC/X86/x86-64-avx512bw.s @@ -4216,3 +4216,94 @@ // CHECK: encoding: [0x62,0xe3,0x5d,0x40,0x42,0xaa,0xc0,0xdf,0xff,0xff,0x7b] vdbpsadbw $123, -8256(%rdx), %zmm20, %zmm21 +// CHECK: vpslldq $171, %zmm28, %zmm20 +// CHECK: encoding: [0x62,0x91,0x5d,0x40,0x73,0xfc,0xab] + vpslldq $171, %zmm28, %zmm20 + +// CHECK: vpslldq $123, %zmm28, %zmm20 +// CHECK: encoding: [0x62,0x91,0x5d,0x40,0x73,0xfc,0x7b] + vpslldq $123, %zmm28, %zmm20 + +// CHECK: vpslldq $123, (%rcx), %zmm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x40,0x73,0x39,0x7b] + vpslldq $123, (%rcx), %zmm20 + +// CHECK: vpslldq $123, 291(%rax,%r14,8), %zmm20 +// CHECK: encoding: [0x62,0xb1,0x5d,0x40,0x73,0xbc,0xf0,0x23,0x01,0x00,0x00,0x7b] + vpslldq $123, 291(%rax,%r14,8), %zmm20 + +// CHECK: vpslldq $123, 8128(%rdx), %zmm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x40,0x73,0x7a,0x7f,0x7b] + vpslldq $123, 8128(%rdx), %zmm20 + +// CHECK: vpslldq $123, 8192(%rdx), %zmm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x40,0x73,0xba,0x00,0x20,0x00,0x00,0x7b] + vpslldq $123, 8192(%rdx), %zmm20 + +// CHECK: vpslldq $123, -8192(%rdx), %zmm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x40,0x73,0x7a,0x80,0x7b] + vpslldq $123, -8192(%rdx), %zmm20 + +// CHECK: vpslldq $123, -8256(%rdx), %zmm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x40,0x73,0xba,0xc0,0xdf,0xff,0xff,0x7b] + vpslldq $123, -8256(%rdx), %zmm20 + +// CHECK: vpsrldq $171, %zmm26, %zmm18 +// CHECK: encoding: [0x62,0x91,0x6d,0x40,0x73,0xda,0xab] + vpsrldq $171, %zmm26, %zmm18 + +// CHECK: vpsrldq $123, %zmm26, %zmm18 +// CHECK: encoding: [0x62,0x91,0x6d,0x40,0x73,0xda,0x7b] + vpsrldq $123, %zmm26, %zmm18 + +// CHECK: vpsrldq $123, (%rcx), %zmm18 +// CHECK: encoding: [0x62,0xf1,0x6d,0x40,0x73,0x19,0x7b] + vpsrldq $123, (%rcx), %zmm18 + +// CHECK: vpsrldq $123, 291(%rax,%r14,8), %zmm18 +// CHECK: encoding: [0x62,0xb1,0x6d,0x40,0x73,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b] + vpsrldq $123, 291(%rax,%r14,8), %zmm18 + +// CHECK: vpsrldq $123, 8128(%rdx), %zmm18 +// CHECK: encoding: [0x62,0xf1,0x6d,0x40,0x73,0x5a,0x7f,0x7b] + vpsrldq $123, 8128(%rdx), %zmm18 + +// CHECK: vpsrldq $123, 8192(%rdx), %zmm18 +// CHECK: encoding: [0x62,0xf1,0x6d,0x40,0x73,0x9a,0x00,0x20,0x00,0x00,0x7b] + vpsrldq $123, 8192(%rdx), %zmm18 + +// CHECK: vpsrldq $123, -8192(%rdx), %zmm18 +// CHECK: encoding: [0x62,0xf1,0x6d,0x40,0x73,0x5a,0x80,0x7b] + vpsrldq $123, -8192(%rdx), %zmm18 + +// CHECK: vpsrldq $123, -8256(%rdx), %zmm18 +// CHECK: encoding: [0x62,0xf1,0x6d,0x40,0x73,0x9a,0xc0,0xdf,0xff,0xff,0x7b] + vpsrldq $123, -8256(%rdx), %zmm18 + +// CHECK: vpsadbw %zmm22, %zmm25, %zmm28 +// CHECK: encoding: [0x62,0x21,0x35,0x40,0xf6,0xe6] + vpsadbw %zmm22, %zmm25, %zmm28 + +// CHECK: vpsadbw (%rcx), %zmm25, %zmm28 +// CHECK: encoding: [0x62,0x61,0x35,0x40,0xf6,0x21] + vpsadbw (%rcx), %zmm25, %zmm28 + +// CHECK: vpsadbw 291(%rax,%r14,8), %zmm25, %zmm28 +// CHECK: encoding: [0x62,0x21,0x35,0x40,0xf6,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpsadbw 291(%rax,%r14,8), %zmm25, %zmm28 + +// CHECK: vpsadbw 8128(%rdx), %zmm25, %zmm28 +// CHECK: encoding: [0x62,0x61,0x35,0x40,0xf6,0x62,0x7f] + vpsadbw 8128(%rdx), %zmm25, %zmm28 + +// CHECK: vpsadbw 8192(%rdx), %zmm25, %zmm28 +// CHECK: encoding: [0x62,0x61,0x35,0x40,0xf6,0xa2,0x00,0x20,0x00,0x00] + vpsadbw 8192(%rdx), %zmm25, %zmm28 + +// CHECK: vpsadbw -8192(%rdx), %zmm25, %zmm28 +// CHECK: encoding: [0x62,0x61,0x35,0x40,0xf6,0x62,0x80] + vpsadbw -8192(%rdx), %zmm25, %zmm28 + +// CHECK: vpsadbw -8256(%rdx), %zmm25, %zmm28 +// CHECK: encoding: [0x62,0x61,0x35,0x40,0xf6,0xa2,0xc0,0xdf,0xff,0xff] + vpsadbw -8256(%rdx), %zmm25, %zmm28 diff --git a/test/MC/X86/x86-64-avx512bw_vl.s b/test/MC/X86/x86-64-avx512bw_vl.s index ae46e942476..91ae301b9d1 100644 --- a/test/MC/X86/x86-64-avx512bw_vl.s +++ b/test/MC/X86/x86-64-avx512bw_vl.s @@ -8399,6 +8399,7 @@ // CHECK: encoding: [0x62,0x61,0x35,0x20,0x69,0xa2,0xe0,0xef,0xff,0xff] vpunpckhwd -4128(%rdx), %ymm25, %ymm28 + // CHECK: vpalignr $171, %xmm21, %xmm26, %xmm19 // CHECK: encoding: [0x62,0xa3,0x2d,0x00,0x0f,0xdd,0xab] vpalignr $171, %xmm21, %xmm26, %xmm19 @@ -8718,3 +8719,370 @@ // CHECK: vdbpsadbw $123, -4128(%rdx), %ymm19, %ymm17 // CHECK: encoding: [0x62,0xe3,0x65,0x20,0x42,0x8a,0xe0,0xef,0xff,0xff,0x7b] vdbpsadbw $123, -4128(%rdx), %ymm19, %ymm17 +// CHECK: vpslldq $171, %xmm24, %xmm20 +// CHECK: encoding: [0x62,0x91,0x5d,0x00,0x73,0xf8,0xab] + vpslldq $171, %xmm24, %xmm20 + +// CHECK: vpslldq $123, %xmm24, %xmm20 +// CHECK: encoding: [0x62,0x91,0x5d,0x00,0x73,0xf8,0x7b] + vpslldq $123, %xmm24, %xmm20 + +// CHECK: vpslldq $123, (%rcx), %xmm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x00,0x73,0x39,0x7b] + vpslldq $123, (%rcx), %xmm20 + +// CHECK: vpslldq $123, 291(%rax,%r14,8), %xmm20 +// CHECK: encoding: [0x62,0xb1,0x5d,0x00,0x73,0xbc,0xf0,0x23,0x01,0x00,0x00,0x7b] + vpslldq $123, 291(%rax,%r14,8), %xmm20 + +// CHECK: vpslldq $123, 2032(%rdx), %xmm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x00,0x73,0x7a,0x7f,0x7b] + vpslldq $123, 2032(%rdx), %xmm20 + +// CHECK: vpslldq $123, 2048(%rdx), %xmm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x00,0x73,0xba,0x00,0x08,0x00,0x00,0x7b] + vpslldq $123, 2048(%rdx), %xmm20 + +// CHECK: vpslldq $123, -2048(%rdx), %xmm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x00,0x73,0x7a,0x80,0x7b] + vpslldq $123, -2048(%rdx), %xmm20 + +// CHECK: vpslldq $123, -2064(%rdx), %xmm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x00,0x73,0xba,0xf0,0xf7,0xff,0xff,0x7b] + vpslldq $123, -2064(%rdx), %xmm20 + +// CHECK: vpslldq $171, %ymm25, %ymm26 +// CHECK: encoding: [0x62,0x91,0x2d,0x20,0x73,0xf9,0xab] + vpslldq $171, %ymm25, %ymm26 + +// CHECK: vpslldq $123, %ymm25, %ymm26 +// CHECK: encoding: [0x62,0x91,0x2d,0x20,0x73,0xf9,0x7b] + vpslldq $123, %ymm25, %ymm26 + +// CHECK: vpslldq $123, (%rcx), %ymm26 +// CHECK: encoding: [0x62,0xf1,0x2d,0x20,0x73,0x39,0x7b] + vpslldq $123, (%rcx), %ymm26 + +// CHECK: vpslldq $123, 291(%rax,%r14,8), %ymm26 +// CHECK: encoding: [0x62,0xb1,0x2d,0x20,0x73,0xbc,0xf0,0x23,0x01,0x00,0x00,0x7b] + vpslldq $123, 291(%rax,%r14,8), %ymm26 + +// CHECK: vpslldq $123, 4064(%rdx), %ymm26 +// CHECK: encoding: [0x62,0xf1,0x2d,0x20,0x73,0x7a,0x7f,0x7b] + vpslldq $123, 4064(%rdx), %ymm26 + +// CHECK: vpslldq $123, 4096(%rdx), %ymm26 +// CHECK: encoding: [0x62,0xf1,0x2d,0x20,0x73,0xba,0x00,0x10,0x00,0x00,0x7b] + vpslldq $123, 4096(%rdx), %ymm26 + +// CHECK: vpslldq $123, -4096(%rdx), %ymm26 +// CHECK: encoding: [0x62,0xf1,0x2d,0x20,0x73,0x7a,0x80,0x7b] + vpslldq $123, -4096(%rdx), %ymm26 + +// CHECK: vpslldq $123, -4128(%rdx), %ymm26 +// CHECK: encoding: [0x62,0xf1,0x2d,0x20,0x73,0xba,0xe0,0xef,0xff,0xff,0x7b] + vpslldq $123, -4128(%rdx), %ymm26 + +// CHECK: vpslldq $171, %xmm19, %xmm23 +// CHECK: encoding: [0x62,0xb1,0x45,0x00,0x73,0xfb,0xab] + vpslldq $0xab, %xmm19, %xmm23 + +// CHECK: vpslldq $123, %xmm19, %xmm23 +// CHECK: encoding: [0x62,0xb1,0x45,0x00,0x73,0xfb,0x7b] + vpslldq $0x7b, %xmm19, %xmm23 + +// CHECK: vpslldq $123, (%rcx), %xmm23 +// CHECK: encoding: [0x62,0xf1,0x45,0x00,0x73,0x39,0x7b] + vpslldq $0x7b,(%rcx), %xmm23 + +// CHECK: vpslldq $123, 4660(%rax,%r14,8), %xmm23 +// CHECK: encoding: [0x62,0xb1,0x45,0x00,0x73,0xbc,0xf0,0x34,0x12,0x00,0x00,0x7b] + vpslldq $0x7b,4660(%rax,%r14,8), %xmm23 + +// CHECK: vpslldq $123, 2032(%rdx), %xmm23 +// CHECK: encoding: [0x62,0xf1,0x45,0x00,0x73,0x7a,0x7f,0x7b] + vpslldq $0x7b,2032(%rdx), %xmm23 + +// CHECK: vpslldq $123, 2048(%rdx), %xmm23 +// CHECK: encoding: [0x62,0xf1,0x45,0x00,0x73,0xba,0x00,0x08,0x00,0x00,0x7b] + vpslldq $0x7b,2048(%rdx), %xmm23 + +// CHECK: vpslldq $123, -2048(%rdx), %xmm23 +// CHECK: encoding: [0x62,0xf1,0x45,0x00,0x73,0x7a,0x80,0x7b] + vpslldq $0x7b,-2048(%rdx), %xmm23 + +// CHECK: vpslldq $123, -2064(%rdx), %xmm23 +// CHECK: encoding: [0x62,0xf1,0x45,0x00,0x73,0xba,0xf0,0xf7,0xff,0xff,0x7b] + vpslldq $0x7b,-2064(%rdx), %xmm23 + +// CHECK: vpslldq $171, %ymm25, %ymm29 +// CHECK: encoding: [0x62,0x91,0x15,0x20,0x73,0xf9,0xab] + vpslldq $0xab, %ymm25, %ymm29 + +// CHECK: vpslldq $123, %ymm25, %ymm29 +// CHECK: encoding: [0x62,0x91,0x15,0x20,0x73,0xf9,0x7b] + vpslldq $0x7b, %ymm25, %ymm29 + +// CHECK: vpslldq $123, (%rcx), %ymm29 +// CHECK: encoding: [0x62,0xf1,0x15,0x20,0x73,0x39,0x7b] + vpslldq $0x7b,(%rcx), %ymm29 + +// CHECK: vpslldq $123, 4660(%rax,%r14,8), %ymm29 +// CHECK: encoding: [0x62,0xb1,0x15,0x20,0x73,0xbc,0xf0,0x34,0x12,0x00,0x00,0x7b] + vpslldq $0x7b,4660(%rax,%r14,8), %ymm29 + +// CHECK: vpslldq $123, 4064(%rdx), %ymm29 +// CHECK: encoding: [0x62,0xf1,0x15,0x20,0x73,0x7a,0x7f,0x7b] + vpslldq $0x7b,4064(%rdx), %ymm29 + +// CHECK: vpslldq $123, 4096(%rdx), %ymm29 +// CHECK: encoding: [0x62,0xf1,0x15,0x20,0x73,0xba,0x00,0x10,0x00,0x00,0x7b] + vpslldq $0x7b,4096(%rdx), %ymm29 + +// CHECK: vpslldq $123, -4096(%rdx), %ymm29 +// CHECK: encoding: [0x62,0xf1,0x15,0x20,0x73,0x7a,0x80,0x7b] + vpslldq $0x7b,-4096(%rdx), %ymm29 + +// CHECK: vpslldq $123, -4128(%rdx), %ymm29 +// CHECK: encoding: [0x62,0xf1,0x15,0x20,0x73,0xba,0xe0,0xef,0xff,0xff,0x7b] + vpslldq $0x7b,-4128(%rdx), %ymm29 + +// CHECK: vpsrldq $171, %xmm21, %xmm24 +// CHECK: encoding: [0x62,0xb1,0x3d,0x00,0x73,0xdd,0xab] + vpsrldq $171, %xmm21, %xmm24 + +// CHECK: vpsrldq $123, %xmm21, %xmm24 +// CHECK: encoding: [0x62,0xb1,0x3d,0x00,0x73,0xdd,0x7b] + vpsrldq $123, %xmm21, %xmm24 + +// CHECK: vpsrldq $123, (%rcx), %xmm24 +// CHECK: encoding: [0x62,0xf1,0x3d,0x00,0x73,0x19,0x7b] + vpsrldq $123, (%rcx), %xmm24 + +// CHECK: vpsrldq $123, 291(%rax,%r14,8), %xmm24 +// CHECK: encoding: [0x62,0xb1,0x3d,0x00,0x73,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b] + vpsrldq $123, 291(%rax,%r14,8), %xmm24 + +// CHECK: vpsrldq $123, 2032(%rdx), %xmm24 +// CHECK: encoding: [0x62,0xf1,0x3d,0x00,0x73,0x5a,0x7f,0x7b] + vpsrldq $123, 2032(%rdx), %xmm24 + +// CHECK: vpsrldq $123, 2048(%rdx), %xmm24 +// CHECK: encoding: [0x62,0xf1,0x3d,0x00,0x73,0x9a,0x00,0x08,0x00,0x00,0x7b] + vpsrldq $123, 2048(%rdx), %xmm24 + +// CHECK: vpsrldq $123, -2048(%rdx), %xmm24 +// CHECK: encoding: [0x62,0xf1,0x3d,0x00,0x73,0x5a,0x80,0x7b] + vpsrldq $123, -2048(%rdx), %xmm24 + +// CHECK: vpsrldq $123, -2064(%rdx), %xmm24 +// CHECK: encoding: [0x62,0xf1,0x3d,0x00,0x73,0x9a,0xf0,0xf7,0xff,0xff,0x7b] + vpsrldq $123, -2064(%rdx), %xmm24 + +// CHECK: vpsrldq $171, %ymm25, %ymm24 +// CHECK: encoding: [0x62,0x91,0x3d,0x20,0x73,0xd9,0xab] + vpsrldq $171, %ymm25, %ymm24 + +// CHECK: vpsrldq $123, %ymm25, %ymm24 +// CHECK: encoding: [0x62,0x91,0x3d,0x20,0x73,0xd9,0x7b] + vpsrldq $123, %ymm25, %ymm24 + +// CHECK: vpsrldq $123, (%rcx), %ymm24 +// CHECK: encoding: [0x62,0xf1,0x3d,0x20,0x73,0x19,0x7b] + vpsrldq $123, (%rcx), %ymm24 + +// CHECK: vpsrldq $123, 291(%rax,%r14,8), %ymm24 +// CHECK: encoding: [0x62,0xb1,0x3d,0x20,0x73,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b] + vpsrldq $123, 291(%rax,%r14,8), %ymm24 + +// CHECK: vpsrldq $123, 4064(%rdx), %ymm24 +// CHECK: encoding: [0x62,0xf1,0x3d,0x20,0x73,0x5a,0x7f,0x7b] + vpsrldq $123, 4064(%rdx), %ymm24 + +// CHECK: vpsrldq $123, 4096(%rdx), %ymm24 +// CHECK: encoding: [0x62,0xf1,0x3d,0x20,0x73,0x9a,0x00,0x10,0x00,0x00,0x7b] + vpsrldq $123, 4096(%rdx), %ymm24 + +// CHECK: vpsrldq $123, -4096(%rdx), %ymm24 +// CHECK: encoding: [0x62,0xf1,0x3d,0x20,0x73,0x5a,0x80,0x7b] + vpsrldq $123, -4096(%rdx), %ymm24 + +// CHECK: vpsrldq $123, -4128(%rdx), %ymm24 +// CHECK: encoding: [0x62,0xf1,0x3d,0x20,0x73,0x9a,0xe0,0xef,0xff,0xff,0x7b] + vpsrldq $123, -4128(%rdx), %ymm24 + +// CHECK: vpsrldq $171, %xmm17, %xmm18 +// CHECK: encoding: [0x62,0xb1,0x6d,0x00,0x73,0xd9,0xab] + vpsrldq $0xab, %xmm17, %xmm18 + +// CHECK: vpsrldq $123, %xmm17, %xmm18 +// CHECK: encoding: [0x62,0xb1,0x6d,0x00,0x73,0xd9,0x7b] + vpsrldq $0x7b, %xmm17, %xmm18 + +// CHECK: vpsrldq $123, (%rcx), %xmm18 +// CHECK: encoding: [0x62,0xf1,0x6d,0x00,0x73,0x19,0x7b] + vpsrldq $0x7b,(%rcx), %xmm18 + +// CHECK: vpsrldq $123, 4660(%rax,%r14,8), %xmm18 +// CHECK: encoding: [0x62,0xb1,0x6d,0x00,0x73,0x9c,0xf0,0x34,0x12,0x00,0x00,0x7b] + vpsrldq $0x7b,4660(%rax,%r14,8), %xmm18 + +// CHECK: vpsrldq $123, 2032(%rdx), %xmm18 +// CHECK: encoding: [0x62,0xf1,0x6d,0x00,0x73,0x5a,0x7f,0x7b] + vpsrldq $0x7b,2032(%rdx), %xmm18 + +// CHECK: vpsrldq $123, 2048(%rdx), %xmm18 +// CHECK: encoding: [0x62,0xf1,0x6d,0x00,0x73,0x9a,0x00,0x08,0x00,0x00,0x7b] + vpsrldq $0x7b,2048(%rdx), %xmm18 + +// CHECK: vpsrldq $123, -2048(%rdx), %xmm18 +// CHECK: encoding: [0x62,0xf1,0x6d,0x00,0x73,0x5a,0x80,0x7b] + vpsrldq $0x7b,-2048(%rdx), %xmm18 + +// CHECK: vpsrldq $123, -2064(%rdx), %xmm18 +// CHECK: encoding: [0x62,0xf1,0x6d,0x00,0x73,0x9a,0xf0,0xf7,0xff,0xff,0x7b] + vpsrldq $0x7b,-2064(%rdx), %xmm18 + +// CHECK: vpsrldq $171, %ymm28, %ymm20 +// CHECK: encoding: [0x62,0x91,0x5d,0x20,0x73,0xdc,0xab] + vpsrldq $0xab, %ymm28, %ymm20 + +// CHECK: vpsrldq $123, %ymm28, %ymm20 +// CHECK: encoding: [0x62,0x91,0x5d,0x20,0x73,0xdc,0x7b] + vpsrldq $0x7b, %ymm28, %ymm20 + +// CHECK: vpsrldq $123, (%rcx), %ymm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x20,0x73,0x19,0x7b] + vpsrldq $0x7b,(%rcx), %ymm20 + +// CHECK: vpsrldq $123, 4660(%rax,%r14,8), %ymm20 +// CHECK: encoding: [0x62,0xb1,0x5d,0x20,0x73,0x9c,0xf0,0x34,0x12,0x00,0x00,0x7b] + vpsrldq $0x7b,4660(%rax,%r14,8), %ymm20 + +// CHECK: vpsrldq $123, 4064(%rdx), %ymm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x20,0x73,0x5a,0x7f,0x7b] + vpsrldq $0x7b,4064(%rdx), %ymm20 + +// CHECK: vpsrldq $123, 4096(%rdx), %ymm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x20,0x73,0x9a,0x00,0x10,0x00,0x00,0x7b] + vpsrldq $0x7b,4096(%rdx), %ymm20 + +// CHECK: vpsrldq $123, -4096(%rdx), %ymm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x20,0x73,0x5a,0x80,0x7b] + vpsrldq $0x7b,-4096(%rdx), %ymm20 + +// CHECK: vpsrldq $123, -4128(%rdx), %ymm20 +// CHECK: encoding: [0x62,0xf1,0x5d,0x20,0x73,0x9a,0xe0,0xef,0xff,0xff,0x7b] + vpsrldq $0x7b,-4128(%rdx), %ymm20 + +// CHECK: vpsadbw %xmm24, %xmm24, %xmm17 +// CHECK: encoding: [0x62,0x81,0x3d,0x00,0xf6,0xc8] + vpsadbw %xmm24, %xmm24, %xmm17 + +// CHECK: vpsadbw (%rcx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf6,0x09] + vpsadbw (%rcx), %xmm24, %xmm17 + +// CHECK: vpsadbw 291(%rax,%r14,8), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xa1,0x3d,0x00,0xf6,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpsadbw 291(%rax,%r14,8), %xmm24, %xmm17 + +// CHECK: vpsadbw 2032(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf6,0x4a,0x7f] + vpsadbw 2032(%rdx), %xmm24, %xmm17 + +// CHECK: vpsadbw 2048(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf6,0x8a,0x00,0x08,0x00,0x00] + vpsadbw 2048(%rdx), %xmm24, %xmm17 + +// CHECK: vpsadbw -2048(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf6,0x4a,0x80] + vpsadbw -2048(%rdx), %xmm24, %xmm17 + +// CHECK: vpsadbw -2064(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf6,0x8a,0xf0,0xf7,0xff,0xff] + vpsadbw -2064(%rdx), %xmm24, %xmm17 + +// CHECK: vpsadbw %ymm24, %ymm27, %ymm19 +// CHECK: encoding: [0x62,0x81,0x25,0x20,0xf6,0xd8] + vpsadbw %ymm24, %ymm27, %ymm19 + +// CHECK: vpsadbw (%rcx), %ymm27, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x25,0x20,0xf6,0x19] + vpsadbw (%rcx), %ymm27, %ymm19 + +// CHECK: vpsadbw 291(%rax,%r14,8), %ymm27, %ymm19 +// CHECK: encoding: [0x62,0xa1,0x25,0x20,0xf6,0x9c,0xf0,0x23,0x01,0x00,0x00] + vpsadbw 291(%rax,%r14,8), %ymm27, %ymm19 + +// CHECK: vpsadbw 4064(%rdx), %ymm27, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x25,0x20,0xf6,0x5a,0x7f] + vpsadbw 4064(%rdx), %ymm27, %ymm19 + +// CHECK: vpsadbw 4096(%rdx), %ymm27, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x25,0x20,0xf6,0x9a,0x00,0x10,0x00,0x00] + vpsadbw 4096(%rdx), %ymm27, %ymm19 + +// CHECK: vpsadbw -4096(%rdx), %ymm27, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x25,0x20,0xf6,0x5a,0x80] + vpsadbw -4096(%rdx), %ymm27, %ymm19 + +// CHECK: vpsadbw -4128(%rdx), %ymm27, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x25,0x20,0xf6,0x9a,0xe0,0xef,0xff,0xff] + vpsadbw -4128(%rdx), %ymm27, %ymm19 + +// CHECK: vpsadbw %xmm21, %xmm19, %xmm30 +// CHECK: encoding: [0x62,0x21,0x65,0x00,0xf6,0xf5] + vpsadbw %xmm21, %xmm19, %xmm30 + +// CHECK: vpsadbw (%rcx), %xmm19, %xmm30 +// CHECK: encoding: [0x62,0x61,0x65,0x00,0xf6,0x31] + vpsadbw (%rcx), %xmm19, %xmm30 + +// CHECK: vpsadbw 4660(%rax,%r14,8), %xmm19, %xmm30 +// CHECK: encoding: [0x62,0x21,0x65,0x00,0xf6,0xb4,0xf0,0x34,0x12,0x00,0x00] + vpsadbw 4660(%rax,%r14,8), %xmm19, %xmm30 + +// CHECK: vpsadbw 2032(%rdx), %xmm19, %xmm30 +// CHECK: encoding: [0x62,0x61,0x65,0x00,0xf6,0x72,0x7f] + vpsadbw 2032(%rdx), %xmm19, %xmm30 + +// CHECK: vpsadbw 2048(%rdx), %xmm19, %xmm30 +// CHECK: encoding: [0x62,0x61,0x65,0x00,0xf6,0xb2,0x00,0x08,0x00,0x00] + vpsadbw 2048(%rdx), %xmm19, %xmm30 + +// CHECK: vpsadbw -2048(%rdx), %xmm19, %xmm30 +// CHECK: encoding: [0x62,0x61,0x65,0x00,0xf6,0x72,0x80] + vpsadbw -2048(%rdx), %xmm19, %xmm30 + +// CHECK: vpsadbw -2064(%rdx), %xmm19, %xmm30 +// CHECK: encoding: [0x62,0x61,0x65,0x00,0xf6,0xb2,0xf0,0xf7,0xff,0xff] + vpsadbw -2064(%rdx), %xmm19, %xmm30 + +// CHECK: vpsadbw %ymm27, %ymm26, %ymm20 +// CHECK: encoding: [0x62,0x81,0x2d,0x20,0xf6,0xe3] + vpsadbw %ymm27, %ymm26, %ymm20 + +// CHECK: vpsadbw (%rcx), %ymm26, %ymm20 +// CHECK: encoding: [0x62,0xe1,0x2d,0x20,0xf6,0x21] + vpsadbw (%rcx), %ymm26, %ymm20 + +// CHECK: vpsadbw 4660(%rax,%r14,8), %ymm26, %ymm20 +// CHECK: encoding: [0x62,0xa1,0x2d,0x20,0xf6,0xa4,0xf0,0x34,0x12,0x00,0x00] + vpsadbw 4660(%rax,%r14,8), %ymm26, %ymm20 + +// CHECK: vpsadbw 4064(%rdx), %ymm26, %ymm20 +// CHECK: encoding: [0x62,0xe1,0x2d,0x20,0xf6,0x62,0x7f] + vpsadbw 4064(%rdx), %ymm26, %ymm20 + +// CHECK: vpsadbw 4096(%rdx), %ymm26, %ymm20 +// CHECK: encoding: [0x62,0xe1,0x2d,0x20,0xf6,0xa2,0x00,0x10,0x00,0x00] + vpsadbw 4096(%rdx), %ymm26, %ymm20 + +// CHECK: vpsadbw -4096(%rdx), %ymm26, %ymm20 +// CHECK: encoding: [0x62,0xe1,0x2d,0x20,0xf6,0x62,0x80] + vpsadbw -4096(%rdx), %ymm26, %ymm20 + +// CHECK: vpsadbw -4128(%rdx), %ymm26, %ymm20 +// CHECK: encoding: [0x62,0xe1,0x2d,0x20,0xf6,0xa2,0xe0,0xef,0xff,0xff] + vpsadbw -4128(%rdx), %ymm26, %ymm20