From: Ahmed Bougacha Date: Thu, 20 Aug 2015 20:36:19 +0000 (+0000) Subject: [X86] Replace avx2 broadcast intrinsics with native IR. X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=ad0ddd8e0190ef7ebb162921e53115f9d969f939;p=oota-llvm.git [X86] Replace avx2 broadcast intrinsics with native IR. Since r245605, the clang headers don't use these anymore. r245165 updated some of the tests already; update the others, add an autoupgrade, remove the intrinsics, and clean up the definitions. Differential Revision: http://reviews.llvm.org/D10555 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245606 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 0d60aefed15..05affdbcd79 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -2167,39 +2167,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Vector load with broadcast let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
- def int_x86_avx2_vbroadcast_ss_ps : - GCCBuiltin<"__builtin_ia32_vbroadcastss_ps">, - Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_avx2_vbroadcast_sd_pd_256 : - GCCBuiltin<"__builtin_ia32_vbroadcastsd_pd256">, - Intrinsic<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; - def int_x86_avx2_vbroadcast_ss_ps_256 : - GCCBuiltin<"__builtin_ia32_vbroadcastss_ps256">, - Intrinsic<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastb_128 : - GCCBuiltin<"__builtin_ia32_pbroadcastb128">, - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastb_256 : - GCCBuiltin<"__builtin_ia32_pbroadcastb256">, - Intrinsic<[llvm_v32i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastw_128 : - GCCBuiltin<"__builtin_ia32_pbroadcastw128">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastw_256 : - GCCBuiltin<"__builtin_ia32_pbroadcastw256">, - Intrinsic<[llvm_v16i16_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastd_128 : - GCCBuiltin<"__builtin_ia32_pbroadcastd128">, - Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastd_256 : - GCCBuiltin<"__builtin_ia32_pbroadcastd256">, - Intrinsic<[llvm_v8i32_ty], [llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastq_128 : - GCCBuiltin<"__builtin_ia32_pbroadcastq128">, - Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastq_256 : - GCCBuiltin<"__builtin_ia32_pbroadcastq256">, - Intrinsic<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_mask_pbroadcast_d_gpr_512 : GCCBuiltin<"__builtin_ia32_pbroadcastd512_gpr_mask">, Intrinsic<[llvm_v16i32_ty], [llvm_i32_ty, llvm_v16i32_ty, diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index f1c6ebd4846..aeefa38f74d 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -129,6 +129,8 @@ static bool UpgradeIntrinsicFunction1(Function *F, 
Function *&NewFn) { Name.startswith("x86.sse2.pcmpgt.") || Name.startswith("x86.avx2.pcmpeq.") || Name.startswith("x86.avx2.pcmpgt.") || + Name.startswith("x86.avx2.vbroadcast") || + Name.startswith("x86.avx2.pbroadcast") || Name.startswith("x86.avx.vpermil.") || Name == "x86.avx.vinsertf128.pd.256" || Name == "x86.avx.vinsertf128.ps.256" || @@ -447,6 +449,14 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { const int Idxs[4] = { 0, 1, 0, 1 }; Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()), Idxs); + } else if (Name.startswith("llvm.x86.avx2.pbroadcast") || + Name.startswith("llvm.x86.avx2.vbroadcast")) { + // Replace vp?broadcasts with a vector shuffle. + Value *Op = CI->getArgOperand(0); + unsigned NumElts = CI->getType()->getVectorNumElements(); + Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts); + Rep = Builder.CreateShuffleVector(Op, UndefValue::get(Op->getType()), + Constant::getNullValue(MaskTy)); } else if (Name == "llvm.x86.sse2.psll.dq") { // 128-bit shift left specified in bits. 
unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index f383f5e0c51..f186738a3a9 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7823,13 +7823,7 @@ def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), // VBROADCAST - Load from memory and broadcast to all elements of the // destination operand // -class avx_broadcast opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> : - AVX8I, Sched<[Sched]>, VEX; - -class avx_broadcast_no_int opc, string OpcodeStr, RegisterClass RC, +class avx_broadcast_rm opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType VT, PatFrag ld_frag, SchedWrite Sched> : AVX8I opc, string OpcodeStr, RegisterClass RC, } // AVX2 adds register forms -class avx2_broadcast_reg opc, string OpcodeStr, RegisterClass RC, - Intrinsic Int, SchedWrite Sched> : +class avx2_broadcast_rr opc, string OpcodeStr, RegisterClass RC, + ValueType ResVT, ValueType OpVT, SchedWrite Sched> : AVX28I, Sched<[Sched]>, VEX; + [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, + Sched<[Sched]>, VEX; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128, + def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, f32mem, v4f32, loadf32, WriteLoad>; - def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256, + def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, f32mem, v8f32, loadf32, WriteFShuffleLd>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem, +def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, v4f64, loadf64, WriteFShuffleLd>, VEX_L; -def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, - int_x86_avx_vbroadcastf128_pd_256, - WriteFShuffleLd>, 
VEX_L; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, - int_x86_avx2_vbroadcast_ss_ps, - WriteFShuffle>; - def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256, - int_x86_avx2_vbroadcast_ss_ps_256, - WriteFShuffle256>, VEX_L; + def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, + v4f32, v4f32, WriteFShuffle>; + def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, + v8f32, v4f32, WriteFShuffle256>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, - int_x86_avx2_vbroadcast_sd_pd_256, - WriteFShuffle256>, VEX_L; +def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, + v4f64, v2f64, WriteFShuffle256>, VEX_L; let mayLoad = 1, Predicates = [HasAVX2] in def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), @@ -7879,6 +7868,13 @@ def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, Sched<[WriteLoad]>, VEX, VEX_L; +def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), + (ins f128mem:$src), + "vbroadcastf128\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>, + Sched<[WriteFShuffleLd]>, VEX, VEX_L; + let Predicates = [HasAVX] in def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), (VBROADCASTF128 addr:$src)>; @@ -8317,83 +8313,31 @@ defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, // multiclass avx2_broadcast opc, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag, - Intrinsic Int128, Intrinsic Int256> { + ValueType OpVT128, ValueType OpVT256> { def rr : AVX28I, + [(set VR128:$dst, (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, Sched<[WriteShuffle]>, VEX; def rm : AVX28I, + [(set VR128:$dst, (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX; def Yrr : AVX28I, + [(set VR256:$dst, (OpVT256 (X86VBroadcast 
(OpVT128 VR128:$src))))]>, Sched<[WriteShuffle256]>, VEX, VEX_L; def Yrm : AVX28I, + [(set VR256:$dst, (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX, VEX_L; } -defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, - int_x86_avx2_pbroadcastb_128, - int_x86_avx2_pbroadcastb_256>; -defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, - int_x86_avx2_pbroadcastw_128, - int_x86_avx2_pbroadcastw_256>; -defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, - int_x86_avx2_pbroadcastd_128, - int_x86_avx2_pbroadcastd_256>; -defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, - int_x86_avx2_pbroadcastq_128, - int_x86_avx2_pbroadcastq_256>; +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, v16i8, v32i8>; +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, v8i16, v16i16>; +defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, v4i32, v8i32>; +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, v2i64, v4i64>; let Predicates = [HasAVX2] in { - def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBrm addr:$src)>; - def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBYrm addr:$src)>; - def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWrm addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWYrm addr:$src)>; - def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDrm addr:$src)>; - def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDYrm addr:$src)>; - def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQrm addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQYrm addr:$src)>; - - def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))), - (VPBROADCASTBrr VR128:$src)>; - def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))), - 
(VPBROADCASTBYrr VR128:$src)>; - def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWrr VR128:$src)>; - def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWYrr VR128:$src)>; - def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDrr VR128:$src)>; - def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDYrr VR128:$src)>; - def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))), - (VPBROADCASTQrr VR128:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))), - (VPBROADCASTQYrr VR128:$src)>; - def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))), - (VBROADCASTSSrr VR128:$src)>; - def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))), - (VBROADCASTSSYrr VR128:$src)>; - def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))), - (VPBROADCASTQrr VR128:$src)>; - def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), - (VBROADCASTSDYrr VR128:$src)>; - // Provide aliases for broadcast from the same register class that // automatically does the extract. 
def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll index a30d8371775..36b6da5ef96 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -83,3 +83,123 @@ define <4 x i64> @test_x86_avx2_vinserti128(<4 x i64> %a0, <2 x i64> %a1) { } declare <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64>, <2 x i64>, i8) nounwind readnone + +define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) { + ; CHECK-LABEL: test_x86_avx2_vbroadcast_sd_pd_256: + ; CHECK: ## BB#0: + ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 + ; CHECK-NEXT: retl + %res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind readonly + + +define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) { + ; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps: + ; CHECK: ## BB#0: + ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 + ; CHECK-NEXT: retl + %res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readonly + + +define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) { + ; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps_256: + ; CHECK: ## BB#0: + ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 + ; CHECK-NEXT: retl + %res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind readonly + + +define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastb_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0) + ret <16 x i8> %res 
+} +declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly + + +define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastb_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 +; CHECK-NEXT: retl + %res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0) + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly + + +define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastw_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0) + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly + + +define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastw_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 +; CHECK-NEXT: retl + %res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0) + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly + + +define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly + + +define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly + + +define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> 
%a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastq_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0) + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly + + +define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastq_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: retl + %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index 5b607afef91..3b2a009f271 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -641,30 +641,6 @@ define <4 x i64> @test_x86_avx2_pmul.dq(<8 x i32> %a0, <8 x i32> %a1) { declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone -define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) { - ; CHECK: vbroadcastsd - %res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0) ; <<4 x double>> [#uses=1] - ret <4 x double> %res -} -declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind readonly - - -define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) { - ; CHECK: vbroadcastss - %res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readonly - - -define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) { - ; CHECK: vbroadcastss - %res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0) ; <<8 x float>> [#uses=1] - ret <8 x float> %res -} -declare <8 x float> 
@llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind readonly - - define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK: vpblendd %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1] @@ -681,70 +657,6 @@ define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone -define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) { - ; CHECK: vpbroadcastb - %res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly - - -define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) { - ; CHECK: vpbroadcastb - %res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly - - -define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) { - ; CHECK: vpbroadcastw - %res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly - - -define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) { - ; CHECK: vpbroadcastw - %res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly - - -define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) { - ; CHECK: vbroadcastss - %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly - - -define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) { - ; CHECK: 
vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}} - %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) ; <<8 x i32>> [#uses=1] - ret <8 x i32> %res -} -declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly - - -define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) { - ; CHECK: vpbroadcastq - %res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly - - -define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) { - ; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}} - %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly - - define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) { ; Check that the arguments are swapped between the intrinsic definition ; and its lowering. 
Indeed, the offsets are the first source in diff --git a/test/CodeGen/X86/stack-folding-int-avx2.ll b/test/CodeGen/X86/stack-folding-int-avx2.ll index 03241bbb741..bcfdfc57030 100644 --- a/test/CodeGen/X86/stack-folding-int-avx2.ll +++ b/test/CodeGen/X86/stack-folding-int-avx2.ll @@ -12,7 +12,7 @@ define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) { ;CHECK-LABEL: stack_fold_broadcastsd_ymm ;CHECK: vbroadcastsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0) + %2 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer ; fadd forces execution domain %3 = fadd <4 x double> %2, ret <4 x double> %3 @@ -23,7 +23,7 @@ define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) { ;CHECK-LABEL: stack_fold_broadcastss ;CHECK: vbroadcastss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0) + %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer ; fadd forces execution domain %3 = fadd <4 x float> %2, ret <4 x float> %3 @@ -34,7 +34,7 @@ define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) { ;CHECK-LABEL: stack_fold_broadcastss_ymm ;CHECK: vbroadcastss {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x float> 
@llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0) + %2 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer ; fadd forces execution domain %3 = fadd <8 x float> %2, ret <8 x float> %3