From bf1ab36dc35472153e97ea3a188a117767b47a43 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 4 Aug 2015 08:05:27 +0000
Subject: [PATCH] [InstCombine] Split off SSE2/AVX2 vector shift tests.

These aren't vector demanded bits tests. More tests to follow.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@243963 91177308-0d34-0410-b5e6-96231b3b80d8
---
NOTE(review): this copy of the patch was rebuilt from a whitespace-collapsed
extract. Line breaks follow the hunk headers (210 removed / 212 added lines).
The vector-constant operands of the psll.w/psrl.w calls and the CHECK'd result
vectors had been stripped and were reconstructed; they are arithmetically
consistent with the shift chains, but verify against r243963 before applying.

 .../InstCombine/vec_demanded_elts.ll          | 210 -----------------
 .../InstCombine/x86-vector-shifts.ll          | 212 ++++++++++++++++++
 2 files changed, 212 insertions(+), 210 deletions(-)
 create mode 100644 test/Transforms/InstCombine/x86-vector-shifts.ll

diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index dabbeb05a34..5a2825ebfc3 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -253,213 +253,3 @@ define <4 x double> @test_vpermilvar_pd_256_zero(<4 x double> %v) {
   %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> zeroinitializer)
   ret <4 x double> %a
 }
-
-define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
-  %S = bitcast i32 1 to i32
-  %1 = zext i32 %S to i64
-  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
-  %3 = insertelement <2 x i64> %2, i64 0, i32 1
-  %4 = bitcast <2 x i64> %3 to <8 x i16>
-  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
-  %6 = bitcast <8 x i16> %5 to <4 x i32>
-  %7 = bitcast <2 x i64> %3 to <4 x i32>
-  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
-  %9 = bitcast <4 x i32> %8 to <2 x i64>
-  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
-  %11 = bitcast <2 x i64> %10 to <8 x i16>
-  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
-  %13 = bitcast <8 x i16> %12 to <4 x i32>
-  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
-  %15 = bitcast <4 x i32> %14 to <2 x i64>
-  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
-  ret <2 x i64> %16
-; CHECK: test_sse2_1
-; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
-}
-
-define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
-  %S = bitcast i32 1 to i32
-  %1 = zext i32 %S to i64
-  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
-  %3 = insertelement <2 x i64> %2, i64 0, i32 1
-  %4 = bitcast <2 x i64> %3 to <8 x i16>
-  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
-  %6 = bitcast <16 x i16> %5 to <8 x i32>
-  %7 = bitcast <2 x i64> %3 to <4 x i32>
-  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
-  %9 = bitcast <8 x i32> %8 to <4 x i64>
-  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
-  %11 = bitcast <4 x i64> %10 to <16 x i16>
-  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
-  %13 = bitcast <16 x i16> %12 to <8 x i32>
-  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
-  %15 = bitcast <8 x i32> %14 to <4 x i64>
-  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
-  ret <4 x i64> %16
-; CHECK: test_avx2_1
-; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
-}
-
-define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
-  %S = bitcast i32 128 to i32
-  %1 = zext i32 %S to i64
-  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
-  %3 = insertelement <2 x i64> %2, i64 0, i32 1
-  %4 = bitcast <2 x i64> %3 to <8 x i16>
-  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
-  %6 = bitcast <8 x i16> %5 to <4 x i32>
-  %7 = bitcast <2 x i64> %3 to <4 x i32>
-  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
-  %9 = bitcast <4 x i32> %8 to <2 x i64>
-  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
-  %11 = bitcast <2 x i64> %10 to <8 x i16>
-  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
-  %13 = bitcast <8 x i16> %12 to <4 x i32>
-  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
-  %15 = bitcast <4 x i32> %14 to <2 x i64>
-  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
-  ret <2 x i64> %16
-; CHECK: test_sse2_0
-; CHECK: ret <2 x i64> zeroinitializer
-}
-
-define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
-  %S = bitcast i32 128 to i32
-  %1 = zext i32 %S to i64
-  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
-  %3 = insertelement <2 x i64> %2, i64 0, i32 1
-  %4 = bitcast <2 x i64> %3 to <8 x i16>
-  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
-  %6 = bitcast <16 x i16> %5 to <8 x i32>
-  %7 = bitcast <2 x i64> %3 to <4 x i32>
-  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
-  %9 = bitcast <8 x i32> %8 to <4 x i64>
-  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
-  %11 = bitcast <4 x i64> %10 to <16 x i16>
-  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
-  %13 = bitcast <16 x i16> %12 to <8 x i32>
-  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
-  %15 = bitcast <8 x i32> %14 to <4 x i64>
-  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
-  ret <4 x i64> %16
-; CHECK: test_avx2_0
-; CHECK: ret <4 x i64> zeroinitializer
-}
-define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable {
-  %S = bitcast i32 1 to i32
-  %1 = zext i32 %S to i64
-  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
-  %3 = insertelement <2 x i64> %2, i64 0, i32 1
-  %4 = bitcast <2 x i64> %3 to <8 x i16>
-  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
-  %6 = bitcast <8 x i16> %5 to <4 x i32>
-  %7 = bitcast <2 x i64> %3 to <4 x i32>
-  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
-  %9 = bitcast <4 x i32> %8 to <2 x i64>
-  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
-  %11 = bitcast <2 x i64> %10 to <8 x i16>
-  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
-  %13 = bitcast <8 x i16> %12 to <4 x i32>
-  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
-  %15 = bitcast <4 x i32> %14 to <2 x i64>
-  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
-  ret <2 x i64> %16
-; CHECK: test_sse2_psrl_1
-; CHECK: ret <2 x i64> <i64 562954248421376, i64 9007267974742020>
-}
-
-define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable {
-  %S = bitcast i32 1 to i32
-  %1 = zext i32 %S to i64
-  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
-  %3 = insertelement <2 x i64> %2, i64 0, i32 1
-  %4 = bitcast <2 x i64> %3 to <8 x i16>
-  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
-  %6 = bitcast <16 x i16> %5 to <8 x i32>
-  %7 = bitcast <2 x i64> %3 to <4 x i32>
-  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
-  %9 = bitcast <8 x i32> %8 to <4 x i64>
-  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
-  %11 = bitcast <4 x i64> %10 to <16 x i16>
-  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
-  %13 = bitcast <16 x i16> %12 to <8 x i32>
-  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
-  %15 = bitcast <8 x i32> %14 to <4 x i64>
-  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
-  ret <4 x i64> %16
-; CHECK: test_avx2_psrl_1
-; CHECK: ret <4 x i64> <i64 16, i64 32, i64 64, i64 128>
-}
-
-define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable {
-  %S = bitcast i32 128 to i32
-  %1 = zext i32 %S to i64
-  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
-  %3 = insertelement <2 x i64> %2, i64 0, i32 1
-  %4 = bitcast <2 x i64> %3 to <8 x i16>
-  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4)
-  %6 = bitcast <8 x i16> %5 to <4 x i32>
-  %7 = bitcast <2 x i64> %3 to <4 x i32>
-  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
-  %9 = bitcast <4 x i32> %8 to <2 x i64>
-  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
-  %11 = bitcast <2 x i64> %10 to <8 x i16>
-  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
-  %13 = bitcast <8 x i16> %12 to <4 x i32>
-  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
-  %15 = bitcast <4 x i32> %14 to <2 x i64>
-  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
-  ret <2 x i64> %16
-; CHECK: test_sse2_psrl_0
-; CHECK: ret <2 x i64> zeroinitializer
-}
-
-define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable {
-  %S = bitcast i32 128 to i32
-  %1 = zext i32 %S to i64
-  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
-  %3 = insertelement <2 x i64> %2, i64 0, i32 1
-  %4 = bitcast <2 x i64> %3 to <8 x i16>
-  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
-  %6 = bitcast <16 x i16> %5 to <8 x i32>
-  %7 = bitcast <2 x i64> %3 to <4 x i32>
-  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
-  %9 = bitcast <8 x i32> %8 to <4 x i64>
-  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
-  %11 = bitcast <4 x i64> %10 to <16 x i16>
-  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
-  %13 = bitcast <16 x i16> %12 to <8 x i32>
-  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
-  %15 = bitcast <8 x i32> %14 to <4 x i64>
-  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
-  ret <4 x i64> %16
-; CHECK: test_avx2_psrl_0
-; CHECK: ret <4 x i64> zeroinitializer
-}
-
-declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
-declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
-declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
-declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
-declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
-declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
-declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
-declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
-declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
-declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
-declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
-declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
-declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1
-declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1
-declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1
-declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1
-declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1
-declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
-declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
-declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
-declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
-declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
-declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
-declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1
-
-attributes #1 = { nounwind readnone }
diff --git a/test/Transforms/InstCombine/x86-vector-shifts.ll b/test/Transforms/InstCombine/x86-vector-shifts.ll
new file mode 100644
index 00000000000..b2dcfa63290
--- /dev/null
+++ b/test/Transforms/InstCombine/x86-vector-shifts.ll
@@ -0,0 +1,212 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
+  %S = bitcast i32 1 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <4 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <4 x i32> %8 to <2 x i64>
+  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <2 x i64> %10 to <8 x i16>
+  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
+  %13 = bitcast <8 x i16> %12 to <4 x i32>
+  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
+  %15 = bitcast <4 x i32> %14 to <2 x i64>
+  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
+  ret <2 x i64> %16
+; CHECK: test_sse2_1
+; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
+}
+
+define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
+  %S = bitcast i32 1 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
+  %6 = bitcast <16 x i16> %5 to <8 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <8 x i32> %8 to <4 x i64>
+  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <4 x i64> %10 to <16 x i16>
+  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
+  %13 = bitcast <16 x i16> %12 to <8 x i32>
+  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
+  %15 = bitcast <8 x i32> %14 to <4 x i64>
+  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
+  ret <4 x i64> %16
+; CHECK: test_avx2_1
+; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
+}
+
+define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
+  %S = bitcast i32 128 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <4 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <4 x i32> %8 to <2 x i64>
+  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <2 x i64> %10 to <8 x i16>
+  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
+  %13 = bitcast <8 x i16> %12 to <4 x i32>
+  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
+  %15 = bitcast <4 x i32> %14 to <2 x i64>
+  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
+  ret <2 x i64> %16
+; CHECK: test_sse2_0
+; CHECK: ret <2 x i64> zeroinitializer
+}
+
+define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
+  %S = bitcast i32 128 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
+  %6 = bitcast <16 x i16> %5 to <8 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <8 x i32> %8 to <4 x i64>
+  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <4 x i64> %10 to <16 x i16>
+  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
+  %13 = bitcast <16 x i16> %12 to <8 x i32>
+  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
+  %15 = bitcast <8 x i32> %14 to <4 x i64>
+  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
+  ret <4 x i64> %16
+; CHECK: test_avx2_0
+; CHECK: ret <4 x i64> zeroinitializer
+}
+define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable {
+  %S = bitcast i32 1 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <4 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <4 x i32> %8 to <2 x i64>
+  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <2 x i64> %10 to <8 x i16>
+  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
+  %13 = bitcast <8 x i16> %12 to <4 x i32>
+  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
+  %15 = bitcast <4 x i32> %14 to <2 x i64>
+  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
+  ret <2 x i64> %16
+; CHECK: test_sse2_psrl_1
+; CHECK: ret <2 x i64> <i64 562954248421376, i64 9007267974742020>
+}
+
+define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable {
+  %S = bitcast i32 1 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
+  %6 = bitcast <16 x i16> %5 to <8 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <8 x i32> %8 to <4 x i64>
+  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <4 x i64> %10 to <16 x i16>
+  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
+  %13 = bitcast <16 x i16> %12 to <8 x i32>
+  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
+  %15 = bitcast <8 x i32> %14 to <4 x i64>
+  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
+  ret <4 x i64> %16
+; CHECK: test_avx2_psrl_1
+; CHECK: ret <4 x i64> <i64 16, i64 32, i64 64, i64 128>
+}
+
+define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable {
+  %S = bitcast i32 128 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <4 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <4 x i32> %8 to <2 x i64>
+  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <2 x i64> %10 to <8 x i16>
+  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
+  %13 = bitcast <8 x i16> %12 to <4 x i32>
+  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
+  %15 = bitcast <4 x i32> %14 to <2 x i64>
+  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
+  ret <2 x i64> %16
+; CHECK: test_sse2_psrl_0
+; CHECK: ret <2 x i64> zeroinitializer
+}
+
+define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable {
+  %S = bitcast i32 128 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
+  %6 = bitcast <16 x i16> %5 to <8 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <8 x i32> %8 to <4 x i64>
+  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <4 x i64> %10 to <16 x i16>
+  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
+  %13 = bitcast <16 x i16> %12 to <8 x i32>
+  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
+  %15 = bitcast <8 x i32> %14 to <4 x i64>
+  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
+  ret <4 x i64> %16
+; CHECK: test_avx2_psrl_0
+; CHECK: ret <4 x i64> zeroinitializer
+}
+
+declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
+declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
+declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
+declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
+declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
+declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
+declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1
+declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1
+declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1
+declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1
+declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1
+declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
+declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1
+
+attributes #1 = { nounwind readnone }
-- 
2.34.1