-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=KNL
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
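+; The llc RUN lines cover 64- and 32-bit codegen for KNL (AVX-512F) and SKX
+; (AVX-512VL + AVX-512DQ); the opt RUN line checks how CodeGenPrepare
+; scalarizes the masked intrinsics for a target with no gather instruction.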
-; KNL-LABEL: test1
-; KNL: kxnorw %k1, %k1, %k1
-; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SCALAR-LABEL: test1
-; SCALAR: extractelement <16 x float*>
+; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float
define <16 x float> @test1(float* %base, <16 x i32> %ind) {
+; KNL_64-LABEL: test1:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test1:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test1:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
-
+
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
ret <16 x float>%res
}
declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
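+; Reminder on the intrinsic signature: the operands are the pointer vector,
+; the alignment, the <N x i1> mask, and a passthru value for masked-off lanes,
+; so an unmasked gather looks roughly like:
+;   %r = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %p, i32 4,
+;            <8 x i1> <i1 true, ...>, <8 x i32> undef)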
-
-; KNL-LABEL: test2
-; KNL: kmovw %esi, %k1
-; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+
; SCALAR-LABEL: test2
-; SCALAR: extractelement <16 x float*>
+; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: br label %else
; SCALAR: else:
-; SCALAR-NEXT: %res.phi.else = phi
+; SCALAR-NEXT: %res.phi.else = phi
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2
define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
+; KNL_64-LABEL: test2:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test2:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test2:
+; SKX: # BB#0:
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
ret <16 x float> %res
}
-; KNL-LABEL: test3
-; KNL: kmovw %esi, %k1
-; KNL: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
+; KNL_64-LABEL: test3:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test3:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test3:
+; SKX: # BB#0:
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
ret <16 x i32> %res
}
-; KNL-LABEL: test4
-; KNL: kmovw %esi, %k1
-; KNL: kmovw
-; KNL: vpgatherdd
-; KNL: vpgatherdd
define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
+; KNL_64-LABEL: test4:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: kmovw %k1, %k2
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm2
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
+; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test4:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: kmovw %k1, %k2
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm2
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
+; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test4:
+; SKX: # BB#0:
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kmovw %k1, %k2
+; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
+; SKX-NEXT: vmovaps %zmm1, %zmm2
+; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
+; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
ret <16 x i32> %res
}
-; KNL-LABEL: test5
-; KNL: kmovw %k1, %k2
-; KNL: vpscatterdd {{.*}}%k2
-; KNL: vpscatterdd {{.*}}%k1
; SCALAR-LABEL: test5
; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2
define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
+; KNL_64-LABEL: test5:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: kmovw %k1, %k2
+; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
+; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test5:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: kmovw %k1, %k2
+; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
+; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test5:
+; SKX: # BB#0:
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kmovw %k1, %k2
+; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
+; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
-; KNL-LABEL: test6
-; KNL: kxnorw %k1, %k1, %k1
-; KNL: kxnorw %k2, %k2, %k2
-; KNL: vpgatherqd (,%zmm{{.*}}), %ymm{{.*}} {%k2}
-; KNL: vpscatterqd %ymm{{.*}}, (,%zmm{{.*}}) {%k1}
; SCALAR-LABEL: test6
; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
+; KNL_64-LABEL: test6:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: kxnorw %k2, %k2, %k2
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test6:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
+; KNL_32-NEXT: kxnorw %k2, %k2, %k2
+; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
+; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test6:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: kxnorw %k2, %k2, %k2
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
%a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
ret <8 x i32>%a
}
-; In this case the index should be promoted to <8 x i64> for KNL
-; KNL-LABEL: test7
-; KNL: vpmovsxdq %ymm0, %zmm0
-; KNL: kmovw %k1, %k2
-; KNL: vpgatherqd {{.*}} {%k2}
-; KNL: vpgatherqd {{.*}} {%k1}
define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
+;
+; KNL_64-LABEL: test7:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: movzbl %sil, %eax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: kmovw %k1, %k2
+; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm2
+; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test7:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: kmovw %k1, %k2
+; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm2
+; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test7:
+; SKX: # BB#0:
+; SKX-NEXT: kmovb %esi, %k1
+; SKX-NEXT: kmovw %k1, %k2
+; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
+; SKX-NEXT: vmovaps %zmm1, %zmm2
+; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
+; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
; No uniform base in this case: the <8 x i64> index contains full addresses,
; so each gather call will be split into two.
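+; (On the 32-bit targets pointers are 32 bits wide, so a single dword-indexed
+; vpgatherdd handles all 16 elements and no split is needed.)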
-; KNL-LABEL: test8
-; KNL: kshiftrw $8, %k1, %k2
-; KNL: vpgatherqd
-; KNL: vpgatherqd
-; KNL: vinserti64x4
-; KNL: vpgatherqd
-; KNL: vpgatherqd
-; KNL: vinserti64x4
define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
+; KNL_64-LABEL: test8:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kmovw %edi, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: kmovw %k2, %k3
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
+; KNL_64-NEXT: kmovw %k1, %k3
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
+; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
+; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test8:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: kmovw %k1, %k2
+; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm2
+; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
+; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test8:
+; SKX: # BB#0:
+; SKX-NEXT: kmovw %edi, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: kmovw %k2, %k3
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
+; SKX-NEXT: kmovw %k1, %k3
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
+; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
+; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test8:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: kmovw %k1, %k2
+; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
+; SKX_32-NEXT: vmovaps %zmm1, %zmm2
+; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
+; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; SKX_32-NEXT: retl
+
%imask = bitcast i16 %mask to <16 x i1>
%gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
%gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
; Masked gather for aggregate types
; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
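+; With no uniform base, the per-element address (base + i1 * sizeof(%struct.ST)
+; + field offset) is expanded into broadcast/multiply/add vector arithmetic
+; whose result feeds vpgatherqd as a full address vector.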
-; KNL-LABEL: test9
-; KNL: vpbroadcastq %rdi, %zmm
-; KNL: vpmovsxdq
-; KNL: vpbroadcastq
-; KNL: vpmuludq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpgatherqd (,%zmm
define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
+; KNL_64-LABEL: test9:
+; KNL_64: # BB#0: # %entry
+; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
+; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
+; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
+; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
+; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
+; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
+; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test9:
+; KNL_32: # BB#0: # %entry
+; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
+; KNL_32-NEXT: vpbroadcastd .LCPI8_0, %ymm3
+; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
+; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
+; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3
+; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1
+; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test9:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vpbroadcastq %rdi, %zmm2
+; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX-NEXT: retq
entry:
%broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
%broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
-; KNL-LABEL: test10
-; KNL: vpbroadcastq %rdi, %zmm
-; KNL: vpmovsxdq
-; KNL: vpbroadcastq
-; KNL: vpmuludq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpaddq
-; KNL: vpgatherqd (,%zmm
define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
+; KNL_64-LABEL: test10:
+; KNL_64: # BB#0: # %entry
+; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
+; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
+; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
+; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
+; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
+; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
+; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
+; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test10:
+; KNL_32: # BB#0: # %entry
+; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
+; KNL_32-NEXT: vpbroadcastd .LCPI9_0, %ymm3
+; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
+; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
+; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3
+; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1
+; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test10:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vpbroadcastq %rdi, %zmm2
+; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX-NEXT: retq
entry:
%broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
%broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
}
; Splat index in GEP, requires broadcast
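+; The GEP here applies one scalar index to a vector of pointers, so the index
+; must be broadcast (vpbroadcastd) before it can feed vgatherdps.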
-; KNL-LABEL: test11
-; KNL: vpbroadcastd %esi, %zmm
-; KNL: vgatherdps (%rdi,%zmm
define <16 x float> @test11(float* %base, i32 %ind) {
+; KNL_64-LABEL: test11:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test11:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test11:
+; SKX: # BB#0:
+; SKX-NEXT: vpbroadcastd %esi, %zmm1
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
}
; We are checking the uniform base here. It is taken directly from the input to vgatherdps.
-; KNL-LABEL: test12
-; KNL: kxnorw %k1, %k1, %k1
-; KNL: vgatherdps (%rdi,%zmm
define <16 x float> @test12(float* %base, <16 x i32> %ind) {
+; KNL_64-LABEL: test12:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test12:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test12:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
}
; The same as the previous test, but the mask is undefined.
-; KNL-LABEL: test13
-; KNL-NOT: kxnorw
-; KNL: vgatherdps (%rdi,%zmm
define <16 x float> @test13(float* %base, <16 x i32> %ind) {
+; KNL_64-LABEL: test13:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test13:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test13:
+; SKX: # BB#0:
+; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
}
; The base pointer is not a splat, so no uniform base can be found.
-; KNL-LABEL: test14
-; KNL: vgatherqps (,%zmm0)
-; KNL: vgatherqps (,%zmm0)
define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
+; KNL_64-LABEL: test14:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
+; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
+; KNL_64-NEXT: vmovd %esi, %xmm1
+; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
+; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: kshiftrw $8, %k0, %k1
+; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
+; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test14:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
+; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
+; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
+; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
+; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test14:
+; SKX: # BB#0:
+; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
+; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
+; SKX-NEXT: vmovd %esi, %xmm1
+; SKX-NEXT: vpbroadcastd %xmm1, %ymm1
+; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
+; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: kshiftrw $8, %k0, %k1
+; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
+; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
+; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test14:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
+; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
+; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
+; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
+; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
+; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
ret <16 x float>%res
}
+declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
+declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
+
+; Gather smaller than existing instruction
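+; KNL has no 128/256-bit gathers: the <4 x i1> mask is widened with zeroed
+; upper lanes and converted to a k-register (vpblendd/vpmovsxdq/vpandq/
+; vptestmq below) so a 512-bit vgatherqps can be used; SKX keeps it narrow.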
+define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
+;
+; KNL_64-LABEL: test15:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test15:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
+; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0
+; KNL_32-NEXT: vpandq .LCPI14_0, %zmm0, %zmm0
+; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test15:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
+ %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
+ ret <4 x float>%res
+}
+
+; Gather smaller than existing instruction
+define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
+;
+; KNL_64-LABEL: test16:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test16:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: vpandq .LCPI15_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test16:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
+
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
+ %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
+ ret <4 x double>%res
+}
+
+define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
+;
+; KNL_64-LABEL: test17:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test17:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpandq .LCPI16_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test17:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
+
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
+ %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double>%res
+}
+
+declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
+declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
+declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
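+; Note the scatter operand order: the value to store comes first, followed by
+; the pointer vector, the alignment, and the mask.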
+
+define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
+;
+; KNL_64-LABEL: test18:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test18:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
+; KNL_32-NEXT: vpandq .LCPI17_0, %zmm2, %zmm2
+; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test18:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovd2m %xmm2, %k1
+; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
+define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
+;
+; KNL_64-LABEL: test19:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test19:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpandq .LCPI18_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test19:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test19:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
+; SKX_32-NEXT: retl
+ %gep = getelementptr double, double* %ptr, <4 x i64> %ind
+ call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
+ ret void
+}
+
+; Data type requires widening
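+; <2 x float> is not a legal type, so both the data and the pointer vector are
+; widened; SKX preserves the two live mask bits with kshiftlw/kshiftrw.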
+define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
+;
+; KNL_64-LABEL: test20:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_64-NEXT: vmovq %xmm2, %xmm2
+; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test20:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_32-NEXT: vmovq %xmm2, %xmm2
+; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
+; KNL_32-NEXT: vpandq .LCPI19_0, %zmm2, %zmm2
+; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test20:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovq2m %xmm2, %k0
+; SKX-NEXT: kshiftlw $2, %k0, %k0
+; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
+ ret void
+}
-; KNL-LABEL: test15
-; KNL: kmovw %eax, %k1
-; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; Data type requires promotion
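+; <2 x i32> is promoted to <2 x i64> internally; the vpshufd below re-packs the
+; low dwords so vpscatterqd can store 32-bit elements.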
+define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
+;
+; KNL_64-LABEL: test21:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test21:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: vpandq .LCPI20_0, %zmm2, %zmm2
+; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test21:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovq2m %xmm2, %k0
+; SKX-NEXT: kshiftlw $2, %k0, %k0
+; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
+ ret void
+}
-; SCALAR-LABEL: test15
-; SCALAR: extractelement <16 x float*>
+; The result type requires widening
+declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
+
+define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
+;
+; KNL_64-LABEL: test22:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_64-NEXT: vmovq %xmm1, %xmm1
+; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test22:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_32-NEXT: vmovq %xmm1, %xmm1
+; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_32-NEXT: vpandq .LCPI21_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test22:
+; SKX: # BB#0:
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT: vpmovq2m %xmm1, %k0
+; SKX-NEXT: kshiftlw $2, %k0, %k0
+; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
+ %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
+ ret <2 x float>%res
+}
+
+declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
+
+define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
+;
+; KNL_64-LABEL: test23:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test23:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpandq .LCPI22_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test23:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
+ %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
+ ret <2 x i32>%res
+}
+
+define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
+;
+; KNL_64-LABEL: test24:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: movb $3, %al
+; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test24:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
+; KNL_32-NEXT: vpandq .LCPI23_1, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test24:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
+ %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+ ret <2 x i32>%res
+}
+
+define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
+;
+; KNL_64-LABEL: test25:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test25:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpandq .LCPI24_0, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test25:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
+ %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
+ ret <2 x i64>%res
+}
+
+define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
+;
+; KNL_64-LABEL: test26:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: movb $3, %al
+; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test26:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
+; KNL_32-NEXT: vpandq .LCPI25_1, %zmm2, %zmm2
+; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test26:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
+ %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
+ ret <2 x i64>%res
+}
+
+; Result type requires widening; all-ones mask
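+; With a compile-time all-ones mask the k-register is materialized directly
+; (movb $3 + kmov below) rather than by testing a mask vector.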
+define <2 x float> @test27(float* %base, <2 x i32> %ind) {
+;
+; KNL_64-LABEL: test27:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
+; KNL_64-NEXT: movb $3, %al
+; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test27:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
+; KNL_32-NEXT: movb $3, %cl
+; KNL_32-NEXT: movzbl %cl, %ecx
+; KNL_32-NEXT: kmovw %ecx, %k1
+; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test27:
+; SKX: # BB#0:
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SKX-NEXT: movb $3, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
+; SKX-NEXT: retq
+ %sext_ind = sext <2 x i32> %ind to <2 x i64>
+ %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
+ %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
+ ret <2 x float>%res
+}
+
+; Data type requires promotion, mask is all-ones
+define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
+;
+; KNL_64-LABEL: test28:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: movb $3, %al
+; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test28:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
+; KNL_32-NEXT: vpandq .LCPI27_1, %zmm2, %zmm2
+; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test28:
+; SKX: # BB#0:
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT: movb $3, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
+ ret void
+}
+
+; SCALAR-LABEL: test29
+; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float
-define <16 x float> @test15(float* %base, <16 x i32> %ind) {
+define <16 x float> @test29(float* %base, <16 x i32> %ind) {
+; KNL_64-LABEL: test29:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: movw $44, %ax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test29:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movw $44, %cx
+; KNL_32-NEXT: kmovw %ecx, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test29:
+; SKX: # BB#0:
+; SKX-NEXT: movw $44, %ax
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
; Check non-power-of-2 case. It should be scalarized.
declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
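+; There is no gather instruction for 3 elements, so each lane is tested and
+; loaded in its own conditional block (the cond.load blocks checked below).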
-; KNL-LABEL: test16
-; KNL: testb
-; KNL: je
-; KNL: testb
-; KNL: je
-; KNL: testb
-; KNL: je
-define <3 x i32> @test16(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
+define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
+; KNL_64-LABEL: test30:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: andl $1, %edx
+; KNL_64-NEXT: kmovw %edx, %k1
+; KNL_64-NEXT: andl $1, %esi
+; KNL_64-NEXT: kmovw %esi, %k2
+; KNL_64-NEXT: movl %edi, %eax
+; KNL_64-NEXT: andl $1, %eax
+; KNL_64-NEXT: kmovw %eax, %k0
+; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
+; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; KNL_64-NEXT: # implicit-def: %XMM0
+; KNL_64-NEXT: testb $1, %dil
+; KNL_64-NEXT: je .LBB29_2
+; KNL_64-NEXT: # BB#1: # %cond.load
+; KNL_64-NEXT: vmovq %xmm1, %rax
+; KNL_64-NEXT: vmovd (%rax), %xmm0
+; KNL_64-NEXT: .LBB29_2: # %else
+; KNL_64-NEXT: kmovw %k2, %eax
+; KNL_64-NEXT: movl %eax, %ecx
+; KNL_64-NEXT: andl $1, %ecx
+; KNL_64-NEXT: testb %cl, %cl
+; KNL_64-NEXT: je .LBB29_4
+; KNL_64-NEXT: # BB#3: # %cond.load1
+; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
+; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
+; KNL_64-NEXT: .LBB29_4: # %else2
+; KNL_64-NEXT: kmovw %k1, %ecx
+; KNL_64-NEXT: movl %ecx, %edx
+; KNL_64-NEXT: andl $1, %edx
+; KNL_64-NEXT: testb %dl, %dl
+; KNL_64-NEXT: je .LBB29_6
+; KNL_64-NEXT: # BB#5: # %cond.load4
+; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL_64-NEXT: vmovq %xmm1, %rdx
+; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0
+; KNL_64-NEXT: .LBB29_6: # %else5
+; KNL_64-NEXT: kmovw %k0, %edx
+; KNL_64-NEXT: vmovd %edx, %xmm1
+; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test30:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: andl $1, %eax
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: andl $1, %eax
+; KNL_32-NEXT: kmovw %eax, %k2
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl %eax, %ecx
+; KNL_32-NEXT: andl $1, %ecx
+; KNL_32-NEXT: kmovw %ecx, %k0
+; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
+; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; KNL_32-NEXT: # implicit-def: %XMM0
+; KNL_32-NEXT: testb $1, %al
+; KNL_32-NEXT: je .LBB29_2
+; KNL_32-NEXT: # BB#1: # %cond.load
+; KNL_32-NEXT: vmovd %xmm1, %eax
+; KNL_32-NEXT: vmovd (%eax), %xmm0
+; KNL_32-NEXT: .LBB29_2: # %else
+; KNL_32-NEXT: kmovw %k2, %eax
+; KNL_32-NEXT: movl %eax, %ecx
+; KNL_32-NEXT: andl $1, %ecx
+; KNL_32-NEXT: testb %cl, %cl
+; KNL_32-NEXT: je .LBB29_4
+; KNL_32-NEXT: # BB#3: # %cond.load1
+; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
+; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
+; KNL_32-NEXT: .LBB29_4: # %else2
+; KNL_32-NEXT: kmovw %k1, %ecx
+; KNL_32-NEXT: movl %ecx, %edx
+; KNL_32-NEXT: andl $1, %edx
+; KNL_32-NEXT: testb %dl, %dl
+; KNL_32-NEXT: je .LBB29_6
+; KNL_32-NEXT: # BB#5: # %cond.load4
+; KNL_32-NEXT: vpextrd $2, %xmm1, %edx
+; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0
+; KNL_32-NEXT: .LBB29_6: # %else5
+; KNL_32-NEXT: kmovw %k0, %edx
+; KNL_32-NEXT: vmovd %edx, %xmm1
+; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test30:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovd2m %xmm2, %k1
+; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
+; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
+; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SKX-NEXT: # implicit-def: %XMM0
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: je .LBB29_2
+; SKX-NEXT: # BB#1: # %cond.load
+; SKX-NEXT: vmovq %xmm1, %rax
+; SKX-NEXT: vmovd (%rax), %xmm0
+; SKX-NEXT: .LBB29_2: # %else
+; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
+; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: je .LBB29_4
+; SKX-NEXT: # BB#3: # %cond.load1
+; SKX-NEXT: vpextrq $1, %xmm1, %rax
+; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
+; SKX-NEXT: .LBB29_4: # %else2
+; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
+; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: je .LBB29_6
+; SKX-NEXT: # BB#5: # %cond.load4
+; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1
+; SKX-NEXT: vmovq %xmm1, %rax
+; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
+; SKX-NEXT: .LBB29_6: # %else5
+; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
+; SKX-NEXT: vmovaps %zmm3, %zmm0
+; SKX-NEXT: retq
+
%sext_ind = sext <3 x i32> %ind to <3 x i64>
%gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
%res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
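+; Gathering the pointers themselves: on 64-bit targets a <16 x float*> result
+; needs two 8-element qword gathers, while 32-bit targets can use one dword
+; gather for all 16 elements.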
-; KNL-LABEL: test17
-; KNL: vpgatherqq
-; KNL: vpgatherqq
-define <16 x float*> @test17(<16 x float**> %ptrs) {
+define <16 x float*> @test31(<16 x float**> %ptrs) {
+; KNL_64-LABEL: test31:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: kxnorw %k1, %k1, %k1
+; KNL_64-NEXT: kxnorw %k2, %k2, %k2
+; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
+; KNL_64-NEXT: kshiftrw $8, %k1, %k1
+; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
+; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: vmovaps %zmm3, %zmm1
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test31:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: kxnorw %k1, %k1, %k1
+; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test31:
+; SKX: # BB#0:
+; SKX-NEXT: kxnorw %k1, %k1, %k1
+; SKX-NEXT: kxnorw %k2, %k2, %k2
+; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %zmm3, %zmm1
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test31:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: kxnorw %k1, %k1, %k1
+; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
+; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: retl
%res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
ret <16 x float*>%res
}
+
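+; In the tests below the <16 x i1> mask argument arrives as <16 x i8> in an
+; xmm register; it is sign-extended, masked down to bit 0, and tested
+; (vpmovsxbd/vpandd/vptestmd) to produce the k-mask.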
+define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
+; KNL_64-LABEL: test_gather_16i32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
+; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16i32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
+; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_16i32:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
+; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: retl
+ %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
+ ret <16 x i32> %res
+}
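+; A <16 x i64> value spans two zmm registers, so these gathers are split in
+; two; on 32-bit the second passthru half is loaded from the stack (8(%ebp)),
+; which forces the frame realignment seen below.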
+define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
+; KNL_64-LABEL: test_gather_16i64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
+; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
+; KNL_64-NEXT: vmovaps %zmm3, %zmm0
+; KNL_64-NEXT: vmovaps %zmm4, %zmm1
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16i64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp0:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp1:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp2:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI32_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
+; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
+; SKX-NEXT: vmovaps %zmm3, %zmm0
+; SKX-NEXT: vmovaps %zmm4, %zmm1
+; SKX-NEXT: retq
+ %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
+ ret <16 x i64> %res
+}
+declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
+define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_gather_16f32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
+; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16f32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI33_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16f32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
+; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: retq
+ %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+ ret <16 x float> %res
+}
+define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
+; KNL_64-LABEL: test_gather_16f64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
+; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
+; KNL_64-NEXT: vmovaps %zmm3, %zmm0
+; KNL_64-NEXT: vmovaps %zmm4, %zmm1
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16f64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp3:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp4:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp5:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI34_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16f64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
+; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
+; SKX-NEXT: vmovaps %zmm3, %zmm0
+; SKX-NEXT: vmovaps %zmm4, %zmm1
+; SKX-NEXT: retq
+ %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
+ ret <16 x double> %res
+}
+declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
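+
+; v16i32 scatter, the store-side counterpart of the gathers above: 64-bit
+; targets split the dword data across two vpscatterqd ops (the high half
+; extracted from %zmm3), while 32-bit targets store all 16 elements with a
+; single vpscatterdd.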
+define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
+; KNL_64-LABEL: test_scatter_16i32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
+; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16i32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
+; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
+; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_scatter_16i32:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
+; SKX_32-NEXT: retl
+ call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
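+
+; v16i64 scatter: two zmm data registers feed two vpscatterqq ops on 64-bit
+; targets. On 32-bit KNL the second data half again comes off the stack
+; (vmovdqa64 8(%ebp)) and each pointer half is addressed with a dword-index
+; vpscatterdq.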
+define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
+; KNL_64-LABEL: test_scatter_16i64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16i64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp6:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp7:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp8:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI36_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
+; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
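+
+; v16f32 scatter, mirroring test_gather_16f32: qword pointers force a split
+; vscatterqps pair (high data half via vextract) on 64-bit targets, while
+; 32-bit targets use one vscatterdps.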
+define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_scatter_16f32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
+; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16f32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI37_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16f32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
+; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
+; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
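+
+; v16f64 scatter, mirroring test_gather_16f64, including the stack-passed
+; second data half and the 64-byte frame realignment on 32-bit KNL.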
+define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
+; KNL_64-LABEL: test_scatter_16f64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16f64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp9:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp10:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp11:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI38_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16f64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
+; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)