From: Craig Topper
Date: Fri, 16 Aug 2013 06:07:34 +0000 (+0000)
Subject: Don't use v16i32 for load pattern matching. All 512-bit integer loads are cast to v8i64.
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=0163356ad1944dda162956993a95e547dc03251b;p=oota-llvm.git

Don't use v16i32 for load pattern matching. All 512-bit integer loads are cast to v8i64.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188534 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index f4528a9b3e0..d100e88454f 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -555,7 +555,7 @@ let Constraints = "$src1 = $dst" in {
                     (bitconvert (mem_frag addr:$src3)))))]>, EVEX_4V;
   }
 }
-defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem,
+defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv8i64, i512mem,
                   v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
 defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem,
                   v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
@@ -1107,7 +1107,7 @@ def VMOVDQA64rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
 }
 
 multiclass avx512_mov_int opc, string asm, RegisterClass RC,
-                          RegisterClass KRC,
+                          RegisterClass KRC, PatFrag bc_frag,
                           PatFrag ld_frag, X86MemOperand x86memop> {
 let neverHasSideEffects = 1 in
   def rr : AVX512XSI,
+              [(set RC:$dst, (bc_frag (ld_frag addr:$src)))]>, EVEX;
 let Constraints = "$src1 = $dst" in {
   def rrk : AVX512XSI,
-                EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VMOVDQU64 : avx512_mov_int<0x6F, "vmovdqu64", VR512, VK8WM, memopv8i64, i512mem>,
-                EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VMOVDQU32 : avx512_mov_int<0x6F, "vmovdqu32", VR512, VK16WM, bc_v16i32,
+                                memopv8i64, i512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVDQU64 : avx512_mov_int<0x6F, "vmovdqu64", VR512, VK8WM, bc_v8i64,
+                                memopv8i64, i512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
 let AddedComplexity = 20 in {
 def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1),

diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 3d6370fdac6..fe35393fc58 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -277,7 +277,6 @@ def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
 // 512-bit load pattern fragments
 def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
 def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
-def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
 def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
 
 // 128-/256-/512-bit extload pattern fragments
@@ -351,8 +350,6 @@
 def alignedloadv16f32 : PatFrag<(ops node:$ptr),
                                 (v16f32 (alignedload512 node:$ptr))>;
 def alignedloadv8f64 : PatFrag<(ops node:$ptr),
                                (v8f64 (alignedload512 node:$ptr))>;
-def alignedloadv16i32 : PatFrag<(ops node:$ptr),
-                                (v16i32 (alignedload512 node:$ptr))>;
 def alignedloadv8i64 : PatFrag<(ops node:$ptr),
                                (v8i64 (alignedload512 node:$ptr))>;
@@ -379,14 +376,12 @@ def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
 // 256-bit memop pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
 def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
-def memopv8i32 : PatFrag<(ops node:$ptr), (v8i32 (memop node:$ptr))>;
 def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
 def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
 
 // 512-bit memop pattern fragments
 def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop node:$ptr))>;
 def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop node:$ptr))>;
-def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop node:$ptr))>;
 def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop node:$ptr))>;
 
 // SSSE3 uses MMX registers for some instructions. They aren't aligned on a
@@ -438,6 +433,11 @@ def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
 def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
 def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
 
+// 512-bit bitconvert pattern fragments
+def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
+def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
+
+
 def vzmovl_v2i64 : PatFrag<(ops node:$src),
                            (bitconvert (v2i64 (X86vzmovl
                             (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;

diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index 9f3d86a5e64..8f8bf42ad36 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -64,3 +64,44 @@ define <8 x i64> @test6(<8 x i64> %a) nounwind {
   ret <8 x i64> %c
 }
 
+; CHECK: test7:
+; CHECK: vpermi2q
+; CHECK: ret
+define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
+  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
+  ret <8 x i64> %c
+}
+
+; CHECK: test8:
+; CHECK: vpermi2d
+; CHECK: ret
+define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32>
+  ret <16 x i32> %c
+}
+
+; CHECK: test9:
+; CHECK: vpermi2ps
+; CHECK: ret
+define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
+  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32>
+  ret <16 x float> %c
+}
+
+; CHECK: test10:
+; CHECK: vpermi2ps (
+; CHECK: ret
+define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
+  %c = load <16 x float>* %b
+  %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32>
+  ret <16 x float> %d
+}
+
+; CHECK: test11:
+; CHECK: vpermi2d (
+; CHECK: ret
+define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
+  %c = load <16 x i32>* %b
+  %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32>
+  ret <16 x i32> %d
+}
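
For context on the fragment idiom this change standardizes on: type legalization turns every 512-bit integer vector load into a v8i64 load, so a pattern that wants a v16i32 memory operand has to match a bitconvert of the v8i64 load fragment (bc_v16i32 over memopv8i64) rather than a direct v16i32 load fragment, which is why loadv16i32/memopv16i32 can be deleted. A minimal TableGen sketch of that shape, for illustration only; X86SomeOp and SOMEOPZrm below are hypothetical placeholder names, not definitions from this patch:

    // Hypothetical pattern: after legalization the DAG holds
    // (v16i32 (bitconvert (v8i64 (load addr)))), so the memory form
    // folds the load through bc_v16i32 wrapped around memopv8i64.
    // X86SomeOp / SOMEOPZrm are made-up names used only to show the shape.
    def : Pat<(v16i32 (X86SomeOp VR512:$src1,
                                 (bc_v16i32 (memopv8i64 addr:$src2)))),
              (SOMEOPZrm VR512:$src1, addr:$src2)>;

This is the same shape the diff gives VMOVDQU32 (bc_v16i32 of memopv8i64 in avx512_mov_int) and the vpermi2d load pattern, where avx512_perm_3src already wraps its mem_frag in a bitconvert and now receives memopv8i64.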