From e5716c4e3ac95bb5f204390720af3a186814f2cb Mon Sep 17 00:00:00 2001
From: Robert Lougher
Date: Tue, 12 Jan 2016 11:48:25 +0000
Subject: [PATCH] The isel pattern that selects the memory-register form of VCVTPH2PS (64 to 128-bit) matches against the pattern fragment 'vzmovl_v2i64' (a zero-extended 64-bit load). However, a change in r248784 teaches the instruction combiner that only the lower 64 bits of the input to a 128-bit vcvtph2ps are used. This means the instruction combiner will ordinarily optimize away the upper 64-bit insertelement instruction in the zero-extension and so we no longer select the memory-register form. To fix this a new pattern has been added.

Differential Revision: http://reviews.llvm.org/D16067

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@257470 91177308-0d34-0410-b5e6-96231b3b80d8
---
Reviewer notes (free-form text in this section is ignored by `git am`):

  * lib/Target/X86/X86InstrSSE.td: adds one more int_x86_vcvtph2ps_128
    pattern next to the existing vzload_v2i64 one. It selects VCVTPH2PSrm
    when the intrinsic operand is
      (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))
    i.e. a 64-bit load placed in a vector without the zero-extension of
    the upper half that the existing patterns require.

  * test/CodeGen/X86/f16c-intrinsics.ll: adds
    test_x86_vcvtps2ph_128_scalar2, which calls llvm.x86.vcvtph2ps.128 on
    an i64 load inserted into element 0 of an undef <2 x i64>. The name
    says "vcvtps2ph" like the preceding test, but the instruction checked
    is vcvtph2ps. The CHECK-NOT/CHECK pair requires that no "vmov" appears
    between the function label and the "vcvtph2ps (%" match.

  * NOTE(review): this copy of the patch was recovered from a version whose
    line breaks and leading whitespace had been collapsed. The indentation
    of context and added lines below is a best-effort reconstruction;
    confirm against upstream r257470, or apply with
    `git apply --ignore-whitespace`.

 lib/Target/X86/X86InstrSSE.td       |  3 +++
 test/CodeGen/X86/f16c-intrinsics.ll | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 859ff378182..6a7c45665e9 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -8257,6 +8257,9 @@ let Predicates = [HasF16C] in {
             (VCVTPH2PSrm addr:$src)>;
   def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
             (VCVTPH2PSrm addr:$src)>;
+  def : Pat<(int_x86_vcvtph2ps_128 (bitconvert
+              (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (VCVTPH2PSrm addr:$src)>;
 
   def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
                   (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
diff --git a/test/CodeGen/X86/f16c-intrinsics.ll b/test/CodeGen/X86/f16c-intrinsics.ll
index 485592aeac3..a78022ac550 100644
--- a/test/CodeGen/X86/f16c-intrinsics.ll
+++ b/test/CodeGen/X86/f16c-intrinsics.ll
@@ -61,6 +61,18 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
   ret <4 x float> %res
 }
 
+define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) {
+; CHECK-LABEL: test_x86_vcvtps2ph_128_scalar2:
+; CHECK-NOT: vmov
+; CHECK: vcvtph2ps (%
+
+  %load = load i64, i64* %ptr
+  %ins = insertelement <2 x i64> undef, i64 %load, i32 0
+  %bc = bitcast <2 x i64> %ins to <8 x i16>
+  %res = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %bc)
+  ret <4 x float> %res
+}
+
 define void @test_x86_vcvtps2ph_256_m(<8 x i16>* nocapture %d, <8 x float> %a) nounwind {
 entry:
 ; CHECK-LABEL: test_x86_vcvtps2ph_256_m:
-- 
2.34.1