From d636e64cbcb85252ffe281d640b9a42ff608d832 Mon Sep 17 00:00:00 2001
From: Ahmed Bougacha
Date: Thu, 10 Sep 2015 01:42:28 +0000
Subject: [PATCH] [AArch64] Support selecting STNP.

We could go through the load/store optimizer and match STNP where
we would have matched a nontemporal-annotated STP, but that's an
opportunistic optimization, and not reliable enough. Instead, we can
guarantee emitting STNP by matching at ISel.
Since there are no single-input nontemporal stores, we have to resort
to some high-bits-extracting trickery to generate an STNP from a
plain store.

Also, we need to support another, LDP/STP-specific addressing mode:
base + signed scaled 7-bit immediate offset.
For now, only match the base. Let's make it smart separately.

Part of PR24086.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247231 91177308-0d34-0410-b5e6-96231b3b80d8
---
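A minimal sketch of where the !nontemporal stores matched by this patch
come from (this note sits between the "---" marker and the diffstat, where
"git am" ignores it). It assumes a clang that provides the
__builtin_nontemporal_store builtin, which lowers to an LLVM IR store
carrying !nontemporal metadata; any frontend emitting such a store works
equally well. The assembly in the comments mirrors the test_stnp_i64 case
added below; xN stands for whichever scratch register the allocator picks.

    // nontemporal_example.cpp; illustrative only, not part of the commit.
    #include <cstdint>

    void write_streaming(std::uint64_t *p, std::uint64_t v) {
      // There is no single-input STNP, so ISel splits the value:
      // w1 already holds the low word, and a 32-bit logical shift right
      // recovers the high word before the pair store:
      //   lsr  xN, x1, #32
      //   stnp w1, wN, [x0]
      __builtin_nontemporal_store(v, p);
    }

The vector cases are handled the same way, except that the high half is
peeled off with an element move (mov dN, v0[1]) rather than a shift, as
the D/S-register patterns and tests below show.
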
The "Size" argument is the size in bytes of the memory /// reference, which determines the scale. diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 0903f320601..d644f264eb9 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -248,6 +248,12 @@ def simm7s16 : Operand { let PrintMethod = "printImmScale<16>"; } +def am_indexed7s8 : ComplexPattern; +def am_indexed7s16 : ComplexPattern; +def am_indexed7s32 : ComplexPattern; +def am_indexed7s64 : ComplexPattern; +def am_indexed7s128 : ComplexPattern; + class AsmImmRange : AsmOperandClass { let Name = "Imm" # Low # "_" # High; let DiagnosticType = "InvalidImm" # Low # "_" # High; diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index d3c244977ec..5f01debf4ce 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -5825,6 +5825,45 @@ def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +// Patterns for nontemporal/no-allocate stores. +// We have to resort to tricks to turn a single-input store into a store pair, +// because there is no single-input nontemporal store, only STNP. +let Predicates = [IsLE] in { +let AddedComplexity = 15 in { +class NTStore128Pat : + Pat<(nontemporalstore (VT FPR128:$Rt), + (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), + (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub), + (CPYi64 FPR128:$Rt, (i64 1)), + GPR64sp:$Rn, simm7s8:$offset)>; + +def : NTStore128Pat; +def : NTStore128Pat; +def : NTStore128Pat; +def : NTStore128Pat; + +class NTStore64Pat : + Pat<(nontemporalstore (VT FPR64:$Rt), + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub), + (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)), + GPR64sp:$Rn, simm7s4:$offset)>; + +// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64? +def : NTStore64Pat; +def : NTStore64Pat; +def : NTStore64Pat; +def : NTStore64Pat; +def : NTStore64Pat; + +def : Pat<(nontemporalstore GPR64:$Rt, + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), + (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32), + GPR64sp:$Rn, simm7s4:$offset)>; +} // AddedComplexity=10 +} // Predicates = [IsLE] + // Tail call return handling. These are all compiler pseudo-instructions, // so no encoding information or anything like that. 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
diff --git a/test/CodeGen/AArch64/nontemporal.ll b/test/CodeGen/AArch64/nontemporal.ll
new file mode 100644
index 00000000000..6db05cb4877
--- /dev/null
+++ b/test/CodeGen/AArch64/nontemporal.ll
@@ -0,0 +1,192 @@
+; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false | FileCheck %s
+
+define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
+; CHECK-LABEL: test_stnp_v4i64:
+; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #16
+; CHECK-NEXT:  mov d[[HI1:[0-9]+]], v1[1]
+; CHECK-NEXT:  mov d[[HI0:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d1, d[[HI1]], [x[[PTR]]]
+; CHECK-NEXT:  stnp d0, d[[HI0]], [x0]
+; CHECK-NEXT:  ret
+  store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4i32(<4 x i32>* %p, <4 x i32> %v) #0 {
+; CHECK-LABEL: test_stnp_v4i32:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <4 x i32> %v, <4 x i32>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v8i16(<8 x i16>* %p, <8 x i16> %v) #0 {
+; CHECK-LABEL: test_stnp_v8i16:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <8 x i16> %v, <8 x i16>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
+; CHECK-LABEL: test_stnp_v16i8:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <16 x i8> %v, <16 x i8>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2i32(<2 x i32>* %p, <2 x i32> %v) #0 {
+; CHECK-LABEL: test_stnp_v2i32:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <2 x i32> %v, <2 x i32>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4i16(<4 x i16>* %p, <4 x i16> %v) #0 {
+; CHECK-LABEL: test_stnp_v4i16:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <4 x i16> %v, <4 x i16>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v8i8(<8 x i8>* %p, <8 x i8> %v) #0 {
+; CHECK-LABEL: test_stnp_v8i8:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <8 x i8> %v, <8 x i8>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f64(<2 x double>* %p, <2 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f64:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <2 x double> %v, <2 x double>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4f32(<4 x float>* %p, <4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <4 x float> %v, <4 x float>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f32(<2 x float>* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <2 x float> %v, <2 x float>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v1f64(<1 x double>* %p, <1 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v1f64:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <1 x double> %v, <1 x double>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v1i64(<1 x i64>* %p, <1 x i64> %v) #0 {
+; CHECK-LABEL: test_stnp_v1i64:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <1 x i64> %v, <1 x i64>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_i64(i64* %p, i64 %v) #0 {
+; CHECK-LABEL: test_stnp_i64:
+; CHECK-NEXT:  lsr x[[HI:[0-9]+]], x1, #32
+; CHECK-NEXT:  stnp w1, w[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store i64 %v, i64* %p, align 1, !nontemporal !0
+  ret void
+}
+
+
+define void @test_stnp_v2f64_offset(<2 x double>* %p, <2 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f64_offset:
+; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #16
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 1
+  store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f64_offset_neg(<2 x double>* %p, <2 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f64_offset_neg:
+; CHECK-NEXT:  sub x[[PTR:[0-9]+]], x0, #16
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 -1
+  store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f32_offset(<2 x float>* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32_offset:
+; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #8
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 1
+  store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f32_offset_neg(<2 x float>* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32_offset_neg:
+; CHECK-NEXT:  sub x[[PTR:[0-9]+]], x0, #8
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 -1
+  store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_i64_offset(i64* %p, i64 %v) #0 {
+; CHECK-LABEL: test_stnp_i64_offset:
+; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #8
+; CHECK-NEXT:  lsr x[[HI:[0-9]+]], x1, #32
+; CHECK-NEXT:  stnp w1, w[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i64, i64* %p, i32 1
+  store i64 %v, i64* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_i64_offset_neg(i64* %p, i64 %v) #0 {
+; CHECK-LABEL: test_stnp_i64_offset_neg:
+; CHECK-NEXT:  sub x[[PTR:[0-9]+]], x0, #8
+; CHECK-NEXT:  lsr x[[HI:[0-9]+]], x1, #32
+; CHECK-NEXT:  stnp w1, w[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i64, i64* %p, i32 -1
+  store i64 %v, i64* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+!0 = !{ i32 1 }
+
+attributes #0 = { nounwind }
-- 
2.34.1