From 1e9e8c6572ab99f36e5aa3d1f3f127fcedbfea43 Mon Sep 17 00:00:00 2001
From: Charlie Turner
Date: Mon, 9 Nov 2015 13:10:52 +0000
Subject: [PATCH] [AArch64] Add UABDL patterns for log2 shuffle.

Summary:
This matches the sum-of-absdiff patterns emitted by the vectoriser using log2 shuffles.

Relies on D14207 to be able to match the `extract_subvector(..., 0)`

Reviewers: t.p.northover, jmolloy

Subscribers: aemerson, llvm-commits, rengolin

Differential Revision: http://reviews.llvm.org/D14208

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@252465 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrInfo.td | 36 +++++++++++++-
 test/CodeGen/AArch64/arm64-vabs.ll     | 66 ++++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 5df147aa320..3b4fbc87b52 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2622,6 +2622,40 @@ defm FMOV : FPMoveImmediate<"fmov">;
 // Advanced SIMD two vector instructions.
 //===----------------------------------------------------------------------===//
 
+defm UABDL   : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
+                                          uabsdiff>;
+// Match UABDL in log2-shuffle patterns.
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
+               (v8i16 (add (sub (zext (v8i8 V64:$opA)),
+                                (zext (v8i8 V64:$opB))),
+                           (AArch64vashr v8i16:$src, (i32 15))))),
+          (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
+               (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
+                                (zext (extract_high_v16i8 V128:$opB))),
+                           (AArch64vashr v8i16:$src, (i32 15))))),
+          (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
+def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
+               (v4i32 (add (sub (zext (v4i16 V64:$opA)),
+                                (zext (v4i16 V64:$opB))),
+                           (AArch64vashr v4i32:$src, (i32 31))))),
+          (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
+               (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)),
+                                (zext (extract_high_v8i16 V128:$opB))),
+                           (AArch64vashr v4i32:$src, (i32 31))))),
+          (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
+def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
+               (v2i64 (add (sub (zext (v2i32 V64:$opA)),
+                                (zext (v2i32 V64:$opB))),
+                           (AArch64vashr v2i64:$src, (i32 63))))),
+          (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
+               (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)),
+                                (zext (extract_high_v4i32 V128:$opB))),
+                           (AArch64vashr v2i64:$src, (i32 63))))),
+          (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;
+
 defm ABS    : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
 def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))),
                (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))),
@@ -3375,8 +3409,6 @@ defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
                  BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
 defm UABAL   : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
                                               uabsdiff>;
-defm UABDL   : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
-                                          uabsdiff>;
 defm UADDL   : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
                  BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
 defm UADDW   : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll
index a52c4ebf13e..c1800085884 100644
--- a/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/test/CodeGen/AArch64/arm64-vabs.ll
@@ -134,6 +134,72 @@ define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
   ret <2 x i64> %tmp4
 }
 
+define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
+; CHECK-LABEL: uabdl8h_log2_shuffle
+; CHECK: uabdl2.8h
+; CHECK: uabdl.8h
+  %aload = load <16 x i8>, <16 x i8>* %a, align 1
+  %bload = load <16 x i8>, <16 x i8>* %b, align 1
+  %aext = zext <16 x i8> %aload to <16 x i16>
+  %bext = zext <16 x i8> %bload to <16 x i16>
+  %abdiff = sub nsw <16 x i16> %aext, %bext
+  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
+  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
+  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
+  %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin1.rdx = add <16 x i16> %absel, %rdx.shuf
+  %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx
+  %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136
+  %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138
+  %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0
+  ret i16 %reduced_v
+}
+
+define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
+; CHECK-LABEL: uabdl4s_log2_shuffle
+; CHECK: uabdl2.4s
+; CHECK: uabdl.4s
+  %aload = load <8 x i16>, <8 x i16>* %a, align 1
+  %bload = load <8 x i16>, <8 x i16>* %b, align 1
+  %aext = zext <8 x i16> %aload to <8 x i32>
+  %bext = zext <8 x i16> %bload to <8 x i32>
+  %abdiff = sub nsw <8 x i32> %aext, %bext
+  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
+  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
+  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
+  %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %absel, %rdx.shuf
+  %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136
+  %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138
+  %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0
+  ret i32 %reduced_v
+}
+
+define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
+; CHECK-LABEL: uabdl2d_log2_shuffle
+; CHECK: uabdl2.2d
+; CHECK: uabdl.2d
+  %aload = load <4 x i32>, <4 x i32>* %a, align 1
+  %bload = load <4 x i32>, <4 x i32>* %b, align 1
+  %aext = zext <4 x i32> %aload to <4 x i64>
+  %bext = zext <4 x i32> %bload to <4 x i64>
+  %abdiff = sub nsw <4 x i64> %aext, %bext
+  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
+  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
+  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
+  %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136
+  %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138
+  %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0
+  ret i64 %reduced_v
+}
+
 define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
 ;CHECK-LABEL: fabd_2s:
 ;CHECK: fabd.2s
-- 
2.34.1