From f586decae108e6ea1c5de3d68785226baca31d47 Mon Sep 17 00:00:00 2001
From: Mohammad Shahid <Asghar-ahmad.Shahid@amd.com>
Date: Thu, 24 Sep 2015 10:35:03 +0000
Subject: [PATCH] Codegen: Fix llvm.*absdiff semantic.

Fixes the overflow case of the llvm.*absdiff intrinsics; also updates the tests and LangRef.rst accordingly.

Differential Revision: http://reviews.llvm.org/D11678

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@248483 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                              |  23 ++-
 .../SelectionDAG/LegalizeVectorOps.cpp        |  38 ++--
 test/CodeGen/X86/absdiff_128.ll               | 181 ++++++++++++++++++
 test/CodeGen/X86/absdiff_256.ll               |  29 +++
 4 files changed, 245 insertions(+), 26 deletions(-)
 create mode 100644 test/CodeGen/X86/absdiff_128.ll
 create mode 100644 test/CodeGen/X86/absdiff_256.ll
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index c7ceb73da37..a609722d80e 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -10920,16 +10920,19 @@ This is an overloaded intrinsic. The loaded data is a vector of any integer bit
 Overview:
 """""""""
 
-The ``llvm.uabsdiff`` intrinsic returns a vector result of the absolute difference of
-the two operands, treating them both as unsigned integers.
+The ``llvm.uabsdiff`` intrinsic returns a vector result of the absolute difference
+of the two operands, treating them both as unsigned integers. The intermediate
+calculations are computed using infinitely precise unsigned arithmetic. The final
+result will be truncated to the given type.
 
 The ``llvm.sabsdiff`` intrinsic returns a vector result of the absolute difference of
-the two operands, treating them both as signed integers.
+the two operands, treating them both as signed integers. If the result overflows, the
+behavior is undefined.
 
 .. note::
 
     These intrinsics are primarily used during the code generation stage of compilation.
-    They are generated by compiler passes such as the Loop and SLP vectorizers.it is not
+    They are generated by compiler passes such as the Loop and SLP vectorizers. It is not
     recommended for users to create them manually.
 
 Arguments:
@@ -10946,19 +10949,19 @@ The expression::
 
 is equivalent to::
 
-    %sub = sub <4 x i32> %a, %b
-    %ispos = icmp ugt <4 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1>
-    %neg = sub <4 x i32> zeroinitializer, %sub
-    %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg
+    %1 = zext <4 x i32> %a to <4 x i64>
+    %2 = zext <4 x i32> %b to <4 x i64>
+    %sub = sub <4 x i64> %1, %2
+    %trunc = trunc <4 x i64> %sub to <4 x i32>
 
-Similarly the expression::
+and the expression::
 
     call <4 x i32> @llvm.sabsdiff.v4i32(<4 x i32> %a, <4 x i32> %b)
 
 is equivalent to::
 
     %sub = sub nsw <4 x i32> %a, %b
-    %ispos = icmp sgt <4 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1>
+    %ispos = icmp sge <4 x i32> %sub, zeroinitializer
     %neg = sub nsw <4 x i32> zeroinitializer, %sub
     %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 35ccd13adcc..329a0179178 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -723,24 +723,30 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
 
 SDValue VectorLegalizer::ExpandABSDIFF(SDValue Op) {
   SDLoc dl(Op);
-  SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
   EVT VT = Op.getValueType();
+
+  // For unsigned intrinsic, promote the type to handle unsigned overflow.
+  bool isUabsdiff = (Op->getOpcode() == ISD::UABSDIFF);
+  if (isUabsdiff) {
+    VT = VT.widenIntegerVectorElementType(*DAG.getContext());
+    Op0 = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op0);
+    Op1 = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op1);
+  }
+
   SDNodeFlags Flags;
-  Flags.setNoSignedWrap(Op->getOpcode() == ISD::SABSDIFF);
-
-  Tmp2 = Op.getOperand(0);
-  Tmp3 = Op.getOperand(1);
-  Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp3, &Flags);
-  Tmp2 =
-      DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Tmp1, &Flags);
-  Tmp4 = DAG.getNode(
-      ISD::SETCC, dl,
-      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Tmp2,
-      DAG.getConstant(0, dl, VT),
-      DAG.getCondCode(Op->getOpcode() == ISD::SABSDIFF ? ISD::SETLT
-                                                       : ISD::SETULT));
-  Tmp1 = DAG.getNode(ISD::VSELECT, dl, VT, Tmp4, Tmp1, Tmp2);
-  return Tmp1;
+  Flags.setNoSignedWrap(!isUabsdiff);
+  SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op0, Op1, &Flags);
+  if (isUabsdiff)
+    return DAG.getNode(ISD::TRUNCATE, dl, Op.getValueType(), Sub);
+
+  SDValue Cmp =
+      DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(DAG.getDataLayout(),
+                                                         *DAG.getContext(), VT),
+                  Sub, DAG.getConstant(0, dl, VT), DAG.getCondCode(ISD::SETGE));
+  SDValue Neg = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Sub, &Flags);
+  return DAG.getNode(ISD::VSELECT, dl, VT, Cmp, Sub, Neg);
 }
 
 SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
diff --git a/test/CodeGen/X86/absdiff_128.ll b/test/CodeGen/X86/absdiff_128.ll
new file mode 100644
index 00000000000..24055ccc79e
--- /dev/null
+++ b/test/CodeGen/X86/absdiff_128.ll
@@ -0,0 +1,181 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare <4 x i8> @llvm.uabsdiff.v4i8(<4 x i8>, <4 x i8>)
+
+define <4 x i8> @test_uabsdiff_v4i8_expand(<4 x i8> %a1, <4 x i8> %a2) {
+; CHECK-LABEL: test_uabsdiff_v4i8_expand
+; CHECK:      pshufd
+; CHECK:      movd
+; CHECK:      subl
+; CHECK:      punpckldq
+; CHECK-DAG:  movd   %xmm1, [[SRC:%.*]]
+; CHECK-DAG:  movd   %xmm0, [[DST:%.*]]
+; CHECK:      subl [[SRC]], [[DST]]
+; CHECK:      movd
+; CHECK:      pshufd
+; CHECK:      movd
+; CHECK:      punpckldq
+; CHECK:      movdqa
+; CHECK:      retq
+
+  %1 = call <4 x i8> @llvm.uabsdiff.v4i8(<4 x i8> %a1, <4 x i8> %a2)
+  ret <4 x i8> %1
+}
+
+declare <4 x i8> @llvm.sabsdiff.v4i8(<4 x i8>, <4 x i8>)
+
+define <4 x i8> @test_sabsdiff_v4i8_expand(<4 x i8> %a1, <4 x i8> %a2) {
+; CHECK-LABEL: test_sabsdiff_v4i8_expand
+; CHECK:      psubd
+; CHECK:      pcmpgtd
+; CHECK:      pcmpeqd
+; CHECK:      pxor
+; CHECK-DAG:  psubd  {{%xmm[0-9]+}}, [[SRC1:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  {{%xmm[0-9]+}}, [[SRC2:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  [[SRC1]], [[DST:%xmm[0-9]+]]
+; CHECK:      por    [[SRC2]], [[DST]]
+; CHECK:      retq
+
+  %1 = call <4 x i8> @llvm.sabsdiff.v4i8(<4 x i8> %a1, <4 x i8> %a2)
+  ret <4 x i8> %1
+}
+
+declare <8 x i8> @llvm.sabsdiff.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_sabsdiff_v8i8_expand(<8 x i8> %a1, <8 x i8> %a2) {
+; CHECK-LABEL: test_sabsdiff_v8i8_expand
+; CHECK:      psubw
+; CHECK:      pcmpgtw
+; CHECK:      pcmpeqd
+; CHECK:      pxor
+; CHECK-DAG:  psubw  {{%xmm[0-9]+}}, [[SRC1:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  {{%xmm[0-9]+}}, [[SRC2:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  [[SRC1]], [[DST:%xmm[0-9]+]]
+; CHECK:      por    [[SRC2]], [[DST]]
+; CHECK:      retq
+
+  %1 = call <8 x i8> @llvm.sabsdiff.v8i8(<8 x i8> %a1, <8 x i8> %a2)
+  ret <8 x i8> %1
+}
+
+declare <16 x i8> @llvm.uabsdiff.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uabsdiff_v16i8_expand(<16 x i8> %a1, <16 x i8> %a2) {
+; CHECK-LABEL: test_uabsdiff_v16i8_expand
+; CHECK:      movd
+; CHECK:      movzbl
+; CHECK:      movzbl
+; CHECK:      subl
+; CHECK:      punpcklbw
+; CHECK:      retq
+
+  %1 = call <16 x i8> @llvm.uabsdiff.v16i8(<16 x i8> %a1, <16 x i8> %a2)
+  ret <16 x i8> %1
+}
+
+declare <8 x i16> @llvm.uabsdiff.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uabsdiff_v8i16_expand(<8 x i16> %a1, <8 x i16> %a2) {
+; CHECK-LABEL: test_uabsdiff_v8i16_expand
+; CHECK:      pextrw
+; CHECK:      pextrw
+; CHECK:      subl
+; CHECK:      punpcklwd
+; CHECK:      retq
+
+  %1 = call <8 x i16> @llvm.uabsdiff.v8i16(<8 x i16> %a1, <8 x i16> %a2)
+  ret <8 x i16> %1
+}
+
+declare <8 x i16> @llvm.sabsdiff.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_sabsdiff_v8i16_expand(<8 x i16> %a1, <8 x i16> %a2) {
+; CHECK-LABEL: test_sabsdiff_v8i16_expand
+; CHECK:      psubw
+; CHECK:      pcmpgtw
+; CHECK:      pcmpeqd
+; CHECK:      pxor
+; CHECK-DAG:  psubw  {{%xmm[0-9]+}}, [[SRC1:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  {{%xmm[0-9]+}}, [[SRC2:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  [[SRC1]], [[DST:%xmm[0-9]+]]
+; CHECK:      por    [[SRC2]], [[DST]]
+; CHECK:      retq
+
+  %1 = call <8 x i16> @llvm.sabsdiff.v8i16(<8 x i16> %a1, <8 x i16> %a2)
+  ret <8 x i16> %1
+}
+
+declare <4 x i32> @llvm.sabsdiff.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_sabsdiff_v4i32_expand(<4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: test_sabsdiff_v4i32_expand
+; CHECK:      psubd
+; CHECK:      pcmpgtd
+; CHECK:      pcmpeqd
+; CHECK:      pxor
+; CHECK-DAG:  psubd  {{%xmm[0-9]+}}, [[SRC1:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  {{%xmm[0-9]+}}, [[SRC2:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  [[SRC1]], [[DST:%xmm[0-9]+]]
+; CHECK:      por    [[SRC2]], [[DST]]
+; CHECK:      retq
+  %1 = call <4 x i32> @llvm.sabsdiff.v4i32(<4 x i32> %a1, <4 x i32> %a2)
+  ret <4 x i32> %1
+}
+
+declare <4 x i32> @llvm.uabsdiff.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uabsdiff_v4i32_expand(<4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: test_uabsdiff_v4i32_expand
+; CHECK:      pshufd
+; CHECK:      movd
+; CHECK:      subl
+; CHECK:      punpckldq
+; CHECK-DAG:  movd   %xmm1, [[SRC:%.*]]
+; CHECK-DAG:  movd   %xmm0, [[DST:%.*]]
+; CHECK:      subl [[SRC]], [[DST]]
+; CHECK:      movd
+; CHECK:      pshufd
+; CHECK:      movd
+; CHECK:      punpckldq
+; CHECK:      movdqa
+; CHECK:      retq
+
+  %1 = call <4 x i32> @llvm.uabsdiff.v4i32(<4 x i32> %a1, <4 x i32> %a2)
+  ret <4 x i32> %1
+}
+
+declare <2 x i32> @llvm.sabsdiff.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_sabsdiff_v2i32_expand(<2 x i32> %a1, <2 x i32> %a2) {
+; CHECK-LABEL: test_sabsdiff_v2i32_expand
+; CHECK:      psubq
+; CHECK:      pcmpgtd
+; CHECK:      pcmpeqd
+; CHECK:      pxor
+; CHECK-DAG:  psubq  {{%xmm[0-9]+}}, [[SRC1:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  {{%xmm[0-9]+}}, [[SRC2:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  [[SRC1]], [[DST:%xmm[0-9]+]]
+; CHECK:      por    [[SRC2]], [[DST]]
+; CHECK:      retq
+
+  %1 = call <2 x i32> @llvm.sabsdiff.v2i32(<2 x i32> %a1, <2 x i32> %a2)
+  ret <2 x i32> %1
+}
+
+declare <2 x i64> @llvm.sabsdiff.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_sabsdiff_v2i64_expand(<2 x i64> %a1, <2 x i64> %a2) {
+; CHECK-LABEL: test_sabsdiff_v2i64_expand
+; CHECK:      psubq
+; CHECK:      pcmpgtd
+; CHECK:      pcmpeqd
+; CHECK:      pxor
+; CHECK-DAG:  psubq  {{%xmm[0-9]+}}, [[SRC1:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  {{%xmm[0-9]+}}, [[SRC2:%xmm[0-9]+]]
+; CHECK-DAG:  pandn  [[SRC1]], [[DST:%xmm[0-9]+]]
+; CHECK:      por    [[SRC2]], [[DST]]
+; CHECK:      retq
+
+  %1 = call <2 x i64> @llvm.sabsdiff.v2i64(<2 x i64> %a1, <2 x i64> %a2)
+  ret <2 x i64> %1
+}
diff --git a/test/CodeGen/X86/absdiff_256.ll b/test/CodeGen/X86/absdiff_256.ll
new file mode 100644
index 00000000000..acc8a1fa51d
--- /dev/null
+++ b/test/CodeGen/X86/absdiff_256.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown  | FileCheck %s
+
+declare <16 x i16> @llvm.sabsdiff.v16i16(<16 x i16>, <16 x i16>)
+
+define <16 x i16> @test_sabsdiff_v16i16_expand(<16 x i16> %a1, <16 x i16> %a2) {
+; CHECK-LABEL: test_sabsdiff_v16i16_expand:
+; CHECK:       # BB#0:
+; CHECK:         psubw
+; CHECK:         pxor
+; CHECK:         pcmpgtw
+; CHECK:         movdqa
+; CHECK:         pandn
+; CHECK:         pxor
+; CHECK:         psubw
+; CHECK:         pcmpeqd
+; CHECK:         pxor
+; CHECK:         pandn
+; CHECK:         por
+; CHECK:         pcmpgtw
+; CHECK-DAG:     psubw {{%xmm[0-9]+}}, [[SRC:%xmm[0-9]+]]
+; CHECK-DAG:     pxor {{%xmm[0-9]+}}, [[DST:%xmm[0-9]+]]
+; CHECK:         pandn [[SRC]], [[DST]]
+; CHECK:         por
+; CHECK:         movdqa
+; CHECK:         retq
+  %1 = call <16 x i16> @llvm.sabsdiff.v16i16(<16 x i16> %a1, <16 x i16> %a2)
+  ret <16 x i16> %1
+}
+
-- 
2.34.1