From acd275a629652c97f6ed136de0873258814465d7 Mon Sep 17 00:00:00 2001 From: James Molloy Date: Thu, 16 Jul 2015 15:22:46 +0000 Subject: [PATCH] [Codegen] Add intrinsics 'absdiff' and corresponding SDNodes for absolute difference operation This adds new intrinsics "*absdiff" for absolute difference ops to facilitate efficient code generation for "sum of absolute differences" operation. The patch also contains the introduction of corresponding SDNodes and basic legalization support.Sanity of the generated code is tested on X86. This is 1st of the three patches. Patch by Shahid Asghar-ahmad! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242409 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/LangRef.rst | 59 +++++ include/llvm/CodeGen/ISDOpcodes.h | 4 + include/llvm/IR/Intrinsics.td | 6 + include/llvm/Target/TargetSelectionDAG.td | 2 + .../SelectionDAG/LegalizeIntegerTypes.cpp | 4 + .../SelectionDAG/LegalizeVectorOps.cpp | 28 ++ .../SelectionDAG/LegalizeVectorTypes.cpp | 2 + .../SelectionDAG/SelectionDAGBuilder.cpp | 12 + .../SelectionDAG/SelectionDAGDumper.cpp | 2 + lib/CodeGen/TargetLoweringBase.cpp | 2 + test/CodeGen/X86/absdiff_expand.ll | 242 ++++++++++++++++++ 11 files changed, 363 insertions(+) create mode 100644 test/CodeGen/X86/absdiff_expand.ll diff --git a/docs/LangRef.rst b/docs/LangRef.rst index e7d6f67c939..17ee4b32bc3 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -10328,6 +10328,65 @@ Examples: %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c + +'``llvm.uabsdiff.*``' and '``llvm.sabsdiff.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. The loaded data is a vector of any integer bit width. + +.. 
code-block:: llvm + + declare <4 x integer> @llvm.uabsdiff.v4i32(<4 x integer> %a, <4 x integer> %b) + + +Overview: +""""""""" + +The ``llvm.uabsdiff`` intrinsic returns a vector result of the absolute difference of the two operands, +treating them both as unsigned integers. + +The ``llvm.sabsdiff`` intrinsic returns a vector result of the absolute difference of the two operands, +treating them both as signed integers. + +.. note:: + + These intrinsics are primarily used during the code generation stage of compilation. + They are generated by compiler passes such as the Loop and SLP vectorizers. It is not + recommended for users to create them manually. + +Arguments: +"""""""""" + +Both intrinsics take two integer vector arguments of the same bit width. + +Semantics: +"""""""""" + +The expression:: + + call <4 x i32> @llvm.uabsdiff.v4i32(<4 x i32> %a, <4 x i32> %b) + +is equivalent to:: + + %sub = sub <4 x i32> %a, %b + %ispos = icmp uge <4 x i32> %a, %b + %neg = sub <4 x i32> zeroinitializer, %sub + %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg + +Similarly the expression:: + + call <4 x i32> @llvm.sabsdiff.v4i32(<4 x i32> %a, <4 x i32> %b) + +is equivalent to:: + + %sub = sub nsw <4 x i32> %a, %b + %ispos = icmp sgt <4 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1> + %neg = sub nsw <4 x i32> zeroinitializer, %sub + %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg + + Half Precision Floating Point Intrinsics ---------------------------------------- diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index fa44301a2d4..8a4b779f03a 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -334,6 +334,10 @@ namespace ISD { /// Byte Swap and Counting operators. BSWAP, CTTZ, CTLZ, CTPOP, + /// [SU]ABSDIFF - Signed/Unsigned absolute difference of two input integer + /// vectors. These nodes are generated from llvm.*absdiff* intrinsics. + SABSDIFF, UABSDIFF, + /// Bit counting operators with an undefined result for zero inputs.
CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF, diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index bbae720b4e1..af312be186c 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -605,6 +605,12 @@ def int_convertuu : Intrinsic<[llvm_anyint_ty], def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], [], "llvm.clear_cache">; +// Calculate the Absolute Differences of the two input vectors. +def int_sabsdiff : Intrinsic<[llvm_anyvector_ty], + [ LLVMMatchType<0>, LLVMMatchType<0> ], [IntrNoMem]>; +def int_uabsdiff : Intrinsic<[llvm_anyvector_ty], + [ LLVMMatchType<0>, LLVMMatchType<0> ], [IntrNoMem]>; + //===-------------------------- Masked Intrinsics -------------------------===// // def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>, diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td index 4abbe379399..6c7eef14715 100644 --- a/include/llvm/Target/TargetSelectionDAG.td +++ b/include/llvm/Target/TargetSelectionDAG.td @@ -386,6 +386,8 @@ def smax : SDNode<"ISD::SMAX" , SDTIntBinOp>; def umin : SDNode<"ISD::UMIN" , SDTIntBinOp>; def umax : SDNode<"ISD::UMAX" , SDTIntBinOp>; +def sabsdiff : SDNode<"ISD::SABSDIFF" , SDTIntBinOp>; +def uabsdiff : SDNode<"ISD::UABSDIFF" , SDTIntBinOp>; def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>; def bswap : SDNode<"ISD::BSWAP" , SDTIntUnaryOp>; def ctlz : SDNode<"ISD::CTLZ" , SDTIntUnaryOp>; diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 9f060a09a0f..511239ce477 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -146,6 +146,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: Res = PromoteIntRes_AtomicCmpSwap(cast(N), ResNo); break; + case ISD::UABSDIFF: + case ISD::SABSDIFF: + Res = 
PromoteIntRes_SimpleIntBinOp(N); + break; } // If the result is null then the sub-method took care of registering it. diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 83d4ad5ea1f..0f25a610724 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -105,6 +105,7 @@ class VectorLegalizer { SDValue ExpandLoad(SDValue Op); SDValue ExpandStore(SDValue Op); SDValue ExpandFNEG(SDValue Op); + SDValue ExpandABSDIFF(SDValue Op); /// \brief Implements vector promotion. /// @@ -326,6 +327,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: + case ISD::UABSDIFF: + case ISD::SABSDIFF: QueryType = Node->getValueType(0); break; case ISD::FP_ROUND_INREG: @@ -708,11 +711,36 @@ SDValue VectorLegalizer::Expand(SDValue Op) { return ExpandFNEG(Op); case ISD::SETCC: return UnrollVSETCC(Op); + case ISD::UABSDIFF: + case ISD::SABSDIFF: + return ExpandABSDIFF(Op); default: return DAG.UnrollVectorOp(Op.getNode()); } } +SDValue VectorLegalizer::ExpandABSDIFF(SDValue Op) { + SDLoc dl(Op); + SDValue Tmp1, Tmp2, Tmp3, Tmp4; + EVT VT = Op.getValueType(); + SDNodeFlags Flags; + Flags.setNoSignedWrap(Op->getOpcode() == ISD::SABSDIFF); + + Tmp2 = Op.getOperand(0); + Tmp3 = Op.getOperand(1); + Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp3, &Flags); + Tmp2 = + DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Tmp1, &Flags); + Tmp4 = DAG.getNode( + ISD::SETCC, dl, + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Tmp2, + DAG.getConstant(0, dl, VT), + DAG.getCondCode(Op->getOpcode() == ISD::SABSDIFF ? ISD::SETLT + : ISD::SETULT)); + Tmp1 = DAG.getNode(ISD::VSELECT, dl, VT, Tmp4, Tmp1, Tmp2); + return Tmp1; +} + SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { // Lower a select instruction where the condition is a scalar and the // operands are vectors. 
Lower this select to VSELECT and implement it diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 4348ab79f7d..5f9afc9cfc5 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -678,6 +678,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: + case ISD::UABSDIFF: + case ISD::SABSDIFF: SplitVecRes_BinOp(N, Lo, Hi); break; case ISD::FMA: diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 5c8db914845..73de6e3cfbd 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4646,6 +4646,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); return nullptr; + case Intrinsic::uabsdiff: + setValue(&I, DAG.getNode(ISD::UABSDIFF, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); + return nullptr; + case Intrinsic::sabsdiff: + setValue(&I, DAG.getNode(ISD::SABSDIFF, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); + return nullptr; case Intrinsic::cttz: { SDValue Arg = getValue(I.getArgOperand(0)); ConstantInt *CI = cast(I.getArgOperand(1)); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 5b9b18286fa..8dabddc642b 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -225,6 +225,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SHL_PARTS: return "shl_parts"; case ISD::SRA_PARTS: return "sra_parts"; case ISD::SRL_PARTS: return "srl_parts"; + case 
ISD::UABSDIFF: return "uabsdiff"; + case ISD::SABSDIFF: return "sabsdiff"; // Conversion operators. case ISD::SIGN_EXTEND: return "sign_extend"; diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 50240bf7046..e6d07f5134b 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -827,6 +827,8 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::USUBO, VT, Expand); setOperationAction(ISD::SMULO, VT, Expand); setOperationAction(ISD::UMULO, VT, Expand); + setOperationAction(ISD::UABSDIFF, VT, Expand); + setOperationAction(ISD::SABSDIFF, VT, Expand); // These library functions default to expand. setOperationAction(ISD::FROUND, VT, Expand); diff --git a/test/CodeGen/X86/absdiff_expand.ll b/test/CodeGen/X86/absdiff_expand.ll new file mode 100644 index 00000000000..8ba87274d9b --- /dev/null +++ b/test/CodeGen/X86/absdiff_expand.ll @@ -0,0 +1,242 @@ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=CHECK + +declare <4 x i8> @llvm.uabsdiff.v4i8(<4 x i8>, <4 x i8>) + +define <4 x i8> @test_uabsdiff_v4i8_expand(<4 x i8> %a1, <4 x i8> %a2) { +; CHECK-LABEL: test_uabsdiff_v4i8_expand +; CHECK: psubd %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubd %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm2 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: retq + + %1 = call <4 x i8> @llvm.uabsdiff.v4i8(<4 x i8> %a1, <4 x i8> %a2) + ret <4 x i8> %1 +} + +declare <4 x i8> @llvm.sabsdiff.v4i8(<4 x i8>, <4 x i8>) + +define <4 x i8> @test_sabsdiff_v4i8_expand(<4 x i8> %a1, <4 x i8> %a2) { +; CHECK-LABEL: test_sabsdiff_v4i8_expand +; CHECK: psubd %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: psubd %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm1 
+; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm1 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: retq + + %1 = call <4 x i8> @llvm.sabsdiff.v4i8(<4 x i8> %a1, <4 x i8> %a2) + ret <4 x i8> %1 +} + + +declare <8 x i8> @llvm.sabsdiff.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_sabsdiff_v8i8_expand(<8 x i8> %a1, <8 x i8> %a2) { +; CHECK-LABEL: test_sabsdiff_v8i8_expand +; CHECK: psubw %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: psubw %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtw %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm1 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: retq + %1 = call <8 x i8> @llvm.sabsdiff.v8i8(<8 x i8> %a1, <8 x i8> %a2) + ret <8 x i8> %1 +} + +declare <16 x i8> @llvm.uabsdiff.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uabsdiff_v16i8_expand(<16 x i8> %a1, <16 x i8> %a2) { +; CHECK-LABEL: test_uabsdiff_v16i8_expand +; CHECK: psubb %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubb %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtb %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm2 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = call <16 x i8> @llvm.uabsdiff.v16i8(<16 x i8> %a1, <16 x i8> %a2) + ret <16 x i8> %1 +} + +declare <8 x i16> @llvm.uabsdiff.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uabsdiff_v8i16_expand(<8 x i16> %a1, <8 x i16> %a2) { +; CHECK-LABEL: test_uabsdiff_v8i16_expand +; CHECK: psubw %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubw %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtw %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm2 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = call <8 x i16> @llvm.uabsdiff.v8i16(<8 x 
i16> %a1, <8 x i16> %a2) + ret <8 x i16> %1 +} + +declare <8 x i16> @llvm.sabsdiff.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_sabsdiff_v8i16_expand(<8 x i16> %a1, <8 x i16> %a2) { +; CHECK-LABEL: test_sabsdiff_v8i16_expand +; CHECK: psubw %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: psubw %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtw %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm1 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: retq + %1 = call <8 x i16> @llvm.sabsdiff.v8i16(<8 x i16> %a1, <8 x i16> %a2) + ret <8 x i16> %1 +} + +declare <4 x i32> @llvm.sabsdiff.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_sabsdiff_v4i32_expand(<4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: test_sabsdiff_v4i32_expand +; CHECK: psubd %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: psubd %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm1 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: retq + %1 = call <4 x i32> @llvm.sabsdiff.v4i32(<4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %1 +} + +declare <4 x i32> @llvm.uabsdiff.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uabsdiff_v4i32_expand(<4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: test_uabsdiff_v4i32_expand +; CHECK: psubd %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubd %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm2 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = call <4 x i32> @llvm.uabsdiff.v4i32(<4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %1 +} + +declare <2 x i32> @llvm.sabsdiff.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_sabsdiff_v2i32_expand(<2 x i32> %a1, <2 x i32> %a2) { +; CHECK-LABEL: 
test_sabsdiff_v2i32_expand +; CHECK: psubq %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubq %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm4 +; CHECK-NEXT: pshufd $160, %xmm4, %xmm5 # xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm3 +; CHECK-NEXT: pshufd $245, %xmm3, %xmm2 # xmm2 = xmm3[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd $245, %xmm4, %xmm3 # xmm3 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm3 +; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm3 +; CHECK-NEXT: por %xmm3, %xmm0 +; CHECK-NEXT: retq + %1 = call <2 x i32> @llvm.sabsdiff.v2i32(<2 x i32> %a1, <2 x i32> %a2) + ret <2 x i32> %1 +} + +declare <2 x i64> @llvm.sabsdiff.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_sabsdiff_v2i64_expand(<2 x i64> %a1, <2 x i64> %a2) { +; CHECK-LABEL: test_sabsdiff_v2i64_expand +; CHECK: psubq %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubq %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm4 +; CHECK-NEXT: pshufd $160, %xmm4, %xmm5 # xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm3 +; CHECK-NEXT: pshufd $245, %xmm3, %xmm2 # xmm2 = xmm3[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd $245, %xmm4, %xmm3 # xmm3 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm3 +; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm3 +; CHECK-NEXT: por %xmm3, %xmm0 +; CHECK-NEXT: retq + %1 = call <2 x i64> @llvm.sabsdiff.v2i64(<2 x i64> %a1, <2 x i64> %a2) + ret <2 x i64> %1 +} + +declare <16 x i32> @llvm.sabsdiff.v16i32(<16 x i32>, <16 x i32>) + +define <16 x i32> @test_sabsdiff_v16i32_expand(<16 x i32> %a1, <16 x i32> %a2) { +; CHECK-LABEL: test_sabsdiff_v16i32_expand +; CHECK: psubd %xmm4, 
%xmm0 +; CHECK-NEXT: pxor %xmm8, %xmm8 +; CHECK-NEXT: pxor %xmm9, %xmm9 +; CHECK-NEXT: psubd %xmm0, %xmm9 +; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm9, %xmm4 +; CHECK-NEXT: pand %xmm4, %xmm0 +; CHECK-NEXT: pandn %xmm9, %xmm4 +; CHECK-NEXT: por %xmm4, %xmm0 +; CHECK-NEXT: psubd %xmm5, %xmm1 +; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: psubd %xmm1, %xmm4 +; CHECK-NEXT: pxor %xmm5, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pand %xmm5, %xmm1 +; CHECK-NEXT: pandn %xmm4, %xmm5 +; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: psubd %xmm6, %xmm2 +; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: psubd %xmm2, %xmm4 +; CHECK-NEXT: pxor %xmm5, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pandn %xmm4, %xmm5 +; CHECK-NEXT: por %xmm5, %xmm2 +; CHECK-NEXT: psubd %xmm7, %xmm3 +; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: psubd %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm8 +; CHECK-NEXT: pand %xmm8, %xmm3 +; CHECK-NEXT: pandn %xmm4, %xmm8 +; CHECK-NEXT: por %xmm8, %xmm3 +; CHECK-NEXT: retq + %1 = call <16 x i32> @llvm.sabsdiff.v16i32(<16 x i32> %a1, <16 x i32> %a2) + ret <16 x i32> %1 +} + -- 2.34.1