SDAG: Legalize vector BSWAP into a shuffle if the shuffle is legal but the bswap is not.

author Benjamin Kramer <benny.kra@googlemail.com>

Mon, 19 May 2014 13:12:38 +0000 (13:12 +0000)

committer Benjamin Kramer <benny.kra@googlemail.com>

Mon, 19 May 2014 13:12:38 +0000 (13:12 +0000)
author Benjamin Kramer <benny.kra@googlemail.com>
Mon, 19 May 2014 13:12:38 +0000 (13:12 +0000)
committer Benjamin Kramer <benny.kra@googlemail.com>
Mon, 19 May 2014 13:12:38 +0000 (13:12 +0000)
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

index 6bcbc8e0ff431c1b609f0c6029ba686ba0a14047..898cd29c9141720cff5d75ba0d792a607833058c 100644 (file)
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -63,6 +63,8 @@ class VectorLegalizer {
    SDValue ExpandUINT_TO_FLOAT(SDValue Op);
    // Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
    SDValue ExpandSEXTINREG(SDValue Op);
+  // Expand bswap of vectors into a shuffle if legal.
+  SDValue ExpandBSWAP(SDValue Op);
    // Implement vselect in terms of XOR, AND, OR when blend is not supported
    // by the target.
    SDValue ExpandVSELECT(SDValue Op);
@@ -297,6 +299,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
    case TargetLowering::Expand:
      if (Node->getOpcode() == ISD::SIGN_EXTEND_INREG)
        Result = ExpandSEXTINREG(Op);
+    else if (Node->getOpcode() == ISD::BSWAP)
+      Result = ExpandBSWAP(Op);
      else if (Node->getOpcode() == ISD::VSELECT)
        Result = ExpandVSELECT(Op);
      else if (Node->getOpcode() == ISD::SELECT)
@@ -682,6 +686,29 @@ SDValue VectorLegalizer::ExpandSEXTINREG(SDValue Op) {
    return DAG.getNode(ISD::SRA, DL, VT, Op, ShiftSz);
  }
  
+SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) {
+  EVT VT = Op.getValueType();
+
+  // Generate a byte wise shuffle mask for the BSWAP.
+  SmallVector<int, 16> ShuffleMask;
+  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+  for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I)
+    for (int J = ScalarSizeInBytes - 1; J >= 0; --J)
+      ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
+
+  EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, ShuffleMask.size());
+
+  // Only emit a shuffle if the mask is legal.
+  if (!TLI.isShuffleMaskLegal(ShuffleMask, ByteVT))
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  SDLoc DL(Op);
+  Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Op.getOperand(0));
+  Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT),
+                            ShuffleMask.data());
+  return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+}
+
  SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
    // Implement VSELECT in terms of XOR, AND, OR
    // on platforms which do not support blend natively.
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp

index ac47f3e9eb1c7d87e0d80a3c43cdaf4b0a2a1a9a..d02a03ccb2a7c21db259f4bb098c7579d5b4b2c3 100644 (file)
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -520,6 +520,8 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
        setOperationAction(ISD::SMUL_LOHI, VT, Expand);
        setOperationAction(ISD::MULHU, VT, Expand);
        setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+      setOperationAction(ISD::BSWAP, VT, Expand);
      }
  
      // There is no v1i64/v2i64 multiply, expand v1i64/v2i64 to GPR i64 multiply.
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index 976cef39a21bfc66f1518bde10190e1d06a0de74..5beb752d3a429362159e5d70c4fff5f4d6480354 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -414,6 +414,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+
+    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    }
  
    setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp

index e6790cfdfd59c1133738e7da675d87939e05ff3b..538360cf39dc2dacaf0bbca4018203a7c48ea755 100644 (file)
--- a/lib/Target/ARM64/ARM64ISelLowering.cpp
+++ b/lib/Target/ARM64/ARM64ISelLowering.cpp
@@ -450,6 +450,8 @@ ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM)
        setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
        setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
  
+      setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+
        for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
             InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
          setTruncStoreAction((MVT::SimpleValueType)VT,
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp

index a0b3e467d209544b22ceb953078308022967b942..04bd43547c746e21fe403208db4b43a16f2a9b56 100644 (file)
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -460,6 +460,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
        setOperationAction(ISD::SDIVREM, VT, Expand);
        setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
        setOperationAction(ISD::FPOW, VT, Expand);
+      setOperationAction(ISD::BSWAP, VT, Expand);
        setOperationAction(ISD::CTPOP, VT, Expand);
        setOperationAction(ISD::CTLZ, VT, Expand);
        setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 63d28034ca8c1aac30251073ba5a5665615e26a6..e1db618baf18b97f2390df141fe192f0c3d8f7ca 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15116,7 +15116,23 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    if (VT.getSizeInBits() == 64)
      return false;
  
-  // FIXME: pshufb, blends, shifts.
+  // If this is a single-input shuffle with no 128 bit lane crossings we can
+  // lower it into pshufb.
+  if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
+      (SVT.is256BitVector() && Subtarget->hasInt256())) {
+    bool isLegal = true;
+    for (unsigned I = 0, E = M.size(); I != E; ++I) {
+      if (M[I] >= (int)SVT.getVectorNumElements() ||
+          ShuffleCrosses128bitLane(SVT, I, M[I])) {
+        isLegal = false;
+        break;
+      }
+    }
+    if (isLegal)
+      return true;
+  }
+
+  // FIXME: blends, shifts.
    return (SVT.getVectorNumElements() == 2 ||
            ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
            isMOVLMask(M, SVT) ||
diff --git a/test/CodeGen/ARM/vrev.ll b/test/CodeGen/ARM/vrev.ll

index eb76ba6ea08480bbbbc0ee9aa3bae6c0ca2e58f3..7215ad615e811dc624a69724dc09d85c154662d1 100644 (file)
--- a/test/CodeGen/ARM/vrev.ll
+++ b/test/CodeGen/ARM/vrev.ll
@@ -178,3 +178,11 @@ entry:
    ret void
  }
  
+define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
+; CHECK-LABEL: test_vrev32_bswap:
+; CHECK: vrev32.8
+  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
+  ret <4 x i32> %bswap
+}
+
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM64/rev.ll b/test/CodeGen/ARM64/rev.ll

index 0006ea33175aec0c157e46749edaa080cc4d57c5..1da59e42f6b95b2a50c15e5e4af1a599329ea3f0 100644 (file)
--- a/test/CodeGen/ARM64/rev.ll
+++ b/test/CodeGen/ARM64/rev.ll
@@ -222,3 +222,14 @@ entry:
    ret void
  }
  
+
+define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
+; CHECK-LABEL: test_vrev32_bswap:
+; CHECK: rev32.16b
+; CHECK-NOT: rev
+; CHECK: ret
+  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
+  ret <4 x i32> %bswap
+}
+
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll

index 6b77176cb69e289f0193d09fbed010bec1e13c94..a18fd68865cc738a76026bdd705a83aa98ee376e 100644 (file)
--- a/test/CodeGen/X86/bswap-vector.ll
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -1,19 +1,127 @@
-; RUN: llc < %s -mcpu=x86_64 | FileCheck %s
+; RUN: llc < %s -mcpu=x86-64 | FileCheck %s -check-prefix=CHECK-NOSSSE3
+; RUN: llc < %s -mcpu=core2 | FileCheck %s -check-prefix=CHECK-SSSE3
+; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2
  target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-linux-gnu"
  
+declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
  declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
  
-define <2 x i64> @foo(<2 x i64> %v) #0 {
+define <8 x i16> @test1(<8 x i16> %v) #0 {
+entry:
+  %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
+  ret <8 x i16> %r
+
+; CHECK-NOSSSE3-LABEL: @test1
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test1
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test1
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <4 x i32> @test2(<4 x i32> %v) #0 {
+entry:
+  %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
+  ret <4 x i32> %r
+
+; CHECK-NOSSSE3-LABEL: @test2
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test2
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test2
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <2 x i64> @test3(<2 x i64> %v) #0 {
  entry:
    %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
    ret <2 x i64> %r
+
+; CHECK-NOSSSE3-LABEL: @test3
+; CHECK-NOSSSE3: bswapq
+; CHECK-NOSSSE3: bswapq
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test3
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test3
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)
+declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
+declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
+
+define <16 x i16> @test4(<16 x i16> %v) #0 {
+entry:
+  %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v)
+  ret <16 x i16> %r
+
+; CHECK-SSSE3-LABEL: @test4
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test4
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <8 x i32> @test5(<8 x i32> %v) #0 {
+entry:
+  %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v)
+  ret <8 x i32> %r
+
+; CHECK-SSSE3-LABEL: @test5
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test5
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <4 x i64> @test6(<4 x i64> %v) #0 {
+entry:
+  %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v)
+  ret <4 x i64> %r
+
+; CHECK-SSSE3-LABEL: @test6
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test6
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
  }
  
-; CHECK-LABEL: @foo
-; CHECK: bswapq
-; CHECK: bswapq
-; CHECK: retq
  
  attributes #0 = { nounwind uwtable }
author	Benjamin Kramer <benny.kra@googlemail.com>
	Mon, 19 May 2014 13:12:38 +0000 (13:12 +0000)
committer	Benjamin Kramer <benny.kra@googlemail.com>
	Mon, 19 May 2014 13:12:38 +0000 (13:12 +0000)
lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp		patch | blob | history
lib/Target/AArch64/AArch64ISelLowering.cpp		patch | blob | history
lib/Target/ARM/ARMISelLowering.cpp		patch | blob | history
lib/Target/ARM64/ARM64ISelLowering.cpp		patch | blob | history
lib/Target/PowerPC/PPCISelLowering.cpp		patch | blob | history
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
test/CodeGen/ARM/vrev.ll		patch | blob | history
test/CodeGen/ARM64/rev.ll		patch | blob | history
test/CodeGen/X86/bswap-vector.ll		patch | blob | history