Implement vector shift up / down and insert zero with ps{rl}lq / ps{rl}ldq.

author Evan Cheng <evan.cheng@apple.com>

Thu, 29 May 2008 08:22:04 +0000 (08:22 +0000)

committer Evan Cheng <evan.cheng@apple.com>

Thu, 29 May 2008 08:22:04 +0000 (08:22 +0000)
author Evan Cheng <evan.cheng@apple.com>
Thu, 29 May 2008 08:22:04 +0000 (08:22 +0000)
committer Evan Cheng <evan.cheng@apple.com>
Thu, 29 May 2008 08:22:04 +0000 (08:22 +0000)
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

index 44c72b6bec4c89cc1a1d02be1b61866ed042b39c..24954d75d2e57c94ef37ce9a87004846ddfd86b4 100644 (file)
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1853,10 +1853,17 @@ SDOperand SelectionDAG::getShuffleScalarElt(const SDNode *N, unsigned Idx) {
    unsigned NumElems = PermMask.getNumOperands();
    SDOperand V = (Idx < NumElems) ? N->getOperand(0) : N->getOperand(1);
    Idx %= NumElems;
-  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) {
-    return (Idx == 0)
-     ? V.getOperand(0) : getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+
+  if (V.getOpcode() == ISD::BIT_CONVERT) {
+    V = V.getOperand(0);
+    if (MVT::getVectorNumElements(V.getValueType()) != NumElems)
+      return SDOperand();
    }
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return (Idx == 0) ? V.getOperand(0)
+                      : getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+  if (V.getOpcode() == ISD::BUILD_VECTOR)
+    return V.getOperand(Idx);
    if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
      SDOperand Elt = PermMask.getOperand(Idx);
      if (Elt.getOpcode() == ISD::UNDEF)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 5343971d34b3560798f568c7a1e65992cb874a33..d194d38e1ce6e3073fb2e809a45331b2bcf86f47 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2923,6 +2923,70 @@ static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, unsigned Idx,
    return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
  }
  
+/// getNumOfConsecutiveZeros - Count the run of zero (or undef) elements at the
+/// low end (Low) or high end (!Low) of the shuffle result described by Mask.
+static
+unsigned getNumOfConsecutiveZeros(SDOperand Op, SDOperand Mask,
+                                  unsigned NumElems, bool Low,
+                                  SelectionDAG &DAG) {
+  unsigned NumZeros = 0;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDOperand Idx = Mask.getOperand(Low ? i : NumElems-i-1);  // Low: walk up from 0; else down from last
+    if (Idx.getOpcode() == ISD::UNDEF) {  // an undef mask entry is counted as zero
+      ++NumZeros;
+      continue;
+    }
+    unsigned Index = cast<ConstantSDNode>(Idx)->getValue();
+    SDOperand Elt = DAG.getShuffleScalarElt(Op.Val, Index);
+    if (Elt.Val && isZeroNode(Elt))  // a null Elt is not treated as zero
+      ++NumZeros;
+    else
+      break;  // first element that is not a known zero ends the run
+  }
+  return NumZeros;
+}
+
+/// isVectorShift - Returns true if the shuffle can be implemented as a
+/// logical shift of ShVal by ShAmt elements, leftward when isLeft is set.
+static bool isVectorShift(SDOperand Op, SDOperand Mask, SelectionDAG &DAG,
+                          bool &isLeft, SDOperand &ShVal, unsigned &ShAmt) {
+  unsigned NumElems = Mask.getNumOperands();
+
+  isLeft = true;  // try zeros at the low end first (left shift)
+  unsigned NumZeros= getNumOfConsecutiveZeros(Op, Mask, NumElems, true, DAG);
+  if (!NumZeros) {
+    isLeft = false;  // otherwise look for zeros at the high end (right shift)
+    NumZeros = getNumOfConsecutiveZeros(Op, Mask, NumElems, false, DAG);
+    if (!NumZeros)
+      return false;  // no zeros at either end; isLeft has still been overwritten
+  }
+
+  bool SeenV1 = false;
+  bool SeenV2 = false;
+  for (unsigned i = NumZeros; i < NumElems; ++i) {
+    unsigned Val = isLeft ? (i - NumZeros) : i;  // source element expected in this slot
+    SDOperand Idx = Mask.getOperand(isLeft ? i : (i - NumZeros));  // result slot being checked
+    if (Idx.getOpcode() == ISD::UNDEF)
+      continue;  // undef slots are accepted without checking
+    unsigned Index = cast<ConstantSDNode>(Idx)->getValue();
+    if (Index < NumElems)
+      SeenV1 = true;
+    else {
+      Index -= NumElems;  // rebase a second-operand index to 0..NumElems-1
+      SeenV2 = true;
+    }
+    if (Index != Val)
+      return false;
+  }
+  if (SeenV1 && SeenV2)
+    return false;  // the shifted elements must all come from one operand
+
+  ShVal = SeenV1 ? Op.getOperand(0) : Op.getOperand(1);  // NOTE(review): operand 1 is also chosen when no non-undef slot was seen
+  ShAmt = NumZeros;  // in elements; LowerVECTOR_SHUFFLE scales this by the element size in bits
+  return true;
+}
+
+
  /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
  ///
  static SDOperand LowerBuildVectorv16i8(SDOperand Op, unsigned NonZeros,
@@ -2995,6 +3059,20 @@ static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros,
    return V;
  }
  
+/// getVShift - Return SrcOp shifted by NumBits as an X86ISD::VSHL (isLeft) or
+/// X86ISD::VSRL node, built on v1i64 / v2i64 and bitcast back to VT.
+static SDOperand getVShift(bool isLeft, MVT::ValueType VT, SDOperand SrcOp,
+                           unsigned NumBits, SelectionDAG &DAG,
+                           const TargetLowering &TLI) {
+  bool isMMX = MVT::getSizeInBits(VT) == 64;  // a 64-bit VT is treated as an MMX vector
+  MVT::ValueType ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;  // the shift node is built on this type
+  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
+  SrcOp = DAG.getNode(ISD::BIT_CONVERT, ShVT, SrcOp);
+  return DAG.getNode(ISD::BIT_CONVERT, VT,
+                     DAG.getNode(Opc, ShVT, SrcOp,
+                              DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
+}
+
  SDOperand
  X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
    // All zero's are handled with pxor, all one's are handled with pcmpeqd.
@@ -3091,6 +3169,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
        return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
                                           Subtarget->hasSSE2(), DAG);
      }
+
+    // Is it a vector logical left shift?
+    if (NumElems == 2 && Idx == 1 &&
+        isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
+      unsigned NumBits = MVT::getSizeInBits(VT);
+      return getVShift(true, VT,
+                       DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(1)),
+                       NumBits/2, DAG, *this);
+    }
      
      if (IsAllConstants) // Otherwise, it's better to do a constpool load.
        return SDOperand();
@@ -3615,6 +3702,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
      }
    }
  
+  // Check if this can be converted into a logical shift.
+  bool isLeft = false;
+  unsigned ShAmt = 0;
+  SDOperand ShVal;
+  bool isShift = isVectorShift(Op, PermMask, DAG, isLeft, ShVal, ShAmt);
+  if (isShift && ShVal.hasOneUse()) {
+    // If the shifted value has multiple uses, it may be cheaper to use 
+    // v_set0 + movlhps or movhlps, etc.
+    MVT::ValueType EVT = MVT::getVectorElementType(VT);
+    ShAmt *= MVT::getSizeInBits(EVT);
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this);
+  }
+
    if (X86::isMOVLMask(PermMask.Val)) {
      if (V1IsUndef)
        return V2;
@@ -3634,6 +3734,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
        ShouldXformToMOVLP(V1.Val, V2.Val, PermMask.Val))
      return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
  
+  if (isShift) {
+    // No better options. Use a vshl / vsrl.
+    MVT::ValueType EVT = MVT::getVectorElementType(VT);
+    ShAmt *= MVT::getSizeInBits(EVT);
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this);
+  }
+
    bool Commuted = false;
    // FIXME: This should also accept a bitcast of a splat?  Be careful, not
    // 1,1,1,1 -> v8i16 though.
@@ -5729,6 +5836,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
    case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
    case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VSHL:               return "X86ISD::VSHL";
+  case X86ISD::VSRL:               return "X86ISD::VSRL";
    }
  }
  
@@ -6296,8 +6405,10 @@ static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
  static SDOperand PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
                                             const X86Subtarget *Subtarget,
                                             const TargetLowering &TLI) {
+  unsigned NumOps = N->getNumOperands();
+
    // Ignore single operand BUILD_VECTOR.
-  if (N->getNumOperands() == 1)
+  if (NumOps == 1)
      return SDOperand();
  
    MVT::ValueType VT = N->getValueType(0);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h

index b99a09be7cede1b36a2836f7ed260cfc1ae314d3..0c67794c932989e8bbebd5ded9bc3e7e8d7530de 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -205,7 +205,10 @@ namespace llvm {
        VZEXT_MOVL,
  
        // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
-      VZEXT_LOAD
+      VZEXT_LOAD,
+
+      // VSHL, VSRL - Vector logical left / right shift.
+      VSHL, VSRL
      };
    }
  
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td

index 42f19af1f8b6e956661d92a9ba592eee6e25523b..b167a7ac88d808e08a2481705e14e6b1d2514569 100644 (file)
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -294,6 +294,12 @@ defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
  defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
                                      int_x86_mmx_psra_d, int_x86_mmx_psrai_d>;
  
+// Shift up / down and insert zeros.
+def : Pat<(v1i64 (X86vshl     VR64:$src, (i8 imm:$amt))),
+          (v1i64 (MMX_PSLLQri VR64:$src, imm:$amt))>;
+def : Pat<(v1i64 (X86vshr     VR64:$src, (i8 imm:$amt))),
+          (v1i64 (MMX_PSRLQri VR64:$src, imm:$amt))>;
+
  // Comparison Instructions
  defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
  defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index 1ea4bfd35e030f1cd3ade95c97cf9a770800bc96..3d5959aa2f684d5a5d63085a8e63d55521305b8a 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -51,6 +51,8 @@ def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
                   SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
  def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                          [SDNPHasChain, SDNPMayLoad]>;
+def X86vshl    : SDNode<"X86ISD::VSHL",      SDTIntShiftOp>;
+def X86vshr    : SDNode<"X86ISD::VSRL",      SDTIntShiftOp>;
  
  //===----------------------------------------------------------------------===//
  // SSE Complex Patterns
@@ -1957,6 +1959,12 @@ let Predicates = [HasSSE2] in {
              (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
    def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
              (v2f64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+
+  // Shift up / down and insert zeros.
+  def : Pat<(v2i64 (X86vshl  VR128:$src, (i8 imm:$amt))),
+            (v2i64 (PSLLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
+  def : Pat<(v2i64 (X86vshr  VR128:$src, (i8 imm:$amt))),
+            (v2i64 (PSRLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
  }
  
  // Logical
diff --git a/test/CodeGen/X86/mmx-insert-element.ll b/test/CodeGen/X86/mmx-insert-element.ll

index dc488363e7f52e28bea86d12f9076d9b599747f6..0aa476dba80e612e2c752b6a0734a26d734e5d2b 100644 (file)
--- a/test/CodeGen/X86/mmx-insert-element.ll
+++ b/test/CodeGen/X86/mmx-insert-element.ll
@@ -1,23 +1,7 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | grep movq | count 3
-
-; FIXME: This code outputs:
-;
-;   subl $28, %esp
-;   movl 32(%esp), %eax
-;   movd %eax, %mm0
-;   movq %mm0, (%esp)
-;   movl (%esp), %eax
-;   movl %eax, 20(%esp)
-;   movq %mm0, 8(%esp)
-;   movl 12(%esp), %eax
-;   movl %eax, 16(%esp)
-;   movq 16(%esp), %mm0
-;   addl $28, %esp
-;
-; Which is ugly. We need to fix this.
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | not grep movq
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | grep psllq
  
  define <2 x i32> @qux(i32 %A) nounwind {
-entry:
         %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1             ; <<2 x i32>> [#uses=1]
         ret <2 x i32> %tmp3
  }
diff --git a/test/CodeGen/X86/vec_clear.ll b/test/CodeGen/X86/vec_clear.ll

index d4641294b4565daa0b33221ba51d23fde0ae8b5e..c119a94f74f6cd908af9f87847491fbd783a7b86 100644 (file)
--- a/test/CodeGen/X86/vec_clear.ll
+++ b/test/CodeGen/X86/vec_clear.ll
@@ -1,6 +1,7 @@
  ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | not grep and
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | grep psrldq
  
-define <4 x float> @test(<4 x float>* %v1) {
+define <4 x float> @test(<4 x float>* %v1) nounwind {
          %tmp = load <4 x float>* %v1            ; <<4 x float>> [#uses=1]
          %tmp15 = bitcast <4 x float> %tmp to <2 x i64>          ; <<2 x i64>> [#uses=1]
          %tmp24 = and <2 x i64> %tmp15, bitcast (<4 x i32> < i32 0, i32 0, i32 -1, i32 -1 > to <2 x i64>)              ; <<2 x i64>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_insert-3.ll b/test/CodeGen/X86/vec_insert-3.ll

index 1d374b4b9c62a1bc75f1b3a223a067eccc5d46d7..e42a3684899a99966898e3d676e5c0aae8880311 100644 (file)
--- a/test/CodeGen/X86/vec_insert-3.ll
+++ b/test/CodeGen/X86/vec_insert-3.ll
@@ -1,6 +1,6 @@
  ; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep punpcklqdq | count 1
  
-define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) {
+define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
          %tmp1 = insertelement <2 x i64> %tmp, i64 %s, i32 1
          ret <2 x i64> %tmp1
  }
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll

new file mode 100644 (file)

index 0000000..eaa523e
--- /dev/null
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -0,0 +1,31 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psllq  | grep 32
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pslldq | grep 12
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psrldq | grep 8
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psrldq | grep 12
+
+define void  @t1(i32 %a, <1 x i64>* %P) nounwind {
+       %tmp12 = shl i32 %a, 12
+       %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
+       %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
+       %tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64>
+       store <1 x i64> %tmp23, <1 x i64>* %P
+       ret void
+}
+
+define <4 x float> @t2(<4 x float>* %P) nounwind {
+        %tmp1 = load <4 x float>* %P
+        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
+        ret <4 x float> %tmp2
+}
+
+define <4 x float> @t3(<4 x float>* %P) nounwind {
+        %tmp1 = load <4 x float>* %P
+        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
+        ret <4 x float> %tmp2
+}
+
+define <4 x float> @t4(<4 x float>* %P) nounwind {
+        %tmp1 = load <4 x float>* %P
+        %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
+        ret <4 x float> %tmp2
+}
diff --git a/test/CodeGen/X86/vec_insert-6.ll b/test/CodeGen/X86/vec_insert-6.ll

new file mode 100644 (file)

index 0000000..405152e
--- /dev/null
+++ b/test/CodeGen/X86/vec_insert-6.ll
@@ -0,0 +1,7 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep pslldq
+
+define <4 x float> @t3(<4 x float>* %P) nounwind  {
+       %tmp1 = load <4 x float>* %P
+       %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
+       ret <4 x float> %tmp2
+}
author	Evan Cheng <evan.cheng@apple.com>
	Thu, 29 May 2008 08:22:04 +0000 (08:22 +0000)
committer	Evan Cheng <evan.cheng@apple.com>
	Thu, 29 May 2008 08:22:04 +0000 (08:22 +0000)
lib/CodeGen/SelectionDAG/SelectionDAG.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.h		patch \| blob \| history
lib/Target/X86/X86InstrMMX.td		patch \| blob \| history
lib/Target/X86/X86InstrSSE.td		patch \| blob \| history
test/CodeGen/X86/mmx-insert-element.ll		patch \| blob \| history
test/CodeGen/X86/vec_clear.ll		patch \| blob \| history
test/CodeGen/X86/vec_insert-3.ll		patch \| blob \| history
test/CodeGen/X86/vec_insert-5.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/vec_insert-6.ll	[new file with mode: 0644]	patch \| blob