[InstCombine][SSE4A] Remove broken INSERTQI range combining optimization

author Simon Pilgrim <llvm-dev@redking.me.uk>

Tue, 13 Oct 2015 14:48:54 +0000 (14:48 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Tue, 13 Oct 2015 14:48:54 +0000 (14:48 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 13 Oct 2015 14:48:54 +0000 (14:48 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 13 Oct 2015 14:48:54 +0000 (14:48 +0000)
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp

index f741d1d485bf2c1e91555d7848b1da8122642493..b81b9358aa15812ddc9d5cb903d05585efa20534 100644 (file)
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1059,6 +1059,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
      if (auto CILength = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
        if (auto CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3))) {
          unsigned Index = CIIndex->getZExtValue();
+
          // From AMD documentation: "a value of zero in the field length is
          // defined as length of 64".
          unsigned Length = CILength->equalsInt(0) ? 64 : CILength->getZExtValue();
@@ -1077,54 +1078,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
          if (Length == 64 && Index == 0) {
            Value *Vec = II->getArgOperand(1);
            Value *Undef = UndefValue::get(Vec->getType());
-          const uint32_t Mask[] = { 0, 2 };
+          const uint32_t Mask[] = {0, 2};
            return ReplaceInstUsesWith(
                CI,
                Builder->CreateShuffleVector(
                    Vec, Undef, ConstantDataVector::get(
                                    II->getContext(), makeArrayRef(Mask))));
-        } else if (auto Source =
-                       dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
-          if (Source->hasOneUse() &&
-              Source->getArgOperand(1) == II->getArgOperand(1)) {
-            // If the source of the insert has only one use and it's another
-            // insert (and they're both inserting from the same vector), try to
-            // bundle both together.
-            auto CISourceLength =
-                dyn_cast<ConstantInt>(Source->getArgOperand(2));
-            auto CISourceIndex =
-                dyn_cast<ConstantInt>(Source->getArgOperand(3));
-            if (CISourceIndex && CISourceLength) {
-              unsigned SourceIndex = CISourceIndex->getZExtValue();
-              unsigned SourceLength = CISourceLength->getZExtValue();
-              unsigned SourceEnd = SourceIndex + SourceLength;
-              unsigned NewIndex, NewLength;
-              bool ShouldReplace = false;
-              if (Index <= SourceIndex && SourceIndex <= End) {
-                NewIndex = Index;
-                NewLength = std::max(End, SourceEnd) - NewIndex;
-                ShouldReplace = true;
-              } else if (SourceIndex <= Index && Index <= SourceEnd) {
-                NewIndex = SourceIndex;
-                NewLength = std::max(SourceEnd, End) - NewIndex;
-                ShouldReplace = true;
-              }
-
-              if (ShouldReplace) {
-                Constant *ConstantLength = ConstantInt::get(
-                    II->getArgOperand(2)->getType(), NewLength, false);
-                Constant *ConstantIndex = ConstantInt::get(
-                    II->getArgOperand(3)->getType(), NewIndex, false);
-                Value *Args[4] = { Source->getArgOperand(0),
-                                   II->getArgOperand(1), ConstantLength,
-                                   ConstantIndex };
-                Module *M = CI.getParent()->getParent()->getParent();
-                Value *F =
-                    Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
-                return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
-              }
-            }
-          }
          }
        }
      }
@@ -1220,9 +1179,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
          // control mask is set, then zero is written in the result byte.
          // The zero vector is in the right-hand side of the resulting
          // shufflevector.
- 
+
          // The value of each index is the least significant 4 bits of the
-        // shuffle control byte.      
+        // shuffle control byte.
          Indexes[I] = (Index < 0) ? NumElts : Index & 0xF;
        }
      } else if (!isa<ConstantAggregateZero>(V))
diff --git a/test/Transforms/InstCombine/x86-sse4a.ll b/test/Transforms/InstCombine/x86-sse4a.ll

index 40404f39b9766f42d2540316a598228d6b0fed58..92d5e8ad655ebb2d4078d2e6c7001c24dea9362b 100644 (file)
--- a/test/Transforms/InstCombine/x86-sse4a.ll
+++ b/test/Transforms/InstCombine/x86-sse4a.ll
@@ -1,15 +1,5 @@
  ; RUN: opt < %s -instcombine -S | FileCheck %s
  
-; We should optimize these two redundant insertqi into one
-define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertTwice
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
-; CHECK-NEXT: ret <2 x i64> %1
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
-  ret <2 x i64> %2
-}
-
  ; The result of this insert is the second arg, since the top 64 bits of
  ; the result are undefined, and we copy the bottom 64 bits from the
  ; second arg
@@ -20,81 +10,6 @@ define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
    ret <2 x i64> %1
  }
  
-; Test the several types of ranges and ordering that exist for two insertqi
-define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertContainedRange
-; CHECK: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-; CHECK: ret <2 x i64> %1
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
-  ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertContainedRange_2
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-; CHECK-NEXT: ret <2 x i64> %1
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertOverlappingRange
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK-NEXT: ret <2 x i64> %1
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
-  ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertOverlappingRange_2
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK-NEXT: ret <2 x i64> %1
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertAdjacentRange
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK-NEXT: ret <2 x i64> %1
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertAdjacentRange_2
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK-NEXT: ret <2 x i64> %1
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
-  ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertDisjointRange
-; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-; CHECK-NEXT: %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-; CHECK-NEXT: ret <2 x i64> %2
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertDisjointRange_2
-; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-; CHECK-NEXT: %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-; CHECK-NEXT: ret <2 x i64> %2
-  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-  ret <2 x i64> %2
-}
-
  define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
  ; CHECK-LABEL: @testZeroLength
  ; CHECK-NEXT: ret <2 x i64> %i
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Tue, 13 Oct 2015 14:48:54 +0000 (14:48 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Tue, 13 Oct 2015 14:48:54 +0000 (14:48 +0000)
lib/Transforms/InstCombine/InstCombineCalls.cpp		patch | blob | history
test/Transforms/InstCombine/x86-sse4a.ll		patch | blob | history