if (auto CILength = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
if (auto CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3))) {
unsigned Index = CIIndex->getZExtValue();
+
// From AMD documentation: "a value of zero in the field length is
// defined as length of 64".
unsigned Length = CILength->equalsInt(0) ? 64 : CILength->getZExtValue();
if (Length == 64 && Index == 0) {
Value *Vec = II->getArgOperand(1);
Value *Undef = UndefValue::get(Vec->getType());
- const uint32_t Mask[] = { 0, 2 };
+ const uint32_t Mask[] = {0, 2};
return ReplaceInstUsesWith(
CI,
Builder->CreateShuffleVector(
Vec, Undef, ConstantDataVector::get(
II->getContext(), makeArrayRef(Mask))));
- } else if (auto Source =
- dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
- if (Source->hasOneUse() &&
- Source->getArgOperand(1) == II->getArgOperand(1)) {
- // If the source of the insert has only one use and it's another
- // insert (and they're both inserting from the same vector), try to
- // bundle both together.
- auto CISourceLength =
- dyn_cast<ConstantInt>(Source->getArgOperand(2));
- auto CISourceIndex =
- dyn_cast<ConstantInt>(Source->getArgOperand(3));
- if (CISourceIndex && CISourceLength) {
- unsigned SourceIndex = CISourceIndex->getZExtValue();
- unsigned SourceLength = CISourceLength->getZExtValue();
- unsigned SourceEnd = SourceIndex + SourceLength;
- unsigned NewIndex, NewLength;
- bool ShouldReplace = false;
- if (Index <= SourceIndex && SourceIndex <= End) {
- NewIndex = Index;
- NewLength = std::max(End, SourceEnd) - NewIndex;
- ShouldReplace = true;
- } else if (SourceIndex <= Index && Index <= SourceEnd) {
- NewIndex = SourceIndex;
- NewLength = std::max(SourceEnd, End) - NewIndex;
- ShouldReplace = true;
- }
-
- if (ShouldReplace) {
- Constant *ConstantLength = ConstantInt::get(
- II->getArgOperand(2)->getType(), NewLength, false);
- Constant *ConstantIndex = ConstantInt::get(
- II->getArgOperand(3)->getType(), NewIndex, false);
- Value *Args[4] = { Source->getArgOperand(0),
- II->getArgOperand(1), ConstantLength,
- ConstantIndex };
- Module *M = CI.getParent()->getParent()->getParent();
- Value *F =
- Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
- return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
- }
- }
- }
}
}
}
// control mask is set, then zero is written in the result byte.
// The zero vector is in the right-hand side of the resulting
// shufflevector.
-
+
// The value of each index is the least significant 4 bits of the
- // shuffle control byte.
+ // shuffle control byte.
Indexes[I] = (Index < 0) ? NumElts : Index & 0xF;
}
} else if (!isa<ConstantAggregateZero>(V))
; RUN: opt < %s -instcombine -S | FileCheck %s
-; We should optimize these two redundant insertqi into one
-define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertTwice
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
-; CHECK-NEXT: ret <2 x i64> %1
- %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
- %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
- ret <2 x i64> %2
-}
-
; The result of this insert is the second arg, since the top 64 bits of
; the result are undefined, and we copy the bottom 64 bits from the
; second arg
ret <2 x i64> %1
}
-; Test the several types of ranges and ordering that exist for two insertqi
-define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertContainedRange
-; CHECK: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-; CHECK: ret <2 x i64> %1
- %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
- %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
- ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertContainedRange_2
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
-; CHECK-NEXT: ret <2 x i64> %1
- %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
- %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
- ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertOverlappingRange
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK-NEXT: ret <2 x i64> %1
- %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
- %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
- ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertOverlappingRange_2
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK-NEXT: ret <2 x i64> %1
- %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
- %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
- ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertAdjacentRange
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK-NEXT: ret <2 x i64> %1
- %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
- %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
- ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertAdjacentRange_2
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
-; CHECK-NEXT: ret <2 x i64> %1
- %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
- %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
- ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertDisjointRange
-; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-; CHECK-NEXT: %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-; CHECK-NEXT: ret <2 x i64> %2
- %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
- %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
- ret <2 x i64> %2
-}
-
-define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
-; CHECK-LABEL: @testInsertDisjointRange_2
-; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
-; CHECK-NEXT: %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
-; CHECK-NEXT: ret <2 x i64> %2
- %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
- %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
- ret <2 x i64> %2
-}
-
define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
; CHECK-LABEL: @testZeroLength
; CHECK-NEXT: ret <2 x i64> %i