def int_x86_sse2_psrai_d : GCCBuiltin<"__builtin_ia32_psradi128">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_sse2_psll_dq :
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
- llvm_i32_ty], [IntrNoMem]>;
- def int_x86_sse2_psrl_dq :
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
- llvm_i32_ty], [IntrNoMem]>;
}
// Conversion ops
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx2_psll_dq :
- Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
- llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx2_psrl_dq :
- Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
def int_x86_avx512_mask_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
Name == "x86.avx.vbroadcast.ss" ||
Name == "x86.avx.vbroadcast.ss.256" ||
Name == "x86.avx.vbroadcast.sd.256" ||
+ Name == "x86.sse2.psll.dq" ||
+ Name == "x86.sse2.psrl.dq" ||
+ Name == "x86.avx2.psll.dq" ||
+ Name == "x86.avx2.psrl.dq" ||
Name == "x86.sse2.psll.dq.bs" ||
Name == "x86.sse2.psrl.dq.bs" ||
Name == "x86.avx2.psll.dq.bs" ||
return MetadataAsValue::get(F->getContext(), Expr);
}
+// Handles upgrading SSE2 and AVX2 PSLLDQ intrinsics by converting them
+// to byte shuffles.
+static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder, LLVMContext &C,
+ Value *Op, unsigned NumLanes,
+ unsigned Shift) {
+ // Each lane is 16 bytes.
+ unsigned NumElts = NumLanes * 16;
+
+ // Bitcast from a 64-bit element type to a byte element type.
+ Op = Builder.CreateBitCast(Op,
+ VectorType::get(Type::getInt8Ty(C), NumElts),
+ "cast");
+ // We'll be shuffling in zeroes.
+ Value *Res = ConstantVector::getSplat(NumElts, Builder.getInt8(0));
+
+ // If shift is less than 16, emit a shuffle to move the bytes. Otherwise,
+ // we'll just return the zero vector.
+ if (Shift < 16) {
+ SmallVector<Constant*, 32> Idxs;
+ // 256-bit version is split into two 16-byte lanes.
+ for (unsigned l = 0; l != NumElts; l += 16)
+ for (unsigned i = 0; i != 16; ++i) {
+ unsigned Idx = NumElts + i - Shift;
+ if (Idx < NumElts)
+ Idx -= NumElts - 16; // end of lane, switch operand.
+ Idxs.push_back(Builder.getInt32(Idx + l));
+ }
+
+ Res = Builder.CreateShuffleVector(Res, Op, ConstantVector::get(Idxs));
+ }
+
+ // Bitcast back to a 64-bit element type.
+ return Builder.CreateBitCast(Res,
+ VectorType::get(Type::getInt64Ty(C), 2*NumLanes),
+ "cast");
+}
+
+// Handles upgrading SSE2 and AVX2 PSRLDQ intrinsics by converting them
+// to byte shuffles.
+static Value *UpgradeX86PSRLDQIntrinsics(IRBuilder<> &Builder, LLVMContext &C,
+ Value *Op, unsigned NumLanes,
+ unsigned Shift) {
+ // Each lane is 16 bytes.
+ unsigned NumElts = NumLanes * 16;
+
+ // Bitcast from a 64-bit element type to a byte element type.
+ Op = Builder.CreateBitCast(Op,
+ VectorType::get(Type::getInt8Ty(C), NumElts),
+ "cast");
+ // We'll be shuffling in zeroes.
+ Value *Res = ConstantVector::getSplat(NumElts, Builder.getInt8(0));
+
+ // If shift is less than 16, emit a shuffle to move the bytes. Otherwise,
+ // we'll just return the zero vector.
+ if (Shift < 16) {
+ SmallVector<Constant*, 32> Idxs;
+ // 256-bit version is split into two 16-byte lanes.
+ for (unsigned l = 0; l != NumElts; l += 16)
+ for (unsigned i = 0; i != 16; ++i) {
+ unsigned Idx = i + Shift;
+ if (Idx >= 16)
+ Idx += NumElts - 16; // end of lane, switch operand.
+ Idxs.push_back(Builder.getInt32(Idx + l));
+ }
+
+ Res = Builder.CreateShuffleVector(Op, Res, ConstantVector::get(Idxs));
+ }
+
+ // Bitcast back to a 64-bit element type.
+ return Builder.CreateBitCast(Res,
+ VectorType::get(Type::getInt64Ty(C), 2*NumLanes),
+ "cast");
+}
+
// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the
// upgraded intrinsic. All argument and return casting must be provided in
// order to seamlessly integrate with existing context.
for (unsigned I = 0; I < EltNum; ++I)
Rep = Builder.CreateInsertElement(Rep, Load,
ConstantInt::get(I32Ty, I));
+ } else if (Name == "llvm.x86.sse2.psll.dq") {
+ // 128-bit shift left specified in bits.
+ unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+ Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
+ Shift / 8); // Shift is in bits.
+ } else if (Name == "llvm.x86.sse2.psrl.dq") {
+ // 128-bit shift right specified in bits.
+ unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+ Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
+ Shift / 8); // Shift is in bits.
+ } else if (Name == "llvm.x86.avx2.psll.dq") {
+ // 256-bit shift left specified in bits.
+ unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+ Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
+ Shift / 8); // Shift is in bits.
+ } else if (Name == "llvm.x86.avx2.psrl.dq") {
+ // 256-bit shift right specified in bits.
+ unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+ Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
+ Shift / 8); // Shift is in bits.
} else if (Name == "llvm.x86.sse2.psll.dq.bs") {
- Value *Op0 = ConstantVector::getSplat(16, Builder.getInt8(0));
- Value *Op1 = Builder.CreateBitCast(CI->getArgOperand(0),
- VectorType::get(Type::getInt8Ty(C),16),
- "cast");
-
+ // 128-bit shift left specified in bytes.
unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
-
- if (Shift < 16) {
- SmallVector<Constant*, 16> Idxs;
- for (unsigned i = 16; i != 32; ++i)
- Idxs.push_back(Builder.getInt32(i - Shift));
-
- Op0 = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs));
- }
-
- Rep = Builder.CreateBitCast(Op0,
- VectorType::get(Type::getInt64Ty(C), 2),
- "cast");
+ Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
+ Shift);
} else if (Name == "llvm.x86.sse2.psrl.dq.bs") {
- Value *Op0 = Builder.CreateBitCast(CI->getArgOperand(0),
- VectorType::get(Type::getInt8Ty(C),16),
- "cast");
- Value *Op1 = ConstantVector::getSplat(16, Builder.getInt8(0));
-
+ // 128-bit shift right specified in bytes.
unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
-
- if (Shift < 16) {
- SmallVector<Constant*, 16> Idxs;
- for (unsigned i = 0; i != 16; ++i)
- Idxs.push_back(Builder.getInt32(i + Shift));
-
- Op1 = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs));
- }
- Rep = Builder.CreateBitCast(Op1,
- VectorType::get(Type::getInt64Ty(C), 2),
- "cast");
+ Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
+ Shift);
} else if (Name == "llvm.x86.avx2.psll.dq.bs") {
- Value *Op0 = ConstantVector::getSplat(32, Builder.getInt8(0));
- Value *Op1 = Builder.CreateBitCast(CI->getArgOperand(0),
- VectorType::get(Type::getInt8Ty(C),32),
- "cast");
-
+ // 256-bit shift left specified in bytes.
unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
-
- if (Shift < 16) {
- SmallVector<Constant*, 32> Idxs;
- for (unsigned l = 0; l != 32; l += 16)
- for (unsigned i = 0; i != 16; ++i) {
- unsigned Idx = 32 + i - Shift;
- if (Idx < 32) Idx -= 16; // end of lane, switch operand.
- Idxs.push_back(Builder.getInt32(Idx + l));
- }
-
- Op1 = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs));
- }
-
- Rep = Builder.CreateBitCast(Op1,
- VectorType::get(Type::getInt64Ty(C), 4),
- "cast");
+ Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
+ Shift);
} else if (Name == "llvm.x86.avx2.psrl.dq.bs") {
- Value *Op0 = Builder.CreateBitCast(CI->getArgOperand(0),
- VectorType::get(Type::getInt8Ty(C),32),
- "cast");
- Value *Op1 = ConstantVector::getSplat(32, Builder.getInt8(0));
-
+ // 256-bit shift right specified in bytes.
unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
-
- if (Shift < 16) {
- SmallVector<Constant*, 32> Idxs;
- for (unsigned l = 0; l != 32; l += 16)
- for (unsigned i = 0; i != 16; ++i) {
- unsigned Idx = i + Shift;
- if (Idx >= 16) Idx += 16; // end of lane, switch operand.
- Idxs.push_back(Builder.getInt32(Idx + l));
- }
-
- Op0 = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs));
- }
-
- Rep = Builder.CreateBitCast(Op0,
- VectorType::get(Type::getInt64Ty(C), 4),
- "cast");
+ Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
+ Shift);
} else {
bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
if (Name == "llvm.x86.avx.vpermil.pd.256")
} // Constraints = "$src1 = $dst"
let Predicates = [HasAVX] in {
- def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
- (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
- def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
- (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
(VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
}
-let Predicates = [HasAVX2] in {
- def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2),
- (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
- def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
- (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
-}
-
let Predicates = [UseSSE2] in {
- def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
- (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
- def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
- (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
(PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
}
case llvm::Intrinsic::x86_avx2_pslli_w:
case llvm::Intrinsic::x86_avx2_pslli_d:
case llvm::Intrinsic::x86_avx2_pslli_q:
- case llvm::Intrinsic::x86_avx2_psll_dq:
case llvm::Intrinsic::x86_avx2_psrl_w:
case llvm::Intrinsic::x86_avx2_psrl_d:
case llvm::Intrinsic::x86_avx2_psrl_q:
case llvm::Intrinsic::x86_avx2_psrli_q:
case llvm::Intrinsic::x86_avx2_psrai_w:
case llvm::Intrinsic::x86_avx2_psrai_d:
- case llvm::Intrinsic::x86_avx2_psrl_dq:
case llvm::Intrinsic::x86_sse2_psll_w:
case llvm::Intrinsic::x86_sse2_psll_d:
case llvm::Intrinsic::x86_sse2_psll_q:
case llvm::Intrinsic::x86_sse2_pslli_w:
case llvm::Intrinsic::x86_sse2_pslli_d:
case llvm::Intrinsic::x86_sse2_pslli_q:
- case llvm::Intrinsic::x86_sse2_psll_dq:
case llvm::Intrinsic::x86_sse2_psrl_w:
case llvm::Intrinsic::x86_sse2_psrl_d:
case llvm::Intrinsic::x86_sse2_psrl_q:
case llvm::Intrinsic::x86_sse2_psrli_q:
case llvm::Intrinsic::x86_sse2_psrai_w:
case llvm::Intrinsic::x86_sse2_psrai_d:
- case llvm::Intrinsic::x86_sse2_psrl_dq:
case llvm::Intrinsic::x86_mmx_psll_w:
case llvm::Intrinsic::x86_mmx_psll_d:
case llvm::Intrinsic::x86_mmx_psll_q:
// Byte shifts are not implemented.
// case llvm::Intrinsic::x86_avx512_psll_dq_bs:
// case llvm::Intrinsic::x86_avx512_psrl_dq_bs:
- // case llvm::Intrinsic::x86_avx2_psll_dq_bs:
- // case llvm::Intrinsic::x86_avx2_psrl_dq_bs:
- // case llvm::Intrinsic::x86_sse2_psll_dq_bs:
- // case llvm::Intrinsic::x86_sse2_psrl_dq_bs:
case llvm::Intrinsic::x86_sse2_packsswb_128:
case llvm::Intrinsic::x86_sse2_packssdw_128:
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+ ; CHECK: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+ ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
- ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
- %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK: vpsllq
%res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
- ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
- %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK: vpsrlq
%res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
+ ; CHECK: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+ %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
+ ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
+ %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
-define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
- ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
- %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
-
-
define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK: vpsllq
%res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
-define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
- ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
- %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
-
-
define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK: vpsrlq
%res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=pentium4 -mattr=sse2 | FileCheck %s
+
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+ ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+ ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
+
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+ ; CHECK: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+ ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
- ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
- %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK: psllq
%res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
- ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
- %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK: psrlq
%res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]