}
if (Subtarget->hasSSE2()) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
+
setOperationAction(ISD::SRL, MVT::v8i16, Custom);
setOperationAction(ISD::SRL, MVT::v16i8, Custom);
return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
}
+static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = Op->getOperand(0);
+ MVT VT = Op->getSimpleValueType(0);
+ MVT InVT = In.getSimpleValueType();
+ assert(VT.getSizeInBits() == InVT.getSizeInBits());
+
+ MVT SVT = VT.getScalarType();
+ MVT InSVT = InVT.getScalarType();
+ assert(SVT.getScalarSizeInBits() > InSVT.getScalarSizeInBits());
+
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
+ return SDValue();
+ if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
+ return SDValue();
+
+ SDLoc dl(Op);
+
+ // SSE41 targets can use the pmovsx* instructions directly.
+ if (Subtarget->hasSSE41())
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+ // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
+ SDValue Curr = In;
+ MVT CurrVT = InVT;
+
+ // As SRAI is only available on i16/i32 types, we expand only up to i32
+ // and handle i64 separately.
+ while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) {
+ Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
+ MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
+ CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
+ Curr = DAG.getNode(ISD::BITCAST, dl, CurrVT, Curr);
+ }
+
+ SDValue SignExt = Curr;
+ if (CurrVT != InVT) {
+ unsigned SignExtShift =
+ CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits();
+ SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
+ DAG.getConstant(SignExtShift, dl, MVT::i8));
+ }
+
+ if (CurrVT == VT)
+ return SignExt;
+
+ if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
+ SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
+ DAG.getConstant(31, dl, MVT::i8));
+ SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
+ return DAG.getNode(ISD::BITCAST, dl, VT, Ext);
+ }
+
+ return SDValue();
+}
+
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
const X86Subtarget *Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
- SDLoc dl(N);
+ EVT SVT = VT.getScalarType();
+ EVT InVT = N0->getValueType(0);
+ EVT InSVT = InVT.getScalarType();
+ SDLoc DL(N);
// (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
// (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
// This exposes the sext to the sdivrem lowering, so that it directly extends
// from AH (which we otherwise need to do contortions to access).
if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
- N0.getValueType() == MVT::i8 && VT == MVT::i32) {
+ InVT == MVT::i8 && VT == MVT::i32) {
SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
- SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
+ SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys,
N0.getOperand(0), N0.getOperand(1));
DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
return R.getValue(1);
if (!DCI.isBeforeLegalizeOps()) {
if (N0.getValueType() == MVT::i1) {
- SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue AllOnes =
- DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT);
- return DAG.getNode(ISD::SELECT, dl, VT, N0, AllOnes, Zero);
+ DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
+ return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
}
return SDValue();
}
+ if (VT.isVector()) {
+ auto ExtendToVec128 = [&DAG](SDLoc DL, SDValue N) {
+ EVT InVT = N->getValueType(0);
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
+ 128 / InVT.getScalarSizeInBits());
+ SmallVector<SDValue, 8> Opnds(128 / InVT.getSizeInBits(),
+ DAG.getUNDEF(InVT));
+ Opnds[0] = N;
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
+ };
+
+ // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG
+ // which ensures lowering to X86ISD::VSEXT (pmovsx*).
+ if (VT.getSizeInBits() == 128 &&
+ (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
+ (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
+ SDValue ExOp = ExtendToVec128(DL, N0);
+ return DAG.getSignExtendVectorInReg(ExOp, DL, VT);
+ }
+
+ // On pre-AVX2 targets, split into 128-bit nodes of
+ // ISD::SIGN_EXTEND_VECTOR_INREG.
+ if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) &&
+ (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
+ (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
+ unsigned NumVecs = VT.getSizeInBits() / 128;
+ unsigned NumSubElts = 128 / SVT.getSizeInBits();
+ EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
+ EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
+
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0, Offset = 0; i != NumVecs;
+ ++i, Offset += NumSubElts) {
+ SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
+ DAG.getIntPtrConstant(Offset, DL));
+ SrcVec = ExtendToVec128(DL, SrcVec);
+ SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT);
+ Opnds.push_back(SrcVec);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
+ }
+ }
+
if (!Subtarget->hasFp256())
return SDValue();
define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_8i16_to_8i32:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: # kill: XMM0<def> XMM1<kill>
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_8i16_to_8i32:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: # kill: XMM0<def> XMM1<kill>
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: pslld $16, %xmm0
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pslld $16, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: psrad $16, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: psrad $16, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_8i16_to_8i32:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pmovzxwd %xmm1, %xmm0
-; SSE41-NEXT: pslld $16, %xmm0
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pslld $16, %xmm1
-; SSE41-NEXT: psrad $16, %xmm1
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_8i16_to_8i32:
;
; X32-SSE41-LABEL: sext_8i16_to_8i32:
; X32-SSE41: # BB#0: # %entry
-; X32-SSE41-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE41-NEXT: pmovzxwd %xmm1, %xmm0
-; X32-SSE41-NEXT: pslld $16, %xmm0
-; X32-SSE41-NEXT: psrad $16, %xmm0
-; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; X32-SSE41-NEXT: pslld $16, %xmm1
-; X32-SSE41-NEXT: psrad $16, %xmm1
+; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1
+; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
entry:
%B = sext <8 x i16> %A to <8 x i32>
define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_4i32_to_4i64:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSE2-NEXT: movd %xmm1, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm1, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i32_to_4i64:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSSE3-NEXT: movd %xmm1, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movd %xmm1, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSSE3-NEXT: movd %xmm0, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSSE3-NEXT: movd %xmm0, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i32_to_4i64:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm3
-; SSE41-NEXT: movd %xmm1, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm3
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
;
; X32-SSE41-LABEL: sext_4i32_to_4i64:
; X32-SSE41: # BB#0: # %entry
-; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
-; X32-SSE41-NEXT: sarl $31, %ecx
-; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; X32-SSE41-NEXT: movd %xmm1, %eax
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %ecx
-; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
entry:
define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
; SSE2-LABEL: load_sext_test3:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movsbq 1(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: movsbq (%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movzwl (%rdi), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test3:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movsbq 1(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: movsbq (%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movzwl (%rdi), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: psrad $24, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_test3:
define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
; SSE2-LABEL: load_sext_test4:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movswq 2(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: movswq (%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movd (%rdi), %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test4:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movswq 2(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: movswq (%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movd (%rdi), %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: psrad $16, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_test4:
define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
; SSE2-LABEL: load_sext_test5:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movslq 4(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: movslq (%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test5:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movslq 4(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: movslq (%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_test5:
; SSE2: # BB#0:
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSE2-NEXT: movd %xmm1, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm1, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i1_to_4i64:
; SSSE3: # BB#0:
; SSSE3-NEXT: pslld $31, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSSE3-NEXT: movd %xmm1, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movd %xmm1, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSSE3-NEXT: movd %xmm0, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSSE3-NEXT: movd %xmm0, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i1_to_4i64:
; SSE41: # BB#0:
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm3
-; SSE41-NEXT: movd %xmm1, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm3
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; X32-SSE41: # BB#0:
; X32-SSE41-NEXT: pslld $31, %xmm0
; X32-SSE41-NEXT: psrad $31, %xmm0
-; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
-; X32-SSE41-NEXT: sarl $31, %ecx
-; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; X32-SSE41-NEXT: movd %xmm1, %eax
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %ecx
-; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
%extmask = sext <4 x i1> %mask to <4 x i64>
; SSE2: # BB#0:
; SSE2-NEXT: pslld $24, %xmm0
; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSE2-NEXT: movd %xmm1, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm1, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i8_to_4i64:
; SSSE3: # BB#0:
; SSSE3-NEXT: pslld $24, %xmm0
; SSSE3-NEXT: psrad $24, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSSE3-NEXT: movd %xmm1, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movd %xmm1, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSSE3-NEXT: movd %xmm0, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSSE3-NEXT: movd %xmm0, %rax
-; SSSE3-NEXT: cltq
-; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i8_to_4i64:
; SSE41: # BB#0:
; SSE41-NEXT: pslld $24, %xmm0
; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm3
-; SSE41-NEXT: movd %xmm1, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm3
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: cltq
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; X32-SSE41: # BB#0:
; X32-SSE41-NEXT: pslld $24, %xmm0
; X32-SSE41-NEXT: psrad $24, %xmm0
-; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
-; X32-SSE41-NEXT: sarl $31, %ecx
-; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; X32-SSE41-NEXT: movd %xmm1, %eax
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %ecx
-; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
%extmask = sext <4 x i8> %mask to <4 x i64>