// fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zero's or all ones.
- if (N1C && N0.getOpcode() == ISD::AND &&
+ if (VT.isInteger() && !VT.isVector() &&
+ N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
}
}
+
+ // Hardware support for vector shifts is sparse which makes us scalarize the
+ // vector operations in many cases. Also, on sandybridge ADD is faster than
+ // shl.
+ // (shl V, 1) -> add V,V
+ if (isSplatVector(N1.getNode())) {
+ assert(N0.getValueType().isVector() && "Invalid vector shift type");
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
+ // We shift all of the values by one. In many cases we do not have
+ // hardware support for this operation. This is better expressed as an ADD
+ // of two values.
+ if (N1C && (1 == N1C->getZExtValue())) {
+ return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
+ }
+ }
+
return SDValue();
}
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
- if (!VT.isVector() && VT.isInteger() &&
- N->getOpcode() == ISD::SHL)
- return PerformSHLCombine(N, DAG);
+ if (N->getOpcode() == ISD::SHL) {
+ SDValue V = PerformSHLCombine(N, DAG);
+ if (V.getNode()) return V;
+ }
// On X86 with SSE2 support, we can transform this to a vector shift if
// all elements are shifted by the same amount. We can't do this in legalize
define <4 x i32> @shl4(<4 x i32> %A) nounwind {
entry:
; CHECK: shl4
+; CHECK: padd
; CHECK: pslld
-; CHECK-NEXT: pslld
+; CHECK: ret
%B = shl <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
%C = shl <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
%K = xor <4 x i32> %B, %C
; CHECK: shr4
; CHECK: psrld
; CHECK-NEXT: psrld
+; CHECK: ret
%B = lshr <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
%C = lshr <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
%K = xor <4 x i32> %B, %C
; CHECK: sra4
; CHECK: psrad
; CHECK-NEXT: psrad
+; CHECK: ret
%B = ashr <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
%C = ashr <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
%K = xor <4 x i32> %B, %C
; CHECK: shl2
; CHECK: psllq
; CHECK-NEXT: psllq
+; CHECK: ret
%B = shl <2 x i64> %A, < i64 2, i64 2>
%C = shl <2 x i64> %A, < i64 9, i64 9>
%K = xor <2 x i64> %B, %C
; CHECK: shr2
; CHECK: psrlq
; CHECK-NEXT: psrlq
+; CHECK: ret
%B = lshr <2 x i64> %A, < i64 8, i64 8>
%C = lshr <2 x i64> %A, < i64 1, i64 1>
%K = xor <2 x i64> %B, %C
define <8 x i16> @shl8(<8 x i16> %A) nounwind {
entry:
; CHECK: shl8
+; CHECK: padd
; CHECK: psllw
-; CHECK-NEXT: psllw
+; CHECK: ret
%B = shl <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
%C = shl <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%K = xor <8 x i16> %B, %C
; CHECK: shr8
; CHECK: psrlw
; CHECK-NEXT: psrlw
+; CHECK: ret
%B = lshr <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
%C = lshr <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%K = xor <8 x i16> %B, %C
; CHECK: sra8
; CHECK: psraw
; CHECK-NEXT: psraw
+; CHECK: ret
%B = ashr <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
%C = ashr <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%K = xor <8 x i16> %B, %C
; CHECK: sll8_nosplat
; CHECK-NOT: psll
; CHECK-NOT: psll
+; CHECK: ret
%B = shl <8 x i16> %A, < i16 1, i16 2, i16 3, i16 6, i16 2, i16 2, i16 2, i16 2>
%C = shl <8 x i16> %A, < i16 9, i16 7, i16 5, i16 1, i16 4, i16 1, i16 1, i16 1>
%K = xor <8 x i16> %B, %C
; CHECK: shr2_nosplat
; CHECK-NOT: psrlq
; CHECK-NOT: psrlq
+; CHECK: ret
%B = lshr <2 x i64> %A, < i64 8, i64 1>
%C = lshr <2 x i64> %A, < i64 1, i64 0>
%K = xor <2 x i64> %B, %C
entry:
; CHECK: shl2_other
; CHECK: psllq
+; CHECK: ret
%B = shl <2 x i32> %A, < i32 2, i32 2>
%C = shl <2 x i32> %A, < i32 9, i32 9>
%K = xor <2 x i32> %B, %C
entry:
; CHECK: shr2_other
; CHECK: psrlq
+; CHECK: ret
%B = lshr <2 x i32> %A, < i32 8, i32 8>
%C = lshr <2 x i32> %A, < i32 1, i32 1>
%K = xor <2 x i32> %B, %C