return SDValue();
}
+/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
+/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
+/// to combine math ops, use an LEA, or use a complex addressing mode. This can
+/// eliminate extend, add, and shift instructions.
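+///
+/// For example, the tests below exercise this pattern:
+///   %add = add nsw i32 %i, 5
+///   %ext = sext i32 %add to i64
+/// which becomes:
+///   %ext = sext i32 %i to i64
+///   %add = add nsw i64 %ext, 5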
+static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // TODO: This should be valid for other integer types.
+ EVT VT = Sext->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+
+ // We need an 'add nsw' feeding into the 'sext'.
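+ // Without 'nsw' this transform would be unsound: if the narrow add wraps
+ // (e.g. x == INT32_MAX, C == 1), then sext(add(x, C)) != add(sext(x), C).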
+ SDValue Add = Sext->getOperand(0);
+ if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
+ return SDValue();
+
+ // Having a constant operand to the 'add' ensures that we are not increasing
+ // the instruction count because the constant is extended for free below.
+ // A constant operand can also become the displacement field of an LEA.
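+ // For example, in add_nsw_sext_add in the test file below, the constant 5
+ // becomes the displacement in "leaq 5(%rax,%rsi), %rax".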
+ auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+ if (!AddOp1)
+ return SDValue();
+
+ // Don't make the 'add' bigger if there's no hope of combining it with some
+ // other 'add' or 'shl' instruction.
+ // TODO: It may be profitable to generate simpler LEA instructions in place
+ // of single 'add' instructions, but the cost model for selecting an LEA
+ // currently has a high threshold.
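+ // When the combine does pay off, a 'sext' feeding 'shl'+'add' can fold into
+ // a single scaled-index LEA such as "leaq -40(%rsi,%rax,8)" (see
+ // add_nsw_sext_lsh_add in the test file below).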
+ bool HasLEAPotential = false;
+ for (auto *User : Sext->uses()) {
+ if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
+ HasLEAPotential = true;
+ break;
+ }
+ }
+ if (!HasLEAPotential)
+ return SDValue();
+
+ // Everything looks good, so pull the 'sext' ahead of the 'add'.
+ int64_t AddConstant = AddOp1->getSExtValue();
+ SDValue AddOp0 = Add.getOperand(0);
+ SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
+ SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
+
+ // The wider add is guaranteed not to wrap because both operands are
+ // sign-extended.
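+ // Each operand lies in [-2^31, 2^31), so the sum lies in [-2^32, 2^32) and
+ // cannot wrap in 64 bits.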
+ SDNodeFlags Flags;
+ Flags.setNoSignedWrap(true);
+ return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
+}
+
static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
+ if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
+ return NewAdd;
+
return SDValue();
}
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; The fundamental problem: an add separated from other arithmetic by a sext can't
; be combined with the later instructions. However, if the first add is 'nsw',
; then we can promote the sext ahead of that add to allow optimizations.
define i64 @add_nsw_consts(i32 %i) {
; CHECK-LABEL: add_nsw_consts:
; CHECK: # BB#0:
-; CHECK-NEXT: addl $5, %edi
; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: addq $7, %rax
+; CHECK-NEXT: addq $12, %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
define i64 @add_nsw_sext_add(i32 %i, i64 %x) {
; CHECK-LABEL: add_nsw_sext_add:
; CHECK: # BB#0:
-; CHECK-NEXT: addl $5, %edi
; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: addq %rsi, %rax
+; CHECK-NEXT: leaq 5(%rax,%rsi), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
define i64 @add_nsw_sext_lsh_add(i32 %i, i64 %x) {
; CHECK-LABEL: add_nsw_sext_lsh_add:
; CHECK: # BB#0:
-; CHECK-NEXT: addl $-5, %edi
; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: leaq (%rsi,%rax,8), %rax
+; CHECK-NEXT: leaq -40(%rsi,%rax,8), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, -5
define i8* @gep8(i32 %i, i8* %x) {
; CHECK-LABEL: gep8:
; CHECK: # BB#0:
-; CHECK-NEXT: addl $5, %edi
; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: addq %rsi, %rax
+; CHECK-NEXT: leaq 5(%rax,%rsi), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
define i16* @gep16(i32 %i, i16* %x) {
; CHECK-LABEL: gep16:
; CHECK: # BB#0:
-; CHECK-NEXT: addl $-5, %edi
; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: leaq (%rsi,%rax,2), %rax
+; CHECK-NEXT: leaq -10(%rsi,%rax,2), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, -5
define i32* @gep32(i32 %i, i32* %x) {
; CHECK-LABEL: gep32:
; CHECK: # BB#0:
-; CHECK-NEXT: addl $5, %edi
; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: leaq (%rsi,%rax,4), %rax
+; CHECK-NEXT: leaq 20(%rsi,%rax,4), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
define i64* @gep64(i32 %i, i64* %x) {
; CHECK-LABEL: gep64:
; CHECK: # BB#0:
-; CHECK-NEXT: addl $-5, %edi
; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: leaq (%rsi,%rax,8), %rax
+; CHECK-NEXT: leaq -40(%rsi,%rax,8), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, -5
define i128* @gep128(i32 %i, i128* %x) {
; CHECK-LABEL: gep128:
; CHECK: # BB#0:
-; CHECK-NEXT: addl $5, %edi
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: shlq $4, %rax
-; CHECK-NEXT: addq %rsi, %rax
+; CHECK-NEXT: leaq 80(%rax,%rsi), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
define void @PR20134(i32* %a, i32 %i) {
; CHECK-LABEL: PR20134:
; CHECK: # BB#0:
-; CHECK-NEXT: leal 1(%rsi), %eax
-; CHECK-NEXT: cltq
-; CHECK-NEXT: movl (%rdi,%rax,4), %eax
-; CHECK-NEXT: leal 2(%rsi), %ecx
-; CHECK-NEXT: movslq %ecx, %rcx
-; CHECK-NEXT: addl (%rdi,%rcx,4), %eax
-; CHECK-NEXT: movslq %esi, %rcx
-; CHECK-NEXT: movl %eax, (%rdi,%rcx,4)
+; CHECK-NEXT: movslq %esi, %rax
+; CHECK-NEXT: movl 4(%rdi,%rax,4), %ecx
+; CHECK-NEXT: addl 8(%rdi,%rax,4), %ecx
+; CHECK-NEXT: movl %ecx, (%rdi,%rax,4)
; CHECK-NEXT: retq
%add1 = add nsw i32 %i, 1