From 40453da779d8e511e69ace5e07a35af6a27848b7 Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Mon, 3 Nov 2014 20:26:35 +0000 Subject: [PATCH] [X86] 8bit divrem: Improve codegen for AH register extraction. For 8-bit divrems where the remainder is used, we used to generate: divb %sil shrw $8, %ax movzbl %al, %eax That was to avoid an H-reg access, which is problematic mainly because it isn't possible in REX-prefixed instructions. This patch optimizes that to: divb %sil movzbl %ah, %eax To do that, we explicitly extend AH, and extract the L-subreg in the resulting register. The extension is done using the NOREX variants of MOVZX. To support signed operations, MOVSX_NOREX is also added. Further, this introduces a new SDNode type, [us]divrem_ext_hreg, which is then lowered to a sequence containing a single zext (rather than 2). Differential Revision: http://reviews.llvm.org/D6064 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@221176 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelDAGToDAG.cpp | 63 +++++++++++------- lib/Target/X86/X86ISelLowering.cpp | 34 +++++++++- lib/Target/X86/X86ISelLowering.h | 4 ++ lib/Target/X86/X86InstrExtension.td | 14 +++- test/CodeGen/X86/divrem8_ext.ll | 100 ++++++++++++++++++++++++++++ 5 files changed, 187 insertions(+), 28 deletions(-) create mode 100644 test/CodeGen/X86/divrem8_ext.ll diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 4386028e9f5..15b60ba5bf6 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2412,11 +2412,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } case ISD::SDIVREM: - case ISD::UDIVREM: { + case ISD::UDIVREM: + case X86ISD::SDIVREM8_SEXT_HREG: + case X86ISD::UDIVREM8_ZEXT_HREG: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); - bool isSigned = Opcode == ISD::SDIVREM; + bool isSigned = (Opcode == ISD::SDIVREM || + Opcode == X86ISD::SDIVREM8_SEXT_HREG); if (!isSigned) { switch 
(NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); @@ -2532,33 +2535,43 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); } - // Prevent use of AH in a REX instruction by referencing AX instead. - // Shift it down 8 bits. + // Prevent use of AH in a REX instruction by explicitly copying it to + // an ABCD_L register. // // The current assumption of the register allocator is that isel - // won't generate explicit references to the GPR8_NOREX registers. If + // won't generate explicit references to the GR8_ABCD_H registers. If // the allocator and/or the backend get enhanced to be more robust in // that regard, this can be, and should be, removed. - if (HiReg == X86::AH && Subtarget->is64Bit() && - !SDValue(Node, 1).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - X86::AX, MVT::i16, InFlag); - InFlag = Result.getValue(2); - - // If we also need AL (the quotient), get it by extracting a subreg from - // Result. The fast register allocator does not like multiple CopyFromReg - // nodes using aliasing registers. - if (!SDValue(Node, 0).use_empty()) - ReplaceUses(SDValue(Node, 0), - CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); - - // Shift AX right by 8 bits instead of using AH. - Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, - Result, - CurDAG->getTargetConstant(8, MVT::i8)), - 0); - ReplaceUses(SDValue(Node, 1), - CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { + SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); + unsigned AHExtOpcode = + isSigned ? 
X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8; + + SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, + MVT::Glue, AHCopy, InFlag); + SDValue Result(RNode, 0); + InFlag = SDValue(RNode, 1); + + if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG || + Opcode == X86ISD::SDIVREM8_SEXT_HREG) { + if (Node->getValueType(1) == MVT::i64) { + // It's not possible to directly movsx AH to a 64bit register, because + // the latter needs the REX prefix, but the former can't have it. + assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG && + "Unexpected i64 sext of h-register"); + Result = + SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), Result, + CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + 0); + } + } else { + Result = + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); + } + ReplaceUses(SDValue(Node, 1), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the division (low) result, if it is needed. 
if (!SDValue(Node, 0).use_empty()) { diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d8ffc36891d..75ca7cfbe7d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -19080,6 +19080,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SBB: return "X86ISD::SBB"; case X86ISD::SMUL: return "X86ISD::SMUL"; case X86ISD::UMUL: return "X86ISD::UMUL"; + case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; + case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; case X86ISD::INC: return "X86ISD::INC"; case X86ISD::DEC: return "X86ISD::DEC"; case X86ISD::OR: return "X86ISD::OR"; @@ -24278,13 +24280,29 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // (i8,i32 sext (sdivrem (i8 x, i8 y))) -> + // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y))) + // This exposes the sext to the sdivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). 
+ if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && + N0.getValueType() == MVT::i8 && VT == MVT::i32) { + SDLoc dl(N); + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget->hasFp256()) return SDValue(); - EVT VT = N->getValueType(0); if (VT.isVector() && VT.getSizeInBits() == 256) { SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); if (R.getNode()) @@ -24377,6 +24395,20 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, return R; } + // (i8,i32 zext (udivrem (i8 x, i8 y))) -> + // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y))) + // This exposes the zext to the udivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). + if (N0.getOpcode() == ISD::UDIVREM && + N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && + (VT == MVT::i32 || VT == MVT::i64)) { + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + return SDValue(); } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index e81a9d1209c..35e132b944a 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -304,6 +304,10 @@ namespace llvm { // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS SMUL8, UMUL8, + // 8-bit divrem that zero- or sign-extends the high result (AH). + UDIVREM8_ZEXT_HREG, + SDIVREM8_SEXT_HREG, + // MUL_IMM - X86 specific multiply by immediate. 
MUL_IMM, diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index 6be6a1fc6cb..b38129a41f2 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -97,13 +97,23 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), let neverHasSideEffects = 1, isCodeGenOnly = 1 in { def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), - "movz{bl|x}\t{$src, $dst|$dst, $src}", + "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOVZX>, TB, Sched<[WriteALU]>; let mayLoad = 1 in def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), - "movz{bl|x}\t{$src, $dst|$dst, $src}", + "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; + +def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg, + (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVSX>, TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem, + (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVSX>, TB, Sched<[WriteALULd]>; } // MOVSX64rr8 always has a REX prefix and it has an 8-bit register diff --git a/test/CodeGen/X86/divrem8_ext.ll b/test/CodeGen/X86/divrem8_ext.ll new file mode 100644 index 00000000000..ec367c86526 --- /dev/null +++ b/test/CodeGen/X86/divrem8_ext.ll @@ -0,0 +1,100 @@ +; RUN: llc -march=x86-64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-64 +; RUN: llc -march=x86 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-32 +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +define zeroext i8 @test_udivrem_zext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_udivrem_zext_ah +; CHECK: divb +; CHECK: movzbl %ah, [[REG_REM:%[a-z0-9]+]] +; CHECK: movb %al, ([[REG_ZPTR:%[a-z0-9]+]]) +; CHECK: movl [[REG_REM]], %eax 
+; CHECK: ret + %div = udiv i8 %x, %y + store i8 %div, i8* @z + %1 = urem i8 %x, %y + ret i8 %1 +} + +define zeroext i8 @test_urem_zext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_zext_ah +; CHECK: divb +; CHECK: movzbl %ah, %eax +; CHECK: ret + %1 = urem i8 %x, %y + ret i8 %1 +} + +define i8 @test_urem_noext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_noext_ah +; CHECK: divb [[REG_X:%[a-z0-9]+]] +; CHECK: movzbl %ah, %eax +; CHECK: addb [[REG_X]], %al +; CHECK: ret + %1 = urem i8 %x, %y + %2 = add i8 %1, %y + ret i8 %2 +} + +define i64 @test_urem_zext64_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_zext64_ah +; CHECK: divb +; CHECK: movzbl %ah, %eax +; CHECK-32: xorl %edx, %edx +; CHECK: ret + %1 = urem i8 %x, %y + %2 = zext i8 %1 to i64 + ret i64 %2 +} + +define signext i8 @test_sdivrem_sext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_sdivrem_sext_ah +; CHECK: cbtw +; CHECK: idivb +; CHECK: movsbl %ah, [[REG_REM:%[a-z0-9]+]] +; CHECK: movb %al, ([[REG_ZPTR]]) +; CHECK: movl [[REG_REM]], %eax +; CHECK: ret + %div = sdiv i8 %x, %y + store i8 %div, i8* @z + %1 = srem i8 %x, %y + ret i8 %1 +} + +define signext i8 @test_srem_sext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_sext_ah +; CHECK: cbtw +; CHECK: idivb +; CHECK: movsbl %ah, %eax +; CHECK: ret + %1 = srem i8 %x, %y + ret i8 %1 +} + +define i8 @test_srem_noext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_noext_ah +; CHECK: cbtw +; CHECK: idivb [[REG_X:%[a-z0-9]+]] +; CHECK: movsbl %ah, %eax +; CHECK: addb [[REG_X]], %al +; CHECK: ret + %1 = srem i8 %x, %y + %2 = add i8 %1, %y + ret i8 %2 +} + +define i64 @test_srem_sext64_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_sext64_ah +; CHECK: cbtw +; CHECK: idivb +; CHECK: movsbl %ah, %eax +; CHECK-32: movl %eax, %edx +; CHECK-32: sarl $31, %edx +; CHECK-64: movsbq %al, %rax +; CHECK: ret + %1 = srem i8 %x, %y + %2 = sext i8 %1 to i64 + ret i64 %2 +} + +@z = external global i8 -- 2.34.1