From 40453da779d8e511e69ace5e07a35af6a27848b7 Mon Sep 17 00:00:00 2001
From: Ahmed Bougacha <ahmed.bougacha@gmail.com>
Date: Mon, 3 Nov 2014 20:26:35 +0000
Subject: [PATCH] [X86] 8bit divrem: Improve codegen for AH register
 extraction.

For 8-bit divrems where the remainder is used, we used to generate:
    divb  %sil
    shrw  $8, %ax
    movzbl  %al, %eax

That was to avoid an H-reg access, which is problematic mainly because
it isn't possible in REX-prefixed instructions.

This patch optimizes that to:
    divb  %sil
    movzbl  %ah, %eax

To do that, we explicitly extend AH, and extract the L-subreg in the
resulting register.  The extension is done using the NOREX variants of
MOVZX.  To support signed operations, MOVSX_NOREX is also added.
Further, this introduces a new SDNode type, [us]divrem_ext_hreg, which is
then lowered to a sequence containing a single zext (rather than 2).

Differential Revision: http://reviews.llvm.org/D6064


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@221176 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp  |  63 +++++++++++-------
 lib/Target/X86/X86ISelLowering.cpp  |  34 +++++++++-
 lib/Target/X86/X86ISelLowering.h    |   4 ++
 lib/Target/X86/X86InstrExtension.td |  14 +++-
 test/CodeGen/X86/divrem8_ext.ll     | 100 ++++++++++++++++++++++++++++
 5 files changed, 187 insertions(+), 28 deletions(-)
 create mode 100644 test/CodeGen/X86/divrem8_ext.ll

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 4386028e9f5..15b60ba5bf6 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2412,11 +2412,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
   }
 
   case ISD::SDIVREM:
-  case ISD::UDIVREM: {
+  case ISD::UDIVREM:
+  case X86ISD::SDIVREM8_SEXT_HREG:
+  case X86ISD::UDIVREM8_ZEXT_HREG: {
     SDValue N0 = Node->getOperand(0);
     SDValue N1 = Node->getOperand(1);
 
-    bool isSigned = Opcode == ISD::SDIVREM;
+    bool isSigned = (Opcode == ISD::SDIVREM ||
+                     Opcode == X86ISD::SDIVREM8_SEXT_HREG);
     if (!isSigned) {
       switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
@@ -2532,33 +2535,43 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
         SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
     }
 
-    // Prevent use of AH in a REX instruction by referencing AX instead.
-    // Shift it down 8 bits.
+    // Prevent use of AH in a REX instruction by explicitly copying it to
+    // an ABCD_L register.
     //
     // The current assumption of the register allocator is that isel
-    // won't generate explicit references to the GPR8_NOREX registers. If
+    // won't generate explicit references to the GR8_ABCD_H registers. If
     // the allocator and/or the backend get enhanced to be more robust in
     // that regard, this can be, and should be, removed.
-    if (HiReg == X86::AH && Subtarget->is64Bit() &&
-        !SDValue(Node, 1).use_empty()) {
-      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                              X86::AX, MVT::i16, InFlag);
-      InFlag = Result.getValue(2);
-
-      // If we also need AL (the quotient), get it by extracting a subreg from
-      // Result. The fast register allocator does not like multiple CopyFromReg
-      // nodes using aliasing registers.
-      if (!SDValue(Node, 0).use_empty())
-        ReplaceUses(SDValue(Node, 0),
-          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
-
-      // Shift AX right by 8 bits instead of using AH.
-      Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
-                                         Result,
-                                         CurDAG->getTargetConstant(8, MVT::i8)),
-                       0);
-      ReplaceUses(SDValue(Node, 1),
-        CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
+      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
+      unsigned AHExtOpcode =
+          isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8;
+
+      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
+                                             MVT::Glue, AHCopy, InFlag);
+      SDValue Result(RNode, 0);
+      InFlag = SDValue(RNode, 1);
+
+      if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG ||
+          Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
+        if (Node->getValueType(1) == MVT::i64) {
+          // It's not possible to directly movsx AH to a 64bit register, because
+          // the latter needs the REX prefix, but the former can't have it.
+          assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG &&
+                 "Unexpected i64 sext of h-register");
+          Result =
+              SDValue(CurDAG->getMachineNode(
+                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+                          CurDAG->getTargetConstant(0, MVT::i64), Result,
+                          CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+                      0);
+        }
+      } else {
+        Result =
+            CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
+      }
+      ReplaceUses(SDValue(Node, 1), Result);
+      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
     }
     // Copy the division (low) result, if it is needed.
     if (!SDValue(Node, 0).use_empty()) {
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d8ffc36891d..75ca7cfbe7d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19080,6 +19080,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::SBB:                return "X86ISD::SBB";
   case X86ISD::SMUL:               return "X86ISD::SMUL";
   case X86ISD::UMUL:               return "X86ISD::UMUL";
+  case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
+  case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
   case X86ISD::INC:                return "X86ISD::INC";
   case X86ISD::DEC:                return "X86ISD::DEC";
   case X86ISD::OR:                 return "X86ISD::OR";
@@ -24278,13 +24280,29 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
+  // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
+  // This exposes the sext to the sdivrem lowering, so that it directly extends
+  // from AH (which we otherwise need to do contortions to access).
+  if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
+      N0.getValueType() == MVT::i8 && VT == MVT::i32) {
+    SDLoc dl(N);
+    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+    SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
+                            N0.getOperand(0), N0.getOperand(1));
+    DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+    return R.getValue(1);
+  }
+
   if (!DCI.isBeforeLegalizeOps())
     return SDValue();
 
   if (!Subtarget->hasFp256())
     return SDValue();
 
-  EVT VT = N->getValueType(0);
   if (VT.isVector() && VT.getSizeInBits() == 256) {
     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
     if (R.getNode())
@@ -24377,6 +24395,20 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
       return R;
   }
 
+  // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
+  // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
+  // This exposes the zext to the udivrem lowering, so that it directly extends
+  // from AH (which we otherwise need to do contortions to access).
+  if (N0.getOpcode() == ISD::UDIVREM &&
+      N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
+      (VT == MVT::i32 || VT == MVT::i64)) {
+    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+    SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
+                            N0.getOperand(0), N0.getOperand(1));
+    DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+    return R.getValue(1);
+  }
+
   return SDValue();
 }
 
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index e81a9d1209c..35e132b944a 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -304,6 +304,10 @@ namespace llvm {
       // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS
       SMUL8, UMUL8,
 
+      // 8-bit divrem that zero-extend the high result (AH).
+      UDIVREM8_ZEXT_HREG,
+      SDIVREM8_SEXT_HREG,
+
       // MUL_IMM - X86 specific multiply by immediate.
       MUL_IMM,
 
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 6be6a1fc6cb..b38129a41f2 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -97,13 +97,23 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
 let neverHasSideEffects = 1, isCodeGenOnly = 1 in {
 def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
                          (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
-                         "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
                          [], IIC_MOVZX>, TB, Sched<[WriteALU]>;
 let mayLoad = 1 in
 def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
                          (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
-                         "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
                          [], IIC_MOVZX>, TB, Sched<[WriteALULd]>;
+
+def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg,
+                         (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+                         "movs{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
+                         [], IIC_MOVSX>, TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem,
+                         (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+                         "movs{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
+                         [], IIC_MOVSX>, TB, Sched<[WriteALULd]>;
 }
 
 // MOVSX64rr8 always has a REX prefix and it has an 8-bit register
diff --git a/test/CodeGen/X86/divrem8_ext.ll b/test/CodeGen/X86/divrem8_ext.ll
new file mode 100644
index 00000000000..ec367c86526
--- /dev/null
+++ b/test/CodeGen/X86/divrem8_ext.ll
@@ -0,0 +1,100 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-64
+; RUN: llc -march=x86    < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-32
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define zeroext i8 @test_udivrem_zext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_udivrem_zext_ah
+; CHECK:   divb
+; CHECK:   movzbl %ah, [[REG_REM:%[a-z0-9]+]]
+; CHECK:   movb   %al, ([[REG_ZPTR:%[a-z0-9]+]])
+; CHECK:   movl   [[REG_REM]], %eax
+; CHECK:   ret
+  %div = udiv i8 %x, %y
+  store i8 %div, i8* @z
+  %1 = urem i8 %x, %y
+  ret i8 %1
+}
+
+define zeroext i8 @test_urem_zext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_zext_ah
+; CHECK:   divb
+; CHECK:   movzbl %ah, %eax
+; CHECK:   ret
+  %1 = urem i8 %x, %y
+  ret i8 %1
+}
+
+define i8 @test_urem_noext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_noext_ah
+; CHECK:   divb   [[REG_X:%[a-z0-9]+]]
+; CHECK:   movzbl %ah, %eax
+; CHECK:   addb   [[REG_X]], %al
+; CHECK:   ret
+  %1 = urem i8 %x, %y
+  %2 = add i8 %1, %y
+  ret i8 %2
+}
+
+define i64 @test_urem_zext64_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_zext64_ah
+; CHECK:    divb
+; CHECK:    movzbl %ah, %eax
+; CHECK-32: xorl %edx, %edx
+; CHECK:    ret
+  %1 = urem i8 %x, %y
+  %2 = zext i8 %1 to i64
+  ret i64 %2
+}
+
+define signext i8 @test_sdivrem_sext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_sdivrem_sext_ah
+; CHECK:   cbtw
+; CHECK:   idivb
+; CHECK:   movsbl %ah, [[REG_REM:%[a-z0-9]+]]
+; CHECK:   movb   %al, ([[REG_ZPTR]])
+; CHECK:   movl   [[REG_REM]], %eax
+; CHECK:   ret
+  %div = sdiv i8 %x, %y
+  store i8 %div, i8* @z
+  %1 = srem i8 %x, %y
+  ret i8 %1
+}
+
+define signext i8 @test_srem_sext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_sext_ah
+; CHECK:   cbtw
+; CHECK:   idivb
+; CHECK:   movsbl %ah, %eax
+; CHECK:   ret
+  %1 = srem i8 %x, %y
+  ret i8 %1
+}
+
+define i8 @test_srem_noext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_noext_ah
+; CHECK:   cbtw
+; CHECK:   idivb [[REG_X:%[a-z0-9]+]]
+; CHECK:   movsbl %ah, %eax
+; CHECK:   addb   [[REG_X]], %al
+; CHECK:   ret
+  %1 = srem i8 %x, %y
+  %2 = add i8 %1, %y
+  ret i8 %2
+}
+
+define i64 @test_srem_sext64_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_sext64_ah
+; CHECK:    cbtw
+; CHECK:    idivb
+; CHECK:    movsbl %ah, %eax
+; CHECK-32: movl %eax, %edx
+; CHECK-32: sarl $31, %edx
+; CHECK-64: movsbq %al, %rax
+; CHECK:    ret
+  %1 = srem i8 %x, %y
+  %2 = sext i8 %1 to i64
+  ret i64 %2
+}
+
+@z = external global i8
-- 
2.34.1