Use PPC reciprocal estimates with Newton iteration in fast-math mode
authorHal Finkel <hfinkel@anl.gov>
Wed, 3 Apr 2013 04:01:11 +0000 (04:01 +0000)
committerHal Finkel <hfinkel@anl.gov>
Wed, 3 Apr 2013 04:01:11 +0000 (04:01 +0000)
When unsafe FP math operations are enabled, we can use the fre[s] and
frsqrte[s] instructions, which generate reciprocal (sqrt) estimates, together
with some Newton iteration, in order to quickly generate floating-point
division and sqrt results. All of these instructions are separately optional,
and so each has its own feature flag (except for the Altivec instructions,
which are covered under the existing Altivec flag). Doing this is not only
faster than using the IEEE-compliant fdiv/fsqrt instructions, but allows these
computations to be pipelined with other computations in order to hide their
overall latency.

I've also added a couple of missing fnmsub patterns which turned out to be
missing (but are necessary for good code generation of the Newton iterations).
Altivec needs a similar fix, but that will probably be more complicated because
fneg is expanded for Altivec's v4f32.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178617 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/PowerPC/PPC.td
lib/Target/PowerPC/PPCISelLowering.cpp
lib/Target/PowerPC/PPCISelLowering.h
lib/Target/PowerPC/PPCInstrAltivec.td
lib/Target/PowerPC/PPCInstrInfo.td
lib/Target/PowerPC/PPCSubtarget.cpp
lib/Target/PowerPC/PPCSubtarget.h
test/CodeGen/PowerPC/recipest.ll [new file with mode: 0644]

index a1ea2297bf5a930b3d1245ed381d89ae54ec2f32..98869eaa17fa010d0c7d5d22f6ef4943f23ae0e4 100644 (file)
@@ -57,6 +57,16 @@ def FeatureMFOCRF    : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
                                         "Enable the MFOCRF instruction">;
 def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
                                         "Enable the fsqrt instruction">;
+def FeatureFRE       : SubtargetFeature<"fre", "HasFRE", "true",
+                                        "Enable the fre instruction">;
+def FeatureFRES      : SubtargetFeature<"fres", "HasFRES", "true",
+                                        "Enable the fres instruction">;
+def FeatureFRSQRTE   : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true",
+                                        "Enable the frsqrte instruction">;
+def FeatureFRSQRTES  : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true",
+                                        "Enable the frsqrtes instruction">;
+def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true",
+                              "Assume higher precision reciprocal estimates">;
 def FeatureSTFIWX    : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
                                         "Enable the stfiwx instruction">;
 def FeatureLFIWAX    : SubtargetFeature<"lfiwax","HasLFIWAX", "true",
@@ -101,30 +111,46 @@ include "PPCInstrInfo.td"
 
 def : Processor<"generic", G3Itineraries, [Directive32]>;
 def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL,
+                                           FeatureFRES, FeatureFRSQRTE,
                                            FeatureBookE]>;
 def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL,
+                                           FeatureFRES, FeatureFRSQRTE,
                                            FeatureBookE]>;
 def : Processor<"601", G3Itineraries, [Directive601]>;
 def : Processor<"602", G3Itineraries, [Directive602]>;
-def : Processor<"603", G3Itineraries, [Directive603]>;
-def : Processor<"603e", G3Itineraries, [Directive603]>;
-def : Processor<"603ev", G3Itineraries, [Directive603]>;
-def : Processor<"604", G3Itineraries, [Directive604]>;
-def : Processor<"604e", G3Itineraries, [Directive604]>;
-def : Processor<"620", G3Itineraries, [Directive620]>;
-def : Processor<"750", G4Itineraries, [Directive750]>;
-def : Processor<"g3", G3Itineraries, [Directive750]>;
-def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec]>;
-def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec]>;
-def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec]>;
-def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec]>;
+def : Processor<"603", G3Itineraries, [Directive603,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"603e", G3Itineraries, [Directive603,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"603ev", G3Itineraries, [Directive603,
+                                         FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"604", G3Itineraries, [Directive604,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"604e", G3Itineraries, [Directive604,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"620", G3Itineraries, [Directive620,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"750", G4Itineraries, [Directive750,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g3", G3Itineraries, [Directive750,
+                                      FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec,
+                                      FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec,
+                                            FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec,
+                                           FeatureFRES, FeatureFRSQRTE]>;
 def : Processor<"970", G5Itineraries,
                   [Directive970, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
+                   FeatureMFOCRF, FeatureFSqrt,
+                   FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX,
                    Feature64Bit /*, Feature64BitRegs */]>;
 def : Processor<"g5", G5Itineraries,
                   [Directive970, FeatureAltivec,
                    FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
+                   FeatureFRES, FeatureFRSQRTE,
                    Feature64Bit /*, Feature64BitRegs */]>;
 def : ProcessorModel<"e500mc", PPCE500mcModel,
                   [DirectiveE500mc, FeatureMFOCRF,
@@ -134,43 +160,57 @@ def : ProcessorModel<"e5500", PPCE5500Model,
                    FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
 def : Processor<"a2", PPCA2Itineraries,
                   [DirectiveA2, FeatureBookE, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, FeatureFPCVT, FeatureISEL,
                    FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
                /*, Feature64BitRegs */]>;
 def : Processor<"a2q", PPCA2Itineraries,
                   [DirectiveA2, FeatureBookE, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, FeatureFPCVT, FeatureISEL,
                    FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
                /*, Feature64BitRegs */, FeatureQPX]>;
 def : Processor<"pwr3", G5Itineraries,
-                  [DirectivePwr3, FeatureAltivec, FeatureMFOCRF,
+                  [DirectivePwr3, FeatureAltivec,
+                   FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
                    FeatureSTFIWX, Feature64Bit]>;
 def : Processor<"pwr4", G5Itineraries,
                   [DirectivePwr4, FeatureAltivec, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
+                   FeatureFSqrt, FeatureFRES, FeatureFRSQRTE,
+                   FeatureSTFIWX, Feature64Bit]>;
 def : Processor<"pwr5", G5Itineraries,
                   [DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureSTFIWX, Feature64Bit]>;
 def : Processor<"pwr5x", G5Itineraries,
                   [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, FeatureFPRND,
-                   Feature64Bit]>;
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureSTFIWX, FeatureFPRND, Feature64Bit]>;
 def : Processor<"pwr6", G5Itineraries,
                   [DirectivePwr6, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
-                   FeatureLFIWAX, FeatureFPRND, Feature64Bit
-               /*, Feature64BitRegs */]>;
+                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, Feature64Bit /*, Feature64BitRegs */]>;
 def : Processor<"pwr6x", G5Itineraries,
                   [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, Feature64Bit]>;
 def : Processor<"pwr7", G5Itineraries,
                   [DirectivePwr7, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
-                   FeatureLFIWAX, FeatureFPRND, FeatureFPCVT,
-                   FeatureISEL, FeaturePOPCNTD, FeatureLDBRX,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
+                   FeaturePOPCNTD, FeatureLDBRX,
                    Feature64Bit /*, Feature64BitRegs */]>;
 def : Processor<"ppc", G3Itineraries, [Directive32]>;
 def : Processor<"ppc64", G5Itineraries,
index ee43fc668e7c351eb5580937dca61478ed87cf8b..3bda37b43e0d60f96a3003928f15c9c767e2ba58 100644 (file)
@@ -150,10 +150,15 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
 
   // If we're enabling GP optimizations, use hardware square root
-  if (!Subtarget->hasFSQRT()) {
+  if (!Subtarget->hasFSQRT() &&
+      !(TM.Options.UnsafeFPMath &&
+        Subtarget->hasFRSQRTE() && Subtarget->hasFRE()))
     setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+
+  if (!Subtarget->hasFSQRT() &&
+      !(TM.Options.UnsafeFPMath &&
+        Subtarget->hasFRSQRTES() && Subtarget->hasFRES()))
     setOperationAction(ISD::FSQRT, MVT::f32, Expand);
-  }
 
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
@@ -469,6 +474,12 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
 
     setOperationAction(ISD::MUL, MVT::v4f32, Legal);
     setOperationAction(ISD::FMA, MVT::v4f32, Legal);
+
+    if (TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+    }
+
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
     setOperationAction(ISD::MUL, MVT::v16i8, Custom);
@@ -519,6 +530,12 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   setTargetDAGCombine(ISD::BR_CC);
   setTargetDAGCombine(ISD::BSWAP);
 
+  // Use reciprocal estimates.
+  if (TM.Options.UnsafeFPMath) {
+    setTargetDAGCombine(ISD::FDIV);
+    setTargetDAGCombine(ISD::FSQRT);
+  }
+
   // Darwin long double math library functions have $LDBL128 appended.
   if (Subtarget->isDarwin()) {
     setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
@@ -590,6 +607,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::FCFID:           return "PPCISD::FCFID";
   case PPCISD::FCTIDZ:          return "PPCISD::FCTIDZ";
   case PPCISD::FCTIWZ:          return "PPCISD::FCTIWZ";
+  case PPCISD::FRE:             return "PPCISD::FRE";
+  case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
   case PPCISD::STFIWX:          return "PPCISD::STFIWX";
   case PPCISD::VMADDFP:         return "PPCISD::VMADDFP";
   case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
@@ -6658,6 +6677,153 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
 // Target Optimization Hooks
 //===----------------------------------------------------------------------===//
 
+SDValue PPCTargetLowering::DAGCombineFastRecip(SDNode *N,
+                                             DAGCombinerInfo &DCI,
+                                             bool UseOperand) const {
+  if (DCI.isAfterLegalizeVectorOps())
+    return SDValue();
+
+  if ((N->getValueType(0) == MVT::f32 && PPCSubTarget.hasFRES()) ||
+      (N->getValueType(0) == MVT::f64 && PPCSubTarget.hasFRE())  ||
+      (N->getValueType(0) == MVT::v4f32 && PPCSubTarget.hasAltivec())) {
+
+    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+    // For the reciprocal, we need to find the zero of the function:
+    //   F(X) = A X - 1 [which has a zero at X = 1/A]
+    //     =>
+    //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
+    //     does not require additional intermediate precision]
+
+    // Convergence is quadratic, so we essentially double the number of digits
+    // correct after every iteration. The minimum architected relative
+    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
+    // 23 digits and double has 52 digits.
+    int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
+    if (N->getValueType(0).getScalarType() == MVT::f64)
+      ++Iterations;
+
+    SelectionDAG &DAG = DCI.DAG;
+    DebugLoc dl = N->getDebugLoc();
+
+    SDValue FPOne =
+      DAG.getConstantFP(1.0, N->getValueType(0).getScalarType());
+    if (N->getValueType(0).isVector()) {
+      assert(N->getValueType(0).getVectorNumElements() == 4 &&
+             "Unknown vector type");
+      FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0),
+                          FPOne, FPOne, FPOne, FPOne);
+    }
+
+    SDValue Est = DAG.getNode(PPCISD::FRE, dl,
+                              N->getValueType(0),
+                              UseOperand ? N->getOperand(1) :
+                                           SDValue(N, 0));
+    DCI.AddToWorklist(Est.getNode());
+
+    // Newton iterations: Est = Est + Est (1 - Arg * Est)
+    for (int i = 0; i < Iterations; ++i) {
+      SDValue NewEst = DAG.getNode(ISD::FMUL, dl,
+                                   N->getValueType(0),
+                                   UseOperand ? N->getOperand(1) :
+                                                SDValue(N, 0),
+                                   Est);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FSUB, dl,
+                           N->getValueType(0), FPOne, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FMUL, dl,
+                           N->getValueType(0), Est, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      Est = DAG.getNode(ISD::FADD, dl,
+                        N->getValueType(0), Est, NewEst);
+      DCI.AddToWorklist(Est.getNode());
+    }
+
+    return Est;
+  }
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  if (DCI.isAfterLegalizeVectorOps())
+    return SDValue();
+
+  if ((N->getValueType(0) == MVT::f32 && PPCSubTarget.hasFRSQRTES()) ||
+      (N->getValueType(0) == MVT::f64 && PPCSubTarget.hasFRSQRTE())  ||
+      (N->getValueType(0) == MVT::v4f32 && PPCSubTarget.hasAltivec())) {
+
+    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+    // For the reciprocal sqrt, we need to find the zero of the function:
+    //   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+    //     =>
+    //   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
+    // As a result, we precompute A/2 prior to the iteration loop.
+
+    // Convergence is quadratic, so we essentially double the number of digits
+    // correct after every iteration. The minimum architected relative
+    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
+    // 23 digits and double has 52 digits.
+    int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
+    if (N->getValueType(0).getScalarType() == MVT::f64)
+      ++Iterations;
+
+    SelectionDAG &DAG = DCI.DAG;
+    DebugLoc dl = N->getDebugLoc();
+
+    SDValue FPThreeHalfs =
+      DAG.getConstantFP(1.5, N->getValueType(0).getScalarType());
+    if (N->getValueType(0).isVector()) {
+      assert(N->getValueType(0).getVectorNumElements() == 4 &&
+             "Unknown vector type");
+      FPThreeHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0),
+                                 FPThreeHalfs, FPThreeHalfs,
+                                 FPThreeHalfs, FPThreeHalfs);
+    }
+
+    SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl,
+                              N->getValueType(0), N->getOperand(0));
+    DCI.AddToWorklist(Est.getNode());
+
+    // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that
+    // this entire sequence requires only one FP constant.
+    SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                                  FPThreeHalfs, N->getOperand(0));
+    DCI.AddToWorklist(HalfArg.getNode());
+
+    HalfArg = DAG.getNode(ISD::FSUB, dl, N->getValueType(0),
+                          HalfArg, N->getOperand(0));
+    DCI.AddToWorklist(HalfArg.getNode());
+
+    // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
+    for (int i = 0; i < Iterations; ++i) {
+      SDValue NewEst = DAG.getNode(ISD::FMUL, dl,
+                                   N->getValueType(0), Est, Est);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FMUL, dl,
+                           N->getValueType(0), HalfArg, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FSUB, dl,
+                           N->getValueType(0), FPThreeHalfs, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      Est = DAG.getNode(ISD::FMUL, dl,
+                        N->getValueType(0), Est, NewEst);
+      DCI.AddToWorklist(Est.getNode());
+    }
+
+    return Est;
+  }
+
+  return SDValue();
+}
+
 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   const TargetMachine &TM = getTargetMachine();
@@ -6684,7 +6850,44 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
         return N->getOperand(0);
     }
     break;
+  case ISD::FDIV: {
+    assert(TM.Options.UnsafeFPMath &&
+           "Reciprocal estimates require UnsafeFPMath");
+
+    if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
+      SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(1).getNode(), DCI);
+      if (RV.getNode() != 0) {
+        DCI.AddToWorklist(RV.getNode());
+        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                           N->getOperand(0), RV);
+      }
+    }
+
+    SDValue RV = DAGCombineFastRecip(N, DCI);
+    if (RV.getNode() != 0) {
+      DCI.AddToWorklist(RV.getNode());
+      return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                         N->getOperand(0), RV);
+    }
+
+    }
+    break;
+  case ISD::FSQRT: {
+    assert(TM.Options.UnsafeFPMath &&
+           "Reciprocal estimates require UnsafeFPMath");
+
+    // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the
+    // reciprocal sqrt.
+    SDValue RV = DAGCombineFastRecipFSQRT(N, DCI);
+    if (RV.getNode() != 0) {
+      DCI.AddToWorklist(RV.getNode());
+      RV = DAGCombineFastRecip(RV.getNode(), DCI, false);
+      if (RV.getNode() != 0)
+        return RV;
+    }
 
+    }
+    break;
   case ISD::SINT_TO_FP:
     if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
       if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
index 6690899e5d2cd8f257936b4ba2dae064f316d7e4..2ddcb25a23eb00f7af8848c9603361ba2dc80ecc 100644 (file)
@@ -49,6 +49,9 @@ namespace llvm {
       /// unsigned integers.
       FCTIDUZ, FCTIWUZ,
 
+      /// Reciprocal estimate instructions (unary FP ops).
+      FRE, FRSQRTE,
+
       // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
       // three v4f32 operands and producing a v4f32 result.
       VMADDFP, VNMSUBFP,
@@ -620,6 +623,10 @@ namespace llvm {
 
     SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue DAGCombineFastRecip(SDNode *N, DAGCombinerInfo &DCI,
+                                bool UseOperand = true) const;
+    SDValue DAGCombineFastRecipFSQRT(SDNode *N, DAGCombinerInfo &DCI) const;
   };
 }
 
index fff91df4184c79de36b7f154660407e18597aec4..18968ba0dd97633bfcf30323bf7a774747062622 100644 (file)
@@ -795,6 +795,9 @@ def : Pat<(int_ppc_altivec_vnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
 def : Pat<(PPCvperm (v16i8 VRRC:$vA), VRRC:$vB, VRRC:$vC),
           (VPERM $vA, $vB, $vC)>;
 
+def : Pat<(PPCfre v4f32:$A), (VREFP $A)>;
+def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>;
+
 // Vector shifts
 def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)),
           (v16i8 (VSLB $vA, $vB))>;
index 067f5aacfaab58ad96406a825254085521689d2a..feca7c037019eba812f266088c6d8900870b46f3 100644 (file)
@@ -62,6 +62,9 @@ def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
 // PowerPC specific DAG Nodes.
 //
 
+def PPCfre    : SDNode<"PPCISD::FRE",     SDTFPUnaryOp, []>;
+def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;
+
 def PPCfcfid  : SDNode<"PPCISD::FCFID",   SDTFPUnaryOp, []>;
 def PPCfcfidu : SDNode<"PPCISD::FCFIDU",  SDTFPUnaryOp, []>;
 def PPCfcfids : SDNode<"PPCISD::FCFIDS",  SDTFPRoundOp, []>;
@@ -1223,8 +1226,21 @@ def FNEGS  : XForm_26<63, 40, (outs F4RC:$frD), (ins F4RC:$frB),
 def FNEGD  : XForm_26<63, 40, (outs F8RC:$frD), (ins F8RC:$frB),
                       "fneg $frD, $frB", FPGeneral,
                       [(set f64:$frD, (fneg f64:$frB))]>;
+
+// Reciprocal estimates.
+def FRE      : XForm_26<63, 24, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "fre $frD, $frB", FPGeneral,
+                        [(set f64:$frD, (PPCfre f64:$frB))]>;
+def FRES     : XForm_26<59, 24, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "fres $frD, $frB", FPGeneral,
+                        [(set f32:$frD, (PPCfre f32:$frB))]>;
+def FRSQRTE  : XForm_26<63, 26, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "frsqrte $frD, $frB", FPGeneral,
+                        [(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
+def FRSQRTES : XForm_26<59, 26, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "frsqrtes $frD, $frB", FPGeneral,
+                        [(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
 }
-                      
 
 // XL-Form instructions.  condition register logical ops.
 //
@@ -1687,5 +1703,15 @@ def : Pat<(membarrier (i32 imm /*ll*/),
 
 def : Pat<(atomic_fence (imm), (imm)), (SYNC)>;
 
+// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
+          (FNMSUB $A, $C, $B)>;
+def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
+          (FNMSUB $A, $C, $B)>;
+def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
+          (FNMSUBS $A, $C, $B)>;
+def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
+          (FNMSUBS $A, $C, $B)>;
+
 include "PPCInstrAltivec.td"
 include "PPCInstr64Bit.td"
index 1b7150fd37dcee61176d6afaa16df861a1ad303e..a8f2b3f47d1b52ea0746838582a89931ba2b978e 100644 (file)
@@ -38,6 +38,11 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
   , HasAltivec(false)
   , HasQPX(false)
   , HasFSQRT(false)
+  , HasFRE(false)
+  , HasFRES(false)
+  , HasFRSQRTE(false)
+  , HasFRSQRTES(false)
+  , HasRecipPrec(false)
   , HasSTFIWX(false)
   , HasLFIWAX(false)
   , HasFPRND(false)
index d5dfa1e8e148effbeafcc35e3bcdaaa8fbb855e2..65b4d211fc6ad1abd104e351d35d2d10f398cc50 100644 (file)
@@ -77,6 +77,8 @@ protected:
   bool HasAltivec;
   bool HasQPX;
   bool HasFSQRT;
+  bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
+  bool HasRecipPrec;
   bool HasSTFIWX;
   bool HasLFIWAX;
   bool HasFPRND;
@@ -159,6 +161,11 @@ public:
 
   // Specific obvious features.
   bool hasFSQRT() const { return HasFSQRT; }
+  bool hasFRE() const { return HasFRE; }
+  bool hasFRES() const { return HasFRES; }
+  bool hasFRSQRTE() const { return HasFRSQRTE; }
+  bool hasFRSQRTES() const { return HasFRSQRTES; }
+  bool hasRecipPrec() const { return HasRecipPrec; }
   bool hasSTFIWX() const { return HasSTFIWX; }
   bool hasLFIWAX() const { return HasLFIWAX; }
   bool hasFPRND() const { return HasFPRND; }
diff --git a/test/CodeGen/PowerPC/recipest.ll b/test/CodeGen/PowerPC/recipest.ll
new file mode 100644 (file)
index 0000000..cad98c5
--- /dev/null
@@ -0,0 +1,178 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck -check-prefix=CHECK-SAFE %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare double @llvm.sqrt.f64(double)
+declare float @llvm.sqrt.f32(float)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+
+define double @foo(double %a, double %b) nounwind {
+entry:
+  %x = call double @llvm.sqrt.f64(double %b)
+  %r = fdiv double %a, %x
+  ret double %r
+
+; CHECK: @foo
+; CHECK: frsqrte
+; CHECK: fnmsub
+; CHECK: fmul
+; CHECK: fmadd
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmadd
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: blr
+
+; CHECK-SAFE: @foo
+; CHECK-SAFE: fsqrt
+; CHECK-SAFE: fdiv
+; CHECK-SAFE: blr
+}
+
+define float @goo(float %a, float %b) nounwind {
+entry:
+  %x = call float @llvm.sqrt.f32(float %b)
+  %r = fdiv float %a, %x
+  ret float %r
+
+; CHECK: @goo
+; CHECK: frsqrtes
+; CHECK: fnmsubs
+; CHECK: fmuls
+; CHECK: fmadds
+; CHECK: fmuls
+; CHECK: fmuls
+; CHECK: blr
+
+; CHECK-SAFE: @goo
+; CHECK-SAFE: fsqrts
+; CHECK-SAFE: fdivs
+; CHECK-SAFE: blr
+}
+
+define <4 x float> @hoo(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+  %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b)
+  %r = fdiv <4 x float> %a, %x
+  ret <4 x float> %r
+
+; CHECK: @hoo
+; CHECK: vrsqrtefp
+
+; CHECK-SAFE: @hoo
+; CHECK-SAFE-NOT: vrsqrtefp
+; CHECK-SAFE: blr
+}
+
+define double @foo2(double %a, double %b) nounwind {
+entry:
+  %r = fdiv double %a, %b
+  ret double %r
+
+; CHECK: @foo2
+; CHECK: fre
+; CHECK: fnmsub
+; CHECK: fmadd
+; CHECK: fnmsub
+; CHECK: fmadd
+; CHECK: fmul
+; CHECK: blr
+
+; CHECK-SAFE: @foo2
+; CHECK-SAFE: fdiv
+; CHECK-SAFE: blr
+}
+
+define float @goo2(float %a, float %b) nounwind {
+entry:
+  %r = fdiv float %a, %b
+  ret float %r
+
+; CHECK: @goo2
+; CHECK: fres
+; CHECK: fnmsubs
+; CHECK: fmadds
+; CHECK: fmuls
+; CHECK: blr
+
+; CHECK-SAFE: @goo2
+; CHECK-SAFE: fdivs
+; CHECK-SAFE: blr
+}
+
+define <4 x float> @hoo2(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+  %r = fdiv <4 x float> %a, %b
+  ret <4 x float> %r
+
+; CHECK: @hoo2
+; CHECK: vrefp
+
+; CHECK-SAFE: @hoo2
+; CHECK-SAFE-NOT: vrefp
+; CHECK-SAFE: blr
+}
+
+define double @foo3(double %a) nounwind {
+entry:
+  %r = call double @llvm.sqrt.f64(double %a)
+  ret double %r
+
+; CHECK: @foo3
+; CHECK: frsqrte
+; CHECK: fnmsub
+; CHECK: fmul
+; CHECK: fmadd
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmadd
+; CHECK: fmul
+; CHECK: fre
+; CHECK: fnmsub
+; CHECK: fmadd
+; CHECK: fnmsub
+; CHECK: fmadd
+; CHECK: blr
+
+; CHECK-SAFE: @foo3
+; CHECK-SAFE: fsqrt
+; CHECK-SAFE: blr
+}
+
+define float @goo3(float %a) nounwind {
+entry:
+  %r = call float @llvm.sqrt.f32(float %a)
+  ret float %r
+
+; CHECK: @goo3
+; CHECK: frsqrtes
+; CHECK: fnmsubs
+; CHECK: fmuls
+; CHECK: fmadds
+; CHECK: fmuls
+; CHECK: fres
+; CHECK: fnmsubs
+; CHECK: fmadds
+; CHECK: blr
+
+; CHECK-SAFE: @goo3
+; CHECK-SAFE: fsqrts
+; CHECK-SAFE: blr
+}
+
+define <4 x float> @hoo3(<4 x float> %a) nounwind {
+entry:
+  %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
+  ret <4 x float> %r
+
+; CHECK: @hoo3
+; CHECK: vrsqrtefp
+; CHECK: vrefp
+
+; CHECK-SAFE: @hoo3
+; CHECK-SAFE-NOT: vrsqrtefp
+; CHECK-SAFE: blr
+}
+