[SystemZ] Remove redundant frame MMOs

[oota-llvm.git] / lib / Target / SystemZ / SystemZISelLowering.cpp
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp

index ea094b39f35f02229a916d6d7d6188a3821124f2..256c27829d7f2bd3a7499056265852e3a2982d4b 100644 (file)
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -67,7 +67,7 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
  
    // TODO: It may be better to default to latency-oriented scheduling, however
    // LLVM's current latency-oriented scheduler can't handle physreg definitions
-  // such as SystemZ has with PSW, so set this to the register-pressure
+  // such as SystemZ has with CC, so set this to the register-pressure
    // scheduler, because it can.
    setSchedulingPreference(Sched::RegPressure);
  
@@ -200,11 +200,6 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
    setOperationAction(ISD::STACKSAVE,    MVT::Other, Custom);
    setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
  
-  // Expand these using getExceptionSelectorRegister() and
-  // getExceptionPointerRegister().
-  setOperationAction(ISD::EXCEPTIONADDR, PtrVT, Expand);
-  setOperationAction(ISD::EHSELECTION,   PtrVT, Expand);
-
    // Handle floating-point types.
    for (unsigned I = MVT::FIRST_FP_VALUETYPE;
         I <= MVT::LAST_FP_VALUETYPE;
@@ -253,6 +248,16 @@ bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
    return Imm.isZero() || Imm.isNegZero();
  }
  
+bool SystemZTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                          bool *Fast) const {
+  // Accept unaligned accesses for every VT and, if the caller asked via
+  // Fast, report them as fast: they should never be slower than the expanded
+  // version.  The few cases that require alignment check for it explicitly.
+  if (Fast)
+    *Fast = true;
+  return true;
+}
+  
  //===----------------------------------------------------------------------===//
  // Inline asm support
  //===----------------------------------------------------------------------===//
@@ -350,7 +355,7 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
  }
  
  std::pair<unsigned, const TargetRegisterClass *> SystemZTargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
+getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const {
    if (Constraint.size() == 1) {
      // GCC Constraint Letters
      switch (Constraint[0]) {
@@ -436,7 +441,7 @@ LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
  // Value is a value that has been passed to us in the location described by VA
  // (and so has type VA.getLocVT()).  Convert Value to VA.getValVT(), chaining
  // any loads onto Chain.
-static SDValue convertLocVTToValVT(SelectionDAG &DAG, DebugLoc DL,
+static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
                                     CCValAssign &VA, SDValue Chain,
                                     SDValue Value) {
    // If the argument has been promoted from a smaller type, insert an
@@ -461,7 +466,7 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, DebugLoc DL,
  // Value is a value of type VA.getValVT() that we need to copy into
  // the location described by VA.  Return a copy of Value converted to
  // VA.getValVT().  The caller is responsible for handling indirect values.
-static SDValue convertValVTToLocVT(SelectionDAG &DAG, DebugLoc DL,
+static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
                                     CCValAssign &VA, SDValue Value) {
    switch (VA.getLocInfo()) {
    case CCValAssign::SExt:
@@ -480,7 +485,7 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, DebugLoc DL,
  SDValue SystemZTargetLowering::
  LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
                       const SmallVectorImpl<ISD::InputArg> &Ins,
-                     DebugLoc DL, SelectionDAG &DAG,
+                     SDLoc DL, SelectionDAG &DAG,
                       SmallVectorImpl<SDValue> &InVals) const {
    MachineFunction &MF = DAG.getMachineFunction();
    MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -599,7 +604,7 @@ SDValue
  SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                   SmallVectorImpl<SDValue> &InVals) const {
    SelectionDAG &DAG = CLI.DAG;
-  DebugLoc &DL = CLI.DL;
+  SDLoc &DL = CLI.DL;
    SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
    SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
    SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -623,7 +628,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
    unsigned NumBytes = ArgCCInfo.getNextStackOffset();
  
    // Mark the start of the call.
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, PtrVT, true));
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, PtrVT, true),
+                               DL);
  
    // Copy argument values to their designated locations.
    SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
@@ -714,7 +720,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getConstant(NumBytes, PtrVT, true),
                               DAG.getConstant(0, PtrVT, true),
-                             Glue);
+                             Glue, DL);
    Glue = Chain.getValue(1);
  
    // Assign locations to each value returned by this call.
@@ -745,7 +751,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain,
                                     CallingConv::ID CallConv, bool IsVarArg,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
-                                   DebugLoc DL, SelectionDAG &DAG) const {
+                                   SDLoc DL, SelectionDAG &DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
  
    // Assign locations to each returned value.
@@ -879,7 +885,7 @@ static void adjustSubwordCmp(SelectionDAG &DAG, bool &IsUnsigned,
    ISD::LoadExtType ExtType = IsUnsigned ? ISD::ZEXTLOAD : ISD::SEXTLOAD;
    if (CmpOp0.getValueType() != MVT::i32 ||
        Load->getExtensionType() != ExtType)
-    CmpOp0 = DAG.getExtLoad(ExtType, Load->getDebugLoc(), MVT::i32,
+    CmpOp0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32,
                              Load->getChain(), Load->getBasePtr(),
                              Load->getPointerInfo(), Load->getMemoryVT(),
                              Load->isVolatile(), Load->isNonTemporal(),
@@ -949,7 +955,7 @@ static SDValue emitCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
        IsUnsigned = true;
    }
  
-  DebugLoc DL = CmpOp0.getDebugLoc();
+  SDLoc DL(CmpOp0);
    return DAG.getNode((IsUnsigned ? SystemZISD::UCMP : SystemZISD::CMP),
                       DL, MVT::Glue, CmpOp0, CmpOp1);
  }
@@ -959,7 +965,7 @@ static SDValue emitCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
  // Extend extends Op0 to a GR128, and Opcode performs the GR128 operation
  // on the extended Op0 and (unextended) Op1.  Store the even register result
  // in Even and the odd register result in Odd.
-static void lowerGR128Binary(SelectionDAG &DAG, DebugLoc DL, EVT VT,
+static void lowerGR128Binary(SelectionDAG &DAG, SDLoc DL, EVT VT,
                               unsigned Extend, unsigned Opcode,
                               SDValue Op0, SDValue Op1,
                               SDValue &Even, SDValue &Odd) {
@@ -983,7 +989,7 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
    SDValue CmpOp0   = Op.getOperand(2);
    SDValue CmpOp1   = Op.getOperand(3);
    SDValue Dest     = Op.getOperand(4);
-  DebugLoc DL      = Op.getDebugLoc();
+  SDLoc DL(Op);
  
    unsigned CCMask;
    SDValue Flags = emitCmp(DAG, CmpOp0, CmpOp1, CC, CCMask);
@@ -998,7 +1004,7 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
    SDValue TrueOp   = Op.getOperand(2);
    SDValue FalseOp  = Op.getOperand(3);
    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
-  DebugLoc DL      = Op.getDebugLoc();
+  SDLoc DL(Op);
  
    unsigned CCMask;
    SDValue Flags = emitCmp(DAG, CmpOp0, CmpOp1, CC, CCMask);
@@ -1015,7 +1021,7 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
  
  SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
                                                    SelectionDAG &DAG) const {
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
    const GlobalValue *GV = Node->getGlobal();
    int64_t Offset = Node->getOffset();
    EVT PtrVT = getPointerTy();
@@ -1054,7 +1060,7 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
  
  SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
                                                      SelectionDAG &DAG) const {
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
    const GlobalValue *GV = Node->getGlobal();
    EVT PtrVT = getPointerTy();
    TLSModel::Model model = TM.getTLSModel(GV);
@@ -1093,7 +1099,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
  
  SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
                                                   SelectionDAG &DAG) const {
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
    const BlockAddress *BA = Node->getBlockAddress();
    int64_t Offset = Node->getOffset();
    EVT PtrVT = getPointerTy();
@@ -1105,7 +1111,7 @@ SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
  
  SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
                                                SelectionDAG &DAG) const {
-  DebugLoc DL = JT->getDebugLoc();
+  SDLoc DL(JT);
    EVT PtrVT = getPointerTy();
    SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
  
@@ -1115,7 +1121,7 @@ SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
  
  SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
                                                   SelectionDAG &DAG) const {
-  DebugLoc DL = CP->getDebugLoc();
+  SDLoc DL(CP);
    EVT PtrVT = getPointerTy();
  
    SDValue Result;
@@ -1132,7 +1138,7 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
  
  SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
                                              SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
    SDValue In = Op.getOperand(0);
    EVT InVT = In.getValueType();
    EVT ResVT = Op.getValueType();
@@ -1169,7 +1175,7 @@ SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
    SDValue Chain   = Op.getOperand(0);
    SDValue Addr    = Op.getOperand(1);
    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
-  DebugLoc DL     = Op.getDebugLoc();
+  SDLoc DL(Op);
  
    // The initial values of each field.
    const unsigned NumFields = 4;
@@ -1203,7 +1209,7 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
    SDValue SrcPtr     = Op.getOperand(2);
    const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
    const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
-  DebugLoc DL        = Op.getDebugLoc();
+  SDLoc DL(Op);
  
    return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32),
                         /*Align*/8, /*isVolatile*/false, /*AlwaysInline*/false,
@@ -1214,7 +1220,7 @@ SDValue SystemZTargetLowering::
  lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
    SDValue Chain = Op.getOperand(0);
    SDValue Size  = Op.getOperand(1);
-  DebugLoc DL   = Op.getDebugLoc();
+  SDLoc DL(Op);
  
    unsigned SPReg = getStackPointerRegisterToSaveRestore();
  
@@ -1240,7 +1246,7 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
                                                SelectionDAG &DAG) const {
    EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
    assert(!is32Bit(VT) && "Only support 64-bit UMUL_LOHI");
  
    // UMUL_LOHI64 returns the low result in the odd register and the high
@@ -1257,19 +1263,24 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);
    EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
+  unsigned Opcode;
  
    // We use DSGF for 32-bit division.
    if (is32Bit(VT)) {
      Op0 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op0);
-    Op1 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op1);
-  }
+    Opcode = SystemZISD::SDIVREM32;
+  } else if (DAG.ComputeNumSignBits(Op1) > 32) {
+    Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
+    Opcode = SystemZISD::SDIVREM32;
+  } else    
+    Opcode = SystemZISD::SDIVREM64;
  
    // DSG(F) takes a 64-bit dividend, so the even register in the GR128
    // input is "don't care".  The instruction returns the remainder in
    // the even register and the quotient in the odd register.
    SDValue Ops[2];
-  lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::SDIVREM64,
+  lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, Opcode,
                     Op0, Op1, Ops[1], Ops[0]);
    return DAG.getMergeValues(Ops, 2, DL);
  }
@@ -1277,7 +1288,7 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
  SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
                                              SelectionDAG &DAG) const {
    EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
  
    // DL(G) uses a double-width dividend, so we need to clear the even
    // register in the GR128 input.  The instruction returns the remainder
@@ -1342,7 +1353,7 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
    // low 32 bits by truncating Low to an i32 and inserting it directly
    // using a subreg.  The interesting cases are those where the truncation
    // can be folded.
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
    SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
    SDValue SubReg32 = DAG.getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
    SDNode *Result = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
@@ -1368,7 +1379,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
    SDValue Addr = Node->getBasePtr();
    SDValue Src2 = Node->getVal();
    MachineMemOperand *MMO = Node->getMemOperand();
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
    EVT PtrVT = Addr.getValueType();
  
    // Convert atomic subtracts of constants into additions.
@@ -1442,7 +1453,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
    SDValue CmpVal = Node->getOperand(2);
    SDValue SwapVal = Node->getOperand(3);
    MachineMemOperand *MMO = Node->getMemOperand();
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
    EVT PtrVT = Addr.getValueType();
  
    // Get the address of the containing word.
@@ -1474,7 +1485,7 @@ SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
                                                SelectionDAG &DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
    MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
-  return DAG.getCopyFromReg(Op.getOperand(0), Op.getDebugLoc(),
+  return DAG.getCopyFromReg(Op.getOperand(0), SDLoc(Op),
                              SystemZ::R15D, Op.getValueType());
  }
  
@@ -1482,7 +1493,7 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
                                                   SelectionDAG &DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
    MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
-  return DAG.getCopyToReg(Op.getOperand(0), Op.getDebugLoc(),
+  return DAG.getCopyToReg(Op.getOperand(0), SDLoc(Op),
                            SystemZ::R15D, Op.getOperand(1));
  }
  
@@ -1609,6 +1620,34 @@ static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
    return NewMBB;
  }
  
+// If the nearest non-debug instruction before MBBI has a fused compare-and-
+// branch form, replace it with that (to Target on CCMask) and return true.
+bool SystemZTargetLowering::
+convertPrevCompareToBranch(MachineBasicBlock *MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned CCMask, MachineBasicBlock *Target) const {
+  MachineBasicBlock::iterator Compare = MBBI;
+  MachineBasicBlock::iterator Begin = MBB->begin();
+  do {
+    if (Compare == Begin)
+      return false;
+    --Compare;
+  } while (Compare->isDebugValue());  // Step back over debug values.
+
+  const SystemZInstrInfo *TII = TM.getInstrInfo();
+  unsigned FusedOpcode = TII->getCompareAndBranch(Compare->getOpcode(),
+                                                  Compare);
+  if (!FusedOpcode)
+    return false;
+
+  DebugLoc DL = Compare->getDebugLoc();
+  BuildMI(*MBB, MBBI, DL, TII->get(FusedOpcode))
+    .addOperand(Compare->getOperand(0)).addOperand(Compare->getOperand(1))
+    .addImm(CCMask).addMBB(Target);
+  Compare->eraseFromParent();  // Delete the old compare, not just unlink it.
+  return true;
+}
+
  // Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
  MachineBasicBlock *
  SystemZTargetLowering::emitSelect(MachineInstr *MI,
@@ -1626,13 +1665,17 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
    MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
  
    //  StartMBB:
-  //   ...
-  //   TrueVal = ...
-  //   cmpTY ccX, r1, r2
-  //   jCC JoinMBB
+  //   BRC CCMask, JoinMBB
    //   # fallthrough to FalseMBB
+  //
+  // The original DAG glues comparisons to their uses, both to ensure
+  // that no CC-clobbering instructions are inserted between them, and
+  // to ensure that comparison results are not reused.  This means that
+  // this Select is the sole user of any preceding comparison instruction
+  // and that we can try to use a fused compare and branch instead.
    MBB = StartMBB;
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL)).addImm(CCMask).addMBB(JoinMBB);
+  if (!convertPrevCompareToBranch(MBB, MI, CCMask, JoinMBB))
+    BuildMI(MBB, DL, TII->get(SystemZ::BRC)).addImm(CCMask).addMBB(JoinMBB);
    MBB->addSuccessor(JoinMBB);
    MBB->addSuccessor(FalseMBB);
  
@@ -1653,6 +1696,59 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
    return JoinMBB;
  }
  
+// Implement EmitInstrWithCustomInserter for pseudo CondStore* instruction MI
+// by branching around a StoreOpcode store; return the block after the store.
+// Invert says the store happens when the condition is false rather than true.
+MachineBasicBlock *
+SystemZTargetLowering::emitCondStore(MachineInstr *MI,
+                                     MachineBasicBlock *MBB,
+                                     unsigned StoreOpcode, bool Invert) const {
+  const SystemZInstrInfo *TII = TM.getInstrInfo();
+
+  MachineOperand Base = MI->getOperand(0);
+  int64_t Disp        = MI->getOperand(1).getImm();
+  unsigned IndexReg   = MI->getOperand(2).getReg();
+  unsigned SrcReg     = MI->getOperand(3).getReg();
+  unsigned CCMask     = MI->getOperand(4).getImm();
+  DebugLoc DL         = MI->getDebugLoc();
+
+  StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Disp);
+
+  // Compute the branch-around condition; when Invert, CCMask already is it.
+  if (!Invert)
+    CCMask = CCMask ^ SystemZ::CCMASK_ANY;
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *JoinMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   BRC CCMask, JoinMBB
+  //   # fallthrough to FalseMBB
+  //
+  // The original DAG glues comparisons to their uses, both to ensure
+  // that no CC-clobbering instructions are inserted between them, and
+  // to ensure that comparison results are not reused.  This means that
+  // this CondStore is the sole user of any preceding comparison instruction
+  // and that we can try to use a fused compare and branch instead.
+  MBB = StartMBB;
+  if (!convertPrevCompareToBranch(MBB, MI, CCMask, JoinMBB))
+    BuildMI(MBB, DL, TII->get(SystemZ::BRC)).addImm(CCMask).addMBB(JoinMBB);
+  MBB->addSuccessor(JoinMBB);
+  MBB->addSuccessor(FalseMBB);
+
+  //  FalseMBB (reached only when the branch around the store is not taken):
+  //   store %SrcReg, %Disp(%Index,%Base)
+  //   # fallthrough to JoinMBB
+  MBB = FalseMBB;
+  BuildMI(MBB, DL, TII->get(StoreOpcode))
+    .addReg(SrcReg).addOperand(Base).addImm(Disp).addReg(IndexReg);
+  MBB->addSuccessor(JoinMBB);
+
+  MI->eraseFromParent();
+  return JoinMBB;
+}
+
  // Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_*
  // or ATOMIC_SWAP{,W} instruction MI.  BinOpcode is the instruction that
  // performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}.
@@ -1769,7 +1865,7 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
        .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
    BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
      .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL)).addImm(MaskNE).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC)).addImm(MaskNE).addMBB(LoopMBB);
    MBB->addSuccessor(LoopMBB);
    MBB->addSuccessor(DoneMBB);
  
@@ -1846,7 +1942,7 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
    //   %OldVal        = phi [ %OrigVal, StartMBB ], [ %Dest, UpdateMBB ]
    //   %RotatedOldVal = RLL %OldVal, 0(%BitShift)
    //   CompareOpcode %RotatedOldVal, %Src2
-  //   BRCL KeepOldMask, UpdateMBB
+  //   BRC KeepOldMask, UpdateMBB
    MBB = LoopMBB;
    BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
      .addReg(OrigVal).addMBB(StartMBB)
@@ -1854,10 +1950,17 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
    if (IsSubWord)
      BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
        .addReg(OldVal).addReg(BitShift).addImm(0);
-  BuildMI(MBB, DL, TII->get(CompareOpcode))
-    .addReg(RotatedOldVal).addReg(Src2);
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL))
-    .addImm(KeepOldMask).addMBB(UpdateMBB);
+  unsigned FusedOpcode = TII->getCompareAndBranch(CompareOpcode);
+  if (FusedOpcode)
+    BuildMI(MBB, DL, TII->get(FusedOpcode))
+      .addReg(RotatedOldVal).addReg(Src2)
+      .addImm(KeepOldMask).addMBB(UpdateMBB);
+  else {
+    BuildMI(MBB, DL, TII->get(CompareOpcode))
+      .addReg(RotatedOldVal).addReg(Src2);
+    BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+      .addImm(KeepOldMask).addMBB(UpdateMBB);
+  }
    MBB->addSuccessor(UpdateMBB);
    MBB->addSuccessor(UseAltMBB);
  
@@ -1887,7 +1990,7 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
        .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
    BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
      .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL)).addImm(MaskNE).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC)).addImm(MaskNE).addMBB(LoopMBB);
    MBB->addSuccessor(LoopMBB);
    MBB->addSuccessor(DoneMBB);
  
@@ -1959,8 +2062,7 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
    //                      ^^ Replace the upper 32-BitSize bits of the
    //                         comparison value with those that we loaded,
    //                         so that we can use a full word comparison.
-  //   CR %Dest, %RetryCmpVal
-  //   JNE DoneMBB
+  //   CRJNE %Dest, %RetryCmpVal, DoneMBB
    //   # Fall through to SetMBB
    MBB = LoopMBB;
    BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
@@ -1976,9 +2078,9 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
      .addReg(OldVal).addReg(BitShift).addImm(BitSize);
    BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetryCmpVal)
      .addReg(CmpVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
-  BuildMI(MBB, DL, TII->get(SystemZ::CR))
-    .addReg(Dest).addReg(RetryCmpVal);
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL)).addImm(MaskNE).addMBB(DoneMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::CRJ))
+    .addReg(Dest).addReg(RetryCmpVal)
+    .addImm(MaskNE).addMBB(DoneMBB);
    MBB->addSuccessor(DoneMBB);
    MBB->addSuccessor(SetMBB);
  
@@ -1998,7 +2100,7 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
      .addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
    BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
      .addReg(OldVal).addReg(StoreVal).addOperand(Base).addImm(Disp);
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL)).addImm(MaskNE).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC)).addImm(MaskNE).addMBB(LoopMBB);
    MBB->addSuccessor(LoopMBB);
    MBB->addSuccessor(DoneMBB);
  
@@ -2051,6 +2153,43 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
    case SystemZ::SelectF128:
      return emitSelect(MI, MBB);
  
+  case SystemZ::CondStore8_32:
+    return emitCondStore(MI, MBB, SystemZ::STC32, false);
+  case SystemZ::CondStore8_32Inv:
+    return emitCondStore(MI, MBB, SystemZ::STC32, true);
+  case SystemZ::CondStore16_32:
+    return emitCondStore(MI, MBB, SystemZ::STH32, false);
+  case SystemZ::CondStore16_32Inv:
+    return emitCondStore(MI, MBB, SystemZ::STH32, true);
+  case SystemZ::CondStore32_32:
+    return emitCondStore(MI, MBB, SystemZ::ST32, false);
+  case SystemZ::CondStore32_32Inv:
+    return emitCondStore(MI, MBB, SystemZ::ST32, true);
+  case SystemZ::CondStore8:
+    return emitCondStore(MI, MBB, SystemZ::STC, false);
+  case SystemZ::CondStore8Inv:
+    return emitCondStore(MI, MBB, SystemZ::STC, true);
+  case SystemZ::CondStore16:
+    return emitCondStore(MI, MBB, SystemZ::STH, false);
+  case SystemZ::CondStore16Inv:
+    return emitCondStore(MI, MBB, SystemZ::STH, true);
+  case SystemZ::CondStore32:
+    return emitCondStore(MI, MBB, SystemZ::ST, false);
+  case SystemZ::CondStore32Inv:
+    return emitCondStore(MI, MBB, SystemZ::ST, true);
+  case SystemZ::CondStore64:
+    return emitCondStore(MI, MBB, SystemZ::STG, false);
+  case SystemZ::CondStore64Inv:
+    return emitCondStore(MI, MBB, SystemZ::STG, true);
+  case SystemZ::CondStoreF32:
+    return emitCondStore(MI, MBB, SystemZ::STE, false);
+  case SystemZ::CondStoreF32Inv:
+    return emitCondStore(MI, MBB, SystemZ::STE, true);
+  case SystemZ::CondStoreF64:
+    return emitCondStore(MI, MBB, SystemZ::STD, false);
+  case SystemZ::CondStoreF64Inv:
+    return emitCondStore(MI, MBB, SystemZ::STD, true);
+
    case SystemZ::AEXT128_64:
      return emitExt128(MI, MBB, false, SystemZ::subreg_low);
    case SystemZ::ZEXT128_32:
@@ -2227,6 +2366,16 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
  
    case SystemZ::ATOMIC_CMP_SWAPW:
      return emitAtomicCmpSwapW(MI, MBB);
+  case SystemZ::BRC:
+    // The original DAG glues comparisons to their uses, both to ensure
+    // that no CC-clobbering instructions are inserted between them, and
+    // to ensure that comparison results are not reused.  This means that
+    // a BRC is the sole user of a preceding comparison and that we can
+    // try to use a fused compare and branch instead.
+    if (convertPrevCompareToBranch(MBB, MI, MI->getOperand(0).getImm(),
+                                   MI->getOperand(1).getMBB()))
+      MI->eraseFromParent();
+    return MBB;
    default:
      llvm_unreachable("Unexpected instr type to insert");
    }