[WebAssembly] Switch WebAssemblyMCAsmInfo.h from MCAsmInfo to MCAsmInfoELF.

[oota-llvm.git] / lib / Target / AArch64 / AArch64ISelDAGToDAG.cpp
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

index 7007ffcce29b766ab1a458c71f814e0b8f75a3fd..6c868880bcac4f760719ce3da759f3f62d5b7aae 100644 (file)
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -34,7 +34,6 @@ using namespace llvm;
  namespace {
  
  class AArch64DAGToDAGISel : public SelectionDAGISel {
-  AArch64TargetMachine &TM;
  
    /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
    /// make the right decision when generating code for different targets.
@@ -45,7 +44,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
  public:
    explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
                                 CodeGenOpt::Level OptLevel)
-      : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr),
+      : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
          ForCodeSize(false) {}
  
    const char *getPassName() const override {
@@ -53,12 +52,8 @@ public:
    }
  
    bool runOnMachineFunction(MachineFunction &MF) override {
-    AttributeSet FnAttrs = MF.getFunction()->getAttributes();
-    ForCodeSize =
-        FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                             Attribute::OptimizeForSize) ||
-        FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
-    Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+    ForCodeSize = MF.getFunction()->optForSize();
+    Subtarget = &MF.getSubtarget<AArch64Subtarget>();
      return SelectionDAGISel::runOnMachineFunction(MF);
    }
  
@@ -67,7 +62,7 @@ public:
    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
    /// inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
-                                    char ConstraintCode,
+                                    unsigned ConstraintID,
                                      std::vector<SDValue> &OutOps) override;
  
    SDNode *SelectMLAV64LaneV128(SDNode *N);
@@ -81,6 +76,21 @@ public:
    bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
      return SelectShiftedRegister(N, true, Reg, Shift);
    }
+  bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
+  }
    bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
      return SelectAddrModeIndexed(N, 1, Base, OffImm);
    }
@@ -134,8 +144,8 @@ public:
  
    /// Generic helper for the createDTuple/createQTuple
    /// functions. Those should almost always be called instead.
-  SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
-                      unsigned SubRegs[]);
+  SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
+                      const unsigned SubRegs[]);
  
    SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
  
@@ -153,13 +163,12 @@ public:
    SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
    SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
  
-  SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node);
-  SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node);
-
    SDNode *SelectBitfieldExtractOp(SDNode *N);
    SDNode *SelectBitfieldInsertOp(SDNode *N);
+  SDNode *SelectBitfieldInsertInZeroOp(SDNode *N);
  
-  SDNode *SelectLIBM(SDNode *N);
+  SDNode *SelectReadRegister(SDNode *N);
+  SDNode *SelectWriteRegister(SDNode *N);
  
  // Include the pieces autogenerated from the target description.
  #include "AArch64GenDAGISel.inc"
@@ -167,6 +176,8 @@ public:
  private:
    bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
                               SDValue &Shift);
+  bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
+                               SDValue &OffImm);
    bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
                               SDValue &OffImm);
    bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
@@ -216,13 +227,20 @@ static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
  }
  
  bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
-    const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) {
-  assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
-  // Require the address to be in a register.  That is safe for all AArch64
-  // variants and it is hard to do anything much smarter without knowing
-  // how the operand is used.
-  OutOps.push_back(Op);
-  return false;
+    const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
+  switch(ConstraintID) {
+  default:
+    llvm_unreachable("Unexpected asm memory constraint");
+  case InlineAsm::Constraint_i:
+  case InlineAsm::Constraint_m:
+  case InlineAsm::Constraint_Q:
+    // Require the address to be in a register.  That is safe for all AArch64
+    // variants and it is hard to do anything much smarter without knowing
+    // how the operand is used.
+    OutOps.push_back(Op);
+    return false;
+  }
+  return true;
  }
  
  /// SelectArithImmed - Select an immediate value that can be represented as
@@ -250,8 +268,9 @@ bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
      return false;
  
    unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
-  Val = CurDAG->getTargetConstant(Immed, MVT::i32);
-  Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
+  SDLoc dl(N);
+  Val = CurDAG->getTargetConstant(Immed, dl, MVT::i32);
+  Shift = CurDAG->getTargetConstant(ShVal, dl, MVT::i32);
    return true;
  }
  
@@ -284,7 +303,8 @@ bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
      return false;
  
    Immed &= 0xFFFFFFULL;
-  return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift);
+  return SelectArithImmed(CurDAG->getConstant(Immed, SDLoc(N), MVT::i32), Val,
+                          Shift);
  }
  
  /// getShiftTypeForNode - Translate a shift node to the corresponding
@@ -304,9 +324,9 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
    }
  }
  
-/// \brief Determine wether it is worth to fold V into an extended register.
+/// \brief Determine whether it is worth folding V into an extended register.
  bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
-  // it hurts if the a value is used at least twice, unless we are optimizing
+  // it hurts if the value is used at least twice, unless we are optimizing
    // for code size.
    if (ForCodeSize || V.hasOneUse())
      return true;
@@ -332,7 +352,7 @@ bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
      unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
  
      Reg = N.getOperand(0);
-    Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
+    Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
      return isWorthFolding(N);
    }
  
@@ -415,7 +435,7 @@ static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
    return true;
  }
  
-// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a
+// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a
  // high lane extract.
  static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
                               SDValue &LaneOp, int &LaneIdx) {
@@ -433,6 +453,7 @@ static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
  /// is a lane in the upper half of a 128-bit vector.  Recognize and select this
  /// so that we don't emit unnecessary lane extracts.
  SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
+  SDLoc dl(N);
    SDValue Op0 = N->getOperand(0);
    SDValue Op1 = N->getOperand(1);
    SDValue MLAOp1;   // Will hold ordinary multiplicand for MLA.
@@ -449,7 +470,7 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
        return nullptr;
    }
  
-  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
+  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
  
    SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
  
@@ -472,10 +493,11 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
      break;
    }
  
-  return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops);
+  return CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops);
  }
  
  SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
+  SDLoc dl(N);
    SDValue SMULLOp0;
    SDValue SMULLOp1;
    int LaneIdx;
@@ -484,7 +506,7 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
                          LaneIdx))
      return nullptr;
  
-  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
+  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
  
    SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
  
@@ -515,7 +537,7 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
    } else
      llvm_unreachable("Unrecognized intrinsic.");
  
-  return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops);
+  return CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops);
  }
  
  /// Instructions that accept extend modifiers like UXTW expect the register
@@ -526,9 +548,10 @@ static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
    if (N.getValueType() == MVT::i32)
      return N;
  
-  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+  SDLoc dl(N);
+  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
    MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                               SDLoc(N), MVT::i32, N, SubReg);
+                                               dl, MVT::i32, N, SubReg);
    return SDValue(Node, 0);
  }
  
@@ -562,30 +585,99 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
    }
  
    // AArch64 mandates that the RHS of the operation must use the smallest
-  // register classs that could contain the size being extended from.  Thus,
+  // register class that could contain the size being extended from.  Thus,
    // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
    // there might not be an actual 32-bit value in the program.  We can
    // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
    assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
    Reg = narrowIfNeeded(CurDAG, Reg);
-  Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32);
+  Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
+                                    MVT::i32);
    return isWorthFolding(N);
  }
  
+/// Return true only if every use of this ADDlow is a (possibly atomic)
+/// load/store with ordering no stronger than Monotonic. Any other use needs
+/// a real ADD anyway, so folding into the mem op gains nothing; and since
+/// ADRP/ADD is a single pseudo-instruction, over-aggressive folding leads to
+/// duplicated ADRP instructions.
+static bool isWorthFoldingADDlow(SDValue N) {
+  for (auto Use : N->uses()) {
+    if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
+        Use->getOpcode() != ISD::ATOMIC_LOAD &&
+        Use->getOpcode() != ISD::ATOMIC_STORE)
+      return false;
+
+    // Stronger-than-Monotonic accesses: ldar and stlr have much more
+    // restrictive addressing modes (just a register).
+    if (cast<MemSDNode>(Use)->getOrdering() > Monotonic)
+      return false;
+  }
+
+  return true;
+}
+
+/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit
+/// immediate" address.  "Size" is the size in bytes of the memory reference,
+/// which determines the scale.  Always succeeds: falls back to base + 0.
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
+                                                  SDValue &Base,
+                                                  SDValue &OffImm) {
+  SDLoc dl(N);
+  const DataLayout &DL = CurDAG->getDataLayout();
+  const TargetLowering *TLI = getTargetLowering();
+  if (N.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+    OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+    return true;
+  }
+
+  // As opposed to the (12-bit) Indexed addressing mode below, the 7-bit signed
+  // mode selected here doesn't support labels/immediates, only base+offset.
+
+  if (CurDAG->isBaseWithConstantOffset(N)) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int64_t RHSC = RHS->getSExtValue();
+      unsigned Scale = Log2_32(Size);
+      if ((RHSC & (Size - 1)) == 0 && RHSC >= -(0x40 << Scale) &&
+          RHSC < (0x40 << Scale)) {
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+        }
+        OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
+        return true;
+      }
+    }
+  }
+
+  // Base only. The address will be materialized into a register before
+  // the memory is accessed.
+  //    add x0, Xbase, #offset
+  //    stp x1, x2, [x0]
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+  return true;
+}
+
  /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
  /// immediate" address.  The "Size" argument is the size in bytes of the memory
  /// reference, which determines the scale.
  bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
                                                SDValue &Base, SDValue &OffImm) {
+  SDLoc dl(N);
+  const DataLayout &DL = CurDAG->getDataLayout();
    const TargetLowering *TLI = getTargetLowering();
    if (N.getOpcode() == ISD::FrameIndex) {
      int FI = cast<FrameIndexSDNode>(N)->getIndex();
-    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
-    OffImm = CurDAG->getTargetConstant(0, MVT::i64);
+    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+    OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
      return true;
    }
  
-  if (N.getOpcode() == AArch64ISD::ADDlow) {
+  if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
      GlobalAddressSDNode *GAN =
          dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
      Base = N.getOperand(0);
@@ -595,9 +687,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
  
      const GlobalValue *GV = GAN->getGlobal();
      unsigned Alignment = GV->getAlignment();
-    const DataLayout *DL = TLI->getDataLayout();
-    if (Alignment == 0 && !Subtarget->isTargetDarwin())
-      Alignment = DL->getABITypeAlignment(GV->getType()->getElementType());
+    Type *Ty = GV->getType()->getElementType();
+    if (Alignment == 0 && Ty->isSized())
+      Alignment = DL.getABITypeAlignment(Ty);
  
      if (Alignment >= Size)
        return true;
@@ -611,9 +703,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
          Base = N.getOperand(0);
          if (Base.getOpcode() == ISD::FrameIndex) {
            int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-          Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+          Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
          }
-        OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64);
+        OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
          return true;
        }
      }
@@ -629,7 +721,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
    //    add x0, Xbase, #offset
    //    ldr x0, [x0]
    Base = N;
-  OffImm = CurDAG->getTargetConstant(0, MVT::i64);
+  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
    return true;
  }
  
@@ -654,9 +746,10 @@ bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
        if (Base.getOpcode() == ISD::FrameIndex) {
          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
          const TargetLowering *TLI = getTargetLowering();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(
+            FI, TLI->getPointerTy(CurDAG->getDataLayout()));
        }
-      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64);
+      OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64);
        return true;
      }
    }
@@ -664,12 +757,12 @@ bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
  }
  
  static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
-  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+  SDLoc dl(N);
+  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
    SDValue ImpDef = SDValue(
-      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64),
-      0);
+      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i64), 0);
    MachineSDNode *Node = CurDAG->getMachineNode(
-      TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg);
+      TargetOpcode::INSERT_SUBREG, dl, MVT::i64, ImpDef, N, SubReg);
    return SDValue(Node, 0);
  }
  
@@ -683,6 +776,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
    if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
      return false;
  
+  SDLoc dl(N);
    if (WantExtend) {
      AArch64_AM::ShiftExtendType Ext =
          getExtendTypeForNode(N.getOperand(0), true);
@@ -690,10 +784,11 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
        return false;
  
      Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
-    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
+                                           MVT::i32);
    } else {
      Offset = N.getOperand(0);
-    SignExtend = CurDAG->getTargetConstant(0, MVT::i32);
+    SignExtend = CurDAG->getTargetConstant(0, dl, MVT::i32);
    }
  
    unsigned LegalShiftVal = Log2_32(Size);
@@ -716,6 +811,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
      return false;
    SDValue LHS = N.getOperand(0);
    SDValue RHS = N.getOperand(1);
+  SDLoc dl(N);
  
    // We don't want to match immediate adds here, because they are better lowered
    // to the register-immediate addressing modes.
@@ -738,7 +834,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
    if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
        SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
      Base = LHS;
-    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
      return true;
    }
  
@@ -746,12 +842,12 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
    if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
        SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
      Base = RHS;
-    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
      return true;
    }
  
    // There was no shift, whatever else we find.
-  DoShift = CurDAG->getTargetConstant(false, MVT::i32);
+  DoShift = CurDAG->getTargetConstant(false, dl, MVT::i32);
  
    AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
    // Try to match an unshifted extend on the LHS.
@@ -760,7 +856,8 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
            AArch64_AM::InvalidShiftExtend) {
      Base = RHS;
      Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
-    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
+                                           MVT::i32);
      if (isWorthFolding(LHS))
        return true;
    }
@@ -771,7 +868,8 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
            AArch64_AM::InvalidShiftExtend) {
      Base = LHS;
      Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
-    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
+                                           MVT::i32);
      if (isWorthFolding(RHS))
        return true;
    }
@@ -779,6 +877,21 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
    return false;
  }
  
+// Check if the given immediate is preferred by ADD. Return true if the
+// immediate can be encoded in an ADD, or if it can be encoded in an
+// "ADD LSL #12" and cannot be encoded by one MOVZ.
+static bool isPreferredADD(int64_t ImmOff) {
+  // Constants in [0x0, 0xfff] can be encoded in ADD.
+  if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
+    return true;
+  // Check if it can be encoded in an "ADD LSL #12".
+  if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL)
+    // A single MOVZ is faster than an "ADD LSL #12", so skip such constants.
+    return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
+           (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
+  return false;
+}
+
  bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
                                              SDValue &Base, SDValue &Offset,
                                              SDValue &SignExtend,
@@ -787,11 +900,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
      return false;
    SDValue LHS = N.getOperand(0);
    SDValue RHS = N.getOperand(1);
-
-  // We don't want to match immediate adds here, because they are better lowered
-  // to the register-immediate addressing modes.
-  if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
-    return false;
+  SDLoc DL(N);
  
    // Check if this particular node is reused in any non-memory related
    // operation.  If yes, do not try to fold this node into the address
@@ -802,6 +911,35 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
        return false;
    }
  
+  // Watch out if RHS is a wide immediate: it cannot be selected into the
+  // [BaseReg+Imm] addressing mode, and it may not be encodable into ADD/SUB
+  // either. Instead it will use the [BaseReg + 0] addressing mode and generate
+  // instructions like:
+  //     MOV  X0, WideImmediate
+  //     ADD  X1, BaseReg, X0
+  //     LDR  X2, [X1, 0]
+  // In such situations, using the [BaseReg, XReg] addressing mode can save one
+  // ADD/SUB:
+  //     MOV  X0, WideImmediate
+  //     LDR  X2, [BaseReg, X0]
+  if (isa<ConstantSDNode>(RHS)) {
+    int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
+    unsigned Scale = Log2_32(Size);
+    // Skip immediates that can be selected by the load/store addressing mode.
+    // Also skip immediates that can be encoded by a single ADD (SUB is also
+    // checked by using -ImmOff).
+    if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
+        isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
+      return false;
+
+    SDValue Ops[] = { RHS };
+    SDNode *MOVI =
+        CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
+    SDValue MOVIV = SDValue(MOVI, 0);
+    // This ADD of two X registers will be selected into [Reg+Reg] mode.
+    N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
+  }
+
    // Remember if it is worth folding N when it produces extended register.
    bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
  
@@ -809,7 +947,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
    if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
        SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
      Base = LHS;
-    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
      return true;
    }
  
@@ -817,40 +955,40 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
    if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
        SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
      Base = RHS;
-    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
      return true;
    }
  
    // Match any non-shifted, non-extend, non-immediate add expression.
    Base = LHS;
    Offset = RHS;
-  SignExtend = CurDAG->getTargetConstant(false, MVT::i32);
-  DoShift = CurDAG->getTargetConstant(false, MVT::i32);
+  SignExtend = CurDAG->getTargetConstant(false, DL, MVT::i32);
+  DoShift = CurDAG->getTargetConstant(false, DL, MVT::i32);
    // Reg1 + Reg2 is free: no check needed.
    return true;
  }
  
  SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
-  static unsigned RegClassIDs[] = {
+  static const unsigned RegClassIDs[] = {
        AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
-  static unsigned SubRegs[] = { AArch64::dsub0, AArch64::dsub1,
-                                AArch64::dsub2, AArch64::dsub3 };
+  static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
+                                     AArch64::dsub2, AArch64::dsub3};
  
    return createTuple(Regs, RegClassIDs, SubRegs);
  }
  
  SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
-  static unsigned RegClassIDs[] = {
+  static const unsigned RegClassIDs[] = {
        AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
-  static unsigned SubRegs[] = { AArch64::qsub0, AArch64::qsub1,
-                                AArch64::qsub2, AArch64::qsub3 };
+  static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
+                                     AArch64::qsub2, AArch64::qsub3};
  
    return createTuple(Regs, RegClassIDs, SubRegs);
  }
  
  SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
-                                         unsigned RegClassIDs[],
-                                         unsigned SubRegs[]) {
+                                         const unsigned RegClassIDs[],
+                                         const unsigned SubRegs[]) {
    // There's no special register-class for a vector-list of 1 element: it's just
    // a vector.
    if (Regs.size() == 1)
@@ -858,18 +996,18 @@ SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
  
    assert(Regs.size() >= 2 && Regs.size() <= 4);
  
-  SDLoc DL(Regs[0].getNode());
+  SDLoc DL(Regs[0]);
  
    SmallVector<SDValue, 4> Ops;
  
    // First operand of REG_SEQUENCE is the desired RegClass.
    Ops.push_back(
-      CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32));
+      CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], DL, MVT::i32));
  
    // Then we get pairs of source & subregister-position for the components.
    for (unsigned i = 0; i < Regs.size(); ++i) {
      Ops.push_back(Regs[i]);
-    Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32));
+    Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], DL, MVT::i32));
    }
  
    SDNode *N =
@@ -954,6 +1092,8 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
        // it into an i64.
        DstVT = MVT::i32;
      }
+  } else if (VT == MVT::f16) {
+    Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
    } else if (VT == MVT::f32) {
      Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
    } else if (VT == MVT::f64 || VT.is64BitVector()) {
@@ -966,19 +1106,21 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
    SDValue Base = LD->getBasePtr();
    ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
    int OffsetVal = (int)OffsetOp->getZExtValue();
-  SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64);
+  SDLoc dl(N);
+  SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
    SDValue Ops[] = { Base, Offset, Chain };
-  SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, DstVT,
+  SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
                                         MVT::Other, Ops);
    // Either way, we're replacing the node, so tell the caller that.
    Done = true;
    SDValue LoadedVal = SDValue(Res, 1);
    if (InsertTo64) {
-    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
      LoadedVal =
          SDValue(CurDAG->getMachineNode(
-                    AArch64::SUBREG_TO_REG, SDLoc(N), MVT::i64,
-                    CurDAG->getTargetConstant(0, MVT::i64), LoadedVal, SubReg),
+                    AArch64::SUBREG_TO_REG, dl, MVT::i64,
+                    CurDAG->getTargetConstant(0, dl, MVT::i64), LoadedVal,
+                    SubReg),
                  0);
    }
  
@@ -995,13 +1137,10 @@ SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs,
    EVT VT = N->getValueType(0);
    SDValue Chain = N->getOperand(0);
  
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(N->getOperand(2)); // Mem operand;
-  Ops.push_back(Chain);
+  SDValue Ops[] = {N->getOperand(2), // Mem operand;
+                   Chain};
  
-  std::vector<EVT> ResTys;
-  ResTys.push_back(MVT::Untyped);
-  ResTys.push_back(MVT::Other);
+  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
  
    SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
    SDValue SuperReg = SDValue(Ld, 0);
@@ -1019,15 +1158,12 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
    EVT VT = N->getValueType(0);
    SDValue Chain = N->getOperand(0);
  
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(N->getOperand(1)); // Mem operand
-  Ops.push_back(N->getOperand(2)); // Incremental
-  Ops.push_back(Chain);
+  SDValue Ops[] = {N->getOperand(1), // Mem operand
+                   N->getOperand(2), // Incremental
+                   Chain};
  
-  std::vector<EVT> ResTys;
-  ResTys.push_back(MVT::i64); // Type of the write back register
-  ResTys.push_back(MVT::Untyped);
-  ResTys.push_back(MVT::Other);
+  const EVT ResTys[] = {MVT::i64, // Type of the write back register
+                        MVT::Untyped, MVT::Other};
  
    SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
  
@@ -1058,10 +1194,7 @@ SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
    SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
    SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
  
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(N->getOperand(NumVecs + 2));
-  Ops.push_back(N->getOperand(0));
+  SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
    SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
  
    return St;
@@ -1071,25 +1204,24 @@ SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
                                               unsigned Opc) {
    SDLoc dl(N);
    EVT VT = N->getOperand(2)->getValueType(0);
-  SmallVector<EVT, 2> ResTys;
-  ResTys.push_back(MVT::i64);   // Type of the write back register
-  ResTys.push_back(MVT::Other); // Type for the Chain
+  const EVT ResTys[] = {MVT::i64,    // Type of the write back register
+                        MVT::Other}; // Type for the Chain
  
    // Form a REG_SEQUENCE to force register allocation.
    bool Is128Bit = VT.getSizeInBits() == 128;
    SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
    SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
  
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(N->getOperand(NumVecs + 1)); // base register
-  Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental
-  Ops.push_back(N->getOperand(0)); // Chain
+  SDValue Ops[] = {RegSeq,
+                   N->getOperand(NumVecs + 1), // base register
+                   N->getOperand(NumVecs + 2), // Incremental
+                   N->getOperand(0)};          // Chain
    SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
  
    return St;
  }
  
+namespace {
  /// WidenVector - Given a value in the V64 register class, produce the
  /// equivalent value in the V128 register class.
  class WidenVector {
@@ -1110,6 +1242,7 @@ public:
      return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
    }
  };
+} // namespace
  
  /// NarrowVector - Given a value in the V128 register class, produce the
  /// equivalent value in the V64 register class.
@@ -1138,24 +1271,19 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
  
    SDValue RegSeq = createQTuple(Regs);
  
-  std::vector<EVT> ResTys;
-  ResTys.push_back(MVT::Untyped);
-  ResTys.push_back(MVT::Other);
+  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
  
    unsigned LaneNo =
        cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
  
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
-  Ops.push_back(N->getOperand(NumVecs + 3));
-  Ops.push_back(N->getOperand(0));
+  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
+                   N->getOperand(NumVecs + 3), N->getOperand(0)};
    SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
    SDValue SuperReg = SDValue(Ld, 0);
  
    EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
-  static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
-                              AArch64::qsub3 };
+  static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
+                                    AArch64::qsub2, AArch64::qsub3 };
    for (unsigned i = 0; i < NumVecs; ++i) {
      SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
      if (Narrow)
@@ -1183,20 +1311,18 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
  
    SDValue RegSeq = createQTuple(Regs);
  
-  std::vector<EVT> ResTys;
-  ResTys.push_back(MVT::i64); // Type of the write back register
-  ResTys.push_back(MVT::Untyped);
-  ResTys.push_back(MVT::Other);
+  const EVT ResTys[] = {MVT::i64, // Type of the write back register
+                        RegSeq->getValueType(0), MVT::Other};
  
    unsigned LaneNo =
        cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
  
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number
-  Ops.push_back(N->getOperand(NumVecs + 2)); // Base register
-  Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental
-  Ops.push_back(N->getOperand(0));
+  SDValue Ops[] = {RegSeq,
+                   CurDAG->getTargetConstant(LaneNo, dl,
+                                             MVT::i64),         // Lane Number
+                   N->getOperand(NumVecs + 2),                  // Base register
+                   N->getOperand(NumVecs + 3),                  // Incremental
+                   N->getOperand(0)};
    SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
  
    // Update uses of the write back register
@@ -1209,8 +1335,8 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
                  Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
    } else {
      EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
-    static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
-                                AArch64::qsub3 };
+    static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
+                                      AArch64::qsub2, AArch64::qsub3 };
      for (unsigned i = 0; i < NumVecs; ++i) {
        SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
                                                    SuperReg);
@@ -1244,11 +1370,8 @@ SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
    unsigned LaneNo =
        cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
  
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
-  Ops.push_back(N->getOperand(NumVecs + 3));
-  Ops.push_back(N->getOperand(0));
+  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
+                   N->getOperand(NumVecs + 3), N->getOperand(0)};
    SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
  
    // Transfer memoperands.
@@ -1274,19 +1397,16 @@ SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
  
    SDValue RegSeq = createQTuple(Regs);
  
-  SmallVector<EVT, 2> ResTys;
-  ResTys.push_back(MVT::i64);   // Type of the write back register
-  ResTys.push_back(MVT::Other);
+  const EVT ResTys[] = {MVT::i64, // Type of the write back register
+                        MVT::Other};
  
    unsigned LaneNo =
        cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
  
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
-  Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register
-  Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental
-  Ops.push_back(N->getOperand(0));
+  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
+                   N->getOperand(NumVecs + 2), // Base Register
+                   N->getOperand(NumVecs + 3), // Incremental
+                   N->getOperand(0)};
    SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
  
    // Transfer memoperands.
@@ -1360,17 +1480,22 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
      // The resulting code will be at least as good as the original one
      // plus it may expose more opportunities for bitfield insert pattern.
      // FIXME: Currently we limit this to the bigger pattern, because
-    // some optimizations expect AND and not UBFM
+    // some optimizations expect AND and not UBFM.
      Opd0 = N->getOperand(0);
    } else
      return false;
  
-  assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) &&
-         "bad amount in shift node!");
+  // Bail out on large immediates. This happens when no proper
+  // combining/constant folding was performed.
+  if (!BiggerPattern && (Srl_imm <= 0 || Srl_imm >= VT.getSizeInBits())) {
+    DEBUG((dbgs() << N
+           << ": Found large shift immediate, this should not happen\n"));
+    return false;
+  }
  
    LSB = Srl_imm;
-  MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm)
-                                  : CountTrailingOnes_64(And_imm)) -
+  MSB = Srl_imm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(And_imm)
+                                  : countTrailingOnes<uint64_t>(And_imm)) -
          1;
    if (ClampMSB)
      // Since we're moving the extend before the right shift operation, we need
@@ -1383,20 +1508,21 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
    return true;
  }
  
-static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
-                                     unsigned &LSB, unsigned &MSB) {
-  // We are looking for the following pattern which basically extracts a single
-  // bit from the source value and places it in the LSB of the destination
-  // value, all other bits of the destination value or set to zero:
+static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
+                                          SDValue &Opd0, unsigned &LSB,
+                                          unsigned &MSB) {
+  // We are looking for the following pattern which basically extracts several
+  // contiguous bits from the source value and places them starting at the LSB
+  // of the destination value, all other destination bits being set to zero:
    //
    // Value2 = AND Value, MaskImm
    // SRL Value2, ShiftImm
    //
-  // with MaskImm >> ShiftImm == 1.
+  // where MaskImm >> ShiftImm is a low-bit mask whose width is the bit width.
    //
    // This gets selected into a single UBFM:
    //
-  // UBFM Value, ShiftImm, ShiftImm
+  // UBFM Value, ShiftImm, BitWide + ShiftImm - 1
    //
  
    if (N->getOpcode() != ISD::SRL)
@@ -1412,15 +1538,16 @@ static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
    if (!isIntImmediate(N->getOperand(1), Srl_imm))
      return false;
  
-  // Check whether we really have a one bit extract here.
-  if (And_mask >> Srl_imm == 0x1) {
+  // Check whether we really have several bits extract here.
+  unsigned BitWide = 64 - countLeadingOnes(~(And_mask >> Srl_imm));
+  if (BitWide && isMask_64(And_mask >> Srl_imm)) {
      if (N->getValueType(0) == MVT::i32)
        Opc = AArch64::UBFMWri;
      else
        Opc = AArch64::UBFMXri;
  
-    LSB = MSB = Srl_imm;
-
+    LSB = Srl_imm;
+    MSB = BitWide + Srl_imm - 1;
      return true;
    }
  
@@ -1428,7 +1555,7 @@ static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
  }
  
  static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
-                                       unsigned &LSB, unsigned &MSB,
+                                       unsigned &Immr, unsigned &Imms,
                                         bool BiggerPattern) {
    assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
           "N must be a SHR/SRA operation to call this function");
@@ -1441,8 +1568,8 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
    assert((VT == MVT::i32 || VT == MVT::i64) &&
           "Type checking must have been done before calling this function");
  
-  // Check for AND + SRL doing a one bit extract.
-  if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
+  // Check for AND + SRL doing several bits extract.
+  if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
      return true;
  
    // we're looking for a shift of a shift
@@ -1468,20 +1595,23 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
    } else
      return false;
  
-  assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!");
+  // Missing combines/constant folding may have left us with strange
+  // constants.
+  if (Shl_imm >= VT.getSizeInBits()) {
+    DEBUG((dbgs() << N
+           << ": Found large shift immediate, this should not happen\n"));
+    return false;
+  }
+
    uint64_t Srl_imm = 0;
    if (!isIntImmediate(N->getOperand(1), Srl_imm))
      return false;
  
    assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
           "bad amount in shift node!");
-  // Note: The width operand is encoded as width-1.
-  unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1;
-  int sLSB = Srl_imm - Shl_imm;
-  if (sLSB < 0)
-    return false;
-  LSB = sLSB;
-  MSB = LSB + Width;
+  int immr = Srl_imm - Shl_imm;
+  Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
+  Imms = VT.getSizeInBits() - Shl_imm - Trunc_bits - 1;
    // SRA requires a signed extraction
    if (VT == MVT::i32)
      Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
@@ -1491,7 +1621,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
  }
  
  static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
-                                SDValue &Opd0, unsigned &LSB, unsigned &MSB,
+                                SDValue &Opd0, unsigned &Immr, unsigned &Imms,
                                  unsigned NumberOfIgnoredLowBits = 0,
                                  bool BiggerPattern = false) {
    if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
@@ -1503,11 +1633,11 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
        return false;
      break;
    case ISD::AND:
-    return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB,
+    return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms,
                                        NumberOfIgnoredLowBits, BiggerPattern);
    case ISD::SRL:
    case ISD::SRA:
-    return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern);
+    return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
    }
  
    unsigned NOpc = N->getMachineOpcode();
@@ -1520,8 +1650,8 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
    case AArch64::UBFMXri:
      Opc = NOpc;
      Opd0 = N->getOperand(0);
-    LSB = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
-    MSB = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+    Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+    Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
      return true;
    }
    // Unreachable
@@ -1529,29 +1659,30 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
  }
  
  SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
-  unsigned Opc, LSB, MSB;
+  unsigned Opc, Immr, Imms;
    SDValue Opd0;
-  if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB))
+  if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
      return nullptr;
  
    EVT VT = N->getValueType(0);
+  SDLoc dl(N);
  
    // If the bit extract operation is 64bit but the original type is 32bit, we
    // need to add one EXTRACT_SUBREG.
    if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
-    SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64),
-                       CurDAG->getTargetConstant(MSB, MVT::i64)};
+    SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64),
+                       CurDAG->getTargetConstant(Imms, dl, MVT::i64)};
  
-    SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64);
-    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+    SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
+    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
      MachineSDNode *Node =
-        CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32,
+        CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i32,
                                 SDValue(BFM, 0), SubReg);
      return Node;
    }
  
-  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT),
-                   CurDAG->getTargetConstant(MSB, VT)};
+  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
+                   CurDAG->getTargetConstant(Imms, dl, VT)};
    return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
  }
  
@@ -1755,6 +1886,7 @@ static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
      return Op;
  
    EVT VT = Op.getValueType();
+  SDLoc dl(Op);
    unsigned BitWidth = VT.getSizeInBits();
    unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
  
@@ -1762,16 +1894,16 @@ static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
    if (ShlAmount > 0) {
      // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
      ShiftNode = CurDAG->getMachineNode(
-        UBFMOpc, SDLoc(Op), VT, Op,
-        CurDAG->getTargetConstant(BitWidth - ShlAmount, VT),
-        CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT));
+        UBFMOpc, dl, VT, Op,
+        CurDAG->getTargetConstant(BitWidth - ShlAmount, dl, VT),
+        CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, dl, VT));
    } else {
      // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
      assert(ShlAmount < 0 && "expected right shift");
      int ShrAmount = -ShlAmount;
      ShiftNode = CurDAG->getMachineNode(
-        UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT),
-        CurDAG->getTargetConstant(BitWidth - 1, VT));
+        UBFMOpc, dl, VT, Op, CurDAG->getTargetConstant(ShrAmount, dl, VT),
+        CurDAG->getTargetConstant(BitWidth - 1, dl, VT));
    }
  
    return SDValue(ShiftNode, 0);
@@ -1780,6 +1912,7 @@ static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
  /// Does this tree qualify as an attempt to move a bitfield into position,
  /// essentially "(and (shl VAL, N), Mask)".
  static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
+                                    bool BiggerPattern,
                                      SDValue &Src, int &ShiftAmount,
                                      int &MaskWidth) {
    EVT VT = Op.getValueType();
@@ -1802,6 +1935,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
      Op = Op.getOperand(0);
    }
  
+  // Don't match if the SHL has more than one use, since then we'll end up
+  // generating SHL+UBFIZ instead of just keeping SHL+AND.
+  if (!BiggerPattern && !Op.hasOneUse())
+    return false;
+
    uint64_t ShlImm;
    if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
      return false;
@@ -1811,11 +1949,15 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
      return false;
  
    ShiftAmount = countTrailingZeros(NonZeroBits);
-  MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount);
+  MaskWidth = countTrailingOnes(NonZeroBits >> ShiftAmount);
  
    // BFI encompasses sufficiently many nodes that it's worth inserting an extra
    // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
-  // amount.
+  // amount.  BiggerPattern is true when this pattern is being matched for BFI,
+  // BiggerPattern is false when this pattern is being matched for UBFIZ, in
+  // which case it is not profitable to insert an extra shift.
+  if (ShlImm - ShiftAmount != 0 && !BiggerPattern)
+    return false;
    Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
  
    return true;
@@ -1832,7 +1974,8 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
  // f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
  static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
                                       SDValue &Src, unsigned &ImmR,
-                                     unsigned &ImmS, SelectionDAG *CurDAG) {
+                                     unsigned &ImmS, const APInt &UsefulBits,
+                                     SelectionDAG *CurDAG) {
    assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
  
    // Set Opc
@@ -1846,23 +1989,30 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
  
    // Because of simplify-demanded-bits in DAGCombine, involved masks may not
    // have the expected shape. Try to undo that.
-  APInt UsefulBits;
-  getUsefulBits(SDValue(N, 0), UsefulBits);
  
    unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
    unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
  
-  // OR is commutative, check both possibilities (does llvm provide a
-  // way to do that directely, e.g., via code matcher?)
-  SDValue OrOpd1Val = N->getOperand(1);
-  SDNode *OrOpd0 = N->getOperand(0).getNode();
-  SDNode *OrOpd1 = N->getOperand(1).getNode();
-  for (int i = 0; i < 2;
-       ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
+  // OR is commutative, check all combinations of operand order and values of
+  // BiggerPattern, i.e.
+  //     Opd0, Opd1, BiggerPattern=false
+  //     Opd1, Opd0, BiggerPattern=false
+  //     Opd0, Opd1, BiggerPattern=true
+  //     Opd1, Opd0, BiggerPattern=true
+  // Several of these combinations may match, so check with BiggerPattern=false
+  // first since that will produce better results by matching more instructions
+  // and/or inserting fewer extra instructions.
+  for (int I = 0; I < 4; ++I) {
+
+    bool BiggerPattern = I / 2;
+    SDNode *OrOpd0 = N->getOperand(I % 2).getNode();
+    SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
+    SDNode *OrOpd1 = OrOpd1Val.getNode();
+
      unsigned BFXOpc;
      int DstLSB, Width;
      if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
-                            NumberOfIgnoredLowBits, true)) {
+                            NumberOfIgnoredLowBits, BiggerPattern)) {
        // Check that the returned opcode is compatible with the pattern,
        // i.e., same type and zero extended (U and not S)
        if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
@@ -1880,8 +2030,9 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
  
        // If the mask on the insertee is correct, we have a BFXIL operation. We
        // can share the ImmR and ImmS values from the already-computed UBFM.
-    } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src,
-                                       DstLSB, Width)) {
+    } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0),
+                                       BiggerPattern,
+                                       Src, DstLSB, Width)) {
        ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
        ImmS = Width - 1;
      } else
@@ -1931,70 +2082,57 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
    unsigned Opc;
    unsigned LSB, MSB;
    SDValue Opd0, Opd1;
+  EVT VT = N->getValueType(0);
+  APInt NUsefulBits;
+  getUsefulBits(SDValue(N, 0), NUsefulBits);
+
+  // If none of the bits are useful, just return UNDEF.
+  if (!NUsefulBits)
+    return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT);
  
-  if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG))
+  if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits,
+                                CurDAG))
      return nullptr;
  
-  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
    SDValue Ops[] = { Opd0,
                      Opd1,
-                    CurDAG->getTargetConstant(LSB, VT),
-                    CurDAG->getTargetConstant(MSB, VT) };
+                    CurDAG->getTargetConstant(LSB, dl, VT),
+                    CurDAG->getTargetConstant(MSB, dl, VT) };
    return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
  }
  
-SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) {
+/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
+/// equivalent of a left shift by a constant amount followed by an and masking
+/// out a contiguous set of bits.
+SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) {
+  if (N->getOpcode() != ISD::AND)
+    return nullptr;
+
    EVT VT = N->getValueType(0);
-  unsigned Variant;
    unsigned Opc;
-  unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr };
-
-  if (VT == MVT::f32) {
-    Variant = 0;
-  } else if (VT == MVT::f64) {
-    Variant = 1;
-  } else
-    return nullptr; // Unrecognized argument type. Fall back on default codegen.
-
-  // Pick the FRINTX variant needed to set the flags.
-  unsigned FRINTXOpc = FRINTXOpcs[Variant];
-
-  switch (N->getOpcode()) {
-  default:
-    return nullptr; // Unrecognized libm ISD node. Fall back on default codegen.
-  case ISD::FCEIL: {
-    unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr };
-    Opc = FRINTPOpcs[Variant];
-    break;
-  }
-  case ISD::FFLOOR: {
-    unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr };
-    Opc = FRINTMOpcs[Variant];
-    break;
-  }
-  case ISD::FTRUNC: {
-    unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr };
-    Opc = FRINTZOpcs[Variant];
-    break;
-  }
-  case ISD::FROUND: {
-    unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr };
-    Opc = FRINTAOpcs[Variant];
-    break;
-  }
-  }
+  if (VT == MVT::i32)
+    Opc = AArch64::UBFMWri;
+  else if (VT == MVT::i64)
+    Opc = AArch64::UBFMXri;
+  else
+    return nullptr;
  
-  SDLoc dl(N);
-  SDValue In = N->getOperand(0);
-  SmallVector<SDValue, 2> Ops;
-  Ops.push_back(In);
+  SDValue Op0;
+  int DstLSB, Width;
+  if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
+                               Op0, DstLSB, Width))
+    return nullptr;
  
-  if (!TM.Options.UnsafeFPMath) {
-    SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In);
-    Ops.push_back(SDValue(FRINTX, 1));
-  }
+  // ImmR is the rotate right amount.
+  unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
+  // ImmS is the most significant bit of the source to be moved.
+  unsigned ImmS = Width - 1;
  
-  return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+  SDLoc DL(N);
+  SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
+                   CurDAG->getTargetConstant(ImmS, DL, VT)};
+  return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
  }
  
  bool
@@ -2037,10 +2175,132 @@ AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
    // finding FBits, but it must still be in range.
    if (FBits == 0 || FBits > RegWidth) return false;
  
-  FixedPos = CurDAG->getTargetConstant(FBits, MVT::i32);
+  FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32);
    return true;
  }
  
+// Splits a register string of the form o0:op1:CRn:CRm:op2 into its integer
+// fields and packs them into the single value used as the MRS/MSR operand.
+// Returns -1 when the string has no ':' separator (i.e. is not in this form).
+static int getIntOperandFromRegisterString(StringRef RegString) {
+  SmallVector<StringRef, 5> Fields;
+  RegString.split(Fields, ':');
+
+  if (Fields.size() == 1)
+    return -1;
+
+  assert(Fields.size() == 5
+            && "Invalid number of fields in read register string");
+
+  SmallVector<int, 5> Ops;
+  bool AllIntFields = true;
+
+  for (StringRef Field : Fields) {
+    unsigned IntField = 0; // defined value even if parsing fails
+    AllIntFields &= !Field.getAsInteger(10, IntField);
+    Ops.push_back(IntField);
+  }
+
+  assert(AllIntFields &&
+          "Unexpected non-integer value in special register string.");
+
+  // Need to combine the integer fields of the string into a single value
+  // based on the bit encoding of MRS/MSR instruction.
+  return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) |
+         (Ops[3] << 3) | (Ops[4]);
+}
+
+// Lower the read_register intrinsic to an MRS instruction node if the special
+// register string argument is either of the form detailed in the ACLE (the
+// form described in getIntOperandFromRegisterString) or is a named register
+// known by the MRS SysReg mapper. Returns nullptr otherwise.
+SDNode *AArch64DAGToDAGISel::SelectReadRegister(SDNode *N) {
+  const MDNodeSDNode *MD = cast<MDNodeSDNode>(N->getOperand(1));
+  const MDString *RegString = cast<MDString>(MD->getMD()->getOperand(0));
+  SDLoc DL(N);
+
+  int Reg = getIntOperandFromRegisterString(RegString->getString());
+  if (Reg != -1)
+    return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0),
+                                  MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  N->getOperand(0));
+
+  // Use the sysreg mapper to map the remaining possible strings to the
+  // value for the register to be used for the instruction operand.
+  AArch64SysReg::MRSMapper mapper;
+  bool IsValidSpecialReg;
+  Reg = mapper.fromString(RegString->getString(),
+                          Subtarget->getFeatureBits(),
+                          IsValidSpecialReg);
+  if (IsValidSpecialReg)
+    return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0),
+                                  MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  N->getOperand(0));
+
+  return nullptr;
+}
+
+// Lower the write_register intrinsic to an MSR instruction node if the special
+// register string argument is either of the form detailed in the ACLE (the
+// form described in getIntOperandFromRegisterString) or is a named register
+// known by the PState or MSR SysReg mappers. Returns nullptr otherwise.
+SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) {
+  const MDNodeSDNode *MD = cast<MDNodeSDNode>(N->getOperand(1));
+  const MDString *RegString = cast<MDString>(MD->getMD()->getOperand(0));
+  SDLoc DL(N);
+
+  int Reg = getIntOperandFromRegisterString(RegString->getString());
+  if (Reg != -1)
+    return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  N->getOperand(2), N->getOperand(0));
+
+  // Check if the register was one of those allowed as the pstatefield value in
+  // the MSR (immediate) instruction. To accept the values allowed in the
+  // pstatefield for the MSR (immediate) instruction, we also require that an
+  // immediate value has been provided as an argument, we know that this is
+  // the case as it has been ensured by semantic checking.
+  AArch64PState::PStateMapper PMapper;
+  bool IsValidSpecialReg;
+  Reg = PMapper.fromString(RegString->getString(),
+                           Subtarget->getFeatureBits(),
+                           IsValidSpecialReg);
+  if (IsValidSpecialReg) {
+    assert (isa<ConstantSDNode>(N->getOperand(2))
+              && "Expected a constant integer expression.");
+    uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned State;
+    if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) {
+      assert(Immed < 2 && "Bad imm");
+      State = AArch64::MSRpstateImm1;
+    } else {
+      assert(Immed < 16 && "Bad imm");
+      State = AArch64::MSRpstateImm4;
+    }
+    return CurDAG->getMachineNode(State, DL, MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  CurDAG->getTargetConstant(Immed, DL, MVT::i16),
+                                  N->getOperand(0));
+  }
+
+  // Use the sysreg mapper to attempt to map the remaining possible strings
+  // to the value for the register to be used for the MSR (register)
+  // instruction operand.
+  AArch64SysReg::MSRMapper Mapper;
+  Reg = Mapper.fromString(RegString->getString(),
+                          Subtarget->getFeatureBits(),
+                          IsValidSpecialReg);
+
+  if (IsValidSpecialReg)
+    return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  N->getOperand(2), N->getOperand(0));
+
+  return nullptr;
+}
+
  SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
    // Dump information about the Node being selected
    DEBUG(errs() << "Selecting: ");
@@ -2062,6 +2322,16 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
    default:
      break;
  
+  case ISD::READ_REGISTER:
+    if (SDNode *Res = SelectReadRegister(Node))
+      return Res;
+    break;
+
+  case ISD::WRITE_REGISTER:
+    if (SDNode *Res = SelectWriteRegister(Node))
+      return Res;
+    break;
+
    case ISD::ADD:
      if (SDNode *I = SelectMLAV64LaneV128(Node))
        return I;
@@ -2082,6 +2352,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
    case ISD::SRA:
      if (SDNode *I = SelectBitfieldExtractOp(Node))
        return I;
+    if (SDNode *I = SelectBitfieldInsertInZeroOp(Node))
+      return I;
      break;
  
    case ISD::OR:
@@ -2111,14 +2383,16 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
                  .getVectorElementType()
                  .getSizeInBits()) {
      default:
-      assert(0 && "Unexpected vector element type!");
+      llvm_unreachable("Unexpected vector element type!");
      case 64:
        SubReg = AArch64::dsub;
        break;
      case 32:
        SubReg = AArch64::ssub;
        break;
-    case 16: // FALLTHROUGH
+    case 16:
+      SubReg = AArch64::hsub;
+      break;
      case 8:
        llvm_unreachable("unexpected zext-requiring extract element!");
      }
@@ -2149,9 +2423,11 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
      int FI = cast<FrameIndexSDNode>(Node)->getIndex();
      unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
      const TargetLowering *TLI = getTargetLowering();
-    SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
-    SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
-                      CurDAG->getTargetConstant(Shifter, MVT::i32) };
+    SDValue TFI = CurDAG->getTargetFrameIndex(
+        FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    SDLoc DL(Node);
+    SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
+                      CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
      return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
    }
    case ISD::INTRINSIC_W_CHAIN: {
@@ -2187,11 +2463,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        SDValue MemAddr = Node->getOperand(4);
  
        // Place arguments in the right order.
-      SmallVector<SDValue, 7> Ops;
-      Ops.push_back(ValLo);
-      Ops.push_back(ValHi);
-      Ops.push_back(MemAddr);
-      Ops.push_back(Chain);
+      SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain};
  
        SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
        // Transfer memoperands.
@@ -2206,9 +2478,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
        else if (VT == MVT::v16i8)
          return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
@@ -2224,9 +2496,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
        else if (VT == MVT::v16i8)
          return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
@@ -2242,9 +2514,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
        else if (VT == MVT::v16i8)
          return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
@@ -2260,9 +2532,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
        else if (VT == MVT::v16i8)
          return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
@@ -2278,9 +2550,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
        else if (VT == MVT::v16i8)
          return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
@@ -2296,9 +2568,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
        else if (VT == MVT::v16i8)
          return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16  || VT == MVT::v8f16)
          return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
@@ -2314,9 +2586,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
        else if (VT == MVT::v16i8)
          return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
@@ -2332,9 +2604,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
        else if (VT == MVT::v16i8)
          return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
@@ -2350,9 +2622,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
        else if (VT == MVT::v16i8)
          return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
@@ -2366,7 +2638,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
      case Intrinsic::aarch64_neon_ld2lane:
        if (VT == MVT::v16i8 || VT == MVT::v8i8)
          return SelectLoadLane(Node, 2, AArch64::LD2i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
          return SelectLoadLane(Node, 2, AArch64::LD2i16);
        else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                 VT == MVT::v2f32)
@@ -2378,7 +2651,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
      case Intrinsic::aarch64_neon_ld3lane:
        if (VT == MVT::v16i8 || VT == MVT::v8i8)
          return SelectLoadLane(Node, 3, AArch64::LD3i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
          return SelectLoadLane(Node, 3, AArch64::LD3i16);
        else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                 VT == MVT::v2f32)
@@ -2390,7 +2664,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
      case Intrinsic::aarch64_neon_ld4lane:
        if (VT == MVT::v16i8 || VT == MVT::v8i8)
          return SelectLoadLane(Node, 4, AArch64::LD4i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
          return SelectLoadLane(Node, 4, AArch64::LD4i16);
        else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                 VT == MVT::v2f32)
@@ -2450,9 +2725,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectStore(Node, 2, AArch64::ST1Twov8b);
        else if (VT == MVT::v16i8)
          return SelectStore(Node, 2, AArch64::ST1Twov16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectStore(Node, 2, AArch64::ST1Twov4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectStore(Node, 2, AArch64::ST1Twov8h);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectStore(Node, 2, AArch64::ST1Twov2s);
@@ -2469,9 +2744,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectStore(Node, 3, AArch64::ST1Threev8b);
        else if (VT == MVT::v16i8)
          return SelectStore(Node, 3, AArch64::ST1Threev16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectStore(Node, 3, AArch64::ST1Threev4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectStore(Node, 3, AArch64::ST1Threev8h);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectStore(Node, 3, AArch64::ST1Threev2s);
@@ -2488,9 +2763,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectStore(Node, 4, AArch64::ST1Fourv8b);
        else if (VT == MVT::v16i8)
          return SelectStore(Node, 4, AArch64::ST1Fourv16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectStore(Node, 4, AArch64::ST1Fourv4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectStore(Node, 4, AArch64::ST1Fourv8h);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectStore(Node, 4, AArch64::ST1Fourv2s);
@@ -2507,9 +2782,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectStore(Node, 2, AArch64::ST2Twov8b);
        else if (VT == MVT::v16i8)
          return SelectStore(Node, 2, AArch64::ST2Twov16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectStore(Node, 2, AArch64::ST2Twov4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectStore(Node, 2, AArch64::ST2Twov8h);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectStore(Node, 2, AArch64::ST2Twov2s);
@@ -2526,9 +2801,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectStore(Node, 3, AArch64::ST3Threev8b);
        else if (VT == MVT::v16i8)
          return SelectStore(Node, 3, AArch64::ST3Threev16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectStore(Node, 3, AArch64::ST3Threev4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectStore(Node, 3, AArch64::ST3Threev8h);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectStore(Node, 3, AArch64::ST3Threev2s);
@@ -2545,9 +2820,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
          return SelectStore(Node, 4, AArch64::ST4Fourv8b);
        else if (VT == MVT::v16i8)
          return SelectStore(Node, 4, AArch64::ST4Fourv16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
          return SelectStore(Node, 4, AArch64::ST4Fourv4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
          return SelectStore(Node, 4, AArch64::ST4Fourv8h);
        else if (VT == MVT::v2i32 || VT == MVT::v2f32)
          return SelectStore(Node, 4, AArch64::ST4Fourv2s);
@@ -2562,7 +2837,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
      case Intrinsic::aarch64_neon_st2lane: {
        if (VT == MVT::v16i8 || VT == MVT::v8i8)
          return SelectStoreLane(Node, 2, AArch64::ST2i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
          return SelectStoreLane(Node, 2, AArch64::ST2i16);
        else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                 VT == MVT::v2f32)
@@ -2575,7 +2851,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
      case Intrinsic::aarch64_neon_st3lane: {
        if (VT == MVT::v16i8 || VT == MVT::v8i8)
          return SelectStoreLane(Node, 3, AArch64::ST3i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
          return SelectStoreLane(Node, 3, AArch64::ST3i16);
        else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                 VT == MVT::v2f32)
@@ -2588,7 +2865,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
      case Intrinsic::aarch64_neon_st4lane: {
        if (VT == MVT::v16i8 || VT == MVT::v8i8)
          return SelectStoreLane(Node, 4, AArch64::ST4i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
          return SelectStoreLane(Node, 4, AArch64::ST4i16);
        else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                 VT == MVT::v2f32)
@@ -2599,15 +2877,16 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        break;
      }
      }
+    break;
    }
    case AArch64ISD::LD2post: {
      if (VT == MVT::v8i8)
        return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
      else if (VT == MVT::v16i8)
        return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
@@ -2624,9 +2903,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
      else if (VT == MVT::v16i8)
        return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
@@ -2643,9 +2922,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
      else if (VT == MVT::v16i8)
        return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
@@ -2662,9 +2941,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
      else if (VT == MVT::v16i8)
        return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
@@ -2681,9 +2960,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
      else if (VT == MVT::v16i8)
        return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
@@ -2700,9 +2979,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
      else if (VT == MVT::v16i8)
        return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
@@ -2719,9 +2998,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
      else if (VT == MVT::v16i8)
        return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
@@ -2738,9 +3017,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
      else if (VT == MVT::v16i8)
        return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
@@ -2757,9 +3036,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
      else if (VT == MVT::v16i8)
        return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
@@ -2776,9 +3055,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
      else if (VT == MVT::v16i8)
        return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
@@ -2793,7 +3072,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
    case AArch64ISD::LD1LANEpost: {
      if (VT == MVT::v16i8 || VT == MVT::v8i8)
        return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
        return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
               VT == MVT::v2f32)
@@ -2806,7 +3086,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
    case AArch64ISD::LD2LANEpost: {
      if (VT == MVT::v16i8 || VT == MVT::v8i8)
        return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
        return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
               VT == MVT::v2f32)
@@ -2819,7 +3100,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
    case AArch64ISD::LD3LANEpost: {
      if (VT == MVT::v16i8 || VT == MVT::v8i8)
        return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
        return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
               VT == MVT::v2f32)
@@ -2832,7 +3114,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
    case AArch64ISD::LD4LANEpost: {
      if (VT == MVT::v16i8 || VT == MVT::v8i8)
        return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
        return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
               VT == MVT::v2f32)
@@ -2848,9 +3131,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
      else if (VT == MVT::v16i8)
        return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
@@ -2868,9 +3151,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
      else if (VT == MVT::v16i8)
        return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
@@ -2888,9 +3171,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
      else if (VT == MVT::v16i8)
        return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
@@ -2908,9 +3191,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
      else if (VT == MVT::v16i8)
        return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
@@ -2928,9 +3211,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
      else if (VT == MVT::v16i8)
        return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
@@ -2948,9 +3231,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
      else if (VT == MVT::v16i8)
        return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
        return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
        return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
        return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
@@ -2966,7 +3249,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
      VT = Node->getOperand(1).getValueType();
      if (VT == MVT::v16i8 || VT == MVT::v8i8)
        return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
        return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
               VT == MVT::v2f32)
@@ -2980,7 +3264,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
      VT = Node->getOperand(1).getValueType();
      if (VT == MVT::v16i8 || VT == MVT::v8i8)
        return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
        return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
               VT == MVT::v2f32)
@@ -2994,7 +3279,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
      VT = Node->getOperand(1).getValueType();
      if (VT == MVT::v16i8 || VT == MVT::v8i8)
        return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
        return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
               VT == MVT::v2f32)
@@ -3004,14 +3290,6 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
        return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
      break;
    }
-
-  case ISD::FCEIL:
-  case ISD::FFLOOR:
-  case ISD::FTRUNC:
-  case ISD::FROUND:
-    if (SDNode *I = SelectLIBM(Node))
-      return I;
-    break;
    }
  
    // Select the default instruction