[AArch64] Don't force MVT::Untyped when selecting LD1LANEpost.

[oota-llvm.git] / lib / Target / AArch64 / AArch64ISelLowering.cpp
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp

index 9b8ff72dded5c783eb34651efc611069c7754b3c..3dc236a89da34172518587d541c57ed8f43998f1 100644 (file)
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12,9 +12,10 @@
  //===----------------------------------------------------------------------===//
  
  #include "AArch64ISelLowering.h"
+#include "AArch64CallingConvention.h"
+#include "AArch64MachineFunctionInfo.h"
  #include "AArch64PerfectShuffle.h"
  #include "AArch64Subtarget.h"
-#include "AArch64MachineFunctionInfo.h"
  #include "AArch64TargetMachine.h"
  #include "AArch64TargetObjectFile.h"
  #include "MCTargetDesc/AArch64AddressingModes.h"
@@ -38,10 +39,12 @@ using namespace llvm;
  STATISTIC(NumTailCalls, "Number of tail calls");
  STATISTIC(NumShiftInserts, "Number of vector shift inserts");
  
+namespace {
  enum AlignMode {
    StrictAlign,
    NoStrictAlign
  };
+}
  
  static cl::opt<AlignMode>
  Align(cl::desc("Load/store alignment support"),
@@ -61,22 +64,20 @@ EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
  
  static cl::opt<bool>
  EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
-                         cl::desc("Allow AArch64 SLI/SRI formation"),
-                         cl::init(false));
+                           cl::desc("Allow AArch64 SLI/SRI formation"),
+                           cl::init(false));
  
-//===----------------------------------------------------------------------===//
-// AArch64 Lowering public interface.
-//===----------------------------------------------------------------------===//
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
-  if (TT.isOSBinFormatMachO())
-    return new AArch64_MachoTargetObjectFile();
-
-  return new AArch64_ELFTargetObjectFile();
-}
+// FIXME: The necessary dtprel relocations don't seem to be supported
+// well in the GNU bfd and gold linkers at the moment. Therefore, by
+// default, for now, fall back to GeneralDynamic code generation.
+cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
+    "aarch64-elf-ldtls-generation", cl::Hidden,
+    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
+    cl::init(false));
  
-AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
-    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
-  Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
+                                             const AArch64Subtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
  
    // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
    // we have to make something up. Arbitrarily, choose ZeroOrOne.
@@ -106,6 +107,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
      addDRTypeForNEON(MVT::v2i32);
      addDRTypeForNEON(MVT::v1i64);
      addDRTypeForNEON(MVT::v1f64);
+    addDRTypeForNEON(MVT::v4f16);
  
      addQRTypeForNEON(MVT::v4f32);
      addQRTypeForNEON(MVT::v2f64);
@@ -113,10 +115,11 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
      addQRTypeForNEON(MVT::v8i16);
      addQRTypeForNEON(MVT::v4i32);
      addQRTypeForNEON(MVT::v2i64);
+    addQRTypeForNEON(MVT::v8f16);
    }
  
    // Compute derived properties from the register classes
-  computeRegisterProperties();
+  computeRegisterProperties(Subtarget->getRegisterInfo());
  
    // Provide all sorts of operation actions
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
@@ -278,10 +281,121 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  
+  // f16 is a storage-only type, always promote it to f32.
+  setOperationAction(ISD::SETCC,       MVT::f16,  Promote);
+  setOperationAction(ISD::BR_CC,       MVT::f16,  Promote);
+  setOperationAction(ISD::SELECT_CC,   MVT::f16,  Promote);
+  setOperationAction(ISD::SELECT,      MVT::f16,  Promote);
+  setOperationAction(ISD::FADD,        MVT::f16,  Promote);
+  setOperationAction(ISD::FSUB,        MVT::f16,  Promote);
+  setOperationAction(ISD::FMUL,        MVT::f16,  Promote);
+  setOperationAction(ISD::FDIV,        MVT::f16,  Promote);
+  setOperationAction(ISD::FREM,        MVT::f16,  Promote);
+  setOperationAction(ISD::FMA,         MVT::f16,  Promote);
+  setOperationAction(ISD::FNEG,        MVT::f16,  Promote);
+  setOperationAction(ISD::FABS,        MVT::f16,  Promote);
+  setOperationAction(ISD::FCEIL,       MVT::f16,  Promote);
+  setOperationAction(ISD::FCOPYSIGN,   MVT::f16,  Promote);
+  setOperationAction(ISD::FCOS,        MVT::f16,  Promote);
+  setOperationAction(ISD::FFLOOR,      MVT::f16,  Promote);
+  setOperationAction(ISD::FNEARBYINT,  MVT::f16,  Promote);
+  setOperationAction(ISD::FPOW,        MVT::f16,  Promote);
+  setOperationAction(ISD::FPOWI,       MVT::f16,  Promote);
+  setOperationAction(ISD::FRINT,       MVT::f16,  Promote);
+  setOperationAction(ISD::FSIN,        MVT::f16,  Promote);
+  setOperationAction(ISD::FSINCOS,     MVT::f16,  Promote);
+  setOperationAction(ISD::FSQRT,       MVT::f16,  Promote);
+  setOperationAction(ISD::FEXP,        MVT::f16,  Promote);
+  setOperationAction(ISD::FEXP2,       MVT::f16,  Promote);
+  setOperationAction(ISD::FLOG,        MVT::f16,  Promote);
+  setOperationAction(ISD::FLOG2,       MVT::f16,  Promote);
+  setOperationAction(ISD::FLOG10,      MVT::f16,  Promote);
+  setOperationAction(ISD::FROUND,      MVT::f16,  Promote);
+  setOperationAction(ISD::FTRUNC,      MVT::f16,  Promote);
+  setOperationAction(ISD::FMINNUM,     MVT::f16,  Promote);
+  setOperationAction(ISD::FMAXNUM,     MVT::f16,  Promote);
+
+  // v4f16 is also a storage-only type, so promote it to v4f32 when that is
+  // known to be safe.
+  setOperationAction(ISD::FADD, MVT::v4f16, Promote);
+  setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
+  setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
+  setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
+  AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
+
+  // Expand all other v4f16 operations.
+  // FIXME: We could generate better code by promoting some operations to
+  // a pair of v4f32s
+  setOperationAction(ISD::FABS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
+
+
+  // v8f16 is also a storage-only type, so expand it.
+  setOperationAction(ISD::FABS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+
    // AArch64 has implementations of a lot of rounding-like FP operations.
-  static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
-  for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
-    MVT Ty = RoundingTypes[I];
+  for (MVT Ty : {MVT::f32, MVT::f64}) {
      setOperationAction(ISD::FFLOOR, Ty, Legal);
      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
      setOperationAction(ISD::FCEIL, Ty, Legal);
@@ -303,13 +417,24 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    }
  
+  // Make floating-point constants legal for the large code model, so they don't
+  // become loads from the constant pool.
+  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+  }
+
    // AArch64 does not have floating-point extending loads, i1 sign-extending
    // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
-  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+  for (MVT VT : MVT::fp_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
+  }
+  for (MVT VT : MVT::integer_valuetypes())
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
+
    setTruncStoreAction(MVT::f32, MVT::f16, Expand);
    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    setTruncStoreAction(MVT::f64, MVT::f16, Expand);
@@ -381,6 +506,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
  
    // Enable TBZ/TBNZ
    MaskAndBranchFoldingIsLegal = true;
+  EnableExtLdPromotion = true;
  
    setMinFunctionAlignment(2);
  
@@ -439,36 +565,35 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
  
      // AArch64 doesn't have MUL.2d:
      setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+    // Custom handling for some quad-vector types to detect MULL.
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
      setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
      // Likewise, narrowing and extending vector loads/stores aren't handled
      // directly.
-    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-
-      setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
-                         Expand);
-
-      setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
-
-      setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
-
-      for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-           InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
-        setTruncStoreAction((MVT::SimpleValueType)VT,
-                            (MVT::SimpleValueType)InnerVT, Expand);
-      setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-      setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    for (MVT VT : MVT::vector_valuetypes()) {
+      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+      setOperationAction(ISD::MULHS, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+      setOperationAction(ISD::BSWAP, VT, Expand);
+
+      for (MVT InnerVT : MVT::vector_valuetypes()) {
+        setTruncStoreAction(VT, InnerVT, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+      }
      }
  
      // AArch64 has implementations of a lot of rounding-like FP operations.
-    static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 };
-    for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
-      MVT Ty = RoundingVecTypes[I];
+    for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
        setOperationAction(ISD::FFLOOR, Ty, Legal);
        setOperationAction(ISD::FNEARBYINT, Ty, Legal);
        setOperationAction(ISD::FCEIL, Ty, Legal);
@@ -477,16 +602,20 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
        setOperationAction(ISD::FROUND, Ty, Legal);
      }
    }
+
+  // Prefer likely predicted branches to selects on out-of-order cores.
+  if (Subtarget->isCortexA57())
+    PredictableSelectIsExpensive = true;
  }
  
  void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
-  if (VT == MVT::v2f32) {
+  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
      setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
      AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
  
      setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
      AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
-  } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
      setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
      AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
  
@@ -523,7 +652,8 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
    setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
-  setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
+  for (MVT InnerVT : MVT::all_valuetypes())
+    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);
  
    // CNT supports only B element sizes.
    if (VT != MVT::v8i8 && VT != MVT::v16i8)
@@ -630,13 +760,6 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
    return MVT::i64;
  }
  
-unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
-  // FIXME: On AArch64, this depends on the type.
-  // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes().
-  // and the offset has to be a multiple of the related size in bytes.
-  return 4095;
-}
-
  FastISel *
  AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                        const TargetLibraryInfo *libInfo) const {
@@ -659,7 +782,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case AArch64ISD::CSNEG:             return "AArch64ISD::CSNEG";
    case AArch64ISD::CSINC:             return "AArch64ISD::CSINC";
    case AArch64ISD::THREAD_POINTER:    return "AArch64ISD::THREAD_POINTER";
-  case AArch64ISD::TLSDESC_CALL:      return "AArch64ISD::TLSDESC_CALL";
+  case AArch64ISD::TLSDESC_CALLSEQ:   return "AArch64ISD::TLSDESC_CALLSEQ";
    case AArch64ISD::ADC:               return "AArch64ISD::ADC";
    case AArch64ISD::SBC:               return "AArch64ISD::SBC";
    case AArch64ISD::ADDS:              return "AArch64ISD::ADDS";
@@ -718,6 +841,12 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case AArch64ISD::FCMGTz:            return "AArch64ISD::FCMGTz";
    case AArch64ISD::FCMLEz:            return "AArch64ISD::FCMLEz";
    case AArch64ISD::FCMLTz:            return "AArch64ISD::FCMLTz";
+  case AArch64ISD::SADDV:             return "AArch64ISD::SADDV";
+  case AArch64ISD::UADDV:             return "AArch64ISD::UADDV";
+  case AArch64ISD::SMINV:             return "AArch64ISD::SMINV";
+  case AArch64ISD::UMINV:             return "AArch64ISD::UMINV";
+  case AArch64ISD::SMAXV:             return "AArch64ISD::SMAXV";
+  case AArch64ISD::UMAXV:             return "AArch64ISD::UMAXV";
    case AArch64ISD::NOT:               return "AArch64ISD::NOT";
    case AArch64ISD::BIT:               return "AArch64ISD::BIT";
    case AArch64ISD::CBZ:               return "AArch64ISD::CBZ";
@@ -727,6 +856,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case AArch64ISD::TC_RETURN:         return "AArch64ISD::TC_RETURN";
    case AArch64ISD::SITOF:             return "AArch64ISD::SITOF";
    case AArch64ISD::UITOF:             return "AArch64ISD::UITOF";
+  case AArch64ISD::NVCAST:            return "AArch64ISD::NVCAST";
    case AArch64ISD::SQSHL_I:           return "AArch64ISD::SQSHL_I";
    case AArch64ISD::UQSHL_I:           return "AArch64ISD::UQSHL_I";
    case AArch64ISD::SRSHR_I:           return "AArch64ISD::SRSHR_I";
@@ -756,6 +886,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
    case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
    case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
+  case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
+  case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
    }
  }
  
@@ -774,8 +906,8 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
    // EndBB:
    //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
  
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    DebugLoc DL = MI->getDebugLoc();
    MachineFunction::iterator It = MBB;
@@ -1020,6 +1152,8 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
  
  static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                               SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+  SDValue Cmp;
+  AArch64CC::CondCode AArch64CC;
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
      EVT VT = RHS.getValueType();
      uint64_t C = RHSC->getZExtValue();
@@ -1051,9 +1185,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
          break;
        case ISD::SETLE:
        case ISD::SETGT:
-        if ((VT == MVT::i32 && C != 0x7fffffff &&
+        if ((VT == MVT::i32 && C != INT32_MAX &&
               isLegalArithImmed((uint32_t)(C + 1))) ||
-            (VT == MVT::i64 && C != 0x7ffffffffffffffULL &&
+            (VT == MVT::i64 && C != INT64_MAX &&
               isLegalArithImmed(C + 1ULL))) {
            CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
            C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
@@ -1062,9 +1196,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
          break;
        case ISD::SETULE:
        case ISD::SETUGT:
-        if ((VT == MVT::i32 && C != 0xffffffff &&
+        if ((VT == MVT::i32 && C != UINT32_MAX &&
               isLegalArithImmed((uint32_t)(C + 1))) ||
-            (VT == MVT::i64 && C != 0xfffffffffffffffULL &&
+            (VT == MVT::i64 && C != UINT64_MAX &&
               isLegalArithImmed(C + 1ULL))) {
            CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
            C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
@@ -1074,9 +1208,45 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
        }
      }
    }
-
-  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
-  AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+  // For the i8 operand, the largest immediate is 255, so this can be easily
+  // encoded in the compare instruction. For the i16 operand, however, the
+  // largest immediate cannot be encoded in the compare.
+  // Therefore, use a sign extending load and cmn to avoid materializing the -1
+  // constant. For example,
+  // movz w1, #65535
+  // ldrh w0, [x0, #0]
+  // cmp w0, w1
+  // >
+  // ldrsh w0, [x0, #0]
+  // cmn w0, #1
+  // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
+  // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
+  // both the LHS and RHS are truely zero extended and to make sure the
+  // transformation is profitable.
+  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
+    if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
+        isa<LoadSDNode>(LHS)) {
+      if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+          cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+          LHS.getNode()->hasNUsesOfValue(1, 0)) {
+        int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+        if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+          SDValue SExt =
+              DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+                          DAG.getValueType(MVT::i16));
+          Cmp = emitComparison(SExt,
+                               DAG.getConstant(ValueofRHS, RHS.getValueType()),
+                               CC, dl, DAG);
+          AArch64CC = changeIntCCToAArch64CC(CC);
+          AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
+          return Cmp;
+        }
+      }
+    }
+  }
+  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC = changeIntCCToAArch64CC(CC);
    AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
    return Cmp;
  }
@@ -1113,7 +1283,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
    case ISD::SMULO:
    case ISD::UMULO: {
      CC = AArch64CC::NE;
-    bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false;
+    bool IsSigned = Op.getOpcode() == ISD::SMULO;
      if (Op.getValueType() == MVT::i32) {
        unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
        // For a 32 bit multiply with overflow check we want the instruction
@@ -1196,10 +1366,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
  
  SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
                                               RTLIB::Libcall Call) const {
-  SmallVector<SDValue, 2> Ops;
-  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
-    Ops.push_back(Op.getOperand(i));
-
+  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
    return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
                       SDLoc(Op)).first;
  }
@@ -1333,8 +1500,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
    SDLoc DL(Op);
    unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-  // The data thing is not used.
-  // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  
    bool IsStream = !Locality;
    // When the locality number is set
@@ -1349,6 +1515,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
  
    // built the mask value encoding the expected behavior.
    unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
+                   (!IsData << 3) |     // IsDataCache bit
                     (Locality << 1) |    // Cache level bits
                     (unsigned)IsStream;  // Stream bit
    return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
@@ -1400,7 +1567,10 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  
    if (VT.getSizeInBits() > InVT.getSizeInBits()) {
      SDLoc dl(Op);
-    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+    MVT ExtVT =
+        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                         VT.getVectorNumElements());
+    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
      return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
    }
  
@@ -1413,6 +1583,14 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
    if (Op.getOperand(0).getValueType().isVector())
      return LowerVectorFP_TO_INT(Op, DAG);
  
+  // f16 conversions are promoted to f32.
+  if (Op.getOperand(0).getValueType() == MVT::f16) {
+    SDLoc dl(Op);
+    return DAG.getNode(
+        Op.getOpcode(), dl, Op.getValueType(),
+        DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
+  }
+
    if (Op.getOperand(0).getValueType() != MVT::f128) {
      // It's legal except when f128 is involved
      return Op;
@@ -1424,10 +1602,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
    else
      LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
  
-  SmallVector<SDValue, 2> Ops;
-  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
-    Ops.push_back(Op.getOperand(i));
-
+  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
    return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
                       SDLoc(Op)).first;
  }
@@ -1465,6 +1640,15 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
    if (Op.getValueType().isVector())
      return LowerVectorINT_TO_FP(Op, DAG);
  
+  // f16 conversions are promoted to f32.
+  if (Op.getValueType() == MVT::f16) {
+    SDLoc dl(Op);
+    return DAG.getNode(
+        ISD::FP_ROUND, dl, MVT::f16,
+        DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
+        DAG.getIntPtrConstant(0));
+  }
+
    // i128 conversions are libcalls.
    if (Op.getOperand(0).getValueType() == MVT::i128)
      return SDValue();
@@ -1505,7 +1689,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
        (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
    SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
  
-  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
      .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
@@ -1529,6 +1713,197 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
        0);
  }
  
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+  if (OrigVT.getSizeInBits() >= 64)
+    return OrigVT;
+
+  assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+  switch (OrigSimpleTy) {
+  default: llvm_unreachable("Unexpected Vector Type");
+  case MVT::v2i8:
+  case MVT::v2i16:
+     return MVT::v2i32;
+  case MVT::v4i8:
+    return  MVT::v4i16;
+  }
+}
+
+static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
+                                                 const EVT &OrigTy,
+                                                 const EVT &ExtTy,
+                                                 unsigned ExtOpcode) {
+  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+  // 64-bits we need to insert a new extension so that it will be 64-bits.
+  assert(ExtTy.is128BitVector() && "Unexpected extension size");
+  if (OrigTy.getSizeInBits() >= 64)
+    return N;
+
+  // Must extend size to at least 64 bits to be used as an operand for VMULL.
+  EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+                                   bool isSigned) {
+  EVT VT = N->getValueType(0);
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+      unsigned HalfSize = EltSize / 2;
+      if (isSigned) {
+        if (!isIntN(HalfSize, C->getSExtValue()))
+          return false;
+      } else {
+        if (!isUIntN(HalfSize, C->getZExtValue()))
+          return false;
+      }
+      continue;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
+                                             N->getOperand(0)->getValueType(0),
+                                             N->getValueType(0),
+                                             N->getOpcode());
+
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+  EVT VT = N->getValueType(0);
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT TruncVT = MVT::getIntegerVT(EltSize);
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+    const APInt &CInt = C->getAPIntValue();
+    // Element types smaller than 32 bits are not legal, so use i32 elements.
+    // The values are implicitly truncated so sext vs. zext doesn't matter.
+    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
+                     MVT::getVectorVT(TruncVT, NumElts), Ops);
+}
+
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, true))
+    return true;
+  return false;
+}
+
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::ZERO_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, false))
+    return true;
+  return false;
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+  }
+  return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+  }
+  return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+  // Multiplications are only custom-lowered for 128-bit vectors so that
+  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
+  EVT VT = Op.getValueType();
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MUL");
+  SDNode *N0 = Op.getOperand(0).getNode();
+  SDNode *N1 = Op.getOperand(1).getNode();
+  unsigned NewOpc = 0;
+  bool isMLA = false;
+  bool isN0SExt = isSignExtended(N0, DAG);
+  bool isN1SExt = isSignExtended(N1, DAG);
+  if (isN0SExt && isN1SExt)
+    NewOpc = AArch64ISD::SMULL;
+  else {
+    bool isN0ZExt = isZeroExtended(N0, DAG);
+    bool isN1ZExt = isZeroExtended(N1, DAG);
+    if (isN0ZExt && isN1ZExt)
+      NewOpc = AArch64ISD::UMULL;
+    else if (isN1SExt || isN1ZExt) {
+      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+      if (isN1SExt && isAddSubSExt(N0, DAG)) {
+        NewOpc = AArch64ISD::SMULL;
+        isMLA = true;
+      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+        NewOpc =  AArch64ISD::UMULL;
+        isMLA = true;
+      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+        std::swap(N0, N1);
+        NewOpc =  AArch64ISD::UMULL;
+        isMLA = true;
+      }
+    }
+
+    if (!NewOpc) {
+      if (VT == MVT::v2i64)
+        // Fall through to expand this.  It is not legal.
+        return SDValue();
+      else
+        // Other vector multiplications are legal.
+        return Op;
+    }
+  }
+
+  // Legalize to a S/UMULL instruction
+  SDLoc DL(Op);
+  SDValue Op0;
+  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
+  if (!isMLA) {
+    Op0 = skipExtensionForVectorMULL(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
+  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
+  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
+  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}
  
  SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                                SelectionDAG &DAG) const {
@@ -1629,6 +2004,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
      return LowerFP_TO_INT(Op, DAG);
    case ISD::FSINCOS:
      return LowerFSINCOS(Op, DAG);
+  case ISD::MUL:
+    return LowerMUL(Op, DAG);
    }
  }
  
@@ -1643,8 +2020,7 @@ unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
  
  #include "AArch64GenCallingConv.inc"
  
-/// Selects the correct CCAssignFn for a the given CallingConvention
-/// value.
+/// Selects the correct CCAssignFn for a given CallingConvention value.
  CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                       bool IsVarArg) const {
    switch (CC) {
@@ -1652,6 +2028,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
      llvm_unreachable("Unsupported calling convention.");
    case CallingConv::WebKit_JS:
      return CC_AArch64_WebKit_JS;
+  case CallingConv::GHC:
+    return CC_AArch64_GHC;
    case CallingConv::C:
    case CallingConv::Fast:
      if (!Subtarget->isTargetDarwin())
@@ -1669,8 +2047,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
  
    // Assign locations to all of the incoming arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
  
    // At this point, Ins[].VT may already be promoted to i32. To correctly
    // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
@@ -1683,18 +2061,19 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
    unsigned CurArgIdx = 0;
    for (unsigned i = 0; i != NumArgs; ++i) {
      MVT ValVT = Ins[i].VT;
-    std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[i].OrigArgIndex;
-
-    // Get type of the original argument.
-    EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
-    MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
-    // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
-    if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
-      ValVT = MVT::i8;
-    else if (ActualMVT == MVT::i16)
-      ValVT = MVT::i16;
+    if (Ins[i].isOrigArg()) {
+      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[i].getOrigArgIndex();
  
+      // Get type of the original argument.
+      EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
+      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
+      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+        ValVT = MVT::i8;
+      else if (ActualMVT == MVT::i16)
+        ValVT = MVT::i16;
+    }
      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
      bool Res =
          AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
@@ -1774,10 +2153,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
      } else { // VA.isRegLoc()
        assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
        unsigned ArgOffset = VA.getLocMemOffset();
-      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
  
        uint32_t BEAlign = 0;
-      if (ArgSize < 8 && !Subtarget->isLittleEndian())
+      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
+          !Ins[i].Flags.isInConsecutiveRegs())
          BEAlign = 8 - ArgSize;
  
        int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
@@ -1809,7 +2189,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
  
        ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
                                  MachinePointerInfo::getFixedStack(FI),
-                                MemVT, false, false, false, nullptr);
+                                MemVT, false, false, false, 0);
  
        InVals.push_back(ArgValue);
      }
@@ -1869,8 +2249,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
                                            AArch64::X3, AArch64::X4, AArch64::X5,
                                            AArch64::X6, AArch64::X7 };
    static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
-  unsigned FirstVariadicGPR =
-      CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
+  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
  
    unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
    int GPRIdx = 0;
@@ -1898,8 +2277,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
          AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
          AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
      static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
-    unsigned FirstVariadicFPR =
-        CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
+    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
  
      unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
      int FPRIdx = 0;
@@ -1941,8 +2319,8 @@ SDValue AArch64TargetLowering::LowerCallResult(
                            : RetCC_AArch64_AAPCS;
    // Assign locations to each value returned by this call.
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC);
  
    // Copy all of the result registers out of their specified physreg.
@@ -2011,6 +2389,21 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
      return false;
    }
  
+  // Externally-defined functions with weak linkage should not be
+  // tail-called on AArch64 when the OS does not support dynamic
+  // pre-emption of symbols, as the AAELF spec requires normal calls
+  // to undefined weak functions to be replaced with a NOP or jump to the
+  // next instruction. The behaviour of branch instructions in this
+  // situation (as used for tail calls) is implementation-defined, so we
+  // cannot rely on the linker replacing the tail call with a return.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    const Triple TT(getTargetMachine().getTargetTriple());
+    if (GV->hasExternalWeakLinkage() &&
+        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
+      return false;
+  }
+
    // Now we search for cases where we can use a tail call without changing the
    // ABI. Sibcall is used in some places (particularly gcc) to refer to this
    // concept.
@@ -2028,8 +2421,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
      // FIXME: for now we take the most conservative of these in both cases:
      // disallow all variadic memory operands.
      SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                   getTargetMachine(), ArgLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                   *DAG.getContext());
  
      CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -2041,13 +2434,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
    // results are returned in the same way as what the caller expects.
    if (!CCMatch) {
      SmallVector<CCValAssign, 16> RVLocs1;
-    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs1, *DAG.getContext());
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
+                    *DAG.getContext());
      CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
  
      SmallVector<CCValAssign, 16> RVLocs2;
-    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs2, *DAG.getContext());
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
+                    *DAG.getContext());
      CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
  
      if (RVLocs1.size() != RVLocs2.size())
@@ -2072,8 +2465,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
      return true;
  
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
  
    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
  
@@ -2170,8 +2563,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
  
    // Analyze operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
  
    if (IsVarArg) {
      // Handle fixed and variable vector arguments differently.
@@ -2316,9 +2709,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
        // common case. It should also work for fundamental types too.
        uint32_t BEAlign = 0;
        unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
-                                        : VA.getLocVT().getSizeInBits();
+                                        : VA.getValVT().getSizeInBits();
        OpSize = (OpSize + 7) / 8;
-      if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
+      if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
+          !Flags.isInConsecutiveRegs()) {
          if (OpSize < 8)
            BEAlign = 8 - OpSize;
        }
@@ -2350,8 +2744,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
              DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
          SDValue Cpy = DAG.getMemcpy(
              Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
-            /*isVolatile = */ false,
-            /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
+            /*isVol = */ false, /*AlwaysInline = */ false,
+            /*isTailCall = */ false,
+            DstInfo, MachinePointerInfo());
  
          MemOpChains.push_back(Cpy);
        } else {
@@ -2440,18 +2835,16 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
  
    // Add a register mask operand representing the call-preserved registers.
    const uint32_t *Mask;
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const AArch64RegisterInfo *ARI =
-      static_cast<const AArch64RegisterInfo *>(TRI);
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
    if (IsThisReturn) {
      // For 'this' returns, use the X0-preserving mask if applicable
-    Mask = ARI->getThisReturnPreservedMask(CallConv);
+    Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
      if (!Mask) {
        IsThisReturn = false;
-      Mask = ARI->getCallPreservedMask(CallConv);
+      Mask = TRI->getCallPreservedMask(MF, CallConv);
      }
    } else
-    Mask = ARI->getCallPreservedMask(CallConv);
+    Mask = TRI->getCallPreservedMask(MF, CallConv);
  
    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));
@@ -2494,7 +2887,7 @@ bool AArch64TargetLowering::CanLowerReturn(
                            ? RetCC_AArch64_WebKit_JS
                            : RetCC_AArch64_AAPCS;
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
    return CCInfo.CheckReturn(Outs, RetCC);
  }
  
@@ -2508,8 +2901,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                            ? RetCC_AArch64_WebKit_JS
                            : RetCC_AArch64_AAPCS;
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
    CCInfo.AnalyzeReturn(Outs, RetCC);
  
    // Copy the result values into the output registers.
@@ -2560,7 +2953,8 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                    SelectionDAG &DAG) const {
    EVT PtrVT = getPointerTy();
    SDLoc DL(Op);
-  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GN->getGlobal();
    unsigned char OpFlags =
        Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
  
@@ -2575,6 +2969,25 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
      return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
    }
  
+  if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
+    assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+           "use of MO_CONSTPOOL only supported on small model");
+    SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
+    SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
+    SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+    SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
+                                     MachinePointerInfo::getConstantPool(),
+                                     /*isVolatile=*/ false,
+                                     /*isNonTemporal=*/ true,
+                                     /*isInvariant=*/ true, 8);
+    if (GN->getOffset() != 0)
+      return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
+                         DAG.getConstant(GN->getOffset(), PtrVT));
+    return GlobalAddr;
+  }
+
    if (getTargetMachine().getCodeModel() == CodeModel::Large) {
      const unsigned char MO_NC = AArch64II::MO_NC;
      return DAG.getNode(
@@ -2651,10 +3064,8 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
    // TLS calls preserve all registers except those that absolutely must be
    // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
    // silly).
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const AArch64RegisterInfo *ARI =
-      static_cast<const AArch64RegisterInfo *>(TRI);
-  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
+  const uint32_t *Mask =
+      Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
  
    // Finally, we can make the call. This is just a degenerate version of a
    // normal AArch64 call node: x0 takes the address of the descriptor, and
@@ -2670,60 +3081,34 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
  /// When accessing thread-local variables under either the general-dynamic or
  /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
  /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
-/// is a function pointer to carry out the resolution. This function takes the
-/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
-/// other registers (except LR, NZCV) are preserved.
-///
-/// Thus, the ideal call sequence on AArch64 is:
-///
-///     adrp x0, :tlsdesc:thread_var
-///     ldr x8, [x0, :tlsdesc_lo12:thread_var]
-///     add x0, x0, :tlsdesc_lo12:thread_var
-///     .tlsdesccall thread_var
-///     blr x8
-///     (TPIDR_EL0 offset now in x0).
+/// is a function pointer to carry out the resolution.
  ///
-/// The ".tlsdesccall" directive instructs the assembler to insert a particular
-/// relocation to help the linker relax this sequence if it turns out to be too
-/// conservative.
+/// The sequence is:
+///    adrp  x0, :tlsdesc:var
+///    ldr   x1, [x0, #:tlsdesc_lo12:var]
+///    add   x0, x0, #:tlsdesc_lo12:var
+///    .tlsdesccall var
+///    blr   x1
+///    (TPIDR_EL0 offset now in x0)
  ///
-/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
-/// is harmless.
-SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
-                                                   SDValue DescAddr, SDLoc DL,
-                                                   SelectionDAG &DAG) const {
+///  The above sequence must be produced unscheduled, to enable the linker to
+///  optimize/relax this sequence.
+///  Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
+///  above sequence, and expanded really late in the compilation flow, to ensure
+///  the sequence is produced as per above.
+SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
+                                                      SelectionDAG &DAG) const {
    EVT PtrVT = getPointerTy();
  
-  // The function we need to call is simply the first entry in the GOT for this
-  // descriptor, load it in preparation.
-  SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr);
+  SDValue Chain = DAG.getEntryNode();
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  
-  // TLS calls preserve all registers except those that absolutely must be
-  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
-  // silly).
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const AArch64RegisterInfo *ARI =
-      static_cast<const AArch64RegisterInfo *>(TRI);
-  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
-
-  // The function takes only one argument: the address of the descriptor itself
-  // in X0.
-  SDValue Glue, Chain;
-  Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
-  Glue = Chain.getValue(1);
-
-  // We're now ready to populate the argument list, as with a normal call:
-  SmallVector<SDValue, 6> Ops;
+  SmallVector<SDValue, 2> Ops;
    Ops.push_back(Chain);
-  Ops.push_back(Func);
    Ops.push_back(SymAddr);
-  Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
-  Ops.push_back(DAG.getRegisterMask(Mask));
-  Ops.push_back(Glue);
  
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-  Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops);
-  Glue = Chain.getValue(1);
+  Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops);
+  SDValue Glue = Chain.getValue(1);
  
    return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
  }
@@ -2734,9 +3119,18 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
    assert(Subtarget->isTargetELF() && "This function expects an ELF target");
    assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
           "ELF TLS only supported in small memory model");
+  // Different choices can be made for the maximum size of the TLS area for a
+  // module. For the small address model, the default TLS size is 16MiB and the
+  // maximum TLS size is 4GiB.
+  // FIXME: add -mtls-size command line option and make it control the 16MiB
+  // vs. 4GiB code sequence generation.
    const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  
    TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
+  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
+    if (Model == TLSModel::LocalDynamic)
+      Model = TLSModel::GeneralDynamic;
+  }
  
    SDValue TPOff;
    EVT PtrVT = getPointerTy();
@@ -2747,17 +3141,20 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
  
    if (Model == TLSModel::LocalExec) {
      SDValue HiVar = DAG.getTargetGlobalAddress(
-        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
      SDValue LoVar = DAG.getTargetGlobalAddress(
          GV, DL, PtrVT, 0,
-        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
+        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  
-    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
-                                       DAG.getTargetConstant(16, MVT::i32)),
-                    0);
-    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
-                                       DAG.getTargetConstant(0, MVT::i32)),
-                    0);
+    SDValue TPWithOff_lo =
+        SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
+                                   HiVar, DAG.getTargetConstant(0, MVT::i32)),
+                0);
+    SDValue TPWithOff =
+        SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
+                                   LoVar, DAG.getTargetConstant(0, MVT::i32)),
+                0);
+    return TPWithOff;
    } else if (Model == TLSModel::InitialExec) {
      TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
      TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
@@ -2772,19 +3169,6 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
          DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
      MFI->incNumLocalDynamicTLSAccesses();
  
-    // Accesses used in this sequence go via the TLS descriptor which lives in
-    // the GOT. Prepare an address we can use to handle this.
-    SDValue HiDesc = DAG.getTargetExternalSymbol(
-        "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE);
-    SDValue LoDesc = DAG.getTargetExternalSymbol(
-        "_TLS_MODULE_BASE_", PtrVT,
-        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
-
-    // First argument to the descriptor call is the address of the descriptor
-    // itself.
-    SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
-    DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
-
      // The call needs a relocation too for linker relaxation. It doesn't make
      // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
      // the address.
@@ -2793,40 +3177,23 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
  
      // Now we can calculate the offset from TPIDR_EL0 to this module's
      // thread-local area.
-    TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
+    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  
      // Now use :dtprel_whatever: operations to calculate this variable's offset
      // in its thread-storage area.
      SDValue HiVar = DAG.getTargetGlobalAddress(
-        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
      SDValue LoVar = DAG.getTargetGlobalAddress(
          GV, DL, MVT::i64, 0,
-        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
-
-    SDValue DTPOff =
-        SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
-                                   DAG.getTargetConstant(16, MVT::i32)),
-                0);
-    DTPOff =
-        SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar,
-                                   DAG.getTargetConstant(0, MVT::i32)),
-                0);
-
-    TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff);
-  } else if (Model == TLSModel::GeneralDynamic) {
-    // Accesses used in this sequence go via the TLS descriptor which lives in
-    // the GOT. Prepare an address we can use to handle this.
-    SDValue HiDesc = DAG.getTargetGlobalAddress(
-        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE);
-    SDValue LoDesc = DAG.getTargetGlobalAddress(
-        GV, DL, PtrVT, 0,
          AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  
-    // First argument to the descriptor call is the address of the descriptor
-    // itself.
-    SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
-    DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
-
+    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
+                                       DAG.getTargetConstant(0, MVT::i32)),
+                    0);
+    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
+                                       DAG.getTargetConstant(0, MVT::i32)),
+                    0);
+  } else if (Model == TLSModel::GeneralDynamic) {
      // The call needs a relocation too for linker relaxation. It doesn't make
      // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
      // the address.
@@ -2834,7 +3201,7 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
          DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  
      // Finally we can make a call to calculate the offset from tpidr_el0.
-    TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
+    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
    } else
      llvm_unreachable("Unsupported ELF TLS access model");
  
@@ -2894,8 +3261,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
        OFCC = getInvertedCondCode(OFCC);
      SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
  
-    return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
-                       CCVal, Overflow);
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+                       Overflow);
    }
  
    if (LHS.getValueType().isInteger()) {
@@ -2916,11 +3283,6 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
              isPowerOf2_64(LHS.getConstantOperandVal(1))) {
            SDValue Test = LHS.getOperand(0);
            uint64_t Mask = LHS.getConstantOperandVal(1);
-
-          // TBZ only operates on i64's, but the ext should be free.
-          if (Test.getValueType() == MVT::i32)
-            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
            return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
                               DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
          }
@@ -2936,18 +3298,29 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
              isPowerOf2_64(LHS.getConstantOperandVal(1))) {
            SDValue Test = LHS.getOperand(0);
            uint64_t Mask = LHS.getConstantOperandVal(1);
-
-          // TBNZ only operates on i64's, but the ext should be free.
-          if (Test.getValueType() == MVT::i32)
-            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
            return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
                               DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
          }
  
          return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
+      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
+        // Don't combine AND since emitComparison converts the AND to an ANDS
+        // (a.k.a. TST) and the test in the test bit and branch instruction
+        // becomes redundant.  This would also increase register pressure.
+        uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
+                           DAG.getConstant(Mask, MVT::i64), Dest);
        }
      }
+    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
+        LHS.getOpcode() != ISD::AND) {
+      // Don't combine AND since emitComparison converts the AND to an ANDS
+      // (a.k.a. TST) and the test in the test bit and branch instruction
+      // becomes redundant.  This would also increase register pressure.
+      uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
+                         DAG.getConstant(Mask, MVT::i64), Dest);
+    }
  
      SDValue CCVal;
      SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
@@ -2995,11 +3368,12 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
  
    EVT VecVT;
    EVT EltVT;
-  SDValue EltMask, VecVal1, VecVal2;
+  uint64_t EltMask;
+  SDValue VecVal1, VecVal2;
    if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
      EltVT = MVT::i32;
      VecVT = MVT::v4i32;
-    EltMask = DAG.getConstant(0x80000000ULL, EltVT);
+    EltMask = 0x80000000ULL;
  
      if (!VT.isVector()) {
        VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
@@ -3017,7 +3391,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
      // We want to materialize a mask with the the high bit set, but the AdvSIMD
      // immediate moves cannot materialize that in a single instruction for
      // 64-bit elements. Instead, materialize zero and then negate it.
-    EltMask = DAG.getConstant(0, EltVT);
+    EltMask = 0;
  
      if (!VT.isVector()) {
        VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
@@ -3032,11 +3406,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
      llvm_unreachable("Invalid type for copysign!");
    }
  
-  std::vector<SDValue> BuildVectorOps;
-  for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
-    BuildVectorOps.push_back(EltMask);
-
-  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);
+  SDValue BuildVec = DAG.getConstant(EltMask, VecVT);
  
    // If we couldn't materialize the mask above, then the mask vector will be
    // the zero vector, and we need to negate it here.
@@ -3058,8 +3428,11 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
  }
  
  SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
-  if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
-          AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
+  if (DAG.getMachineFunction().getFunction()->hasFnAttribute(
+          Attribute::NoImplicitFloat))
+    return SDValue();
+
+  if (!Subtarget->hasNEON())
      return SDValue();
  
    // While there is no integer popcount instruction, it can
@@ -3073,18 +3446,12 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
    SDValue Val = Op.getOperand(0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
-  SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);
  
-  SDValue VecVal;
-  if (VT == MVT::i32) {
-    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
-    VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec,
-                                       VecVal);
-  } else {
-    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
-  }
+  if (VT == MVT::i32)
+    Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
+  Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
  
-  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
+  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
    SDValue UaddLV = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop);
@@ -3191,49 +3558,10 @@ static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
    return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
  }
  
-SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  SDValue CC = Op->getOperand(0);
-  SDValue TVal = Op->getOperand(1);
-  SDValue FVal = Op->getOperand(2);
-  SDLoc DL(Op);
-
-  unsigned Opc = CC.getOpcode();
-  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
-  // instruction.
-  if (CC.getResNo() == 1 &&
-      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
-       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
-    // Only lower legal XALUO ops.
-    if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
-      return SDValue();
-
-    AArch64CC::CondCode OFCC;
-    SDValue Value, Overflow;
-    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG);
-    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
-
-    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
-                       CCVal, Overflow);
-  }
-
-  if (CC.getOpcode() == ISD::SETCC)
-    return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
-                           cast<CondCodeSDNode>(CC.getOperand(2))->get());
-  else
-    return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
-                           FVal, ISD::SETNE);
-}
-
-SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
+SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
+                                              SDValue RHS, SDValue TVal,
+                                              SDValue FVal, SDLoc dl,
                                                SelectionDAG &DAG) const {
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDValue TVal = Op.getOperand(2);
-  SDValue FVal = Op.getOperand(3);
-  SDLoc dl(Op);
-
    // Handle f128 first, because it will result in a comparison of some RTLIB
    // call result against zero.
    if (LHS.getValueType() == MVT::f128) {
@@ -3341,14 +3669,14 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
      SDValue CCVal;
      SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
  
-    EVT VT = Op.getValueType();
+    EVT VT = TVal.getValueType();
      return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
    }
  
    // Now we know we're dealing with FP values.
    assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
    assert(LHS.getValueType() == RHS.getValueType());
-  EVT VT = Op.getValueType();
+  EVT VT = TVal.getValueType();
  
    // Try to match this select into a max/min operation, which have dedicated
    // opcode in the instruction set.
@@ -3409,6 +3737,58 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
    return CS1;
  }
  
+SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue TVal = Op.getOperand(2);
+  SDValue FVal = Op.getOperand(3);
+  SDLoc DL(Op);
+  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+}
+
+SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue CCVal = Op->getOperand(0);
+  SDValue TVal = Op->getOperand(1);
+  SDValue FVal = Op->getOperand(2);
+  SDLoc DL(Op);
+
+  unsigned Opc = CCVal.getOpcode();
+  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
+  // instruction.
+  if (CCVal.getResNo() == 1 &&
+      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+    // Only lower legal XALUO ops.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
+      return SDValue();
+
+    AArch64CC::CondCode OFCC;
+    SDValue Value, Overflow;
+    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
+    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
+
+    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
+                       CCVal, Overflow);
+  }
+
+  // Lower it the same way as we would lower a SELECT_CC node.
+  ISD::CondCode CC;
+  SDValue LHS, RHS;
+  if (CCVal.getOpcode() == ISD::SETCC) {
+    LHS = CCVal.getOperand(0);
+    RHS = CCVal.getOperand(1);
+    CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
+  } else {
+    LHS = CCVal;
+    RHS = DAG.getConstant(0, CCVal.getValueType());
+    CC = ISD::SETNE;
+  }
+  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+}
+
  SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
                                                SelectionDAG &DAG) const {
    // Jump table entries as PC relative offsets. No additional tweaking
@@ -3597,7 +3977,7 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
  
    return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1),
                         Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32),
-                       8, false, false, MachinePointerInfo(DestSV),
+                       8, false, false, false, MachinePointerInfo(DestSV),
                         MachinePointerInfo(SrcSV));
  }
  
@@ -3905,7 +4285,8 @@ AArch64TargetLowering::getSingleConstraintMatchWeight(
  
  std::pair<unsigned, const TargetRegisterClass *>
  AArch64TargetLowering::getRegForInlineAsmConstraint(
-    const std::string &Constraint, MVT VT) const {
+    const TargetRegisterInfo *TRI, const std::string &Constraint,
+    MVT VT) const {
    if (Constraint.size() == 1) {
      switch (Constraint[0]) {
      case 'r':
@@ -3934,7 +4315,7 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
    // Use the default implementation in TargetLowering to convert the register
    // constraint into a member of a register class.
    std::pair<unsigned, const TargetRegisterClass *> Res;
-  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  
    // Not found as a standard register?
    if (!Res.second) {
@@ -4013,8 +4394,10 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint(
        return;
      case 'J': {
        uint64_t NVal = -C->getSExtValue();
-      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
+      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
+        CVal = C->getSExtValue();
          break;
+      }
        return;
      }
      // The K and L constraints apply *only* to logical immediates, including
@@ -4138,10 +4521,30 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
    EVT VT = Op.getValueType();
    unsigned NumElts = VT.getVectorNumElements();
  
-  SmallVector<SDValue, 2> SourceVecs;
-  SmallVector<unsigned, 2> MinElts;
-  SmallVector<unsigned, 2> MaxElts;
+  struct ShuffleSourceInfo {
+    SDValue Vec;
+    unsigned MinElt;
+    unsigned MaxElt;
+
+    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
+    // be compatible with the shuffle we intend to construct. As a result
+    // ShuffleVec will be some sliding window into the original Vec.
+    SDValue ShuffleVec;
+
+    // Code should guarantee that element i in Vec starts at element "WindowBase
+    // + i * WindowScale in ShuffleVec".
+    int WindowBase;
+    int WindowScale;
+
+    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+    ShuffleSourceInfo(SDValue Vec)
+        : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
+          WindowScale(1) {}
+  };
  
+  // First gather all vectors used as an immediate source for this BUILD_VECTOR
+  // node.
+  SmallVector<ShuffleSourceInfo, 2> Sources;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.getOpcode() == ISD::UNDEF)
@@ -4152,133 +4555,155 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
        return SDValue();
      }
  
-    // Record this extraction against the appropriate vector if possible...
+    // Add this element source to the list if it's not already there.
      SDValue SourceVec = V.getOperand(0);
-    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
-    bool FoundSource = false;
-    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
-      if (SourceVecs[j] == SourceVec) {
-        if (MinElts[j] > EltNo)
-          MinElts[j] = EltNo;
-        if (MaxElts[j] < EltNo)
-          MaxElts[j] = EltNo;
-        FoundSource = true;
-        break;
-      }
-    }
+    auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
+    if (Source == Sources.end())
+      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
  
-    // Or record a new source if not...
-    if (!FoundSource) {
-      SourceVecs.push_back(SourceVec);
-      MinElts.push_back(EltNo);
-      MaxElts.push_back(EltNo);
-    }
+    // Update the minimum and maximum lane number seen.
+    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+    Source->MinElt = std::min(Source->MinElt, EltNo);
+    Source->MaxElt = std::max(Source->MaxElt, EltNo);
    }
  
    // Currently only do something sane when at most two source vectors
-  // involved.
-  if (SourceVecs.size() > 2)
+  // are involved.
+  if (Sources.size() > 2)
      return SDValue();
  
-  SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
-  int VEXTOffsets[2] = { 0, 0 };
-  int OffsetMultipliers[2] = { 1, 1 };
-
-  // This loop extracts the usage patterns of the source vectors
-  // and prepares appropriate SDValues for a shuffle if possible.
-  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
-    unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements();
-    SDValue CurSource = SourceVecs[i];
-    if (SourceVecs[i].getValueType().getVectorElementType() !=
-        VT.getVectorElementType()) {
-      // It may hit this case if SourceVecs[i] is AssertSext/AssertZext.
-      // Then bitcast it to the vector which holds asserted element type,
-      // and record the multiplier of element width between SourceVecs and
-      // Build_vector which is needed to extract the correct lanes later.
-      EVT CastVT =
-          EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
-                           SourceVecs[i].getValueSizeInBits() /
-                               VT.getVectorElementType().getSizeInBits());
-
-      CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]);
-      OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts;
-      NumSrcElts *= OffsetMultipliers[i];
-      MaxElts[i] *= OffsetMultipliers[i];
-      MinElts[i] *= OffsetMultipliers[i];
+  // Find out the smallest element size among result and two sources, and use
+  // it as element size to build the shuffle_vector.
+  EVT SmallestEltTy = VT.getVectorElementType();
+  for (auto &Source : Sources) {
+    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
+    if (SrcEltTy.bitsLT(SmallestEltTy)) {
+      SmallestEltTy = SrcEltTy;
      }
+  }
+  unsigned ResMultiplier =
+      VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
+  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
+  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
  
-    if (CurSource.getValueType() == VT) {
-      // No VEXT necessary
-      ShuffleSrcs[i] = CurSource;
-      VEXTOffsets[i] = 0;
+  // If the source vector is too wide or too narrow, we may nevertheless be able
+  // to construct a compatible shuffle either by concatenating it with UNDEF or
+  // extracting a suitable range of elements.
+  for (auto &Src : Sources) {
+    EVT SrcVT = Src.ShuffleVec.getValueType();
+
+    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
        continue;
-    } else if (NumSrcElts < NumElts) {
+
+    // This stage of the search produces a source with the same element type as
+    // the original, but with a total width matching the BUILD_VECTOR output.
+    EVT EltVT = SrcVT.getVectorElementType();
+    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
+
+    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
+      assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
        // We can pad out the smaller vector for free, so if it's part of a
        // shuffle...
-      ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource,
-                                   DAG.getUNDEF(CurSource.getValueType()));
+      Src.ShuffleVec =
+          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
+                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
        continue;
      }
  
-    // Since only 64-bit and 128-bit vectors are legal on ARM and
-    // we've eliminated the other cases...
-    assert(NumSrcElts == 2 * NumElts &&
-           "unexpected vector sizes in ReconstructShuffle");
+    assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
  
-    if (MaxElts[i] - MinElts[i] >= NumElts) {
+    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
        // Span too large for a VEXT to cope
        return SDValue();
      }
  
-    if (MinElts[i] >= NumElts) {
+    if (Src.MinElt >= NumSrcElts) {
        // The extraction can just take the second half
-      VEXTOffsets[i] = NumElts;
-      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                   DAG.getIntPtrConstant(NumElts));
-    } else if (MaxElts[i] < NumElts) {
+      Src.ShuffleVec =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(NumSrcElts, MVT::i64));
+      Src.WindowBase = -NumSrcElts;
+    } else if (Src.MaxElt < NumSrcElts) {
        // The extraction can just take the first half
-      VEXTOffsets[i] = 0;
-      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                   DAG.getIntPtrConstant(0));
+      Src.ShuffleVec =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(0, MVT::i64));
      } else {
        // An actual VEXT is needed
-      VEXTOffsets[i] = MinElts[i];
-      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                     DAG.getIntPtrConstant(0));
-      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                     DAG.getIntPtrConstant(NumElts));
-      unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
-      ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
-                                   DAG.getConstant(Imm, MVT::i32));
+      SDValue VEXTSrc1 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(0, MVT::i64));
+      SDValue VEXTSrc2 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(NumSrcElts, MVT::i64));
+      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
+
+      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
+                                   VEXTSrc2, DAG.getConstant(Imm, MVT::i32));
+      Src.WindowBase = -Src.MinElt;
      }
    }
  
-  SmallVector<int, 8> Mask;
-
-  for (unsigned i = 0; i < NumElts; ++i) {
+  // Another possible incompatibility occurs from the vector element types. We
+  // can fix this by bitcasting the source vectors to the same type we intend
+  // for the shuffle.
+  for (auto &Src : Sources) {
+    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
+    if (SrcEltTy == SmallestEltTy)
+      continue;
+    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
+    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
+    Src.WindowBase *= Src.WindowScale;
+  }
+
+  // Final sanity check before we try to actually produce a shuffle.
+  DEBUG(
+    for (auto Src : Sources)
+      assert(Src.ShuffleVec.getValueType() == ShuffleVT);
+  );
+
+  // The stars all align, our next step is to produce the mask for the shuffle.
+  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
+  int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
+  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
      SDValue Entry = Op.getOperand(i);
-    if (Entry.getOpcode() == ISD::UNDEF) {
-      Mask.push_back(-1);
+    if (Entry.getOpcode() == ISD::UNDEF)
        continue;
-    }
  
-    SDValue ExtractVec = Entry.getOperand(0);
-    int ExtractElt =
-        cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
-    if (ExtractVec == SourceVecs[0]) {
-      Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]);
-    } else {
-      Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts -
-                     VEXTOffsets[1]);
-    }
+    auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
+    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
+
+    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
+    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
+    // segment.
+    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
+    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
+                               VT.getVectorElementType().getSizeInBits());
+    int LanesDefined = BitsDefined / BitsPerShuffleLane;
+
+    // This source is expected to fill ResMultiplier lanes of the final shuffle,
+    // starting at the appropriate offset.
+    int *LaneMask = &Mask[i * ResMultiplier];
+
+    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
+    ExtractBase += NumElts * (Src - Sources.begin());
+    for (int j = 0; j < LanesDefined; ++j)
+      LaneMask[j] = ExtractBase + j;
    }
  
    // Final check before we try to produce nonsense...
-  if (isShuffleMaskLegal(Mask, VT))
-    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
-                                &Mask[0]);
+  if (!isShuffleMaskLegal(Mask, ShuffleVT))
+    return SDValue();
  
-  return SDValue();
+  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+  for (unsigned i = 0; i < Sources.size(); ++i)
+    ShuffleOps[i] = Sources[i].ShuffleVec;
+
+  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+                                         ShuffleOps[1], &Mask[0]);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
  }
  
  // check if an EXT instruction can handle the shuffle mask when the
@@ -4607,7 +5032,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
          VT.getVectorElementType() == MVT::f32)
        return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
      // vrev <4 x i16> -> REV32
-    if (VT.getVectorElementType() == MVT::i16)
+    if (VT.getVectorElementType() == MVT::i16 ||
+        VT.getVectorElementType() == MVT::f16)
        return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
      // vrev <4 x i8> -> REV16
      assert(VT.getVectorElementType() == MVT::i8);
@@ -4620,7 +5046,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
      unsigned Opcode;
      if (EltTy == MVT::i8)
        Opcode = AArch64ISD::DUPLANE8;
-    else if (EltTy == MVT::i16)
+    else if (EltTy == MVT::i16 || EltTy == MVT::f16)
        Opcode = AArch64ISD::DUPLANE16;
      else if (EltTy == MVT::i32 || EltTy == MVT::f32)
        Opcode = AArch64ISD::DUPLANE32;
@@ -4727,7 +5153,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
  static unsigned getDUPLANEOp(EVT EltType) {
    if (EltType == MVT::i8)
      return AArch64ISD::DUPLANE8;
-  if (EltType == MVT::i16)
+  if (EltType == MVT::i16 || EltType == MVT::f16)
      return AArch64ISD::DUPLANE16;
    if (EltType == MVT::i32 || EltType == MVT::f32)
      return AArch64ISD::DUPLANE32;
@@ -4857,7 +5283,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
      SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
  
      EVT ScalarVT = VT.getVectorElementType();
-    if (ScalarVT.getSizeInBits() < 32)
+
+    if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
        ScalarVT = MVT::i32;
  
      return DAG.getNode(
@@ -4945,7 +5372,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -4954,7 +5381,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -4963,7 +5390,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -4972,7 +5399,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -4981,7 +5408,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -4990,7 +5417,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
      }
  
@@ -5145,7 +5572,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5154,7 +5581,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5163,7 +5590,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5172,7 +5599,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5181,7 +5608,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5190,7 +5617,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
      }
  
@@ -5263,13 +5690,13 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          if (VT.getSizeInBits() == 128) {
            SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
                                      DAG.getConstant(CnstVal, MVT::i32));
-          return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+          return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
          }
  
          // Support the V64 version via subregister insertion.
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
                                    DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
@@ -5278,7 +5705,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5287,7 +5714,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5296,7 +5723,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5305,7 +5732,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5314,7 +5741,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5323,7 +5750,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
@@ -5332,7 +5759,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(264, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
@@ -5341,7 +5768,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(272, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
@@ -5349,7 +5776,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
          SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        // The few faces of FMOV...
@@ -5358,7 +5785,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
          SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
@@ -5366,7 +5793,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
          SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
                                    DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        // The many faces of MVNI...
@@ -5377,7 +5804,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5386,7 +5813,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5395,7 +5822,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5404,7 +5831,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5413,7 +5840,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5422,7 +5849,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
@@ -5431,7 +5858,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(264, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
@@ -5440,7 +5867,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(272, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
      }
  
@@ -5522,8 +5949,10 @@ FailedModImm:
  
      if (VT.getVectorElementType().isFloatingPoint()) {
        SmallVector<SDValue, 8> Ops;
-      MVT NewType =
-          (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+      EVT EltTy = VT.getVectorElementType();
+      assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
+              "Unsupported floating-point vector type");
+      MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
        for (unsigned i = 0; i < NumElts; ++i)
          Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
        EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
@@ -5616,11 +6045,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
  
    // Insertion/extraction are legal for V128 types.
    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
      return Op;
  
    if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
      return SDValue();
  
    // For V64 types, we perform insertion by expanding the value
@@ -5649,11 +6079,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
  
    // Insertion/extraction are legal for V128 types.
    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
      return Op;
  
    if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
      return SDValue();
  
    // For V64 types, we perform extraction by expanding the value
@@ -5850,6 +6281,8 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                      AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                      SDLoc dl, SelectionDAG &DAG) {
    EVT SrcVT = LHS.getValueType();
+  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
+         "function only supposed to emit natural comparisons");
  
    BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
    APInt CnstBits(VT.getSizeInBits(), 0);
@@ -5944,13 +6377,15 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    SDValue LHS = Op.getOperand(0);
    SDValue RHS = Op.getOperand(1);
+  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
    SDLoc dl(Op);
  
    if (LHS.getValueType().getVectorElementType().isInteger()) {
      assert(LHS.getValueType() == RHS.getValueType());
      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
-    return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(),
-                                dl, DAG);
+    SDValue Cmp =
+        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
+    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
    }
  
    assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
@@ -5964,19 +6399,21 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
  
    bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
    SDValue Cmp =
-      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
+      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
    if (!Cmp.getNode())
      return SDValue();
  
    if (CC2 != AArch64CC::AL) {
      SDValue Cmp2 =
-        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
+        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
      if (!Cmp2.getNode())
        return SDValue();
  
-    Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2);
+    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
    }
  
+  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
+
    if (ShouldInvert)
      return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
  
@@ -6114,6 +6551,34 @@ bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
    return NumBits1 > NumBits2;
  }
  
+/// Check if it is profitable to hoist instruction in then/else to if.
+/// Not profitable if I and it's user can form a FMA instruction
+/// because we prefer FMSUB/FMADD.
+bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
+  if (I->getOpcode() != Instruction::FMul)
+    return true;
+
+  if (I->getNumUses() != 1)
+    return true;
+
+  Instruction *User = I->user_back();
+
+  if (User &&
+      !(User->getOpcode() == Instruction::FSub ||
+        User->getOpcode() == Instruction::FAdd))
+    return true;
+
+  const TargetOptions &Options = getTargetMachine().Options;
+  EVT VT = getValueType(User->getOperand(0)->getType());
+
+  if (isFMAFasterThanFMulAndFAdd(VT) &&
+      isOperationLegalOrCustom(ISD::FMA, VT) &&
+      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath))
+    return false;
+
+  return true;
+}
+
  // All 32-bit GPR operations implicitly zero the high-half of the corresponding
  // 64-bit GPR.
  bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
@@ -6146,6 +6611,59 @@ bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
            VT1.getSizeInBits() <= 32);
  }
  
+bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
+  if (isa<FPExtInst>(Ext))
+    return false;
+
+  // Vector types are next free.
+  if (Ext->getType()->isVectorTy())
+    return false;
+
+  for (const Use &U : Ext->uses()) {
+    // The extension is free if we can fold it with a left shift in an
+    // addressing mode or an arithmetic operation: add, sub, and cmp.
+
+    // Is there a shift?
+    const Instruction *Instr = cast<Instruction>(U.getUser());
+
+    // Is this a constant shift?
+    switch (Instr->getOpcode()) {
+    case Instruction::Shl:
+      if (!isa<ConstantInt>(Instr->getOperand(1)))
+        return false;
+      break;
+    case Instruction::GetElementPtr: {
+      gep_type_iterator GTI = gep_type_begin(Instr);
+      std::advance(GTI, U.getOperandNo());
+      Type *IdxTy = *GTI;
+      // This extension will end up with a shift because of the scaling factor.
+      // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
+      // Get the shift amount based on the scaling factor:
+      // log2(sizeof(IdxTy)) - log2(8).
+      uint64_t ShiftAmt =
+        countTrailingZeros(getDataLayout()->getTypeStoreSizeInBits(IdxTy)) - 3;
+      // Is the constant foldable in the shift of the addressing mode?
+      // I.e., shift amount is between 1 and 4 inclusive.
+      if (ShiftAmt == 0 || ShiftAmt > 4)
+        return false;
+      break;
+    }
+    case Instruction::Trunc:
+      // Check if this is a noop.
+      // trunc(sext ty1 to ty2) to ty1.
+      if (Instr->getType() == Ext->getOperand(0)->getType())
+        continue;
+    // FALL THROUGH.
+    default:
+      return false;
+    }
+
+    // At this point we can use the bfm family, so this extension is free
+    // for that use.
+  }
+  return true;
+}
+
  bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
                                            unsigned &RequiredAligment) const {
    if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
@@ -6184,13 +6702,22 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
    bool Fast;
    const Function *F = MF.getFunction();
    if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
-      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat) &&
+      !F->hasFnAttribute(Attribute::NoImplicitFloat) &&
        (memOpAlign(SrcAlign, DstAlign, 16) ||
-       (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
+       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
      return MVT::f128;
  
-  return Size >= 8 ? MVT::i64 : MVT::i32;
+  if (Size >= 8 &&
+      (memOpAlign(SrcAlign, DstAlign, 8) ||
+       (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast)))
+    return MVT::i64;
+
+  if (Size >= 4 &&
+      (memOpAlign(SrcAlign, DstAlign, 4) ||
+       (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast)))
+    return MVT::i32;
+
+  return MVT::Other;
  }
  
  // 12-bit optionally shifted immediates are legal for adds.
@@ -6341,7 +6868,7 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
    unsigned LZ = countLeadingZeros((uint64_t)Val);
    unsigned Shift = (63 - LZ) / 16;
    // MOVZ is free so return true for one or fewer MOVK.
-  return (Shift < 3) ? true : false;
+  return Shift < 3;
  }
  
  // Generate SUBS and CSEL for integer abs.
@@ -6396,7 +6923,7 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
    SDValue N0 = N->getOperand(0);
    unsigned Lg2 = Divisor.countTrailingZeros();
    SDValue Zero = DAG.getConstant(0, VT);
-  SDValue Pow2MinusOne = DAG.getConstant((1 << Lg2) - 1, VT);
+  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, VT);
  
    // Add (N0 < 0) ? Pow2 - 1 : 0;
    SDValue CCVal;
@@ -6458,6 +6985,15 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                             N->getOperand(0));
        }
      } else {
+      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+      APInt VNP1 = -Value + 1;
+      if (VNP1.isPowerOf2()) {
+        SDValue ShiftedVal =
+            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                        DAG.getConstant(VNP1.logBase2(), MVT::i64));
+        return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
+                           ShiftedVal);
+      }
        // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
        APInt VNM1 = -Value - 1;
        if (VNM1.isPowerOf2()) {
@@ -6468,15 +7004,6 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
              DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
          return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add);
        }
-      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
-      APInt VNP1 = -Value + 1;
-      if (VNP1.isPowerOf2()) {
-        SDValue ShiftedVal =
-            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
-                        DAG.getConstant(VNP1.logBase2(), MVT::i64));
-        return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
-                           ShiftedVal);
-      }
      }
    }
    return SDValue();
@@ -6501,14 +7028,14 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
        VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
      return SDValue();
  
-  // Now check that the other operand of the AND is a constant splat. We could
+  // Now check that the other operand of the AND is a constant. We could
    // make the transformation for non-constant splats as well, but it's unclear
    // that would be a benefit as it would not eliminate any operations, just
    // perform one more step in scalar code before moving to the vector unit.
    if (BuildVectorSDNode *BV =
            dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
-    // Bail out if the vector isn't a constant splat.
-    if (!BV->getConstantSplatNode())
+    // Bail out if the vector isn't a constant.
+    if (!BV->isConstant())
        return SDValue();
  
      // Everything checks out. Build up the new and improved node.
@@ -6528,7 +7055,8 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
    return SDValue();
  }
  
-static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
    // First try to optimize away the conversion when it's conditionally from
    // a constant. Vectors only.
    SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
@@ -6547,7 +7075,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
    // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
    // This eliminates an "integer-to-vector-move UOP and improve throughput.
    SDValue N0 = N->getOperand(0);
-  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+  if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
        // Do not change the width of a volatile load.
        !cast<LoadSDNode>(N0)->isVolatile()) {
      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
@@ -6770,21 +7298,54 @@ static SDValue performBitcastCombine(SDNode *N,
  static SDValue performConcatVectorsCombine(SDNode *N,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             SelectionDAG &DAG) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+
+  // Optimize concat_vectors of truncated vectors, where the intermediate
+  // type is illegal, to avoid said illegality,  e.g.,
+  //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
+  //                          (v2i16 (truncate (v2i64)))))
+  // ->
+  //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
+  //                                    (v4i32 (bitcast (v2i64))),
+  //                                    <0, 2, 4, 6>)))
+  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
+  // on both input and result type, so we might generate worse code.
+  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
+  if (N->getNumOperands() == 2 &&
+      N0->getOpcode() == ISD::TRUNCATE &&
+      N1->getOpcode() == ISD::TRUNCATE) {
+    SDValue N00 = N0->getOperand(0);
+    SDValue N10 = N1->getOperand(0);
+    EVT N00VT = N00.getValueType();
+
+    if (N00VT == N10.getValueType() &&
+        (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
+        N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
+      MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
+      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
+      for (size_t i = 0; i < Mask.size(); ++i)
+        Mask[i] = i * 2;
+      return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                         DAG.getVectorShuffle(
+                             MidVT, dl,
+                             DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
+                             DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
+    }
+  }
+
    // Wait 'til after everything is legalized to try this. That way we have
    // legal vector types and such.
    if (DCI.isBeforeLegalizeOps())
      return SDValue();
  
-  SDLoc dl(N);
-  EVT VT = N->getValueType(0);
-
    // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
    // splat. The indexed instructions are going to be expecting a DUPLANE64, so
    // canonicalise to that.
-  if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
+  if (N0 == N1 && VT.getVectorNumElements() == 2) {
      assert(VT.getVectorElementType().getSizeInBits() == 64);
-    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT,
-                       WidenVector(N->getOperand(0), DAG),
+    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
                         DAG.getConstant(0, MVT::i64));
    }
  
@@ -6797,10 +7358,9 @@ static SDValue performConcatVectorsCombine(SDNode *N,
    // becomes
    //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
  
-  SDValue Op1 = N->getOperand(1);
-  if (Op1->getOpcode() != ISD::BITCAST)
+  if (N1->getOpcode() != ISD::BITCAST)
      return SDValue();
-  SDValue RHS = Op1->getOperand(0);
+  SDValue RHS = N1->getOperand(0);
    MVT RHSTy = RHS.getValueType().getSimpleVT();
    // If the RHS is not a vector, this is not the pattern we're looking for.
    if (!RHSTy.isVector())
@@ -6810,10 +7370,10 @@ static SDValue performConcatVectorsCombine(SDNode *N,
  
    MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
                                    RHSTy.getVectorNumElements() * 2);
-  return DAG.getNode(
-      ISD::BITCAST, dl, VT,
-      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
-                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
+                                 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
+                                 RHS));
  }
  
  static SDValue tryCombineFixedPointConvert(SDNode *N,
@@ -7210,6 +7770,15 @@ static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
                       N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
  }
  
+static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
+                                           SelectionDAG &DAG) {
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), N->getValueType(0),
+                     DAG.getNode(Opc, SDLoc(N),
+                                 N->getOperand(1).getSimpleValueType(),
+                                 N->getOperand(1)),
+                     DAG.getConstant(0, MVT::i64));
+}
+
  static SDValue performIntrinsicCombine(SDNode *N,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const AArch64Subtarget *Subtarget) {
@@ -7222,6 +7791,18 @@ static SDValue performIntrinsicCombine(SDNode *N,
    case Intrinsic::aarch64_neon_vcvtfxu2fp:
      return tryCombineFixedPointConvert(N, DCI, DAG);
      break;
+  case Intrinsic::aarch64_neon_saddv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
+  case Intrinsic::aarch64_neon_uaddv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
+  case Intrinsic::aarch64_neon_sminv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
+  case Intrinsic::aarch64_neon_uminv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
+  case Intrinsic::aarch64_neon_smaxv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
+  case Intrinsic::aarch64_neon_umaxv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
    case Intrinsic::aarch64_neon_fmax:
      return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0),
                         N->getOperand(1), N->getOperand(2));
@@ -7308,11 +7889,11 @@ static SDValue performExtendCombine(SDNode *N,
    // If the vector type isn't a simple VT, it's beyond the scope of what
    // we're  worried about here. Let legalization do its thing and hope for
    // the best.
-  if (!ResVT.isSimple())
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src->getValueType(0);
+  if (!ResVT.isSimple() || !SrcVT.isSimple())
      return SDValue();
  
-  SDValue Src = N->getOperand(0);
-  MVT SrcVT = Src->getValueType(0).getSimpleVT();
    // If the source VT is a 64-bit vector, we can play games and get the
    // better results we want.
    if (SrcVT.getSizeInBits() != 64)
@@ -7336,9 +7917,9 @@ static SDValue performExtendCombine(SDNode *N,
    EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
                                 LoVT.getVectorNumElements());
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
-                   DAG.getIntPtrConstant(0));
+                   DAG.getConstant(0, MVT::i64));
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
-                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+                   DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64));
    Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
    Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
  
@@ -7419,14 +8000,13 @@ static SDValue performSTORECombine(SDNode *N,
      return SDValue();
  
    // Cyclone has bad performance on unaligned 16B stores when crossing line and
-  // page boundries. We want to split such stores.
+  // page boundaries. We want to split such stores.
    if (!Subtarget->isCyclone())
      return SDValue();
  
    // Don't split at Oz.
    MachineFunction &MF = DAG.getMachineFunction();
-  bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
-      AttributeSet::FunctionIndex, Attribute::MinSize);
+  bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
    if (IsMinSize)
      return SDValue();
  
@@ -7460,9 +8040,9 @@ static SDValue performSTORECombine(SDNode *N,
    EVT HalfVT =
        EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
    SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
-                                   DAG.getIntPtrConstant(0));
+                                   DAG.getConstant(0, MVT::i64));
    SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
-                                   DAG.getIntPtrConstant(NumElts));
+                                   DAG.getConstant(NumElts, MVT::i64));
    SDValue BasePtr = S->getBasePtr();
    SDValue NewST1 =
        DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
@@ -7536,6 +8116,13 @@ static SDValue performPostLD1Combine(SDNode *N,
        Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
      }
  
+    // Finally, check that the vector doesn't depend on the load.
+    // Again, this would create a cycle.
+    // The load depending on the vector is fine, as that's the case for the
+    // LD1*post we'll eventually generate anyway.
+    if (LoadSDN->isPredecessorOf(Vector.getNode()))
+      continue;
+
      SmallVector<SDValue, 8> Ops;
      Ops.push_back(LD->getOperand(0));  // Chain
      if (IsLaneOp) {
@@ -7546,14 +8133,14 @@ static SDValue performPostLD1Combine(SDNode *N,
      Ops.push_back(Inc);
  
      EVT Tys[3] = { VT, MVT::i64, MVT::Other };
-    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+    SDVTList SDTys = DAG.getVTList(Tys);
      unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
      SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
                                             MemVT,
                                             LoadSDN->getMemOperand());
  
      // Update the uses.
-    std::vector<SDValue> NewResults;
+    SmallVector<SDValue, 2> NewResults;
      NewResults.push_back(SDValue(LD, 0));             // The result of load
      NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
      DCI.CombineTo(LD, NewResults);
@@ -7676,7 +8263,7 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
        Tys[n] = VecTy;
      Tys[n++] = MVT::i64;  // Type of write back register
      Tys[n] = MVT::Other;  // Type of the chain
-    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
  
      MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
      SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
@@ -7697,10 +8284,272 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
    return SDValue();
  }
  
+// Checks to see if the value is the prescribed width and returns information
+// about its extension mode.
+static
+bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
+  ExtType = ISD::NON_EXTLOAD;
+  switch(V.getNode()->getOpcode()) {
+  default:
+    return false;
+  case ISD::LOAD: {
+    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
+    if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
+       || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
+      ExtType = LoadNode->getExtensionType();
+      return true;
+    }
+    return false;
+  }
+  case ISD::AssertSext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if ((TypeNode->getVT() == MVT::i8 && width == 8)
+       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+      ExtType = ISD::SEXTLOAD;
+      return true;
+    }
+    return false;
+  }
+  case ISD::AssertZext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if ((TypeNode->getVT() == MVT::i8 && width == 8)
+       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+      ExtType = ISD::ZEXTLOAD;
+      return true;
+    }
+    return false;
+  }
+  case ISD::Constant:
+  case ISD::TargetConstant: {
+    if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
+        1LL << (width - 1))
+      return true;
+    return false;
+  }
+  }
+
+  return true;
+}
+
+// This function does a whole lot of voodoo to determine if the tests are
+// equivalent without and with a mask. Essentially what happens is that given a
+// DAG resembling:
+//
+//  +-------------+ +-------------+ +-------------+ +-------------+
+//  |    Input    | | AddConstant | | CompConstant| |     CC      |
+//  +-------------+ +-------------+ +-------------+ +-------------+
+//           |           |           |               |
+//           V           V           |    +----------+
+//          +-------------+  +----+  |    |
+//          |     ADD     |  |0xff|  |    |
+//          +-------------+  +----+  |    |
+//                  |           |    |    |
+//                  V           V    |    |
+//                 +-------------+   |    |
+//                 |     AND     |   |    |
+//                 +-------------+   |    |
+//                      |            |    |
+//                      +-----+      |    |
+//                            |      |    |
+//                            V      V    V
+//                           +-------------+
+//                           |     CMP     |
+//                           +-------------+
+//
+// The AND node may be safely removed for some combinations of inputs. In
+// particular we need to take into account the extension type of the Input,
+// the exact values of AddConstant, CompConstant, and CC, along with the nominal
+// width of the input (this can work for any width inputs, the above graph is
+// specific to 8 bits.
+//
+// The specific equations were worked out by generating output tables for each
+// AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The
+// problem was simplified by working with 4 bit inputs, which means we only
+// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
+// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
+// patterns present in both extensions (0,7). For every distinct set of
+// AddConstant and CompConstants bit patterns we can consider the masked and
+// unmasked versions to be equivalent if the result of this function is true for
+// all 16 distinct bit patterns of for the current extension type of Input (w0).
+//
+//   sub      w8, w0, w1
+//   and      w10, w8, #0x0f
+//   cmp      w8, w2
+//   cset     w9, AArch64CC
+//   cmp      w10, w2
+//   cset     w11, AArch64CC
+//   cmp      w9, w11
+//   cset     w0, eq
+//   ret
+//
+// Since the above function shows when the outputs are equivalent it defines
+// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
+// would be expensive to run during compiles. The equations below were written
+// in a test harness that confirmed they gave equivalent outputs to the above
+// for all inputs function, so they can be used determine if the removal is
+// legal instead.
+//
+// isEquivalentMaskless() is the code for testing if the AND can be removed
+// factored out of the DAG recognition as the DAG can take several forms.
+
+static
+bool isEquivalentMaskless(unsigned CC, unsigned width,
+                          ISD::LoadExtType ExtType, signed AddConstant,
+                          signed CompConstant) {
+  // By being careful about our equations and only writing the in term
+  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
+  // make them generally applicable to all bit widths.
+  signed MaxUInt = (1 << width);
+
+  // For the purposes of these comparisons sign extending the type is
+  // equivalent to zero extending the add and displacing it by half the integer
+  // width. Provided we are careful and make sure our equations are valid over
+  // the whole range we can just adjust the input and avoid writing equations
+  // for sign extended inputs.
+  if (ExtType == ISD::SEXTLOAD)
+    AddConstant -= (1 << (width-1));
+
+  switch(CC) {
+  case AArch64CC::LE:
+  case AArch64CC::GT: {
+    if ((AddConstant == 0) ||
+        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
+        (AddConstant >= 0 && CompConstant < 0) ||
+        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
+      return true;
+  } break;
+  case AArch64CC::LT:
+  case AArch64CC::GE: {
+    if ((AddConstant == 0) ||
+        (AddConstant >= 0 && CompConstant <= 0) ||
+        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
+      return true;
+  } break;
+  case AArch64CC::HI:
+  case AArch64CC::LS: {
+    if ((AddConstant >= 0 && CompConstant < 0) ||
+       (AddConstant <= 0 && CompConstant >= -1 &&
+        CompConstant < AddConstant + MaxUInt))
+      return true;
+  } break;
+  case AArch64CC::PL:
+  case AArch64CC::MI: {
+    if ((AddConstant == 0) ||
+        (AddConstant > 0 && CompConstant <= 0) ||
+        (AddConstant < 0 && CompConstant <= AddConstant))
+      return true;
+  } break;
+  case AArch64CC::LO:
+  case AArch64CC::HS: {
+    if ((AddConstant >= 0 && CompConstant <= 0) ||
+        (AddConstant <= 0 && CompConstant >= 0 &&
+         CompConstant <= AddConstant + MaxUInt))
+      return true;
+  } break;
+  case AArch64CC::EQ:
+  case AArch64CC::NE: {
+    if ((AddConstant > 0 && CompConstant < 0) ||
+        (AddConstant < 0 && CompConstant >= 0 &&
+         CompConstant < AddConstant + MaxUInt) ||
+        (AddConstant >= 0 && CompConstant >= 0 &&
+         CompConstant >= AddConstant) ||
+        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
+
+      return true;
+  } break;
+  case AArch64CC::VS:
+  case AArch64CC::VC:
+  case AArch64CC::AL:
+  case AArch64CC::NV:
+    return true;
+  case AArch64CC::Invalid:
+    break;
+  }
+
+  return false;
+}
+
+static
+SDValue performCONDCombine(SDNode *N,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           SelectionDAG &DAG, unsigned CCIndex,
+                           unsigned CmpIndex) {
+  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
+  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
+  unsigned CondOpcode = SubsNode->getOpcode();
+
+  if (CondOpcode != AArch64ISD::SUBS)
+    return SDValue();
+
+  // There is a SUBS feeding this condition. Is it fed by a mask we can
+  // use?
+
+  SDNode *AndNode = SubsNode->getOperand(0).getNode();
+  unsigned MaskBits = 0;
+
+  if (AndNode->getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
+    uint32_t CNV = CN->getZExtValue();
+    if (CNV == 255)
+      MaskBits = 8;
+    else if (CNV == 65535)
+      MaskBits = 16;
+  }
+
+  if (!MaskBits)
+    return SDValue();
+
+  SDValue AddValue = AndNode->getOperand(0);
+
+  if (AddValue.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The basic dag structure is correct, grab the inputs and validate them.
+
+  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
+  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
+  SDValue SubsInputValue = SubsNode->getOperand(1);
+
+  // The mask is present and the provenance of all the values is a smaller type,
+  // lets see if the mask is superfluous.
+
+  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
+      !isa<ConstantSDNode>(SubsInputValue.getNode()))
+    return SDValue();
+
+  ISD::LoadExtType ExtType;
+
+  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
+    return SDValue();
+
+  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
+                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
+                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
+    return SDValue();
+
+  // The AND is not necessary, remove it.
+
+  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
+                               SubsNode->getValueType(1));
+  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
+
+  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
+  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
+
+  return SDValue(N, 0);
+}
+
  // Optimize compare with zero and branch.
  static SDValue performBRCONDCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      SelectionDAG &DAG) {
+  SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
+  if (NV.getNode())
+    N = NV.getNode();
    SDValue Chain = N->getOperand(0);
    SDValue Dest = N->getOperand(1);
    SDValue CCVal = N->getOperand(2);
@@ -7789,21 +8638,29 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
    SDValue N0 = N->getOperand(0);
    EVT ResVT = N->getValueType(0);
  
-  if (!N->getOperand(1).getValueType().isVector())
+  if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
      return SDValue();
  
-  if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
+  // If NumMaskElts == 0, the comparison is larger than select result. The
+  // largest real NEON comparison is 64-bits per lane, which means the result is
+  // at most 32-bits and an illegal vector. Just bail out for now.
+  EVT SrcVT = N0.getOperand(0).getValueType();
+
+  // Don't try to do this optimization when the setcc itself has i1 operands.
+  // There are no legal vectors of i1, so this would be pointless.
+  if (SrcVT == MVT::i1)
      return SDValue();
  
-  SDLoc DL(N0);
+  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
+  if (!ResVT.isVector() || NumMaskElts == 0)
+    return SDValue();
  
-  EVT SrcVT = N0.getOperand(0).getValueType();
-  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
-                           ResVT.getSizeInBits() / SrcVT.getSizeInBits());
+  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
    EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
  
    // First perform a vector comparison, where lane 0 is the one we're interested
    // in.
+  SDLoc DL(N0);
    SDValue LHS =
        DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
    SDValue RHS =
@@ -7813,8 +8670,8 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
    // Now duplicate the comparison mask we want across all other lanes.
    SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
    SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
-  Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
-                     Mask);
+  Mask = DAG.getNode(ISD::BITCAST, DL,
+                     ResVT.changeVectorElementTypeToInteger(), Mask);
  
    return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
  }
@@ -7834,7 +8691,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
      return performMulCombine(N, DAG, DCI, Subtarget);
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
-    return performIntToFpCombine(N, DAG);
+    return performIntToFpCombine(N, DAG, Subtarget);
    case ISD::OR:
      return performORCombine(N, DCI, Subtarget);
    case ISD::INTRINSIC_WO_CHAIN:
@@ -7855,6 +8712,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
      return performSTORECombine(N, DCI, DAG, Subtarget);
    case AArch64ISD::BRCOND:
      return performBRCONDCombine(N, DCI, DAG);
+  case AArch64ISD::CSEL:
+    return performCONDCombine(N, DCI, DAG, 2, 3);
    case AArch64ISD::DUP:
      return performPostLD1Combine(N, DCI, false);
    case ISD::INSERT_VECTOR_ELT:
@@ -8010,13 +8869,12 @@ bool AArch64TargetLowering::getPostIndexedAddressParts(
  
  static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG) {
-  if (N->getValueType(0) != MVT::i16)
-    return;
-
    SDLoc DL(N);
    SDValue Op = N->getOperand(0);
-  assert(Op.getValueType() == MVT::f16 &&
-         "Inconsistent bitcast? Only 16-bit types should be i16 or f16");
+
+  if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
+    return;
+
    Op = SDValue(
        DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
                           DAG.getUNDEF(MVT::i32), Op,
@@ -8042,17 +8900,14 @@ void AArch64TargetLowering::ReplaceNodeResults(
    }
  }
  
-bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
-  // Loads and stores less than 128-bits are already atomic; ones above that
-  // are doomed anyway, so defer to the default libcall and blame the OS when
-  // things go wrong:
-  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
-    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
-  else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
-    return LI->getType()->getPrimitiveSizeInBits() == 128;
+bool AArch64TargetLowering::useLoadStackGuardNode() const {
+  return true;
+}
  
-  // For the real atomic operations, we have ldxr/stxr up to 128 bits.
-  return Inst->getType()->getPrimitiveSizeInBits() <= 128;
+bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+  // reciprocal if there are three or more FDIVs.
+  return NumUsers > 2;
  }
  
  TargetLoweringBase::LegalizeTypeAction
@@ -8067,12 +8922,39 @@ AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
    return TargetLoweringBase::getPreferredVectorAction(VT);
  }
  
+// Loads and stores less than 128-bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong.
+bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
+  return Size == 128;
+}
+
+// Loads and stores less than 128-bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong.
+bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
+  return Size == 128;
+}
+
+// For the real atomic operations, we have ldxr/stxr up to 128 bits,
+TargetLoweringBase::AtomicRMWExpansionKind
+AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  return Size <= 128 ? AtomicRMWExpansionKind::LLSC
+                     : AtomicRMWExpansionKind::None;
+}
+
+bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const {
+  return true;
+}
+
  Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                               AtomicOrdering Ord) const {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
-  bool IsAcquire =
-      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsAcquire = isAtLeastAcquire(Ord);
  
    // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
    // intrinsic must return {i64, i64} and we have to recombine them into a
@@ -8107,8 +8989,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                                                     Value *Val, Value *Addr,
                                                     AtomicOrdering Ord) const {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  bool IsRelease =
-      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsRelease = isAtLeastRelease(Ord);
  
    // Since the intrinsics must have legal type, the i128 intrinsics take two
    // parameters: "i64, i64". We must marshal Val into the appropriate form
@@ -8135,3 +9016,8 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                  Val, Stxr->getFunctionType()->getParamType(0)),
        Addr);
  }
+
+bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
+    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+  return Ty->isArrayTy();
+}