Fix typos in comments, NFC

[oota-llvm.git] / lib / Target / AArch64 / AArch64ISelLowering.cpp
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp

index fd9e69d32319c88f44dcc5dd98c4b26154bd7168..05a6ade726229dbd32159d2a5a6f23a4f7a80712 100644 (file)
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12,9 +12,9 @@
  //===----------------------------------------------------------------------===//
  
  #include "AArch64ISelLowering.h"
+#include "AArch64MachineFunctionInfo.h"
  #include "AArch64PerfectShuffle.h"
  #include "AArch64Subtarget.h"
-#include "AArch64MachineFunctionInfo.h"
  #include "AArch64TargetMachine.h"
  #include "AArch64TargetObjectFile.h"
  #include "MCTargetDesc/AArch64AddressingModes.h"
@@ -38,10 +38,12 @@ using namespace llvm;
  STATISTIC(NumTailCalls, "Number of tail calls");
  STATISTIC(NumShiftInserts, "Number of vector shift inserts");
  
+namespace {
  enum AlignMode {
    StrictAlign,
    NoStrictAlign
  };
+}
  
  static cl::opt<AlignMode>
  Align(cl::desc("Load/store alignment support"),
@@ -106,6 +108,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
      addDRTypeForNEON(MVT::v2i32);
      addDRTypeForNEON(MVT::v1i64);
      addDRTypeForNEON(MVT::v1f64);
+    addDRTypeForNEON(MVT::v4f16);
  
      addQRTypeForNEON(MVT::v4f32);
      addQRTypeForNEON(MVT::v2f64);
@@ -113,6 +116,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
      addQRTypeForNEON(MVT::v8i16);
      addQRTypeForNEON(MVT::v4i32);
      addQRTypeForNEON(MVT::v2i64);
+    addQRTypeForNEON(MVT::v8f16);
    }
  
    // Compute derived properties from the register classes
@@ -278,6 +282,94 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  
+  // f16 is storage-only, so we promote operations to f32 if we know this is
+  // valid, and ignore them otherwise. The operations not mentioned here will
+  // fail to select, but this is not a major problem as no source language
+  // should be emitting native f16 operations yet.
+  setOperationAction(ISD::FADD, MVT::f16, Promote);
+  setOperationAction(ISD::FDIV, MVT::f16, Promote);
+  setOperationAction(ISD::FMUL, MVT::f16, Promote);
+  setOperationAction(ISD::FSUB, MVT::f16, Promote);
+
+  // v4f16 is also a storage-only type, so promote it to v4f32 when that is
+  // known to be safe.
+  setOperationAction(ISD::FADD, MVT::v4f16, Promote);
+  setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
+  setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
+  setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
+  AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
+
+  // Expand all other v4f16 operations.
+  // FIXME: We could generate better code by promoting some operations to
+  // a pair of v4f32s
+  setOperationAction(ISD::FABS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
+
+
+  // v8f16 is also a storage-only type, so expand it.
+  setOperationAction(ISD::FABS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+
    // AArch64 has implementations of a lot of rounding-like FP operations.
    static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
    for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
@@ -305,6 +397,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
  
    // AArch64 does not have floating-point extending loads, i1 sign-extending
    // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
@@ -316,6 +409,10 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
    setTruncStoreAction(MVT::f128, MVT::f64, Expand);
    setTruncStoreAction(MVT::f128, MVT::f32, Expand);
    setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+
+  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+
    // Indexed loads and stores are supported.
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
@@ -472,16 +569,20 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
        setOperationAction(ISD::FROUND, Ty, Legal);
      }
    }
+
+  // Prefer likely predicted branches to selects on out-of-order cores.
+  if (Subtarget->isCortexA57())
+    PredictableSelectIsExpensive = true;
  }
  
  void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
-  if (VT == MVT::v2f32) {
+  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
      setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
      AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
  
      setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
      AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
-  } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
      setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
      AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
  
@@ -627,7 +728,7 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
  
  unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
    // FIXME: On AArch64, this depends on the type.
-  // Basically, the addressable offsets are o to 4095 * Ty.getSizeInBytes().
+  // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes().
    // and the offset has to be a multiple of the related size in bytes.
    return 4095;
  }
@@ -769,7 +870,8 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
    // EndBB:
    //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
  
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
    MachineFunction *MF = MBB->getParent();
    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    DebugLoc DL = MI->getDebugLoc();
@@ -823,8 +925,7 @@ AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  #ifndef NDEBUG
      MI->dump();
  #endif
-    assert(0 && "Unexpected instruction for custom inserter!");
-    break;
+    llvm_unreachable("Unexpected instruction for custom inserter!");
  
    case AArch64::F128CSEL:
      return EmitF128CSEL(MI, BB);
@@ -833,7 +934,6 @@ AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case TargetOpcode::PATCHPOINT:
      return emitPatchPoint(MI, BB);
    }
-  llvm_unreachable("Unexpected instruction for custom inserter!");
  }
  
  //===----------------------------------------------------------------------===//
@@ -1017,6 +1117,8 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
  
  static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                               SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+  SDValue Cmp;
+  AArch64CC::CondCode AArch64CC;
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
      EVT VT = RHS.getValueType();
      uint64_t C = RHSC->getZExtValue();
@@ -1071,9 +1173,45 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
        }
      }
    }
-
-  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
-  AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+  // For the i8 operand, the largest immediate is 255, so this can be easily
+  // encoded in the compare instruction. For the i16 operand, however, the
+  // largest immediate cannot be encoded in the compare.
+  // Therefore, use a sign extending load and cmn to avoid materializing the -1
+  // constant. For example,
+  // movz w1, #65535
+  // ldrh w0, [x0, #0]
+  // cmp w0, w1
+  // >
+  // ldrsh w0, [x0, #0]
+  // cmn w0, #1
+  // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
+  // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
+  // both the LHS and RHS are truely zero extended and to make sure the
+  // transformation is profitable.
+  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
+    if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
+        isa<LoadSDNode>(LHS)) {
+      if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+          cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+          LHS.getNode()->hasNUsesOfValue(1, 0)) {
+        int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+        if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+          SDValue SExt =
+              DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+                          DAG.getValueType(MVT::i16));
+          Cmp = emitComparison(SExt,
+                               DAG.getConstant(ValueofRHS, RHS.getValueType()),
+                               CC, dl, DAG);
+          AArch64CC = changeIntCCToAArch64CC(CC);
+          AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
+          return Cmp;
+        }
+      }
+    }
+  }
+  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC = changeIntCCToAArch64CC(CC);
    AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
    return Cmp;
  }
@@ -1273,7 +1411,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
    bool ExtraOp = false;
    switch (Op.getOpcode()) {
    default:
-    assert(0 && "Invalid code");
+    llvm_unreachable("Invalid code");
    case ISD::ADDC:
      Opc = AArch64ISD::ADDS;
      break;
@@ -1330,8 +1468,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
    SDLoc DL(Op);
    unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-  // The data thing is not used.
-  // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  
    bool IsStream = !Locality;
    // When the locality number is set
@@ -1346,6 +1483,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
  
    // built the mask value encoding the expected behavior.
    unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
+                   (!IsData << 3) |     // IsDataCache bit
                     (Locality << 1) |    // Cache level bits
                     (unsigned)IsStream;  // Stream bit
    return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
@@ -1387,24 +1525,25 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
    EVT InVT = Op.getOperand(0).getValueType();
    EVT VT = Op.getValueType();
  
-  // FP_TO_XINT conversion from the same type are legal.
-  if (VT.getSizeInBits() == InVT.getSizeInBits())
-    return Op;
-
-  if (InVT == MVT::v2f64 || InVT == MVT::v4f32) {
+  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
      SDLoc dl(Op);
      SDValue Cv =
          DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
                      Op.getOperand(0));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
-  } else if (InVT == MVT::v2f32) {
+  }
+
+  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
      SDLoc dl(Op);
-    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+    MVT ExtVT =
+        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                         VT.getVectorNumElements());
+    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
      return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
    }
  
    // Type changing conversions are illegal.
-  return SDValue();
+  return Op;
  }
  
  SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
@@ -1507,18 +1646,36 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
    StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0);
+    .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
  
    std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
    return CallResult.first;
  }
  
+static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
+  if (Op.getValueType() != MVT::f16)
+    return SDValue();
+
+  assert(Op.getOperand(0).getValueType() == MVT::i16);
+  SDLoc DL(Op);
+
+  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
+  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
+  return SDValue(
+      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
+                         DAG.getTargetConstant(AArch64::hsub, MVT::i32)),
+      0);
+}
+
+
  SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                                SelectionDAG &DAG) const {
    switch (Op.getOpcode()) {
    default:
      llvm_unreachable("unimplemented operand");
      return SDValue();
+  case ISD::BITCAST:
+    return LowerBITCAST(Op, DAG);
    case ISD::GlobalAddress:
      return LowerGlobalAddress(Op, DAG);
    case ISD::GlobalTLSAddress:
@@ -1624,8 +1781,7 @@ unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
  
  #include "AArch64GenCallingConv.inc"
  
-/// Selects the correct CCAssignFn for a the given CallingConvention
-/// value.
+/// Selects the correct CCAssignFn for a given CallingConvention value.
  CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                       bool IsVarArg) const {
    switch (CC) {
@@ -1650,8 +1806,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
  
    // Assign locations to all of the incoming arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
  
    // At this point, Ins[].VT may already be promoted to i32. To correctly
    // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
@@ -1715,6 +1871,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
          RC = &AArch64::GPR32RegClass;
        else if (RegVT == MVT::i64)
          RC = &AArch64::GPR64RegClass;
+      else if (RegVT == MVT::f16)
+        RC = &AArch64::FPR16RegClass;
        else if (RegVT == MVT::f32)
          RC = &AArch64::FPR32RegClass;
        else if (RegVT == MVT::f64 || RegVT.is64BitVector())
@@ -1753,7 +1911,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
      } else { // VA.isRegLoc()
        assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
        unsigned ArgOffset = VA.getLocMemOffset();
-      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
  
        uint32_t BEAlign = 0;
        if (ArgSize < 8 && !Subtarget->isLittleEndian())
@@ -1788,7 +1946,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
  
        ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
                                  MachinePointerInfo::getFixedStack(FI),
-                                MemVT, false, false, false, nullptr);
+                                MemVT, false, false, false, 0, nullptr);
  
        InVals.push_back(ArgValue);
      }
@@ -1920,8 +2078,8 @@ SDValue AArch64TargetLowering::LowerCallResult(
                            : RetCC_AArch64_AAPCS;
    // Assign locations to each value returned by this call.
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC);
  
    // Copy all of the result registers out of their specified physreg.
@@ -1990,6 +2148,19 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
      return false;
    }
  
+  // Externally-defined functions with weak linkage should not be
+  // tail-called on AArch64 when the OS does not support dynamic
+  // pre-emption of symbols, as the AAELF spec requires normal calls
+  // to undefined weak functions to be replaced with a NOP or jump to the
+  // next instruction. The behaviour of branch instructions in this
+  // situation (as used for tail calls) is implementation-defined, so we
+  // cannot rely on the linker replacing the tail call with a return.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    if (GV->hasExternalWeakLinkage())
+      return false;
+  }
+
    // Now we search for cases where we can use a tail call without changing the
    // ABI. Sibcall is used in some places (particularly gcc) to refer to this
    // concept.
@@ -2007,8 +2178,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
      // FIXME: for now we take the most conservative of these in both cases:
      // disallow all variadic memory operands.
      SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                   getTargetMachine(), ArgLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                   *DAG.getContext());
  
      CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -2020,13 +2191,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
    // results are returned in the same way as what the caller expects.
    if (!CCMatch) {
      SmallVector<CCValAssign, 16> RVLocs1;
-    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs1, *DAG.getContext());
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
+                    *DAG.getContext());
      CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
  
      SmallVector<CCValAssign, 16> RVLocs2;
-    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs2, *DAG.getContext());
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
+                    *DAG.getContext());
      CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
  
      if (RVLocs1.size() != RVLocs2.size())
@@ -2051,8 +2222,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
      return true;
  
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
  
    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
  
@@ -2149,8 +2320,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
  
    // Analyze operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
  
    if (IsVarArg) {
      // Handle fixed and variable vector arguments differently.
@@ -2295,7 +2466,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
        // common case. It should also work for fundamental types too.
        uint32_t BEAlign = 0;
        unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
-                                        : VA.getLocVT().getSizeInBits();
+                                        : VA.getValVT().getSizeInBits();
        OpSize = (OpSize + 7) / 8;
        if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
          if (OpSize < 8)
@@ -2329,8 +2500,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
              DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
          SDValue Cpy = DAG.getMemcpy(
              Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
-            /*isVolatile = */ false,
-            /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
+            /*isVol = */ false,
+            /*AlwaysInline = */ false, DstInfo, MachinePointerInfo());
  
          MemOpChains.push_back(Cpy);
        } else {
@@ -2419,7 +2590,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
  
    // Add a register mask operand representing the call-preserved registers.
    const uint32_t *Mask;
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
    const AArch64RegisterInfo *ARI =
        static_cast<const AArch64RegisterInfo *>(TRI);
    if (IsThisReturn) {
@@ -2473,7 +2645,7 @@ bool AArch64TargetLowering::CanLowerReturn(
                            ? RetCC_AArch64_WebKit_JS
                            : RetCC_AArch64_AAPCS;
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
    return CCInfo.CheckReturn(Outs, RetCC);
  }
  
@@ -2487,8 +2659,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                            ? RetCC_AArch64_WebKit_JS
                            : RetCC_AArch64_AAPCS;
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
    CCInfo.AnalyzeReturn(Outs, RetCC);
  
    // Copy the result values into the output registers.
@@ -2630,7 +2802,8 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
    // TLS calls preserve all registers except those that absolutely must be
    // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
    // silly).
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
    const AArch64RegisterInfo *ARI =
        static_cast<const AArch64RegisterInfo *>(TRI);
    const uint32_t *Mask = ARI->getTLSCallPreservedMask();
@@ -2680,7 +2853,8 @@ SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
    // TLS calls preserve all registers except those that absolutely must be
    // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
    // silly).
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
    const AArch64RegisterInfo *ARI =
        static_cast<const AArch64RegisterInfo *>(TRI);
    const uint32_t *Mask = ARI->getTLSCallPreservedMask();
@@ -2895,11 +3069,6 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
              isPowerOf2_64(LHS.getConstantOperandVal(1))) {
            SDValue Test = LHS.getOperand(0);
            uint64_t Mask = LHS.getConstantOperandVal(1);
-
-          // TBZ only operates on i64's, but the ext should be free.
-          if (Test.getValueType() == MVT::i32)
-            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
            return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
                               DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
          }
@@ -2915,18 +3084,29 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
              isPowerOf2_64(LHS.getConstantOperandVal(1))) {
            SDValue Test = LHS.getOperand(0);
            uint64_t Mask = LHS.getConstantOperandVal(1);
-
-          // TBNZ only operates on i64's, but the ext should be free.
-          if (Test.getValueType() == MVT::i32)
-            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
            return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
                               DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
          }
  
          return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
+      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
+        // Don't combine AND since emitComparison converts the AND to an ANDS
+        // (a.k.a. TST) and the test in the test bit and branch instruction
+        // becomes redundant.  This would also increase register pressure.
+        uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
+                           DAG.getConstant(Mask, MVT::i64), Dest);
        }
      }
+    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
+        LHS.getOpcode() != ISD::AND) {
+      // Don't combine AND since emitComparison converts the AND to an ANDS
+      // (a.k.a. TST) and the test in the test bit and branch instruction
+      // becomes redundant.  This would also increase register pressure.
+      uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
+                         DAG.getConstant(Mask, MVT::i64), Dest);
+    }
  
      SDValue CCVal;
      SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
@@ -3992,8 +4172,10 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint(
        return;
      case 'J': {
        uint64_t NVal = -C->getSExtValue();
-      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
+      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
+        CVal = C->getSExtValue();
          break;
+      }
        return;
      }
      // The K and L constraints apply *only* to logical immediates, including
@@ -4112,14 +4294,35 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
  // shuffle in combination with VEXTs.
  SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
                                                    SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
    SDLoc dl(Op);
    EVT VT = Op.getValueType();
    unsigned NumElts = VT.getVectorNumElements();
  
-  SmallVector<SDValue, 2> SourceVecs;
-  SmallVector<unsigned, 2> MinElts;
-  SmallVector<unsigned, 2> MaxElts;
+  struct ShuffleSourceInfo {
+    SDValue Vec;
+    unsigned MinElt;
+    unsigned MaxElt;
+
+    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
+    // be compatible with the shuffle we intend to construct. As a result
+    // ShuffleVec will be some sliding window into the original Vec.
+    SDValue ShuffleVec;
+
+    // Code should guarantee that element i in Vec starts at element "WindowBase
+    // + i * WindowScale in ShuffleVec".
+    int WindowBase;
+    int WindowScale;
+
+    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+    ShuffleSourceInfo(SDValue Vec)
+        : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
+          WindowScale(1) {}
+  };
  
+  // First gather all vectors used as an immediate source for this BUILD_VECTOR
+  // node.
+  SmallVector<ShuffleSourceInfo, 2> Sources;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.getOpcode() == ISD::UNDEF)
@@ -4130,122 +4333,153 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
        return SDValue();
      }
  
-    // Record this extraction against the appropriate vector if possible...
+    // Add this element source to the list if it's not already there.
      SDValue SourceVec = V.getOperand(0);
-    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
-    bool FoundSource = false;
-    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
-      if (SourceVecs[j] == SourceVec) {
-        if (MinElts[j] > EltNo)
-          MinElts[j] = EltNo;
-        if (MaxElts[j] < EltNo)
-          MaxElts[j] = EltNo;
-        FoundSource = true;
-        break;
-      }
-    }
+    auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
+    if (Source == Sources.end())
+      Sources.push_back(ShuffleSourceInfo(SourceVec));
  
-    // Or record a new source if not...
-    if (!FoundSource) {
-      SourceVecs.push_back(SourceVec);
-      MinElts.push_back(EltNo);
-      MaxElts.push_back(EltNo);
-    }
+    // Update the minimum and maximum lane number seen.
+    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+    Source->MinElt = std::min(Source->MinElt, EltNo);
+    Source->MaxElt = std::max(Source->MaxElt, EltNo);
    }
  
    // Currently only do something sane when at most two source vectors
-  // involved.
-  if (SourceVecs.size() > 2)
+  // are involved.
+  if (Sources.size() > 2)
      return SDValue();
  
-  SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
-  int VEXTOffsets[2] = { 0, 0 };
+  // Find out the smallest element size among result and two sources, and use
+  // it as element size to build the shuffle_vector.
+  EVT SmallestEltTy = VT.getVectorElementType();
+  for (auto &Source : Sources) {
+    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
+    if (SrcEltTy.bitsLT(SmallestEltTy)) {
+      SmallestEltTy = SrcEltTy;
+    }
+  }
+  unsigned ResMultiplier =
+      VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
+  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
+  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
+
+  // If the source vector is too wide or too narrow, we may nevertheless be able
+  // to construct a compatible shuffle either by concatenating it with UNDEF or
+  // extracting a suitable range of elements.
+  for (auto &Src : Sources) {
+    EVT SrcVT = Src.ShuffleVec.getValueType();
  
-  // This loop extracts the usage patterns of the source vectors
-  // and prepares appropriate SDValues for a shuffle if possible.
-  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
-    if (SourceVecs[i].getValueType() == VT) {
-      // No VEXT necessary
-      ShuffleSrcs[i] = SourceVecs[i];
-      VEXTOffsets[i] = 0;
+    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
        continue;
-    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
+
+    // This stage of the search produces a source with the same element type as
+    // the original, but with a total width matching the BUILD_VECTOR output.
+    EVT EltVT = SrcVT.getVectorElementType();
+    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+                                  VT.getSizeInBits() / EltVT.getSizeInBits());
+
+    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
+      assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
        // We can pad out the smaller vector for free, so if it's part of a
        // shuffle...
-      ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i],
-                                   DAG.getUNDEF(SourceVecs[i].getValueType()));
+      Src.ShuffleVec =
+          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
+                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
        continue;
      }
  
-    // Don't attempt to extract subvectors from BUILD_VECTOR sources
-    // that expand or trunc the original value.
-    // TODO: We can try to bitcast and ANY_EXTEND the result but
-    // we need to consider the cost of vector ANY_EXTEND, and the
-    // legality of all the types.
-    if (SourceVecs[i].getValueType().getVectorElementType() !=
-        VT.getVectorElementType())
-      return SDValue();
-
-    // Since only 64-bit and 128-bit vectors are legal on ARM and
-    // we've eliminated the other cases...
-    assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts &&
-           "unexpected vector sizes in ReconstructShuffle");
+    assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
  
-    if (MaxElts[i] - MinElts[i] >= NumElts) {
+    if (Src.MaxElt - Src.MinElt >= NumElts) {
        // Span too large for a VEXT to cope
        return SDValue();
      }
  
-    if (MinElts[i] >= NumElts) {
+    if (Src.MinElt >= NumElts) {
        // The extraction can just take the second half
-      VEXTOffsets[i] = NumElts;
-      ShuffleSrcs[i] =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
+      Src.ShuffleVec =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                        DAG.getIntPtrConstant(NumElts));
-    } else if (MaxElts[i] < NumElts) {
+      Src.WindowBase = -NumElts;
+    } else if (Src.MaxElt < NumElts) {
        // The extraction can just take the first half
-      VEXTOffsets[i] = 0;
-      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
-                                   SourceVecs[i], DAG.getIntPtrConstant(0));
+      Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
+                                   Src.ShuffleVec, DAG.getIntPtrConstant(0));
      } else {
        // An actual VEXT is needed
-      VEXTOffsets[i] = MinElts[i];
-      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
-                                     SourceVecs[i], DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
+                                     Src.ShuffleVec, DAG.getIntPtrConstant(0));
        SDValue VEXTSrc2 =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                        DAG.getIntPtrConstant(NumElts));
-      unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
-      ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
-                                   DAG.getConstant(Imm, MVT::i32));
+      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
+
+      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
+                                   VEXTSrc2, DAG.getConstant(Imm, MVT::i32));
+      Src.WindowBase = -Src.MinElt;
      }
    }
  
-  SmallVector<int, 8> Mask;
-
-  for (unsigned i = 0; i < NumElts; ++i) {
+  // Another possible incompatibility occurs from the vector element types. We
+  // can fix this by bitcasting the source vectors to the same type we intend
+  // for the shuffle.
+  for (auto &Src : Sources) {
+    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
+    if (SrcEltTy == SmallestEltTy)
+      continue;
+    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
+    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
+    Src.WindowBase *= Src.WindowScale;
+  }
+
+  // Final sanity check before we try to actually produce a shuffle.
+  DEBUG(
+    for (auto Src : Sources)
+      assert(Src.ShuffleVec.getValueType() == ShuffleVT);
+  );
+
+  // The stars all align, our next step is to produce the mask for the shuffle.
+  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
+  int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
+  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
      SDValue Entry = Op.getOperand(i);
-    if (Entry.getOpcode() == ISD::UNDEF) {
-      Mask.push_back(-1);
+    if (Entry.getOpcode() == ISD::UNDEF)
        continue;
-    }
  
-    SDValue ExtractVec = Entry.getOperand(0);
-    int ExtractElt =
-        cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
-    if (ExtractVec == SourceVecs[0]) {
-      Mask.push_back(ExtractElt - VEXTOffsets[0]);
-    } else {
-      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
-    }
+    auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
+    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
+
+    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
+    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
+    // segment.
+    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
+    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
+                               VT.getVectorElementType().getSizeInBits());
+    int LanesDefined = BitsDefined / BitsPerShuffleLane;
+
+    // This source is expected to fill ResMultiplier lanes of the final shuffle,
+    // starting at the appropriate offset.
+    int *LaneMask = &Mask[i * ResMultiplier];
+
+    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
+    ExtractBase += NumElts * (Src - Sources.begin());
+    for (int j = 0; j < LanesDefined; ++j)
+      LaneMask[j] = ExtractBase + j;
    }
  
    // Final check before we try to produce nonsense...
-  if (isShuffleMaskLegal(Mask, VT))
-    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
-                                &Mask[0]);
+  if (!isShuffleMaskLegal(Mask, ShuffleVT))
+    return SDValue();
  
-  return SDValue();
+  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+  for (unsigned i = 0; i < Sources.size(); ++i)
+    ShuffleOps[i] = Sources[i].ShuffleVec;
+
+  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+                                         ShuffleOps[1], &Mask[0]);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
  }
  
  // check if an EXT instruction can handle the shuffle mask when the
@@ -4574,7 +4808,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
          VT.getVectorElementType() == MVT::f32)
        return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
      // vrev <4 x i16> -> REV32
-    if (VT.getVectorElementType() == MVT::i16)
+    if (VT.getVectorElementType() == MVT::i16 ||
+        VT.getVectorElementType() == MVT::f16)
        return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
      // vrev <4 x i8> -> REV16
      assert(VT.getVectorElementType() == MVT::i8);
@@ -4694,7 +4929,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
  static unsigned getDUPLANEOp(EVT EltType) {
    if (EltType == MVT::i8)
      return AArch64ISD::DUPLANE8;
-  if (EltType == MVT::i16)
+  if (EltType == MVT::i16 || EltType == MVT::f16)
      return AArch64ISD::DUPLANE16;
    if (EltType == MVT::i32 || EltType == MVT::f32)
      return AArch64ISD::DUPLANE32;
@@ -4824,7 +5059,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
      SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
  
      EVT ScalarVT = VT.getVectorElementType();
-    if (ScalarVT.getSizeInBits() < 32)
+
+    if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
        ScalarVT = MVT::i32;
  
      return DAG.getNode(
@@ -5173,11 +5409,37 @@ FailedModImm:
    return Op;
  }
  
+// Normalize the operands of BUILD_VECTOR. The value of constant operands will
+// be truncated to fit element width.
+static SDValue NormalizeBuildVector(SDValue Op,
+                                    SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  EVT EltTy= VT.getVectorElementType();
+
+  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
+    return Op;
+
+  SmallVector<SDValue, 16> Ops;
+  for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
+    SDValue Lane = Op.getOperand(I);
+    if (Lane.getOpcode() == ISD::Constant) {
+      APInt LowBits(EltTy.getSizeInBits(),
+                    cast<ConstantSDNode>(Lane)->getZExtValue());
+      Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32);
+    }
+    Ops.push_back(Lane);
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+}
+
  SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                   SelectionDAG &DAG) const {
-  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
    SDLoc dl(Op);
    EVT VT = Op.getValueType();
+  Op = NormalizeBuildVector(Op, DAG);
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  
    APInt CnstBits(VT.getSizeInBits(), 0);
    APInt UndefBits(VT.getSizeInBits(), 0);
@@ -5548,19 +5810,21 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                        SelectionDAG &DAG) const {
    assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
  
-  // Check for non-constant lane.
-  if (!isa<ConstantSDNode>(Op.getOperand(2)))
+  // Check for non-constant or out of range lane.
+  EVT VT = Op.getOperand(0).getValueType();
+  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
      return SDValue();
  
-  EVT VT = Op.getOperand(0).getValueType();
  
    // Insertion/extraction are legal for V128 types.
    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
      return Op;
  
    if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
      return SDValue();
  
    // For V64 types, we perform insertion by expanding the value
@@ -5580,19 +5844,21 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
    assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
  
-  // Check for non-constant lane.
-  if (!isa<ConstantSDNode>(Op.getOperand(1)))
+  // Check for non-constant or out of range lane.
+  EVT VT = Op.getOperand(0).getValueType();
+  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
      return SDValue();
  
-  EVT VT = Op.getOperand(0).getValueType();
  
    // Insertion/extraction are legal for V128 types.
    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
      return Op;
  
    if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
      return SDValue();
  
    // For V64 types, we perform extraction by expanding the value
@@ -6126,7 +6392,7 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
        !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                         Attribute::NoImplicitFloat) &&
        (memOpAlign(SrcAlign, DstAlign, 16) ||
-       (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
+       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
      return MVT::f128;
  
    return Size >= 8 ? MVT::i64 : MVT::i32;
@@ -6321,6 +6587,48 @@ static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
    return performIntegerAbsCombine(N, DAG);
  }
  
+SDValue
+AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                     SelectionDAG &DAG,
+                                     std::vector<SDNode *> *Created) const {
+  // fold (sdiv X, pow2)
+  EVT VT = N->getValueType(0);
+  if ((VT != MVT::i32 && VT != MVT::i64) ||
+      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  unsigned Lg2 = Divisor.countTrailingZeros();
+  SDValue Zero = DAG.getConstant(0, VT);
+  SDValue Pow2MinusOne = DAG.getConstant((1 << Lg2) - 1, VT);
+
+  // Add (N0 < 0) ? Pow2 - 1 : 0;
+  SDValue CCVal;
+  SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
+  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+  SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
+
+  if (Created) {
+    Created->push_back(Cmp.getNode());
+    Created->push_back(Add.getNode());
+    Created->push_back(CSel.getNode());
+  }
+
+  // Divide by pow2.
+  SDValue SRA =
+      DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, MVT::i64));
+
+  // If we're dividing by a positive value, we're done.  Otherwise, we must
+  // negate the result.
+  if (Divisor.isNonNegative())
+    return SRA;
+
+  if (Created)
+    Created->push_back(SRA.getNode());
+  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), SRA);
+}
+
  static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const AArch64Subtarget *Subtarget) {
@@ -6335,32 +6643,107 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
      APInt Value = C->getAPIntValue();
      EVT VT = N->getValueType(0);
-    APInt VM1 = Value - 1;
-    if (VM1.isPowerOf2()) {
-      // Multiplying by one more than a power of two, replace with a shift
-      // and an add.
-      SDValue ShiftedVal =
-          DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
-                      DAG.getConstant(VM1.logBase2(), MVT::i64));
-      return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
-    }
-    APInt VP1 = Value + 1;
-    if (VP1.isPowerOf2()) {
-      // Multiplying by one less than a power of two, replace with a shift
-      // and a subtract.
-      SDValue ShiftedVal =
-          DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
-                      DAG.getConstant(VP1.logBase2(), MVT::i64));
-      return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+    if (Value.isNonNegative()) {
+      // (mul x, 2^N + 1) => (add (shl x, N), x)
+      APInt VM1 = Value - 1;
+      if (VM1.isPowerOf2()) {
+        SDValue ShiftedVal =
+            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                        DAG.getConstant(VM1.logBase2(), MVT::i64));
+        return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal,
+                           N->getOperand(0));
+      }
+      // (mul x, 2^N - 1) => (sub (shl x, N), x)
+      APInt VP1 = Value + 1;
+      if (VP1.isPowerOf2()) {
+        SDValue ShiftedVal =
+            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                        DAG.getConstant(VP1.logBase2(), MVT::i64));
+        return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal,
+                           N->getOperand(0));
+      }
+    } else {
+      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
+      APInt VNM1 = -Value - 1;
+      if (VNM1.isPowerOf2()) {
+        SDValue ShiftedVal =
+            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                        DAG.getConstant(VNM1.logBase2(), MVT::i64));
+        SDValue Add =
+            DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+        return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add);
+      }
+      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+      APInt VNP1 = -Value + 1;
+      if (VNP1.isPowerOf2()) {
+        SDValue ShiftedVal =
+            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                        DAG.getConstant(VNP1.logBase2(), MVT::i64));
+        return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
+                           ShiftedVal);
+      }
      }
    }
    return SDValue();
  }
  
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //       AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation, the operand of the
+  // unary operation isn't a bitwise AND, or if the sizes of the operations
+  // aren't the same.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
+      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant.
+    if (!BV->isConstant())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
  static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+  // First try to optimize away the conversion when it's conditionally from
+  // a constant. Vectors only.
+  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
+  if (Res != SDValue())
+    return Res;
+
    EVT VT = N->getValueType(0);
    if (VT != MVT::f32 && VT != MVT::f64)
      return SDValue();
+
    // Only optimize when the source and destination types have the same width.
    if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
      return SDValue();
@@ -6676,7 +7059,7 @@ static SDValue tryCombineFixedPointConvert(SDNode *N,
      else if (Vec.getValueType() == MVT::v2i64)
        VecResTy = MVT::v2f64;
      else
-      assert(0 && "unexpected vector type!");
+      llvm_unreachable("unexpected vector type!");
  
      SDValue Convert =
          DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
@@ -7009,7 +7392,7 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
    if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits)
      return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
                         DAG.getConstant(-ShiftAmount, MVT::i32));
-  else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits)
+  else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits)
      return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
                         DAG.getConstant(ShiftAmount, MVT::i32));
  
@@ -7130,11 +7513,11 @@ static SDValue performExtendCombine(SDNode *N,
    // If the vector type isn't a simple VT, it's beyond the scope of what
    // we're  worried about here. Let legalization do its thing and hope for
    // the best.
-  if (!ResVT.isSimple())
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src->getValueType(0);
+  if (!ResVT.isSimple() || !SrcVT.isSimple())
      return SDValue();
  
-  SDValue Src = N->getOperand(0);
-  MVT SrcVT = Src->getValueType(0).getSimpleVT();
    // If the source VT is a 64-bit vector, we can play games and get the
    // better results we want.
    if (SrcVT.getSizeInBits() != 64)
@@ -7368,7 +7751,7 @@ static SDValue performPostLD1Combine(SDNode *N,
      Ops.push_back(Inc);
  
      EVT Tys[3] = { VT, MVT::i64, MVT::Other };
-    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+    SDVTList SDTys = DAG.getVTList(Tys);
      unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
      SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
                                             MemVT,
@@ -7498,7 +7881,7 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
        Tys[n] = VecTy;
      Tys[n++] = MVT::i64;  // Type of write back register
      Tys[n] = MVT::Other;  // Type of the chain
-    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
  
      MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
      SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
@@ -7519,10 +7902,271 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
    return SDValue();
  }
  
+// Checks to see if the value is the prescribed width and returns information
+// about its extension mode.
+static
+bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
+  ExtType = ISD::NON_EXTLOAD;
+  switch(V.getNode()->getOpcode()) {
+  default:
+    return false;
+  case ISD::LOAD: {
+    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
+    if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
+       || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
+      ExtType = LoadNode->getExtensionType();
+      return true;
+    }
+    return false;
+  }
+  case ISD::AssertSext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if ((TypeNode->getVT() == MVT::i8 && width == 8)
+       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+      ExtType = ISD::SEXTLOAD;
+      return true;
+    }
+    return false;
+  }
+  case ISD::AssertZext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if ((TypeNode->getVT() == MVT::i8 && width == 8)
+       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+      ExtType = ISD::ZEXTLOAD;
+      return true;
+    }
+    return false;
+  }
+  case ISD::Constant:
+  case ISD::TargetConstant: {
+    if(abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < 1<<(width-1))
+      return true;
+    return false;
+  }
+  }
+
+  return true;
+}
+
+// This function does a whole lot of voodoo to determine if the tests are
+// equivalent without and with a mask. Essentially what happens is that given a
+// DAG resembling:
+//
+//  +-------------+ +-------------+ +-------------+ +-------------+
+//  |    Input    | | AddConstant | | CompConstant| |     CC      |
+//  +-------------+ +-------------+ +-------------+ +-------------+
+//           |           |           |               |
+//           V           V           |    +----------+
+//          +-------------+  +----+  |    |
+//          |     ADD     |  |0xff|  |    |
+//          +-------------+  +----+  |    |
+//                  |           |    |    |
+//                  V           V    |    |
+//                 +-------------+   |    |
+//                 |     AND     |   |    |
+//                 +-------------+   |    |
+//                      |            |    |
+//                      +-----+      |    |
+//                            |      |    |
+//                            V      V    V
+//                           +-------------+
+//                           |     CMP     |
+//                           +-------------+
+//
+// The AND node may be safely removed for some combinations of inputs. In
+// particular we need to take into account the extension type of the Input,
+// the exact values of AddConstant, CompConstant, and CC, along with the nominal
+// width of the input (this can work for any width inputs, the above graph is
+// specific to 8 bits.
+//
+// The specific equations were worked out by generating output tables for each
+// AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The
+// problem was simplified by working with 4 bit inputs, which means we only
+// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
+// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
+// patterns present in both extensions (0,7). For every distinct set of
+// AddConstant and CompConstants bit patterns we can consider the masked and
+// unmasked versions to be equivalent if the result of this function is true for
+// all 16 distinct bit patterns of for the current extension type of Input (w0).
+//
+//   sub      w8, w0, w1
+//   and      w10, w8, #0x0f
+//   cmp      w8, w2
+//   cset     w9, AArch64CC
+//   cmp      w10, w2
+//   cset     w11, AArch64CC
+//   cmp      w9, w11
+//   cset     w0, eq
+//   ret
+//
+// Since the above function shows when the outputs are equivalent it defines
+// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
+// would be expensive to run during compiles. The equations below were written
+// in a test harness that confirmed they gave equivalent outputs to the above
+// for all inputs function, so they can be used determine if the removal is
+// legal instead.
+//
+// isEquivalentMaskless() is the code for testing if the AND can be removed
+// factored out of the DAG recognition as the DAG can take several forms.
+
+static
+bool isEquivalentMaskless(unsigned CC, unsigned width,
+                          ISD::LoadExtType ExtType, signed AddConstant,
+                          signed CompConstant) {
+  // By being careful about our equations and only writing the in term
+  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
+  // make them generally applicable to all bit widths.
+  signed MaxUInt = (1 << width);
+
+  // For the purposes of these comparisons sign extending the type is
+  // equivalent to zero extending the add and displacing it by half the integer
+  // width. Provided we are careful and make sure our equations are valid over
+  // the whole range we can just adjust the input and avoid writing equations
+  // for sign extended inputs.
+  if (ExtType == ISD::SEXTLOAD)
+    AddConstant -= (1 << (width-1));
+
+  switch(CC) {
+  case AArch64CC::LE:
+  case AArch64CC::GT: {
+    if ((AddConstant == 0) ||
+        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
+        (AddConstant >= 0 && CompConstant < 0) ||
+        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
+      return true;
+  } break;
+  case AArch64CC::LT:
+  case AArch64CC::GE: {
+    if ((AddConstant == 0) ||
+        (AddConstant >= 0 && CompConstant <= 0) ||
+        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
+      return true;
+  } break;
+  case AArch64CC::HI:
+  case AArch64CC::LS: {
+    if ((AddConstant >= 0 && CompConstant < 0) ||
+       (AddConstant <= 0 && CompConstant >= -1 &&
+        CompConstant < AddConstant + MaxUInt))
+      return true;
+  } break;
+  case AArch64CC::PL:
+  case AArch64CC::MI: {
+    if ((AddConstant == 0) ||
+        (AddConstant > 0 && CompConstant <= 0) ||
+        (AddConstant < 0 && CompConstant <= AddConstant))
+      return true;
+  } break;
+  case AArch64CC::LO:
+  case AArch64CC::HS: {
+    if ((AddConstant >= 0 && CompConstant <= 0) ||
+        (AddConstant <= 0 && CompConstant >= 0 &&
+         CompConstant <= AddConstant + MaxUInt))
+      return true;
+  } break;
+  case AArch64CC::EQ:
+  case AArch64CC::NE: {
+    if ((AddConstant > 0 && CompConstant < 0) ||
+        (AddConstant < 0 && CompConstant >= 0 &&
+         CompConstant < AddConstant + MaxUInt) ||
+        (AddConstant >= 0 && CompConstant >= 0 &&
+         CompConstant >= AddConstant) ||
+        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
+
+      return true;
+  } break;
+  case AArch64CC::VS:
+  case AArch64CC::VC:
+  case AArch64CC::AL:
+  case AArch64CC::NV:
+    return true;
+  case AArch64CC::Invalid:
+    break;
+  }
+
+  return false;
+}
+
+static
+SDValue performCONDCombine(SDNode *N,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           SelectionDAG &DAG, unsigned CCIndex,
+                           unsigned CmpIndex) {
+  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
+  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
+  unsigned CondOpcode = SubsNode->getOpcode();
+
+  if (CondOpcode != AArch64ISD::SUBS)
+    return SDValue();
+
+  // There is a SUBS feeding this condition. Is it fed by a mask we can
+  // use?
+
+  SDNode *AndNode = SubsNode->getOperand(0).getNode();
+  unsigned MaskBits = 0;
+
+  if (AndNode->getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
+    uint32_t CNV = CN->getZExtValue();
+    if (CNV == 255)
+      MaskBits = 8;
+    else if (CNV == 65535)
+      MaskBits = 16;
+  }
+
+  if (!MaskBits)
+    return SDValue();
+
+  SDValue AddValue = AndNode->getOperand(0);
+
+  if (AddValue.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The basic dag structure is correct, grab the inputs and validate them.
+
+  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
+  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
+  SDValue SubsInputValue = SubsNode->getOperand(1);
+
+  // The mask is present and the provenance of all the values is a smaller type,
+  // lets see if the mask is superfluous.
+
+  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
+      !isa<ConstantSDNode>(SubsInputValue.getNode()))
+    return SDValue();
+
+  ISD::LoadExtType ExtType;
+
+  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
+    return SDValue();
+
+  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
+                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
+                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
+    return SDValue();
+
+  // The AND is not necessary, remove it.
+
+  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
+                               SubsNode->getValueType(1));
+  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
+
+  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
+  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
+
+  return SDValue(N, 0);
+}
+
  // Optimize compare with zero and branch.
  static SDValue performBRCONDCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      SelectionDAG &DAG) {
+  SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
+  if (NV.getNode())
+    N = NV.getNode();
    SDValue Chain = N->getOperand(0);
    SDValue Dest = N->getOperand(1);
    SDValue CCVal = N->getOperand(2);
@@ -7611,21 +8255,23 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
    SDValue N0 = N->getOperand(0);
    EVT ResVT = N->getValueType(0);
  
-  if (!N->getOperand(1).getValueType().isVector())
-    return SDValue();
-
    if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
      return SDValue();
  
-  SDLoc DL(N0);
-
+  // If NumMaskElts == 0, the comparison is larger than select result. The
+  // largest real NEON comparison is 64-bits per lane, which means the result is
+  // at most 32-bits and an illegal vector. Just bail out for now.
    EVT SrcVT = N0.getOperand(0).getValueType();
-  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
-                           ResVT.getSizeInBits() / SrcVT.getSizeInBits());
+  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
+  if (!ResVT.isVector() || NumMaskElts == 0)
+    return SDValue();
+
+  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
    EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
  
    // First perform a vector comparison, where lane 0 is the one we're interested
    // in.
+  SDLoc DL(N0);
    SDValue LHS =
        DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
    SDValue RHS =
@@ -7635,8 +8281,8 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
    // Now duplicate the comparison mask we want across all other lanes.
    SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
    SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
-  Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
-                     Mask);
+  Mask = DAG.getNode(ISD::BITCAST, DL,
+                     ResVT.changeVectorElementTypeToInteger(), Mask);
  
    return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
  }
@@ -7677,6 +8323,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
      return performSTORECombine(N, DCI, DAG, Subtarget);
    case AArch64ISD::BRCOND:
      return performBRCONDCombine(N, DCI, DAG);
+  case AArch64ISD::CSEL:
+    return performCONDCombine(N, DCI, DAG, 2, 3);
    case AArch64ISD::DUP:
      return performPostLD1Combine(N, DCI, false);
    case ISD::INSERT_VECTOR_ELT:
@@ -7830,11 +8478,32 @@ bool AArch64TargetLowering::getPostIndexedAddressParts(
    return true;
  }
  
+static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                  SelectionDAG &DAG) {
+  if (N->getValueType(0) != MVT::i16)
+    return;
+
+  SDLoc DL(N);
+  SDValue Op = N->getOperand(0);
+  assert(Op.getValueType() == MVT::f16 &&
+         "Inconsistent bitcast? Only 16-bit types should be i16 or f16");
+  Op = SDValue(
+      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
+                         DAG.getUNDEF(MVT::i32), Op,
+                         DAG.getTargetConstant(AArch64::hsub, MVT::i32)),
+      0);
+  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
+  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
+}
+
  void AArch64TargetLowering::ReplaceNodeResults(
      SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
    switch (N->getOpcode()) {
    default:
      llvm_unreachable("Don't know how to custom expand this");
+  case ISD::BITCAST:
+    ReplaceBITCASTResults(N, Results, DAG);
+    return;
    case ISD::FP_TO_UINT:
    case ISD::FP_TO_SINT:
      assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
@@ -7856,12 +8525,27 @@ bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
    return Inst->getType()->getPrimitiveSizeInBits() <= 128;
  }
  
+bool AArch64TargetLowering::useLoadStackGuardNode() const {
+  return true;
+}
+
+TargetLoweringBase::LegalizeTypeAction
+AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
+  MVT SVT = VT.getSimpleVT();
+  // During type legalization, we prefer to widen v1i8, v1i16, v1i32  to v8i8,
+  // v4i16, v2i32 instead of to promote.
+  if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
+      || SVT == MVT::v1f32)
+    return TypeWidenVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
  Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                               AtomicOrdering Ord) const {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
-  bool IsAcquire =
-      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsAcquire = isAtLeastAcquire(Ord);
  
    // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
    // intrinsic must return {i64, i64} and we have to recombine them into a
@@ -7896,8 +8580,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                                                     Value *Val, Value *Addr,
                                                     AtomicOrdering Ord) const {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  bool IsRelease =
-      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsRelease = isAtLeastRelease(Ord);
  
    // Since the intrinsics must have legal type, the i128 intrinsics take two
    // parameters: "i64, i64". We must marshal Val into the appropriate form