setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
}
+ if (HasDivModLibcall) {
+ setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+ setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+ }
+
if (Subtarget->isThumb1Only())
addRegisterClass(MVT::i32, ARM::tGPRRegisterClass);
else
const {
DebugLoc dl = Op.getDebugLoc();
return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other,
- Op.getOperand(0), Op.getOperand(1));
+ Op.getOperand(0));
}
SDValue
assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
int index = ArgLocs[i].getValNo();
-
+
// Some Ins[] entries become multiple ArgLoc[] entries.
// Process them only once.
if (index != lastInsIndex)
AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
-
+
SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
&VTBLMask[0], 8));
- return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
+ return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
&VTBLMask[0], 8));
}
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
-static SDValue
+static SDValue
LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
// Convert to float
// float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
// Get reciprocal estimate.
// float4 recip = vrecpeq_f32(yf);
- Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
// Because char has a smaller range than uchar, we can actually get away
// without any newton steps. This requires that we use a weird bias
return X;
}
-static SDValue
+static SDValue
LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
SDValue N2;
// Convert to float.
N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
-
+
// Use reciprocal estimate and one refinement step.
// float4 recip = vrecpeq_f32(yf);
// recip *= vrecpsq_f32(yf, recip);
- N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
- N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
N1, N2);
N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2, N3;
-
+
if (VT == MVT::v8i8) {
N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
-
+
N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
DAG.getIntPtrConstant(4));
N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
- DAG.getIntPtrConstant(4));
+ DAG.getIntPtrConstant(4));
N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
DAG.getIntPtrConstant(0));
N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
N0 = LowerCONCAT_VECTORS(N0, DAG);
-
+
N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
return N0;
}
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2, N3;
-
+
if (VT == MVT::v8i8) {
N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
-
+
N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
DAG.getIntPtrConstant(4));
N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
- DAG.getIntPtrConstant(4));
+ DAG.getIntPtrConstant(4));
N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
DAG.getIntPtrConstant(0));
N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
DAG.getIntPtrConstant(0));
-
+
N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
-
+
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
N0 = LowerCONCAT_VECTORS(N0, DAG);
-
- N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
+
+ N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
N0);
return N0;
}
-
+
// v4i16 sdiv ... Convert to float.
// float4 yf = vcvt_f32_s32(vmovl_u16(y));
// float4 xf = vcvt_f32_s32(vmovl_u16(x));
// float4 recip = vrecpeq_f32(yf);
// recip *= vrecpsq_f32(yf, recip);
// recip *= vrecpsq_f32(yf, recip);
- N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
- N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
N1, N2);
N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
- N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+ N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
N1, N2);
N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
+ case ARM::ADCSSri:
+ case ARM::ADCSSrr:
+ case ARM::ADCSSrs:
+ case ARM::SBCSSri:
+ case ARM::SBCSSrr:
+ case ARM::SBCSSrs:
+ case ARM::RSBSri:
+ case ARM::RSBSrr:
+ case ARM::RSBSrs:
+ case ARM::RSCSri:
+ case ARM::RSCSrs: {
+ unsigned OldOpc = MI->getOpcode();
+ unsigned Opc = 0;
+ switch (OldOpc) {
+ case ARM::ADCSSrr:
+ Opc = ARM::ADCrr;
+ break;
+ case ARM::ADCSSri:
+ Opc = ARM::ADCri;
+ break;
+ case ARM::ADCSSrs:
+ Opc = ARM::ADCrs;
+ break;
+ case ARM::SBCSSrr:
+ Opc = ARM::SBCrr;
+ break;
+ case ARM::SBCSSri:
+ Opc = ARM::SBCri;
+ break;
+ case ARM::SBCSSrs:
+ Opc = ARM::SBCrs;
+ break;
+ case ARM::RSBSri:
+ Opc = ARM::RSBri;
+ break;
+ case ARM::RSBSrr:
+ Opc = ARM::RSBrr;
+ break;
+ case ARM::RSBSrs:
+ Opc = ARM::RSBrs;
+ break;
+ case ARM::RSCSri:
+ Opc = ARM::RSCri;
+ break;
+ case ARM::RSCSrs:
+ Opc = ARM::RSCrs;
+ break;
+ default:
+ llvm_unreachable("Unknown opcode?");
+ }
+
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(Opc));
+ for (unsigned i = 0; i < MI->getNumOperands(); ++i)
+ MIB.addOperand(MI->getOperand(i));
+ AddDefaultPred(MIB);
+ MIB.addReg(ARM::CPSR, RegState::Define); // S bit
+ MI->eraseFromParent();
+ return BB;
+ }
+
+
case ARM::tMOVCCr_pseudo: {
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
return SDValue();
}
+/// PerformVMULCombine
+/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
+/// special multiplier accumulator forwarding.
+///   vmul d3, d0, d2
+///   vmla d3, d1, d2
+/// is faster than
+///   vadd d3, d0, d1
+///   vmul d3, d3, d2
+///
+/// The add/sub may appear as either operand of the multiply; when it is the
+/// second operand the two operands are swapped so that N0 is always the
+/// add/sub node and N1 is the common multiplier.
+///
+/// \param N          The multiply node being combined.
+/// \param DCI        Combiner state; only its SelectionDAG is used here.
+/// \param Subtarget  Queried for hasVMLxForwarding(); the combine is skipped
+///                   when that returns false.
+/// \returns The distributed expression, or an empty SDValue when the
+///          subtarget check fails or neither operand is an ADD/SUB/FADD/FSUB.
+static SDValue PerformVMULCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasVMLxForwarding())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  unsigned Opcode = N0.getOpcode();
+  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
+    // The first operand is not an add/sub, so inspect the *second* operand.
+    // (Re-reading N0 here would repeat the test that just failed and make
+    // the swap below unreachable.)
+    Opcode = N1.getOpcode();
+    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+        Opcode != ISD::FADD && Opcode != ISD::FSUB)
+      return SDValue();
+    // Canonicalize: N0 is the add/sub, N1 is the shared multiplier.
+    std::swap(N0, N1);
+  }
+
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  // Rebuild as Opcode(N00 * N1, N01 * N1), keeping the original add/sub kind.
+  return DAG.getNode(Opcode, DL, VT,
+                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
+                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
+}
+
+
static SDValue PerformMULCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
return SDValue();
EVT VT = N->getValueType(0);
+ if (VT.is64BitVector() || VT.is128BitVector())
+ return PerformVMULCombine(N, DCI, Subtarget);
if (VT != MVT::i32)
return SDValue();
static SDValue PerformANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
-
+
// Attempt to use immediate-form VBIC
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
DebugLoc dl = N->getDebugLoc();
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
+ if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
+ if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
SDValue InDouble = N->getOperand(0);
if (InDouble.getOpcode() == ARMISD::VMOVDRR)
return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
+
+ // vmovrrd(load f64) -> (load i32), (load i32)
+ SDNode *InNode = InDouble.getNode();
+ if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
+ InNode->getValueType(0) == MVT::f64 &&
+ InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
+ !cast<LoadSDNode>(InNode)->isVolatile()) {
+ // TODO: Should this be done for non-FrameIndex operands?
+ LoadSDNode *LD = cast<LoadSDNode>(InNode);
+
+ SelectionDAG &DAG = DCI.DAG;
+ DebugLoc DL = LD->getDebugLoc();
+ SDValue BasePtr = LD->getBasePtr();
+ SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
+ LD->getPointerInfo(), LD->isVolatile(),
+ LD->isNonTemporal(), LD->getAlignment());
+
+ SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+ DAG.getConstant(4, MVT::i32));
+ SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
+ LD->getPointerInfo(), LD->isVolatile(),
+ LD->isNonTemporal(),
+ std::min(4U, LD->getAlignment() / 2));
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
+ SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
+ DCI.RemoveFromWorklist(LD);
+ DAG.DeleteNode(LD);
+ return Result;
+ }
+
return SDValue();
}
// Otherwise, the i64 value will be legalized to a pair of i32 values.
StoreSDNode *St = cast<StoreSDNode>(N);
SDValue StVal = St->getValue();
- if (!ISD::isNormalStore(St) || St->isVolatile() ||
- StVal.getValueType() != MVT::i64 ||
+ if (!ISD::isNormalStore(St) || St->isVolatile())
+ return SDValue();
+
+ if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
+ StVal.getNode()->hasOneUse() && !St->isVolatile()) {
+ SelectionDAG &DAG = DCI.DAG;
+ DebugLoc DL = St->getDebugLoc();
+ SDValue BasePtr = St->getBasePtr();
+ SDValue NewST1 = DAG.getStore(St->getChain(), DL,
+ StVal.getNode()->getOperand(0), BasePtr,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+
+ SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+ DAG.getConstant(4, MVT::i32));
+ return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1),
+ OffsetPtr, St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(),
+ std::min(4U, St->getAlignment() / 2));
+ }
+
+ if (StVal.getValueType() != MVT::i64 ||
StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
EVT VecTy;
if (isLoad)
VecTy = N->getValueType(0);
- else
+ else
VecTy = N->getOperand(AddrOpIdx+1).getValueType();
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
if (isLaneOp)
DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
break;
- }
+ }
return SDValue();
}