setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
if (Subtarget->is64Bit()) {
- setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
} else {
- if (X86ScalarSSEf64) {
+ if (!UseSoftFloat && !NoImplicitFloat && X86ScalarSSEf64) {
// We have an impenetrably clever algorithm for ui64->double only.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
// We have faster algorithm for ui32->single only.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
- } else
+ } else {
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+ }
}
// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
// this operation.
setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
- // SSE has no i16 to fp conversion, only i32
- if (X86ScalarSSEf32) {
- setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
- // f32 and f64 cases are Legal, f80 case is not
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+
+ if (!UseSoftFloat && !NoImplicitFloat) {
+ // SSE has no i16 to fp conversion, only i32
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ }
} else {
- setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
}
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
- setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand);
setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
if (!UseSoftFloat && Subtarget->hasSSE2()) {
addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
- // FIXME: Unfortunately -soft-float means XMM registers cannot be used even
- // for integer operations.
+ // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
+ // registers cannot be used even for integer operations.
addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
+
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
if (Subtarget->is64Bit()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::STORE);
+ if (Subtarget->is64Bit())
+ setTargetDAGCombine(ISD::MUL);
computeRegisterProperties();
// FIXME: This turns off use of xmm stores for memset/memcpy on targets like
// linux. This is because the stack realignment code can't handle certain
// cases like PR2962. This should be removed when PR2962 is fixed.
- if (Subtarget->getStackAlignment() >= 16) {
+ if (!NoImplicitFloat && Subtarget->getStackAlignment() >= 16) {
if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
return MVT::v4i32;
if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
return MVT::i32;
}
-
/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
- assert(!(NumXMMRegs && UseSoftFloat) &&
+ assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloat) &&
"SSE register cannot be used when SSE is disabled!");
- if (UseSoftFloat || !Subtarget->hasSSE1()) {
+ if (UseSoftFloat || NoImplicitFloat || !Subtarget->hasSSE1())
// Kernel mode asks for SSE to be disabled, so don't push them
// on the stack.
TotalNumXMMRegs = 0;
- }
+
// For X86-64, if there are vararg parameters that are passed via
// registers, then we must store them to their spots on the stack so they
// may be loaded by deferencing the result of va_next.
// Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
// source words for the shuffle, to aid later transformations.
bool AllWordsInNewV = true;
+ bool InOrder[2] = { true, true };
for (unsigned i = 0; i != 8; ++i) {
int idx = MaskVals[i];
+ if (idx != (int)i)
+ InOrder[i/4] = false;
if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
continue;
AllWordsInNewV = false;
// If we've eliminated the use of V2, and the new mask is a pshuflw or
// pshufhw, that's as cheap as it gets. Return the new shuffle.
- if (pshufhw || pshuflw) {
+ if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
MaskV.clear();
for (unsigned i = 0; i != 8; ++i)
MaskV.push_back((MaskVals[i] < 0) ? DAG.getUNDEF(MVT::i16)
SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
SDValue InsElt;
-
+
+ // If Elt0 and Elt1 are defined, are consecutive, and can be load
+ // using a single extract together, load it and store it.
+ if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
+ InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+ DAG.getIntPtrConstant(Elt1 / 2));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+ DAG.getIntPtrConstant(i));
+ continue;
+ }
+
// If Elt1 is defined, extract it from the appropriate source. If the
- // source byte is not also odd, shift the extracted word left 8 bits.
+ // source byte is not also odd, shift the extracted word left 8 bits
+ // otherwise clear the bottom 8 bits if we need to do an or.
if (Elt1 >= 0) {
InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
DAG.getIntPtrConstant(Elt1 / 2));
if ((Elt1 & 1) == 0)
InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
DAG.getConstant(8, TLI.getShiftAmountTy()));
+ else if (Elt0 >= 0)
+ InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
+ DAG.getConstant(0xFF00, MVT::i16));
}
// If Elt0 is defined, extract it from the appropriate source. If the
// source byte is not also even, shift the extracted word right 8 bits. If
if ((Elt0 & 1) != 0)
InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
DAG.getConstant(8, TLI.getShiftAmountTy()));
+ else if (Elt1 >= 0)
+ InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
+ DAG.getConstant(0x00FF, MVT::i16));
InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
: InsElt0;
}
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
// FIXME there isn't really any debug info here, should come from the parent
DebugLoc dl = CP->getDebugLoc();
- SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(),
- getPointerTy(),
- CP->getAlignment());
+ SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+ CP->getAlignment());
Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
// With PIC, the address is actually $g + Offset.
if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
- StackSlot,
- PseudoSourceValue::getFixedStack(SSFI), 0);
+ StackSlot,
+ PseudoSourceValue::getFixedStack(SSFI), 0);
// Build the FILD
SDVTList Tys;
CV0.push_back(ConstantInt::get(APInt(32, 0)));
CV0.push_back(ConstantInt::get(APInt(32, 0)));
Constant *C0 = ConstantVector::get(CV0);
- SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 4);
+ SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
std::vector<Constant*> CV1;
CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL))));
CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
- SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 4);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
SmallVector<SDValue, 4> MaskVec;
MaskVec.push_back(DAG.getConstant(0, MVT::i32));
CV.push_back(C);
}
Constant *C = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
PseudoSourceValue::getConstantPool(), 0,
false, 16);
CV.push_back(C);
}
Constant *C = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
PseudoSourceValue::getConstantPool(), 0,
false, 16);
CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
}
Constant *C = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
PseudoSourceValue::getConstantPool(), 0,
false, 16);
CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
}
C = ConstantVector::get(CV);
- CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+ CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
PseudoSourceValue::getConstantPool(), 0,
false, 16);
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
-SDValue X86TargetLowering::EmitTest(SDValue Op, SelectionDAG &DAG) {
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+ SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
+ // CF and OF aren't always set the way we want. Determine which
+ // of these we need.
+ bool NeedCF = false;
+ bool NeedOF = false;
+ switch (X86CC) {
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ NeedCF = true;
+ break;
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO:
+ NeedOF = true;
+ break;
+ default: break;
+ }
+
// See if we can use the EFLAGS value from the operand instead of
- // doing a separate TEST.
- if (Op.getResNo() == 0) {
+ // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+ // we prove that the arithmetic won't overflow, we can't use OF or CF.
+ if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
unsigned Opcode = 0;
+ unsigned NumOperands = 0;
switch (Op.getNode()->getOpcode()) {
case ISD::ADD:
// Due to an isel shortcoming, be conservative if this add is likely to
UE = Op.getNode()->use_end(); UI != UE; ++UI)
if (UI->getOpcode() == ISD::STORE)
goto default_case;
- // An add of one will be selected as an INC.
if (ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1)))
+ dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+ // An add of one will be selected as an INC.
if (C->getAPIntValue() == 1) {
Opcode = X86ISD::INC;
+ NumOperands = 1;
+ break;
+ }
+ // An add of negative one (subtract of one) will be selected as a DEC.
+ if (C->getAPIntValue().isAllOnesValue()) {
+ Opcode = X86ISD::DEC;
+ NumOperands = 1;
break;
}
+ }
// Otherwise use a regular EFLAGS-setting add.
Opcode = X86ISD::ADD;
+ NumOperands = 2;
break;
case ISD::SUB:
// Due to the ISEL shortcoming noted above, be conservative if this sub is
UE = Op.getNode()->use_end(); UI != UE; ++UI)
if (UI->getOpcode() == ISD::STORE)
goto default_case;
- // A subtract of one will be selected as a DEC.
- if (ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1)))
- if (C->getAPIntValue() == 1) {
- Opcode = X86ISD::DEC;
- break;
- }
// Otherwise use a regular EFLAGS-setting sub.
Opcode = X86ISD::SUB;
+ NumOperands = 2;
break;
case X86ISD::ADD:
case X86ISD::SUB:
if (Opcode != 0) {
const MVT *VTs = DAG.getNodeValueTypes(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops;
- for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)
+ for (unsigned i = 0; i != NumOperands; ++i)
Ops.push_back(Op.getOperand(i));
- SDValue New = DAG.getNode(Opcode, dl, VTs, 2, &Ops[0], Ops.size());
+ SDValue New = DAG.getNode(Opcode, dl, VTs, 2, &Ops[0], NumOperands);
DAG.ReplaceAllUsesWith(Op, New);
return SDValue(New.getNode(), 1);
}
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
-SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ SelectionDAG &DAG) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
if (C->getAPIntValue() == 0)
- return EmitTest(Op0, DAG);
+ return EmitTest(Op0, X86CC, DAG);
DebugLoc dl = Op0.getDebugLoc();
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
- SDValue Cond = EmitCmp(Op0, Op1, DAG);
+ SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86CC, MVT::i8), Cond);
}
!isScalarFPTypeInSSEReg(VT)) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
- if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || Opc == X86ISD::BT) { // FIXME
+ if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+ Opc == X86ISD::BT) { // FIXME
Cond = Cmp;
addTest = false;
}
if (addTest) {
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- Cond = EmitTest(Cond, DAG);
+ Cond = EmitTest(Cond, X86::COND_NE, DAG);
}
const MVT *VTs = DAG.getNodeValueTypes(Op.getValueType(),
if (addTest) {
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- Cond = EmitTest(Cond, DAG);
+ Cond = EmitTest(Cond, X86::COND_NE, DAG);
}
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cond);
newMBB->addSuccessor(newMBB);
// Insert instructions into newMBB based on incoming instruction
- assert(bInstr->getNumOperands() < 8 && "unexpected number of operands");
+ assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+ "unexpected number of operands");
DebugLoc dl = bInstr->getDebugLoc();
MachineOperand& destOper = bInstr->getOperand(0);
- MachineOperand* argOpers[6];
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
int numArgs = bInstr->getNumOperands() - 1;
for (int i=0; i < numArgs; ++i)
argOpers[i] = &bInstr->getOperand(i+1);
// x86 address has 4 operands: base, index, scale, and displacement
- int lastAddrIndx = 3; // [0,3]
- int valArgIndx = 4;
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
+ int valArgIndx = lastAddrIndx + 1;
unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
DebugLoc dl = bInstr->getDebugLoc();
// Insert instructions into newMBB based on incoming instruction
// There are 8 "real" operands plus 9 implicit def/uses, ignored here.
- assert(bInstr->getNumOperands() < 18 && "unexpected number of operands");
+ assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
+ "unexpected number of operands");
MachineOperand& dest1Oper = bInstr->getOperand(0);
MachineOperand& dest2Oper = bInstr->getOperand(1);
- MachineOperand* argOpers[6];
- for (int i=0; i < 6; ++i)
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
+ for (int i=0; i < 2 + X86AddrNumOperands; ++i)
argOpers[i] = &bInstr->getOperand(i+2);
// x86 address has 4 operands: base, index, scale, and displacement
- int lastAddrIndx = 3; // [0,3]
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
tt2 = t2;
}
- assert((argOpers[4]->isReg() || argOpers[4]->isImm()) &&
+ int valArgIndx = lastAddrIndx + 1;
+ assert((argOpers[valArgIndx]->isReg() ||
+ argOpers[valArgIndx]->isImm()) &&
"invalid operand");
unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
- if (argOpers[4]->isReg())
+ if (argOpers[valArgIndx]->isReg())
MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
else
MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
if (regOpcL != X86::MOV32rr)
MIB.addReg(tt1);
- (*MIB).addOperand(*argOpers[4]);
- assert(argOpers[5]->isReg() == argOpers[4]->isReg());
- assert(argOpers[5]->isImm() == argOpers[4]->isImm());
- if (argOpers[5]->isReg())
+ (*MIB).addOperand(*argOpers[valArgIndx]);
+ assert(argOpers[valArgIndx + 1]->isReg() ==
+ argOpers[valArgIndx]->isReg());
+ assert(argOpers[valArgIndx + 1]->isImm() ==
+ argOpers[valArgIndx]->isImm());
+ if (argOpers[valArgIndx + 1]->isReg())
MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
else
MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
if (regOpcH != X86::MOV32rr)
MIB.addReg(tt2);
- (*MIB).addOperand(*argOpers[5]);
+ (*MIB).addOperand(*argOpers[valArgIndx + 1]);
MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
MIB.addReg(t1);
DebugLoc dl = mInstr->getDebugLoc();
// Insert instructions into newMBB based on incoming instruction
- assert(mInstr->getNumOperands() < 8 && "unexpected number of operands");
+ assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+ "unexpected number of operands");
MachineOperand& destOper = mInstr->getOperand(0);
- MachineOperand* argOpers[6];
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
int numArgs = mInstr->getNumOperands() - 1;
for (int i=0; i < numArgs; ++i)
argOpers[i] = &mInstr->getOperand(i+1);
// x86 address has 4 operands: base, index, scale, and displacement
- int lastAddrIndx = 3; // [0,3]
- int valArgIndx = 4;
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
+ int valArgIndx = lastAddrIndx + 1;
unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- DebugLoc dl = N->getDebugLoc();
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
SDValue Cond = N->getOperand(0);
-
+ // Get the LHS/RHS of the select.
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+
// If we have SSE[12] support, try to form min/max nodes.
if (Subtarget->hasSSE2() &&
- (N->getValueType(0) == MVT::f32 || N->getValueType(0) == MVT::f64)) {
- if (Cond.getOpcode() == ISD::SETCC) {
- // Get the LHS/RHS of the select.
- SDValue LHS = N->getOperand(1);
- SDValue RHS = N->getOperand(2);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
- unsigned Opcode = 0;
- if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
- switch (CC) {
- default: break;
- case ISD::SETOLE: // (X <= Y) ? X : Y -> min
- case ISD::SETULE:
- case ISD::SETLE:
- if (!UnsafeFPMath) break;
- // FALL THROUGH.
- case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min
- case ISD::SETLT:
- Opcode = X86ISD::FMIN;
- break;
+ (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
+ Cond.getOpcode() == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- case ISD::SETOGT: // (X > Y) ? X : Y -> max
- case ISD::SETUGT:
- case ISD::SETGT:
- if (!UnsafeFPMath) break;
- // FALL THROUGH.
- case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max
- case ISD::SETGE:
- Opcode = X86ISD::FMAX;
- break;
- }
- } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
- switch (CC) {
- default: break;
- case ISD::SETOGT: // (X > Y) ? Y : X -> min
- case ISD::SETUGT:
- case ISD::SETGT:
- if (!UnsafeFPMath) break;
- // FALL THROUGH.
- case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min
- case ISD::SETGE:
- Opcode = X86ISD::FMIN;
- break;
+ unsigned Opcode = 0;
+ if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOLE: // (X <= Y) ? X : Y -> min
+ case ISD::SETULE:
+ case ISD::SETLE:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min
+ case ISD::SETLT:
+ Opcode = X86ISD::FMIN;
+ break;
- case ISD::SETOLE: // (X <= Y) ? Y : X -> max
- case ISD::SETULE:
- case ISD::SETLE:
- if (!UnsafeFPMath) break;
- // FALL THROUGH.
- case ISD::SETOLT: // (X olt/lt Y) ? Y : X -> max
- case ISD::SETLT:
- Opcode = X86ISD::FMAX;
- break;
- }
+ case ISD::SETOGT: // (X > Y) ? X : Y -> max
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max
+ case ISD::SETGE:
+ Opcode = X86ISD::FMAX;
+ break;
}
+ } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOGT: // (X > Y) ? Y : X -> min
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min
+ case ISD::SETGE:
+ Opcode = X86ISD::FMIN;
+ break;
- if (Opcode)
- return DAG.getNode(Opcode, dl, N->getValueType(0), LHS, RHS);
+ case ISD::SETOLE: // (X <= Y) ? Y : X -> max
+ case ISD::SETULE:
+ case ISD::SETLE:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETOLT: // (X olt/lt Y) ? Y : X -> max
+ case ISD::SETLT:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
}
+ if (Opcode)
+ return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
+ // Don't do this for crazy integer types.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
+ // If this is efficiently invertible, canonicalize the LHSC/RHSC values
+ // so that TrueC (the true value) is larger than FalseC.
+ bool NeedsCondInvert = false;
+
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+ // Efficiently invertible.
+ (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
+ (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
+ isa<ConstantSDNode>(Cond.getOperand(1))))) {
+ NeedsCondInvert = true;
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
+ if (FalseC->getAPIntValue() == 0 &&
+ TrueC->getAPIntValue().isPowerOf2()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+ DAG.getConstant(ShAmt, MVT::i8));
+ }
+
+ // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ return Cond;
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
+static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ DebugLoc DL = N->getDebugLoc();
+
+ // If the flag operand isn't dead, don't touch this CMOV.
+ if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
+ return SDValue();
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations. Note that the operands are ordered the opposite of SELECT
+ // operands.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
+ // larger than FalseC (the false value).
+ X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
+ // This is efficient for any integer data type (including i8/i16) and
+ // shift amount.
+ if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(ShAmt, MVT::i8));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+ // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
+ // for any integer data type, including i8/i16.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+ }
+ }
+ }
return SDValue();
}
+
+/// PerformMulCombine - Optimize a single multiply with constant into two
+/// in order to implement it with two cheaper instructions, e.g.
+/// LEA + SHL, LEA + LEA.
+static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DAG.getMachineFunction().
+ getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+ return SDValue();
+
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ MVT VT = N->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ uint64_t MulAmt = C->getZExtValue();
+ if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+ return SDValue();
+
+ uint64_t MulAmt1 = 0;
+ uint64_t MulAmt2 = 0;
+ if ((MulAmt % 9) == 0) {
+ MulAmt1 = 9;
+ MulAmt2 = MulAmt / 9;
+ } else if ((MulAmt % 5) == 0) {
+ MulAmt1 = 5;
+ MulAmt2 = MulAmt / 5;
+ } else if ((MulAmt % 3) == 0) {
+ MulAmt1 = 3;
+ MulAmt2 = MulAmt / 3;
+ }
+ if (MulAmt2 &&
+ (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
+ DebugLoc DL = N->getDebugLoc();
+
+ if (isPowerOf2_64(MulAmt2) &&
+ !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+ // If second multiplifer is pow2, issue it first. We want the multiply by
+ // 3, 5, or 9 to be folded into the addressing mode unless the lone use
+ // is an add.
+ std::swap(MulAmt1, MulAmt2);
+
+ SDValue NewMul;
+ if (isPowerOf2_64(MulAmt1))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
+ else
+ NewMul = DAG.getNode(ISD::MUL, DL, VT, N->getOperand(0),
+ DAG.getConstant(MulAmt1, VT));
+
+ if (isPowerOf2_64(MulAmt2))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+ DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
+ else
+ NewMul = DAG.getNode(ISD::MUL, DL, VT, NewMul,
+ DAG.getConstant(MulAmt2, VT));
+
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, NewMul, false);
+ }
+ return SDValue();
+}
+
+
/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
/// when possible.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
SDValue ShAmtOp = N->getOperand(1);
MVT EltVT = VT.getVectorElementType();
- DebugLoc dl = N->getDebugLoc();
+ DebugLoc DL = N->getDebugLoc();
SDValue BaseShAmt;
if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
unsigned NumElts = VT.getVectorNumElements();
}
} else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
isSplatMask(ShAmtOp.getOperand(2).getNode())) {
- BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ShAmtOp,
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
DAG.getIntPtrConstant(0));
} else
return SDValue();
if (EltVT.bitsGT(MVT::i32))
- BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
+ BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
else if (EltVT.bitsLT(MVT::i32))
- BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BaseShAmt);
+ BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BaseShAmt);
// The shift amount is identical so we can do a vector shift.
SDValue ValOp = N->getOperand(0);
break;
case ISD::SHL:
if (VT == MVT::v2i64)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
ValOp, BaseShAmt);
if (VT == MVT::v4i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
ValOp, BaseShAmt);
if (VT == MVT::v8i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
ValOp, BaseShAmt);
break;
case ISD::SRA:
if (VT == MVT::v4i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
ValOp, BaseShAmt);
if (VT == MVT::v8i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
ValOp, BaseShAmt);
break;
case ISD::SRL:
if (VT == MVT::v2i64)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
ValOp, BaseShAmt);
if (VT == MVT::v4i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
ValOp, BaseShAmt);
if (VT == MVT::v8i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
ValOp, BaseShAmt);
break;
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget *Subtarget) {
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
// places to insert EMMS. This qualifies as a quick hack.
+
+ // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
StoreSDNode *St = cast<StoreSDNode>(N);
- if (St->getValue().getValueType().isVector() &&
- St->getValue().getValueType().getSizeInBits() == 64 &&
+ MVT VT = St->getValue().getValueType();
+ if (VT.getSizeInBits() != 64)
+ return SDValue();
+
+ bool F64IsLegal = !UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE2();
+ if ((VT.isVector() ||
+ (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
St->getChain().hasOneUse() && !St->isVolatile()) {
Ops.push_back(ChainVal->getOperand(i));
}
}
- if (Ld) {
- DebugLoc dl = N->getDebugLoc();
- // If we are a 64-bit capable x86, lower to a single movq load/store pair.
- if (Subtarget->is64Bit()) {
- SDValue NewLd = DAG.getLoad(MVT::i64, dl, Ld->getChain(),
- Ld->getBasePtr(), Ld->getSrcValue(),
- Ld->getSrcValueOffset(), Ld->isVolatile(),
- Ld->getAlignment());
- SDValue NewChain = NewLd.getValue(1);
- if (TokenFactorIndex != -1) {
- Ops.push_back(NewChain);
- NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Ops[0],
- Ops.size());
- }
- return DAG.getStore(NewChain, dl, NewLd, St->getBasePtr(),
- St->getSrcValue(), St->getSrcValueOffset(),
- St->isVolatile(), St->getAlignment());
- }
- // Otherwise, lower to two 32-bit copies.
- SDValue LoAddr = Ld->getBasePtr();
- SDValue HiAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, LoAddr,
- DAG.getConstant(4, MVT::i32));
+ if (!Ld || !ISD::isNormalLoad(Ld))
+ return SDValue();
- SDValue LoLd = DAG.getLoad(MVT::i32, dl, Ld->getChain(), LoAddr,
- Ld->getSrcValue(), Ld->getSrcValueOffset(),
- Ld->isVolatile(), Ld->getAlignment());
- SDValue HiLd = DAG.getLoad(MVT::i32, dl, Ld->getChain(), HiAddr,
- Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
- Ld->isVolatile(),
- MinAlign(Ld->getAlignment(), 4));
+ // If this is not the MMX case, i.e. we are just turning i64 load/store
+ // into f64 load/store, avoid the transformation if there are multiple
+ // uses of the loaded value.
+ if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+ return SDValue();
- SDValue NewChain = LoLd.getValue(1);
+ DebugLoc LdDL = Ld->getDebugLoc();
+ DebugLoc StDL = N->getDebugLoc();
+ // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+ // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+ // pair instead.
+ if (Subtarget->is64Bit() || F64IsLegal) {
+ MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+ SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getSrcValue(),
+ Ld->getSrcValueOffset(), Ld->isVolatile(),
+ Ld->getAlignment());
+ SDValue NewChain = NewLd.getValue(1);
if (TokenFactorIndex != -1) {
- Ops.push_back(LoLd);
- Ops.push_back(HiLd);
- NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Ops[0],
+ Ops.push_back(NewChain);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
Ops.size());
}
-
- LoAddr = St->getBasePtr();
- HiAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, LoAddr,
- DAG.getConstant(4, MVT::i32));
-
- SDValue LoSt = DAG.getStore(NewChain, dl, LoLd, LoAddr,
+ return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
St->getSrcValue(), St->getSrcValueOffset(),
St->isVolatile(), St->getAlignment());
- SDValue HiSt = DAG.getStore(NewChain, dl, HiLd, HiAddr,
- St->getSrcValue(),
- St->getSrcValueOffset() + 4,
- St->isVolatile(),
- MinAlign(St->getAlignment(), 4));
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoSt, HiSt);
}
+
+ // Otherwise, lower to two pairs of 32-bit loads / stores.
+ SDValue LoAddr = Ld->getBasePtr();
+ SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, MVT::i32));
+
+ SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+ Ld->getSrcValue(), Ld->getSrcValueOffset(),
+ Ld->isVolatile(), Ld->getAlignment());
+ SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+ Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
+ Ld->isVolatile(),
+ MinAlign(Ld->getAlignment(), 4));
+
+ SDValue NewChain = LoLd.getValue(1);
+ if (TokenFactorIndex != -1) {
+ Ops.push_back(LoLd);
+ Ops.push_back(HiLd);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+ Ops.size());
+ }
+
+ LoAddr = St->getBasePtr();
+ HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, MVT::i32));
+
+ SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
+ St->getSrcValue(), St->getSrcValueOffset(),
+ St->isVolatile(), St->getAlignment());
+ SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
+ St->getSrcValue(),
+ St->getSrcValueOffset() + 4,
+ St->isVolatile(),
+ MinAlign(St->getAlignment(), 4));
+ return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
}
return SDValue();
}
case ISD::BUILD_VECTOR:
return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
+ case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
+ case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);