X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=724899b00eee3606d3f0563b3d84b33b933d2199;hb=fca82deecb1e1909a97db41cf324bf70e39af1e5;hp=d56e672e1f82e615a758e136de084d321cc143db;hpb=076aee32e86bc4a0c096262b3261923f25220dc6;p=oota-llvm.git

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d56e672e1f8..724899b00ee 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -113,31 +113,38 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 
   if (Subtarget->is64Bit()) {
-    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
   } else {
-    if (X86ScalarSSEf64) {
+    if (!UseSoftFloat && !NoImplicitFloat && X86ScalarSSEf64) {
       // We have an impenetrably clever algorithm for ui64->double only.
       setOperationAction(ISD::UINT_TO_FP   , MVT::i64  , Custom);
       // We have a faster algorithm for ui32->single only.
       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
-    } else
+    } else {
       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
+    }
   }
 
   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
   // this operation.
   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
-  // SSE has no i16 to fp conversion, only i32
-  if (X86ScalarSSEf32) {
-    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
-    // f32 and f64 cases are Legal, f80 case is not
-    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+
+  if (!UseSoftFloat && !NoImplicitFloat) {
+    // SSE has no i16 to fp conversion, only i32
+    if (X86ScalarSSEf32) {
+      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Promote);
+      // f32 and f64 cases are Legal, f80 case is not
+      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
+    } else {
+      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Custom);
+      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
+    }
   } else {
-    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
-    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
   }
 
   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
@@ -654,7 +661,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
 
-    setTruncStoreAction(MVT::v8i16,             MVT::v8i8, Expand);
+    setTruncStoreAction(MVT::v8i16,             MVT::v8i8,  Expand);
     setOperationAction(ISD::TRUNCATE,           MVT::v8i8,  Expand);
     setOperationAction(ISD::SELECT,             MVT::v8i8,  Promote);
     setOperationAction(ISD::SELECT,             MVT::v4i16, Promote);
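The UINT_TO_FP actions above exist because x86 has no unsigned integer-to-float instruction; a standalone sketch (plain C++, not LLVM API) of why the i32 case can simply be Promoted through a wider signed convert, while a naive signed reinterpretation is wrong:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t u = 0x80000000u;            // 2^31: too big for a signed i32 convert
  double wrong = (double)(int32_t)u;   // reinterpret as signed: -2147483648.0
  double right = (double)(int64_t)u;   // what Promote to i64 SINT_TO_FP does
  printf("%.1f %.1f\n", wrong, right); // -2147483648.0 2147483648.0
  return 0;
}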
@@ -682,8 +689,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   if (!UseSoftFloat && Subtarget->hasSSE2()) {
     addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
 
-    // FIXME: Unfortunately -soft-float means XMM registers cannot be used even
-    // for integer operations.
+    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
+    // registers cannot be used even for integer operations.
     addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
     addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
     addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
@@ -727,12 +734,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
+
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
     if (Subtarget->is64Bit()) {
       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
@@ -815,6 +824,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
   setTargetDAGCombine(ISD::STORE);
+  if (Subtarget->is64Bit())
+    setTargetDAGCombine(ISD::MUL);
 
   computeRegisterProperties();
 
@@ -888,7 +899,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
   // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
   // linux.  This is because the stack realignment code can't handle certain
   // cases like PR2962.  This should be removed when PR2962 is fixed.
-  if (Subtarget->getStackAlignment() >= 16) {
+  if (!NoImplicitFloat && Subtarget->getStackAlignment() >= 16) {
     if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
       return MVT::v4i32;
     if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
@@ -899,7 +910,6 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
   return MVT::i32;
 }
 
-
 /// getPICJumpTableRelocBase - Returns relocation base for the given PIC
 /// jumptable.
 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
@@ -1434,13 +1444,13 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
 
     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
            "SSE register cannot be used when SSE is disabled!");
-    assert(!(NumXMMRegs && UseSoftFloat) &&
+    assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloat) &&
           "SSE register cannot be used when SSE is disabled!");
-    if (UseSoftFloat || !Subtarget->hasSSE1()) {
+    if (UseSoftFloat || NoImplicitFloat || !Subtarget->hasSSE1())
       // Kernel mode asks for SSE to be disabled, so don't push them
       // on the stack.
       TotalNumXMMRegs = 0;
-    }
+
     // For X86-64, if there are vararg parameters that are passed via
     // registers, then we must store them to their spots on the stack so they
     // may be loaded by dereferencing the result of va_next.
@@ -3630,8 +3640,11 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
   // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
   // source words for the shuffle, to aid later transformations.
   bool AllWordsInNewV = true;
+  bool InOrder[2] = { true, true };
   for (unsigned i = 0; i != 8; ++i) {
     int idx = MaskVals[i];
+    if (idx != (int)i)
+      InOrder[i/4] = false;
    if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
       continue;
     AllWordsInNewV = false;
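The InOrder[] flags added above guard the pshuflw/pshufhw shortcut taken in the next hunk: each of those instructions permutes only one half of the vector and passes the other half through unchanged, so the pass-through half must already be in identity order. A scalar model of PSHUFLW (illustrative sketch, not LLVM code):

#include <cstdint>

// PSHUFLW: permute the low four i16 lanes by 2-bit selectors in imm;
// the high four lanes are copied through unchanged.
void pshuflw_model(const uint16_t in[8], uint16_t out[8], uint8_t imm) {
  for (int i = 0; i < 4; ++i)
    out[i] = in[(imm >> (2 * i)) & 3];
  for (int i = 4; i < 8; ++i)
    out[i] = in[i];                    // untouched half must be in order
}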
@@ -3658,7 +3671,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
 
   // If we've eliminated the use of V2, and the new mask is a pshuflw or
   // pshufhw, that's as cheap as it gets.  Return the new shuffle.
-  if (pshufhw || pshuflw) {
+  if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
     MaskV.clear();
     for (unsigned i = 0; i != 8; ++i)
       MaskV.push_back((MaskVals[i] < 0) ? DAG.getUNDEF(MVT::i16)
@@ -3897,15 +3910,29 @@ SDValue LowerVECTOR_SHUFFLEv16i8(SDValue V1, SDValue V2,
     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
     SDValue InsElt;
-
+
+    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
+    // together using a single extract, load it and store it.
+    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                         DAG.getIntPtrConstant(i));
+      continue;
+    }
+
     // If Elt1 is defined, extract it from the appropriate source.  If the
-    // source byte is not also odd, shift the extracted word left 8 bits.
+    // source byte is not also odd, shift the extracted word left 8 bits;
+    // otherwise clear the bottom 8 bits if we need to do an OR.
     if (Elt1 >= 0) {
       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                            DAG.getIntPtrConstant(Elt1 / 2));
       if ((Elt1 & 1) == 0)
         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                              DAG.getConstant(8, TLI.getShiftAmountTy()));
+      else if (Elt0 >= 0)
+        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
+                             DAG.getConstant(0xFF00, MVT::i16));
     }
     // If Elt0 is defined, extract it from the appropriate source.  If the
     // source byte is not also even, shift the extracted word right 8 bits. If
@@ -3917,6 +3944,9 @@ SDValue LowerVECTOR_SHUFFLEv16i8(SDValue V1, SDValue V2,
       if ((Elt0 & 1) != 0)
         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                               DAG.getConstant(8, TLI.getShiftAmountTy()));
+      else if (Elt1 >= 0)
+        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
+                              DAG.getConstant(0x00FF, MVT::i16));
       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                          : InsElt0;
     }
@@ -4651,9 +4681,8 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   // FIXME there isn't really any debug info here, should come from the parent
   DebugLoc dl = CP->getDebugLoc();
-  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(),
-                                             getPointerTy(),
-                                             CP->getAlignment());
+  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+                                             CP->getAlignment());
   Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
   // With PIC, the address is actually $g + Offset.
   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
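The v16i8 lowering above builds each 16-bit lane of the result from two shuffled bytes with EXTRACT/SHL/AND/OR. A scalar sketch of that packing (hypothetical helper, not LLVM API); the AND masks added by the patch matter when both bytes are defined, since leftover bits would otherwise corrupt the OR:

#include <cstdint>

// Pack one result word from the 16-bit source words holding Elt1 (destined
// for the high byte) and Elt0 (the low byte), mirroring the code above.
uint16_t pack_lane(uint16_t elt1Word, bool elt1FromOddByte,
                   uint16_t elt0Word, bool elt0FromEvenByte) {
  uint16_t hi = elt1FromOddByte ? (uint16_t)(elt1Word & 0xFF00)  // already high
                                : (uint16_t)(elt1Word << 8);     // shift up
  uint16_t lo = elt0FromEvenByte ? (uint16_t)(elt0Word & 0x00FF) // already low
                                 : (uint16_t)(elt0Word >> 8);    // shift down
  return hi | lo;
}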
@@ -4955,8 +4984,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
-                              StackSlot,
-                              PseudoSourceValue::getFixedStack(SSFI), 0);
+                               StackSlot,
+                               PseudoSourceValue::getFixedStack(SSFI), 0);
 
   // Build the FILD
   SDVTList Tys;
@@ -5042,13 +5071,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
   CV0.push_back(ConstantInt::get(APInt(32, 0)));
   CV0.push_back(ConstantInt::get(APInt(32, 0)));
   Constant *C0 = ConstantVector::get(CV0);
-  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 4);
+  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
 
   std::vector<Constant*> CV1;
   CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL))));
   CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL))));
   Constant *C1 = ConstantVector::get(CV1);
-  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 4);
+  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
 
   SmallVector<SDValue, 4> MaskVec;
   MaskVec.push_back(DAG.getConstant(0, MVT::i32));
@@ -5245,7 +5274,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
     CV.push_back(C);
   }
   Constant *C = ConstantVector::get(CV);
-  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
@@ -5274,7 +5303,7 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
     CV.push_back(C);
   }
   Constant *C = ConstantVector::get(CV);
-  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
@@ -5322,7 +5351,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
     CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
   }
   Constant *C = ConstantVector::get(CV);
-  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
                               PseudoSourceValue::getConstantPool(), 0,
                               false, 16);
@@ -5351,7 +5380,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
     CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
   }
   C = ConstantVector::get(CV);
-  CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
   SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                               PseudoSourceValue::getConstantPool(), 0,
                               false, 16);
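The 0x45300000/0x43300000 words in LowerUINT_TO_FP_i64 above implement the classic exponent-splicing trick; the alignment bump from 4 to 16 lets the constants be loaded as aligned SSE vectors. A standalone sketch of the underlying arithmetic (plain C++, assuming IEEE-754 doubles):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

double u64_to_f64(uint64_t v) {
  // Splice the halves into the mantissas of 2^52 and 2^84.
  uint64_t lo_bits = 0x4330000000000000ULL | (v & 0xFFFFFFFFULL); // 2^52 + lo
  uint64_t hi_bits = 0x4530000000000000ULL | (v >> 32);   // 2^84 + hi * 2^32
  double lo, hi;
  memcpy(&lo, &lo_bits, sizeof lo);
  memcpy(&hi, &hi_bits, sizeof hi);
  const double bias = ldexp(1.0 + ldexp(1.0, -32), 84);   // 2^84 + 2^52, exact
  return (hi - bias) + lo;   // == hi*2^32 + lo, correctly rounded
}

int main() {
  for (uint64_t v : {0ULL, 1ULL, 0x123456789ABCDEFULL, ~0ULL})
    assert(u64_to_f64(v) == (double)v);
  return 0;
}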
@@ -5363,13 +5392,33 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
 
 /// Emit nodes that will be selected as "test Op0,Op0", or something
 /// equivalent.
-SDValue X86TargetLowering::EmitTest(SDValue Op, SelectionDAG &DAG) {
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+                                    SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
 
+  // CF and OF aren't always set the way we want. Determine which
+  // of these we need.
+  bool NeedCF = false;
+  bool NeedOF = false;
+  switch (X86CC) {
+  case X86::COND_A: case X86::COND_AE:
+  case X86::COND_B: case X86::COND_BE:
+    NeedCF = true;
+    break;
+  case X86::COND_G: case X86::COND_GE:
+  case X86::COND_L: case X86::COND_LE:
+  case X86::COND_O: case X86::COND_NO:
+    NeedOF = true;
+    break;
+  default: break;
+  }
+
   // See if we can use the EFLAGS value from the operand instead of
-  // doing a separate TEST.
-  if (Op.getResNo() == 0) {
+  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+  // we prove that the arithmetic won't overflow, we can't use OF or CF.
+  if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
     unsigned Opcode = 0;
+    unsigned NumOperands = 0;
     switch (Op.getNode()->getOpcode()) {
     case ISD::ADD:
       // Due to an isel shortcoming, be conservative if this add is likely to
@@ -5382,15 +5431,24 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, SelectionDAG &DAG) {
            UE = Op.getNode()->use_end(); UI != UE; ++UI)
         if (UI->getOpcode() == ISD::STORE)
           goto default_case;
-      // An add of one will be selected as an INC.
       if (ConstantSDNode *C =
-            dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1)))
+            dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+        // An add of one will be selected as an INC.
         if (C->getAPIntValue() == 1) {
           Opcode = X86ISD::INC;
+          NumOperands = 1;
+          break;
+        }
+        // An add of negative one (subtract of one) will be selected as a DEC.
+        if (C->getAPIntValue().isAllOnesValue()) {
+          Opcode = X86ISD::DEC;
+          NumOperands = 1;
           break;
         }
+      }
       // Otherwise use a regular EFLAGS-setting add.
       Opcode = X86ISD::ADD;
+      NumOperands = 2;
       break;
     case ISD::SUB:
       // Due to the ISEL shortcoming noted above, be conservative if this sub is
@@ -5399,15 +5457,9 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, SelectionDAG &DAG) {
            UE = Op.getNode()->use_end(); UI != UE; ++UI)
         if (UI->getOpcode() == ISD::STORE)
           goto default_case;
-      // A subtract of one will be selected as a DEC.
-      if (ConstantSDNode *C =
-            dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1)))
-        if (C->getAPIntValue() == 1) {
-          Opcode = X86ISD::DEC;
-          break;
-        }
       // Otherwise use a regular EFLAGS-setting sub.
       Opcode = X86ISD::SUB;
+      NumOperands = 2;
       break;
     case X86ISD::ADD:
     case X86ISD::SUB:
@@ -5421,9 +5473,9 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, SelectionDAG &DAG) {
     if (Opcode != 0) {
       const MVT *VTs = DAG.getNodeValueTypes(Op.getValueType(), MVT::i32);
       SmallVector<SDValue, 4> Ops;
-      for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)
+      for (unsigned i = 0; i != NumOperands; ++i)
         Ops.push_back(Op.getOperand(i));
-      SDValue New = DAG.getNode(Opcode, dl, VTs, 2, &Ops[0], Ops.size());
+      SDValue New = DAG.getNode(Opcode, dl, VTs, 2, &Ops[0], NumOperands);
       DAG.ReplaceAllUsesWith(Op, New);
       return SDValue(New.getNode(), 1);
     }
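The NeedCF/NeedOF screen added to EmitTest above encodes an x86 fact: TEST forces CF = OF = 0, so conditions that read those flags cannot reuse the flags of a replaced ADD/SUB unless overflow is impossible. A small table-style sketch of the assumed condition-to-flag mapping (from the Intel manual, not LLVM code):

#include <cstdio>
#include <cstring>

// ReadsCF: a, ae, b, be (unsigned compares). ReadsOF: g, ge, l, le, o, no
// (signed compares and overflow tests). Everything else (e, ne, s, ns,
// p, np) only reads ZF/SF/PF, which TEST computes faithfully.
bool safeAfterTest(const char *cc) {
  static const char *needsCForOF[] = {"a", "ae", "b", "be",
                                      "g", "ge", "l", "le", "o", "no"};
  for (const char *bad : needsCForOF)
    if (std::strcmp(cc, bad) == 0)
      return false;
  return true;
}

int main() {
  printf("%d %d %d\n", safeAfterTest("ne"),   // 1: ZF only
         safeAfterTest("b"),                  // 0: reads CF
         safeAfterTest("ge"));                // 0: reads OF
  return 0;
}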
@@ -5436,10 +5488,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, SelectionDAG &DAG) {
 
 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
 /// equivalent.
-SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+                                   SelectionDAG &DAG) {
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
     if (C->getAPIntValue() == 0)
-      return EmitTest(Op0, DAG);
+      return EmitTest(Op0, X86CC, DAG);
 
   DebugLoc dl = Op0.getDebugLoc();
   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
@@ -5507,7 +5560,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
 
-  SDValue Cond = EmitCmp(Op0, Op1, DAG);
+  SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                      DAG.getConstant(X86CC, MVT::i8), Cond);
 }
@@ -5665,7 +5718,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
         !isScalarFPTypeInSSEReg(VT))  // FPStack?
       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
 
-    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || Opc == X86ISD::BT) { // FIXME
+    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+        Opc == X86ISD::BT) { // FIXME
       Cond = Cmp;
       addTest = false;
     }
@@ -5673,7 +5727,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
 
   if (addTest) {
     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
-    Cond = EmitTest(Cond, DAG);
+    Cond = EmitTest(Cond, X86::COND_NE, DAG);
   }
 
   const MVT *VTs = DAG.getNodeValueTypes(Op.getValueType(),
@@ -5823,7 +5877,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
 
   if (addTest) {
     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
-    Cond = EmitTest(Cond, DAG);
+    Cond = EmitTest(Cond, X86::COND_NE, DAG);
   }
   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                      Chain, Dest, CC, Cond);
@@ -7285,17 +7339,18 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
   newMBB->addSuccessor(newMBB);
 
   // Insert instructions into newMBB based on incoming instruction
-  assert(bInstr->getNumOperands() < 8 && "unexpected number of operands");
+  assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+         "unexpected number of operands");
   DebugLoc dl = bInstr->getDebugLoc();
   MachineOperand& destOper = bInstr->getOperand(0);
-  MachineOperand* argOpers[6];
+  MachineOperand* argOpers[2 + X86AddrNumOperands];
   int numArgs = bInstr->getNumOperands() - 1;
   for (int i=0; i < numArgs; ++i)
     argOpers[i] = &bInstr->getOperand(i+1);
 
   // x86 address has 4 operands: base, index, scale, and displacement
-  int lastAddrIndx = 3; // [0,3]
-  int valArgIndx = 4;
+  int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
+  int valArgIndx = lastAddrIndx + 1;
 
   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
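EmitCmp above now threads the condition code through to EmitTest because of its compare-against-zero fast path: CMP x,0 and TEST x,x agree on ZF/SF/PF, but TEST pins CF and OF to zero. A scalar sketch of the flag agreement that makes the fast path legal for the remaining conditions:

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x : {INT32_MIN, -7, 0, 1, INT32_MAX}) {
    // ZF and SF as CMP x,0 would compute them ...
    bool zf_cmp = (x == 0), sf_cmp = (x < 0);
    // ... and as TEST x,x computes them (x & x == x).
    bool zf_test = ((x & x) == 0), sf_test = ((x & x) < 0);
    assert(zf_cmp == zf_test && sf_cmp == sf_test);
  }
  return 0;
}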
@@ -7394,15 +7449,16 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
   DebugLoc dl = bInstr->getDebugLoc();
   // Insert instructions into newMBB based on incoming instruction
   // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
-  assert(bInstr->getNumOperands() < 18 && "unexpected number of operands");
+  assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
+         "unexpected number of operands");
   MachineOperand& dest1Oper = bInstr->getOperand(0);
   MachineOperand& dest2Oper = bInstr->getOperand(1);
-  MachineOperand* argOpers[6];
-  for (int i=0; i < 6; ++i)
+  MachineOperand* argOpers[2 + X86AddrNumOperands];
+  for (int i=0; i < 2 + X86AddrNumOperands; ++i)
     argOpers[i] = &bInstr->getOperand(i+2);
 
   // x86 address has 4 operands: base, index, scale, and displacement
-  int lastAddrIndx = 3; // [0,3]
+  int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
 
   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
   MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
@@ -7438,26 +7494,30 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
     tt2 = t2;
   }
 
-  assert((argOpers[4]->isReg() || argOpers[4]->isImm()) &&
+  int valArgIndx = lastAddrIndx + 1;
+  assert((argOpers[valArgIndx]->isReg() ||
+          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
   unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
   unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
-  if (argOpers[4]->isReg())
+  if (argOpers[valArgIndx]->isReg())
     MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
   else
     MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
   if (regOpcL != X86::MOV32rr)
     MIB.addReg(tt1);
-  (*MIB).addOperand(*argOpers[4]);
-  assert(argOpers[5]->isReg() == argOpers[4]->isReg());
-  assert(argOpers[5]->isImm() == argOpers[4]->isImm());
-  if (argOpers[5]->isReg())
+  (*MIB).addOperand(*argOpers[valArgIndx]);
+  assert(argOpers[valArgIndx + 1]->isReg() ==
+         argOpers[valArgIndx]->isReg());
+  assert(argOpers[valArgIndx + 1]->isImm() ==
+         argOpers[valArgIndx]->isImm());
+  if (argOpers[valArgIndx + 1]->isReg())
     MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
   else
     MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
   if (regOpcH != X86::MOV32rr)
     MIB.addReg(tt2);
-  (*MIB).addOperand(*argOpers[5]);
+  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
 
   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
   MIB.addReg(t1);
@@ -7530,16 +7590,17 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
   DebugLoc dl = mInstr->getDebugLoc();
   // Insert instructions into newMBB based on incoming instruction
-  assert(mInstr->getNumOperands() < 8 && "unexpected number of operands");
+  assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+         "unexpected number of operands");
   MachineOperand& destOper = mInstr->getOperand(0);
-  MachineOperand* argOpers[6];
+  MachineOperand* argOpers[2 + X86AddrNumOperands];
   int numArgs = mInstr->getNumOperands() - 1;
   for (int i=0; i < numArgs; ++i)
     argOpers[i] = &mInstr->getOperand(i+1);
 
   // x86 address has 4 operands: base, index, scale, and displacement
-  int lastAddrIndx = 3; // [0,3]
-  int valArgIndx = 4;
+  int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
+  int valArgIndx = lastAddrIndx + 1;
 
   unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
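The atomic expanders above stop hard-coding operand indices 3 and 4 and derive them from X86AddrNumOperands instead, so the layout [dest][address operands][value] keeps working if the address representation grows. A tiny sketch of the index arithmetic (the value 4 below is the historical operand count the old code assumed; the real constant lives in the X86 backend):

#include <cassert>

int main() {
  const int X86AddrNumOperands = 4;           // assumed: base/scale/index/disp
  int lastAddrIndx = X86AddrNumOperands - 1;  // last address operand
  int valArgIndx = lastAddrIndx + 1;          // value follows the address
  assert(lastAddrIndx == 3 && valArgIndx == 4);  // the old hard-coded values
  return 0;
}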
@@ -8074,77 +8135,346 @@ static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
 
 /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
-                                    const X86Subtarget *Subtarget) {
-  DebugLoc dl = N->getDebugLoc();
+                                    const X86Subtarget *Subtarget) {
+  DebugLoc DL = N->getDebugLoc();
   SDValue Cond = N->getOperand(0);
-
+  // Get the LHS/RHS of the select.
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+
   // If we have SSE[12] support, try to form min/max nodes.
   if (Subtarget->hasSSE2() &&
-      (N->getValueType(0) == MVT::f32 || N->getValueType(0) == MVT::f64)) {
-    if (Cond.getOpcode() == ISD::SETCC) {
-      // Get the LHS/RHS of the select.
-      SDValue LHS = N->getOperand(1);
-      SDValue RHS = N->getOperand(2);
-      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
-      unsigned Opcode = 0;
-      if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
-        switch (CC) {
-        default: break;
-        case ISD::SETOLE: // (X <= Y) ? X : Y -> min
-        case ISD::SETULE:
-        case ISD::SETLE:
-          if (!UnsafeFPMath) break;
-          // FALL THROUGH.
-        case ISD::SETOLT:  // (X olt/lt Y) ? X : Y -> min
-        case ISD::SETLT:
-          Opcode = X86ISD::FMIN;
-          break;
-
-        case ISD::SETOGT: // (X > Y) ? X : Y -> max
-        case ISD::SETUGT:
-        case ISD::SETGT:
-          if (!UnsafeFPMath) break;
-          // FALL THROUGH.
-        case ISD::SETUGE:  // (X uge/ge Y) ? X : Y -> max
-        case ISD::SETGE:
-          Opcode = X86ISD::FMAX;
-          break;
-        }
-      } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
-        switch (CC) {
-        default: break;
-        case ISD::SETOGT: // (X > Y) ? Y : X -> min
-        case ISD::SETUGT:
-        case ISD::SETGT:
-          if (!UnsafeFPMath) break;
-          // FALL THROUGH.
-        case ISD::SETUGE:  // (X uge/ge Y) ? Y : X -> min
-        case ISD::SETGE:
-          Opcode = X86ISD::FMIN;
-          break;
-
-        case ISD::SETOLE:   // (X <= Y) ? Y : X -> max
-        case ISD::SETULE:
-        case ISD::SETLE:
-          if (!UnsafeFPMath) break;
-          // FALL THROUGH.
-        case ISD::SETOLT:   // (X olt/lt Y) ? Y : X -> max
-        case ISD::SETLT:
-          Opcode = X86ISD::FMAX;
-          break;
-        }
-      }
-
-      if (Opcode)
-        return DAG.getNode(Opcode, dl, N->getValueType(0), LHS, RHS);
-    }
+      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
+      Cond.getOpcode() == ISD::SETCC) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    unsigned Opcode = 0;
+    if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+      switch (CC) {
+      default: break;
+      case ISD::SETOLE: // (X <= Y) ? X : Y -> min
+      case ISD::SETULE:
+      case ISD::SETLE:
+        if (!UnsafeFPMath) break;
+        // FALL THROUGH.
+      case ISD::SETOLT:  // (X olt/lt Y) ? X : Y -> min
+      case ISD::SETLT:
+        Opcode = X86ISD::FMIN;
+        break;
+
+      case ISD::SETOGT: // (X > Y) ? X : Y -> max
+      case ISD::SETUGT:
+      case ISD::SETGT:
+        if (!UnsafeFPMath) break;
+        // FALL THROUGH.
+      case ISD::SETUGE:  // (X uge/ge Y) ? X : Y -> max
+      case ISD::SETGE:
+        Opcode = X86ISD::FMAX;
+        break;
+      }
+    } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
+      switch (CC) {
+      default: break;
+      case ISD::SETOGT: // (X > Y) ? Y : X -> min
+      case ISD::SETUGT:
+      case ISD::SETGT:
+        if (!UnsafeFPMath) break;
+        // FALL THROUGH.
+      case ISD::SETUGE:  // (X uge/ge Y) ? Y : X -> min
+      case ISD::SETGE:
+        Opcode = X86ISD::FMIN;
+        break;
+
+      case ISD::SETOLE:   // (X <= Y) ? Y : X -> max
+      case ISD::SETULE:
+      case ISD::SETLE:
+        if (!UnsafeFPMath) break;
+        // FALL THROUGH.
+      case ISD::SETOLT:   // (X olt/lt Y) ? Y : X -> max
+      case ISD::SETLT:
+        Opcode = X86ISD::FMAX;
+        break;
+      }
+    }
+
+    if (Opcode)
+      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   }
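The min/max combine above maps a floating-point select-of-compare straight onto SSE MINSS/MAXSS semantics. A scalar model of why only the strict ordered forms are safe without -enable-unsafe-fp-math (sketch, not LLVM code): MINSS returns its second operand whenever the comparison fails, including on NaN, which matches the strict-less-than select exactly but not the <= variants.

#include <cassert>
#include <cmath>

// SSE MINSS(a, b) computes a < b ? a : b, returning b on any NaN input.
float minss_model(float a, float b) { return a < b ? a : b; }

int main() {
  float nan = std::nanf("");
  // (X olt Y) ? X : Y is exactly MINSS, NaN cases included...
  assert(minss_model(1.0f, 2.0f) == 1.0f);
  assert(minss_model(2.0f, 1.0f) == 1.0f);
  assert(!(nan < 1.0f));  // ...whereas (X ole Y) ? X : Y differs on NaN,
                          // hence the UnsafeFPMath gate on SETOLE/SETULE.
  return 0;
}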
+
+  // If this is a select between two integer constants, try to do some
+  // optimizations.
+  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
+    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
+      // Don't do this for crazy integer types.
+      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
+        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
+        // so that TrueC (the true value) is larger than FalseC.
+        bool NeedsCondInvert = false;
+
+        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+            // Efficiently invertible.
+            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
+             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
+              isa<ConstantSDNode>(Cond.getOperand(1))))) {
+          NeedsCondInvert = true;
+          std::swap(TrueC, FalseC);
+        }
+
+        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
+        if (FalseC->getAPIntValue() == 0 &&
+            TrueC->getAPIntValue().isPowerOf2()) {
+          if (NeedsCondInvert) // Invert the condition if needed.
+            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                               DAG.getConstant(1, Cond.getValueType()));
+
+          // Zero extend the condition if needed.
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+                             DAG.getConstant(ShAmt, MVT::i8));
+        }
+
+        // Optimize Cond ? cst+1 : cst -> zext(Cond) + cst.
+        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+          if (NeedsCondInvert) // Invert the condition if needed.
+            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                               DAG.getConstant(1, Cond.getValueType()));
+
+          // Zero extend the condition if needed.
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                             FalseC->getValueType(0), Cond);
+          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                             SDValue(FalseC, 0));
+        }
+
+        // Optimize cases that will turn into an LEA instruction.  This requires
+        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+          bool isFastMultiplier = false;
+          if (Diff < 10) {
+            switch ((unsigned char)Diff) {
+            default: break;
+            case 1:  // result = add base, cond
+            case 2:  // result = lea base(    , cond*2)
+            case 3:  // result = lea base(cond, cond*2)
+            case 4:  // result = lea base(    , cond*4)
+            case 5:  // result = lea base(cond, cond*4)
+            case 8:  // result = lea base(    , cond*8)
+            case 9:  // result = lea base(cond, cond*8)
+              isFastMultiplier = true;
+              break;
+            }
+          }
+
+          if (isFastMultiplier) {
+            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+            if (NeedsCondInvert) // Invert the condition if needed.
+              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                                 DAG.getConstant(1, Cond.getValueType()));
+
+            // Zero extend the condition if needed.
+            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+                               Cond);
+            // Scale the condition by the difference.
+            if (Diff != 1)
+              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+                                 DAG.getConstant(Diff, Cond.getValueType()));
+
+            // Add the base if non-zero.
+            if (FalseC->getAPIntValue() != 0)
+              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                                 SDValue(FalseC, 0));
+            return Cond;
+          }
+        }
+      }
+  }
+
+  return SDValue();
+}
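The three select-between-constants rewrites above (and their CMOV twins in the next function) rest on simple branchless identities; a sketch verifying them with plain integers (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t cond = 0; cond <= 1; ++cond) {
    // C ? 8 : 0  ->  zext(C) << 3 (any power of two over zero)
    assert((cond ? 8u : 0u) == (cond << 3));
    // C ? cst+1 : cst  ->  zext(C) + cst
    assert((cond ? 6u : 5u) == (cond + 5u));
    // C ? 11 : 2  ->  C*9 + 2, where *9 is one LEA (base + index*8)
    assert((cond ? 11u : 2u) == (cond * 9u + 2u));
  }
  return 0;
}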
+/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
+static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  DebugLoc DL = N->getDebugLoc();
+
+  // If the flag operand isn't dead, don't touch this CMOV.
+  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
+    return SDValue();
+
+  // If this is a select between two integer constants, try to do some
+  // optimizations.  Note that the operands are ordered the opposite of SELECT
+  // operands.
+  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
+      // larger than FalseC (the false value).
+      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+
+      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+        CC = X86::GetOppositeBranchCondition(CC);
+        std::swap(TrueC, FalseC);
+      }
+
+      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
+      // This is efficient for any integer data type (including i8/i16) and
+      // shift amount.
+      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+        SDValue Cond = N->getOperand(3);
+        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                           DAG.getConstant(CC, MVT::i8), Cond);
+
+        // Zero extend the condition if needed.
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+                           DAG.getConstant(ShAmt, MVT::i8));
+        if (N->getNumValues() == 2)  // Dead flag value?
+          return DCI.CombineTo(N, Cond, SDValue());
+        return Cond;
+      }
+
+      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.  This is
+      // efficient for any integer data type, including i8/i16.
+      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+        SDValue Cond = N->getOperand(3);
+        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                           DAG.getConstant(CC, MVT::i8), Cond);
+
+        // Zero extend the condition if needed.
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                           FalseC->getValueType(0), Cond);
+        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                           SDValue(FalseC, 0));
+
+        if (N->getNumValues() == 2)  // Dead flag value?
+          return DCI.CombineTo(N, Cond, SDValue());
+        return Cond;
+      }
+
+      // Optimize cases that will turn into an LEA instruction.  This requires
+      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+        bool isFastMultiplier = false;
+        if (Diff < 10) {
+          switch ((unsigned char)Diff) {
+          default: break;
+          case 1:  // result = add base, cond
+          case 2:  // result = lea base(    , cond*2)
+          case 3:  // result = lea base(cond, cond*2)
+          case 4:  // result = lea base(    , cond*4)
+          case 5:  // result = lea base(cond, cond*4)
+          case 8:  // result = lea base(    , cond*8)
+          case 9:  // result = lea base(cond, cond*8)
+            isFastMultiplier = true;
+            break;
+          }
+        }
+
+        if (isFastMultiplier) {
+          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+          SDValue Cond = N->getOperand(3);
+          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                             DAG.getConstant(CC, MVT::i8), Cond);
+          // Zero extend the condition if needed.
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+                             Cond);
+          // Scale the condition by the difference.
+          if (Diff != 1)
+            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+                               DAG.getConstant(Diff, Cond.getValueType()));
+
+          // Add the base if non-zero.
+          if (FalseC->getAPIntValue() != 0)
+            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                               SDValue(FalseC, 0));
+          if (N->getNumValues() == 2)  // Dead flag value?
+            return DCI.CombineTo(N, Cond, SDValue());
+          return Cond;
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+/// PerformMulCombine - Optimize a single multiply with constant into two
+/// in order to implement it with two cheaper instructions, e.g.
+/// LEA + SHL, LEA + LEA.
+static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DAG.getMachineFunction().
+      getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+    return SDValue();
+
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  MVT VT = N->getValueType(0);
+  if (VT != MVT::i64)
+    return SDValue();
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!C)
+    return SDValue();
+  uint64_t MulAmt = C->getZExtValue();
+  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+    return SDValue();
+
+  uint64_t MulAmt1 = 0;
+  uint64_t MulAmt2 = 0;
+  if ((MulAmt % 9) == 0) {
+    MulAmt1 = 9;
+    MulAmt2 = MulAmt / 9;
+  } else if ((MulAmt % 5) == 0) {
+    MulAmt1 = 5;
+    MulAmt2 = MulAmt / 5;
+  } else if ((MulAmt % 3) == 0) {
+    MulAmt1 = 3;
+    MulAmt2 = MulAmt / 3;
+  }
+  if (MulAmt2 &&
+      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
+    DebugLoc DL = N->getDebugLoc();
+
+    if (isPowerOf2_64(MulAmt2) &&
+        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+      // If second multiplier is pow2, issue it first. We want the multiply by
+      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
+      // is an add.
+      std::swap(MulAmt1, MulAmt2);
+
+    SDValue NewMul;
+    if (isPowerOf2_64(MulAmt1))
+      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
+    else
+      NewMul = DAG.getNode(ISD::MUL, DL, VT, N->getOperand(0),
+                           DAG.getConstant(MulAmt1, VT));
+
+    if (isPowerOf2_64(MulAmt2))
+      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
+    else
+      NewMul = DAG.getNode(ISD::MUL, DL, VT, NewMul,
+                           DAG.getConstant(MulAmt2, VT));
+
+    // Do not add new nodes to DAG combiner worklist.
+    DCI.CombineTo(N, NewMul, false);
+  }
+  return SDValue();
+}
+
+
 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
 /// when possible.
 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
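PerformMulCombine above splits an awkward multiply constant into two factors that each map to one LEA or one SHL. A sketch checking two representative decompositions with the address-style arithmetic LEA performs (x + x*2/4/8):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 0x123456789ULL;
  // x*45: 45 = 9*5 -> lea t,(x,x,8); lea r,(t,t,4)
  uint64_t t = x + x * 8;            // one LEA computes 9*x
  assert(t + t * 4 == x * 45);       // a second LEA computes 5*t
  // x*40: 40 = 5*8 -> lea t,(x,x,4); shl t,3
  uint64_t u = x + x * 4;            // LEA computes 5*x
  assert((u << 3) == x * 40);        // SHL by 3 multiplies by 8
  return 0;
}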
@@ -8162,7 +8492,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
   SDValue ShAmtOp = N->getOperand(1);
 
   MVT EltVT = VT.getVectorElementType();
-  DebugLoc dl = N->getDebugLoc();
+  DebugLoc DL = N->getDebugLoc();
   SDValue BaseShAmt;
   if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
     unsigned NumElts = VT.getVectorNumElements();
@@ -8182,15 +8512,15 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
     }
   } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
              isSplatMask(ShAmtOp.getOperand(2).getNode())) {
-    BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ShAmtOp,
+    BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
                             DAG.getIntPtrConstant(0));
   } else
     return SDValue();
 
   if (EltVT.bitsGT(MVT::i32))
-    BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
+    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
   else if (EltVT.bitsLT(MVT::i32))
-    BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BaseShAmt);
+    BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BaseShAmt);
 
   // The shift amount is identical so we can do a vector shift.
   SDValue ValOp = N->getOperand(0);
@@ -8200,39 +8530,39 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
     break;
   case ISD::SHL:
     if (VT == MVT::v2i64)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                          DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                          ValOp, BaseShAmt);
     if (VT == MVT::v4i32)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                          DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                          ValOp, BaseShAmt);
     if (VT == MVT::v8i16)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                          DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                          ValOp, BaseShAmt);
     break;
   case ISD::SRA:
     if (VT == MVT::v4i32)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                          DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
                          ValOp, BaseShAmt);
     if (VT == MVT::v8i16)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                          DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
                          ValOp, BaseShAmt);
     break;
   case ISD::SRL:
     if (VT == MVT::v2i64)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                          DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                          ValOp, BaseShAmt);
     if (VT == MVT::v4i32)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                          DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
                          ValOp, BaseShAmt);
     if (VT == MVT::v8i16)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                          DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
                          ValOp, BaseShAmt);
     break;
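The shift combine above fires only when every lane shifts by the same amount, because the PSLLI/PSRAI/PSRLI intrinsics it emits take a single shared count. A scalar model of PSLLD with an immediate count (sketch):

#include <cstdint>

// pslli.d: shift all four 32-bit lanes left by one shared count;
// the hardware zeroes the lanes when the count is 32 or more.
void pslli_d_model(uint32_t v[4], unsigned amt) {
  for (int i = 0; i < 4; ++i)
    v[i] = amt < 32 ? v[i] << amt : 0;
}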
@@ -8242,14 +8572,21 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
 
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
-                                  const X86Subtarget *Subtarget) {
+                                   const X86Subtarget *Subtarget) {
   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   // the FP state in cases where an emms may be missing.
   // A preferable solution to the general problem is to figure out the right
   // places to insert EMMS.  This qualifies as a quick hack.
+
+  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
   StoreSDNode *St = cast<StoreSDNode>(N);
-  if (St->getValue().getValueType().isVector() &&
-      St->getValue().getValueType().getSizeInBits() == 64 &&
+  MVT VT = St->getValue().getValueType();
+  if (VT.getSizeInBits() != 64)
+    return SDValue();
+
+  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE2();
+  if ((VT.isVector() ||
+       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
@@ -8273,60 +8610,72 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
         Ops.push_back(ChainVal->getOperand(i));
       }
     }
-    if (Ld) {
-      DebugLoc dl = N->getDebugLoc();
-      // If we are a 64-bit capable x86, lower to a single movq load/store pair.
-      if (Subtarget->is64Bit()) {
-        SDValue NewLd = DAG.getLoad(MVT::i64, dl, Ld->getChain(),
-                                    Ld->getBasePtr(), Ld->getSrcValue(),
-                                    Ld->getSrcValueOffset(), Ld->isVolatile(),
-                                    Ld->getAlignment());
-        SDValue NewChain = NewLd.getValue(1);
-        if (TokenFactorIndex != -1) {
-          Ops.push_back(NewChain);
-          NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Ops[0],
-                                 Ops.size());
-        }
-        return DAG.getStore(NewChain, dl, NewLd, St->getBasePtr(),
-                            St->getSrcValue(), St->getSrcValueOffset(),
-                            St->isVolatile(), St->getAlignment());
-      }
 
-      // Otherwise, lower to two 32-bit copies.
-      SDValue LoAddr = Ld->getBasePtr();
-      SDValue HiAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, LoAddr,
-                                   DAG.getConstant(4, MVT::i32));
+    if (!Ld || !ISD::isNormalLoad(Ld))
+      return SDValue();
 
-      SDValue LoLd = DAG.getLoad(MVT::i32, dl, Ld->getChain(), LoAddr,
-                                 Ld->getSrcValue(), Ld->getSrcValueOffset(),
-                                 Ld->isVolatile(), Ld->getAlignment());
-      SDValue HiLd = DAG.getLoad(MVT::i32, dl, Ld->getChain(), HiAddr,
-                                 Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
-                                 Ld->isVolatile(),
-                                 MinAlign(Ld->getAlignment(), 4));
+    // If this is not the MMX case, i.e. we are just turning i64 load/store
+    // into f64 load/store, avoid the transformation if there are multiple
+    // uses of the loaded value.
+    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+      return SDValue();
 
-      SDValue NewChain = LoLd.getValue(1);
+    DebugLoc LdDL = Ld->getDebugLoc();
+    DebugLoc StDL = N->getDebugLoc();
+    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+    // pair instead.
+    if (Subtarget->is64Bit() || F64IsLegal) {
+      MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
+                                  Ld->getBasePtr(), Ld->getSrcValue(),
+                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
+                                  Ld->getAlignment());
+      SDValue NewChain = NewLd.getValue(1);
       if (TokenFactorIndex != -1) {
-        Ops.push_back(LoLd);
-        Ops.push_back(HiLd);
-        NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Ops[0],
+        Ops.push_back(NewChain);
+        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                                Ops.size());
       }
-
-      LoAddr = St->getBasePtr();
-      HiAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, LoAddr,
-                           DAG.getConstant(4, MVT::i32));
-
-      SDValue LoSt = DAG.getStore(NewChain, dl, LoLd, LoAddr,
+      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                           St->getSrcValue(), St->getSrcValueOffset(),
                           St->isVolatile(), St->getAlignment());
-      SDValue HiSt = DAG.getStore(NewChain, dl, HiLd, HiAddr,
-                                  St->getSrcValue(),
-                                  St->getSrcValueOffset() + 4,
-                                  St->isVolatile(),
-                                  MinAlign(St->getAlignment(), 4));
-      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoSt, HiSt);
     }
+
+    // Otherwise, lower to two pairs of 32-bit loads / stores.
+    SDValue LoAddr = Ld->getBasePtr();
+    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
+                                 DAG.getConstant(4, MVT::i32));
+
+    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
+                               Ld->isVolatile(), Ld->getAlignment());
+    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
+                               Ld->isVolatile(),
+                               MinAlign(Ld->getAlignment(), 4));
+
+    SDValue NewChain = LoLd.getValue(1);
+    if (TokenFactorIndex != -1) {
+      Ops.push_back(LoLd);
+      Ops.push_back(HiLd);
+      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+                             Ops.size());
+    }
+
+    LoAddr = St->getBasePtr();
+    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
+                         DAG.getConstant(4, MVT::i32));
+
+    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
+                                St->getSrcValue(), St->getSrcValueOffset(),
+                                St->isVolatile(), St->getAlignment());
+    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
+                                St->getSrcValue(),
+                                St->getSrcValueOffset() + 4,
+                                St->isVolatile(),
+                                MinAlign(St->getAlignment(), 4));
+    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
   }
   return SDValue();
 }
@@ -8386,6 +8735,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::BUILD_VECTOR:
     return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
+  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
+  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
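For reference, the two-pair fallback that PerformSTORECombine keeps for targets without the movq/f64 path copies a 64-bit value as two 32-bit halves at offsets 0 and 4. A scalar sketch of that memory layout on a little-endian target:

#include <cassert>
#include <cstdint>
#include <cstring>

// Copy eight bytes as lo/hi 32-bit halves, as the lowered code does.
void copy64_split(const void *src, void *dst) {
  uint32_t lo, hi;
  std::memcpy(&lo, static_cast<const char *>(src) + 0, 4);
  std::memcpy(&hi, static_cast<const char *>(src) + 4, 4);
  std::memcpy(static_cast<char *>(dst) + 0, &lo, 4);
  std::memcpy(static_cast<char *>(dst) + 4, &hi, 4);
}

int main() {
  uint64_t a = 0x1122334455667788ULL, b = 0;
  copy64_split(&a, &b);
  assert(a == b);
  return 0;
}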