// MM0, it doesn't support these vector types.
CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
- // Long double types are always returned in ST0 (even with SSE).
- CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+ // Long double types are always returned in FP0 (even with SSE).
+ CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>
]>;
// X86-32 C return-value convention.
def RetCC_X86_32_C : CallingConv<[
- // The X86-32 calling convention returns FP values in ST0, unless marked
+ // The X86-32 calling convention returns FP values in FP0, unless marked
// with "inreg" (used here to distinguish one kind of reg from another,
// weirdly; this is really the sse-regparm calling convention) in which
// case they use XMM0, otherwise it is the same as the common X86 calling
// conv.
CCIfInReg<CCIfSubtarget<"hasSSE2()",
CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
- CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+ CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>,
CCDelegateTo<RetCC_X86Common>
]>;
// The calling-convention tables for x87 returns don't tell
// the whole story.
- if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+ if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
return false;
unsigned SrcReg = Reg + VA.getValNo();
report_fatal_error("SSE register return with SSE disabled");
}
- // If this is a call to a function that returns an fp value on the floating
- // point stack, we must guarantee the value is popped from the stack, so
- // a COPY is not good enough - the copy instruction may be eliminated if the
- // return value is not used. We use the FpPOP_RETVAL instruction instead.
- if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
- // If we prefer to use the value in xmm registers, copy it out as f80 and
- // use a truncate to move it from fp stack reg to xmm reg.
- if (isScalarFPTypeInSSEReg(VA.getValVT())) {
- CopyVT = MVT::f80;
- CopyReg = createResultReg(&X86::RFP80RegClass);
- }
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::FpPOP_RETVAL), CopyReg);
-
- // Round the f80 to the right size, which also moves it to the appropriate
- // xmm register. This is accomplished by storing the f80 value in memory
- // and then loading it back.
- if (CopyVT != VA.getValVT()) {
- EVT ResVT = VA.getValVT();
- unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
- unsigned MemSize = ResVT.getSizeInBits()/8;
- int FI = MFI.CreateStackObject(MemSize, MemSize, false);
- addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Opc)), FI)
- .addReg(CopyReg);
- Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
- addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Opc), ResultReg + i), FI);
- }
- } else {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
- InRegs.push_back(VA.getLocReg());
+ // If we prefer to use the value in xmm registers, copy it out as f80 and
+ // use a truncate to move it from fp stack reg to xmm reg.
+ if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+ isScalarFPTypeInSSEReg(VA.getValVT())) {
+ CopyVT = MVT::f80;
+ CopyReg = createResultReg(&X86::RFP80RegClass);
+ }
+
+ // Copy out the result.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
+ InRegs.push_back(VA.getLocReg());
+
+ // Round the f80 to the right size, which also moves it to the appropriate
+ // xmm register. This is accomplished by storing the f80 value in memory
+ // and then loading it back.
+ if (CopyVT != VA.getValVT()) {
+ EVT ResVT = VA.getValVT();
+ unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+ unsigned MemSize = ResVT.getSizeInBits()/8;
+ int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc)), FI)
+ .addReg(CopyReg);
+ Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg + i), FI);
}
}
#include "X86.h"
#include "X86InstrInfo.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/EdgeBundles.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/Support/Debug.h"
STATISTIC(NumFP , "Number of floating point instructions");
namespace {
+ const unsigned ScratchFPReg = 7;
+
struct FPS : public MachineFunctionPass {
static char ID;
FPS() : MachineFunctionPass(ID) {
unsigned StackTop; // The current top of the FP stack.
enum {
- NumFPRegs = 16 // Including scratch pseudo-registers.
+ NumFPRegs = 8 // Including scratch pseudo-registers.
};
// For each live FP<n> register, point to its Stack[] entry.
// register allocator thinks.
unsigned RegMap[NumFPRegs];
- // Pending fixed registers - Inline assembly needs FP registers to appear
- // in fixed stack slot positions. This is handled by copying FP registers
- // to ST registers before the instruction, and copying back after the
- // instruction.
- //
- // This is modeled with pending ST registers. NumPendingSTs is the number
- // of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP
- // register that holds the ST value. The ST registers are not moved into
- // place until immediately before the instruction that needs them.
- //
- // It can happen that we need an ST register to be live when no FP register
- // holds the value:
- //
- // %ST0 = COPY %FP4<kill>
- //
- // When that happens, we allocate a scratch FP register to hold the ST
- // value. That means every register in PendingST must be live.
-
- unsigned NumPendingSTs;
- unsigned char PendingST[8];
-
// Set up our stack model to match the incoming registers to MBB.
void setupBlockStack();
dbgs() << " FP" << Stack[i];
assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
}
- for (unsigned i = 0; i != NumPendingSTs; ++i)
- dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
- dbgs() << "\n";
}
#endif
return Slot < StackTop && Stack[Slot] == RegNo;
}
- /// getScratchReg - Return an FP register that is not currently in use.
- unsigned getScratchReg() const {
- for (int i = NumFPRegs - 1; i >= 8; --i)
- if (!isLive(i))
- return i;
- llvm_unreachable("Ran out of scratch FP registers");
- }
-
- /// isScratchReg - Returns trus if RegNo is a scratch FP register.
- static bool isScratchReg(unsigned RegNo) {
- return RegNo > 8 && RegNo < NumFPRegs;
- }
-
/// getStackEntry - Return the X86::FP<n> register in register ST(i).
unsigned getStackEntry(unsigned STi) const {
if (STi >= StackTop)
BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
}
- /// duplicatePendingSTBeforeKill - The instruction at I is about to kill
- /// RegNo. If any PendingST registers still need the RegNo value, duplicate
- /// them to new scratch registers.
- void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) {
- for (unsigned i = 0; i != NumPendingSTs; ++i) {
- if (PendingST[i] != RegNo)
- continue;
- unsigned SR = getScratchReg();
- DEBUG(dbgs() << "Duplicating pending ST" << i
- << " in FP" << RegNo << " to FP" << SR << '\n');
- duplicateToTop(RegNo, SR, I);
- PendingST[i] = SR;
- }
- }
-
/// popStackAfter - Pop the current value off of the top of the FP stack
/// after the specified instruction.
void popStackAfter(MachineBasicBlock::iterator &I);
bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+ void handleCall(MachineBasicBlock::iterator &I);
void handleZeroArgFP(MachineBasicBlock::iterator &I);
void handleOneArgFP(MachineBasicBlock::iterator &I);
void handleOneArgFPRW(MachineBasicBlock::iterator &I);
return X86::RFP80RegClass.contains(DstReg) ||
X86::RFP80RegClass.contains(SrcReg);
}
+
+ void setKillFlags(MachineBasicBlock &MBB) const;
};
char FPS::ID = 0;
}
bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
bool Changed = false;
MBB = &BB;
- NumPendingSTs = 0;
+ setKillFlags(BB);
setupBlockStack();
for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
X86::RFP80RegClass.contains(MI->getOperand(0).getReg()))
FPInstClass = X86II::SpecialFP;
+ if (MI->isCall())
+ FPInstClass = X86II::SpecialFP;
+
if (FPInstClass == X86II::NotFP)
continue; // Efficiently ignore non-fp insts!
// after definition. If so, pop them.
for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
unsigned Reg = DeadRegs[i];
- if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+ // Check if Reg is live on the stack. An inline-asm register operand that
+ // is in the clobber list and marked dead might not be live on the stack.
+ if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
freeStackSlotAfter(I, Reg-X86::FP0);
}
// Instruction transformation implementation
//===----------------------------------------------------------------------===//
+void FPS::handleCall(MachineBasicBlock::iterator &I) {
+ unsigned STReturns = 0;
+
+ for (const auto &MO : I->operands()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned R = MO.getReg() - X86::FP0;
+
+ if (R < 8) {
+ assert(MO.isDef() && MO.isImplicit());
+ STReturns |= 1 << R;
+ }
+ }
+
+ unsigned N = CountTrailingOnes_32(STReturns);
+
+ // FP registers used for function return must be consecutive starting at
+ // FP0.
+ assert(STReturns == 0 || isMask_32(STReturns) && N <= 2);
+
+ for (unsigned I = 0; I < N; ++I)
+ pushReg(N - I - 1);
+}
+
/// handleZeroArgFP - ST(0) = fld0 ST(0) = flds <mem>
///
void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
- if (KillsSrc)
- duplicatePendingSTBeforeKill(Reg, I);
-
// FISTP64m is strange because there isn't a non-popping versions.
// If we have one _and_ we don't want to pop the operand, duplicate the value
// on the stack instead of moving it. This ensure that popping the value is
MI->getOpcode() == X86::ISTT_Fp32m80 ||
MI->getOpcode() == X86::ISTT_Fp64m80 ||
MI->getOpcode() == X86::ST_FpP80m)) {
- duplicateToTop(Reg, getScratchReg(), I);
+ duplicateToTop(Reg, ScratchFPReg, I);
} else {
moveToTop(Reg, I); // Move to the top of the stack...
}
bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
if (KillsSrc) {
- duplicatePendingSTBeforeKill(Reg, I);
// If this is the last use of the source register, just make sure it's on
// the top of the stack.
moveToTop(Reg, I);
///
void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
MachineInstr *MI = I;
+
+ if (MI->isCall()) {
+ handleCall(I);
+ return;
+ }
+
switch (MI->getOpcode()) {
default: llvm_unreachable("Unknown SpecialFP instruction!");
case TargetOpcode::COPY: {
// We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
const MachineOperand &MO1 = MI->getOperand(1);
const MachineOperand &MO0 = MI->getOperand(0);
- unsigned DstST = MO0.getReg() - X86::ST0;
- unsigned SrcST = MO1.getReg() - X86::ST0;
bool KillsSrc = MI->killsRegister(MO1.getReg());
- // ST = COPY FP. Set up a pending ST register.
- if (DstST < 8) {
- unsigned SrcFP = getFPReg(MO1);
- assert(isLive(SrcFP) && "Cannot copy dead register");
- assert(!MO0.isDead() && "Cannot copy to dead ST register");
-
- // Unallocated STs are marked as the nonexistent FP255.
- while (NumPendingSTs <= DstST)
- PendingST[NumPendingSTs++] = NumFPRegs;
-
- // STi could still be live from a previous inline asm.
- if (isScratchReg(PendingST[DstST])) {
- DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST])
- << '\n');
- freeStackSlotBefore(MI, PendingST[DstST]);
- }
-
- // When the source is killed, allocate a scratch FP register.
- if (KillsSrc) {
- duplicatePendingSTBeforeKill(SrcFP, I);
- unsigned Slot = getSlot(SrcFP);
- unsigned SR = getScratchReg();
- PendingST[DstST] = SR;
- Stack[Slot] = SR;
- RegMap[SR] = Slot;
- } else
- PendingST[DstST] = SrcFP;
- break;
- }
-
- // FP = COPY ST. Extract fixed stack value.
- // Any instruction defining ST registers must have assigned them to a
- // scratch register.
- if (SrcST < 8) {
- unsigned DstFP = getFPReg(MO0);
- assert(!isLive(DstFP) && "Cannot copy ST to live FP register");
- assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register");
- unsigned SrcFP = PendingST[SrcST];
- assert(isScratchReg(SrcFP) && "Expected ST in a scratch register");
- assert(isLive(SrcFP) && "Scratch holding ST is dead");
-
- // DstFP steals the stack slot from SrcFP.
- unsigned Slot = getSlot(SrcFP);
- Stack[Slot] = DstFP;
- RegMap[DstFP] = Slot;
-
- // Always treat the ST as killed.
- PendingST[SrcST] = NumFPRegs;
- while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
- --NumPendingSTs;
- break;
- }
-
// FP <- FP copy.
unsigned DstFP = getFPReg(MO0);
unsigned SrcFP = getFPReg(MO1);
break;
}
- case X86::FpPOP_RETVAL: {
- // The FpPOP_RETVAL instruction is used after calls that return a value on
- // the floating point stack. We cannot model this with ST defs since CALL
- // instructions have fixed clobber lists. This instruction is interpreted
- // to mean that there is one more live register on the stack than we
- // thought.
- //
- // This means that StackTop does not match the hardware stack between a
- // call and the FpPOP_RETVAL instructions. We do tolerate FP instructions
- // between CALL and FpPOP_RETVAL as long as they don't overflow the
- // hardware stack.
- unsigned DstFP = getFPReg(MI->getOperand(0));
-
- // Move existing stack elements up to reflect reality.
- assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL");
- if (StackTop) {
- std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1);
- for (unsigned i = 0; i != NumFPRegs; ++i)
- ++RegMap[i];
- }
- ++StackTop;
-
- // DstFP is the new bottom of the stack.
- Stack[0] = DstFP;
- RegMap[DstFP] = 0;
-
- // DstFP will be killed by processBasicBlock if this was a dead def.
- break;
- }
-
case TargetOpcode::INLINEASM: {
// The inline asm MachineInstr currently only *uses* FP registers for the
// 'f' constraint. These should be turned into the current ST(x) register
// only tell clobbers from defs by looking at the asm descriptor.
unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
unsigned NumOps = 0;
+ SmallSet<unsigned, 1> FRegIdx;
+ unsigned RCID;
+
for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
unsigned Flags = MI->getOperand(i).getImm();
+
NumOps = InlineAsm::getNumOperandRegisters(Flags);
if (NumOps != 1)
continue;
const MachineOperand &MO = MI->getOperand(i + 1);
if (!MO.isReg())
continue;
- unsigned STReg = MO.getReg() - X86::ST0;
+ unsigned STReg = MO.getReg() - X86::FP0;
if (STReg >= 8)
continue;
+ // If the flag has a register class constraint, this must be an operand
+ // with constraint "f". Record its index and continue.
+ if (InlineAsm::hasRegClassConstraint(Flags, RCID)) {
+ FRegIdx.insert(i + 1);
+ continue;
+ }
+
switch (InlineAsm::getKind(Flags)) {
case InlineAsm::Kind_RegUse:
STUses |= (1u << STReg);
DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
<< NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
- // Scan the instruction for FP uses corresponding to "f" constraints.
- // Collect FP registers to kill afer the instruction.
- // Always kill all the scratch regs.
+#ifndef NDEBUG
+ // If any input operand uses constraint "f", all output register
+ // constraints must be early-clobber defs.
+ for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I)
+ if (FRegIdx.count(I)) {
+ assert((1 << getFPReg(MI->getOperand(I)) & STDefs) == 0 &&
+ "Operands with constraint \"f\" cannot overlap with defs");
+ }
+#endif
+
+ // Collect all FP registers (register operands with constraints "t", "u",
+ // and "f") to kill afer the instruction.
unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
- unsigned FPUsed = 0;
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &Op = MI->getOperand(i);
if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
continue;
- if (!Op.isUse())
- MI->emitError("illegal \"f\" output constraint");
unsigned FPReg = getFPReg(Op);
- FPUsed |= 1U << FPReg;
// If we kill this operand, make sure to pop it from the stack after the
// asm. We just remember it for now, and pop them all off at the end in
// a batch.
- if (Op.isKill())
+ if (Op.isUse() && Op.isKill())
FPKills |= 1U << FPReg;
}
- // The popped inputs will be killed by the instruction, so duplicate them
- // if the FP register needs to be live after the instruction, or if it is
- // used in the instruction itself. We effectively treat the popped inputs
- // as early clobbers.
- for (unsigned i = 0; i < NumSTPopped; ++i) {
- if ((FPKills & ~FPUsed) & (1u << PendingST[i]))
- continue;
- unsigned SR = getScratchReg();
- duplicateToTop(PendingST[i], SR, I);
- DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
- << unsigned(PendingST[i]) << " to avoid clobbering it.\n");
- PendingST[i] = SR;
- }
-
- // Make sure we have a unique live register for every fixed use. Some of
- // them could be undef uses, and we need to emit LD_F0 instructions.
- for (unsigned i = 0; i < NumSTUses; ++i) {
- if (i < NumPendingSTs && PendingST[i] < NumFPRegs) {
- // Check for shared assignments.
- for (unsigned j = 0; j < i; ++j) {
- if (PendingST[j] != PendingST[i])
- continue;
- // STi and STj are inn the same register, create a copy.
- unsigned SR = getScratchReg();
- duplicateToTop(PendingST[i], SR, I);
- DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
- << unsigned(PendingST[i])
- << " to avoid collision with ST" << j << '\n');
- PendingST[i] = SR;
- }
- continue;
- }
- unsigned SR = getScratchReg();
- DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n');
- BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
- pushReg(SR);
- PendingST[i] = SR;
- if (NumPendingSTs == i)
- ++NumPendingSTs;
- }
- assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned");
+ // Do not include registers that are implicitly popped by defs/clobbers.
+ FPKills &= ~(STDefs | STClobbers);
// Now we can rearrange the live registers to match what was requested.
- shuffleStackTop(PendingST, NumPendingSTs, I);
+ unsigned char STUsesArray[8];
+
+ for (unsigned I = 0; I < NumSTUses; ++I)
+ STUsesArray[I] = I;
+
+ shuffleStackTop(STUsesArray, NumSTUses, I);
DEBUG({dbgs() << "Before asm: "; dumpStack();});
// With the stack layout fixed, rewrite the FP registers.
MachineOperand &Op = MI->getOperand(i);
if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
continue;
+
unsigned FPReg = getFPReg(Op);
- Op.setReg(getSTReg(FPReg));
+
+ if (FRegIdx.count(i))
+ // Operand with constraint "f".
+ Op.setReg(getSTReg(FPReg));
+ else
+ // Operand with a single register class constraint ("t" or "u").
+ Op.setReg(X86::ST0 + FPReg);
}
// Simulate the inline asm popping its inputs and pushing its outputs.
StackTop -= NumSTPopped;
- // Hold the fixed output registers in scratch FP registers. They will be
- // transferred to real FP registers by copies.
- NumPendingSTs = 0;
- for (unsigned i = 0; i < NumSTDefs; ++i) {
- unsigned SR = getScratchReg();
- pushReg(SR);
- FPKills &= ~(1u << SR);
- }
for (unsigned i = 0; i < NumSTDefs; ++i)
- PendingST[NumPendingSTs++] = getStackEntry(i);
- DEBUG({dbgs() << "After asm: "; dumpStack();});
-
- // If any of the ST defs were dead, pop them immediately. Our caller only
- // handles dead FP defs.
- MachineBasicBlock::iterator InsertPt = MI;
- for (unsigned i = 0; STDefs & (1u << i); ++i) {
- if (!(STDeadDefs & (1u << i)))
- continue;
- freeStackSlotAfter(InsertPt, PendingST[i]);
- PendingST[i] = NumFPRegs;
- }
- while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
- --NumPendingSTs;
+ pushReg(NumSTDefs - i - 1);
// If this asm kills any FP registers (is the last use of them) we must
// explicitly emit pop instructions for them. Do this now after the asm has
while (FPKills) {
unsigned FPReg = countTrailingZeros(FPKills);
if (isLive(FPReg))
- freeStackSlotAfter(InsertPt, FPReg);
+ freeStackSlotAfter(I, FPReg);
FPKills &= ~(1U << FPReg);
}
+
// Don't delete the inline asm!
return;
}
// Duplicate the TOS so that we return it twice. Just pick some other FPx
// register to hold it.
- unsigned NewReg = getScratchReg();
+ unsigned NewReg = ScratchFPReg;
duplicateToTop(FirstFPRegOp, NewReg, MI);
FirstFPRegOp = NewReg;
}
} else
--I;
}
+
+void FPS::setKillFlags(MachineBasicBlock &MBB) const {
+ const TargetRegisterInfo *TRI = MBB.getParent()->getTarget()
+ .getRegisterInfo();
+ LivePhysRegs LPR(TRI);
+
+ LPR.addLiveOuts(&MBB);
+
+ for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+ I != E; ++I) {
+ BitVector Defs(8);
+ SmallVector<MachineOperand *, 2> Uses;
+ MachineInstr &MI = *I;
+
+ for (auto &MO : I->operands()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg() - X86::FP0;
+
+ if (Reg >= 8)
+ continue;
+
+ if (MO.isDef()) {
+ Defs.set(Reg);
+ if (!LPR.contains(MO.getReg()))
+ MO.setIsDead();
+ } else
+ Uses.push_back(&MO);
+ }
+
+ for (auto *MO : Uses)
+ if (Defs.test(getFPReg(*MO)) || !LPR.contains(MO->getReg()))
+ MO->setIsKill();
+
+ LPR.stepBackward(MI);
+ }
+}
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
- if (VA.getLocReg() == X86::ST0 ||
- VA.getLocReg() == X86::ST1) {
+ if (VA.getLocReg() == X86::FP0 ||
+ VA.getLocReg() == X86::FP1) {
// If this is a copy from an xmm register to ST(0), use an FPExtend to
// change the value to the FP stack register class.
if (isScalarFPTypeInSSEReg(VA.getValVT()))
report_fatal_error("SSE register return with SSE disabled");
}
- SDValue Val;
-
- // If this is a call to a function that returns an fp value on the floating
- // point stack, we must guarantee the value is popped from the stack, so
- // a CopyFromReg is not good enough - the copy instruction may be eliminated
- // if the return value is not used. We use the FpPOP_RETVAL instruction
- // instead.
- if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
- // If we prefer to use the value in xmm registers, copy it out as f80 and
- // use a truncate to move it from fp stack reg to xmm reg.
- if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
- SDValue Ops[] = { Chain, InFlag };
- Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
- MVT::Other, MVT::Glue, Ops), 1);
- Val = Chain.getValue(0);
-
- // Round the f80 to the right size, which also moves it to the appropriate
- // xmm register.
- if (CopyVT != VA.getValVT())
- Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
- // This truncation won't change the value.
- DAG.getIntPtrConstant(1));
- } else {
- Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
- CopyVT, InFlag).getValue(1);
- Val = Chain.getValue(0);
- }
+ // If we prefer to use the value in xmm registers, copy it out as f80 and
+ // use a truncate to move it from fp stack reg to xmm reg.
+ if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+ isScalarFPTypeInSSEReg(VA.getValVT()))
+ CopyVT = MVT::f80;
+
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ CopyVT, InFlag).getValue(1);
+ SDValue Val = Chain.getValue(0);
+
+ if (CopyVT != VA.getValVT())
+ Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+ // This truncation won't change the value.
+ DAG.getIntPtrConstant(1));
+
InFlag = Chain.getValue(2);
InVals.push_back(Val);
}
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
- if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+ if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
return false;
}
}
Constraint[5] == ')' &&
Constraint[6] == '}') {
- Res.first = X86::ST0+Constraint[4]-'0';
+ Res.first = X86::FP0+Constraint[4]-'0';
Res.second = &X86::RFP80RegClass;
return Res;
}
// GCC allows "st(0)" to be called just plain "st".
if (StringRef("{st}").equals_lower(Constraint)) {
- Res.first = X86::ST0;
+ Res.first = X86::FP0;
Res.second = &X86::RFP80RegClass;
return Res;
}
// All calls clobber the non-callee saved registers. ESP is marked as
// a use to prevent stack-pointer assignments that appear immediately
// before calls from potentially appearing dead.
-let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
// a use to prevent stack-pointer assignments that appear immediately
// before calls from potentially appearing dead.
let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
- FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
// a pattern) and the FPI instruction should have emission info (e.g. opcode
// encoding and asm printing info).
-// Pseudo Instruction for FP stack return values.
-def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>;
-
// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
def FP4 : X86Reg<"fp4", 0>;
def FP5 : X86Reg<"fp5", 0>;
def FP6 : X86Reg<"fp6", 0>;
+def FP7 : X86Reg<"fp7", 0>;
// XMM Registers, used by the various SSE instruction set extensions.
def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
-class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
- let Aliases = A;
-}
-
// Floating point stack registers. These don't map one-to-one to the FP
// pseudo registers, but we still mark them as aliasing FP registers. That
// way both kinds can be live without exceeding the stack depth. ST registers
// are only live around inline assembly.
-def ST0 : STRegister<"st(0)", 0, []>, DwarfRegNum<[33, 12, 11]>;
-def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>;
-def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>;
-def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>;
-def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>;
-def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>;
-def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>;
-def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>;
+def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>;
+def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
// Floating-point status word
def FPSW : X86Reg<"fpsw", 0>;
%0 = tail call i32 asm "fcomi $2, $1; pushf; pop $0", "=r,{st},{st(1)},~{dirflag},~{fpsr},~{flags}"(double 2.000000e+00, double 2.000000e+00) nounwind
ret i32 %0
}
+
+; <rdar://problem/16952634>
+; X87 stackifier asserted when there was an ST register defined by an
+; inline-asm instruction and the ST register was live across another
+; inline-asm instruction.
+;
+; INLINEASM <es:frndint> [sideeffect] [attdialect], $0:[regdef], %ST0<imp-def,tied5>, $1:[reguse tiedto:$0], %ST0<tied3>, $2:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; INLINEASM <es:fldcw $0> [sideeffect] [mayload] [attdialect], $0:[mem], %EAX<undef>, 1, %noreg, 0, %noreg, $1:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; %FP0<def> = COPY %ST0
+
+; CHECK-LABEL: _test_live_st
+; CHECK: ## InlineAsm Start
+; CHECK: frndint
+; CHECK: ## InlineAsm End
+; CHECK: ## InlineAsm Start
+; CHECK: fldcw
+; CHECK: ## InlineAsm End
+
+%struct.fpu_t = type { [8 x x86_fp80], x86_fp80, %struct.anon1, %struct.anon2, i32, i8, [15 x i8] }
+%struct.anon1 = type { i32, i32, i32 }
+%struct.anon2 = type { i32, i32, i32, i32 }
+
+@fpu = external global %struct.fpu_t, align 16
+
+; Function Attrs: ssp
+define void @test_live_st(i32 %a1) {
+entry:
+ %0 = load x86_fp80* undef, align 16
+ %cond = icmp eq i32 %a1, 1
+ br i1 %cond, label %sw.bb4.i, label %_Z5tointRKe.exit
+
+sw.bb4.i:
+ %1 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %0)
+ call void asm sideeffect "fldcw $0", "*m,~{dirflag},~{fpsr},~{flags}"(i32* undef)
+ br label %_Z5tointRKe.exit
+
+_Z5tointRKe.exit:
+ %result.0.i = phi x86_fp80 [ %1, %sw.bb4.i ], [ %0, %entry ]
+ %conv.i1814 = fptosi x86_fp80 %result.0.i to i32
+ %conv626 = sitofp i32 %conv.i1814 to x86_fp80
+ store x86_fp80 %conv626, x86_fp80* getelementptr inbounds (%struct.fpu_t* @fpu, i32 0, i32 1)
+ br label %return
+
+return:
+ ret void
+}
+
+; Check that x87 stackifier is correctly rewriting FP registers to ST registers.
+;
+; CHECK-LABEL: _test_operand_rewrite
+; CHECK: ## InlineAsm Start
+; CHECK: foo %st(0), %st(1)
+; CHECK: ## InlineAsm End
+
+define double @test_operand_rewrite() {
+entry:
+ %0 = tail call { double, double } asm sideeffect "foo $0, $1", "={st},={st(1)},~{dirflag},~{fpsr},~{flags}"()
+ %asmresult = extractvalue { double, double } %0, 0
+ %asmresult1 = extractvalue { double, double } %0, 1
+ %sub = fsub double %asmresult, %asmresult1
+ ret double %sub
+}