return SDValue();
}
-static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
- SelectionDAG &DAG) {
- SDLoc dl(Load);
- MVT VT = Load->getSimpleValueType(0);
- MVT EVT = VT.getVectorElementType();
- SDValue Addr = Load->getOperand(1);
- SDValue NewAddr = DAG.getNode(
- ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
- DAG.getConstant(Index * EVT.getStoreSize(), dl,
- Addr.getSimpleValueType()));
-
- SDValue NewLoad =
- DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
- DAG.getMachineFunction().getMachineMemOperand(
- Load->getMemOperand(), 0, EVT.getStoreSize()));
- return NewLoad;
-}
-
-static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- SDLoc dl(N);
- MVT VT = N->getOperand(1)->getSimpleValueType(0);
- assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
- "X86insertps is only defined for v4x32");
-
- SDValue Ld = N->getOperand(1);
- if (MayFoldLoad(Ld)) {
- // Extract the countS bits from the immediate so we can get the proper
- // address when narrowing the vector load to a specific element.
- // When the second source op is a memory address, insertps doesn't use
- // countS and just gets an f32 from that address.
- unsigned DestIndex =
- cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-
- Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
-
- // Create this as a scalar to vector to match the instruction pattern.
- SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
- // countS bits are ignored when loading from memory on insertps, which
- // means we don't need to explicitly set them to 0.
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
- LoadScalarToVector, N->getOperand(2));
- }
- return SDValue();
-}
-
static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
SDValue V0 = N->getOperand(0);
SDValue V1 = N->getOperand(1);
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
- case X86ISD::INSERTPS: {
- if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
- return PerformINSERTPSCombine(N, DAG, Subtarget);
- break;
- }
case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
}
return false;
}
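+/// Append the memory address operands in MOs to MIB, applying PtrOffset to
+/// the displacement if it is non-zero.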
-static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) {
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
+ int PtrOffset = 0) {
unsigned NumAddrOps = MOs.size();
- for (unsigned i = 0; i != NumAddrOps; ++i)
- MIB.addOperand(MOs[i]);
- if (NumAddrOps < 4) // FrameIndex only
- addOffset(MIB, 0);
+
+ if (NumAddrOps < 4) {
+    // FrameIndex only - add an immediate offset (whether it's zero or not).
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ addOffset(MIB, PtrOffset);
+ } else {
+    // General memory addressing - we need to add the pointer offset to any
+    // existing displacement operand.
+ assert(MOs.size() == 5 && "Unexpected memory operand list length");
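+    // A full x86 memory reference has 5 operands: base register, scale,
+    // index register, displacement and segment register. Any extra pointer
+    // offset has to be folded into the displacement (operand 3).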
+ for (unsigned i = 0; i != NumAddrOps; ++i) {
+ const MachineOperand &MO = MOs[i];
+ if (i == 3 && PtrOffset != 0) {
+ assert((MO.isImm() || MO.isGlobal()) &&
+ "Unexpected memory operand type");
+ if (MO.isImm()) {
+ MIB.addImm(MO.getImm() + PtrOffset);
+ } else {
+ MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset() + PtrOffset,
+ MO.getTargetFlags());
+ }
+ } else {
+ MIB.addOperand(MO);
+ }
+ }
+ }
}
static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
unsigned OpNo, ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
- MachineInstr *MI, const TargetInstrInfo &TII) {
+ MachineInstr *MI, const TargetInstrInfo &TII,
+ int PtrOffset = 0) {
// Omit the implicit operands, something BuildMI can't do.
MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
MI->getDebugLoc(), true);
MachineOperand &MO = MI->getOperand(i);
if (i == OpNo) {
assert(MO.isReg() && "Expected to fold into reg operand!");
- addOperands(MIB, MOs);
+ addOperands(MIB, MOs, PtrOffset);
} else {
MIB.addOperand(MO);
}
return MIB.addImm(0);
}
+MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
+ MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+ ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Align) const {
+ switch (MI->getOpcode()) {
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+    // Attempt to convert the load of the inserted vector into a folded load
+    // of a single float.
+ if (OpNum == 2) {
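+      // The insertps immediate is: bits [7:6] = CountS (source element
+      // index), bits [5:4] = CountD (destination element index) and
+      // bits [3:0] = ZMask (destination lanes to zero).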
+ unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+ unsigned ZMask = Imm & 15;
+ unsigned DstIdx = (Imm >> 4) & 3;
+ unsigned SrcIdx = (Imm >> 6) & 3;
+
+ unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+ if (Size <= RCSize && 4 <= Align) {
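+        // The memory form of (V)INSERTPS loads a single f32 and ignores
+        // CountS, so narrow the fold to the selected element: offset the
+        // address by SrcIdx * 4 bytes and keep only CountD/ZMask in the
+        // immediate.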
+ int PtrOffset = SrcIdx * 4;
+ unsigned NewImm = (DstIdx << 4) | ZMask;
+ unsigned NewOpCode =
+ (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
+ : X86::INSERTPSrm);
+ MachineInstr *NewMI =
+ FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
+ NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
+ return NewMI;
+ }
+ }
+ break;
+  }
+
+ return nullptr;
+}
+
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
return nullptr;
MachineInstr *NewMI = nullptr;
+
+ // Attempt to fold any custom cases we have.
+  if ((NewMI =
+           foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)))
+    return NewMI;
+
// Folding a memory location into the two-address part of a two-address
// instruction is different than folding it other places. It requires
// replacing the *two* registers with the memory location.
MachineBasicBlock::iterator &MBBI,
LiveVariables *LV) const;
+  /// Handles memory folding for special-case instructions, for instance
+  /// those requiring custom manipulation of the address.
+ MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI,
+ unsigned OpNum,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Align) const;
+
/// isFrameOperand - Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
; On X32, account for the argument's move to registers
; X32: movl 4(%esp), %eax
; CHECK-NOT: mov
-; CHECK: insertps $48
+; CHECK: vinsertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; CHECK-NEXT: ret
%1 = load <4 x float>, <4 x float>* %pb, align 16
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
; X32: movl 4(%esp), %eax
; CHECK-NOT: mov
;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK: vinsertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
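+;; (Imm $96 has CountS == 1, so the folded load reads from offset 4 and the
+;; new immediate keeps only CountD/ZMask, giving $32.)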
; CHECK-NEXT: ret
%1 = load <4 x float>, <4 x float>* %pb, align 16
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
; X32: movl 8(%esp), %ecx
; CHECK-NOT: mov
;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK: vinsertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
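+;; (Imm $192 has CountS == 3, so the folded load reads from offset 12 and the
+;; new immediate becomes $0.)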
; CHECK-NEXT: ret
%1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
%2 = load <4 x float>, <4 x float>* %1, align 16
--- /dev/null
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X32
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X64
+
+; Test for the case where insertps folded the load of the insertion element,
+; but a later optimization then modified the load.
+
+define <4 x float> @insertps_unfold(<4 x float>* %v0, <4 x float>* %v1) {
+; X32-LABEL: insertps_unfold:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT: addps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_unfold:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT: addps %xmm1, %xmm0
+; X64-NEXT: retq
+ %a = getelementptr inbounds <4 x float>, <4 x float>* %v1, i64 0, i64 1
+ %b = load float, float* %a, align 4
+ %c = insertelement <4 x float> undef, float %b, i32 0
+ %d = load <4 x float>, <4 x float>* %v1, align 16
+ %e = load <4 x float>, <4 x float>* %v0, align 16
+ %f = shufflevector <4 x float> %e, <4 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+ %g = fadd <4 x float> %c, %f
+ ret <4 x float> %g
+}
; X32-LABEL: insertps_from_vector_load:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT: insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load:
; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT: insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float>* %pb, align 16
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
; X32-LABEL: insertps_from_vector_load_offset:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X32-NEXT: insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load_offset:
; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X64-NEXT: insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float>* %pb, align 16
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: shll $4, %ecx
-; X32-NEXT: insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X32-NEXT: insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load_offset_2:
; X64: ## BB#0:
; X64-NEXT: shlq $4, %rsi
-; X64-NEXT: insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X64-NEXT: insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X64-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
%2 = load <4 x float>, <4 x float>* %1, align 16
; X32-LABEL: pr20087:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: pr20087:
; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X64-NEXT: retq
%load = load <4 x float> , <4 x float> *%ptr
%ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
ret <8 x float> %2
}
-; TODO stack_fold_insertps
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_insertps
+ ;CHECK: vinsertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK-NEXT: {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
;CHECK-LABEL: stack_fold_maxpd
}
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
-; TODO stack_fold_insertps
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_insertps
+ ;CHECK: insertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK-NEXT: {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
;CHECK-LABEL: stack_fold_maxpd