Size, Alignment, /*AllowCommute=*/true);
}
-static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
- const MachineFunction &MF) {
+/// Check if \p LoadMI is a partial register load that we can't fold into \p UserMI
+/// because the latter uses contents that wouldn't be defined in the folded
+/// version. For instance, this transformation isn't legal:
+/// movss (%rdi), %xmm0
+/// addps %xmm0, %xmm0
+/// ->
+/// addps (%rdi), %xmm0
+///
+/// But this one is:
+/// movss (%rdi), %xmm0
+/// addss %xmm0, %xmm0
+/// ->
+/// addss (%rdi), %xmm0
+///
+static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
+ const MachineInstr &UserMI,
+ const MachineFunction &MF) {
unsigned Opc = LoadMI.getOpcode();
+ unsigned UserOpc = UserMI.getOpcode();
unsigned RegSize =
MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
- if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
+ if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) {
// These instructions only load 32 bits, we can't fold them if the
- // destination register is wider than 32 bits (4 bytes).
- return true;
+ // destination register is wider than 32 bits (4 bytes) and the user isn't a
+ // scalar (SS) instruction.
+ switch (UserOpc) {
+ case X86::ADDSSrr_Int: case X86::VADDSSrr_Int:
+ case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int:
+ case X86::MULSSrr_Int: case X86::VMULSSrr_Int:
+ case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int:
+ return false;
+ default:
+ return true;
+ }
+ }
- if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
+ if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) {
// These instructions only load 64 bits, we can't fold them if the
- // destination register is wider than 64 bits (8 bytes).
- return true;
+ // destination register is wider than 64 bits (8 bytes) and the user isn't a
+ // scalar (SD) instruction.
+ switch (UserOpc) {
+ case X86::ADDSDrr_Int: case X86::VADDSDrr_Int:
+ case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int:
+ case X86::MULSDrr_Int: case X86::VMULSDrr_Int:
+ case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int:
+ return false;
+ default:
+ return true;
+ }
+ }
return false;
}
unsigned NumOps = LoadMI->getDesc().getNumOperands();
int FrameIndex;
if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
- if (isPartialRegisterLoad(*LoadMI, MF))
+ if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
return nullptr;
return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex);
}
break;
}
default: {
- if (isPartialRegisterLoad(*LoadMI, MF))
+ if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
return nullptr;
// Folding a normal load. Just copy the load's address operands.
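
For reference, the distinction the new check draws maps onto the simplest source-level shapes below. This is a minimal C sketch, not part of the patch; the function names fold_ok/fold_not_ok and the use of <xmmintrin.h> intrinsics are illustrative assumptions only.

  #include <xmmintrin.h>

  // Scalar user: only element 0 of the loaded value is read, so the 4-byte
  // load can safely become a memory operand of addss/vaddss.
  __m128 fold_ok(__m128 a, const float *p) {
    return _mm_add_ss(a, _mm_load_ss(p));
  }

  // Packed user: all four elements are read. _mm_load_ss defines only
  // element 0 (and zeros the rest), so folding the load into addps would
  // both widen the memory access and change the upper lanes' values.
  __m128 fold_not_ok(__m128 a, const float *p) {
    return _mm_add_ps(a, _mm_load_ss(p));
  }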
--- /dev/null
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+
+; Verify that we're folding the load into the math instruction.
+; This pattern is generated from the simplest intrinsics usage:
+; _mm_add_ss(a, _mm_load_ss(b));
+
+define <4 x float> @addss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: addss:
+; SSE: # BB#0:
+; SSE-NEXT: addss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: addss:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fadd float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @addsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: addsd:
+; SSE: # BB#0:
+; SSE-NEXT: addsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: addsd:
+; AVX: # BB#0:
+; AVX-NEXT: vaddsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fadd double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
+
+define <4 x float> @subss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: subss:
+; SSE: # BB#0:
+; SSE-NEXT: subss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: subss:
+; AVX: # BB#0:
+; AVX-NEXT: vsubss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fsub float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @subsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: subsd:
+; SSE: # BB#0:
+; SSE-NEXT: subsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: subsd:
+; AVX: # BB#0:
+; AVX-NEXT: vsubsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fsub double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
+
+define <4 x float> @mulss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: mulss:
+; SSE: # BB#0:
+; SSE-NEXT: mulss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: mulss:
+; AVX: # BB#0:
+; AVX-NEXT: vmulss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fmul float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @mulsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: mulsd:
+; SSE: # BB#0:
+; SSE-NEXT: mulsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: mulsd:
+; AVX: # BB#0:
+; AVX-NEXT: vmulsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fmul double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
+
+define <4 x float> @divss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: divss:
+; SSE: # BB#0:
+; SSE-NEXT: divss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: divss:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fdiv float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @divsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: divsd:
+; SSE: # BB#0:
+; SSE-NEXT: divsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: divsd:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fdiv double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
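
The tests above cover only the foldable (scalar-user) cases. As a complementary, hypothetical sketch that is not part of this patch's test file, the IR below has a packed user of the same 4-byte load; under the check added here it would be expected to keep the scalar load (movss) separate rather than folding it into the packed addps.

  ; Packed user of a scalar load: the load must not be folded, since addps
  ; would read 16 bytes where the original load only reads 4.
  define <4 x float> @addps_no_fold(<4 x float> %va, float* %pb) {
    %b = load float, float* %pb
    %vb = insertelement <4 x float> zeroinitializer, float %b, i32 0
    %vr = fadd <4 x float> %va, %vb
    ret <4 x float> %vr
  }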