def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
+// These are needed to match a scalar load that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so the pattern loads
+// the whole vector and then extracts the scalar element that is used.
+def loadf32_128 : PatFrag<(ops node:$ptr),
+ (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>;
+def loadf64_128 : PatFrag<(ops node:$ptr),
+ (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>;
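+// As an illustration (the X86fand node and operand names here are only
+// assumptions for the example, not definitions made by this patch): a
+// pattern written with the plain 'loadf32' fragment, e.g.
+//   (X86fand FR32:$src1, (loadf32 addr:$src2))
+// matches any 4-byte f32 load even though the selected instruction performs
+// a 16-byte memory access. Using loadf32_128 restricts the match to
+//   (X86fand FR32:$src1,
+//            (f32 (vector_extract (loadv4f32 addr:$src2), (iPTR 0))))
+// so a fold is only possible when the access is already a full 128-bit load.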
+
// Like 'store', but always requires 128-bit vector alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+// These are needed to match a scalar memop that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so the pattern loads
+// the whole vector and then extracts the scalar element that is used.
+def memopfsf32_128 : PatFrag<(ops node:$ptr),
+ (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>;
+def memopfsf64_128 : PatFrag<(ops node:$ptr),
+ (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>;
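+// Note: unlike the loadf*_128 fragments above (used for the VEX-encoded
+// forms, which tolerate unaligned memory), these are built on 'memop', so
+// the underlying 128-bit load must also satisfy the SSE requirement of
+// 16-byte alignment or fast unaligned memory access before it can be folded.
+// This mirrors the load/memop split in sse12_fp_packed_scalar_logical_alias.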
+
// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
// 16-byte boundary.
// FIXME: 8 byte alignment for mmx reads is not required
{ X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 },
{ X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
{ X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
+
+ // FIXME: We should not fold scalar loads into the Fs* instructions:
+ // although they take scalar registers, they are really vector instructions
+ // whose memory forms perform vector-sized (128-bit) loads. Lowering should
+ // create the vector variants (the Fv* forms below) instead, to allow load
+ // folding.
{ X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
{ X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
{ X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },
{ X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 },
{ X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 },
{ X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 },
+
+ { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
+ { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },
+ { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 },
+ { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 },
+ { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 },
+ { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 },
+ { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 },
+ { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 },
{ X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
{ X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
{ X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
{ X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 },
{ X86::VDPPDrri, X86::VDPPDrmi, 0 },
{ X86::VDPPSrri, X86::VDPPSrmi, 0 },
- { X86::VFsANDNPDrr, X86::VFsANDNPDrm, 0 },
- { X86::VFsANDNPSrr, X86::VFsANDNPSrm, 0 },
- { X86::VFsANDPDrr, X86::VFsANDPDrm, 0 },
- { X86::VFsANDPSrr, X86::VFsANDPSrm, 0 },
- { X86::VFsORPDrr, X86::VFsORPDrm, 0 },
- { X86::VFsORPSrr, X86::VFsORPSrm, 0 },
- { X86::VFsXORPDrr, X86::VFsXORPDrm, 0 },
- { X86::VFsXORPSrr, X86::VFsXORPSrm, 0 },
+ // Do not add load-folding entries for the VFs* instructions: they have no
+ // scalar-load variants, and a folded load would be required to be 128 bits
+ // wide, so the load size would not match the scalar operand.
+ { X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 },
+ { X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 },
+ { X86::VFvANDPDrr, X86::VFvANDPDrm, 0 },
+ { X86::VFvANDPSrr, X86::VFvANDPSrm, 0 },
+ { X86::VFvORPDrr, X86::VFvORPDrm, 0 },
+ { X86::VFvORPSrr, X86::VFvORPSrm, 0 },
+ { X86::VFvXORPDrr, X86::VFvXORPDrm, 0 },
+ { X86::VFvXORPSrr, X86::VFvXORPSrm, 0 },
{ X86::VHADDPDrr, X86::VHADDPDrm, 0 },
{ X86::VHADDPSrr, X86::VHADDPSrm, 0 },
{ X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
multiclass sse12_fp_packed_scalar_logical_alias<
bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
- FR32, f32, f128mem, loadf32, SSEPackedSingle, itins, 0>,
- PS, VEX_4V;
+ FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
+ PS, VEX_4V;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
- FR64, f64, f128mem, loadf64, SSEPackedDouble, itins, 0>,
- PD, VEX_4V;
+ FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
+ PD, VEX_4V;
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
- f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
- PS;
+ f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;
defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
- f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
- PD;
+ f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
}
}
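+
+// A minimal sketch of how this multiclass is expected to be instantiated
+// (the opcode, SDNode, and itinerary names below are illustrative
+// assumptions, not part of this change):
+//   defm FsAND : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand,
+//                                                     SSE_BIT_ITINS_P>;
+// This produces FsANDPSrr/rm, FsANDPDrr/rm, and the VFsAND* VEX forms; with
+// the *_128 fragments, their 'rm' patterns only match full 128-bit loads.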
--- /dev/null
+; RUN: llc < %s -mcpu=x86-64 -mattr=sse2,sse-unaligned-mem | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=avx | FileCheck %s --check-prefix=AVX
+
+; Although we have the ability to fold an unaligned load with AVX
+; and under special conditions with some SSE implementations, we
+; cannot fold the load under any circumstances in these test
+; cases because they are not 16-byte loads. The load must be
+; executed as a scalar ('movs*') with a zero extension to
+; 128 bits and then used in the packed logical ('andp*') op.
+; PR22371 - http://llvm.org/bugs/show_bug.cgi?id=22371
+
+define double @load_double_no_fold(double %x, double %y) {
+; SSE2-LABEL: load_double_no_fold:
+; SSE2: ## BB#0:
+; SSE2-NEXT: cmplesd %xmm0, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: andpd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: load_double_no_fold:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmplesd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+
+ %cmp = fcmp oge double %x, %y
+ %zext = zext i1 %cmp to i32
+ %conv = sitofp i32 %zext to double
+ ret double %conv
+}
+
+define float @load_float_no_fold(float %x, float %y) {
+; SSE2-LABEL: load_float_no_fold:
+; SSE2: ## BB#0:
+; SSE2-NEXT: cmpless %xmm0, %xmm1
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: load_float_no_fold:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpless %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+
+ %cmp = fcmp oge float %x, %y
+ %zext = zext i1 %cmp to i32
+ %conv = sitofp i32 %zext to float
+ ret float %conv
+}
+
; RUN: llc < %s -relocation-model=static -mcpu=yonah | FileCheck %s
-; The double argument is at 4(esp) which is 16-byte aligned, allowing us to
-; fold the load into the andpd.
+; The double argument is at 4(esp), which is 16-byte aligned, but folding
+; the load into the 128-bit logical op would mean reading extra bytes beyond
+; the argument itself. Bad Things may happen when reading/processing
+; undefined bytes, so don't fold the load.
+; PR22371 / http://reviews.llvm.org/D7474
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin8"
%tmp = getelementptr { double, double }* %z, i32 0, i32 0 ; <double*> [#uses=1]
%tmp1 = load volatile double* %tmp, align 8 ; <double> [#uses=1]
%tmp2 = tail call double @fabs( double %tmp1 ) readnone ; <double> [#uses=1]
- ; CHECK: andpd{{.*}}4(%esp), %xmm
%tmp6 = fadd double %tmp4, %tmp2 ; <double> [#uses=1]
store volatile double %tmp6, double* %P, align 8
ret void
+
+; CHECK-LABEL: test:
+; CHECK: movsd {{.*}}G, %xmm{{.*}}
+; CHECK: andpd %xmm{{.*}}, %xmm{{.*}}
+; CHECK: movsd 4(%esp), %xmm{{.*}}
+; CHECK: andpd %xmm{{.*}}, %xmm{{.*}}
+
}
define void @test2() alignstack(16) nounwind {
entry:
- ; CHECK: andl{{.*}}$-16, %esp
+; CHECK-LABEL: test2:
+; CHECK: andl{{.*}}$-16, %esp
ret void
}
; Use a call to force a spill.
define <2 x double> @test3(<2 x double> %x, <2 x double> %y) alignstack(32) nounwind {
entry:
- ; CHECK: andl{{.*}}$-32, %esp
+; CHECK-LABEL: test3:
+; CHECK: andl{{.*}}$-32, %esp
call void @test2()
%A = fmul <2 x double> %x, %y
ret <2 x double> %A