if (User->getOpcode() == ISD::FNEG)
return Op;
- SDValue Op0 = Op.getOperand(0);
- bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
-
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- // Assume scalar op for initialization; update for vector if needed.
- // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
- // generate a 16-byte vector constant and logic op even for the scalar case.
- // Using a 16-byte mask allows folding the load of the mask with
- // the logic op, so it can save (~4 bytes) on code size.
- MVT EltVT = VT;
- unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+
// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
// decide if we should generate a 16-byte constant mask when we only need 4 or
// 8 bytes for the scalar case.
+
+ MVT LogicVT;
+ MVT EltVT;
+ unsigned NumElts;
+
if (VT.isVector()) {
+ LogicVT = VT;
EltVT = VT.getVectorElementType();
NumElts = VT.getVectorNumElements();
+ } else {
+ // There are no scalar bitwise logical SSE/AVX instructions, so we
+ // generate a 16-byte vector constant and logic op even for the scalar case.
+ // Using a 16-byte mask allows the mask load to be folded into
+ // the logic op, which can save ~4 bytes of code size.
+ LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+ EltVT = VT;
+ NumElts = (VT == MVT::f64) ? 2 : 4;
}
unsigned EltBits = EltVT.getSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
- SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, Alignment);
- if (VT.isVector()) {
- // For a vector, cast operands to a vector type, perform the logic op,
- // and cast the result back to the original value type.
- MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
- SDValue MaskCasted = DAG.getBitcast(VecVT, Mask);
- SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0))
- : DAG.getBitcast(VecVT, Op0);
- unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
- return DAG.getBitcast(VT,
- DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
- }
-
- // If not vector, then scalar.
- unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+ SDValue Op0 = Op.getOperand(0);
+ bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+ unsigned LogicOp =
+ IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
- return DAG.getNode(BitOp, dl, VT, Operand, Mask);
+
+ if (VT.isVector())
+ return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+
+ // For the scalar case, extend to a 128-bit vector, perform the logic op,
+ // and extract the scalar result back out.
+ Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
+ SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
+ DAG.getIntPtrConstant(0, dl));
}
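
For reference, the FAND/FOR/FXOR nodes selected above compute ordinary sign-bit manipulation; the patch only changes where that happens (on 128-bit vectors instead of scalars) so the mask load can be folded. A minimal scalar sketch of the bit math, in illustrative C++ that is not part of the patch:

#include <cstdint>
#include <cstring>

static const uint32_t SignBit = 0x80000000u;

// FABS -> FAND with a sign-clear mask: |x|
float fabsSketch(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(X));
  Bits &= ~SignBit;
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}

// FNEG -> FXOR with a sign-bit mask: -x
float fnegSketch(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(X));
  Bits ^= SignBit;
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}

// FNABS (fneg of fabs) -> FOR with a sign-bit mask: -|x|
float fnabsSketch(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(X));
  Bits |= SignBit;
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}
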
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
Constant *C = ConstantVector::get(CV);
auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
- SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+
+ // Perform all logic operations as 16-byte vectors because there are no
+ // scalar FP logic instructions in SSE. This allows load folding of the
+ // constants into the logic instructions.
+ MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+ SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
- SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+ Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
+ SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
// Next, clear the sign bit from the first operand (magnitude).
// If it's a constant, we can clear it here.
APFloat APF = Op0CN->getValueAPF();
// If the magnitude is a positive zero, the sign bit alone is enough.
if (APF.isPosZero())
- return SignBit;
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
+ DAG.getIntPtrConstant(0, dl));
APF.clearSign();
CV[0] = ConstantFP::get(*Context, APF);
} else {
}
C = ConstantVector::get(CV);
CPIdx = DAG.getConstantPool(C, PtrVT, 16);
- SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
// If the magnitude operand wasn't a constant, we need to AND out the sign.
- if (!isa<ConstantFPSDNode>(Op0))
- Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
-
+ if (!isa<ConstantFPSDNode>(Op0)) {
+ Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
+ Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
+ }
// OR the magnitude value with the sign bit.
- return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+ Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
+ DAG.getIntPtrConstant(0, dl));
}
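
The sequence built above (isolate the sign bit of Op1, clear the sign bit of Op0, then OR the two) is the standard copysign bit trick. A scalar sketch in illustrative C++, not part of the patch; the lowering performs the same steps on v2f64/v4f32 so the constant-pool masks fold into the 16-byte logic instructions:

#include <cstdint>
#include <cstring>

double copysignSketch(double Mag, double Sgn) {
  const uint64_t SignBit = 0x8000000000000000ULL;
  uint64_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sgn, sizeof(S));
  // First FAND: keep only the sign bit of the sign operand.
  // Second FAND: clear the sign bit of the magnitude operand.
  // FOR: combine the two halves.
  uint64_t R = (M & ~SignBit) | (S & SignBit);
  std::memcpy(&Mag, &R, sizeof(R));
  return Mag;
}
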
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
{ X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
{ X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
- // FIXME: We should not be folding Fs* scalar loads into vector
- // instructions because the vector instructions require vector-sized
- // loads. Lowering should create vector-sized instructions (the Fv*
- // variants below) to allow load folding.
- { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
- { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
- { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },
- { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 },
- { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 },
- { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 },
- { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 },
- { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 },
+ // Do not fold loads into the Fs* scalar logical ops: there are no
+ // scalar-load variants of these instructions. A folded load would have to
+ // be 128 bits wide, so the load size would not match (see sketch below).
{ X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
{ X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },
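
The constraint in the comment above is concrete: a folded memory operand on these instructions always performs a 16-byte access, while the Fs* forms conceptually read a scalar. A hypothetical, self-contained C++ illustration (the buffer size, offsets, and accessIsLegal helper are invented for this sketch):

#include <cassert>
#include <cstddef>

// Does an access of Width bytes at Offset stay inside a BufSize-byte buffer?
static bool accessIsLegal(size_t Offset, size_t Width, size_t BufSize) {
  return Offset + Width <= BufSize;
}

int main() {
  const size_t PageSize = 4096;          // e.g. one mapped page
  const size_t LastFloat = PageSize - 4; // a 4-byte float at the very end
  assert(accessIsLegal(LastFloat, 4, PageSize));   // movss: fine
  assert(!accessIsLegal(LastFloat, 16, PageSize)); // folded 128-bit load: faults
  return 0;
}
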
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
PD, VEX_4V;
+
+ defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
+ PS, VEX_4V, VEX_L;
+
+ defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
+ PD, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
; PR2656
-; CHECK: {{xorps.*sp}}
-; CHECK-NOT: {{xorps.*sp}}
-
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin9.4.0"
%struct.anon = type <{ float, float }>
@.str = internal constant [17 x i8] c"pt: %.0f, %.0f\0A\00\00" ; <[17 x i8]*> [#uses=1]
+; We cannot fold either stack load into an 'xor' instruction because that
+; would change what should be a 4-byte load into a 16-byte load.
+; The 16-byte constant load could be folded into either 'xor' instruction,
+; but it has more than one use, so it is loaded into a register instead.
+
define void @foo(%struct.anon* byval %p) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT: xorps %xmm2, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-NEXT: xorps %xmm2, %xmm1
entry:
%tmp = getelementptr %struct.anon, %struct.anon* %p, i32 0, i32 0 ; <float*> [#uses=1]
%tmp1 = load float, float* %tmp ; <float> [#uses=1]
}
declare i32 @printf(...)
+
+; We cannot fold the load from the stack into the 'and' instruction because
+; that would change an 8-byte load into a 16-byte load (an illegal memory
+; access). We can fold the load of the constant because it is a 16-byte
+; vector constant.
+
+define double @PR22371(double %x) {
+; CHECK-LABEL: PR22371:
+; CHECK: movsd 16(%esp), %xmm0
+; CHECK-NEXT: andpd LCPI1_0, %xmm0
+; CHECK-NEXT: movlpd %xmm0, (%esp)
+ %call = tail call double @fabs(double %x) #0
+ ret double %call
+}
+
+declare double @fabs(double) #0
+attributes #0 = { readnone }
+
define float @int1(float %a, float %b) {
; X32-LABEL: @int1
-; X32: movss 12(%esp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movss 8(%esp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: andps .LCPI2_0, %xmm1
-; X32-NEXT: andps .LCPI2_1, %xmm0
-; X32-NEXT: orps %xmm1, %xmm0
-; X32-NEXT: movss %xmm0, (%esp)
+; X32: movss 8(%esp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: andps .LCPI2_0, %xmm0
+; X32-NEXT: movss 12(%esp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: andps .LCPI2_1, %xmm1
+; X32-NEXT: orps %xmm0, %xmm1
+; X32-NEXT: movss %xmm1, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
; X32-NEXT: retl
define double @int2(double %a, float %b, float %c) {
; X32-LABEL: @int2
-; X32: movsd 8(%ebp), %xmm0 {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: movss 16(%ebp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: addss 20(%ebp), %xmm1
-; X32-NEXT: andpd .LCPI3_0, %xmm0
-; X32-NEXT: cvtss2sd %xmm1, %xmm1
-; X32-NEXT: andpd .LCPI3_1, %xmm1
-; X32-NEXT: orpd %xmm0, %xmm1
-; X32-NEXT: movsd %xmm1, (%esp)
+; X32: movss 16(%ebp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: addss 20(%ebp), %xmm0
+; X32-NEXT: movsd 8(%ebp), %xmm1 {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: andpd .LCPI3_0, %xmm1
+; X32-NEXT: cvtss2sd %xmm0, %xmm0
+; X32-NEXT: andpd .LCPI3_1, %xmm0
+; X32-NEXT: orpd %xmm1, %xmm0
+; X32-NEXT: movlpd %xmm0, (%esp)
; X32-NEXT: fldl (%esp)
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
;
; X64-LABEL: @int2
; X64: addss %xmm2, %xmm1
-; X64-NEXT: andpd .LCPI3_0(%rip), %xmm0
; X64-NEXT: cvtss2sd %xmm1, %xmm1
-; X64-NEXT: andpd .LCPI3_1(%rip), %xmm1
+; X64-NEXT: andpd .LCPI3_0(%rip), %xmm1
+; X64-NEXT: andpd .LCPI3_1(%rip), %xmm0
; X64-NEXT: orpd %xmm1, %xmm0
; X64-NEXT: retq
%tmp1 = fadd float %b, %c
define <2 x double> @fabs_v2f64(<2 x double> %p)
{
; CHECK-LABEL: fabs_v2f64
- ; CHECK: vandps
+ ; CHECK: vandpd
%t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
ret <2 x double> %t
}
define <4 x double> @fabs_v4f64(<4 x double> %p)
{
; CHECK-LABEL: fabs_v4f64
- ; CHECK: vandps
+ ; CHECK: vandpd
%t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
ret <4 x double> %t
}