fix invalid load folding with SSE/AVX FP logical instructions (PR22371)

author Sanjay Patel <spatel@rotateright.com>

Tue, 28 Jul 2015 00:48:32 +0000 (00:48 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Tue, 28 Jul 2015 00:48:32 +0000 (00:48 +0000)
author Sanjay Patel <spatel@rotateright.com>
Tue, 28 Jul 2015 00:48:32 +0000 (00:48 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Tue, 28 Jul 2015 00:48:32 +0000 (00:48 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 8ed9733cf2df8db8f37711244492540941969ef4..109047751791be80a6644935a70ebfa8a84ad375 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12677,24 +12677,29 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
        if (User->getOpcode() == ISD::FNEG)
          return Op;
  
-  SDValue Op0 = Op.getOperand(0);
-  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
-
    SDLoc dl(Op);
    MVT VT = Op.getSimpleValueType();
-  // Assume scalar op for initialization; update for vector if needed.
-  // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
-  // generate a 16-byte vector constant and logic op even for the scalar case.
-  // Using a 16-byte mask allows folding the load of the mask with
-  // the logic op, so it can save (~4 bytes) on code size.
-  MVT EltVT = VT;
-  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+
    // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
    // decide if we should generate a 16-byte constant mask when we only need 4 or
    // 8 bytes for the scalar case.
+
+  MVT LogicVT;
+  MVT EltVT;
+  unsigned NumElts;
+  
    if (VT.isVector()) {
+    LogicVT = VT;
      EltVT = VT.getVectorElementType();
      NumElts = VT.getVectorNumElements();
+  } else {
+    // There are no scalar bitwise logical SSE/AVX instructions, so we
+    // generate a 16-byte vector constant and logic op even for the scalar case.
+    // Using a 16-byte mask allows folding the load of the mask with
+    // the logic op, so it can save (~4 bytes) on code size.
+    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+    EltVT = VT;
+    NumElts = (VT == MVT::f64) ? 2 : 4;
    }
  
    unsigned EltBits = EltVT.getSizeInBits();
@@ -12707,26 +12712,25 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
    unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
-  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+  SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, Alignment);
  
-  if (VT.isVector()) {
-    // For a vector, cast operands to a vector type, perform the logic op,
-    // and cast the result back to the original value type.
-    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-    SDValue MaskCasted = DAG.getBitcast(VecVT, Mask);
-    SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0))
-                              : DAG.getBitcast(VecVT, Op0);
-    unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
-    return DAG.getBitcast(VT,
-                          DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
-  }
-
-  // If not vector, then scalar.
-  unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+  SDValue Op0 = Op.getOperand(0);
+  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+  unsigned LogicOp =
+    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
    SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
-  return DAG.getNode(BitOp, dl, VT, Operand, Mask);
+
+  if (VT.isVector())
+    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+
+  // For the scalar case extend to a 128-bit vector, perform the logic op,
+  // and extract the scalar result back out.
+  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
+  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
+                     DAG.getIntPtrConstant(0, dl));
  }
  
  static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
@@ -12766,10 +12770,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
    Constant *C = ConstantVector::get(CV);
    auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
-  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+
+  // Perform all logic operations as 16-byte vectors because there are no
+  // scalar FP logic instructions in SSE. This allows load folding of the
+  // constants into the logic instructions.
+  MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+  SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
                                MachinePointerInfo::getConstantPool(),
                                false, false, false, 16);
-  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+  Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
+  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
  
    // Next, clear the sign bit from the first operand (magnitude).
    // If it's a constant, we can clear it here.
@@ -12777,7 +12787,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
      APFloat APF = Op0CN->getValueAPF();
      // If the magnitude is a positive zero, the sign bit alone is enough.
      if (APF.isPosZero())
-      return SignBit;
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
+                         DAG.getIntPtrConstant(0, dl));
      APF.clearSign();
      CV[0] = ConstantFP::get(*Context, APF);
    } else {
@@ -12787,15 +12798,18 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
    }
    C = ConstantVector::get(CV);
    CPIdx = DAG.getConstantPool(C, PtrVT, 16);
-  SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+  SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
-  if (!isa<ConstantFPSDNode>(Op0))
-    Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
-
+  if (!isa<ConstantFPSDNode>(Op0)) {
+    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
+    Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
+  }
    // OR the magnitude value with the sign bit.
-  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+  Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
+                     DAG.getIntPtrConstant(0, dl));
  }
  
  static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index 65df840a935ef62ea6f4d060e06ef8c4090ac198..98697e834e476f97ee8dc497378edc8ca27b998a 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -959,18 +959,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::DPPDrri,         X86::DPPDrmi,       TB_ALIGN_16 },
      { X86::DPPSrri,         X86::DPPSrmi,       TB_ALIGN_16 },
  
-    // FIXME: We should not be folding Fs* scalar loads into vector
-    // instructions because the vector instructions require vector-sized
-    // loads. Lowering should create vector-sized instructions (the Fv*
-    // variants below) to allow load folding.
-    { X86::FsANDNPDrr,      X86::FsANDNPDrm,    TB_ALIGN_16 },
-    { X86::FsANDNPSrr,      X86::FsANDNPSrm,    TB_ALIGN_16 },
-    { X86::FsANDPDrr,       X86::FsANDPDrm,     TB_ALIGN_16 },
-    { X86::FsANDPSrr,       X86::FsANDPSrm,     TB_ALIGN_16 },
-    { X86::FsORPDrr,        X86::FsORPDrm,      TB_ALIGN_16 },
-    { X86::FsORPSrr,        X86::FsORPSrm,      TB_ALIGN_16 },
-    { X86::FsXORPDrr,       X86::FsXORPDrm,     TB_ALIGN_16 },
-    { X86::FsXORPSrr,       X86::FsXORPSrm,     TB_ALIGN_16 },
+    // Do not fold Fs* scalar logical op loads because there are no scalar
+    // load variants for these instructions. When folded, the load is required
+    // to be 128-bits, so the load size would not match.
  
      { X86::FvANDNPDrr,      X86::FvANDNPDrm,    TB_ALIGN_16 },
      { X86::FvANDNPSrr,      X86::FvANDNPSrm,    TB_ALIGN_16 },
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index 5a3fbca9ff7294c73fc69b802e6937b2a75f68a1..1a77d0efdd3f233f8bb3ff30694cd5c4ba23af7e 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2924,6 +2924,14 @@ multiclass sse12_fp_packed_vector_logical_alias<
    defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
          VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
          PD, VEX_4V;
+
+  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+        VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
+        PS, VEX_4V, VEX_L;
+        
+  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+        VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
+        PD, VEX_4V, VEX_L;
    }
  
    let Constraints = "$src1 = $dst" in {
diff --git a/test/CodeGen/X86/pr2656.ll b/test/CodeGen/X86/pr2656.ll

index 9a162d77ef48c0b3eb3d3272f2800473b1894f6a..095ab831d48d7fb7d64f8014c7e4da0ac69f23ca 100644 (file)
--- a/test/CodeGen/X86/pr2656.ll
+++ b/test/CodeGen/X86/pr2656.ll
@@ -1,15 +1,24 @@
  ; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
  ; PR2656
  
-; CHECK:     {{xorps.*sp}}
-; CHECK-NOT: {{xorps.*sp}}
-
  target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
  target triple = "i686-apple-darwin9.4.0"
         %struct.anon = type <{ float, float }>
  @.str = internal constant [17 x i8] c"pt: %.0f, %.0f\0A\00\00"         ; <[17 x i8]*> [#uses=1]
  
+; We can not fold either stack load into an 'xor' instruction because that
+; would change what should be a 4-byte load into a 16-byte load.
+; We can fold the 16-byte constant load into either 'xor' instruction,
+; but we do not. It has more than one use, so it gets loaded into a register.
+
  define void @foo(%struct.anon* byval %p) nounwind {
+; CHECK-LABEL: foo:
+; CHECK:         movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT:    xorps %xmm2, %xmm0
+; CHECK-NEXT:    cvtss2sd %xmm0, %xmm0
+; CHECK-NEXT:    xorps %xmm2, %xmm1
  entry:
         %tmp = getelementptr %struct.anon, %struct.anon* %p, i32 0, i32 0               ; <float*> [#uses=1]
         %tmp1 = load float, float* %tmp         ; <float> [#uses=1]
@@ -24,3 +33,20 @@ entry:
  }
  
  declare i32 @printf(...)
+
+; We can not fold the load from the stack into the 'and' instruction because
+; that changes an 8-byte load into a 16-byte load (illegal memory access).
+; We can fold the load of the constant because it is a 16-byte vector constant.
+
+define double @PR22371(double %x) {
+; CHECK-LABEL: PR22371:
+; CHECK:       movsd  16(%esp), %xmm0
+; CHECK-NEXT:  andpd  LCPI1_0, %xmm0
+; CHECK-NEXT:  movlpd  %xmm0, (%esp)
+  %call = tail call double @fabs(double %x) #0
+  ret double %call
+}
+
+declare double @fabs(double) #0
+attributes #0 = { readnone }
+
diff --git a/test/CodeGen/X86/sse-fcopysign.ll b/test/CodeGen/X86/sse-fcopysign.ll

index 25634b5472aa90baca3be16411b4206f5f7493e4..8a5462bea82d0383013ec8bc6e4764fb3a570714 100644 (file)
--- a/test/CodeGen/X86/sse-fcopysign.ll
+++ b/test/CodeGen/X86/sse-fcopysign.ll
@@ -55,12 +55,12 @@ declare double @copysign(double, double)
  
  define float @int1(float %a, float %b) {
  ; X32-LABEL: @int1
-; X32:       movss 12(%esp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT:  movss  8(%esp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT:  andps .LCPI2_0, %xmm1
-; X32-NEXT:  andps .LCPI2_1, %xmm0
-; X32-NEXT:  orps  %xmm1, %xmm0
-; X32-NEXT:  movss %xmm0, (%esp)
+; X32:       movss  8(%esp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:  andps .LCPI2_0, %xmm0
+; X32-NEXT:  movss 12(%esp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:  andps .LCPI2_1, %xmm1
+; X32-NEXT:  orps  %xmm0, %xmm1
+; X32-NEXT:  movss %xmm1, (%esp)
  ; X32-NEXT:  flds  (%esp)
  ; X32-NEXT:  popl %eax
  ; X32-NEXT:  retl
@@ -76,14 +76,14 @@ define float @int1(float %a, float %b) {
  
  define double @int2(double %a, float %b, float %c) {
  ; X32-LABEL: @int2
-; X32:       movsd  8(%ebp), %xmm0 {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:  movss 16(%ebp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT:  addss 20(%ebp), %xmm1
-; X32-NEXT:  andpd .LCPI3_0, %xmm0
-; X32-NEXT:  cvtss2sd %xmm1, %xmm1
-; X32-NEXT:  andpd .LCPI3_1, %xmm1
-; X32-NEXT:  orpd  %xmm0, %xmm1
-; X32-NEXT:  movsd %xmm1, (%esp)
+; X32:       movss 16(%ebp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:  addss 20(%ebp), %xmm0
+; X32-NEXT:  movsd  8(%ebp), %xmm1 {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT:  andpd .LCPI3_0, %xmm1
+; X32-NEXT:  cvtss2sd %xmm0, %xmm0
+; X32-NEXT:  andpd .LCPI3_1, %xmm0
+; X32-NEXT:  orpd  %xmm1, %xmm0
+; X32-NEXT:  movlpd %xmm0, (%esp)
  ; X32-NEXT:  fldl  (%esp)
  ; X32-NEXT:  movl %ebp, %esp
  ; X32-NEXT:  popl %ebp
@@ -91,9 +91,9 @@ define double @int2(double %a, float %b, float %c) {
  ;
  ; X64-LABEL: @int2
  ; X64:       addss %xmm2, %xmm1
-; X64-NEXT:  andpd .LCPI3_0(%rip), %xmm0
  ; X64-NEXT:  cvtss2sd %xmm1, %xmm1
-; X64-NEXT:  andpd .LCPI3_1(%rip), %xmm1
+; X64-NEXT:  andpd .LCPI3_0(%rip), %xmm1
+; X64-NEXT:  andpd .LCPI3_1(%rip), %xmm0
  ; X64-NEXT:  orpd %xmm1, %xmm0
  ; X64-NEXT:  retq
    %tmp1 = fadd float %b, %c
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll

index bfefbcf5ebd3cb148040a6a96f580000c37b1b01..960b5f27cf5357edc9df7cc6333fd9bf5260f89e 100644 (file)
--- a/test/CodeGen/X86/vec_fabs.ll
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -4,7 +4,7 @@
  define <2 x double> @fabs_v2f64(<2 x double> %p)
  {
    ; CHECK-LABEL: fabs_v2f64
-  ; CHECK: vandps
+  ; CHECK: vandpd
    %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
    ret <2 x double> %t
  }
@@ -22,7 +22,7 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
  define <4 x double> @fabs_v4f64(<4 x double> %p)
  {
    ; CHECK-LABEL: fabs_v4f64
-  ; CHECK: vandps
+  ; CHECK: vandpd
    %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
    ret <4 x double> %t
  }
author	Sanjay Patel <spatel@rotateright.com>
	Tue, 28 Jul 2015 00:48:32 +0000 (00:48 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Tue, 28 Jul 2015 00:48:32 +0000 (00:48 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86InstrInfo.cpp		patch \| blob \| history
lib/Target/X86/X86InstrSSE.td		patch \| blob \| history
test/CodeGen/X86/pr2656.ll		patch \| blob \| history
test/CodeGen/X86/sse-fcopysign.ll		patch \| blob \| history
test/CodeGen/X86/vec_fabs.ll		patch \| blob \| history