[X86] Require 32-byte alignment for 32-byte VMOVNTs.

author Ahmed Bougacha <ahmed.bougacha@gmail.com>

Wed, 2 Sep 2015 23:25:39 +0000 (23:25 +0000)

committer Ahmed Bougacha <ahmed.bougacha@gmail.com>

Wed, 2 Sep 2015 23:25:39 +0000 (23:25 +0000)
author Ahmed Bougacha <ahmed.bougacha@gmail.com>
Wed, 2 Sep 2015 23:25:39 +0000 (23:25 +0000)
committer Ahmed Bougacha <ahmed.bougacha@gmail.com>
Wed, 2 Sep 2015 23:25:39 +0000 (23:25 +0000)
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp

index aeefa38f74d6cb6ebe4f1092020915339ce33700..71448feb9ab33fe4ea0fd668dcfc7d9394929e5d 100644 (file)
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -370,7 +370,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
                                          "cast");
        StoreInst *SI = Builder.CreateStore(Arg1, BC);
        SI->setMetadata(M->getMDKindID("nontemporal"), Node);
-      SI->setAlignment(16);
+      SI->setAlignment(32);
  
        // Remove intrinsic.
        CI->eraseFromParent();
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td

index b8ab1feed9ea7d30bd92fc36185dca7f430f7822..19bf986c33c951809c4ee3921686551eb21a0dc7 100644 (file)
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -626,12 +626,14 @@ def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
  
  def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
                                        (nontemporalstore node:$val, node:$ptr), [{
-  return cast<StoreSDNode>(N)->getAlignment() >= 16;
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  return St->getAlignment() >= St->getMemoryVT().getStoreSize();
  }]>;
  
  def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
                                          (nontemporalstore node:$val, node:$ptr), [{
-  return cast<StoreSDNode>(N)->getAlignment() < 16;
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  return St->getAlignment() < St->getMemoryVT().getStoreSize();
  }]>;
  
  def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
diff --git a/test/CodeGen/X86/avx2-nontemporal.ll b/test/CodeGen/X86/avx2-nontemporal.ll

index 544c096c52df07b14b1c0b7c6962d05875dabc2c..058358f13b86461c4ca7fa03fac256ae2f94ab35 100644 (file)
--- a/test/CodeGen/X86/avx2-nontemporal.ll
+++ b/test/CodeGen/X86/avx2-nontemporal.ll
@@ -4,15 +4,15 @@ define void @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E) {
  ; CHECK: vmovntps %y
    %cast = bitcast i8* %B to <8 x float>*
    %A2 = fadd <8 x float> %A, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x4200000000000000>
-  store <8 x float> %A2, <8 x float>* %cast, align 16, !nontemporal !0
+  store <8 x float> %A2, <8 x float>* %cast, align 32, !nontemporal !0
  ; CHECK: vmovntdq %y
    %cast1 = bitcast i8* %B to <4 x i64>*
    %E2 = add <4 x i64> %E, <i64 1, i64 2, i64 3, i64 4>
-  store <4 x i64> %E2, <4 x i64>* %cast1, align 16, !nontemporal !0
+  store <4 x i64> %E2, <4 x i64>* %cast1, align 32, !nontemporal !0
  ; CHECK: vmovntpd %y
    %cast2 = bitcast i8* %B to <4 x double>*
    %C2 = fadd <4 x double> %C, <double 0x0, double 0x0, double 0x0, double 0x4200000000000000>
-  store <4 x double> %C2, <4 x double>* %cast2, align 16, !nontemporal !0
+  store <4 x double> %C2, <4 x double>* %cast2, align 32, !nontemporal !0
    ret void
  }
  
diff --git a/test/CodeGen/X86/movntdq-no-avx.ll b/test/CodeGen/X86/movntdq-no-avx.ll

index cc35e201e6b3e07c01c07193e81a644967ffb6e4..2bf09dd6f5816e3362d5e1571b2d44185dff4a54 100644 (file)
--- a/test/CodeGen/X86/movntdq-no-avx.ll
+++ b/test/CodeGen/X86/movntdq-no-avx.ll
@@ -5,7 +5,7 @@
  
  define void @test(<2 x i64>* nocapture %a, <2 x i64> %b) nounwind optsize {
  entry:
-  store <2 x i64> %b, <2 x i64>* %a, align 16, !nontemporal !0
+  store <2 x i64> %b, <2 x i64>* %a, align 32, !nontemporal !0
    ret void
  }
  
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll

index 8c08b3c163c0249493ce15abecad80f1ca422c90..c9767f88488c9d7b0b0ecd4a4d0e7ff0f8470d04 100644 (file)
--- a/test/CodeGen/X86/nontemporal-2.ll
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
  
  ; Make sure that we generate non-temporal stores for the test cases below.
  ; We use xorps for zeroing, so domain information isn't available anymore.
@@ -300,4 +300,19 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
    ret void
  }
  
+; 256-bit NT stores require 256-bit (32-byte) alignment.
+; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
+; could even scalarize to movnti when the store is only 1-byte aligned: keeping
+; the store nontemporal is probably worth even a ~20-instruction scalarization.
+define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
+; CHECK-LABEL: test_unaligned_v8f32:
+; SSE: movntps %xmm
+; SSE: movntps %xmm
+; AVX-NOT: movnt
+; AVX: vmovups %ymm
+  %r = fadd <8 x float> %a, %b
+  store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
  !1 = !{i32 1}
author	Ahmed Bougacha <ahmed.bougacha@gmail.com>
	Wed, 2 Sep 2015 23:25:39 +0000 (23:25 +0000)
committer	Ahmed Bougacha <ahmed.bougacha@gmail.com>
	Wed, 2 Sep 2015 23:25:39 +0000 (23:25 +0000)
lib/IR/AutoUpgrade.cpp		patch \| blob \| history
lib/Target/X86/X86InstrFragmentsSIMD.td		patch \| blob \| history
test/CodeGen/X86/avx2-nontemporal.ll		patch \| blob \| history
test/CodeGen/X86/movntdq-no-avx.ll		patch \| blob \| history
test/CodeGen/X86/nontemporal-2.ll		patch \| blob \| history