make fast unaligned memory accesses implicit with SSE4.2 or SSE4a

author Sanjay Patel <spatel@rotateright.com>

Tue, 25 Aug 2015 16:29:21 +0000 (16:29 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Tue, 25 Aug 2015 16:29:21 +0000 (16:29 +0000)
author Sanjay Patel <spatel@rotateright.com>
Tue, 25 Aug 2015 16:29:21 +0000 (16:29 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Tue, 25 Aug 2015 16:29:21 +0000 (16:29 +0000)
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp

index 565ba1ded7e53a1441c1030f161131853c60422f..b23b3c0e99a4de8da8affa6f7b04e60d4de9d0a9 100644 (file)
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -192,6 +192,13 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
    // Parse features string and set the CPU.
    ParseSubtargetFeatures(CPUName, FullFS);
  
+  // All CPUs that implement SSE4.2 or SSE4A support reasonably fast unaligned
+  // accesses of 16 bytes or fewer. These features were introduced with
+  // Intel's Nehalem/Silvermont and AMD's Family10h micro-architectures,
+  // respectively.
+  if (hasSSE42() || hasSSE4A())
+    IsUAMemUnder32Slow = false;
+  
    InstrItins = getInstrItineraryForCPU(CPUName);
  
    // It's important to keep the MCSubtargetInfo feature bits in sync with
diff --git a/test/CodeGen/X86/slow-unaligned-mem.ll b/test/CodeGen/X86/slow-unaligned-mem.ll

index f8688e3435cd0cf9cba5d62f8f80c346811e1c84..27cbef681b7e5071d413cb349aaf9175c010bd60 100644 (file)
--- a/test/CodeGen/X86/slow-unaligned-mem.ll
+++ b/test/CodeGen/X86/slow-unaligned-mem.ll
@@ -55,6 +55,11 @@
  ; Slow chips use 4-byte stores. Fast chips with SSE or later use something other than 4-byte stores.
  ; Chips that don't have SSE use 4-byte stores either way, so they're not tested.
  
+; Also verify that either SSE4.2 or SSE4a implies fast unaligned accesses.
+
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4.2       2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4a        2>&1 | FileCheck %s --check-prefix=FAST
+
  define void @store_zeros(i8* %a) {
  ; SLOW-NOT: not a recognized processor
  ; SLOW-LABEL: store_zeros:
author	Sanjay Patel <spatel@rotateright.com>
	Tue, 25 Aug 2015 16:29:21 +0000 (16:29 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Tue, 25 Aug 2015 16:29:21 +0000 (16:29 +0000)
lib/Target/X86/X86Subtarget.cpp		patch \| blob \| history
test/CodeGen/X86/slow-unaligned-mem.ll		patch \| blob \| history