test/CodeGen/X86/unaligned-32-byte-memops.ll

   1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB
   2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB
   3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
   4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL
   5
   6 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
   7 ; because that is slower than two 16-byte loads.
   8 ; Other AVX-capable chips don't have that problem.
   9
  10 define <8 x float> @load32bytes(<8 x float>* %Ap) {
  11   ; No RUN line enables the CHECK prefix, so the label is matched per prefix.
  12   ; SANDYB-LABEL: load32bytes
  13   ; SANDYB: vmovaps
  14   ; SANDYB: vinsertf128
  15   ; SANDYB: retq
  16   ; BTVER2-LABEL: load32bytes
  17   ; BTVER2: vmovups
  18   ; BTVER2: retq
  19   ; HASWELL-LABEL: load32bytes
  20   ; HASWELL: vmovups
  21   ; HASWELL: retq
  22   ; 32-byte vector load that is only guaranteed 16-byte alignment.
  23   %A = load <8 x float>* %Ap, align 16
  24   ret <8 x float> %A
  25 }
  26
  27 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
  28 ; because that is slower than two 16-byte stores.
  29 ; Other AVX-capable chips don't have that problem.
  30
  31 define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
  32   ; No RUN line enables the CHECK prefix, so the label is matched per prefix.
  33   ; SANDYB-LABEL: store32bytes
  34   ; SANDYB: vextractf128
  35   ; SANDYB: vmovaps
  36   ; SANDYB: retq
  37   ; BTVER2-LABEL: store32bytes
  38   ; BTVER2: vmovups
  39   ; BTVER2: retq
  40   ; HASWELL-LABEL: store32bytes
  41   ; HASWELL: vmovups
  42   ; HASWELL: retq
  43   ; 32-byte vector store that is only guaranteed 16-byte alignment.
  44   store <8 x float> %A, <8 x float>* %P, align 16
  45   ret void
  46 }