; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX2ONLY
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.

define <8 x i16> @test1(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}
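; A minimal set of checks for the expansion described above: only the packed
; multiply and the return are pinned down ("pmullw" also matches the AVX form
; vpmullw as a substring).
; CHECK-LABEL: test1
; CHECK: pmullw
; CHECK-NEXT: ret
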
define <8 x i16> @test2(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}
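; Same minimal pattern as test1: the zero and undef amounts should still fold
; into a single packed multiply.
; CHECK-LABEL: test2
; CHECK: pmullw
; CHECK-NEXT: ret
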
; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.

define <4 x i32> @test3(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}
; CHECK-LABEL: test3
; CHECK-NOT: cvttps2dq
; SSE: pmulld
; AVX2: vpsllvd
; CHECK: ret

define <4 x i32> @test4(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}
; CHECK-LABEL: test4
; CHECK-NOT: cvttps2dq
; SSE: pmulld
; AVX2: vpsllvd
; CHECK: ret

; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw instead.

define <16 x i16> @test5(<16 x i16> %a) {
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}
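; Checks for the split described above: two packed multiplies without AVX2,
; one with. The code between the two halves is deliberately not pinned down.
; CHECK-LABEL: test5
; SSE: pmullw
; SSE: pmullw
; AVX2: vpmullw
; AVX2-NOT: vpmullw
; CHECK: ret
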
; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.

define <8 x i32> @test6(<8 x i32> %a) {
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}
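; Checks for the split described above: two pmulld without AVX2, a single
; variable shift with it.
; CHECK-LABEL: test6
; SSE: pmulld
; SSE: pmulld
; AVX2: vpsllvd
; AVX2-NOT: vpsllvd
; CHECK: ret
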
; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. With SSE2, we instead split the shift into four
; parts and convert each part into a pmullw.

define <32 x i16> @test7(<32 x i16> %a) {
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}
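; Checks for the expansion described above: four pmullw on SSE2, two vpmullw
; on AVX2/AVX512. Only the multiplies and the return are pinned down.
; CHECK-LABEL: test7
; SSE: pmullw
; SSE: pmullw
; SSE: pmullw
; SSE: pmullw
; AVX2: vpmullw
; AVX2: vpmullw
; AVX2-NOT: vpmullw
; CHECK: ret
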
; Similar to test7; the difference is that with AVX512 support we only
; produce a single vpsllvd/vpsllvq instead of a pair.

define <16 x i32> @test8(<16 x i32> %a) {
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}
; CHECK-LABEL: test8
; SSE: pmulld
; SSE: pmulld
; SSE: pmulld
; SSE: pmulld
; AVX2ONLY: vpsllvd
; AVX2ONLY-NEXT: vpsllvd
; AVX512: vpsllvd
; AVX512-NOT: vpsllvd
; CHECK: ret

; The shift from 'test9' gets scalarized if we don't have AVX2/AVX512F support.

define <8 x i64> @test9(<8 x i64> %a) {
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}
; CHECK-LABEL: test9
; AVX2ONLY: vpsllvq
; AVX2ONLY-NEXT: vpsllvq
; AVX512: vpsllvq
; AVX512-NOT: vpsllvq
; CHECK: ret