; RUN: llc < %s -mattr=sse2 -mtriple=i386-unknown-linux-gnu | FileCheck %s

; Source file looks something like this:
;
; typedef int AAA[100][100];
;
; void testCombineMultiplies(AAA a, int lll)
; {
;   int LOC = lll + 5;
;
;   a[LOC][LOC] = 11;
;   a[LOC][20] = 22;
;   a[LOC+20][20] = 33;
; }
;
; We want to make sure we don't generate 2 multiply instructions,
; one for a[LOC][] and one for a[LOC+20]. visitMUL in DAGCombiner.cpp
; should combine the instructions in such a way as to avoid the extra
; multiply.
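;
; Roughly speaking, the fold being exercised is the reassociation
;   mul (add x, c1), c2 --> add (mul x, c2), c1*c2
; (matched in DAGCombiner's visitMUL), which lets both row offsets share a
; single multiply by the row size and fold the constant parts into the
; addressing-mode displacements.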
;
; Output looks roughly like this:
;
; imull $400, %ecx, %edx # imm = 0x190
; leal (%edx,%eax), %esi
; movl $11, 2020(%esi,%ecx,4)
; movl $22, 2080(%edx,%eax)
; movl $33, 10080(%edx,%eax)
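;
; As a rough sanity check on those displacements (4-byte ints, so each row
; of AAA is 400 bytes), with LOC = lll + 5:
;   &a[LOC][LOC]   = a + (lll+5)*400  + (lll+5)*4 = a + lll*400 + lll*4 + 2020
;   &a[LOC][20]    = a + (lll+5)*400  + 20*4      = a + lll*400 + 2080
;   &a[LOC+20][20] = a + (lll+25)*400 + 20*4      = a + lll*400 + 10080
; i.e. all three stores can reuse the single "imull $400" of lll, with the
; constant parts folded into the 2020/2080/10080 displacements.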
;
; CHECK-LABEL: testCombineMultiplies
; CHECK: imull $400, [[ARG1:%[a-z]+]], [[MUL:%[a-z]+]] # imm = 0x190
; CHECK-NEXT: leal ([[MUL]],[[ARG2:%[a-z]+]]), [[LEA:%[a-z]+]]
; CHECK-NEXT: movl $11, {{[0-9]+}}([[LEA]],[[ARG1]],4)
; CHECK-NEXT: movl $22, {{[0-9]+}}([[MUL]],[[ARG2]])
; CHECK-NEXT: movl $33, {{[0-9]+}}([[MUL]],[[ARG2]])

; Function Attrs: nounwind
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) {
entry:
  %add = add nsw i32 %lll, 5
  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
  store i32 11, i32* %arrayidx1, align 4
  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
  store i32 22, i32* %arrayidx3, align 4
  %add4 = add nsw i32 %lll, 25
  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
  store i32 33, i32* %arrayidx6, align 4
  ret void
}

; Test for the same optimization on vector multiplies.
;
; Source looks something like this:
;
; typedef int v4int __attribute__((__vector_size__(16)));
;
; v4int x, v2, v3;
;
; void testCombineMultiplies_splat(v4int v1) {
;   v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
;   v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
;   x = (v1 + (v4int){ 11, 11, 11, 11 });
; }
;
; Output looks something like this:
;
; testCombineMultiplies_splat: # @testCombineMultiplies_splat
; movdqa .LCPI1_0, %xmm1 # xmm1 = [11,11,11,11]
; paddd %xmm0, %xmm1
; movdqa .LCPI1_1, %xmm2 # xmm2 = [22,22,22,22]
; pshufd $245, %xmm0, %xmm3 # xmm3 = xmm0[1,1,3,3]
; pmuludq %xmm2, %xmm0
; pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]
; pmuludq %xmm2, %xmm3
; pshufd $232, %xmm3, %xmm2 # xmm2 = xmm3[0,2,2,3]
; punpckldq %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; movdqa .LCPI1_2, %xmm2 # xmm2 = [242,242,242,242]
; paddd %xmm0, %xmm2
; paddd .LCPI1_3, %xmm0
; movdqa %xmm2, v2
; movdqa %xmm0, v3
; movdqa %xmm1, x
; retl
;
; Again, we want to make sure we don't generate two different multiplies.
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
; pmuludq instructions), followed by two adds. Without this optimization, we'd
; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).
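;
; As a rough element-wise check of the constants involved:
;   (v1 + 11) * 22 = v1*22 + 242   -> .LCPI1_2 = [242,242,242,242]
;   (v1 + 33) * 22 = v1*22 + 726   -> .LCPI1_3 = [726,726,726,726]
; so only the v1*22 product needs the pmuludq pair; the 242 and 726 splats
; are added afterwards.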
;
; CHECK-LABEL: testCombineMultiplies_splat
; CHECK: movdqa .LCPI1_0, [[C11:%xmm[0-9]]]
; CHECK-NEXT: paddd %xmm0, [[C11]]
; CHECK-NEXT: movdqa .LCPI1_1, [[C22:%xmm[0-9]]]
; CHECK-NEXT: pshufd $245, %xmm0, [[T1:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[C22]], [[T2:%xmm[0-9]]]
; CHECK-NEXT: pshufd $232, [[T2]], [[T3:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[C22]], [[T4:%xmm[0-9]]]
; CHECK-NEXT: pshufd $232, [[T4]], [[T5:%xmm[0-9]]]
; CHECK-NEXT: punpckldq [[T5]], [[T6:%xmm[0-9]]]
; CHECK-NEXT: movdqa .LCPI1_2, [[C242:%xmm[0-9]]]
; CHECK-NEXT: paddd [[T6]], [[C242]]
; CHECK-NEXT: paddd .LCPI1_3, [[C726:%xmm[0-9]]]
; CHECK-NEXT: movdqa [[C242]], v2
; CHECK-NEXT: movdqa [[C726]], v3
; CHECK-NEXT: movdqa [[C11]], x

@v2 = common global <4 x i32> zeroinitializer, align 16
@v3 = common global <4 x i32> zeroinitializer, align 16
@x = common global <4 x i32> zeroinitializer, align 16

; Function Attrs: nounwind
define void @testCombineMultiplies_splat(<4 x i32> %v1) {
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
  %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}

; Finally, check the non-splatted vector case. This is very similar
; to the previous test case, except for the vector values.
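;
; Element-wise, the same reassociation should apply; a rough check of the
; expected constant-pool contents:
;   (v1 + {11,22,33,44}) * {22,33,44,55} = v1*{22,33,44,55} + {242,726,1452,2420}
;   (v1 + {33,44,55,66}) * {22,33,44,55} = v1*{22,33,44,55} + {726,1452,2420,3630}
; so .LCPI2_2 and .LCPI2_3 below are presumably those two constant vectors,
; and there is still only one (vector) multiply of v1 followed by two adds.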
;
; CHECK-LABEL: testCombineMultiplies_non_splat
; CHECK: movdqa .LCPI2_0, [[C11:%xmm[0-9]]]
; CHECK-NEXT: paddd %xmm0, [[C11]]
; CHECK-NEXT: movdqa .LCPI2_1, [[C22:%xmm[0-9]]]
; CHECK-NEXT: pshufd $245, %xmm0, [[T1:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[C22]], [[T2:%xmm[0-9]]]
; CHECK-NEXT: pshufd $232, [[T2]], [[T3:%xmm[0-9]]]
; CHECK-NEXT: pshufd $245, [[C22]], [[T7:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[T1]], [[T7]]
; CHECK-NEXT: pshufd $232, [[T7]], [[T5:%xmm[0-9]]]
; CHECK-NEXT: punpckldq [[T5]], [[T6:%xmm[0-9]]]
; CHECK-NEXT: movdqa .LCPI2_2, [[C242:%xmm[0-9]]]
; CHECK-NEXT: paddd [[T6]], [[C242]]
; CHECK-NEXT: paddd .LCPI2_3, [[C726:%xmm[0-9]]]
; CHECK-NEXT: movdqa [[C242]], v2
; CHECK-NEXT: movdqa [[C726]], v3
; CHECK-NEXT: movdqa [[C11]], x

; Function Attrs: nounwind
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) {
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
  %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}