test/CodeGen/X86/misched-matrix.ll

   1 ; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
   2 ; RUN:          -misched-topdown -verify-machineinstrs \
   3 ; RUN:     | FileCheck %s -check-prefix=TOPDOWN
   4 ; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
   5 ; RUN:          -misched=ilpmin -verify-machineinstrs \
   6 ; RUN:     | FileCheck %s -check-prefix=ILPMIN
   7 ; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
   8 ; RUN:          -misched=ilpmax -verify-machineinstrs \
   9 ; RUN:     | FileCheck %s -check-prefix=ILPMAX
  10 ;
  11 ; Very temporary xfail during SchedDFSResult churn.
  12 ; XFAIL: *
  13 ;
  14 ; Verify that the MI scheduler minimizes register pressure for a
  15 ; uniform set of bottom-up subtrees (unrolled matrix multiply).
  16 ;
  17 ; For current top-down heuristics, ensure that some folded imulls have
  18 ; been reordered with the stores. This exercises the scheduler's cheap,
  19 ; built-in alias analysis (which does not require an AliasAnalysis pass).
  20 ;
  21 ; TOPDOWN: %for.body
  22 ; TOPDOWN: movl %{{.*}}, (
  23 ; TOPDOWN: imull {{[0-9]*}}(
  24 ; TOPDOWN: movl %{{.*}}, 4(
  25 ; TOPDOWN: imull {{[0-9]*}}(
  26 ; TOPDOWN: movl %{{.*}}, 8(
  27 ; TOPDOWN: movl %{{.*}}, 12(
  28 ; TOPDOWN: %for.end
  29 ;
  30 ; For -misched=ilpmin, verify that each expression subtree is
  31 ; scheduled independently, and that the imull/adds are interleaved.
  32 ;
  33 ; ILPMIN: %for.body
  34 ; ILPMIN: movl %{{.*}}, (
  35 ; ILPMIN: imull
  36 ; ILPMIN: imull
  37 ; ILPMIN: addl
  38 ; ILPMIN: imull
  39 ; ILPMIN: addl
  40 ; ILPMIN: imull
  41 ; ILPMIN: addl
  42 ; ILPMIN: movl %{{.*}}, 4(
  43 ; ILPMIN: imull
  44 ; ILPMIN: imull
  45 ; ILPMIN: addl
  46 ; ILPMIN: imull
  47 ; ILPMIN: addl
  48 ; ILPMIN: imull
  49 ; ILPMIN: addl
  50 ; ILPMIN: movl %{{.*}}, 8(
  51 ; ILPMIN: imull
  52 ; ILPMIN: imull
  53 ; ILPMIN: addl
  54 ; ILPMIN: imull
  55 ; ILPMIN: addl
  56 ; ILPMIN: imull
  57 ; ILPMIN: addl
  58 ; ILPMIN: movl %{{.*}}, 12(
  59 ; ILPMIN: %for.end
  60 ;
  61 ; For -misched=ilpmax, verify that each expression subtree is
  62 ; scheduled independently, and that the imull/adds are clustered.
  63 ;
  64 ; ILPMAX: %for.body
  65 ; ILPMAX: movl %{{.*}}, (
  66 ; ILPMAX: imull
  67 ; ILPMAX: imull
  68 ; ILPMAX: imull
  69 ; ILPMAX: imull
  70 ; ILPMAX: addl
  71 ; ILPMAX: addl
  72 ; ILPMAX: addl
  73 ; ILPMAX: movl %{{.*}}, 4(
  74 ; ILPMAX: imull
  75 ; ILPMAX: imull
  76 ; ILPMAX: imull
  77 ; ILPMAX: imull
  78 ; ILPMAX: addl
  79 ; ILPMAX: addl
  80 ; ILPMAX: addl
  81 ; ILPMAX: movl %{{.*}}, 8(
  82 ; ILPMAX: imull
  83 ; ILPMAX: imull
  84 ; ILPMAX: imull
  85 ; ILPMAX: imull
  86 ; ILPMAX: addl
  87 ; ILPMAX: addl
  88 ; ILPMAX: addl
  89 ; ILPMAX: movl %{{.*}}, 12(
  90 ; ILPMAX: %for.end
  91
  92 define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
  93 [4 x i32]* noalias nocapture %m3) nounwind uwtable ssp {
     ; @mmult: i32 matrix multiply over rows of four elements, m3 = m1 * m2.
     ;   %m1, %m2 - input matrices, addressed as arrays of [4 x i32] rows.
     ;   %m3      - output matrix, same layout; one row is written per iteration.
     ; All three pointers are declared noalias. The row loop below runs four
     ; times; the column loop and the dot-product loop are fully unrolled, so
     ; each iteration computes
     ;   m3[i][j] = sum over k of m1[i][k] * m2[k][j]   for j = 0..3, k = 0..3.
     ; The instruction order in for.body is part of the test: the RUN lines
     ; use -pre-RA-sched=source, so do not reorder instructions here.
  94 entry:
  95   br label %for.body
  96
     ; One iteration per output row i = %indvars.iv, starting at 0.
  97 for.body:                              ; preds = %for.body, %entry
  98   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
       ; Operands for output column 0: m1[i][k] and m2[k][0], k = 0..3.
  99   %arrayidx8 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 0
 100   %tmp = load i32* %arrayidx8, align 4, !tbaa !0
 101   %arrayidx12 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 0
 102   %tmp1 = load i32* %arrayidx12, align 4, !tbaa !0
 103   %arrayidx8.1 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 1
 104   %tmp2 = load i32* %arrayidx8.1, align 4, !tbaa !0
 105   %arrayidx12.1 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 0
 106   %tmp3 = load i32* %arrayidx12.1, align 4, !tbaa !0
 107   %arrayidx8.2 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 2
 108   %tmp4 = load i32* %arrayidx8.2, align 4, !tbaa !0
 109   %arrayidx12.2 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 0
 110   %tmp5 = load i32* %arrayidx12.2, align 4, !tbaa !0
 111   %arrayidx8.3 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 3
 112   %tmp6 = load i32* %arrayidx8.3, align 4, !tbaa !0
       ; Address of m2[3][0]; its load (%tmp7) appears after all other loads.
 113   %arrayidx12.3 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 0
       ; Operands for output column 1: the m1[i][k] values are loaded again
       ; (not reused from above), paired with m2[k][1].
 114   %tmp8 = load i32* %arrayidx8, align 4, !tbaa !0
 115   %arrayidx12.137 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 1
 116   %tmp9 = load i32* %arrayidx12.137, align 4, !tbaa !0
 117   %tmp10 = load i32* %arrayidx8.1, align 4, !tbaa !0
 118   %arrayidx12.1.1 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 1
 119   %tmp11 = load i32* %arrayidx12.1.1, align 4, !tbaa !0
 120   %tmp12 = load i32* %arrayidx8.2, align 4, !tbaa !0
 121   %arrayidx12.2.1 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 1
 122   %tmp13 = load i32* %arrayidx12.2.1, align 4, !tbaa !0
 123   %tmp14 = load i32* %arrayidx8.3, align 4, !tbaa !0
 124   %arrayidx12.3.1 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 1
 125   %tmp15 = load i32* %arrayidx12.3.1, align 4, !tbaa !0
       ; Operands for output column 2: m1[i][k] again, paired with m2[k][2].
 126   %tmp16 = load i32* %arrayidx8, align 4, !tbaa !0
 127   %arrayidx12.239 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 2
 128   %tmp17 = load i32* %arrayidx12.239, align 4, !tbaa !0
 129   %tmp18 = load i32* %arrayidx8.1, align 4, !tbaa !0
 130   %arrayidx12.1.2 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 2
 131   %tmp19 = load i32* %arrayidx12.1.2, align 4, !tbaa !0
 132   %tmp20 = load i32* %arrayidx8.2, align 4, !tbaa !0
 133   %arrayidx12.2.2 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 2
 134   %tmp21 = load i32* %arrayidx12.2.2, align 4, !tbaa !0
 135   %tmp22 = load i32* %arrayidx8.3, align 4, !tbaa !0
 136   %arrayidx12.3.2 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 2
 137   %tmp23 = load i32* %arrayidx12.3.2, align 4, !tbaa !0
       ; Operands for output column 3: m1[i][k] again, paired with m2[k][3].
 138   %tmp24 = load i32* %arrayidx8, align 4, !tbaa !0
 139   %arrayidx12.341 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 3
 140   %tmp25 = load i32* %arrayidx12.341, align 4, !tbaa !0
 141   %tmp26 = load i32* %arrayidx8.1, align 4, !tbaa !0
 142   %arrayidx12.1.3 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 3
 143   %tmp27 = load i32* %arrayidx12.1.3, align 4, !tbaa !0
 144   %tmp28 = load i32* %arrayidx8.2, align 4, !tbaa !0
 145   %arrayidx12.2.3 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 3
 146   %tmp29 = load i32* %arrayidx12.2.3, align 4, !tbaa !0
 147   %tmp30 = load i32* %arrayidx8.3, align 4, !tbaa !0
 148   %arrayidx12.3.3 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 3
 149   %tmp31 = load i32* %arrayidx12.3.3, align 4, !tbaa !0
       ; Load of m2[3][0], the last column-0 operand.
       ; NOTE(review): this load sits after every other load, far from its
       ; GEP above; presumably deliberate for the scheduling test - confirm
       ; before moving it.
 150   %tmp7 = load i32* %arrayidx12.3, align 4, !tbaa !0
       ; The 16 products m2[k][j] * m1[i][k], in groups of four per output
       ; column j (column 0 first).
 151   %mul = mul nsw i32 %tmp1, %tmp
 152   %mul.1 = mul nsw i32 %tmp3, %tmp2
 153   %mul.2 = mul nsw i32 %tmp5, %tmp4
 154   %mul.3 = mul nsw i32 %tmp7, %tmp6
 155   %mul.138 = mul nsw i32 %tmp9, %tmp8
 156   %mul.1.1 = mul nsw i32 %tmp11, %tmp10
 157   %mul.2.1 = mul nsw i32 %tmp13, %tmp12
 158   %mul.3.1 = mul nsw i32 %tmp15, %tmp14
 159   %mul.240 = mul nsw i32 %tmp17, %tmp16
 160   %mul.1.2 = mul nsw i32 %tmp19, %tmp18
 161   %mul.2.2 = mul nsw i32 %tmp21, %tmp20
 162   %mul.3.2 = mul nsw i32 %tmp23, %tmp22
 163   %mul.342 = mul nsw i32 %tmp25, %tmp24
 164   %mul.1.3 = mul nsw i32 %tmp27, %tmp26
 165   %mul.2.3 = mul nsw i32 %tmp29, %tmp28
 166   %mul.3.3 = mul nsw i32 %tmp31, %tmp30
       ; Per-column reduction: a chain of three adds sums each column's four
       ; products; %add.3, %add.3.1, %add.3.2 and %add.3.3 are the results.
 167   %add.1 = add nsw i32 %mul.1, %mul
 168   %add.2 = add nsw i32 %mul.2, %add.1
 169   %add.3 = add nsw i32 %mul.3, %add.2
 170   %add.1.1 = add nsw i32 %mul.1.1, %mul.138
 171   %add.2.1 = add nsw i32 %mul.2.1, %add.1.1
 172   %add.3.1 = add nsw i32 %mul.3.1, %add.2.1
 173   %add.1.2 = add nsw i32 %mul.1.2, %mul.240
 174   %add.2.2 = add nsw i32 %mul.2.2, %add.1.2
 175   %add.3.2 = add nsw i32 %mul.3.2, %add.2.2
 176   %add.1.3 = add nsw i32 %mul.1.3, %mul.342
 177   %add.2.3 = add nsw i32 %mul.2.3, %add.1.3
 178   %add.3.3 = add nsw i32 %mul.3.3, %add.2.3
       ; Write the four column sums to m3[i][0..3].
 179   %arrayidx16 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 0
 180   store i32 %add.3, i32* %arrayidx16, align 4, !tbaa !0
 181   %arrayidx16.1 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 1
 182   store i32 %add.3.1, i32* %arrayidx16.1, align 4, !tbaa !0
 183   %arrayidx16.2 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 2
 184   store i32 %add.3.2, i32* %arrayidx16.2, align 4, !tbaa !0
 185   %arrayidx16.3 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 3
 186   store i32 %add.3.3, i32* %arrayidx16.3, align 4, !tbaa !0
       ; Advance to the next row; leave the loop once the i32 truncation of
       ; i + 1 equals 4, i.e. after rows 0..3 have been written.
 187   %indvars.iv.next = add i64 %indvars.iv, 1
 188   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 189   %exitcond = icmp eq i32 %lftr.wideiv, 4
 190   br i1 %exitcond, label %for.end, label %for.body
 191
 192 for.end:                                        ; preds = %for.body
 193   ret void
 194 }
 195
 196 !0 = metadata !{metadata !"int", metadata !1}               ; "int" node, attached to every load/store above via !tbaa !0; refers to !1
 197 !1 = metadata !{metadata !"omnipotent char", metadata !2}   ; "omnipotent char" node; refers to !2
 198 !2 = metadata !{metadata !"Simple C/C++ TBAA"}              ; last node in the chain; has no further operand