; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s
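
; This file checks that Early CSE can eliminate redundant AArch64 NEON
; structured load/store intrinsics (an st2/ld2 round trip, repeated ld2 calls,
; and a dead st2), and that it conservatively keeps them when an intervening
; clobbering store or an element-count mismatch (st2 vs. ld3/st3) makes the
; elimination unsafe.
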
define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second @llvm.aarch64.neon.ld2 is optimized away by Early CSE:
; the first load dominates it, so the later, redundant load is replaced.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %1 = bitcast i32* %a to i8*
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the intervening store to %b prevents @llvm.aarch64.neon.ld2 from
; being optimized away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  store i32 0, i32* %b, align 4
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to the element-count mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; the element-count mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)
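; Note: Early CSE pairs these structured loads and stores through target
; memory-intrinsic info (TargetTransformInfo::getTgtMemIntrinsic) rather than
; the generic load/store logic; the readonly attribute on the ld2/ld3
; declarations is what marks the loads themselves as free of side effects.
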
define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}

; Attribute group referenced by @test_cse3; defined so the module parses.
attributes #0 = { nounwind }