; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -cost-model -analyze < %s | FileCheck --check-prefix=SSE41 %s
; zext <16 x i16> -> <16 x i32>: expected cost 6 on SSE2, 4 on SSE4.1.
define void @zext_v16i16_to_v16i32(<16 x i16>* %a) {
; SSE2: zext_v16i16_to_v16i32
; SSE2: cost of 6 {{.*}} zext
; SSE41: zext_v16i16_to_v16i32
; SSE41: cost of 4 {{.*}} zext
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = zext <16 x i16> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}
; sext <16 x i16> -> <16 x i32>: expected cost 8 on SSE2, 4 on SSE4.1.
define void @sext_v16i16_to_v16i32(<16 x i16>* %a) {
; SSE2: sext_v16i16_to_v16i32
; SSE2: cost of 8 {{.*}} sext
; SSE41: sext_v16i16_to_v16i32
; SSE41: cost of 4 {{.*}} sext
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = sext <16 x i16> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}
; zext <8 x i16> -> <8 x i32>: expected cost 3 on SSE2, 2 on SSE4.1.
define void @zext_v8i16_to_v8i32(<8 x i16>* %a) {
; SSE2: zext_v8i16_to_v8i32
; SSE2: cost of 3 {{.*}} zext
; SSE41: zext_v8i16_to_v8i32
; SSE41: cost of 2 {{.*}} zext
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}
; sext <8 x i16> -> <8 x i32>: expected cost 4 on SSE2, 2 on SSE4.1.
define void @sext_v8i16_to_v8i32(<8 x i16>* %a) {
; SSE2: sext_v8i16_to_v8i32
; SSE2: cost of 4 {{.*}} sext
; SSE41: sext_v8i16_to_v8i32
; SSE41: cost of 2 {{.*}} sext
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = sext <8 x i16> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}
; zext <4 x i16> -> <4 x i32>: expected cost 1 on both SSE2 and SSE4.1.
define void @zext_v4i16_to_v4i32(<4 x i16>* %a) {
; SSE2: zext_v4i16_to_v4i32
; SSE2: cost of 1 {{.*}} zext
; SSE41: zext_v4i16_to_v4i32
; SSE41: cost of 1 {{.*}} zext
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}
; sext <4 x i16> -> <4 x i32>: expected cost 2 on SSE2, 1 on SSE4.1.
define void @sext_v4i16_to_v4i32(<4 x i16>* %a) {
; SSE2: sext_v4i16_to_v4i32
; SSE2: cost of 2 {{.*}} sext
; SSE41: sext_v4i16_to_v4i32
; SSE41: cost of 1 {{.*}} sext
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = sext <4 x i16> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}
; zext <16 x i8> -> <16 x i32>: expected cost 9 on SSE2, 4 on SSE4.1.
define void @zext_v16i8_to_v16i32(<16 x i8>* %a) {
; SSE2: zext_v16i8_to_v16i32
; SSE2: cost of 9 {{.*}} zext
; SSE41: zext_v16i8_to_v16i32
; SSE41: cost of 4 {{.*}} zext
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}
; sext <16 x i8> -> <16 x i32>: expected cost 12 on SSE2, 4 on SSE4.1.
define void @sext_v16i8_to_v16i32(<16 x i8>* %a) {
; SSE2: sext_v16i8_to_v16i32
; SSE2: cost of 12 {{.*}} sext
; SSE41: sext_v16i8_to_v16i32
; SSE41: cost of 4 {{.*}} sext
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = sext <16 x i8> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}
; zext <8 x i8> -> <8 x i32>: expected cost 6 on SSE2, 2 on SSE4.1.
define void @zext_v8i8_to_v8i32(<8 x i8>* %a) {
; SSE2: zext_v8i8_to_v8i32
; SSE2: cost of 6 {{.*}} zext
; SSE41: zext_v8i8_to_v8i32
; SSE41: cost of 2 {{.*}} zext
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}
; sext <8 x i8> -> <8 x i32>: expected cost 6 on SSE2, 2 on SSE4.1.
define void @sext_v8i8_to_v8i32(<8 x i8>* %a) {
; SSE2: sext_v8i8_to_v8i32
; SSE2: cost of 6 {{.*}} sext
; SSE41: sext_v8i8_to_v8i32
; SSE41: cost of 2 {{.*}} sext
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = sext <8 x i8> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}
; zext <4 x i8> -> <4 x i32>: expected cost 2 on SSE2, 1 on SSE4.1.
define void @zext_v4i8_to_v4i32(<4 x i8>* %a) {
; SSE2: zext_v4i8_to_v4i32
; SSE2: cost of 2 {{.*}} zext
; SSE41: zext_v4i8_to_v4i32
; SSE41: cost of 1 {{.*}} zext
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}
; sext <4 x i8> -> <4 x i32>: expected cost 3 on SSE2, 1 on SSE4.1.
define void @sext_v4i8_to_v4i32(<4 x i8>* %a) {
; SSE2: sext_v4i8_to_v4i32
; SSE2: cost of 3 {{.*}} sext
; SSE41: sext_v4i8_to_v4i32
; SSE41: cost of 1 {{.*}} sext
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}
; zext <16 x i8> -> <16 x i16>: expected cost 3 on SSE2, 2 on SSE4.1.
define void @zext_v16i8_to_v16i16(<16 x i8>* %a) {
; SSE2: zext_v16i8_to_v16i16
; SSE2: cost of 3 {{.*}} zext
; SSE41: zext_v16i8_to_v16i16
; SSE41: cost of 2 {{.*}} zext
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}
; sext <16 x i8> -> <16 x i16>: expected cost 4 on SSE2, 2 on SSE4.1.
define void @sext_v16i8_to_v16i16(<16 x i8>* %a) {
; SSE2: sext_v16i8_to_v16i16
; SSE2: cost of 4 {{.*}} sext
; SSE41: sext_v16i8_to_v16i16
; SSE41: cost of 2 {{.*}} sext
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = sext <16 x i8> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}
; zext <8 x i8> -> <8 x i16>: expected cost 1 on both SSE2 and SSE4.1.
define void @zext_v8i8_to_v8i16(<8 x i8>* %a) {
; SSE2: zext_v8i8_to_v8i16
; SSE2: cost of 1 {{.*}} zext
; SSE41: zext_v8i8_to_v8i16
; SSE41: cost of 1 {{.*}} zext
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}
; sext <8 x i8> -> <8 x i16>: expected cost 2 on SSE2, 1 on SSE4.1.
define void @sext_v8i8_to_v8i16(<8 x i8>* %a) {
; SSE2: sext_v8i8_to_v8i16
; SSE2: cost of 2 {{.*}} sext
; SSE41: sext_v8i8_to_v8i16
; SSE41: cost of 1 {{.*}} sext
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = sext <8 x i8> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}
; zext <4 x i8> -> <4 x i16>: expected cost 1 on both SSE2 and SSE4.1.
define void @zext_v4i8_to_v4i16(<4 x i8>* %a) {
; SSE2: zext_v4i8_to_v4i16
; SSE2: cost of 1 {{.*}} zext
; SSE41: zext_v4i8_to_v4i16
; SSE41: cost of 1 {{.*}} zext
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}
; sext <4 x i8> -> <4 x i16>: expected cost 6 on SSE2, 2 on SSE4.1.
define void @sext_v4i8_to_v4i16(<4 x i8>* %a) {
; SSE2: sext_v4i8_to_v4i16
; SSE2: cost of 6 {{.*}} sext
; SSE41: sext_v4i8_to_v4i16
; SSE41: cost of 2 {{.*}} sext
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}
; trunc <16 x i32> -> <16 x i16>: expected cost 14 on SSE2, 6 on SSE4.1.
define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) {
; SSE2: truncate_v16i32_to_v16i16
; SSE2: cost of 14 {{.*}} trunc
; SSE41: truncate_v16i32_to_v16i16
; SSE41: cost of 6 {{.*}} trunc
  %1 = load <16 x i32>, <16 x i32>* %a
  %2 = trunc <16 x i32> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}
; trunc <8 x i32> -> <8 x i16>: expected cost 7 on SSE2, 3 on SSE4.1.
define void @truncate_v8i32_to_v8i16(<8 x i32>* %a) {
; SSE2: truncate_v8i32_to_v8i16
; SSE2: cost of 7 {{.*}} trunc
; SSE41: truncate_v8i32_to_v8i16
; SSE41: cost of 3 {{.*}} trunc
  %1 = load <8 x i32>, <8 x i32>* %a
  %2 = trunc <8 x i32> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}
; trunc <4 x i32> -> <4 x i16>: expected cost 3 on SSE2, 1 on SSE4.1.
define void @truncate_v4i32_to_v4i16(<4 x i32>* %a) {
; SSE2: truncate_v4i32_to_v4i16
; SSE2: cost of 3 {{.*}} trunc
; SSE41: truncate_v4i32_to_v4i16
; SSE41: cost of 1 {{.*}} trunc
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = trunc <4 x i32> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}
; trunc <16 x i32> -> <16 x i8>: expected cost 31 on SSE2, 30 on SSE4.1.
define void @truncate_v16i32_to_v16i8(<16 x i32>* %a) {
; SSE2: truncate_v16i32_to_v16i8
; SSE2: cost of 31 {{.*}} trunc
; SSE41: truncate_v16i32_to_v16i8
; SSE41: cost of 30 {{.*}} trunc
  %1 = load <16 x i32>, <16 x i32>* %a
  %2 = trunc <16 x i32> %1 to <16 x i8>
  store <16 x i8> %2, <16 x i8>* undef, align 4
  ret void
}
; trunc <8 x i32> -> <8 x i8>: expected cost 4 on SSE2, 3 on SSE4.1.
define void @truncate_v8i32_to_v8i8(<8 x i32>* %a) {
; SSE2: truncate_v8i32_to_v8i8
; SSE2: cost of 4 {{.*}} trunc
; SSE41: truncate_v8i32_to_v8i8
; SSE41: cost of 3 {{.*}} trunc
  %1 = load <8 x i32>, <8 x i32>* %a
  %2 = trunc <8 x i32> %1 to <8 x i8>
  store <8 x i8> %2, <8 x i8>* undef, align 4
  ret void
}
; trunc <4 x i32> -> <4 x i8>: expected cost 3 on SSE2, 1 on SSE4.1.
define void @truncate_v4i32_to_v4i8(<4 x i32>* %a) {
; SSE2: truncate_v4i32_to_v4i8
; SSE2: cost of 3 {{.*}} trunc
; SSE41: truncate_v4i32_to_v4i8
; SSE41: cost of 1 {{.*}} trunc
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = trunc <4 x i32> %1 to <4 x i8>
  store <4 x i8> %2, <4 x i8>* undef, align 4
  ret void
}
; trunc <16 x i16> -> <16 x i8>: expected cost 3 on both SSE2 and SSE4.1.
define void @truncate_v16i16_to_v16i8(<16 x i16>* %a) {
; SSE2: truncate_v16i16_to_v16i8
; SSE2: cost of 3 {{.*}} trunc
; SSE41: truncate_v16i16_to_v16i8
; SSE41: cost of 3 {{.*}} trunc
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = trunc <16 x i16> %1 to <16 x i8>
  store <16 x i8> %2, <16 x i8>* undef, align 4
  ret void
}
; trunc <8 x i16> -> <8 x i8>: expected cost 2 on SSE2, 1 on SSE4.1.
define void @truncate_v8i16_to_v8i8(<8 x i16>* %a) {
; SSE2: truncate_v8i16_to_v8i8
; SSE2: cost of 2 {{.*}} trunc
; SSE41: truncate_v8i16_to_v8i8
; SSE41: cost of 1 {{.*}} trunc
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = trunc <8 x i16> %1 to <8 x i8>
  store <8 x i8> %2, <8 x i8>* undef, align 4
  ret void
}
; trunc <4 x i16> -> <4 x i8>: expected cost 4 on SSE2, 2 on SSE4.1.
define void @truncate_v4i16_to_v4i8(<4 x i16>* %a) {
; SSE2: truncate_v4i16_to_v4i8
; SSE2: cost of 4 {{.*}} trunc
; SSE41: truncate_v4i16_to_v4i8
; SSE41: cost of 2 {{.*}} trunc
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = trunc <4 x i16> %1 to <4 x i8>
  store <4 x i8> %2, <4 x i8>* undef, align 4
  ret void
}