; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse4.2 | FileCheck %s

; Stress test (based on pr5626) for widening legalization of loads/stores of
; odd-sized integer vectors.
%i32vec3 = type <3 x i32>

; <3 x i32>, 16-byte aligned: the loads may use a full movdqa (the padding
; lane is readable), and the 12-byte store is split pextrd + movq.
define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; CHECK-LABEL: add3i32:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
; CHECK-NEXT: pextrd $2, %[[R0]], 8(%{{.*}})
; CHECK-NEXT: movq %[[R0]], (%{{.*}})
  %a = load %i32vec3* %ap, align 16
  %b = load %i32vec3* %bp, align 16
  %x = add %i32vec3 %a, %b
  store %i32vec3 %x, %i32vec3* %ret, align 16
  ret void
}
; Same as add3i32 but only 8-byte aligned: a 16-byte movdqa load is not safe,
; so each operand is assembled with movq + pinsrd before the paddd.
define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; CHECK-LABEL: add3i32_2:
; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R0]]
; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R1]]
; CHECK-NEXT: paddd %[[R0]], %[[R1]]
; CHECK-NEXT: pextrd $2, %[[R1]], 8(%{{.*}})
; CHECK-NEXT: movq %[[R1]], (%{{.*}})
  %a = load %i32vec3* %ap, align 8
  %b = load %i32vec3* %bp, align 8
  %x = add %i32vec3 %a, %b
  store %i32vec3 %x, %i32vec3* %ret, align 8
  ret void
}
%i32vec7 = type <7 x i32>

; <7 x i32> widens to two <4 x i32> ops; the high half's store is split
; pextrd (lane 6 at offset 24) + movq (lanes 4-5 at offset 16).
define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
; CHECK-LABEL: add7i32:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i32vec7* %ap, align 16
  %b = load %i32vec7* %bp, align 16
  %x = add %i32vec7 %a, %b
  store %i32vec7 %x, %i32vec7* %ret, align 16
  ret void
}
%i32vec12 = type <12 x i32>

; <12 x i32> is an exact multiple of the register width: three full
; movdqa/paddd/movdqa triples, no scalarized tail.
define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
; CHECK-LABEL: add12i32:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
; CHECK-NEXT: paddd 32(%{{.*}}), %[[R2]]
; CHECK-NEXT: movdqa %[[R2]], 32(%{{.*}})
; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i32vec12* %ap, align 16
  %b = load %i32vec12* %bp, align 16
  %x = add %i32vec12 %a, %b
  store %i32vec12 %x, %i32vec12* %ret, align 16
  ret void
}
%i16vec3 = type <3 x i16>

; <3 x i16> is promoted through zero-extended i32 lanes (pmovzxwd), added as
; <4 x i32>, then packed back down; the 6-byte store is split pextrw + movd.
define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
; CHECK-LABEL: add3i16:
; CHECK: pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddd %[[R0]], %[[R1]]
; CHECK-NEXT: movdqa %[[R1]], %[[R0]]
; CHECK-NEXT: pshufb {{.*}}, %[[R0]]
; CHECK-NEXT: pmovzxdq %[[R0]], %[[R0]]
; CHECK-NEXT: pextrw $4, %[[R1]], 4(%{{.*}})
; CHECK-NEXT: movd %[[R0]], (%{{.*}})
  %a = load %i16vec3* %ap, align 16
  %b = load %i16vec3* %bp, align 16
  %x = add %i16vec3 %a, %b
  store %i16vec3 %x, %i16vec3* %ret, align 16
  ret void
}
%i16vec4 = type <4 x i16>

; <4 x i16> fits in the low 8 bytes of an xmm register: movq loads, a single
; paddw, and a movq store — no widening tail needed.
define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
; CHECK-LABEL: add4i16:
; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddw %[[R0]], %[[R1]]
; CHECK-NEXT: movq %[[R1]], (%{{.*}})
  %a = load %i16vec4* %ap, align 16
  %b = load %i16vec4* %bp, align 16
  %x = add %i16vec4 %a, %b
  store %i16vec4 %x, %i16vec4* %ret, align 16
  ret void
}
%i16vec12 = type <12 x i16>

; <12 x i16> = one full 16-byte half plus an 8-byte half: full movdqa/paddw
; for the low half, movq for the high half's store.
define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
; CHECK-LABEL: add12i16:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i16vec12* %ap, align 16
  %b = load %i16vec12* %bp, align 16
  %x = add %i16vec12 %a, %b
  store %i16vec12 %x, %i16vec12* %ret, align 16
  ret void
}
%i16vec18 = type <18 x i16>

; <18 x i16> = two full 16-byte registers plus a 4-byte tail; the tail (two
; i16 lanes at offset 32) is stored with a single movd.
define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
; CHECK-LABEL: add18i16:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
; CHECK-NEXT: paddw 32(%{{.*}}), %[[R2]]
; CHECK-NEXT: movd %[[R2]], 32(%{{.*}})
; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i16vec18* %ap, align 16
  %b = load %i16vec18* %bp, align 16
  %x = add %i16vec18 %a, %b
  store %i16vec18 %x, %i16vec18* %ret, align 16
  ret void
}
%i8vec3 = type <3 x i8>

; <3 x i8> is promoted through zero-extended i32 lanes (pmovzxbd); the 3-byte
; store is split into pextrb (byte 2) plus a 16-bit movw via a GPR.
define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
; CHECK-LABEL: add3i8:
; CHECK: pmovzxbd (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: pmovzxbd (%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddd %[[R0]], %[[R1]]
; CHECK-NEXT: movdqa %[[R1]], %[[R0]]
; CHECK-NEXT: pshufb {{.*}}, %[[R0]]
; CHECK-NEXT: pmovzxwq %[[R0]], %[[R0]]
; CHECK-NEXT: pextrb $8, %[[R1]], 2(%{{.*}})
; CHECK-NEXT: movd %[[R0]], %e[[R2:[abcd]]]x
; CHECK-NEXT: movw %[[R2]]x, (%{{.*}})
  %a = load %i8vec3* %ap, align 16
  %b = load %i8vec3* %bp, align 16
  %x = add %i8vec3 %a, %b
  store %i8vec3 %x, %i8vec3* %ret, align 16
  ret void
}
%i8vec31 = type <31 x i8>

; <31 x i8> = one full 16-byte register plus a 15-byte tail, stored as a
; descending split: pextrb (1B) + pextrw (2B) + pextrd (4B) + movq (8B).
define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
; CHECK-LABEL: add31i8:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddb (%{{.*}}), %[[R0]]
; CHECK-NEXT: paddb 16(%{{.*}}), %[[R1]]
; CHECK-NEXT: pextrb $14, %[[R1]], 30(%{{.*}})
; CHECK-NEXT: pextrw $6, %[[R1]], 28(%{{.*}})
; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i8vec31* %ap, align 16
  %b = load %i8vec31* %bp, align 16
  %x = add %i8vec31 %a, %b
  store %i8vec31 %x, %i8vec31* %ret, align 16
  ret void
}
180 %i8vec3pack = type { <3 x i8>, i8 }
181 define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
183 ; CHECK: movdqa {{.*}}, %[[CONSTANT0:xmm[0-9]+]]
184 ; CHECK-NEXT: movdqa {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]]
185 ; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT0]]
186 ; CHECK-NEXT: pmovzxwq %[[CONSTANT0]], %[[CONSTANT0]]
187 ; CHECK-NEXT: movd %[[CONSTANT0]], %e[[R0:[abcd]]]x
188 ; CHECK-NEXT: movw %[[R0]]x, (%[[PTR0:.*]])
189 ; CHECK-NEXT: movb $-98, 2(%[[PTR0]])
190 ; CHECK-NEXT: movdqa {{.*}}, %[[CONSTANT1:xmm[0-9]+]]
191 ; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT1]]
192 ; CHECK-NEXT: pmovzxwq %[[CONSTANT1]], %[[CONSTANT1]]
193 ; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x
194 ; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]])
195 ; CHECK-NEXT: movb $1, 2(%[[PTR1]])
196 ; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
197 ; CHECK-NEXT: pand {{.*}}, %[[X0]]
198 ; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x
199 ; CHECK-NEXT: shrl %e[[R0]]x
200 ; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x
201 ; CHECK-NEXT: shrl %e[[R1]]x
202 ; CHECK-NEXT: movd %e[[R1]]x, %[[X1:xmm[0-9]+]]
203 ; CHECK-NEXT: pinsrd $1, %e[[R0]]x, %[[X1]]
204 ; CHECK-NEXT: pextrd $2, %[[X0]], %e[[R0:[abcd]]]x
205 ; CHECK-NEXT: shrl %e[[R0]]x
206 ; CHECK-NEXT: pinsrd $2, %e[[R0]]x, %[[X1]]
207 ; CHECK-NEXT: pextrd $3, %[[X0]], %e[[R0:[abcd]]]x
208 ; CHECK-NEXT: pinsrd $3, %e[[R0]]x, %[[X1]]
209 ; CHECK-NEXT: movdqa %[[X1]], %[[X2:xmm[0-9]+]]
210 ; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[X2]]
211 ; CHECK-NEXT: pmovzxwq %[[X2]], %[[X3:xmm[0-9]+]]
212 ; CHECK-NEXT: pextrb $8, %[[X1]], 2(%{{.*}})
213 ; CHECK-NEXT: movd %[[X3]], %e[[R0:[abcd]]]x
214 ; CHECK-NEXT: movw %[[R0]]x, (%{{.*}})
217 %storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
218 store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
219 %storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
220 store <3 x i8> <i8 1, i8 1, i8 1>, <3 x i8>* %storetmp1
221 %tmp = load %i8vec3pack* %X
222 %extractVec = extractvalue %i8vec3pack %tmp, 0
223 %tmp2 = load %i8vec3pack* %rot
224 %extractVec3 = extractvalue %i8vec3pack %tmp2, 0
225 %shr = lshr <3 x i8> %extractVec, %extractVec3
226 %storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
227 store <3 x i8> %shr, <3 x i8>* %storetmp4