1 ; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse4.2 | FileCheck %s
; Test widening of vector loads and stores; based on pr5626.
%i32vec3 = type <3 x i32>

; <3 x i32> add with 16-byte-aligned operands: each load widens to a full
; movdqa, and the 12-byte result is stored as an 8-byte movq plus a pextrd of
; element 2, so no byte past offset 12 is written.
define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; CHECK-LABEL: add3i32:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
; CHECK-NEXT: pextrd $2, %[[R0]], 8(%{{.*}})
; CHECK-NEXT: movq %[[R0]], (%{{.*}})
  %a = load %i32vec3* %ap, align 16
  %b = load %i32vec3* %bp, align 16
  %x = add %i32vec3 %a, %b
  store %i32vec3 %x, %i32vec3* %ret, align 16
  ret void
}
; Same <3 x i32> add but with only 8-byte alignment: each load is assembled
; from a movq plus a pinsrd of element 2 (no full-width aligned load), and the
; store is split the same way.
define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; CHECK-LABEL: add3i32_2:
; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R0]]
; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R1]]
; CHECK-NEXT: paddd %[[R0]], %[[R1]]
; CHECK-NEXT: pextrd $2, %[[R1]], 8(%{{.*}})
; CHECK-NEXT: movq %[[R1]], (%{{.*}})
  %a = load %i32vec3* %ap, align 8
  %b = load %i32vec3* %bp, align 8
  %x = add %i32vec3 %a, %b
  store %i32vec3 %x, %i32vec3* %ret, align 8
  ret void
}
%i32vec7 = type <7 x i32>

; <7 x i32> widens to two xmm registers; the high half is stored as an 8-byte
; movq plus a pextrd of its element 2, writing exactly 28 bytes.
define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
; CHECK-LABEL: add7i32:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i32vec7* %ap, align 16
  %b = load %i32vec7* %bp, align 16
  %x = add %i32vec7 %a, %b
  store %i32vec7 %x, %i32vec7* %ret, align 16
  ret void
}
%i32vec12 = type <12 x i32>

; <12 x i32> is exactly three xmm registers, so loads, adds, and stores are all
; full-width with no partial-store fixups.
define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
; CHECK-LABEL: add12i32:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
; CHECK-NEXT: paddd 32(%{{.*}}), %[[R2]]
; CHECK-NEXT: movdqa %[[R2]], 32(%{{.*}})
; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i32vec12* %ap, align 16
  %b = load %i32vec12* %bp, align 16
  %x = add %i32vec12 %a, %b
  store %i32vec12 %x, %i32vec12* %ret, align 16
  ret void
}
%i16vec3 = type <3 x i16>

; <3 x i16>: elements are zero-extended into i32 lanes (pmovzxwd) for the add,
; then shuffled back to i16; the 6-byte store is a 4-byte movd plus a pextrw
; of the third element.
define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
; CHECK-LABEL: add3i16:
; CHECK: pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddd %[[R0]], %[[R1]]
; CHECK-NEXT: movdqa %[[R1]], %[[R0]]
; CHECK-NEXT: pshufb {{.*}}, %[[R0]]
; CHECK-NEXT: movd %[[R0]], %r[[R3:[abcd]]]x
; CHECK-NEXT: movd %r[[R3]]x, %[[R0]]
; CHECK-NEXT: pextrw $4, %[[R1]], 4(%{{.*}})
; CHECK-NEXT: movd %[[R0]], (%{{.*}})
  %a = load %i16vec3* %ap, align 16
  %b = load %i16vec3* %bp, align 16
  %x = add %i16vec3 %a, %b
  store %i16vec3 %x, %i16vec3* %ret, align 16
  ret void
}
%i16vec4 = type <4 x i16>

; <4 x i16>: promoted to i32 lanes for the add, packed back with pshufb, and
; stored with a single 8-byte movq.
define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
; CHECK-LABEL: add4i16:
; CHECK: pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddd %[[R0]], %[[R1]]
; CHECK-NEXT: pshufb {{.*}}, %[[R1]]
; CHECK-NEXT: movq %[[R1]], (%{{.*}})
  %a = load %i16vec4* %ap, align 16
  %b = load %i16vec4* %bp, align 16
  %x = add %i16vec4 %a, %b
  store %i16vec4 %x, %i16vec4* %ret, align 16
  ret void
}
%i16vec12 = type <12 x i16>

; <12 x i16> widens to two xmm registers; the 24-byte result is stored as a
; full movdqa plus an 8-byte movq for the upper four elements.
define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
; CHECK-LABEL: add12i16:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i16vec12* %ap, align 16
  %b = load %i16vec12* %bp, align 16
  %x = add %i16vec12 %a, %b
  store %i16vec12 %x, %i16vec12* %ret, align 16
  ret void
}
%i16vec18 = type <18 x i16>

; <18 x i16> widens to three xmm registers; the last register holds only two
; live elements, stored with a 4-byte movd.
define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
; CHECK-LABEL: add18i16:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
; CHECK-NEXT: paddw 32(%{{.*}}), %[[R2]]
; CHECK-NEXT: movd %[[R2]], 32(%{{.*}})
; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i16vec18* %ap, align 16
  %b = load %i16vec18* %bp, align 16
  %x = add %i16vec18 %a, %b
  store %i16vec18 %x, %i16vec18* %ret, align 16
  ret void
}
%i8vec3 = type <3 x i8>

; <3 x i8>: bytes are zero-extended into i32 lanes (pmovzxbd) for the add; the
; 3-byte store is a 2-byte movw from a shuffled low pair plus a pextrb of the
; third element.
define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
; CHECK-LABEL: add3i8:
; CHECK: pmovzxbd (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: pmovzxbd (%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddd %[[R0]], %[[R1]]
; CHECK-NEXT: movdqa %[[R1]], %[[R0]]
; CHECK-NEXT: pshufb {{.*}}, %[[R0]]
; CHECK-NEXT: movd %[[R0]], %e[[R3:[abcd]]]x
; CHECK-NEXT: pextrb $8, %[[R1]], 2(%{{.*}})
; CHECK-NEXT: movw %[[R3]]x, (%{{.*}})
  %a = load %i8vec3* %ap, align 16
  %b = load %i8vec3* %bp, align 16
  %x = add %i8vec3 %a, %b
  store %i8vec3 %x, %i8vec3* %ret, align 16
  ret void
}
%i8vec31 = type <31 x i8>

; <31 x i8> widens to two xmm registers; the 31-byte store is decomposed into
; movdqa(16) + movq(8) + pextrd(4) + pextrw(2) + pextrb(1) so the 32nd byte is
; never written.
define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
; CHECK-LABEL: add31i8:
; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
; CHECK-NEXT: paddb (%{{.*}}), %[[R0]]
; CHECK-NEXT: paddb 16(%{{.*}}), %[[R1]]
; CHECK-NEXT: pextrb $14, %[[R1]], 30(%{{.*}})
; CHECK-NEXT: pextrw $6, %[[R1]], 28(%{{.*}})
; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i8vec31* %ap, align 16
  %b = load %i8vec31* %bp, align 16
  %x = add %i8vec31 %a, %b
  store %i8vec31 %x, %i8vec31* %ret, align 16
  ret void
}
%i8vec3pack = type { <3 x i8>, i8 }

; Stores two <3 x i8> constants (<-98,-98,-98> and <1,1,1>) through the packed
; struct, reloads them, and performs an element-wise lshr. The checks pin the
; constant materialization via pshufb/movd, the per-lane extract/shrl/insert
; sequence for the shift, and the final movw + pextrb 3-byte store.
define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
; CHECK-LABEL: rot:
; CHECK: movdqa {{.*}}, %[[CONSTANT0:xmm[0-9]+]]
; CHECK-NEXT: movdqa {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]]
; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT0]]
; CHECK-NEXT: movd %[[CONSTANT0]], %e[[R0:[abcd]]]x
; CHECK-NEXT: movw %[[R0]]x, (%[[PTR0:.*]])
; CHECK-NEXT: movb $-98, 2(%[[PTR0]])
; CHECK-NEXT: movdqa {{.*}}, %[[CONSTANT1:xmm[0-9]+]]
; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT1]]
; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x
; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]])
; CHECK-NEXT: movb $1, 2(%[[PTR1]])
; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
; CHECK-NEXT: pand {{.*}}, %[[X0]]
; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x
; CHECK-NEXT: shrl %e[[R0]]x
; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x
; CHECK-NEXT: shrl %e[[R1]]x
; CHECK-NEXT: movd %e[[R1]]x, %[[X1:xmm[0-9]+]]
; CHECK-NEXT: pinsrd $1, %e[[R0]]x, %[[X1]]
; CHECK-NEXT: pextrd $2, %[[X0]], %e[[R0:[abcd]]]x
; CHECK-NEXT: shrl %e[[R0]]x
; CHECK-NEXT: pinsrd $2, %e[[R0]]x, %[[X1]]
; CHECK-NEXT: pextrd $3, %[[X0]], %e[[R0:[abcd]]]x
; CHECK-NEXT: pinsrd $3, %e[[R0]]x, %[[X1]]
; CHECK-NEXT: movdqa %[[X1]], %[[X2:xmm[0-9]+]]
; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[X2]]
; CHECK-NEXT: movd %[[X2]], %e[[R0:[abcd]]]x
; CHECK-NEXT: pextrb $8, %[[X1]], 2(%{{.*}})
; CHECK-NEXT: movw %[[R0]]x, (%{{.*}})
  %storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
  store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
  %storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
  store <3 x i8> <i8 1, i8 1, i8 1>, <3 x i8>* %storetmp1
  %tmp = load %i8vec3pack* %X
  %extractVec = extractvalue %i8vec3pack %tmp, 0
  %tmp2 = load %i8vec3pack* %rot
  %extractVec3 = extractvalue %i8vec3pack %tmp2, 0
  %shr = lshr <3 x i8> %extractVec, %extractVec3
  %storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
  store <3 x i8> %shr, <3 x i8>* %storetmp4
  ret void
}