arch/x86/crypto/chacha20-ssse3-x86_64.S
/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.data
.align 16

ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC: .octa 0x00000003000000020000000100000000
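
# ROT8 and ROT16 are pshufb masks: within each 32-bit lane they move source
# byte i to byte (i+1)%4 resp. (i+2)%4, i.e. they rotate every little-endian
# dword left by 8 resp. 16 bits. CTRINC holds the per-lane block counter
# offsets {0,1,2,3} used by the four-block function below.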

.text

ENTRY(chacha20_block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: 1 data block output, o
        # %rdx: 1 data block input, i

        # This function encrypts one ChaCha20 block by loading the state matrix
        # into four SSE registers. It performs matrix operations on four words
        # in parallel, but requires shuffling to rearrange the words after each
        # round. 8/16-bit word rotation is done with the slightly better
        # performing SSSE3 byte shuffling; 7/12-bit word rotation uses the
        # traditional shift+OR.

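        # For reference, the scalar quarter-round that each vectorized step
        # below corresponds to is roughly (a sketch, not part of this file):
        #
        #       a += b; d ^= a; d = rol32(d, 16);
        #       c += d; b ^= c; b = rol32(b, 12);
        #       a += b; d ^= a; d = rol32(d, 8);
        #       c += d; b ^= c; b = rol32(b, 7);
        #
        # Here a, b, c, d are the state rows x0..x3, so every instruction
        # advances all four columns of the 4x4 state at once.
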
        # x0..3 = s0..3
        movdqa          0x00(%rdi),%xmm0
        movdqa          0x10(%rdi),%xmm1
        movdqa          0x20(%rdi),%xmm2
        movdqa          0x30(%rdi),%xmm3
        movdqa          %xmm0,%xmm8
        movdqa          %xmm1,%xmm9
        movdqa          %xmm2,%xmm10
        movdqa          %xmm3,%xmm11

        movdqa          ROT8(%rip),%xmm4
        movdqa          ROT16(%rip),%xmm5

        mov     $10,%ecx

.Ldoubleround:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm6
        pslld           $12,%xmm6
        psrld           $20,%xmm1
        por             %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm7
        pslld           $7,%xmm7
        psrld           $25,%xmm1
        por             %xmm7,%xmm1

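        # The column round is done; rotating row x1 left by one word, x2 by
        # two and x3 by three lines the diagonals up in the columns, so the
        # same instruction sequence below performs the diagonal round.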
        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm3,%xmm3

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm6
        pslld           $12,%xmm6
        psrld           $20,%xmm1
        por             %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm7
        pslld           $7,%xmm7
        psrld           $25,%xmm1
        por             %xmm7,%xmm1

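        # Undo the diagonalization by rotating the rows back into column
        # order before the next iteration of the double round.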
        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm3,%xmm3

        dec             %ecx
        jnz             .Ldoubleround

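        # Keystream = x + s, where %xmm8..%xmm11 hold the initial state saved
        # at function entry; each output row is the corresponding input row
        # XORed with it, i.e. o[n] = i[n] ^ (x[n] + s[n]).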
        # o0 = i0 ^ (x0 + s0)
        movdqu          0x00(%rdx),%xmm4
        paddd           %xmm8,%xmm0
        pxor            %xmm4,%xmm0
        movdqu          %xmm0,0x00(%rsi)
        # o1 = i1 ^ (x1 + s1)
        movdqu          0x10(%rdx),%xmm5
        paddd           %xmm9,%xmm1
        pxor            %xmm5,%xmm1
        movdqu          %xmm1,0x10(%rsi)
        # o2 = i2 ^ (x2 + s2)
        movdqu          0x20(%rdx),%xmm6
        paddd           %xmm10,%xmm2
        pxor            %xmm6,%xmm2
        movdqu          %xmm2,0x20(%rsi)
        # o3 = i3 ^ (x3 + s3)
        movdqu          0x30(%rdx),%xmm7
        paddd           %xmm11,%xmm3
        pxor            %xmm7,%xmm3
        movdqu          %xmm3,0x30(%rsi)

        ret
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: 4 data blocks output, o
        # %rdx: 4 data blocks input, i

        # This function encrypts four consecutive ChaCha20 blocks by loading
        # the state matrix into SSE registers four times. As we need some
        # scratch registers, we save the first four registers on the stack.
        # The algorithm performs each operation on the corresponding word of
        # each state matrix, hence requires no word shuffling. For the final
        # XORing step we transpose the matrix by interleaving 32- and then
        # 64-bit words, which allows us to do XOR in SSE registers. 8/16-bit
        # word rotation is done with the slightly better performing SSSE3
        # byte shuffling; 7/12-bit word rotation uses the traditional
        # shift+OR.

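        # Data layout (sketch): register/stack slot n holds word n of the
        # state for all four blocks, one block per 32-bit lane. A ChaCha20
        # quarter-round only ever combines whole state words, e.g.
        #
        #       x0 += x4; x12 ^= x0; x12 = rol32(x12, 16); ...
        #
        # so each paddd/pxor/pshufb below advances the same quarter-round in
        # all four blocks at once and no lane ever has to cross over.
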
        sub             $0x40,%rsp

        # x0..15[0-3] = s0..3[0..3]
        movq            0x00(%rdi),%xmm1
        pshufd          $0x00,%xmm1,%xmm0
        pshufd          $0x55,%xmm1,%xmm1
        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        movq            0x10(%rdi),%xmm5
        pshufd          $0x00,%xmm5,%xmm4
        pshufd          $0x55,%xmm5,%xmm5
        movq            0x18(%rdi),%xmm7
        pshufd          $0x00,%xmm7,%xmm6
        pshufd          $0x55,%xmm7,%xmm7
        movq            0x20(%rdi),%xmm9
        pshufd          $0x00,%xmm9,%xmm8
        pshufd          $0x55,%xmm9,%xmm9
        movq            0x28(%rdi),%xmm11
        pshufd          $0x00,%xmm11,%xmm10
        pshufd          $0x55,%xmm11,%xmm11
        movq            0x30(%rdi),%xmm13
        pshufd          $0x00,%xmm13,%xmm12
        pshufd          $0x55,%xmm13,%xmm13
        movq            0x38(%rdi),%xmm15
        pshufd          $0x00,%xmm15,%xmm14
        pshufd          $0x55,%xmm15,%xmm15
        # x0..3 on stack
        movdqa          %xmm0,0x00(%rsp)
        movdqa          %xmm1,0x10(%rsp)
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm3,0x30(%rsp)

        movdqa          CTRINC(%rip),%xmm1
        movdqa          ROT8(%rip),%xmm2
        movdqa          ROT16(%rip),%xmm3

        # x12 += counter values 0-3
        paddd           %xmm1,%xmm12

        mov             $10,%ecx

.Ldoubleround4:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm3,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm3,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm3,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm3,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        paddd           %xmm12,%xmm8
        pxor            %xmm8,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm4
        por             %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        paddd           %xmm13,%xmm9
        pxor            %xmm9,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm5
        por             %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        paddd           %xmm14,%xmm10
        pxor            %xmm10,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm6
        por             %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        paddd           %xmm15,%xmm11
        pxor            %xmm11,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm7
        por             %xmm0,%xmm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm2,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm2,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm2,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm2,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        paddd           %xmm12,%xmm8
        pxor            %xmm8,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm4
        por             %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        paddd           %xmm13,%xmm9
        pxor            %xmm9,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm5
        por             %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        paddd           %xmm14,%xmm10
        pxor            %xmm10,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm6
        por             %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        paddd           %xmm15,%xmm11
        pxor            %xmm11,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm7
        por             %xmm0,%xmm7

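        # Second half of the double round: the same quarter-round applied to
        # the diagonals (x0,x5,x10,x15), (x1,x6,x11,x12), (x2,x7,x8,x13) and
        # (x3,x4,x9,x14). With one register per state word there is nothing
        # to shuffle; only the register names change.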
        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm3,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm3,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm3,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm3,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        paddd           %xmm15,%xmm10
        pxor            %xmm10,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm5
        por             %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        paddd           %xmm12,%xmm11
        pxor            %xmm11,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm6
        por             %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        paddd           %xmm13,%xmm8
        pxor            %xmm8,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm7
        por             %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        paddd           %xmm14,%xmm9
        pxor            %xmm9,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm4
        por             %xmm0,%xmm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm2,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm2,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm2,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm2,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        paddd           %xmm15,%xmm10
        pxor            %xmm10,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm5
        por             %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        paddd           %xmm12,%xmm11
        pxor            %xmm11,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm6
        por             %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        paddd           %xmm13,%xmm8
        pxor            %xmm8,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm7
        por             %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        paddd           %xmm14,%xmm9
        pxor            %xmm9,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm4
        por             %xmm0,%xmm4

        dec             %ecx
        jnz             .Ldoubleround4

        # x0[0-3] += s0[0]
        # x1[0-3] += s0[1]
        movq            0x00(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x00(%rsp),%xmm2
        movdqa          %xmm2,0x00(%rsp)
        paddd           0x10(%rsp),%xmm3
        movdqa          %xmm3,0x10(%rsp)
        # x2[0-3] += s0[2]
        # x3[0-3] += s0[3]
        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x20(%rsp),%xmm2
        movdqa          %xmm2,0x20(%rsp)
        paddd           0x30(%rsp),%xmm3
        movdqa          %xmm3,0x30(%rsp)

        # x4[0-3] += s1[0]
        # x5[0-3] += s1[1]
        movq            0x10(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm4
        paddd           %xmm3,%xmm5
        # x6[0-3] += s1[2]
        # x7[0-3] += s1[3]
        movq            0x18(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm6
        paddd           %xmm3,%xmm7

        # x8[0-3] += s2[0]
        # x9[0-3] += s2[1]
        movq            0x20(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm8
        paddd           %xmm3,%xmm9
        # x10[0-3] += s2[2]
        # x11[0-3] += s2[3]
        movq            0x28(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm10
        paddd           %xmm3,%xmm11

        # x12[0-3] += s3[0]
        # x13[0-3] += s3[1]
        movq            0x30(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm12
        paddd           %xmm3,%xmm13
        # x14[0-3] += s3[2]
        # x15[0-3] += s3[3]
        movq            0x38(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm14
        paddd           %xmm3,%xmm15

        # x12 += counter values 0-3
        paddd           %xmm1,%xmm12
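        # The addition above used s12 as stored in memory, but lane k of x12
        # started from s12 + k, so add the counter offsets (still in %xmm1)
        # once more so the feed-forward matches each block's initial state.

        # The state words now live "column-wise": slot/register n holds word n
        # of all four blocks. To XOR against the byte stream we need them
        # "row-wise", 16 consecutive words per block, so transpose each group
        # of four registers with a 32-bit and then a 64-bit interleave pass
        # (the usual 4x4 transpose via punpck{l,h}dq / punpck{l,h}qdq).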

        # interleave 32-bit words in state n, n+1
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x10(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x10(%rsp)
        movdqa          0x20(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm0,0x30(%rsp)
        movdqa          %xmm4,%xmm0
        punpckldq       %xmm5,%xmm4
        punpckhdq       %xmm5,%xmm0
        movdqa          %xmm0,%xmm5
        movdqa          %xmm6,%xmm0
        punpckldq       %xmm7,%xmm6
        punpckhdq       %xmm7,%xmm0
        movdqa          %xmm0,%xmm7
        movdqa          %xmm8,%xmm0
        punpckldq       %xmm9,%xmm8
        punpckhdq       %xmm9,%xmm0
        movdqa          %xmm0,%xmm9
        movdqa          %xmm10,%xmm0
        punpckldq       %xmm11,%xmm10
        punpckhdq       %xmm11,%xmm0
        movdqa          %xmm0,%xmm11
        movdqa          %xmm12,%xmm0
        punpckldq       %xmm13,%xmm12
        punpckhdq       %xmm13,%xmm0
        movdqa          %xmm0,%xmm13
        movdqa          %xmm14,%xmm0
        punpckldq       %xmm15,%xmm14
        punpckhdq       %xmm15,%xmm0
        movdqa          %xmm0,%xmm15

        # interleave 64-bit words in state n, n+2
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x20(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x20(%rsp)
        movdqa          0x10(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x10(%rsp)
        movdqa          %xmm0,0x30(%rsp)
        movdqa          %xmm4,%xmm0
        punpcklqdq      %xmm6,%xmm4
        punpckhqdq      %xmm6,%xmm0
        movdqa          %xmm0,%xmm6
        movdqa          %xmm5,%xmm0
        punpcklqdq      %xmm7,%xmm5
        punpckhqdq      %xmm7,%xmm0
        movdqa          %xmm0,%xmm7
        movdqa          %xmm8,%xmm0
        punpcklqdq      %xmm10,%xmm8
        punpckhqdq      %xmm10,%xmm0
        movdqa          %xmm0,%xmm10
        movdqa          %xmm9,%xmm0
        punpcklqdq      %xmm11,%xmm9
        punpckhqdq      %xmm11,%xmm0
        movdqa          %xmm0,%xmm11
        movdqa          %xmm12,%xmm0
        punpcklqdq      %xmm14,%xmm12
        punpckhqdq      %xmm14,%xmm0
        movdqa          %xmm0,%xmm14
        movdqa          %xmm13,%xmm0
        punpcklqdq      %xmm15,%xmm13
        punpckhqdq      %xmm15,%xmm0
        movdqa          %xmm0,%xmm15

        # xor with corresponding input, write to output
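        # After the interleave passes each slot/register holds 16 consecutive
        # keystream bytes, but in block order 0, 2, 1, 3 within every group of
        # four, hence the 0x00/0x80/0x40/0xc0 (and 0x10/0x90/0x50/0xd0, ...)
        # pattern of the buffer offsets below.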
        movdqa          0x00(%rsp),%xmm0
        movdqu          0x00(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x00(%rsi)
        movdqa          0x10(%rsp),%xmm0
        movdqu          0x80(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x80(%rsi)
        movdqa          0x20(%rsp),%xmm0
        movdqu          0x40(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x40(%rsi)
        movdqa          0x30(%rsp),%xmm0
        movdqu          0xc0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xc0(%rsi)
        movdqu          0x10(%rdx),%xmm1
        pxor            %xmm1,%xmm4
        movdqu          %xmm4,0x10(%rsi)
        movdqu          0x90(%rdx),%xmm1
        pxor            %xmm1,%xmm5
        movdqu          %xmm5,0x90(%rsi)
        movdqu          0x50(%rdx),%xmm1
        pxor            %xmm1,%xmm6
        movdqu          %xmm6,0x50(%rsi)
        movdqu          0xd0(%rdx),%xmm1
        pxor            %xmm1,%xmm7
        movdqu          %xmm7,0xd0(%rsi)
        movdqu          0x20(%rdx),%xmm1
        pxor            %xmm1,%xmm8
        movdqu          %xmm8,0x20(%rsi)
        movdqu          0xa0(%rdx),%xmm1
        pxor            %xmm1,%xmm9
        movdqu          %xmm9,0xa0(%rsi)
        movdqu          0x60(%rdx),%xmm1
        pxor            %xmm1,%xmm10
        movdqu          %xmm10,0x60(%rsi)
        movdqu          0xe0(%rdx),%xmm1
        pxor            %xmm1,%xmm11
        movdqu          %xmm11,0xe0(%rsi)
        movdqu          0x30(%rdx),%xmm1
        pxor            %xmm1,%xmm12
        movdqu          %xmm12,0x30(%rsi)
        movdqu          0xb0(%rdx),%xmm1
        pxor            %xmm1,%xmm13
        movdqu          %xmm13,0xb0(%rsi)
        movdqu          0x70(%rdx),%xmm1
        pxor            %xmm1,%xmm14
        movdqu          %xmm14,0x70(%rsi)
        movdqu          0xf0(%rdx),%xmm1
        pxor            %xmm1,%xmm15
        movdqu          %xmm15,0xf0(%rsi)

        add             $0x40,%rsp
        ret
ENDPROC(chacha20_4block_xor_ssse3)