2 * Implement AES algorithm in Intel AES-NI instructions.
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
32 #include <linux/linkage.h>
38 .Lgf128mul_x_ble_mask:
39 .octa 0x00000000000000010000000000000087
41 POLY: .octa 0xC2000000000000000000000000000001
42 TWOONE: .octa 0x00000001000000000000000000000001
44 # order of these constants should not change.
45 # more specifically, ALL_F should follow SHIFT_MASK,
46 # and ZERO should follow ALL_F
48 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
49 MASK1: .octa 0x0000000000000000ffffffffffffffff
50 MASK2: .octa 0xffffffffffffffff0000000000000000
51 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
52 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
53 ZERO: .octa 0x00000000000000000000000000000000
54 ONE: .octa 0x00000000000000000000000000000001
55 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
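/*
 * Illustrative note (exposition only, not part of the build): SHUF_MASK,
 * used with PSHUFB throughout this file, reverses the byte order of a
 * 16-byte block so that GHASH can work on byte-reflected data. A plain C
 * sketch of the same permutation:
 *
 *	static void byteswap16(unsigned char b[16])
 *	{
 *		int i;
 *
 *		for (i = 0; i < 8; i++) {
 *			unsigned char t = b[i];
 *
 *			b[i] = b[15 - i];
 *			b[15 - i] = t;
 *		}
 *	}
 */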
63 #define STACK_OFFSET 8*3
64 #define HashKey 16*0 // store HashKey <<1 mod poly here
65 #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
66 #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
67 #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
68 #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
69 // bits of HashKey <<1 mod poly here
70 // (for Karatsuba purposes)
71 #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
72 // bits of HashKey^2 <<1 mod poly here
73 // (for Karatsuba purposes)
74 #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
75 // bits of HashKey^3 <<1 mod poly here
76 // (for Karatsuba purposes)
77 #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
78 // bits of HashKey^4 <<1 mod poly here
79 // (for Karatsuba purposes)
80 #define VARIABLE_OFFSET 16*8
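/*
 * Illustrative sketch (an assumption for exposition, not a structure the
 * code declares): the offsets above carve the stack scratch area into
 * something like the following C layout, holding HashKey^1..HashKey^4
 * (each already shifted left by one modulo the GHASH polynomial) plus the
 * XOR of the high and low 64-bit halves of each power, which feeds the
 * Karatsuba middle product:
 *
 *	struct gcm_precomp {
 *		unsigned char hashkey[16];	// HashKey<<1 mod poly
 *		unsigned char hashkey_2[16];	// HashKey^2<<1 mod poly
 *		unsigned char hashkey_3[16];	// HashKey^3<<1 mod poly
 *		unsigned char hashkey_4[16];	// HashKey^4<<1 mod poly
 *		unsigned char hashkey_k[16];	// hi64 ^ lo64 of hashkey
 *		unsigned char hashkey_2_k[16];	// hi64 ^ lo64 of hashkey_2
 *		unsigned char hashkey_3_k[16];	// hi64 ^ lo64 of hashkey_3
 *		unsigned char hashkey_4_k[16];	// hi64 ^ lo64 of hashkey_4
 *	};
 */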
88 #define arg7 STACK_OFFSET+8(%r14)
89 #define arg8 STACK_OFFSET+16(%r14)
90 #define arg9 STACK_OFFSET+24(%r14)
91 #define arg10 STACK_OFFSET+32(%r14)
108 #define BSWAP_MASK %xmm10
112 #define GF128MUL_MASK %xmm10
142 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
145 * Input: A and B (128-bits each, bit-reflected)
146 * Output: C = A*B*x mod poly, (i.e. >>1 )
147 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
148 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
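/*
 * For reference, the product computed by GHASH_MUL is the GF(2^128)
 * multiplication defined for GHASH (NIST SP 800-38D, Algorithm 1). A
 * bit-serial C sketch of the same math (exposition only; blocks are
 * 16-byte strings with bit 0 being the most significant bit of byte 0,
 * i.e. the non-reflected representation):
 *
 *	#include <string.h>
 *
 *	static void ghash_mul_ref(unsigned char z[16], const unsigned char x[16],
 *				  const unsigned char y[16])
 *	{
 *		unsigned char v[16], acc[16] = { 0 };
 *		int i, j;
 *
 *		memcpy(v, y, 16);
 *		for (i = 0; i < 128; i++) {
 *			int bit = (x[i / 8] >> (7 - (i % 8))) & 1;
 *			int lsb = v[15] & 1;		// bit 127 of V
 *
 *			if (bit)
 *				for (j = 0; j < 16; j++)
 *					acc[j] ^= v[j];
 *			// V = V >> 1, then reduce with R = 0xE1 || 0^120
 *			for (j = 15; j > 0; j--)
 *				v[j] = (unsigned char)((v[j] >> 1) | (v[j - 1] << 7));
 *			v[0] >>= 1;
 *			if (lsb)
 *				v[0] ^= 0xE1;
 *		}
 *		memcpy(z, acc, 16);
 *	}
 *
 * The macro below computes the same value with three PCLMULQDQ products
 * (Karatsuba) followed by a shift-based reduction.
 */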
151 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
153 pshufd $78, \GH, \TMP2
154 pshufd $78, \HK, \TMP3
155 pxor \GH, \TMP2 # TMP2 = a1+a0
156 pxor \HK, \TMP3 # TMP3 = b1+b0
157 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
158 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
159 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
161 pxor \TMP1, \TMP2 # TMP2 holds the middle Karatsuba term
163 pslldq $8, \TMP3 # left shift TMP3 2 DWs
164 psrldq $8, \TMP2 # right shift TMP2 2 DWs
166 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
168 # first phase of the reduction
172 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
173 # in order to perform
175 pslld $31, \TMP2 # packed left shift <<31
176 pslld $30, \TMP3 # packed left shift <<30
177 pslld $25, \TMP4 # packed left shift <<25
178 pxor \TMP3, \TMP2 # xor the shifted versions
181 psrldq $4, \TMP5 # right shift TMP5 1 DW
182 pslldq $12, \TMP2 # left shift TMP2 3 DWs
185 # second phase of the reduction
187 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
188 # in order to perform
192 psrld $1,\TMP2 # packed right shift >>1
193 psrld $2,\TMP3 # packed right shift >>2
194 psrld $7,\TMP4 # packed right shift >>7
195 pxor \TMP3,\TMP2 # xor the shifted versions
199 pxor \TMP1, \GH # result is in GH
203 * if a = number of total plaintext bytes
205 * num_initial_blocks = b mod 4
206 * encrypt the initial num_initial_blocks blocks and apply ghash on
208 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
210 * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
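/*
 * Worked example (exposition only): for a = 100 plaintext bytes,
 * b = floor(100/16) = 6 full blocks, so num_initial_blocks = 6 mod 4 = 2;
 * the remaining 4 full blocks go through the four-way parallel loop and
 * the final 4 bytes take the partial-block path, i.e. roughly:
 *
 *	num_blocks         = plaintext_len / 16;
 *	num_initial_blocks = num_blocks % 4;
 */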
214 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
215 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
216 mov arg7, %r10 # %r10 = AAD
217 mov arg8, %r12 # %r12 = aadLen
220 _get_AAD_loop\num_initial_blocks\operation:
227 jne _get_AAD_loop\num_initial_blocks\operation
229 je _get_AAD_loop2_done\num_initial_blocks\operation
231 _get_AAD_loop2\num_initial_blocks\operation:
235 jne _get_AAD_loop2\num_initial_blocks\operation
236 _get_AAD_loop2_done\num_initial_blocks\operation:
237 movdqa SHUF_MASK(%rip), %xmm14
238 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
240 xor %r11, %r11 # initialise the data pointer offset as zero
242 # start AES for num_initial_blocks blocks
244 mov %arg5, %rax # %rax = *Y0
245 movdqu (%rax), \XMM0 # XMM0 = Y0
246 movdqa SHUF_MASK(%rip), %xmm14
247 PSHUFB_XMM %xmm14, \XMM0
249 .if (\i == 5) || (\i == 6) || (\i == 7)
251 paddd ONE(%rip), \XMM0 # INCR Y0
252 movdqa \XMM0, %xmm\index
253 movdqa SHUF_MASK(%rip), %xmm14
254 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
258 pxor 16*0(%arg1), %xmm\index
261 movaps 0x10(%rdi), \TMP1
262 AESENC \TMP1, %xmm\index # Round 1
265 movaps 0x20(%arg1), \TMP1
266 AESENC \TMP1, %xmm\index # Round 2
269 movaps 0x30(%arg1), \TMP1
270 AESENC \TMP1, %xmm\index # Round 3
273 movaps 0x40(%arg1), \TMP1
274 AESENC \TMP1, %xmm\index # Round 4
277 movaps 0x50(%arg1), \TMP1
278 AESENC \TMP1, %xmm\index # Round 5
281 movaps 0x60(%arg1), \TMP1
282 AESENC \TMP1, %xmm\index # Round 6
285 movaps 0x70(%arg1), \TMP1
286 AESENC \TMP1, %xmm\index # Round 7
289 movaps 0x80(%arg1), \TMP1
290 AESENC \TMP1, %xmm\index # Round 8
293 movaps 0x90(%arg1), \TMP1
294 AESENC \TMP1, %xmm\index # Round 9
297 movaps 0xa0(%arg1), \TMP1
298 AESENCLAST \TMP1, %xmm\index # Round 10
301 movdqu (%arg3 , %r11, 1), \TMP1
302 pxor \TMP1, %xmm\index
303 movdqu %xmm\index, (%arg2 , %r11, 1)
304 # write back plaintext/ciphertext for num_initial_blocks
307 movdqa \TMP1, %xmm\index
308 movdqa SHUF_MASK(%rip), %xmm14
309 PSHUFB_XMM %xmm14, %xmm\index
311 # prepare plaintext/ciphertext for GHASH computation
314 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
315 # apply GHASH on num_initial_blocks blocks
319 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
328 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
331 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
334 jl _initial_blocks_done\num_initial_blocks\operation
335 # no need for precomputed values
338 * Precomputations for HashKey parallel with encryption of first 4 blocks.
339 * HashKey_i_k holds XORed values of the low and high parts of HashKey_i
341 paddd ONE(%rip), \XMM0 # INCR Y0
343 movdqa SHUF_MASK(%rip), %xmm14
344 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
346 paddd ONE(%rip), \XMM0 # INCR Y0
348 movdqa SHUF_MASK(%rip), %xmm14
349 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
351 paddd ONE(%rip), \XMM0 # INCR Y0
353 movdqa SHUF_MASK(%rip), %xmm14
354 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
356 paddd ONE(%rip), \XMM0 # INCR Y0
358 movdqa SHUF_MASK(%rip), %xmm14
359 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
361 pxor 16*0(%arg1), \XMM1
362 pxor 16*0(%arg1), \XMM2
363 pxor 16*0(%arg1), \XMM3
364 pxor 16*0(%arg1), \XMM4
366 pshufd $78, \TMP3, \TMP1
368 movdqa \TMP1, HashKey_k(%rsp)
369 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
370 # TMP5 = HashKey^2<<1 (mod poly)
371 movdqa \TMP5, HashKey_2(%rsp)
372 # HashKey_2 = HashKey^2<<1 (mod poly)
373 pshufd $78, \TMP5, \TMP1
375 movdqa \TMP1, HashKey_2_k(%rsp)
376 .irpc index, 1234 # do 4 rounds
377 movaps 0x10*\index(%arg1), \TMP1
383 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
384 # TMP5 = HashKey^3<<1 (mod poly)
385 movdqa \TMP5, HashKey_3(%rsp)
386 pshufd $78, \TMP5, \TMP1
388 movdqa \TMP1, HashKey_3_k(%rsp)
389 .irpc index, 56789 # do next 5 rounds
390 movaps 0x10*\index(%arg1), \TMP1
396 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
397 # TMP5 = HashKey^4<<1 (mod poly)
398 movdqa \TMP5, HashKey_4(%rsp)
399 pshufd $78, \TMP5, \TMP1
401 movdqa \TMP1, HashKey_4_k(%rsp)
402 movaps 0xa0(%arg1), \TMP2
403 AESENCLAST \TMP2, \XMM1
404 AESENCLAST \TMP2, \XMM2
405 AESENCLAST \TMP2, \XMM3
406 AESENCLAST \TMP2, \XMM4
407 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
409 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
411 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
413 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
415 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
417 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
419 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
421 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
427 # combine GHASHed value with the corresponding ciphertext
428 movdqa SHUF_MASK(%rip), %xmm14
429 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
430 movdqa SHUF_MASK(%rip), %xmm14
431 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
432 movdqa SHUF_MASK(%rip), %xmm14
433 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
435 _initial_blocks_done\num_initial_blocks\operation:
441 * if a = number of total plaintext bytes
443 * num_initial_blocks = b mod 4
444 * encrypt the initial num_initial_blocks blocks and apply ghash on
446 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
448 * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
452 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
453 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
454 mov arg7, %r10 # %r10 = AAD
455 mov arg8, %r12 # %r12 = aadLen
458 _get_AAD_loop\num_initial_blocks\operation:
465 jne _get_AAD_loop\num_initial_blocks\operation
467 je _get_AAD_loop2_done\num_initial_blocks\operation
469 _get_AAD_loop2\num_initial_blocks\operation:
473 jne _get_AAD_loop2\num_initial_blocks\operation
474 _get_AAD_loop2_done\num_initial_blocks\operation:
475 movdqa SHUF_MASK(%rip), %xmm14
476 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
478 xor %r11, %r11 # initialise the data pointer offset as zero
480 # start AES for num_initial_blocks blocks
482 mov %arg5, %rax # %rax = *Y0
483 movdqu (%rax), \XMM0 # XMM0 = Y0
484 movdqa SHUF_MASK(%rip), %xmm14
485 PSHUFB_XMM %xmm14, \XMM0
487 .if (\i == 5) || (\i == 6) || (\i == 7)
489 paddd ONE(%rip), \XMM0 # INCR Y0
490 movdqa \XMM0, %xmm\index
491 movdqa SHUF_MASK(%rip), %xmm14
492 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
496 pxor 16*0(%arg1), %xmm\index
499 movaps 0x10(%rdi), \TMP1
500 AESENC \TMP1, %xmm\index # Round 1
503 movaps 0x20(%arg1), \TMP1
504 AESENC \TMP1, %xmm\index # Round 2
507 movaps 0x30(%arg1), \TMP1
508 AESENC \TMP1, %xmm\index # Round 3
511 movaps 0x40(%arg1), \TMP1
512 AESENC \TMP1, %xmm\index # Round 4
515 movaps 0x50(%arg1), \TMP1
516 AESENC \TMP1, %xmm\index # Round 5
519 movaps 0x60(%arg1), \TMP1
520 AESENC \TMP1, %xmm\index # Round 6
523 movaps 0x70(%arg1), \TMP1
524 AESENC \TMP1, %xmm\index # Round 7
527 movaps 0x80(%arg1), \TMP1
528 AESENC \TMP1, %xmm\index # Round 8
531 movaps 0x90(%arg1), \TMP1
532 AESENC \TMP1, %xmm\index # Round 9
535 movaps 0xa0(%arg1), \TMP1
536 AESENCLAST \TMP1, %xmm\index # Round 10
539 movdqu (%arg3 , %r11, 1), \TMP1
540 pxor \TMP1, %xmm\index
541 movdqu %xmm\index, (%arg2 , %r11, 1)
542 # write back plaintext/ciphertext for num_initial_blocks
545 movdqa SHUF_MASK(%rip), %xmm14
546 PSHUFB_XMM %xmm14, %xmm\index
548 # prepare plaintext/ciphertext for GHASH computation
551 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
552 # apply GHASH on num_initial_blocks blocks
556 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
565 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
568 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
571 jl _initial_blocks_done\num_initial_blocks\operation
572 # no need for precomputed values
575 * Precomputations for HashKey parallel with encryption of first 4 blocks.
576 * HashKey_i_k holds XORed values of the low and high parts of HashKey_i
578 paddd ONE(%rip), \XMM0 # INCR Y0
580 movdqa SHUF_MASK(%rip), %xmm14
581 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
583 paddd ONE(%rip), \XMM0 # INCR Y0
585 movdqa SHUF_MASK(%rip), %xmm14
586 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
588 paddd ONE(%rip), \XMM0 # INCR Y0
590 movdqa SHUF_MASK(%rip), %xmm14
591 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
593 paddd ONE(%rip), \XMM0 # INCR Y0
595 movdqa SHUF_MASK(%rip), %xmm14
596 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
598 pxor 16*0(%arg1), \XMM1
599 pxor 16*0(%arg1), \XMM2
600 pxor 16*0(%arg1), \XMM3
601 pxor 16*0(%arg1), \XMM4
603 pshufd $78, \TMP3, \TMP1
605 movdqa \TMP1, HashKey_k(%rsp)
606 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
607 # TMP5 = HashKey^2<<1 (mod poly)
608 movdqa \TMP5, HashKey_2(%rsp)
609 # HashKey_2 = HashKey^2<<1 (mod poly)
610 pshufd $78, \TMP5, \TMP1
612 movdqa \TMP1, HashKey_2_k(%rsp)
613 .irpc index, 1234 # do 4 rounds
614 movaps 0x10*\index(%arg1), \TMP1
620 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
621 # TMP5 = HashKey^3<<1 (mod poly)
622 movdqa \TMP5, HashKey_3(%rsp)
623 pshufd $78, \TMP5, \TMP1
625 movdqa \TMP1, HashKey_3_k(%rsp)
626 .irpc index, 56789 # do next 5 rounds
627 movaps 0x10*\index(%arg1), \TMP1
633 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
634 # TMP5 = HashKey^4<<1 (mod poly)
635 movdqa \TMP5, HashKey_4(%rsp)
636 pshufd $78, \TMP5, \TMP1
638 movdqa \TMP1, HashKey_4_k(%rsp)
639 movaps 0xa0(%arg1), \TMP2
640 AESENCLAST \TMP2, \XMM1
641 AESENCLAST \TMP2, \XMM2
642 AESENCLAST \TMP2, \XMM3
643 AESENCLAST \TMP2, \XMM4
644 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
646 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
648 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
650 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
652 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
653 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
654 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
655 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
661 # combine GHASHed value with the corresponding ciphertext
662 movdqa SHUF_MASK(%rip), %xmm14
663 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
664 movdqa SHUF_MASK(%rip), %xmm14
665 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
666 movdqa SHUF_MASK(%rip), %xmm14
667 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
669 _initial_blocks_done\num_initial_blocks\operation:
674 * encrypt 4 blocks at a time
675 * ghash the 4 previously encrypted ciphertext blocks
676 * arg1, %arg2, %arg3 are used as pointers only, not modified
677 * %r11 is the data offset value
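/*
 * Why four blocks can be hashed per iteration (exposition only): unrolling
 * the GHASH recurrence Y = (Y ^ C[i]) * H four times gives
 *
 *	Y' = (Y ^ C[i])*H^4 ^ C[i+1]*H^3 ^ C[i+2]*H^2 ^ C[i+3]*H
 *
 * which is what this macro evaluates with the precomputed HashKey powers.
 * A C sketch of that identity, reusing ghash_mul_ref() from the sketch
 * above (assumption, not kernel code):
 *
 *	#include <string.h>
 *
 *	static void ghash_x4_ref(unsigned char y[16],
 *				 const unsigned char c[4][16],
 *				 const unsigned char *hpow[4])	// H^4, H^3, H^2, H
 *	{
 *		unsigned char acc[16] = { 0 };
 *		int i, j;
 *
 *		for (i = 0; i < 4; i++) {
 *			unsigned char x[16], t[16];
 *
 *			for (j = 0; j < 16; j++)
 *				x[j] = c[i][j] ^ (i == 0 ? y[j] : 0);
 *			ghash_mul_ref(t, x, hpow[i]);
 *			for (j = 0; j < 16; j++)
 *				acc[j] ^= t[j];
 *		}
 *		memcpy(y, acc, 16);
 *	}
 */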
679 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
680 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
687 movdqa SHUF_MASK(%rip), %xmm15
688 # multiply TMP5 * HashKey using karatsuba
691 pshufd $78, \XMM5, \TMP6
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa HashKey_4(%rsp), \TMP5
695 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
697 paddd ONE(%rip), \XMM0 # INCR CNT
699 paddd ONE(%rip), \XMM0 # INCR CNT
701 paddd ONE(%rip), \XMM0 # INCR CNT
703 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
704 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
705 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
706 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
707 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
713 movdqa HashKey_4_k(%rsp), \TMP5
714 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
715 movaps 0x10(%arg1), \TMP1
716 AESENC \TMP1, \XMM1 # Round 1
720 movaps 0x20(%arg1), \TMP1
721 AESENC \TMP1, \XMM1 # Round 2
726 pshufd $78, \XMM6, \TMP2
728 movdqa HashKey_3(%rsp), \TMP5
729 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
730 movaps 0x30(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 3
735 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
736 movaps 0x40(%arg1), \TMP3
737 AESENC \TMP3, \XMM1 # Round 4
741 movdqa HashKey_3_k(%rsp), \TMP5
742 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
743 movaps 0x50(%arg1), \TMP3
744 AESENC \TMP3, \XMM1 # Round 5
749 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
753 pshufd $78, \XMM7, \TMP2
755 movdqa HashKey_2(%rsp ), \TMP5
757 # Multiply TMP5 * HashKey using karatsuba
759 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
760 movaps 0x60(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 6
765 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
766 movaps 0x70(%arg1), \TMP3
767 AESENC \TMP3, \XMM1 # Round 7
771 movdqa HashKey_2_k(%rsp), \TMP5
772 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
773 movaps 0x80(%arg1), \TMP3
774 AESENC \TMP3, \XMM1 # Round 8
779 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
783 # Multiply XMM8 * HashKey
784 # XMM8 and TMP5 hold the values for the two operands
787 pshufd $78, \XMM8, \TMP2
789 movdqa HashKey(%rsp), \TMP5
790 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
791 movaps 0x90(%arg1), \TMP3
792 AESENC \TMP3, \XMM1 # Round 9
796 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
797 movaps 0xa0(%arg1), \TMP3
798 AESENCLAST \TMP3, \XMM1 # Round 10
799 AESENCLAST \TMP3, \XMM2
800 AESENCLAST \TMP3, \XMM3
801 AESENCLAST \TMP3, \XMM4
802 movdqa HashKey_k(%rsp), \TMP5
803 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
804 movdqu (%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
806 movdqu 16(%arg3,%r11,1), \TMP3
807 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
808 movdqu 32(%arg3,%r11,1), \TMP3
809 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
810 movdqu 48(%arg3,%r11,1), \TMP3
811 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
812 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
813 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
814 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
815 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
816 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
817 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
818 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
819 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
827 pslldq $8, \TMP3 # left shift TMP3 2 DWs
828 psrldq $8, \TMP2 # right shift TMP2 2 DWs
830 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
832 # first phase of reduction
837 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
838 pslld $31, \TMP2 # packed left shift << 31
839 pslld $30, \TMP3 # packed left shift << 30
840 pslld $25, \TMP4 # packed left shift << 25
841 pxor \TMP3, \TMP2 # xor the shifted versions
844 psrldq $4, \TMP5 # right shift T5 1 DW
845 pslldq $12, \TMP2 # left shift T2 3 DWs
848 # second phase of reduction
850 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
853 psrld $1, \TMP2 # packed right shift >>1
854 psrld $2, \TMP3 # packed right shift >>2
855 psrld $7, \TMP4 # packed right shift >>7
856 pxor \TMP3,\TMP2 # xor the shifted versions
860 pxor \TMP1, \XMM5 # result is in XMM5
866 * decrypt 4 blocks at a time
867 * ghash the 4 previously decrypted ciphertext blocks
868 * arg1, %arg2, %arg3 are used as pointers only, not modified
869 * %r11 is the data offset value
871 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
872 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
879 movdqa SHUF_MASK(%rip), %xmm15
880 # multiply TMP5 * HashKey using karatsuba
883 pshufd $78, \XMM5, \TMP6
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa HashKey_4(%rsp), \TMP5
887 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
889 paddd ONE(%rip), \XMM0 # INCR CNT
891 paddd ONE(%rip), \XMM0 # INCR CNT
893 paddd ONE(%rip), \XMM0 # INCR CNT
895 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
896 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
897 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
898 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
899 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
905 movdqa HashKey_4_k(%rsp), \TMP5
906 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
907 movaps 0x10(%arg1), \TMP1
908 AESENC \TMP1, \XMM1 # Round 1
912 movaps 0x20(%arg1), \TMP1
913 AESENC \TMP1, \XMM1 # Round 2
918 pshufd $78, \XMM6, \TMP2
920 movdqa HashKey_3(%rsp), \TMP5
921 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
922 movaps 0x30(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 3
927 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
928 movaps 0x40(%arg1), \TMP3
929 AESENC \TMP3, \XMM1 # Round 4
933 movdqa HashKey_3_k(%rsp), \TMP5
934 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
935 movaps 0x50(%arg1), \TMP3
936 AESENC \TMP3, \XMM1 # Round 5
941 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
945 pshufd $78, \XMM7, \TMP2
947 movdqa HashKey_2(%rsp ), \TMP5
949 # Multiply TMP5 * HashKey using karatsuba
951 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
952 movaps 0x60(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 6
957 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
958 movaps 0x70(%arg1), \TMP3
959 AESENC \TMP3, \XMM1 # Round 7
963 movdqa HashKey_2_k(%rsp), \TMP5
964 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
965 movaps 0x80(%arg1), \TMP3
966 AESENC \TMP3, \XMM1 # Round 8
971 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
975 # Multiply XMM8 * HashKey
976 # XMM8 and TMP5 hold the values for the two operands
979 pshufd $78, \XMM8, \TMP2
981 movdqa HashKey(%rsp), \TMP5
982 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
983 movaps 0x90(%arg1), \TMP3
984 AESENC \TMP3, \XMM1 # Round 9
988 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
989 movaps 0xa0(%arg1), \TMP3
990 AESENCLAST \TMP3, \XMM1 # Round 10
991 AESENCLAST \TMP3, \XMM2
992 AESENCLAST \TMP3, \XMM3
993 AESENCLAST \TMP3, \XMM4
994 movdqa HashKey_k(%rsp), \TMP5
995 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
996 movdqu (%arg3,%r11,1), \TMP3
997 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
998 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
1000 movdqu 16(%arg3,%r11,1), \TMP3
1001 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1002 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
1004 movdqu 32(%arg3,%r11,1), \TMP3
1005 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1006 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1008 movdqu 48(%arg3,%r11,1), \TMP3
1009 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1010 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1012 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1013 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1014 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1015 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1023 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1024 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1026 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1028 # first phase of reduction
1033 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1034 pslld $31, \TMP2 # packed left shift << 31
1035 pslld $30, \TMP3 # packed left shift << 30
1036 pslld $25, \TMP4 # packed left shift << 25
1037 pxor \TMP3, \TMP2 # xor the shifted versions
1040 psrldq $4, \TMP5 # right shift T5 1 DW
1041 pslldq $12, \TMP2 # left shift T2 3 DWs
1044 # second phase of reduction
1046 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1049 psrld $1, \TMP2 # packed right shift >>1
1050 psrld $2, \TMP3 # packed right shift >>2
1051 psrld $7, \TMP4 # packed right shift >>7
1052 pxor \TMP3,\TMP2 # xor the shifted versions
1056 pxor \TMP1, \XMM5 # result is in XMM5
1061 /* GHASH the last 4 ciphertext blocks. */
1062 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1063 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1065 # Multiply XMM1 * HashKey^4 (using Karatsuba)
1068 pshufd $78, \XMM1, \TMP2
1070 movdqa HashKey_4(%rsp), \TMP5
1071 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1072 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1073 movdqa HashKey_4_k(%rsp), \TMP4
1074 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1075 movdqa \XMM1, \XMMDst
1076 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1078 # Multiply XMM2 * HashKey^3 (using Karatsuba)
1081 pshufd $78, \XMM2, \TMP2
1083 movdqa HashKey_3(%rsp), \TMP5
1084 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1085 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1086 movdqa HashKey_3_k(%rsp), \TMP4
1087 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1091 # results accumulated in TMP6, XMMDst, XMM1
1093 # Multiply XMM3 * HashKey^2 (using Karatsuba)
1096 pshufd $78, \XMM3, \TMP2
1098 movdqa HashKey_2(%rsp), \TMP5
1099 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1100 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1101 movdqa HashKey_2_k(%rsp), \TMP4
1102 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1105 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1107 # Multiply XMM4 * HashKey (using Karatsuba)
1109 pshufd $78, \XMM4, \TMP2
1111 movdqa HashKey(%rsp), \TMP5
1112 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1113 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1114 movdqa HashKey_k(%rsp), \TMP4
1115 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1121 # middle section of the temp results combined as in karatsuba algorithm
1123 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1124 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1127 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1128 # first phase of the reduction
1129 movdqa \XMMDst, \TMP2
1130 movdqa \XMMDst, \TMP3
1131 movdqa \XMMDst, \TMP4
1132 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1133 pslld $31, \TMP2 # packed left shifting << 31
1134 pslld $30, \TMP3 # packed left shifting << 30
1135 pslld $25, \TMP4 # packed left shifting << 25
1136 pxor \TMP3, \TMP2 # xor the shifted versions
1139 psrldq $4, \TMP7 # right shift TMP7 1 DW
1140 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1143 # second phase of the reduction
1144 movdqa \XMMDst, \TMP2
1145 # make 3 copies of XMMDst for doing 3 shift operations
1146 movdqa \XMMDst, \TMP3
1147 movdqa \XMMDst, \TMP4
1148 psrld $1, \TMP2 # packed right shift >> 1
1149 psrld $2, \TMP3 # packed right shift >> 2
1150 psrld $7, \TMP4 # packed right shift >> 7
1151 pxor \TMP3, \TMP2 # xor the shifted versions
1155 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1158 /* Encryption of a single block done */
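/*
 * A hedged intrinsics sketch of what ENCRYPT_SINGLE_BLOCK computes
 * (assumption: rk[] holds the 11 expanded AES-128 round keys that the
 * assembly reads from (%arg1)):
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11])
 *	{
 *		int i;
 *
 *		block = _mm_xor_si128(block, rk[0]);		// round 0
 *		for (i = 1; i < 10; i++)
 *			block = _mm_aesenc_si128(block, rk[i]);	// rounds 1..9
 *		return _mm_aesenclast_si128(block, rk[10]);	// round 10
 *	}
 */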
1159 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1162 movaps 16(%arg1), \TMP1
1164 movaps 32(%arg1), \TMP1
1166 movaps 48(%arg1), \TMP1
1168 movaps 64(%arg1), \TMP1
1170 movaps 80(%arg1), \TMP1
1172 movaps 96(%arg1), \TMP1
1174 movaps 112(%arg1), \TMP1
1176 movaps 128(%arg1), \TMP1
1178 movaps 144(%arg1), \TMP1
1180 movaps 160(%arg1), \TMP1
1181 AESENCLAST \TMP1, \XMM0
1185 /*****************************************************************************
1186 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1187 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1188 * const u8 *in, // Ciphertext input
1189 * u64 plaintext_len, // Length of data in bytes for decryption.
1190 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1191 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1192 * // concatenated with 0x00000001. 16-byte aligned pointer.
1193 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1194 * const u8 *aad, // Additional Authentication Data (AAD)
1195 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1196 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1197 * // given authentication tag and only return the plaintext if they match.
1198 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1199 * // (most likely), 12 or 8.
1204 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1205 * set of 11 keys in the data structure void *aes_ctx
1209 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1210 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1211 * | Salt (From the SA) |
1212 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1213 * | Initialization Vector |
1214 * | (This is the sequence number from IPSec header) |
1215 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1217 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1222 * AAD padded to 128 bits with 0
1223 * for example, assume AAD is a u32 vector
1225 * if AAD is 8 bytes:
1226 * AAD[3] = {A0, A1};
1227 * padded AAD in xmm register = {A1 A0 0 0}
1230 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1231 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1233 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1234 * | 32-bit Sequence Number (A0) |
1235 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1237 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1239 * AAD Format with 32-bit Sequence Number
1241 * if AAD is 12 bytes:
1242 * AAD[3] = {A0, A1, A2};
1243 * padded AAD in xmm register = {A2 A1 A0 0}
1246 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1247 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1248 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1249 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1252 * | 64-bit Extended Sequence Number {A1,A0} |
1254 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1256 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1258 * AAD Format with 64-bit Extended Sequence Number
1261 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1262 * The code supports 16 too but for other sizes, the code will fail.
1265 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1266 * For other sizes, the code will fail.
1268 * poly = x^128 + x^127 + x^126 + x^121 + 1
1270 *****************************************************************************/
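/*
 * Hypothetical caller-side sketch (not kernel code): building the 16-byte
 * pre-counter block that the iv argument described above points to, i.e.
 * 4-byte salt || 8-byte IV || 0x00000001:
 *
 *	#include <string.h>
 *
 *	static void rfc4106_build_j0(unsigned char j0[16],
 *				     const unsigned char salt[4],
 *				     const unsigned char iv[8])
 *	{
 *		memcpy(j0, salt, 4);
 *		memcpy(j0 + 4, iv, 8);
 *		j0[12] = 0;
 *		j0[13] = 0;
 *		j0[14] = 0;
 *		j0[15] = 1;	// counter value 1, big endian
 *	}
 */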
1271 ENTRY(aesni_gcm_dec)
1277 * states of %xmm registers %xmm6:%xmm15 not saved
1278 * all %xmm registers are clobbered
1280 sub $VARIABLE_OFFSET, %rsp
1281 and $~63, %rsp # align rsp to 64 bytes
1283 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1284 movdqa SHUF_MASK(%rip), %xmm2
1285 PSHUFB_XMM %xmm2, %xmm13
1288 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1290 movdqa %xmm13, %xmm2
1300 pshufd $0x24, %xmm1, %xmm2
1301 pcmpeqd TWOONE(%rip), %xmm2
1302 pand POLY(%rip), %xmm2
1303 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1306 # Decrypt first few blocks
1308 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1309 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1310 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1313 jz _initial_num_blocks_is_0_decrypt
1315 jb _initial_num_blocks_is_1_decrypt
1316 je _initial_num_blocks_is_2_decrypt
1317 _initial_num_blocks_is_3_decrypt:
1318 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1321 jmp _initial_blocks_decrypted
1322 _initial_num_blocks_is_2_decrypt:
1323 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1326 jmp _initial_blocks_decrypted
1327 _initial_num_blocks_is_1_decrypt:
1328 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1331 jmp _initial_blocks_decrypted
1332 _initial_num_blocks_is_0_decrypt:
1333 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1334 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1335 _initial_blocks_decrypted:
1337 je _zero_cipher_left_decrypt
1339 je _four_cipher_left_decrypt
1341 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1342 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1346 _four_cipher_left_decrypt:
1347 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1348 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1349 _zero_cipher_left_decrypt:
1351 and $15, %r13 # %r13 = arg4 (mod 16)
1352 je _multiple_of_16_bytes_decrypt
1354 # Handle the last <16 byte block separately
1356 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1357 movdqa SHUF_MASK(%rip), %xmm10
1358 PSHUFB_XMM %xmm10, %xmm0
1360 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1363 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1364 lea SHIFT_MASK+16(%rip), %r12
1366 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1367 # (%r13 is the number of bytes in plaintext mod 16)
1368 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1369 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1372 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1373 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1374 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1375 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1377 movdqa SHUF_MASK(%rip), %xmm10
1378 PSHUFB_XMM %xmm10, %xmm2
1381 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1382 # GHASH computation for the last <16 byte block
1387 MOVQ_R64_XMM %xmm0, %rax
1389 jle _less_than_8_bytes_left_decrypt
1390 mov %rax, (%arg2 , %r11, 1)
1393 MOVQ_R64_XMM %xmm0, %rax
1395 _less_than_8_bytes_left_decrypt:
1396 mov %al, (%arg2, %r11, 1)
1400 jne _less_than_8_bytes_left_decrypt
1401 _multiple_of_16_bytes_decrypt:
1402 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1403 shl $3, %r12 # convert into number of bits
1404 movd %r12d, %xmm15 # len(A) in %xmm15
1405 shl $3, %arg4 # len(C) in bits (*8)
1406 MOVQ_R64_XMM %arg4, %xmm1
1407 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1408 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1410 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1411 # final GHASH computation
1412 movdqa SHUF_MASK(%rip), %xmm10
1413 PSHUFB_XMM %xmm10, %xmm8
1415 mov %arg5, %rax # %rax = *Y0
1416 movdqu (%rax), %xmm0 # %xmm0 = Y0
1417 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1420 mov arg9, %r10 # %r10 = authTag
1421 mov arg10, %r11 # %r11 = auth_tag_len
1427 MOVQ_R64_XMM %xmm0, %rax
1429 jmp _return_T_done_decrypt
1431 MOVQ_R64_XMM %xmm0, %rax
1436 jmp _return_T_done_decrypt
1438 movdqu %xmm0, (%r10)
1439 _return_T_done_decrypt:
1445 ENDPROC(aesni_gcm_dec)
1448 /*****************************************************************************
1449 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1450 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1451 * const u8 *in, // Plaintext input
1452 * u64 plaintext_len, // Length of data in bytes for encryption.
1453 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1454 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1455 * // concatenated with 0x00000001. 16-byte aligned pointer.
1456 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1457 * const u8 *aad, // Additional Authentication Data (AAD)
1458 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1459 * u8 *auth_tag, // Authenticated Tag output.
1460 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1466 * keys are pre-expanded and aligned to 16 bytes. we are using the
1467 * first set of 11 keys in the data structure void *aes_ctx
1472 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1473 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1474 * | Salt (From the SA) |
1475 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1476 * | Initialization Vector |
1477 * | (This is the sequence number from IPSec header) |
1478 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1480 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1485 * AAD padded to 128 bits with 0
1486 * for example, assume AAD is a u32 vector
1488 * if AAD is 8 bytes:
1489 * AAD[3] = {A0, A1};
1490 * padded AAD in xmm register = {A1 A0 0 0}
1493 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1494 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1496 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1497 * | 32-bit Sequence Number (A0) |
1498 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1500 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1502 * AAD Format with 32-bit Sequence Number
1504 * if AAD is 12 bytes:
1505 * AAD[3] = {A0, A1, A2};
1506 * padded AAD in xmm register = {A2 A1 A0 0}
1509 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1510 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1513 * | 64-bit Extended Sequence Number {A1,A0} |
1515 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1517 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1519 * AAD Format with 64-bit Extended Sequence Number
1522 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1523 * The code supports 16 too but for other sizes, the code will fail.
1526 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1527 * For other sizes, the code will fail.
1529 * poly = x^128 + x^127 + x^126 + x^121 + 1
1530 ***************************************************************************/
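/*
 * Hypothetical caller-side sketch (not kernel code): the AAD described
 * above is zero-padded to a full 16-byte block before it is hashed, which
 * in C amounts to:
 *
 *	#include <string.h>
 *
 *	static void pad_aad_block(unsigned char block[16],
 *				  const unsigned char *aad,
 *				  unsigned long aad_len)	// 8 or 12 here
 *	{
 *		memset(block, 0, 16);
 *		memcpy(block, aad, aad_len);
 *	}
 */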
1531 ENTRY(aesni_gcm_enc)
1537 # states of %xmm registers %xmm6:%xmm15 not saved
1538 # all %xmm registers are clobbered
1540 sub $VARIABLE_OFFSET, %rsp
1543 movdqu (%r12), %xmm13
1544 movdqa SHUF_MASK(%rip), %xmm2
1545 PSHUFB_XMM %xmm2, %xmm13
1548 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1550 movdqa %xmm13, %xmm2
1560 pshufd $0x24, %xmm1, %xmm2
1561 pcmpeqd TWOONE(%rip), %xmm2
1562 pand POLY(%rip), %xmm2
1564 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1565 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1569 # Encrypt first few blocks
1572 jz _initial_num_blocks_is_0_encrypt
1574 jb _initial_num_blocks_is_1_encrypt
1575 je _initial_num_blocks_is_2_encrypt
1576 _initial_num_blocks_is_3_encrypt:
1577 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1578 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1580 jmp _initial_blocks_encrypted
1581 _initial_num_blocks_is_2_encrypt:
1582 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1583 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1585 jmp _initial_blocks_encrypted
1586 _initial_num_blocks_is_1_encrypt:
1587 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1588 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1590 jmp _initial_blocks_encrypted
1591 _initial_num_blocks_is_0_encrypt:
1592 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1593 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1594 _initial_blocks_encrypted:
1596 # Main loop - Encrypt remaining blocks
1599 je _zero_cipher_left_encrypt
1601 je _four_cipher_left_encrypt
1602 _encrypt_by_4_encrypt:
1603 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1604 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1607 jne _encrypt_by_4_encrypt
1608 _four_cipher_left_encrypt:
1609 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1610 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1611 _zero_cipher_left_encrypt:
1613 and $15, %r13 # %r13 = arg4 (mod 16)
1614 je _multiple_of_16_bytes_encrypt
1616 # Handle the last <16 Byte block separately
1617 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1618 movdqa SHUF_MASK(%rip), %xmm10
1619 PSHUFB_XMM %xmm10, %xmm0
1622 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1625 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1626 lea SHIFT_MASK+16(%rip), %r12
1628 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1629 # (%r13 is the number of bytes in plaintext mod 16)
1630 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1631 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 bytes
1632 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1633 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1634 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1635 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1636 movdqa SHUF_MASK(%rip), %xmm10
1637 PSHUFB_XMM %xmm10,%xmm0
1640 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1641 # GHASH computation for the last <16 byte block
1645 movdqa SHUF_MASK(%rip), %xmm10
1646 PSHUFB_XMM %xmm10, %xmm0
1648 # shuffle xmm0 back to output as ciphertext
1651 MOVQ_R64_XMM %xmm0, %rax
1653 jle _less_than_8_bytes_left_encrypt
1654 mov %rax, (%arg2 , %r11, 1)
1657 MOVQ_R64_XMM %xmm0, %rax
1659 _less_than_8_bytes_left_encrypt:
1660 mov %al, (%arg2, %r11, 1)
1664 jne _less_than_8_bytes_left_encrypt
1665 _multiple_of_16_bytes_encrypt:
1666 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1668 movd %r12d, %xmm15 # len(A) in %xmm15
1669 shl $3, %arg4 # len(C) in bits (*8)
1670 MOVQ_R64_XMM %arg4, %xmm1
1671 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1672 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1674 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1675 # final GHASH computation
1676 movdqa SHUF_MASK(%rip), %xmm10
1677 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1679 mov %arg5, %rax # %rax = *Y0
1680 movdqu (%rax), %xmm0 # %xmm0 = Y0
1681 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1684 mov arg9, %r10 # %r10 = authTag
1685 mov arg10, %r11 # %r11 = auth_tag_len
1691 MOVQ_R64_XMM %xmm0, %rax
1693 jmp _return_T_done_encrypt
1695 MOVQ_R64_XMM %xmm0, %rax
1700 jmp _return_T_done_encrypt
1702 movdqu %xmm0, (%r10)
1703 _return_T_done_encrypt:
1709 ENDPROC(aesni_gcm_enc)
1716 _key_expansion_256a:
1717 pshufd $0b11111111, %xmm1, %xmm1
1718 shufps $0b00010000, %xmm0, %xmm4
1720 shufps $0b10001100, %xmm0, %xmm4
1723 movaps %xmm0, (TKEYP)
1726 ENDPROC(_key_expansion_128)
1727 ENDPROC(_key_expansion_256a)
1730 _key_expansion_192a:
1731 pshufd $0b01010101, %xmm1, %xmm1
1732 shufps $0b00010000, %xmm0, %xmm4
1734 shufps $0b10001100, %xmm0, %xmm4
1741 pshufd $0b11111111, %xmm0, %xmm3
1746 shufps $0b01000100, %xmm0, %xmm6
1747 movaps %xmm6, (TKEYP)
1748 shufps $0b01001110, %xmm2, %xmm1
1749 movaps %xmm1, 0x10(TKEYP)
1752 ENDPROC(_key_expansion_192a)
1755 _key_expansion_192b:
1756 pshufd $0b01010101, %xmm1, %xmm1
1757 shufps $0b00010000, %xmm0, %xmm4
1759 shufps $0b10001100, %xmm0, %xmm4
1765 pshufd $0b11111111, %xmm0, %xmm3
1769 movaps %xmm0, (TKEYP)
1772 ENDPROC(_key_expansion_192b)
1775 _key_expansion_256b:
1776 pshufd $0b10101010, %xmm1, %xmm1
1777 shufps $0b00010000, %xmm2, %xmm4
1779 shufps $0b10001100, %xmm2, %xmm4
1782 movaps %xmm2, (TKEYP)
1785 ENDPROC(_key_expansion_256b)
1788 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1789 * unsigned int key_len)
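/*
 * A hedged intrinsics sketch of one AES-128 key-expansion step, i.e. what
 * each AESKEYGENASSIST + _key_expansion_128 pair below produces (rk is the
 * previous round key, kga the AESKEYGENASSIST result for it; the register
 * recipe used by the assembly differs, but it should yield the same
 * schedule):
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i aes128_key_expand_step(__m128i rk, __m128i kga)
 *	{
 *		__m128i t;
 *
 *		kga = _mm_shuffle_epi32(kga, 0xff);	// broadcast SubWord(RotWord(w3)) ^ rcon
 *		t  = _mm_slli_si128(rk, 4);
 *		rk = _mm_xor_si128(rk, t);
 *		t  = _mm_slli_si128(t, 4);
 *		rk = _mm_xor_si128(rk, t);
 *		t  = _mm_slli_si128(t, 4);
 *		rk = _mm_xor_si128(rk, t);
 *		return _mm_xor_si128(rk, kga);
 *	}
 *
 *	// e.g.: rk1 = aes128_key_expand_step(rk0, _mm_aeskeygenassist_si128(rk0, 0x1));
 */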
1791 ENTRY(aesni_set_key)
1794 movl 8(%esp), KEYP # ctx
1795 movl 12(%esp), UKEYP # in_key
1796 movl 16(%esp), %edx # key_len
1798 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1799 movaps %xmm0, (KEYP)
1800 lea 0x10(KEYP), TKEYP # key addr
1801 movl %edx, 480(KEYP)
1802 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1806 movups 0x10(UKEYP), %xmm2 # other user key
1807 movaps %xmm2, (TKEYP)
1809 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1810 call _key_expansion_256a
1811 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1812 call _key_expansion_256b
1813 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1814 call _key_expansion_256a
1815 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1816 call _key_expansion_256b
1817 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1818 call _key_expansion_256a
1819 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1820 call _key_expansion_256b
1821 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1822 call _key_expansion_256a
1823 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1824 call _key_expansion_256b
1825 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1826 call _key_expansion_256a
1827 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1828 call _key_expansion_256b
1829 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1830 call _key_expansion_256a
1831 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1832 call _key_expansion_256b
1833 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1834 call _key_expansion_256a
1837 movq 0x10(UKEYP), %xmm2 # other user key
1838 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1839 call _key_expansion_192a
1840 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1841 call _key_expansion_192b
1842 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1843 call _key_expansion_192a
1844 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1845 call _key_expansion_192b
1846 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1847 call _key_expansion_192a
1848 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1849 call _key_expansion_192b
1850 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1851 call _key_expansion_192a
1852 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1853 call _key_expansion_192b
1856 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1857 call _key_expansion_128
1858 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1859 call _key_expansion_128
1860 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1861 call _key_expansion_128
1862 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1863 call _key_expansion_128
1864 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1865 call _key_expansion_128
1866 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1867 call _key_expansion_128
1868 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1869 call _key_expansion_128
1870 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1871 call _key_expansion_128
1872 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1873 call _key_expansion_128
1874 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1875 call _key_expansion_128
1878 movaps (KEYP), %xmm0
1879 movaps (TKEYP), %xmm1
1880 movaps %xmm0, 240(TKEYP)
1881 movaps %xmm1, 240(KEYP)
1883 lea 240-16(TKEYP), UKEYP
1886 movaps (KEYP), %xmm0
1888 movaps %xmm1, (UKEYP)
1898 ENDPROC(aesni_set_key)
1901 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1911 movl 480(KEYP), KLEN # key length
1912 movups (INP), STATE # input
1914 movups STATE, (OUTP) # output
1923 * _aesni_enc1: internal ABI
1925 * KEYP: key struct pointer
1927 * STATE: initial state (input)
1929 * STATE: final state (output)
1936 movaps (KEYP), KEY # key
1938 pxor KEY, STATE # round 0
1942 lea 0x20(TKEYP), TKEYP
1945 movaps -0x60(TKEYP), KEY
1947 movaps -0x50(TKEYP), KEY
1951 movaps -0x40(TKEYP), KEY
1953 movaps -0x30(TKEYP), KEY
1957 movaps -0x20(TKEYP), KEY
1959 movaps -0x10(TKEYP), KEY
1963 movaps 0x10(TKEYP), KEY
1965 movaps 0x20(TKEYP), KEY
1967 movaps 0x30(TKEYP), KEY
1969 movaps 0x40(TKEYP), KEY
1971 movaps 0x50(TKEYP), KEY
1973 movaps 0x60(TKEYP), KEY
1975 movaps 0x70(TKEYP), KEY
1976 AESENCLAST KEY STATE
1978 ENDPROC(_aesni_enc1)
1981 * _aesni_enc4: internal ABI
1983 * KEYP: key struct pointer
1985 * STATE1: initial state (input)
1990 * STATE1: final state (output)
2000 movaps (KEYP), KEY # key
2002 pxor KEY, STATE1 # round 0
2009 lea 0x20(TKEYP), TKEYP
2012 movaps -0x60(TKEYP), KEY
2017 movaps -0x50(TKEYP), KEY
2024 movaps -0x40(TKEYP), KEY
2029 movaps -0x30(TKEYP), KEY
2036 movaps -0x20(TKEYP), KEY
2041 movaps -0x10(TKEYP), KEY
2051 movaps 0x10(TKEYP), KEY
2056 movaps 0x20(TKEYP), KEY
2061 movaps 0x30(TKEYP), KEY
2066 movaps 0x40(TKEYP), KEY
2071 movaps 0x50(TKEYP), KEY
2076 movaps 0x60(TKEYP), KEY
2081 movaps 0x70(TKEYP), KEY
2082 AESENCLAST KEY STATE1 # last round
2083 AESENCLAST KEY STATE2
2084 AESENCLAST KEY STATE3
2085 AESENCLAST KEY STATE4
2087 ENDPROC(_aesni_enc4)
2090 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2100 mov 480(KEYP), KLEN # key length
2102 movups (INP), STATE # input
2104 movups STATE, (OUTP) #output
2113 * _aesni_dec1: internal ABI
2115 * KEYP: key struct pointer
2117 * STATE: initial state (input)
2119 * STATE: final state (output)
2126 movaps (KEYP), KEY # key
2128 pxor KEY, STATE # round 0
2132 lea 0x20(TKEYP), TKEYP
2135 movaps -0x60(TKEYP), KEY
2137 movaps -0x50(TKEYP), KEY
2141 movaps -0x40(TKEYP), KEY
2143 movaps -0x30(TKEYP), KEY
2147 movaps -0x20(TKEYP), KEY
2149 movaps -0x10(TKEYP), KEY
2153 movaps 0x10(TKEYP), KEY
2155 movaps 0x20(TKEYP), KEY
2157 movaps 0x30(TKEYP), KEY
2159 movaps 0x40(TKEYP), KEY
2161 movaps 0x50(TKEYP), KEY
2163 movaps 0x60(TKEYP), KEY
2165 movaps 0x70(TKEYP), KEY
2166 AESDECLAST KEY STATE
2168 ENDPROC(_aesni_dec1)
2171 * _aesni_dec4: internal ABI
2173 * KEYP: key struct pointer
2175 * STATE1: initial state (input)
2180 * STATE1: final state (output)
2190 movaps (KEYP), KEY # key
2192 pxor KEY, STATE1 # round 0
2199 lea 0x20(TKEYP), TKEYP
2202 movaps -0x60(TKEYP), KEY
2207 movaps -0x50(TKEYP), KEY
2214 movaps -0x40(TKEYP), KEY
2219 movaps -0x30(TKEYP), KEY
2226 movaps -0x20(TKEYP), KEY
2231 movaps -0x10(TKEYP), KEY
2241 movaps 0x10(TKEYP), KEY
2246 movaps 0x20(TKEYP), KEY
2251 movaps 0x30(TKEYP), KEY
2256 movaps 0x40(TKEYP), KEY
2261 movaps 0x50(TKEYP), KEY
2266 movaps 0x60(TKEYP), KEY
2271 movaps 0x70(TKEYP), KEY
2272 AESDECLAST KEY STATE1 # last round
2273 AESDECLAST KEY STATE2
2274 AESDECLAST KEY STATE3
2275 AESDECLAST KEY STATE4
2277 ENDPROC(_aesni_dec4)
2280 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2283 ENTRY(aesni_ecb_enc)
2293 test LEN, LEN # check length
2302 movups (INP), STATE1
2303 movups 0x10(INP), STATE2
2304 movups 0x20(INP), STATE3
2305 movups 0x30(INP), STATE4
2307 movups STATE1, (OUTP)
2308 movups STATE2, 0x10(OUTP)
2309 movups STATE3, 0x20(OUTP)
2310 movups STATE4, 0x30(OUTP)
2320 movups (INP), STATE1
2322 movups STATE1, (OUTP)
2335 ENDPROC(aesni_ecb_enc)
2338 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2341 ENTRY(aesni_ecb_dec)
2361 movups (INP), STATE1
2362 movups 0x10(INP), STATE2
2363 movups 0x20(INP), STATE3
2364 movups 0x30(INP), STATE4
2366 movups STATE1, (OUTP)
2367 movups STATE2, 0x10(OUTP)
2368 movups STATE3, 0x20(OUTP)
2369 movups STATE4, 0x30(OUTP)
2379 movups (INP), STATE1
2381 movups STATE1, (OUTP)
2394 ENDPROC(aesni_ecb_dec)
2397 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2398 * size_t len, u8 *iv)
2400 ENTRY(aesni_cbc_enc)
2415 movups (IVP), STATE # load iv as initial state
2418 movups (INP), IN # load input
2421 movups STATE, (OUTP) # store output
2436 ENDPROC(aesni_cbc_enc)
2439 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2440 * size_t len, u8 *iv)
2442 ENTRY(aesni_cbc_dec)
2455 jb .Lcbc_dec_just_ret
2465 movups 0x10(INP), IN2
2468 movups 0x20(INP), IN3
2470 movups 0x30(INP), IN4
2473 movups 0x20(INP), IN1
2475 movups 0x30(INP), IN2
2490 movups 0x10(INP), IN2
2493 movups STATE1, (OUTP)
2494 movups STATE2, 0x10(OUTP)
2495 movups STATE3, 0x20(OUTP)
2496 movups STATE4, 0x30(OUTP)
2510 movups STATE, (OUTP)
2527 ENDPROC(aesni_cbc_dec)
2532 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2535 * _aesni_inc_init: internal ABI
2536 * setup registers used by _aesni_inc
2540 * CTR: == IV, in little endian
2541 * TCTR_LOW: == lower qword of CTR
2542 * INC: == 1, in little endian
2543 * BSWAP_MASK == endian swapping mask
2547 movaps .Lbswap_mask, BSWAP_MASK
2549 PSHUFB_XMM BSWAP_MASK CTR
2551 MOVQ_R64_XMM TCTR_LOW INC
2552 MOVQ_R64_XMM CTR TCTR_LOW
2554 ENDPROC(_aesni_inc_init)
2557 * _aesni_inc: internal ABI
2558 * Increase IV by 1, IV is in big endian
2561 * CTR: == IV, in little endian
2562 * TCTR_LOW: == lower qword of CTR
2563 * INC: == 1, in little endian
2564 * BSWAP_MASK == endian swapping mask
2568 * CTR: == output IV, in little endian
2569 * TCTR_LOW: == lower qword of CTR
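/*
 * Illustrative sketch (exposition only): the net effect described above is
 * a 128-bit big-endian increment of the IV, which in C is simply:
 *
 *	static void ctr128_inc_be(unsigned char iv[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++iv[i] != 0)	// stop once a byte does not wrap
 *				break;
 *	}
 */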
2581 PSHUFB_XMM BSWAP_MASK IV
2586 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2587 * size_t len, u8 *iv)
2589 ENTRY(aesni_ctr_enc)
2591 jb .Lctr_enc_just_ret
2594 call _aesni_inc_init
2604 movups 0x10(INP), IN2
2607 movups 0x20(INP), IN3
2610 movups 0x30(INP), IN4
2613 movups STATE1, (OUTP)
2615 movups STATE2, 0x10(OUTP)
2617 movups STATE3, 0x20(OUTP)
2619 movups STATE4, 0x30(OUTP)
2634 movups STATE, (OUTP)
2644 ENDPROC(aesni_ctr_enc)
2647 * _aesni_gf128mul_x_ble: internal ABI
2648 * Multiply in GF(2^128) for XTS IVs
2651 * GF128MUL_MASK == mask with 0x87 and 0x01
2655 * CTR: == temporary value
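/*
 * Illustrative sketch (exposition only): the macro below multiplies the
 * XTS tweak by x in GF(2^128) using the little-endian block ("ble")
 * convention: shift the 128-bit value left by one and, if a bit fell off
 * the top, fold it back in with 0x87. In C:
 *
 *	static void gf128mul_x_ble_ref(unsigned char t[16])
 *	{
 *		int carry = t[15] >> 7;		// bit shifted out of the top
 *		int i;
 *
 *		for (i = 15; i > 0; i--)
 *			t[i] = (unsigned char)((t[i] << 1) | (t[i - 1] >> 7));
 *		t[0] = (unsigned char)(t[0] << 1);
 *		if (carry)
 *			t[0] ^= 0x87;
 *	}
 */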
2657 #define _aesni_gf128mul_x_ble() \
2658 pshufd $0x13, IV, CTR; \
2661 pand GF128MUL_MASK, CTR; \
2665 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2668 ENTRY(aesni_xts_crypt8)
2672 leaq _aesni_enc4, %r11
2673 leaq _aesni_dec4, %rax
2677 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2684 pxor 0x00(INP), STATE1
2685 movdqu IV, 0x00(OUTP)
2687 _aesni_gf128mul_x_ble()
2689 pxor 0x10(INP), STATE2
2690 movdqu IV, 0x10(OUTP)
2692 _aesni_gf128mul_x_ble()
2694 pxor 0x20(INP), STATE3
2695 movdqu IV, 0x20(OUTP)
2697 _aesni_gf128mul_x_ble()
2699 pxor 0x30(INP), STATE4
2700 movdqu IV, 0x30(OUTP)
2704 pxor 0x00(OUTP), STATE1
2705 movdqu STATE1, 0x00(OUTP)
2707 _aesni_gf128mul_x_ble()
2709 pxor 0x40(INP), STATE1
2710 movdqu IV, 0x40(OUTP)
2712 pxor 0x10(OUTP), STATE2
2713 movdqu STATE2, 0x10(OUTP)
2715 _aesni_gf128mul_x_ble()
2717 pxor 0x50(INP), STATE2
2718 movdqu IV, 0x50(OUTP)
2720 pxor 0x20(OUTP), STATE3
2721 movdqu STATE3, 0x20(OUTP)
2723 _aesni_gf128mul_x_ble()
2725 pxor 0x60(INP), STATE3
2726 movdqu IV, 0x60(OUTP)
2728 pxor 0x30(OUTP), STATE4
2729 movdqu STATE4, 0x30(OUTP)
2731 _aesni_gf128mul_x_ble()
2733 pxor 0x70(INP), STATE4
2734 movdqu IV, 0x70(OUTP)
2736 _aesni_gf128mul_x_ble()
2741 pxor 0x40(OUTP), STATE1
2742 movdqu STATE1, 0x40(OUTP)
2744 pxor 0x50(OUTP), STATE2
2745 movdqu STATE2, 0x50(OUTP)
2747 pxor 0x60(OUTP), STATE3
2748 movdqu STATE3, 0x60(OUTP)
2750 pxor 0x70(OUTP), STATE4
2751 movdqu STATE4, 0x70(OUTP)
2754 ENDPROC(aesni_xts_crypt8)