2 * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
4 * Copyright (C) 2015 Martin Willi
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
12 #include <linux/linkage.h>
# Mask with 0x3ffffff (the low 26 bits) set in each 64-bit half of this
# octaword constant; loaded with movdqa in poly1305_block_sse2.
17 ANMASK: .octa 0x0000000003ffffff0000000003ffffff
50 ENTRY(poly1305_block_sse2)
# Folds one 16-byte message block into the Poly1305 accumulator, following
# the register contract listed below. Only some instructions of this routine
# are visible in this view; added notes describe just those visible lines.
51 # %rdi: Accumulator h[5]
52 # %rsi: 16 byte input block m
53 # %rdx: Poly1305 key r[5]
56 # This single block variant tries to improve performance by doing two
57 # multiplications in parallel using SSE instructions. There is quite
58 # some quadword packing involved, hence the speedup is marginal.
# Each of the four lea instructions below computes %eax = %eax * 5 (as
# x + 4 * x) and leaves the flags untouched. Presumably they produce
# s1..s4 = r1..r4 * 5 for the products further down; the loads and stores
# around them are not visible here -- TODO confirm.
# NOTE(review): 32-bit base/index registers need an address-size override
# in 64-bit mode; (%rax,%rax,4) with the same %eax destination should yield
# the same 32-bit result -- confirm before changing.
66 lea (%eax,%eax,4),%eax
69 lea (%eax,%eax,4),%eax
72 lea (%eax,%eax,4),%eax
75 lea (%eax,%eax,4),%eax
# Load the ANMASK constant RIP-relative. The destination named mask is
# presumably a preprocessor alias for an %xmm register defined elsewhere --
# verify. movdqa requires a 16-byte aligned memory operand, so ANMASK has to
# be aligned accordingly (no alignment directive is visible in this view).
78 movdqa ANMASK(%rip),mask
# Packed accumulator layouts that the comments below refer to; the 32-bit
# lanes are presumably listed from high to low -- TODO confirm.
81 # h01 = [0, h1, 0, h0]
82 # h23 = [0, h3, 0, h2]
83 # h44 = [0, h4, 0, h4]
# Add the message block to the accumulator, split into 26-bit pieces taken
# at bit offsets 0, 26, 52 and 78 of m (byte ranges as noted per line).
93 # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
100 # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
# The top piece starts at bit 104 of m, so the or-ed (1 << 24) lands on
# bit 128, one past the 16 message bytes.
108 # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
# Multiply the accumulator by the key, two result limbs per round. The sN
# terms presumably stand for rN * 5 -- TODO confirm. t3 is not defined in
# the visible comments; presumably it carries the h4 products -- verify.
116 # t1[0] = h0 * r0 + h2 * s3
117 # t1[1] = h1 * s4 + h3 * s2
127 # t2[0] = h0 * r1 + h2 * s4
128 # t2[1] = h1 * r0 + h3 * s3
144 # d0 = t1[0] + t1[1] + t3[0]
145 # d1 = t2[0] + t2[1] + t3[1]
155 # t1[0] = h0 * r2 + h2 * r0
156 # t1[1] = h1 * r1 + h3 * s4
166 # t2[0] = h0 * r3 + h2 * r1
167 # t2[1] = h1 * r2 + h3 * r0
183 # d2 = t1[0] + t1[1] + t3[0]
184 # d3 = t2[0] + t2[1] + t3[1]
194 # t1[0] = h0 * r4 + h2 * r2
195 # t1[1] = h1 * r3 + h3 * r1
208 # d4 = t1[0] + t1[1] + t3[0]
# Reduce every dN to a 26-bit limb (0x3ffffff = 2^26 - 1). The bits of d4
# above bit 25 re-enter h0 multiplied by 5, as the comments below state.
219 # h0 = d0 & 0x3ffffff
227 # h1 = d1 & 0x3ffffff
236 # h2 = d2 & 0x3ffffff
245 # h3 = d3 & 0x3ffffff
250 # h0 += (d4 >> 26) * 5
253 lea (%eax,%eax,4),%eax # %eax = %eax * 5, the factor named just above
255 # h4 = d4 & 0x3ffffff
264 # h0 = h0 & 0x3ffffff
276 ENDPROC(poly1305_block_sse2)