From 830a526837f6b4a949559c45c311d0ade0132589 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Tue, 10 Jan 2017 18:32:19 -0800
Subject: [PATCH] ANDROID: arm64/crypto: add ARMv8-CE optimized poly_hash
 algorithm

poly_hash is part of the HEH (Hash-Encrypt-Hash) encryption mode,
proposed in Internet Draft https://tools.ietf.org/html/draft-cope-heh-01.
poly_hash is very similar to GHASH; besides the swapping of the last two
coefficients, which we opted to handle in the HEH template, poly_hash
just uses a different finite field representation.  As with GHASH,
poly_hash becomes much faster and more secure against timing attacks
when implemented using carryless multiplication instructions instead of
tables.

This patch adds an ARMv8-CE optimized version of poly_hash, based
roughly on the existing ARMv8-CE optimized version of GHASH.

Benchmark results are shown below, but note that the resistance to
timing attacks may be even more important than the performance gain.

poly_hash only:

    poly_hash-generic: 1,000,000 setkey() calls take 1185 ms;
                       hashing is 328 MB/s
    poly_hash-ce:      1,000,000 setkey() calls take 8 ms;
                       hashing is 1756 MB/s

heh(aes) with 4096-byte inputs (this is the ideal case, as the
improvement is less significant with smaller inputs):

    encryption with "heh_base(cmac(aes-ce),poly_hash-generic,ecb-aes-ce)": 118 MB/s
    decryption with "heh_base(cmac(aes-ce),poly_hash-generic,ecb-aes-ce)": 120 MB/s
    encryption with "heh_base(cmac(aes-ce),poly_hash-ce,ecb-aes-ce)": 291 MB/s
    decryption with "heh_base(cmac(aes-ce),poly_hash-ce,ecb-aes-ce)": 293 MB/s

Bug: 32508661
Signed-off-by: Eric Biggers
Change-Id: I621ec0e1115df7e6f5cbd7e864a4a9d8d2e94cf2
---
 arch/arm64/crypto/Kconfig             |   5 +
 arch/arm64/crypto/Makefile            |   3 +
 arch/arm64/crypto/poly-hash-ce-core.S | 163 +++++++++++++++++++++++++
 arch/arm64/crypto/poly-hash-ce-glue.c | 166 ++++++++++++++++++++++++++
 crypto/Kconfig                        |   1 +
 5 files changed, 338 insertions(+)
 create mode 100644 arch/arm64/crypto/poly-hash-ce-core.S
 create mode 100644 arch/arm64/crypto/poly-hash-ce-glue.c

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 2cf32e9887e1..de1aab4b5da8 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -23,6 +23,11 @@ config CRYPTO_GHASH_ARM64_CE
 	depends on ARM64 && KERNEL_MODE_NEON
 	select CRYPTO_HASH
 
+config CRYPTO_POLY_HASH_ARM64_CE
+	tristate "poly_hash (for HEH encryption mode) using ARMv8 Crypto Extensions"
+	depends on ARM64 && KERNEL_MODE_NEON
+	select CRYPTO_HASH
+
 config CRYPTO_AES_ARM64_CE
 	tristate "AES core cipher using ARMv8 Crypto Extensions"
 	depends on ARM64 && KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index abb79b3cfcfe..f0a8f2475ea3 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -17,6 +17,9 @@ sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
 obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
 ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 
+obj-$(CONFIG_CRYPTO_POLY_HASH_ARM64_CE) += poly-hash-ce.o
+poly-hash-ce-y := poly-hash-ce-glue.o poly-hash-ce-core.o
+
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
 CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
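
For reference, the computation this driver accelerates can be written out in
portable C.  The sketch below is illustrative only and is not part of the
patch: the names ble128, ble_mul() and poly_hash_ref() are made up here, and
the bit-at-a-time multiply stands in for the PMULL-based Karatsuba/Barrett
code added in poly-hash-ce-core.S below.  It evaluates the input polynomial by
Horner's rule in the same "ble" convention (bit i of the 128-bit value is the
coefficient of x^i), reducing modulo g(x) = x^128 + x^7 + x^2 + x + 1.

#include <stdint.h>
#include <string.h>

/* GF(2^128) element in the "ble" convention: lo holds the coefficients of
 * x^0..x^63, hi holds the coefficients of x^64..x^127. */
struct ble128 { uint64_t lo, hi; };

/* Carryless multiply a*b modulo g(x) = x^128 + x^7 + x^2 + x + 1,
 * one bit of b at a time. */
static void ble_mul(struct ble128 *r, const struct ble128 *a,
		    const struct ble128 *b)
{
	struct ble128 p = { 0, 0 };
	struct ble128 ax = *a;
	int i;

	for (i = 0; i < 128; i++) {
		uint64_t bit = (i < 64) ? (b->lo >> i) & 1
					: (b->hi >> (i - 64)) & 1;
		uint64_t carry;

		if (bit) {
			p.lo ^= ax.lo;
			p.hi ^= ax.hi;
		}
		/* Multiply ax by x; fold the coefficient shifted out of x^127
		 * back in as x^7 + x^2 + x + 1 (0x87). */
		carry = ax.hi >> 63;
		ax.hi = (ax.hi << 1) | (ax.lo >> 63);
		ax.lo = (ax.lo << 1) ^ (carry ? 0x87 : 0);
	}
	*r = p;
}

/* digest = ((m[0]*k + m[1])*k + ... + m[n-1])*k, i.e. Horner's rule,
 * matching the add-block-then-multiply loop in the assembly. */
static void poly_hash_ref(struct ble128 *digest, const struct ble128 *k,
			  const struct ble128 *m, unsigned int n)
{
	unsigned int i;

	memset(digest, 0, sizeof(*digest));
	for (i = 0; i < n; i++) {
		digest->lo ^= m[i].lo;
		digest->hi ^= m[i].hi;
		ble_mul(digest, digest, k);
	}
}
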
diff --git a/arch/arm64/crypto/poly-hash-ce-core.S b/arch/arm64/crypto/poly-hash-ce-core.S
new file mode 100644
index 000000000000..8ccb544c5526
--- /dev/null
+++ b/arch/arm64/crypto/poly-hash-ce-core.S
@@ -0,0 +1,163 @@
+/*
+ * Accelerated poly_hash implementation with ARMv8 PMULL instructions.
+ *
+ * Based on ghash-ce-core.S.
+ *
+ * Copyright (C) 2014 Linaro Ltd.
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	KEY	.req	v0
+	KEY2	.req	v1
+	T1	.req	v2
+	T2	.req	v3
+	GSTAR	.req	v4
+	XL	.req	v5
+	XM	.req	v6
+	XH	.req	v7
+
+	.text
+	.arch	armv8-a+crypto
+
+	/* 16-byte aligned (2**4 = 16); not required, but might as well */
+	.align	4
+.Lgstar:
+	.quad	0x87, 0x87
+
+/*
+ * void pmull_poly_hash_update(le128 *digest, const le128 *key,
+ *			       const u8 *src, unsigned int blocks,
+ *			       unsigned int partial);
+ */
+ENTRY(pmull_poly_hash_update)
+
+	/* Load digest into XL */
+	ld1		{XL.16b}, [x0]
+
+	/* Load key into KEY */
+	ld1		{KEY.16b}, [x1]
+
+	/* Load g*(x) = g(x) + x^128 = x^7 + x^2 + x + 1 into both halves of
+	 * GSTAR */
+	adr		x1, .Lgstar
+	ld1		{GSTAR.2d}, [x1]
+
+	/* Set KEY2 to (KEY[1]+KEY[0]):(KEY[1]+KEY[0]).  This is needed for
+	 * Karatsuba multiplication. */
+	ext		KEY2.16b, KEY.16b, KEY.16b, #8
+	eor		KEY2.16b, KEY2.16b, KEY.16b
+
+	/* If 'partial' is nonzero, then we're finishing a pending block and
+	 * should go right to the multiplication. */
+	cbnz		w4, 1f
+
+0:
+	/* Add the next block from 'src' to the digest */
+	ld1		{T1.16b}, [x2], #16
+	eor		XL.16b, XL.16b, T1.16b
+	sub		w3, w3, #1
+
+1:
+	/*
+	 * Multiply the current 128-bit digest (a1:a0, in XL) by the 128-bit
+	 * key (b1:b0, in KEY) using Karatsuba multiplication.
+	 */
+
+	/* T1 = (a1+a0):(a1+a0) */
+	ext		T1.16b, XL.16b, XL.16b, #8
+	eor		T1.16b, T1.16b, XL.16b
+
+	/* XH = a1 * b1 */
+	pmull2		XH.1q, XL.2d, KEY.2d
+
+	/* XL = a0 * b0 */
+	pmull		XL.1q, XL.1d, KEY.1d
+
+	/* XM = (a1+a0) * (b1+b0) */
+	pmull		XM.1q, T1.1d, KEY2.1d
+
+	/* XM += (XH[0]:XL[1]) + XL + XH */
+	ext		T1.16b, XL.16b, XH.16b, #8
+	eor		T2.16b, XL.16b, XH.16b
+	eor		XM.16b, XM.16b, T1.16b
+	eor		XM.16b, XM.16b, T2.16b
+
+	/*
+	 * Now the 256-bit product is in XH[1]:XM:XL[0].  It represents a
+	 * polynomial over GF(2) with degree as large as 255.  We need to
+	 * compute its remainder modulo g(x) = x^128+x^7+x^2+x+1.  For this it
+	 * is sufficient to compute the remainder of the high half 'c(x)x^128'
+	 * and add it to the low half.  To reduce the high half we use the
+	 * Barrett reduction method.  The basic idea is that we can express the
+	 * remainder p(x) as g(x)q(x) mod x^128, where q(x) = (c(x)x^128)/g(x).
+	 * As detailed in [1], to avoid having to divide by g(x) at runtime the
+	 * following equivalent expression can be derived:
+	 *
+	 *	p(x) = [ g*(x)((c(x)q+(x))/x^128) ] mod x^128
+	 *
+	 * where g*(x) = x^128+g(x) = x^7+x^2+x+1, and q+(x) = x^256/g(x) = g(x)
+	 * in this case.  This is also equivalent to:
+	 *
+	 *	p(x) = [ g*(x)((c(x)(x^128 + g*(x)))/x^128) ] mod x^128
+	 *	     = [ g*(x)(c(x) + (c(x)g*(x))/x^128) ] mod x^128
+	 *
+	 * Since deg g*(x) < 64:
+	 *
+	 *	p(x) = [ g*(x)(c(x) + ((c(x)/x^64)g*(x))/x^64) ] mod x^128
+	 *	     = [ g*(x)((c(x)/x^64)x^64 + (c(x) mod x^64) +
+	 *		       ((c(x)/x^64)g*(x))/x^64) ] mod x^128
+	 *
+	 * Letting t(x) = g*(x)(c(x)/x^64):
+	 *
+	 *	p(x) = [ t(x)x^64 + g*(x)((c(x) mod x^64) + t(x)/x^64) ] mod x^128
+	 *
+	 * Therefore, to do the reduction we only need to issue two 64-bit =>
+	 * 128-bit carryless multiplications: g*(x) times c(x)/x^64, and g*(x)
+	 * times ((c(x) mod x^64) + t(x)/x^64).  (Multiplication by x^64
+	 * doesn't count since it is simply a shift or move.)
+	 *
+	 * An alternate reduction method, also based on Barrett reduction and
+	 * described in [1], uses only shifts and XORs --- no multiplications.
+	 * However, the method with multiplications requires fewer instructions
+	 * and is faster on processors with fast carryless multiplication.
+	 *
+	 * [1] "Intel Carry-Less Multiplication Instruction and its Usage for
+	 *     Computing the GCM Mode",
+	 *     https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
+	 */
+
+	/* 256-bit product is XH[1]:XM:XL[0], so c(x) is XH[1]:XM[1] */
+
+	/* T1 = t(x) = g*(x)(c(x)/x^64) */
+	pmull2		T1.1q, GSTAR.2d, XH.2d
+
+	/* T2 = g*(x)((c(x) mod x^64) + t(x)/x^64) */
+	eor		T2.16b, XM.16b, T1.16b
+	pmull2		T2.1q, GSTAR.2d, T2.2d
+
+	/* Make XL[0] be the low half of the 128-bit result by adding the low
+	 * 64 bits of the T2 term to what was already there.  The 't(x)x^64'
+	 * term makes no difference, so skip it. */
+	eor		XL.16b, XL.16b, T2.16b
+
+	/* Make XL[1] be the high half of the 128-bit result by adding the high
+	 * 64 bits of the 't(x)x^64' and T2 terms to what was already in XM[0],
+	 * then moving XM[0] to XL[1]. */
+	eor		XM.16b, XM.16b, T1.16b
+	ext		T2.16b, T2.16b, T2.16b, #8
+	eor		XM.16b, XM.16b, T2.16b
+	mov		XL.d[1], XM.d[0]
+
+	/* If more blocks remain, then loop back to process the next block;
+	 * else, store the digest and return. */
+	cbnz		w3, 0b
+	st1		{XL.16b}, [x0]
+	ret
+ENDPROC(pmull_poly_hash_update)
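
As a cross-check of the reduction worked out in the comment above, the same two
multiplications by g*(x) = 0x87 can be written in C.  This continues the
earlier sketch (struct ble128); clmul128, clmul64() and barrett_reduce() are
hypothetical names used only here, and the bit-loop multiply stands in for a
single PMULL instruction:

/* Hypothetical 64x64 -> 128-bit carryless multiply (bit-at-a-time here;
 * the assembly does this with one PMULL). */
struct clmul128 { uint64_t lo, hi; };

static struct clmul128 clmul64(uint64_t a, uint64_t b)
{
	struct clmul128 r = { 0, 0 };
	int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			r.lo ^= a << i;
			if (i)
				r.hi ^= a >> (64 - i);
		}
	}
	return r;
}

/* Reduce a 256-bit carryless product (hi:lo, 128 bits each, ble order)
 * modulo g(x) = x^128 + x^7 + x^2 + x + 1, mirroring the two PMULLs by
 * g*(x) = 0x87 in the assembly above. */
static struct ble128 barrett_reduce(struct ble128 lo, struct ble128 hi)
{
	const uint64_t gstar = 0x87;
	struct clmul128 t, u;
	struct ble128 r;

	t = clmul64(gstar, hi.hi);	  /* t(x) = g*(x)(c(x)/x^64) */
	u = clmul64(gstar, hi.lo ^ t.hi); /* g*(x)((c(x) mod x^64) + t(x)/x^64) */

	r.lo = lo.lo ^ u.lo;		  /* low 64 bits of the remainder */
	r.hi = lo.hi ^ t.lo ^ u.hi;	  /* high 64 bits: t(x)x^64 term + u */
	return r;
}
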
diff --git a/arch/arm64/crypto/poly-hash-ce-glue.c b/arch/arm64/crypto/poly-hash-ce-glue.c
new file mode 100644
index 000000000000..e195740c9ecf
--- /dev/null
+++ b/arch/arm64/crypto/poly-hash-ce-glue.c
@@ -0,0 +1,166 @@
+/*
+ * Accelerated poly_hash implementation with ARMv8 PMULL instructions.
+ *
+ * Based on ghash-ce-glue.c.
+ *
+ * poly_hash is part of the HEH (Hash-Encrypt-Hash) encryption mode, proposed
+ * in Internet Draft https://tools.ietf.org/html/draft-cope-heh-01.
+ *
+ * poly_hash is very similar to GHASH: both algorithms are keyed hashes which
+ * interpret their input data as coefficients of a polynomial over GF(2^128),
+ * then calculate a hash value by evaluating that polynomial at the point given
+ * by the key, e.g. using Horner's rule.  The difference is that poly_hash uses
+ * the more natural "ble" convention to represent GF(2^128) elements, whereas
+ * GHASH uses the less natural "lle" convention (see include/crypto/gf128mul.h).
+ * The ble convention makes it simpler to implement GF(2^128) multiplication.
+ *
+ * Copyright (C) 2014 Linaro Ltd.
+ * Copyright (C) 2017 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/b128ops.h>
+#include <crypto/internal/hash.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+/*
+ * Note: in this algorithm we currently use 'le128' to represent GF(2^128)
+ * elements, even though poly_hash-generic uses 'be128'.  Both types are
+ * actually "wrong" because the elements are actually in 'ble' format, and
+ * there should be a ble type to represent this --- as well as lle, bbe, and
+ * lbe types for the other conventions for representing GF(2^128) elements.
+ * But practically it doesn't matter which type we choose here, so we just use
+ * le128 since it's arguably more accurate, while poly_hash-generic still has
+ * to use be128 because the generic GF(2^128) multiplication functions all
+ * take be128.
+ */
+
+struct poly_hash_desc_ctx {
+	le128 digest;
+	unsigned int count;
+};
+
+asmlinkage void pmull_poly_hash_update(le128 *digest, const le128 *key,
+				       const u8 *src, unsigned int blocks,
+				       unsigned int partial);
+
+static int poly_hash_setkey(struct crypto_shash *tfm,
+			    const u8 *key, unsigned int keylen)
+{
+	if (keylen != sizeof(le128)) {
+		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	memcpy(crypto_shash_ctx(tfm), key, sizeof(le128));
+	return 0;
+}
+
+static int poly_hash_init(struct shash_desc *desc)
+{
+	struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	ctx->digest = (le128) { 0 };
+	ctx->count = 0;
+	return 0;
+}
+
+static int poly_hash_update(struct shash_desc *desc, const u8 *src,
+			    unsigned int len)
+{
+	struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc);
+	unsigned int partial = ctx->count % sizeof(le128);
+	u8 *dst = (u8 *)&ctx->digest + partial;
+
+	ctx->count += len;
+
+	/* Finishing at least one block? */
+	if (partial + len >= sizeof(le128)) {
+		const le128 *key = crypto_shash_ctx(desc->tfm);
+
+		if (partial) {
+			/* Finish the pending block. */
+			unsigned int n = sizeof(le128) - partial;
+
+			len -= n;
+			do {
+				*dst++ ^= *src++;
+			} while (--n);
+		}
+
+		/*
+		 * Do the real work.  If 'partial' is nonzero, this starts by
+		 * multiplying 'digest' by 'key'.  Then for each additional full
+		 * block it adds the block to 'digest' and multiplies by 'key'.
+		 */
+		kernel_neon_begin_partial(8);
+		pmull_poly_hash_update(&ctx->digest, key, src,
+				       len / sizeof(le128), partial);
+		kernel_neon_end();
+
+		src += len - (len % sizeof(le128));
+		len %= sizeof(le128);
+		dst = (u8 *)&ctx->digest;
+	}
+
+	/* Continue adding the next block to 'digest'. */
+	while (len--)
+		*dst++ ^= *src++;
+	return 0;
+}
+
+static int poly_hash_final(struct shash_desc *desc, u8 *out)
+{
+	struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc);
+	unsigned int partial = ctx->count % sizeof(le128);
+
+	/* Finish the last block if needed. */
+	if (partial) {
+		const le128 *key = crypto_shash_ctx(desc->tfm);
+
+		kernel_neon_begin_partial(8);
+		pmull_poly_hash_update(&ctx->digest, key, NULL, 0, partial);
+		kernel_neon_end();
+	}
+
+	memcpy(out, &ctx->digest, sizeof(le128));
+	return 0;
+}
+
+static struct shash_alg poly_hash_alg = {
+	.digestsize	= sizeof(le128),
+	.init		= poly_hash_init,
+	.update		= poly_hash_update,
+	.final		= poly_hash_final,
+	.setkey		= poly_hash_setkey,
+	.descsize	= sizeof(struct poly_hash_desc_ctx),
+	.base		= {
+		.cra_name		= "poly_hash",
+		.cra_driver_name	= "poly_hash-ce",
+		.cra_priority		= 300,
+		.cra_ctxsize		= sizeof(le128),
+		.cra_module		= THIS_MODULE,
+	},
+};
+
+static int __init poly_hash_ce_mod_init(void)
+{
+	return crypto_register_shash(&poly_hash_alg);
+}
+
+static void __exit poly_hash_ce_mod_exit(void)
+{
+	crypto_unregister_shash(&poly_hash_alg);
+}
+
+MODULE_DESCRIPTION("Polynomial evaluation hash using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Eric Biggers");
+MODULE_LICENSE("GPL v2");
+
+module_cpu_feature_match(PMULL, poly_hash_ce_mod_init);
+module_exit(poly_hash_ce_mod_exit);
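
Once the module is loaded, the hash is reachable through the ordinary kernel
crypto API under the name "poly_hash" (HEH itself instantiates it via the
template, so direct callers are unusual).  A rough in-kernel usage sketch,
with a made-up helper name and minimal error handling, might look like:

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

/* Hypothetical helper: hash 'len' bytes of 'data' with a 16-byte key. */
static int poly_hash_digest_buf(const u8 key[16], const u8 *data,
				unsigned int len, u8 out[16])
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	tfm = crypto_alloc_shash("poly_hash", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, 16);
	if (err)
		goto out_free_tfm;

	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		err = -ENOMEM;
		goto out_free_tfm;
	}
	desc->tfm = tfm;

	err = crypto_shash_init(desc);
	if (!err)
		err = crypto_shash_update(desc, data, len);
	if (!err)
		err = crypto_shash_final(desc, out);

	kfree(desc);
out_free_tfm:
	crypto_free_shash(tfm);
	return err;
}
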
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 627227f1162d..3240d394426c 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -295,6 +295,7 @@ config CRYPTO_HEH
 	select CRYPTO_ECB
 	select CRYPTO_GF128MUL
 	select CRYPTO_MANAGER
+	select CRYPTO_POLY_HASH_ARM64_CE if ARM64 && KERNEL_MODE_NEON
 	help
 	  HEH: Hash-Encrypt-Hash mode
 	  HEH is a proposed block cipher mode of operation which extends the
-- 
2.34.1
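
The driver can also be exercised from userspace through AF_ALG, assuming the
kernel additionally has CONFIG_CRYPTO_USER_API_HASH enabled.  A minimal,
illustrative test program (not part of the patch):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

#ifndef SOL_ALG
#define SOL_ALG 279
#endif

int main(void)
{
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type   = "hash",
		.salg_name   = "poly_hash",
	};
	unsigned char key[16] = { 0x01 };	/* example key */
	unsigned char msg[32] = { 0x02 };	/* two 16-byte blocks */
	unsigned char digest[16];
	int tfmfd, opfd, i;

	tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
	bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa));
	setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key));
	opfd = accept(tfmfd, NULL, 0);

	write(opfd, msg, sizeof(msg));
	read(opfd, digest, sizeof(digest));

	for (i = 0; i < 16; i++)
		printf("%02x", digest[i]);
	printf("\n");

	close(opfd);
	close(tfmfd);
	return 0;
}

With the module loaded, /proc/crypto should list the poly_hash-ce driver, and
its cra_priority of 300 makes it the preferred implementation over
poly_hash-generic when both are available.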