/*
 * Accelerated poly_hash implementation with ARMv8 PMULL instructions.
 *
 * Based on ghash-ce-core.S.
 *
 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
 * Copyright (C) 2017 Google, Inc. <ebiggers@google.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        KEY     .req    v0
        KEY2    .req    v1
        T1      .req    v2
        T2      .req    v3
        GSTAR   .req    v4
        XL      .req    v5
        XM      .req    v6
        XH      .req    v7

        .text
        .arch   armv8-a+crypto

        /* 16-byte aligned (2**4 = 16); not required, but might as well */
        .align  4
.Lgstar:
        .quad   0x87, 0x87

/*
 * void pmull_poly_hash_update(le128 *digest, const le128 *key,
 *                             const u8 *src, unsigned int blocks,
 *                             unsigned int partial);
 */
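/*
 * For reference, the per-block update performed below is
 * digest = (digest + block) * key in GF(2^128), with reduction polynomial
 * g(x) = x^128 + x^7 + x^2 + x + 1, plus one multiplication up front to
 * finish a pending block when 'partial' != 0.  A rough bit-serial C sketch
 * follows; the struct and function names are illustrative only (they are not
 * the kernel's le128/gf128mul API), and the loop is just a model of the
 * PMULL-based math, not the code path used here:
 *
 *      #include <stdint.h>
 *
 *      struct gf128 { uint64_t lo, hi; };  // lo = x^0..x^63, hi = x^64..x^127
 *
 *      static struct gf128 gf128_mul(struct gf128 a, struct gf128 b)
 *      {
 *              struct gf128 p = { 0, 0 };
 *              int i;
 *
 *              for (i = 0; i < 128; i++) {
 *                      // If bit i of b is set, add (XOR) a * x^i into p.
 *                      uint64_t bit = (i < 64 ? b.lo >> i : b.hi >> (i - 64)) & 1;
 *
 *                      p.lo ^= a.lo & -bit;
 *                      p.hi ^= a.hi & -bit;
 *
 *                      // a *= x (mod g(x)): the x^128 overflow folds back
 *                      // in as x^7 + x^2 + x + 1 = 0x87.
 *                      uint64_t carry = a.hi >> 63;
 *
 *                      a.hi = (a.hi << 1) | (a.lo >> 63);
 *                      a.lo = (a.lo << 1) ^ (carry ? 0x87 : 0);
 *              }
 *              return p;
 *      }
 *
 *      static void poly_hash_update(struct gf128 *digest,
 *                                   const struct gf128 *key,
 *                                   const struct gf128 *src,
 *                                   unsigned int blocks, unsigned int partial)
 *      {
 *              if (partial)    // pending block assumed already added to digest
 *                      *digest = gf128_mul(*digest, *key);
 *              while (blocks--) {
 *                      digest->lo ^= src->lo;
 *                      digest->hi ^= src->hi;
 *                      *digest = gf128_mul(*digest, *key);
 *                      src++;
 *              }
 *      }
 */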
ENTRY(pmull_poly_hash_update)

        /* Load digest into XL */
        ld1     {XL.16b}, [x0]

        /* Load key into KEY */
        ld1     {KEY.16b}, [x1]

        /* Load g*(x) = g(x) + x^128 = x^7 + x^2 + x + 1 into both halves of
         * GSTAR */
        adr     x1, .Lgstar
        ld1     {GSTAR.2d}, [x1]

        /* Set KEY2 to (KEY[1]+KEY[0]):(KEY[1]+KEY[0]). This is needed for
         * Karatsuba multiplication. */
        ext     KEY2.16b, KEY.16b, KEY.16b, #8
        eor     KEY2.16b, KEY2.16b, KEY.16b

        /* If 'partial' is nonzero, then we're finishing a pending block and
         * should go right to the multiplication. */
        cbnz    w4, 1f

0:
        /* Add the next block from 'src' to the digest */
        ld1     {T1.16b}, [x2], #16
        eor     XL.16b, XL.16b, T1.16b
        sub     w3, w3, #1

1:
        /*
         * Multiply the current 128-bit digest (a1:a0, in XL) by the 128-bit
         * key (b1:b0, in KEY) using Karatsuba multiplication.
         */
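        /*
         * For reference, the standard Karatsuba identity used here (with '+'
         * denoting XOR, i.e. addition in GF(2)) is:
         *
         *      a*b = (a1*b1)x^128 +
         *            [(a1+a0)*(b1+b0) + a1*b1 + a0*b0]x^64 +
         *            (a0*b0)
         *
         * so only three 64x64-bit carryless multiplications are needed.
         */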

        /* T1 = (a1+a0):(a1+a0) */
        ext     T1.16b, XL.16b, XL.16b, #8
        eor     T1.16b, T1.16b, XL.16b

        /* XH = a1 * b1 */
        pmull2  XH.1q, XL.2d, KEY.2d

        /* XL = a0 * b0 */
        pmull   XL.1q, XL.1d, KEY.1d

        /* XM = (a1+a0) * (b1+b0) */
        pmull   XM.1q, T1.1d, KEY2.1d

        /* XM += (XH[0]:XL[1]) + XL + XH */
        ext     T1.16b, XL.16b, XH.16b, #8
        eor     T2.16b, XL.16b, XH.16b
        eor     XM.16b, XM.16b, T1.16b
        eor     XM.16b, XM.16b, T2.16b
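        /*
         * The Karatsuba middle term ((a1+a0)*(b1+b0) + a1*b1 + a0*b0) belongs
         * at bit offset 64 of the product, where it overlaps the high half of
         * XL and the low half of XH.  Adding XL and XH to the raw XM completes
         * the middle term, and adding (XH[0]:XL[1]) accounts for that overlap,
         * so XM now holds bits 64..191 of the full product.
         */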

        /*
         * Now the 256-bit product is in XH[1]:XM:XL[0]. It represents a
         * polynomial over GF(2) with degree as large as 255. We need to
         * compute its remainder modulo g(x) = x^128+x^7+x^2+x+1. For this it
         * is sufficient to compute the remainder of the high half 'c(x)x^128'
         * and add it to the low half. To reduce the high half we use the
         * Barrett reduction method. The basic idea is that we can express the
         * remainder p(x) as g(x)q(x) mod x^128, where q(x) = (c(x)x^128)/g(x).
         * As detailed in [1], to avoid having to divide by g(x) at runtime the
         * following equivalent expression can be derived:
         *
         *      p(x) = [ g*(x)((c(x)q+(x))/x^128) ] mod x^128
         *
         * where g*(x) = x^128+g(x) = x^7+x^2+x+1, and q+(x) = x^256/g(x) = g(x)
         * in this case. This is also equivalent to:
         *
         *      p(x) = [ g*(x)((c(x)(x^128 + g*(x)))/x^128) ] mod x^128
         *           = [ g*(x)(c(x) + (c(x)g*(x))/x^128) ] mod x^128
         *
         * Since deg g*(x) < 64:
         *
         *      p(x) = [ g*(x)(c(x) + ((c(x)/x^64)g*(x))/x^64) ] mod x^128
         *           = [ g*(x)((c(x)/x^64)x^64 + (c(x) mod x^64) +
         *                     ((c(x)/x^64)g*(x))/x^64) ] mod x^128
         *
         * Letting t(x) = g*(x)(c(x)/x^64):
         *
         *      p(x) = [ t(x)x^64 + g*(x)((c(x) mod x^64) + t(x)/x^64) ] mod x^128
         *
         * Therefore, to do the reduction we only need to issue two 64-bit =>
         * 128-bit carryless multiplications: g*(x) times c(x)/x^64, and g*(x)
         * times ((c(x) mod x^64) + t(x)/x^64). (Multiplication by x^64 doesn't
         * count since it is simply a shift or move.)
         *
         * An alternate reduction method, also based on Barrett reduction and
         * described in [1], uses only shifts and XORs --- no multiplications.
         * However, the method with multiplications requires fewer instructions
         * and is faster on processors with fast carryless multiplication.
         *
         * [1] "Intel Carry-Less Multiplication Instruction and its Usage for
         *     Computing the GCM Mode",
         *     https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
         */
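
        /*
         * As an illustration only (the helper names, word layout, and
         * bit-serial multiply below are assumptions for this sketch, not
         * kernel API), the reduction just described can be modelled in C.
         * The 256-bit product is taken as 64-bit words p[3]:p[2]:p[1]:p[0]
         * (p[3] most significant, so c(x) = p[3]:p[2]) and the reduced
         * 128-bit result is returned as r[1]:r[0]:
         *
         *      #include <stdint.h>
         *
         *      struct u128 { uint64_t lo, hi; };
         *
         *      // 64x64 -> 128-bit carryless multiply (software model of PMULL)
         *      static struct u128 clmul64(uint64_t a, uint64_t b)
         *      {
         *              struct u128 r = { 0, 0 };
         *              int i;
         *
         *              for (i = 0; i < 64; i++) {
         *                      if ((b >> i) & 1) {
         *                              r.lo ^= a << i;
         *                              r.hi ^= i ? a >> (64 - i) : 0;
         *                      }
         *              }
         *              return r;
         *      }
         *
         *      static void reduce(const uint64_t p[4], uint64_t r[2])
         *      {
         *              const uint64_t gstar = 0x87;    // g*(x) = x^7+x^2+x+1
         *
         *              // t(x) = g*(x)(c(x)/x^64)
         *              struct u128 t = clmul64(gstar, p[3]);
         *              // g*(x)((c(x) mod x^64) + t(x)/x^64)
         *              struct u128 d = clmul64(gstar, p[2] ^ t.hi);
         *
         *              // add p(x) = [t(x)x^64 + d(x)] mod x^128 to the low half
         *              r[0] = p[0] ^ d.lo;
         *              r[1] = p[1] ^ t.lo ^ d.hi;
         *      }
         */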

        /* 256-bit product is XH[1]:XM:XL[0], so c(x) is XH[1]:XM[1] */

        /* T1 = t(x) = g*(x)(c(x)/x^64) */
        pmull2  T1.1q, GSTAR.2d, XH.2d

        /* T2 = g*(x)((c(x) mod x^64) + t(x)/x^64) */
        eor     T2.16b, XM.16b, T1.16b
        pmull2  T2.1q, GSTAR.2d, T2.2d

        /* Make XL[0] be the low half of the 128-bit result by adding the low 64
         * bits of the T2 term to what was already there. The 't(x)x^64' term
         * makes no difference, so skip it. */
        eor     XL.16b, XL.16b, T2.16b

        /* Make XL[1] be the high half of the 128-bit result by adding the high
         * 64 bits of the 't(x)x^64' and T2 terms to what was already in XM[0],
         * then moving XM[0] to XL[1]. */
        eor     XM.16b, XM.16b, T1.16b
        ext     T2.16b, T2.16b, T2.16b, #8
        eor     XM.16b, XM.16b, T2.16b
        mov     XL.d[1], XM.d[0]

        /* If more blocks remain, then loop back to process the next block;
         * else, store the digest and return. */
        cbnz    w3, 0b
        st1     {XL.16b}, [x0]
        ret
ENDPROC(pmull_poly_hash_update)