android / kernel / common.git / eba1300649c15a5d2eb96a6b3341e45996f32e5d / . / arch / arm64 / crypto / poly-hash-ce-core.S

/* | |

* Accelerated poly_hash implementation with ARMv8 PMULL instructions. | |

* | |

* Based on ghash-ce-core.S. | |

* | |

* Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> | |

* Copyright (C) 2017 Google, Inc. <ebiggers@google.com> | |

* | |

* This program is free software; you can redistribute it and/or modify it | |

* under the terms of the GNU General Public License version 2 as published | |

* by the Free Software Foundation. | |

*/ | |

#include <linux/linkage.h> | |

#include <asm/assembler.h> | |

KEY .req v0 | |

KEY2 .req v1 | |

T1 .req v2 | |

T2 .req v3 | |

GSTAR .req v4 | |

XL .req v5 | |

XM .req v6 | |

XH .req v7 | |

.text | |

.arch armv8-a+crypto | |

/* 16-byte aligned (2**4 = 16); not required, but might as well */ | |

.align 4 | |

.Lgstar: | |

.quad 0x87, 0x87 | |

/* | |

* void pmull_poly_hash_update(le128 *digest, const le128 *key, | |

* const u8 *src, unsigned int blocks, | |

* unsigned int partial); | |

*/ | |

ENTRY(pmull_poly_hash_update) | |

/* Load digest into XL */ | |

ld1 {XL.16b}, [x0] | |

/* Load key into KEY */ | |

ld1 {KEY.16b}, [x1] | |

/* Load g*(x) = g(x) + x^128 = x^7 + x^2 + x + 1 into both halves of | |

* GSTAR */ | |

adr x1, .Lgstar | |

ld1 {GSTAR.2d}, [x1] | |

/* Set KEY2 to (KEY[1]+KEY[0]):(KEY[1]+KEY[0]). This is needed for | |

* Karatsuba multiplication. */ | |

ext KEY2.16b, KEY.16b, KEY.16b, #8 | |

eor KEY2.16b, KEY2.16b, KEY.16b | |

/* If 'partial' is nonzero, then we're finishing a pending block and | |

* should go right to the multiplication. */ | |

cbnz w4, 1f | |

0: | |

/* Add the next block from 'src' to the digest */ | |

ld1 {T1.16b}, [x2], #16 | |

eor XL.16b, XL.16b, T1.16b | |

sub w3, w3, #1 | |

1: | |

/* | |

* Multiply the current 128-bit digest (a1:a0, in XL) by the 128-bit key | |

* (b1:b0, in KEY) using Karatsuba multiplication. | |

*/ | |

/* T1 = (a1+a0):(a1+a0) */ | |

ext T1.16b, XL.16b, XL.16b, #8 | |

eor T1.16b, T1.16b, XL.16b | |

/* XH = a1 * b1 */ | |

pmull2 XH.1q, XL.2d, KEY.2d | |

/* XL = a0 * b0 */ | |

pmull XL.1q, XL.1d, KEY.1d | |

/* XM = (a1+a0) * (b1+b0) */ | |

pmull XM.1q, T1.1d, KEY2.1d | |

/* XM += (XH[0]:XL[1]) + XL + XH */ | |

ext T1.16b, XL.16b, XH.16b, #8 | |

eor T2.16b, XL.16b, XH.16b | |

eor XM.16b, XM.16b, T1.16b | |

eor XM.16b, XM.16b, T2.16b | |

/* | |

* Now the 256-bit product is in XH[1]:XM:XL[0]. It represents a | |

* polynomial over GF(2) with degree as large as 255. We need to | |

* compute its remainder modulo g(x) = x^128+x^7+x^2+x+1. For this it | |

* is sufficient to compute the remainder of the high half 'c(x)x^128' | |

* add it to the low half. To reduce the high half we use the Barrett | |

* reduction method. The basic idea is that we can express the | |

* remainder p(x) as g(x)q(x) mod x^128, where q(x) = (c(x)x^128)/g(x). | |

* As detailed in [1], to avoid having to divide by g(x) at runtime the | |

* following equivalent expression can be derived: | |

* | |

* p(x) = [ g*(x)((c(x)q+(x))/x^128) ] mod x^128 | |

* | |

* where g*(x) = x^128+g(x) = x^7+x^2+x+1, and q+(x) = x^256/g(x) = g(x) | |

* in this case. This is also equivalent to: | |

* | |

* p(x) = [ g*(x)((c(x)(x^128 + g*(x)))/x^128) ] mod x^128 | |

* = [ g*(x)(c(x) + (c(x)g*(x))/x^128) ] mod x^128 | |

* | |

* Since deg g*(x) < 64: | |

* | |

* p(x) = [ g*(x)(c(x) + ((c(x)/x^64)g*(x))/x^64) ] mod x^128 | |

* = [ g*(x)((c(x)/x^64)x^64 + (c(x) mod x^64) + | |

* ((c(x)/x^64)g*(x))/x^64) ] mod x^128 | |

* | |

* Letting t(x) = g*(x)(c(x)/x^64): | |

* | |

* p(x) = [ t(x)x^64 + g*(x)((c(x) mod x^64) + t(x)/x^64) ] mod x^128 | |

* | |

* Therefore, to do the reduction we only need to issue two 64-bit => | |

* 128-bit carryless multiplications: g*(x) times c(x)/x^64, and g*(x) | |

* times ((c(x) mod x^64) + t(x)/x^64). (Multiplication by x^64 doesn't | |

* count since it is simply a shift or move.) | |

* | |

* An alternate reduction method, also based on Barrett reduction and | |

* described in [1], uses only shifts and XORs --- no multiplications. | |

* However, the method with multiplications requires fewer instructions | |

* and is faster on processors with fast carryless multiplication. | |

* | |

* [1] "Intel Carry-Less Multiplication Instruction and its Usage for | |

* Computing the GCM Mode", | |

* https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf | |

*/ | |

/* 256-bit product is XH[1]:XM:XL[0], so c(x) is XH[1]:XM[1] */ | |

/* T1 = t(x) = g*(x)(c(x)/x^64) */ | |

pmull2 T1.1q, GSTAR.2d, XH.2d | |

/* T2 = g*(x)((c(x) mod x^64) + t(x)/x^64) */ | |

eor T2.16b, XM.16b, T1.16b | |

pmull2 T2.1q, GSTAR.2d, T2.2d | |

/* Make XL[0] be the low half of the 128-bit result by adding the low 64 | |

* bits of the T2 term to what was already there. The 't(x)x^64' term | |

* makes no difference, so skip it. */ | |

eor XL.16b, XL.16b, T2.16b | |

/* Make XL[1] be the high half of the 128-bit result by adding the high | |

* 64 bits of the 't(x)x^64' and T2 terms to what was already in XM[0], | |

* then moving XM[0] to XL[1]. */ | |

eor XM.16b, XM.16b, T1.16b | |

ext T2.16b, T2.16b, T2.16b, #8 | |

eor XM.16b, XM.16b, T2.16b | |

mov XL.d[1], XM.d[0] | |

/* If more blocks remain, then loop back to process the next block; | |

* else, store the digest and return. */ | |

cbnz w3, 0b | |

st1 {XL.16b}, [x0] | |

ret | |

ENDPROC(pmull_poly_hash_update) |