diff options
Diffstat (limited to 'arch/arm64/crypto/aes-ce-ccm-core.S')
-rw-r--r-- | arch/arm64/crypto/aes-ce-ccm-core.S | 265 |
1 files changed, 93 insertions, 172 deletions
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S index b03f7f71f893..f2624238fd95 100644 --- a/arch/arm64/crypto/aes-ce-ccm-core.S +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -1,8 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions + * aes-ce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions * - * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd. + * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel <ardb@kernel.org> */ #include <linux/linkage.h> @@ -11,211 +14,129 @@ .text .arch armv8-a+crypto - /* - * u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, - * u32 macp, u8 const rk[], u32 rounds); - */ -SYM_FUNC_START(ce_aes_ccm_auth_data) - ld1 {v0.16b}, [x0] /* load mac */ - cbz w3, 1f - sub w3, w3, #16 - eor v1.16b, v1.16b, v1.16b -0: ldrb w7, [x1], #1 /* get 1 byte of input */ - subs w2, w2, #1 - add w3, w3, #1 - ins v1.b[0], w7 - ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ - beq 8f /* out of input? */ - cbnz w3, 0b - eor v0.16b, v0.16b, v1.16b -1: ld1 {v3.4s}, [x4] /* load first round key */ - prfm pldl1strm, [x1] - cmp w5, #12 /* which key size? */ - add x6, x4, #16 - sub w7, w5, #2 /* modified # of rounds */ - bmi 2f - bne 5f - mov v5.16b, v3.16b - b 4f -2: mov v4.16b, v3.16b - ld1 {v5.4s}, [x6], #16 /* load 2nd round key */ -3: aese v0.16b, v4.16b - aesmc v0.16b, v0.16b -4: ld1 {v3.4s}, [x6], #16 /* load next round key */ - aese v0.16b, v5.16b - aesmc v0.16b, v0.16b -5: ld1 {v4.4s}, [x6], #16 /* load next round key */ - subs w7, w7, #3 - aese v0.16b, v3.16b - aesmc v0.16b, v0.16b - ld1 {v5.4s}, [x6], #16 /* load next round key */ - bpl 3b - aese v0.16b, v4.16b - subs w2, w2, #16 /* last data? */ - eor v0.16b, v0.16b, v5.16b /* final round */ - bmi 6f - ld1 {v1.16b}, [x1], #16 /* load next input block */ - eor v0.16b, v0.16b, v1.16b /* xor with mac */ - bne 1b -6: st1 {v0.16b}, [x0] /* store mac */ - beq 10f - adds w2, w2, #16 - beq 10f - mov w3, w2 -7: ldrb w7, [x1], #1 - umov w6, v0.b[0] - eor w6, w6, w7 - strb w6, [x0], #1 - subs w2, w2, #1 - beq 10f - ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ - b 7b -8: cbz w3, 91f - mov w7, w3 - add w3, w3, #16 -9: ext v1.16b, v1.16b, v1.16b, #1 - adds w7, w7, #1 - bne 9b -91: eor v0.16b, v0.16b, v1.16b - st1 {v0.16b}, [x0] -10: mov w0, w3 - ret -SYM_FUNC_END(ce_aes_ccm_auth_data) + .macro load_round_keys, rk, nr, tmp + sub w\tmp, \nr, #10 + add \tmp, \rk, w\tmp, sxtw #4 + ld1 {v10.4s-v13.4s}, [\rk] + ld1 {v14.4s-v17.4s}, [\tmp], #64 + ld1 {v18.4s-v21.4s}, [\tmp], #64 + ld1 {v3.4s-v5.4s}, [\tmp] + .endm - /* - * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], - * u32 rounds); - */ -SYM_FUNC_START(ce_aes_ccm_final) - ld1 {v3.4s}, [x2], #16 /* load first round key */ - ld1 {v0.16b}, [x0] /* load mac */ - cmp w3, #12 /* which key size? */ - sub w3, w3, #2 /* modified # of rounds */ - ld1 {v1.16b}, [x1] /* load 1st ctriv */ - bmi 0f - bne 3f - mov v5.16b, v3.16b - b 2f -0: mov v4.16b, v3.16b -1: ld1 {v5.4s}, [x2], #16 /* load next round key */ - aese v0.16b, v4.16b - aesmc v0.16b, v0.16b - aese v1.16b, v4.16b - aesmc v1.16b, v1.16b -2: ld1 {v3.4s}, [x2], #16 /* load next round key */ - aese v0.16b, v5.16b - aesmc v0.16b, v0.16b - aese v1.16b, v5.16b - aesmc v1.16b, v1.16b -3: ld1 {v4.4s}, [x2], #16 /* load next round key */ - subs w3, w3, #3 - aese v0.16b, v3.16b - aesmc v0.16b, v0.16b - aese v1.16b, v3.16b - aesmc v1.16b, v1.16b - bpl 1b - aese v0.16b, v4.16b - aese v1.16b, v4.16b - /* final round key cancels out */ - eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ - st1 {v0.16b}, [x0] /* store result */ - ret -SYM_FUNC_END(ce_aes_ccm_final) + .macro dround, va, vb, vk + aese \va\().16b, \vk\().16b + aesmc \va\().16b, \va\().16b + aese \vb\().16b, \vk\().16b + aesmc \vb\().16b, \vb\().16b + .endm + + .macro aes_encrypt, va, vb, nr + tbz \nr, #2, .L\@ + dround \va, \vb, v10 + dround \va, \vb, v11 + tbz \nr, #1, .L\@ + dround \va, \vb, v12 + dround \va, \vb, v13 +.L\@: .irp v, v14, v15, v16, v17, v18, v19, v20, v21, v3 + dround \va, \vb, \v + .endr + aese \va\().16b, v4.16b + aese \vb\().16b, v4.16b + .endm .macro aes_ccm_do_crypt,enc - cbz x2, 5f - ldr x8, [x6, #8] /* load lower ctr */ + load_round_keys x3, w4, x10 + ld1 {v0.16b}, [x5] /* load mac */ + cbz x2, ce_aes_ccm_final + ldr x8, [x6, #8] /* load lower ctr */ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ 0: /* outer loop */ ld1 {v1.8b}, [x6] /* load upper ctr */ prfm pldl1strm, [x1] add x8, x8, #1 rev x9, x8 - cmp w4, #12 /* which key size? */ - sub w7, w4, #2 /* get modified # of rounds */ ins v1.d[1], x9 /* no carry in lower ctr */ - ld1 {v3.4s}, [x3] /* load first round key */ - add x10, x3, #16 - bmi 1f - bne 4f - mov v5.16b, v3.16b - b 3f -1: mov v4.16b, v3.16b - ld1 {v5.4s}, [x10], #16 /* load 2nd round key */ -2: /* inner loop: 3 rounds, 2x interleaved */ - aese v0.16b, v4.16b - aesmc v0.16b, v0.16b - aese v1.16b, v4.16b - aesmc v1.16b, v1.16b -3: ld1 {v3.4s}, [x10], #16 /* load next round key */ - aese v0.16b, v5.16b - aesmc v0.16b, v0.16b - aese v1.16b, v5.16b - aesmc v1.16b, v1.16b -4: ld1 {v4.4s}, [x10], #16 /* load next round key */ - subs w7, w7, #3 - aese v0.16b, v3.16b - aesmc v0.16b, v0.16b - aese v1.16b, v3.16b - aesmc v1.16b, v1.16b - ld1 {v5.4s}, [x10], #16 /* load next round key */ - bpl 2b - aese v0.16b, v4.16b - aese v1.16b, v4.16b + + aes_encrypt v0, v1, w4 + subs w2, w2, #16 - bmi 6f /* partial block? */ + bmi ce_aes_ccm_crypt_tail ld1 {v2.16b}, [x1], #16 /* load next input block */ .if \enc == 1 eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ - eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ + eor v6.16b, v1.16b, v2.16b /* xor with crypted ctr */ .else eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */ - eor v1.16b, v2.16b, v5.16b /* final round enc */ + eor v6.16b, v2.16b, v5.16b /* final round enc */ .endif eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ - st1 {v1.16b}, [x0], #16 /* write output block */ + st1 {v6.16b}, [x0], #16 /* write output block */ bne 0b CPU_LE( rev x8, x8 ) - st1 {v0.16b}, [x5] /* store mac */ str x8, [x6, #8] /* store lsb end of ctr (BE) */ -5: ret - -6: eor v0.16b, v0.16b, v5.16b /* final round mac */ - eor v1.16b, v1.16b, v5.16b /* final round enc */ + cbnz x7, ce_aes_ccm_final st1 {v0.16b}, [x5] /* store mac */ - add w2, w2, #16 /* process partial tail block */ -7: ldrb w9, [x1], #1 /* get 1 byte of input */ - umov w6, v1.b[0] /* get top crypted ctr byte */ - umov w7, v0.b[0] /* get top mac byte */ - .if \enc == 1 - eor w7, w7, w9 - eor w9, w9, w6 - .else - eor w9, w9, w6 - eor w7, w7, w9 - .endif - strb w9, [x0], #1 /* store out byte */ - strb w7, [x5], #1 /* store mac byte */ - subs w2, w2, #1 - beq 5b - ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ - ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ - b 7b + ret .endm +SYM_FUNC_START_LOCAL(ce_aes_ccm_crypt_tail) + eor v0.16b, v0.16b, v5.16b /* final round mac */ + eor v1.16b, v1.16b, v5.16b /* final round enc */ + + add x1, x1, w2, sxtw /* rewind the input pointer (w2 < 0) */ + add x0, x0, w2, sxtw /* rewind the output pointer */ + + adr_l x8, .Lpermute /* load permute vectors */ + add x9, x8, w2, sxtw + sub x8, x8, w2, sxtw + ld1 {v7.16b-v8.16b}, [x9] + ld1 {v9.16b}, [x8] + + ld1 {v2.16b}, [x1] /* load a full block of input */ + tbl v1.16b, {v1.16b}, v7.16b /* move keystream to end of register */ + eor v7.16b, v2.16b, v1.16b /* encrypt partial input block */ + bif v2.16b, v7.16b, v22.16b /* select plaintext */ + tbx v7.16b, {v6.16b}, v8.16b /* insert output from previous iteration */ + tbl v2.16b, {v2.16b}, v9.16b /* copy plaintext to start of v2 */ + eor v0.16b, v0.16b, v2.16b /* fold plaintext into mac */ + + st1 {v7.16b}, [x0] /* store output block */ + cbz x7, 0f + +SYM_INNER_LABEL(ce_aes_ccm_final, SYM_L_LOCAL) + ld1 {v1.16b}, [x7] /* load 1st ctriv */ + + aes_encrypt v0, v1, w4 + + /* final round key cancels out */ + eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ +0: st1 {v0.16b}, [x5] /* store result */ + ret +SYM_FUNC_END(ce_aes_ccm_crypt_tail) + /* * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, * u8 const rk[], u32 rounds, u8 mac[], - * u8 ctr[]); + * u8 ctr[], u8 const final_iv[]); * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, * u8 const rk[], u32 rounds, u8 mac[], - * u8 ctr[]); + * u8 ctr[], u8 const final_iv[]); */ SYM_FUNC_START(ce_aes_ccm_encrypt) + movi v22.16b, #255 aes_ccm_do_crypt 1 SYM_FUNC_END(ce_aes_ccm_encrypt) SYM_FUNC_START(ce_aes_ccm_decrypt) + movi v22.16b, #0 aes_ccm_do_crypt 0 SYM_FUNC_END(ce_aes_ccm_decrypt) + + .section ".rodata", "a" + .align 6 + .fill 15, 1, 0xff +.Lpermute: + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .fill 15, 1, 0xff |