author     Linus Torvalds <torvalds@linux-foundation.org>   2024-11-19 10:28:41 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2024-11-19 10:28:41 -0800
commit     02b2f1a7b8ef340e57cae640a52ec7199b0b887d (patch)
tree       5f988798262afdeda17dc8f0cd6882d30621de5d /arch
parent     1af29b34ea7f63c3e7225c324ffa86c9748874e4 (diff)
parent     4223414efeae3a8efb4da1e9c9c52a1a44c1c5bf (diff)
Merge tag 'v6.13-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
 "API:
   - Add sig driver API
   - Remove signing/verification from akcipher API
   - Move crypto_simd_disabled_for_test to lib/crypto
   - Add WARN_ON for return values from driver that indicates memory corruption

  Algorithms:
   - Provide crc32-arch and crc32c-arch through Crypto API
   - Optimise crc32c code size on x86
   - Optimise crct10dif on arm/arm64
   - Optimise p10-aes-gcm on powerpc
   - Optimise aegis128 on x86
   - Output full sample from test interface in jitter RNG
   - Retry without padata when it fails in pcrypt

  Drivers:
   - Add support for Airoha EN7581 TRNG
   - Add support for STM32MP25x platforms in stm32
   - Enable iproc-r200 RNG driver on BCMBCA
   - Add Broadcom BCM74110 RNG driver"

* tag 'v6.13-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (112 commits)
  crypto: marvell/cesa - fix uninit value for struct mv_cesa_op_ctx
  crypto: cavium - Fix an error handling path in cpt_ucode_load_fw()
  crypto: aesni - Move back to module_init
  crypto: lib/mpi - Export mpi_set_bit
  crypto: aes-gcm-p10 - Use the correct bit to test for P10
  hwrng: amd - remove reference to removed PPC_MAPLE config
  crypto: arm/crct10dif - Implement plain NEON variant
  crypto: arm/crct10dif - Macroify PMULL asm code
  crypto: arm/crct10dif - Use existing mov_l macro instead of __adrl
  crypto: arm64/crct10dif - Remove remaining 64x64 PMULL fallback code
  crypto: arm64/crct10dif - Use faster 16x64 bit polynomial multiply
  crypto: arm64/crct10dif - Remove obsolete chunking logic
  crypto: bcm - add error check in the ahash_hmac_init function
  crypto: caam - add error check to caam_rsa_set_priv_key_form
  hwrng: bcm74110 - Add Broadcom BCM74110 RNG driver
  dt-bindings: rng: add binding for BCM74110 RNG
  padata: Clean up in padata_do_multithreaded()
  crypto: inside-secure - Fix the return value of safexcel_xcbcmac_cra_init()
  crypto: qat - Fix missing destroy_workqueue in adf_init_aer()
  crypto: rsassa-pkcs1 - Reinstate support for legacy protocols
  ...
Diffstat (limited to 'arch')
-rw-r--r--  arch/arm/crypto/crct10dif-ce-core.S         249
-rw-r--r--  arch/arm/crypto/crct10dif-ce-glue.c          55
-rw-r--r--  arch/arm64/crypto/crct10dif-ce-core.S       335
-rw-r--r--  arch/arm64/crypto/crct10dif-ce-glue.c        48
-rw-r--r--  arch/powerpc/crypto/Kconfig                   2
-rw-r--r--  arch/powerpc/crypto/aes-gcm-p10-glue.c      141
-rw-r--r--  arch/powerpc/crypto/aes-gcm-p10.S          2421
-rw-r--r--  arch/x86/crypto/Kconfig                       4
-rw-r--r--  arch/x86/crypto/aegis128-aesni-asm.S        532
-rw-r--r--  arch/x86/crypto/aegis128-aesni-glue.c       145
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c            2
-rw-r--r--  arch/x86/crypto/cast5-avx-x86_64-asm_64.S    76
-rw-r--r--  arch/x86/crypto/crc32c-intel_glue.c           2
-rw-r--r--  arch/x86/crypto/crc32c-pcl-intel-asm_64.S   354
14 files changed, 1951 insertions, 2415 deletions
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
index 46c02c518a30..2bbf2df9c1e2 100644
--- a/arch/arm/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -112,55 +112,120 @@
FOLD_CONST_L .req q10l
FOLD_CONST_H .req q10h
+ /*
+ * Pairwise long polynomial multiplication of two 16-bit values
+ *
+ * { w0, w1 }, { y0, y1 }
+ *
+ * by two 64-bit values
+ *
+ * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+ *
+ * where each vector element is a byte, ordered from least to most
+ * significant. The resulting 80-bit vectors are XOR'ed together.
+ *
+ * This can be implemented using 8x8 long polynomial multiplication, by
+ * reorganizing the input so that each pairwise 8x8 multiplication
+ * produces one of the terms from the decomposition below, and
+ * combining the results of each rank and shifting them into place.
+ *
+ * Rank
+ * 0 w0*x0 ^ | y0*z0 ^
+ * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
+ * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
+ * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
+ * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
+ * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
+ * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
+ * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
+ * 8 w1*x7 << 64 | y1*z7 << 64
+ *
+ * The inputs can be reorganized into
+ *
+ * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+ * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+ *
+ * and after performing 8x8->16 bit long polynomial multiplication of
+ * each of the halves of the first vector with those of the second one,
+ * we obtain the following four vectors of 16-bit elements:
+ *
+ * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+ * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+ * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+ * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+ *
+ * Results b and c can be XORed together, as the vector elements have
+ * matching ranks. Then, the final XOR can be pulled forward, and
+ * applied between the halves of each of the remaining three vectors,
+ * which are then shifted into place, and XORed together to produce the
+ * final 80-bit result.
+ */
+ .macro pmull16x64_p8, v16, v64
+ vext.8 q11, \v64, \v64, #1
+ vld1.64 {q12}, [r4, :128]
+ vuzp.8 q11, \v64
+ vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
+ vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
+ bl __pmull16x64_p8
+ veor \v64, q12, q14
+ .endm
+
+__pmull16x64_p8:
+ vmull.p8 q13, d23, d24
+ vmull.p8 q14, d23, d25
+ vmull.p8 q15, d22, d24
+ vmull.p8 q12, d22, d25
+
+ veor q14, q14, q15
+ veor d24, d24, d25
+ veor d26, d26, d27
+ veor d28, d28, d29
+ vmov.i32 d25, #0
+ vmov.i32 d29, #0
+ vext.8 q12, q12, q12, #14
+ vext.8 q14, q14, q14, #15
+ veor d24, d24, d26
+ bx lr
+ENDPROC(__pmull16x64_p8)
+
+ .macro pmull16x64_p64, v16, v64
+ vmull.p64 q11, \v64\()l, \v16\()_L
+ vmull.p64 \v64, \v64\()h, \v16\()_H
+ veor \v64, \v64, q11
+ .endm
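
The rank decomposition described in the comment above can be sanity-checked with a
small host-side reference model. The following is a stand-alone C sketch, not part of
the patch, and the names in it are illustrative only: it computes the 80-bit carryless
product of a 16-bit and a 64-bit value both directly and from 8x8 partial products
combined rank by rank, and asserts that the two agree.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

struct poly80 { uint64_t lo, hi; };	/* 80-bit carryless product */

/* acc ^= v << bit, for bit offsets up to 64 */
static void xor_at(struct poly80 *acc, uint64_t v, unsigned int bit)
{
	if (bit < 64) {
		acc->lo ^= v << bit;
		if (bit)
			acc->hi ^= v >> (64 - bit);
	} else {
		acc->hi ^= v << (bit - 64);
	}
}

/* direct carryless (GF(2)[x]) multiply: 16-bit by 64-bit */
static struct poly80 clmul_16x64(uint16_t w, uint64_t x)
{
	struct poly80 r = { 0, 0 };
	unsigned int i;

	for (i = 0; i < 16; i++)
		if (w & (1u << i))
			xor_at(&r, x, i);
	return r;
}

/* one 8x8 carryless multiply, i.e. a single lane of vmull.p8 */
static uint16_t clmul_8x8(uint8_t a, uint8_t b)
{
	uint16_t r = 0;
	unsigned int i;

	for (i = 0; i < 8; i++)
		if (a & (1u << i))
			r ^= (uint16_t)b << i;
	return r;
}

/* same product, assembled from 8x8 partial products combined per rank */
static struct poly80 clmul_16x64_ranks(uint16_t w, uint64_t x)
{
	uint8_t w0 = w & 0xff, w1 = w >> 8;
	struct poly80 r = { 0, 0 };
	unsigned int i;

	for (i = 0; i < 8; i++) {
		uint8_t xi = x >> (8 * i);

		xor_at(&r, clmul_8x8(w0, xi), 8 * i);		/* rank i     */
		xor_at(&r, clmul_8x8(w1, xi), 8 * i + 8);	/* rank i + 1 */
	}
	return r;
}

int main(void)
{
	unsigned int i;

	for (i = 0; i < 100000; i++) {
		uint16_t w = rand();
		uint64_t x = ((uint64_t)rand() << 32) ^ (uint32_t)rand();
		struct poly80 a = clmul_16x64(w, x);
		struct poly80 b = clmul_16x64_ranks(w, x);

		assert(a.lo == b.lo && a.hi == b.hi);
	}
	return 0;
}

Here clmul_8x8() plays the role of one lane of vmull.p8; the instruction performs
eight such multiplies at once, which is why the inputs are rearranged into the
{ w0, ..., y0, ... } and { x0, x2, ..., z6 } layouts shown in the comment.
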
+
// Fold reg1, reg2 into the next 32 data bytes, storing the result back
// into reg1, reg2.
- .macro fold_32_bytes, reg1, reg2
- vld1.64 {q11-q12}, [buf]!
+ .macro fold_32_bytes, reg1, reg2, p
+ vld1.64 {q8-q9}, [buf]!
- vmull.p64 q8, \reg1\()h, FOLD_CONST_H
- vmull.p64 \reg1, \reg1\()l, FOLD_CONST_L
- vmull.p64 q9, \reg2\()h, FOLD_CONST_H
- vmull.p64 \reg2, \reg2\()l, FOLD_CONST_L
+ pmull16x64_\p FOLD_CONST, \reg1
+ pmull16x64_\p FOLD_CONST, \reg2
-CPU_LE( vrev64.8 q11, q11 )
-CPU_LE( vrev64.8 q12, q12 )
- vswp q11l, q11h
- vswp q12l, q12h
+CPU_LE( vrev64.8 q8, q8 )
+CPU_LE( vrev64.8 q9, q9 )
+ vswp q8l, q8h
+ vswp q9l, q9h
veor.8 \reg1, \reg1, q8
veor.8 \reg2, \reg2, q9
- veor.8 \reg1, \reg1, q11
- veor.8 \reg2, \reg2, q12
.endm
// Fold src_reg into dst_reg, optionally loading the next fold constants
- .macro fold_16_bytes, src_reg, dst_reg, load_next_consts
- vmull.p64 q8, \src_reg\()l, FOLD_CONST_L
- vmull.p64 \src_reg, \src_reg\()h, FOLD_CONST_H
+ .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts
+ pmull16x64_\p FOLD_CONST, \src_reg
.ifnb \load_next_consts
vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
.endif
- veor.8 \dst_reg, \dst_reg, q8
veor.8 \dst_reg, \dst_reg, \src_reg
.endm
- .macro __adrl, out, sym
- movw \out, #:lower16:\sym
- movt \out, #:upper16:\sym
- .endm
-
-//
-// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-ENTRY(crc_t10dif_pmull)
-
+ .macro crct10dif, p
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
cmp len, #256
- blt .Lless_than_256_bytes
+ blt .Lless_than_256_bytes\@
- __adrl fold_consts_ptr, .Lfold_across_128_bytes_consts
+ mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts
// Load the first 128 data bytes. Byte swapping is necessary to make
// the bit order match the polynomial coefficient order.
@@ -199,27 +264,27 @@ CPU_LE( vrev64.8 q7, q7 )
// While >= 128 data bytes remain (not counting q0-q7), fold the 128
// bytes q0-q7 into them, storing the result back into q0-q7.
-.Lfold_128_bytes_loop:
- fold_32_bytes q0, q1
- fold_32_bytes q2, q3
- fold_32_bytes q4, q5
- fold_32_bytes q6, q7
+.Lfold_128_bytes_loop\@:
+ fold_32_bytes q0, q1, \p
+ fold_32_bytes q2, q3, \p
+ fold_32_bytes q4, q5, \p
+ fold_32_bytes q6, q7, \p
subs len, len, #128
- bge .Lfold_128_bytes_loop
+ bge .Lfold_128_bytes_loop\@
// Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
// Fold across 64 bytes.
vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
- fold_16_bytes q0, q4
- fold_16_bytes q1, q5
- fold_16_bytes q2, q6
- fold_16_bytes q3, q7, 1
+ fold_16_bytes q0, q4, \p
+ fold_16_bytes q1, q5, \p
+ fold_16_bytes q2, q6, \p
+ fold_16_bytes q3, q7, \p, 1
// Fold across 32 bytes.
- fold_16_bytes q4, q6
- fold_16_bytes q5, q7, 1
+ fold_16_bytes q4, q6, \p
+ fold_16_bytes q5, q7, \p, 1
// Fold across 16 bytes.
- fold_16_bytes q6, q7
+ fold_16_bytes q6, q7, \p
// Add 128 to get the correct number of data bytes remaining in 0...127
// (not counting q7), following the previous extra subtraction by 128.
@@ -229,25 +294,23 @@ CPU_LE( vrev64.8 q7, q7 )
// While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
// into them, storing the result back into q7.
- blt .Lfold_16_bytes_loop_done
-.Lfold_16_bytes_loop:
- vmull.p64 q8, q7l, FOLD_CONST_L
- vmull.p64 q7, q7h, FOLD_CONST_H
- veor.8 q7, q7, q8
+ blt .Lfold_16_bytes_loop_done\@
+.Lfold_16_bytes_loop\@:
+ pmull16x64_\p FOLD_CONST, q7
vld1.64 {q0}, [buf]!
CPU_LE( vrev64.8 q0, q0 )
vswp q0l, q0h
veor.8 q7, q7, q0
subs len, len, #16
- bge .Lfold_16_bytes_loop
+ bge .Lfold_16_bytes_loop\@
-.Lfold_16_bytes_loop_done:
+.Lfold_16_bytes_loop_done\@:
// Add 16 to get the correct number of data bytes remaining in 0...15
// (not counting q7), following the previous extra subtraction by 16.
adds len, len, #16
- beq .Lreduce_final_16_bytes
+ beq .Lreduce_final_16_bytes\@
-.Lhandle_partial_segment:
+.Lhandle_partial_segment\@:
// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
// 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
// do this without needing a fold constant for each possible 'len',
@@ -262,9 +325,9 @@ CPU_LE( vrev64.8 q0, q0 )
vswp q0l, q0h
// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
- __adrl r3, .Lbyteshift_table + 16
- sub r3, r3, len
- vld1.8 {q2}, [r3]
+ mov_l r1, .Lbyteshift_table + 16
+ sub r1, r1, len
+ vld1.8 {q2}, [r1]
vtbl.8 q1l, {q7l-q7h}, q2l
vtbl.8 q1h, {q7l-q7h}, q2h
@@ -282,12 +345,46 @@ CPU_LE( vrev64.8 q0, q0 )
vbsl.8 q2, q1, q0
// Fold the first chunk into the second chunk, storing the result in q7.
- vmull.p64 q0, q3l, FOLD_CONST_L
- vmull.p64 q7, q3h, FOLD_CONST_H
- veor.8 q7, q7, q0
- veor.8 q7, q7, q2
+ pmull16x64_\p FOLD_CONST, q3
+ veor.8 q7, q3, q2
+ b .Lreduce_final_16_bytes\@
+
+.Lless_than_256_bytes\@:
+ // Checksumming a buffer of length 16...255 bytes
+
+ mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+ // Load the first 16 data bytes.
+ vld1.64 {q7}, [buf]!
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q7l, q7h
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ vmov.i8 q0h, #0
+ vmov.u16 q0h[3], init_crc
+ veor.8 q7h, q7h, q0h
+
+ // Load the fold-across-16-bytes constants.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+ cmp len, #16
+ beq .Lreduce_final_16_bytes\@ // len == 16
+ subs len, len, #32
+ addlt len, len, #16
+ blt .Lhandle_partial_segment\@ // 17 <= len <= 31
+ b .Lfold_16_bytes_loop\@ // 32 <= len <= 255
+
+.Lreduce_final_16_bytes\@:
+ .endm
+
+//
+// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+ENTRY(crc_t10dif_pmull64)
+ crct10dif p64
-.Lreduce_final_16_bytes:
// Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
@@ -320,32 +417,19 @@ CPU_LE( vrev64.8 q0, q0 )
vmov.u16 r0, q0l[0]
bx lr
+ENDPROC(crc_t10dif_pmull64)
-.Lless_than_256_bytes:
- // Checksumming a buffer of length 16...255 bytes
+ENTRY(crc_t10dif_pmull8)
+ push {r4, lr}
+ mov_l r4, .L16x64perm
- __adrl fold_consts_ptr, .Lfold_across_16_bytes_consts
+ crct10dif p8
- // Load the first 16 data bytes.
- vld1.64 {q7}, [buf]!
CPU_LE( vrev64.8 q7, q7 )
vswp q7l, q7h
-
- // XOR the first 16 data *bits* with the initial CRC value.
- vmov.i8 q0h, #0
- vmov.u16 q0h[3], init_crc
- veor.8 q7h, q7h, q0h
-
- // Load the fold-across-16-bytes constants.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
- cmp len, #16
- beq .Lreduce_final_16_bytes // len == 16
- subs len, len, #32
- addlt len, len, #16
- blt .Lhandle_partial_segment // 17 <= len <= 31
- b .Lfold_16_bytes_loop // 32 <= len <= 255
-ENDPROC(crc_t10dif_pmull)
+ vst1.64 {q7}, [r3, :128]
+ pop {r4, pc}
+ENDPROC(crc_t10dif_pmull8)
.section ".rodata", "a"
.align 4
@@ -379,3 +463,6 @@ ENDPROC(crc_t10dif_pmull)
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
+
+.L16x64perm:
+ .quad 0x808080800000000, 0x909090901010101
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
index 79f3b204d8c0..a8b74523729e 100644
--- a/arch/arm/crypto/crct10dif-ce-glue.c
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -19,7 +19,9 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
+ u8 out[16]);
static int crct10dif_init(struct shash_desc *desc)
{
@@ -29,14 +31,14 @@ static int crct10dif_init(struct shash_desc *desc)
return 0;
}
-static int crct10dif_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
+static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
{
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
kernel_neon_begin();
- *crc = crc_t10dif_pmull(*crc, data, length);
+ *crc = crc_t10dif_pmull64(*crc, data, length);
kernel_neon_end();
} else {
*crc = crc_t10dif_generic(*crc, data, length);
@@ -45,6 +47,27 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
return 0;
}
+static int crct10dif_update_neon(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ u16 *crcp = shash_desc_ctx(desc);
+ u8 buf[16] __aligned(16);
+ u16 crc = *crcp;
+
+ if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc_t10dif_pmull8(crc, data, length, buf);
+ kernel_neon_end();
+
+ crc = 0;
+ data = buf;
+ length = sizeof(buf);
+ }
+
+ *crcp = crc_t10dif_generic(crc, data, length);
+ return 0;
+}
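
The NEON-only path above leaves the heavy lifting to crc_t10dif_pmull8(), which
absorbs the incoming CRC and folds the whole input down to the 16 bytes returned in
buf[], and then finishes with crc_t10dif_generic() over just those 16 bytes. For
reference, the CRC being computed is plain CRC16/T10-DIF; the following bit-at-a-time
model is a sketch that matches the result of the kernel's table-driven
crc_t10dif_generic() rather than a copy of it.

#include <stddef.h>
#include <stdint.h>

/* CRC16/T10-DIF: generator polynomial 0x8BB7, MSB-first, no reflection,
 * no final XOR.  'crc' is the running value; pass 0 to start fresh. */
static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
{
	size_t i;
	int bit;

	for (i = 0; i < len; i++) {
		crc ^= (uint16_t)buf[i] << 8;
		for (bit = 0; bit < 8; bit++)
			crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x8BB7)
					     : (uint16_t)(crc << 1);
	}
	return crc;
}

Because the SIMD helper folds the data down to 16 bytes with an equal CRC (the seed
having already been XORed in by the assembly), finishing with crc == 0 over buf[]
gives the same value as checksumming the original buffer, which is the equivalence
the update function above relies on.
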
+
static int crct10dif_final(struct shash_desc *desc, u8 *out)
{
u16 *crc = shash_desc_ctx(desc);
@@ -53,10 +76,22 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
return 0;
}
-static struct shash_alg crc_t10dif_alg = {
+static struct shash_alg algs[] = {{
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = crct10dif_init,
+ .update = crct10dif_update_neon,
+ .final = crct10dif_final,
+ .descsize = CRC_T10DIF_DIGEST_SIZE,
+
+ .base.cra_name = "crct10dif",
+ .base.cra_driver_name = "crct10dif-arm-neon",
+ .base.cra_priority = 150,
+ .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+}, {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
- .update = crct10dif_update,
+ .update = crct10dif_update_ce,
.final = crct10dif_final,
.descsize = CRC_T10DIF_DIGEST_SIZE,
@@ -65,19 +100,19 @@ static struct shash_alg crc_t10dif_alg = {
.base.cra_priority = 200,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
-};
+}};
static int __init crc_t10dif_mod_init(void)
{
- if (!(elf_hwcap2 & HWCAP2_PMULL))
+ if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
- return crypto_register_shash(&crc_t10dif_alg);
+ return crypto_register_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
static void __exit crc_t10dif_mod_exit(void)
{
- crypto_unregister_shash(&crc_t10dif_alg);
+ crypto_unregister_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
module_init(crc_t10dif_mod_init);
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index 5604de61d06d..87dd6d46224d 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -1,8 +1,11 @@
//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
-// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
-// Copyright (C) 2019 Google LLC <ebiggers@google.com>
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <ardb@google.com>
+// Eric Biggers <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
@@ -71,161 +74,117 @@
init_crc .req w0
buf .req x1
len .req x2
- fold_consts_ptr .req x3
+ fold_consts_ptr .req x5
fold_consts .req v10
- ad .req v14
-
- k00_16 .req v15
- k32_48 .req v16
-
t3 .req v17
t4 .req v18
t5 .req v19
t6 .req v20
t7 .req v21
t8 .req v22
- t9 .req v23
- perm1 .req v24
- perm2 .req v25
- perm3 .req v26
- perm4 .req v27
-
- bd1 .req v28
- bd2 .req v29
- bd3 .req v30
- bd4 .req v31
-
- .macro __pmull_init_p64
- .endm
+ perm .req v27
- .macro __pmull_pre_p64, bd
+ .macro pmull16x64_p64, a16, b64, c64
+ pmull2 \c64\().1q, \a16\().2d, \b64\().2d
+ pmull \b64\().1q, \a16\().1d, \b64\().1d
.endm
- .macro __pmull_init_p8
- // k00_16 := 0x0000000000000000_000000000000ffff
- // k32_48 := 0x00000000ffffffff_0000ffffffffffff
- movi k32_48.2d, #0xffffffff
- mov k32_48.h[2], k32_48.h[0]
- ushr k00_16.2d, k32_48.2d, #32
-
- // prepare the permutation vectors
- mov_q x5, 0x080f0e0d0c0b0a09
- movi perm4.8b, #8
- dup perm1.2d, x5
- eor perm1.16b, perm1.16b, perm4.16b
- ushr perm2.2d, perm1.2d, #8
- ushr perm3.2d, perm1.2d, #16
- ushr perm4.2d, perm1.2d, #24
- sli perm2.2d, perm1.2d, #56
- sli perm3.2d, perm1.2d, #48
- sli perm4.2d, perm1.2d, #40
+ /*
+ * Pairwise long polynomial multiplication of two 16-bit values
+ *
+ * { w0, w1 }, { y0, y1 }
+ *
+ * by two 64-bit values
+ *
+ * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+ *
+ * where each vector element is a byte, ordered from least to most
+ * significant.
+ *
+ * This can be implemented using 8x8 long polynomial multiplication, by
+ * reorganizing the input so that each pairwise 8x8 multiplication
+ * produces one of the terms from the decomposition below, and
+ * combining the results of each rank and shifting them into place.
+ *
+ * Rank
+ * 0 w0*x0 ^ | y0*z0 ^
+ * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
+ * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
+ * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
+ * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
+ * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
+ * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
+ * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
+ * 8 w1*x7 << 64 | y1*z7 << 64
+ *
+ * The inputs can be reorganized into
+ *
+ * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+ * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+ *
+ * and after performing 8x8->16 bit long polynomial multiplication of
+ * each of the halves of the first vector with those of the second one,
+ * we obtain the following four vectors of 16-bit elements:
+ *
+ * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+ * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+ * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+ * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+ *
+ * Results b and c can be XORed together, as the vector elements have
+ * matching ranks. Then, the final XOR (*) can be pulled forward, and
+ * applied between the halves of each of the remaining three vectors,
+ * which are then shifted into place, and combined to produce two
+ * 80-bit results.
+ *
+ * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+ * to the 64x64 bit one above, but XOR'ing the outputs together will
+ * produce the expected result, and this is sufficient in the context of
+ * this algorithm.
+ */
+ .macro pmull16x64_p8, a16, b64, c64
+ ext t7.16b, \b64\().16b, \b64\().16b, #1
+ tbl t5.16b, {\a16\().16b}, perm.16b
+ uzp1 t7.16b, \b64\().16b, t7.16b
+ bl __pmull_p8_16x64
+ ext \b64\().16b, t4.16b, t4.16b, #15
+ eor \c64\().16b, t8.16b, t5.16b
.endm
- .macro __pmull_pre_p8, bd
- tbl bd1.16b, {\bd\().16b}, perm1.16b
- tbl bd2.16b, {\bd\().16b}, perm2.16b
- tbl bd3.16b, {\bd\().16b}, perm3.16b
- tbl bd4.16b, {\bd\().16b}, perm4.16b
- .endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_core)
-.L__pmull_p8_core:
- ext t4.8b, ad.8b, ad.8b, #1 // A1
- ext t5.8b, ad.8b, ad.8b, #2 // A2
- ext t6.8b, ad.8b, ad.8b, #3 // A3
-
- pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B
- pmull t8.8h, ad.8b, bd1.8b // E = A*B1
- pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B
- pmull t7.8h, ad.8b, bd2.8b // G = A*B2
- pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B
- pmull t9.8h, ad.8b, bd3.8b // I = A*B3
- pmull t3.8h, ad.8b, bd4.8b // K = A*B4
- b 0f
-
-.L__pmull_p8_core2:
- tbl t4.16b, {ad.16b}, perm1.16b // A1
- tbl t5.16b, {ad.16b}, perm2.16b // A2
- tbl t6.16b, {ad.16b}, perm3.16b // A3
-
- pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B
- pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
- pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B
- pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
- pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B
- pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
- pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
-
-0: eor t4.16b, t4.16b, t8.16b // L = E + F
- eor t5.16b, t5.16b, t7.16b // M = G + H
- eor t6.16b, t6.16b, t9.16b // N = I + J
-
- uzp1 t8.2d, t4.2d, t5.2d
- uzp2 t4.2d, t4.2d, t5.2d
- uzp1 t7.2d, t6.2d, t3.2d
- uzp2 t6.2d, t6.2d, t3.2d
-
- // t4 = (L) (P0 + P1) << 8
- // t5 = (M) (P2 + P3) << 16
- eor t8.16b, t8.16b, t4.16b
- and t4.16b, t4.16b, k32_48.16b
-
- // t6 = (N) (P4 + P5) << 24
- // t7 = (K) (P6 + P7) << 32
- eor t7.16b, t7.16b, t6.16b
- and t6.16b, t6.16b, k00_16.16b
-
- eor t8.16b, t8.16b, t4.16b
- eor t7.16b, t7.16b, t6.16b
-
- zip2 t5.2d, t8.2d, t4.2d
- zip1 t4.2d, t8.2d, t4.2d
- zip2 t3.2d, t7.2d, t6.2d
- zip1 t6.2d, t7.2d, t6.2d
-
- ext t4.16b, t4.16b, t4.16b, #15
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+ ext t6.16b, t5.16b, t5.16b, #8
+
+ pmull t3.8h, t7.8b, t5.8b
+ pmull t4.8h, t7.8b, t6.8b
+ pmull2 t5.8h, t7.16b, t5.16b
+ pmull2 t6.8h, t7.16b, t6.16b
+
+ ext t8.16b, t3.16b, t3.16b, #8
+ eor t4.16b, t4.16b, t6.16b
+ ext t7.16b, t5.16b, t5.16b, #8
+ ext t6.16b, t4.16b, t4.16b, #8
+ eor t8.8b, t8.8b, t3.8b
+ eor t5.8b, t5.8b, t7.8b
+ eor t4.8b, t4.8b, t6.8b
ext t5.16b, t5.16b, t5.16b, #14
- ext t6.16b, t6.16b, t6.16b, #13
- ext t3.16b, t3.16b, t3.16b, #12
-
- eor t4.16b, t4.16b, t5.16b
- eor t6.16b, t6.16b, t3.16b
ret
-SYM_FUNC_END(__pmull_p8_core)
+SYM_FUNC_END(__pmull_p8_16x64)
- .macro __pmull_p8, rq, ad, bd, i
- .ifnc \bd, fold_consts
- .err
- .endif
- mov ad.16b, \ad\().16b
- .ifb \i
- pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B
- .else
- pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B
- .endif
-
- bl .L__pmull_p8_core\i
-
- eor \rq\().16b, \rq\().16b, t4.16b
- eor \rq\().16b, \rq\().16b, t6.16b
- .endm
// Fold reg1, reg2 into the next 32 data bytes, storing the result back
// into reg1, reg2.
.macro fold_32_bytes, p, reg1, reg2
ldp q11, q12, [buf], #0x20
- __pmull_\p v8, \reg1, fold_consts, 2
- __pmull_\p \reg1, \reg1, fold_consts
+ pmull16x64_\p fold_consts, \reg1, v8
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
- __pmull_\p v9, \reg2, fold_consts, 2
- __pmull_\p \reg2, \reg2, fold_consts
+ pmull16x64_\p fold_consts, \reg2, v9
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
@@ -238,26 +197,15 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
// Fold src_reg into dst_reg, optionally loading the next fold constants
.macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
- __pmull_\p v8, \src_reg, fold_consts
- __pmull_\p \src_reg, \src_reg, fold_consts, 2
+ pmull16x64_\p fold_consts, \src_reg, v8
.ifnb \load_next_consts
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
.endif
eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
.endm
- .macro __pmull_p64, rd, rn, rm, n
- .ifb \n
- pmull \rd\().1q, \rn\().1d, \rm\().1d
- .else
- pmull2 \rd\().1q, \rn\().2d, \rm\().2d
- .endif
- .endm
-
.macro crc_t10dif_pmull, p
- __pmull_init_\p
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
cmp len, #256
@@ -296,7 +244,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Load the constants for folding across 128 bytes.
ld1 {fold_consts.2d}, [fold_consts_ptr]
- __pmull_pre_\p fold_consts
// Subtract 128 for the 128 data bytes just consumed. Subtract another
// 128 to simplify the termination condition of the following loop.
@@ -318,7 +265,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Fold across 64 bytes.
add fold_consts_ptr, fold_consts_ptr, #16
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
fold_16_bytes \p, v0, v4
fold_16_bytes \p, v1, v5
fold_16_bytes \p, v2, v6
@@ -339,8 +285,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// into them, storing the result back into v7.
b.lt .Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
- __pmull_\p v8, v7, fold_consts
- __pmull_\p v7, v7, fold_consts, 2
+ pmull16x64_\p fold_consts, v7, v8
eor v7.16b, v7.16b, v8.16b
ldr q0, [buf], #16
CPU_LE( rev64 v0.16b, v0.16b )
@@ -387,51 +332,10 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
bsl v2.16b, v1.16b, v0.16b
// Fold the first chunk into the second chunk, storing the result in v7.
- __pmull_\p v0, v3, fold_consts
- __pmull_\p v7, v3, fold_consts, 2
- eor v7.16b, v7.16b, v0.16b
+ pmull16x64_\p fold_consts, v3, v0
+ eor v7.16b, v3.16b, v0.16b
eor v7.16b, v7.16b, v2.16b
-
-.Lreduce_final_16_bytes_\@:
- // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
- movi v2.16b, #0 // init zero register
-
- // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
-
- // Fold the high 64 bits into the low 64 bits, while also multiplying by
- // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
- // whose low 48 bits are 0.
- ext v0.16b, v2.16b, v7.16b, #8
- __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x))
- eor v0.16b, v0.16b, v7.16b // + low bits * x^64
-
- // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
- // value congruent to x^64 * M(x) and whose low 48 bits are 0.
- ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
- mov v0.s[3], v2.s[0] // zero high 32 bits
- __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x))
- eor v0.16b, v0.16b, v1.16b // + low bits
-
- // Load G(x) and floor(x^48 / G(x)).
- ld1 {fold_consts.2d}, [fold_consts_ptr]
- __pmull_pre_\p fold_consts
-
- // Use Barrett reduction to compute the final CRC value.
- __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x))
- ushr v1.2d, v1.2d, #32 // /= x^32
- __pmull_\p v1, v1, fold_consts // *= G(x)
- ushr v0.2d, v0.2d, #48
- eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
- // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
- umov w0, v0.h[0]
- .ifc \p, p8
- frame_pop
- .endif
- ret
+ b .Lreduce_final_16_bytes_\@
.Lless_than_256_bytes_\@:
// Checksumming a buffer of length 16...255 bytes
@@ -450,7 +354,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Load the fold-across-16-bytes constants.
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
cmp len, #16
b.eq .Lreduce_final_16_bytes_\@ // len == 16
@@ -458,6 +361,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
add len, len, #16
b .Lhandle_partial_segment_\@ // 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:
.endm
//
@@ -467,7 +372,22 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
frame_push 1
+
+ // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+ movi perm.4h, #8, lsl #8
+ orr perm.2s, #1, lsl #16
+ orr perm.2s, #1, lsl #24
+ zip1 perm.16b, perm.16b, perm.16b
+ zip1 perm.16b, perm.16b, perm.16b
+
crc_t10dif_pmull p8
+
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+ str q7, [x3]
+
+ frame_pop
+ ret
SYM_FUNC_END(crc_t10dif_pmull_p8)
.align 5
@@ -478,6 +398,41 @@ SYM_FUNC_END(crc_t10dif_pmull_p8)
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
crc_t10dif_pmull p64
+
+ // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+ movi v2.16b, #0 // init zero register
+
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ // whose low 48 bits are 0.
+ ext v0.16b, v2.16b, v7.16b, #8
+ pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x))
+ eor v0.16b, v0.16b, v7.16b // + low bits * x^64
+
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
+ mov v0.s[3], v2.s[0] // zero high 32 bits
+ pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x))
+ eor v0.16b, v0.16b, v1.16b // + low bits
+
+ // Load G(x) and floor(x^48 / G(x)).
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+
+ // Use Barrett reduction to compute the final CRC value.
+ pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x))
+ ushr v1.2d, v1.2d, #32 // /= x^32
+ pmull v1.1q, v1.1d, fold_consts.1d // *= G(x)
+ ushr v0.2d, v0.2d, #48
+ eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+ umov w0, v0.h[0]
+ ret
SYM_FUNC_END(crc_t10dif_pmull_p64)
.section ".rodata", "a"
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index 606d25c559ed..08bcbd884395 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -20,7 +20,8 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+ u8 out[16]);
asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
static int crct10dif_init(struct shash_desc *desc)
@@ -34,25 +35,21 @@ static int crct10dif_init(struct shash_desc *desc)
static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
- u16 *crc = shash_desc_ctx(desc);
-
- if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
- do {
- unsigned int chunk = length;
-
- if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
- chunk = SZ_4K;
-
- kernel_neon_begin();
- *crc = crc_t10dif_pmull_p8(*crc, data, chunk);
- kernel_neon_end();
- data += chunk;
- length -= chunk;
- } while (length);
- } else {
- *crc = crc_t10dif_generic(*crc, data, length);
+ u16 *crcp = shash_desc_ctx(desc);
+ u16 crc = *crcp;
+ u8 buf[16];
+
+ if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc_t10dif_pmull_p8(crc, data, length, buf);
+ kernel_neon_end();
+
+ crc = 0;
+ data = buf;
+ length = sizeof(buf);
}
+ *crcp = crc_t10dif_generic(crc, data, length);
return 0;
}
@@ -62,18 +59,9 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
- do {
- unsigned int chunk = length;
-
- if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
- chunk = SZ_4K;
-
- kernel_neon_begin();
- *crc = crc_t10dif_pmull_p64(*crc, data, chunk);
- kernel_neon_end();
- data += chunk;
- length -= chunk;
- } while (length);
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull_p64(*crc, data, length);
+ kernel_neon_end();
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 46a4c85e85e2..951a43726461 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -107,12 +107,12 @@ config CRYPTO_AES_PPC_SPE
config CRYPTO_AES_GCM_P10
tristate "Stitched AES/GCM acceleration support on P10 or later CPU (PPC)"
- depends on BROKEN
depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
select CRYPTO_LIB_AES
select CRYPTO_ALGAPI
select CRYPTO_AEAD
select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
help
AEAD cipher: AES cipher algorithms (FIPS-197)
GCM (Galois/Counter Mode) authenticated encryption mode (NIST SP800-38D)
diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c
index f66ad56e765f..f37b3d13fc53 100644
--- a/arch/powerpc/crypto/aes-gcm-p10-glue.c
+++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c
@@ -8,6 +8,7 @@
#include <linux/unaligned.h>
#include <asm/simd.h>
#include <asm/switch_to.h>
+#include <crypto/gcm.h>
#include <crypto/aes.h>
#include <crypto/algapi.h>
#include <crypto/b128ops.h>
@@ -24,6 +25,7 @@
#define PPC_ALIGN 16
#define GCM_IV_SIZE 12
+#define RFC4106_NONCE_SIZE 4
MODULE_DESCRIPTION("PPC64le AES-GCM with Stitched implementation");
MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com");
@@ -31,7 +33,7 @@ MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("aes");
asmlinkage int aes_p10_set_encrypt_key(const u8 *userKey, const int bits,
- void *key);
+ void *key);
asmlinkage void aes_p10_encrypt(const u8 *in, u8 *out, const void *key);
asmlinkage void aes_p10_gcm_encrypt(u8 *in, u8 *out, size_t len,
void *rkey, u8 *iv, void *Xi);
@@ -39,7 +41,8 @@ asmlinkage void aes_p10_gcm_decrypt(u8 *in, u8 *out, size_t len,
void *rkey, u8 *iv, void *Xi);
asmlinkage void gcm_init_htable(unsigned char htable[], unsigned char Xi[]);
asmlinkage void gcm_ghash_p10(unsigned char *Xi, unsigned char *Htable,
- unsigned char *aad, unsigned int alen);
+ unsigned char *aad, unsigned int alen);
+asmlinkage void gcm_update(u8 *iv, void *Xi);
struct aes_key {
u8 key[AES_MAX_KEYLENGTH];
@@ -52,6 +55,7 @@ struct gcm_ctx {
u8 aad_hash[16];
u64 aadLen;
u64 Plen; /* offset 56 - used in aes_p10_gcm_{en/de}crypt */
+ u8 pblock[16];
};
struct Hash_ctx {
u8 H[16]; /* subkey */
@@ -60,17 +64,20 @@ struct Hash_ctx {
struct p10_aes_gcm_ctx {
struct aes_key enc_key;
+ u8 nonce[RFC4106_NONCE_SIZE];
};
static void vsx_begin(void)
{
preempt_disable();
+ pagefault_disable();
enable_kernel_vsx();
}
static void vsx_end(void)
{
disable_kernel_vsx();
+ pagefault_enable();
preempt_enable();
}
@@ -185,7 +192,7 @@ static int set_authsize(struct crypto_aead *tfm, unsigned int authsize)
}
static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
- unsigned int keylen)
+ unsigned int keylen)
{
struct crypto_tfm *tfm = crypto_aead_tfm(aead);
struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -198,7 +205,8 @@ static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
return ret ? -EINVAL : 0;
}
-static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
+static int p10_aes_gcm_crypt(struct aead_request *req, u8 *riv,
+ int assoclen, int enc)
{
struct crypto_tfm *tfm = req->base.tfm;
struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -210,7 +218,6 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
struct skcipher_walk walk;
u8 *assocmem = NULL;
u8 *assoc;
- unsigned int assoclen = req->assoclen;
unsigned int cryptlen = req->cryptlen;
unsigned char ivbuf[AES_BLOCK_SIZE+PPC_ALIGN];
unsigned char *iv = PTR_ALIGN((void *)ivbuf, PPC_ALIGN);
@@ -218,11 +225,12 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
unsigned long auth_tag_len = crypto_aead_authsize(__crypto_aead_cast(tfm));
u8 otag[16];
int total_processed = 0;
+ int nbytes;
memset(databuf, 0, sizeof(databuf));
memset(hashbuf, 0, sizeof(hashbuf));
memset(ivbuf, 0, sizeof(ivbuf));
- memcpy(iv, req->iv, GCM_IV_SIZE);
+ memcpy(iv, riv, GCM_IV_SIZE);
/* Linearize assoc, if not already linear */
if (req->src->length >= assoclen && req->src->length) {
@@ -257,19 +265,25 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
if (ret)
return ret;
- while (walk.nbytes > 0 && ret == 0) {
+ while ((nbytes = walk.nbytes) > 0 && ret == 0) {
+ u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ u8 buf[AES_BLOCK_SIZE];
+
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ src = dst = memcpy(buf, src, nbytes);
vsx_begin();
if (enc)
- aes_p10_gcm_encrypt(walk.src.virt.addr,
- walk.dst.virt.addr,
- walk.nbytes,
+ aes_p10_gcm_encrypt(src, dst, nbytes,
&ctx->enc_key, gctx->iv, hash->Htable);
else
- aes_p10_gcm_decrypt(walk.src.virt.addr,
- walk.dst.virt.addr,
- walk.nbytes,
+ aes_p10_gcm_decrypt(src, dst, nbytes,
&ctx->enc_key, gctx->iv, hash->Htable);
+
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr, buf, nbytes);
+
vsx_end();
total_processed += walk.nbytes;
@@ -281,6 +295,7 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
/* Finalize hash */
vsx_begin();
+ gcm_update(gctx->iv, hash->Htable);
finish_tag(gctx, hash, total_processed);
vsx_end();
@@ -302,17 +317,63 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
return 0;
}
+static int rfc4106_setkey(struct crypto_aead *tfm, const u8 *inkey,
+ unsigned int keylen)
+{
+ struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(tfm);
+ int err;
+
+ keylen -= RFC4106_NONCE_SIZE;
+ err = p10_aes_gcm_setkey(tfm, inkey, keylen);
+ if (err)
+ return err;
+
+ memcpy(ctx->nonce, inkey + keylen, RFC4106_NONCE_SIZE);
+ return 0;
+}
+
+static int rfc4106_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+ return crypto_rfc4106_check_authsize(authsize);
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(aead);
+ u8 iv[AES_BLOCK_SIZE];
+
+ memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE);
+ memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE);
+
+ return crypto_ipsec_check_assoclen(req->assoclen) ?:
+ p10_aes_gcm_crypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE, 1);
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(aead);
+ u8 iv[AES_BLOCK_SIZE];
+
+ memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE);
+ memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE);
+
+ return crypto_ipsec_check_assoclen(req->assoclen) ?:
+ p10_aes_gcm_crypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE, 0);
+}
+
static int p10_aes_gcm_encrypt(struct aead_request *req)
{
- return p10_aes_gcm_crypt(req, 1);
+ return p10_aes_gcm_crypt(req, req->iv, req->assoclen, 1);
}
static int p10_aes_gcm_decrypt(struct aead_request *req)
{
- return p10_aes_gcm_crypt(req, 0);
+ return p10_aes_gcm_crypt(req, req->iv, req->assoclen, 0);
}
-static struct aead_alg gcm_aes_alg = {
+static struct aead_alg gcm_aes_algs[] = {{
.ivsize = GCM_IV_SIZE,
.maxauthsize = 16,
@@ -321,23 +382,57 @@ static struct aead_alg gcm_aes_alg = {
.encrypt = p10_aes_gcm_encrypt,
.decrypt = p10_aes_gcm_decrypt,
- .base.cra_name = "gcm(aes)",
- .base.cra_driver_name = "aes_gcm_p10",
+ .base.cra_name = "__gcm(aes)",
+ .base.cra_driver_name = "__aes_gcm_p10",
.base.cra_priority = 2100,
.base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx),
+ .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx)+
+ 4 * sizeof(u64[2]),
.base.cra_module = THIS_MODULE,
-};
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+}, {
+ .ivsize = GCM_RFC4106_IV_SIZE,
+ .maxauthsize = 16,
+ .setkey = rfc4106_setkey,
+ .setauthsize = rfc4106_setauthsize,
+ .encrypt = rfc4106_encrypt,
+ .decrypt = rfc4106_decrypt,
+
+ .base.cra_name = "__rfc4106(gcm(aes))",
+ .base.cra_driver_name = "__rfc4106_aes_gcm_p10",
+ .base.cra_priority = 2100,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx) +
+ 4 * sizeof(u64[2]),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+}};
+
+static struct simd_aead_alg *p10_simd_aeads[ARRAY_SIZE(gcm_aes_algs)];
static int __init p10_init(void)
{
- return crypto_register_aead(&gcm_aes_alg);
+ int ret;
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_31))
+ return 0;
+
+ ret = simd_register_aeads_compat(gcm_aes_algs,
+ ARRAY_SIZE(gcm_aes_algs),
+ p10_simd_aeads);
+ if (ret) {
+ simd_unregister_aeads(gcm_aes_algs, ARRAY_SIZE(gcm_aes_algs),
+ p10_simd_aeads);
+ return ret;
+ }
+ return 0;
}
static void __exit p10_exit(void)
{
- crypto_unregister_aead(&gcm_aes_alg);
+ simd_unregister_aeads(gcm_aes_algs, ARRAY_SIZE(gcm_aes_algs),
+ p10_simd_aeads);
}
-module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init);
+module_init(p10_init);
module_exit(p10_exit);
diff --git a/arch/powerpc/crypto/aes-gcm-p10.S b/arch/powerpc/crypto/aes-gcm-p10.S
index a51f4b265308..89f50eef3512 100644
--- a/arch/powerpc/crypto/aes-gcm-p10.S
+++ b/arch/powerpc/crypto/aes-gcm-p10.S
@@ -1,42 +1,42 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
- #
- # Accelerated AES-GCM stitched implementation for ppc64le.
- #
- # Copyright 2022- IBM Inc. All rights reserved
- #
- #===================================================================================
- # Written by Danny Tsen <dtsen@linux.ibm.com>
- #
- # GHASH is based on the Karatsuba multiplication method.
- #
- # Xi xor X1
- #
- # X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
- # (X1.h * H4.h + xX.l * H4.l + X1 * H4) +
- # (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
- # (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
- # (X4.h * H.h + X4.l * H.l + X4 * H)
- #
- # Xi = v0
- # H Poly = v2
- # Hash keys = v3 - v14
- # ( H.l, H, H.h)
- # ( H^2.l, H^2, H^2.h)
- # ( H^3.l, H^3, H^3.h)
- # ( H^4.l, H^4, H^4.h)
- #
- # v30 is IV
- # v31 - counter 1
- #
- # AES used,
- # vs0 - vs14 for round keys
- # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
- #
- # This implementation uses stitched AES-GCM approach to improve overall performance.
- # AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
- #
- # ===================================================================================
- #
+#
+# Accelerated AES-GCM stitched implementation for ppc64le.
+#
+# Copyright 2024- IBM Inc.
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# GHASH is based on the Karatsuba multiplication method.
+#
+# Xi xor X1
+#
+# X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
+# (X1.h * H4.h + xX.l * H4.l + X1 * H4) +
+# (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
+# (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
+# (X4.h * H.h + X4.l * H.l + X4 * H)
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+# ( H.l, H, H.h)
+# ( H^2.l, H^2, H^2.h)
+# ( H^3.l, H^3, H^3.h)
+# ( H^4.l, H^4, H^4.h)
+#
+# v30 is IV
+# v31 - counter 1
+#
+# AES used,
+# vs0 - round key 0
+# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
+#
+# This implementation uses stitched AES-GCM approach to improve overall performance.
+# AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
+#
+# ===================================================================================
+#
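
The aggregation shown in this header comment, one multiplication per power of H
covering four blocks at a time, is algebraically identical to the serial GHASH
recurrence. The following stand-alone C sketch demonstrates the identity; it is
illustration only and uses a 16-bit toy field so the arithmetic stays short, whereas
real GHASH works in GF(2^128) with its own bit ordering.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* Carryless multiply reduced modulo x^16 + x^5 + x^3 + x + 1, a toy stand-in
 * for the GF(2^128) multiply; the identity below holds in either field. */
static uint16_t gf_mul(uint16_t a, uint16_t b)
{
	uint32_t r = 0;
	int i;

	for (i = 0; i < 16; i++)
		if (b & (1u << i))
			r ^= (uint32_t)a << i;
	for (i = 30; i >= 16; i--)
		if (r & (1u << i))
			r ^= 0x1002bu << (i - 16);
	return r;
}

int main(void)
{
	unsigned int t;

	for (t = 0; t < 10000; t++) {
		uint16_t H = rand(), Xi = rand();
		uint16_t X1 = rand(), X2 = rand(), X3 = rand(), X4 = rand();
		uint16_t H2 = gf_mul(H, H), H3 = gf_mul(H2, H), H4 = gf_mul(H3, H);
		uint16_t serial, agg;

		/* serial GHASH: one multiply by H per block */
		serial = gf_mul(Xi ^ X1, H);
		serial = gf_mul(serial ^ X2, H);
		serial = gf_mul(serial ^ X3, H);
		serial = gf_mul(serial ^ X4, H);

		/* aggregated form from the comment:
		 * (Xi ^ X1)*H^4 ^ X2*H^3 ^ X3*H^2 ^ X4*H */
		agg = gf_mul(Xi ^ X1, H4) ^ gf_mul(X2, H3) ^
		      gf_mul(X3, H2) ^ gf_mul(X4, H);

		assert(serial == agg);
	}
	return 0;
}

The per-block multiplication itself is carried out as the low/middle/high partial
products visible in PPC_GHASH4x below, followed by the reduction with the H
polynomial; the sketch only covers the aggregation across blocks.
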
#include <asm/ppc_asm.h>
#include <linux/linkage.h>
@@ -44,483 +44,224 @@
.machine "any"
.text
- # 4x loops
- # v15 - v18 - input states
- # vs1 - vs9 - round keys
- #
-.macro Loop_aes_middle4x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- vcipher 15, 15, 19
- vcipher 16, 16, 19
- vcipher 17, 17, 19
- vcipher 18, 18, 19
-
- vcipher 15, 15, 20
- vcipher 16, 16, 20
- vcipher 17, 17, 20
- vcipher 18, 18, 20
-
- vcipher 15, 15, 21
- vcipher 16, 16, 21
- vcipher 17, 17, 21
- vcipher 18, 18, 21
-
- vcipher 15, 15, 22
- vcipher 16, 16, 22
- vcipher 17, 17, 22
- vcipher 18, 18, 22
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- vcipher 15, 15, 19
- vcipher 16, 16, 19
- vcipher 17, 17, 19
- vcipher 18, 18, 19
-
- vcipher 15, 15, 20
- vcipher 16, 16, 20
- vcipher 17, 17, 20
- vcipher 18, 18, 20
-
- vcipher 15, 15, 21
- vcipher 16, 16, 21
- vcipher 17, 17, 21
- vcipher 18, 18, 21
-
- vcipher 15, 15, 22
- vcipher 16, 16, 22
- vcipher 17, 17, 22
- vcipher 18, 18, 22
-
- xxlor 23+32, 9, 9
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
+.macro SAVE_GPR GPR OFFSET FRAME
+ std \GPR,\OFFSET(\FRAME)
.endm
- # 8x loops
- # v15 - v22 - input states
- # vs1 - vs9 - round keys
- #
-.macro Loop_aes_middle8x
- xxlor 23+32, 1, 1
- xxlor 24+32, 2, 2
- xxlor 25+32, 3, 3
- xxlor 26+32, 4, 4
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- vcipher 15, 15, 25
- vcipher 16, 16, 25
- vcipher 17, 17, 25
- vcipher 18, 18, 25
- vcipher 19, 19, 25
- vcipher 20, 20, 25
- vcipher 21, 21, 25
- vcipher 22, 22, 25
-
- vcipher 15, 15, 26
- vcipher 16, 16, 26
- vcipher 17, 17, 26
- vcipher 18, 18, 26
- vcipher 19, 19, 26
- vcipher 20, 20, 26
- vcipher 21, 21, 26
- vcipher 22, 22, 26
-
- xxlor 23+32, 5, 5
- xxlor 24+32, 6, 6
- xxlor 25+32, 7, 7
- xxlor 26+32, 8, 8
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- vcipher 15, 15, 25
- vcipher 16, 16, 25
- vcipher 17, 17, 25
- vcipher 18, 18, 25
- vcipher 19, 19, 25
- vcipher 20, 20, 25
- vcipher 21, 21, 25
- vcipher 22, 22, 25
-
- vcipher 15, 15, 26
- vcipher 16, 16, 26
- vcipher 17, 17, 26
- vcipher 18, 18, 26
- vcipher 19, 19, 26
- vcipher 20, 20, 26
- vcipher 21, 21, 26
- vcipher 22, 22, 26
-
- xxlor 23+32, 9, 9
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
+.macro SAVE_VRS VRS OFFSET FRAME
+ stxv \VRS+32, \OFFSET(\FRAME)
.endm
-.macro Loop_aes_middle_1x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- vcipher 15, 15, 19
- vcipher 15, 15, 20
- vcipher 15, 15, 21
- vcipher 15, 15, 22
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- vcipher 15, 15, 19
- vcipher 15, 15, 20
- vcipher 15, 15, 21
- vcipher 15, 15, 22
-
- xxlor 19+32, 9, 9
- vcipher 15, 15, 19
+.macro RESTORE_GPR GPR OFFSET FRAME
+ ld \GPR,\OFFSET(\FRAME)
.endm
- #
- # Compute 4x hash values based on Karatsuba method.
- #
-.macro ppc_aes_gcm_ghash
- vxor 15, 15, 0
-
- vpmsumd 23, 12, 15 # H4.L * X.L
- vpmsumd 24, 9, 16
- vpmsumd 25, 6, 17
- vpmsumd 26, 3, 18
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26 # L
-
- vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
- vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
- vpmsumd 26, 7, 17
- vpmsumd 27, 4, 18
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27 # M
-
- # sum hash and reduction with H Poly
- vpmsumd 28, 23, 2 # reduction
-
- vxor 29, 29, 29
- vsldoi 26, 24, 29, 8 # mL
- vsldoi 29, 29, 24, 8 # mH
- vxor 23, 23, 26 # mL + L
-
- vsldoi 23, 23, 23, 8 # swap
- vxor 23, 23, 28
-
- vpmsumd 24, 14, 15 # H4.H * X.H
- vpmsumd 25, 11, 16
- vpmsumd 26, 8, 17
- vpmsumd 27, 5, 18
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
- # sum hash and reduction with H Poly
- vsldoi 27, 23, 23, 8 # swap
- vpmsumd 23, 23, 2
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32 # update hash
-
+.macro RESTORE_VRS VRS OFFSET FRAME
+ lxv \VRS+32, \OFFSET(\FRAME)
.endm
- #
- # Combine two 4x ghash
- # v15 - v22 - input blocks
- #
-.macro ppc_aes_gcm_ghash2_4x
- # first 4x hash
- vxor 15, 15, 0 # Xi + X
-
- vpmsumd 23, 12, 15 # H4.L * X.L
- vpmsumd 24, 9, 16
- vpmsumd 25, 6, 17
- vpmsumd 26, 3, 18
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26 # L
-
- vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
- vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
- vpmsumd 26, 7, 17
- vpmsumd 27, 4, 18
-
- vxor 24, 24, 25
- vxor 24, 24, 26
-
- # sum hash and reduction with H Poly
- vpmsumd 28, 23, 2 # reduction
-
- vxor 29, 29, 29
-
- vxor 24, 24, 27 # M
- vsldoi 26, 24, 29, 8 # mL
- vsldoi 29, 29, 24, 8 # mH
- vxor 23, 23, 26 # mL + L
-
- vsldoi 23, 23, 23, 8 # swap
- vxor 23, 23, 28
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-512(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+
+ addi 9, 1, 256
+ SAVE_VRS 20, 0, 9
+ SAVE_VRS 21, 16, 9
+ SAVE_VRS 22, 32, 9
+ SAVE_VRS 23, 48, 9
+ SAVE_VRS 24, 64, 9
+ SAVE_VRS 25, 80, 9
+ SAVE_VRS 26, 96, 9
+ SAVE_VRS 27, 112, 9
+ SAVE_VRS 28, 128, 9
+ SAVE_VRS 29, 144, 9
+ SAVE_VRS 30, 160, 9
+ SAVE_VRS 31, 176, 9
+.endm # SAVE_REGS
- vpmsumd 24, 14, 15 # H4.H * X.H
- vpmsumd 25, 11, 16
- vpmsumd 26, 8, 17
- vpmsumd 27, 5, 18
+.macro RESTORE_REGS
+ addi 9, 1, 256
+ RESTORE_VRS 20, 0, 9
+ RESTORE_VRS 21, 16, 9
+ RESTORE_VRS 22, 32, 9
+ RESTORE_VRS 23, 48, 9
+ RESTORE_VRS 24, 64, 9
+ RESTORE_VRS 25, 80, 9
+ RESTORE_VRS 26, 96, 9
+ RESTORE_VRS 27, 112, 9
+ RESTORE_VRS 28, 128, 9
+ RESTORE_VRS 29, 144, 9
+ RESTORE_VRS 30, 160, 9
+ RESTORE_VRS 31, 176, 9
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+
+ addi 1, 1, 512
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+# 4x loops
+.macro AES_CIPHER_4x _VCIPHER ST r
+ \_VCIPHER \ST, \ST, \r
+ \_VCIPHER \ST+1, \ST+1, \r
+ \_VCIPHER \ST+2, \ST+2, \r
+ \_VCIPHER \ST+3, \ST+3, \r
+.endm
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27 # H
+# 8x loops
+.macro AES_CIPHER_8x _VCIPHER ST r
+ \_VCIPHER \ST, \ST, \r
+ \_VCIPHER \ST+1, \ST+1, \r
+ \_VCIPHER \ST+2, \ST+2, \r
+ \_VCIPHER \ST+3, \ST+3, \r
+ \_VCIPHER \ST+4, \ST+4, \r
+ \_VCIPHER \ST+5, \ST+5, \r
+ \_VCIPHER \ST+6, \ST+6, \r
+ \_VCIPHER \ST+7, \ST+7, \r
+.endm
- vxor 24, 24, 29 # H + mH
+.macro LOOP_8AES_STATE
+ xxlor 32+23, 1, 1
+ xxlor 32+24, 2, 2
+ xxlor 32+25, 3, 3
+ xxlor 32+26, 4, 4
+ AES_CIPHER_8x vcipher, 15, 23
+ AES_CIPHER_8x vcipher, 15, 24
+ AES_CIPHER_8x vcipher, 15, 25
+ AES_CIPHER_8x vcipher, 15, 26
+ xxlor 32+23, 5, 5
+ xxlor 32+24, 6, 6
+ xxlor 32+25, 7, 7
+ xxlor 32+26, 8, 8
+ AES_CIPHER_8x vcipher, 15, 23
+ AES_CIPHER_8x vcipher, 15, 24
+ AES_CIPHER_8x vcipher, 15, 25
+ AES_CIPHER_8x vcipher, 15, 26
+.endm
- # sum hash and reduction with H Poly
- vsldoi 27, 23, 23, 8 # swap
- vpmsumd 23, 23, 2
- vxor 27, 27, 24
- vxor 27, 23, 27 # 1st Xi
-
- # 2nd 4x hash
- vpmsumd 24, 9, 20
- vpmsumd 25, 6, 21
- vpmsumd 26, 3, 22
- vxor 19, 19, 27 # Xi + X
- vpmsumd 23, 12, 19 # H4.L * X.L
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26 # L
-
- vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
- vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
- vpmsumd 26, 7, 21
- vpmsumd 27, 4, 22
-
- vxor 24, 24, 25
- vxor 24, 24, 26
+#
+# PPC_GHASH4x(H, S1, S2, S3, S4): Compute 4x hash values based on Karatsuba method.
+# H: returning digest
+# S#: states
+#
+# S1 should xor with the previous digest
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+# Scratch: v23 - v29
+#
+.macro PPC_GHASH4x H S1 S2 S3 S4
+
+ vpmsumd 23, 12, \S1 # H4.L * X.L
+ vpmsumd 24, 9, \S2
+ vpmsumd 25, 6, \S3
+ vpmsumd 26, 3, \S4
+
+ vpmsumd 27, 13, \S1 # H4.L * X.H + H4.H * X.L
+ vpmsumd 28, 10, \S2 # H3.L * X1.H + H3.H * X1.L
+
+ vxor 23, 23, 24
+ vxor 23, 23, 25
+ vxor 23, 23, 26 # L
+
+ vxor 24, 27, 28
+ vpmsumd 25, 7, \S3
+ vpmsumd 26, 4, \S4
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26 # M
# sum hash and reduction with H Poly
- vpmsumd 28, 23, 2 # reduction
-
- vxor 29, 29, 29
+ vpmsumd 28, 23, 2 # reduction
- vxor 24, 24, 27 # M
- vsldoi 26, 24, 29, 8 # mL
- vsldoi 29, 29, 24, 8 # mH
- vxor 23, 23, 26 # mL + L
+ vxor 1, 1, 1
+ vsldoi 25, 24, 1, 8 # mL
+ vsldoi 1, 1, 24, 8 # mH
+ vxor 23, 23, 25 # mL + L
- vsldoi 23, 23, 23, 8 # swap
- vxor 23, 23, 28
+ # This performs swap and xor like,
+ # vsldoi 23, 23, 23, 8 # swap
+ # vxor 23, 23, 28
+ xxlor 32+25, 10, 10
+ vpermxor 23, 23, 28, 25
- vpmsumd 24, 14, 19 # H4.H * X.H
- vpmsumd 25, 11, 20
- vpmsumd 26, 8, 21
- vpmsumd 27, 5, 22
+ vpmsumd 26, 14, \S1 # H4.H * X.H
+ vpmsumd 27, 11, \S2
+ vpmsumd 28, 8, \S3
+ vpmsumd 29, 5, \S4
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27 # H
+ vxor 24, 26, 27
+ vxor 24, 24, 28
+ vxor 24, 24, 29
- vxor 24, 24, 29 # H + mH
+ vxor 24, 24, 1
# sum hash and reduction with H Poly
- vsldoi 27, 23, 23, 8 # swap
- vpmsumd 23, 23, 2
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32 # update hash
-
+ vsldoi 25, 23, 23, 8 # swap
+ vpmsumd 23, 23, 2
+ vxor 27, 25, 24
+ vxor \H, 23, 27
.endm
- #
- # Compute update single hash
- #
-.macro ppc_update_hash_1x
- vxor 28, 28, 0
-
- vxor 19, 19, 19
+#
+# Compute update single ghash
+# scratch: v1, v22..v27
+#
+.macro PPC_GHASH1x H S1
- vpmsumd 22, 3, 28 # L
- vpmsumd 23, 4, 28 # M
- vpmsumd 24, 5, 28 # H
+ vxor 1, 1, 1
- vpmsumd 27, 22, 2 # reduction
+ vpmsumd 22, 3, \S1 # L
+ vpmsumd 23, 4, \S1 # M
+ vpmsumd 24, 5, \S1 # H
- vsldoi 25, 23, 19, 8 # mL
- vsldoi 26, 19, 23, 8 # mH
- vxor 22, 22, 25 # LL + LL
- vxor 24, 24, 26 # HH + HH
+ vpmsumd 27, 22, 2 # reduction
- vsldoi 22, 22, 22, 8 # swap
- vxor 22, 22, 27
+ vsldoi 25, 23, 1, 8 # mL
+ vsldoi 26, 1, 23, 8 # mH
+ vxor 22, 22, 25 # LL + LL
+ vxor 24, 24, 26 # HH + HH
- vsldoi 20, 22, 22, 8 # swap
- vpmsumd 22, 22, 2 # reduction
- vxor 20, 20, 24
- vxor 22, 22, 20
+ xxlor 32+25, 10, 10
+ vpermxor 22, 22, 27, 25
- vmr 0, 22 # update hash
-
-.endm
-
-.macro SAVE_REGS
- stdu 1,-640(1)
- mflr 0
-
- std 14,112(1)
- std 15,120(1)
- std 16,128(1)
- std 17,136(1)
- std 18,144(1)
- std 19,152(1)
- std 20,160(1)
- std 21,168(1)
- li 9, 256
- stvx 20, 9, 1
- addi 9, 9, 16
- stvx 21, 9, 1
- addi 9, 9, 16
- stvx 22, 9, 1
- addi 9, 9, 16
- stvx 23, 9, 1
- addi 9, 9, 16
- stvx 24, 9, 1
- addi 9, 9, 16
- stvx 25, 9, 1
- addi 9, 9, 16
- stvx 26, 9, 1
- addi 9, 9, 16
- stvx 27, 9, 1
- addi 9, 9, 16
- stvx 28, 9, 1
- addi 9, 9, 16
- stvx 29, 9, 1
- addi 9, 9, 16
- stvx 30, 9, 1
- addi 9, 9, 16
- stvx 31, 9, 1
- stxv 14, 464(1)
- stxv 15, 480(1)
- stxv 16, 496(1)
- stxv 17, 512(1)
- stxv 18, 528(1)
- stxv 19, 544(1)
- stxv 20, 560(1)
- stxv 21, 576(1)
- stxv 22, 592(1)
- std 0, 656(1)
-.endm
-
-.macro RESTORE_REGS
- lxv 14, 464(1)
- lxv 15, 480(1)
- lxv 16, 496(1)
- lxv 17, 512(1)
- lxv 18, 528(1)
- lxv 19, 544(1)
- lxv 20, 560(1)
- lxv 21, 576(1)
- lxv 22, 592(1)
- li 9, 256
- lvx 20, 9, 1
- addi 9, 9, 16
- lvx 21, 9, 1
- addi 9, 9, 16
- lvx 22, 9, 1
- addi 9, 9, 16
- lvx 23, 9, 1
- addi 9, 9, 16
- lvx 24, 9, 1
- addi 9, 9, 16
- lvx 25, 9, 1
- addi 9, 9, 16
- lvx 26, 9, 1
- addi 9, 9, 16
- lvx 27, 9, 1
- addi 9, 9, 16
- lvx 28, 9, 1
- addi 9, 9, 16
- lvx 29, 9, 1
- addi 9, 9, 16
- lvx 30, 9, 1
- addi 9, 9, 16
- lvx 31, 9, 1
-
- ld 0, 656(1)
- ld 14,112(1)
- ld 15,120(1)
- ld 16,128(1)
- ld 17,136(1)
- ld 18,144(1)
- ld 19,152(1)
- ld 20,160(1)
- ld 21,168(1)
-
- mtlr 0
- addi 1, 1, 640
+ vsldoi 23, 22, 22, 8 # swap
+ vpmsumd 22, 22, 2 # reduction
+ vxor 23, 23, 24
+ vxor \H, 22, 23
.endm
+#
+# LOAD_HASH_TABLE
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+#
.macro LOAD_HASH_TABLE
# Load Xi
lxvb16x 32, 0, 8 # load Xi
@@ -557,657 +298,434 @@
lxvd2x 14+32, 10, 8 # H^4h
.endm
- #
- # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
- # const char *rk, unsigned char iv[16], void *Xip);
- #
- # r3 - inp
- # r4 - out
- # r5 - len
- # r6 - AES round keys
- # r7 - iv and other data
- # r8 - Xi, HPoli, hash keys
- #
- # rounds is at offset 240 in rk
- # Xi is at 0 in gcm_table (Xip).
- #
-_GLOBAL(aes_p10_gcm_encrypt)
-.align 5
-
- SAVE_REGS
-
- LOAD_HASH_TABLE
-
- # initialize ICB: GHASH( IV ), IV - r7
- lxvb16x 30+32, 0, 7 # load IV - v30
-
- mr 12, 5 # length
- li 11, 0 # block index
-
- # counter 1
- vxor 31, 31, 31
- vspltisb 22, 1
- vsldoi 31, 31, 22,1 # counter 1
-
- # load round key to VSR
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
-
- # load rounds - 10 (128), 12 (192), 14 (256)
- lwz 9,240(6)
-
- #
- # vxor state, state, w # addroundkey
- xxlor 32+29, 0, 0
- vxor 15, 30, 29 # IV + round key - add round key 0
-
- cmpdi 9, 10
- beq Loop_aes_gcm_8x
-
- # load 2 more round keys (v11, v12)
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
-
- cmpdi 9, 12
- beq Loop_aes_gcm_8x
-
- # load 2 more round keys (v11, v12, v13, v14)
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq Loop_aes_gcm_8x
-
- b aes_gcm_out
-
-.align 5
-Loop_aes_gcm_8x:
- mr 14, 3
- mr 9, 4
-
- #
- # check partial block
- #
-Continue_partial_check:
- ld 15, 56(7)
- cmpdi 15, 0
- beq Continue
- bgt Final_block
- cmpdi 15, 16
- blt Final_block
-
-Continue:
- # n blcoks
- li 10, 128
- divdu 10, 12, 10 # n 128 bytes-blocks
- cmpdi 10, 0
- beq Loop_last_block
-
- vaddudm 30, 30, 31 # IV + counter
- vxor 16, 30, 29
- vaddudm 30, 30, 31
- vxor 17, 30, 29
- vaddudm 30, 30, 31
- vxor 18, 30, 29
- vaddudm 30, 30, 31
- vxor 19, 30, 29
- vaddudm 30, 30, 31
- vxor 20, 30, 29
- vaddudm 30, 30, 31
- vxor 21, 30, 29
- vaddudm 30, 30, 31
- vxor 22, 30, 29
-
- mtctr 10
-
- li 15, 16
- li 16, 32
- li 17, 48
- li 18, 64
- li 19, 80
- li 20, 96
- li 21, 112
-
- lwz 10, 240(6)
-
-Loop_8x_block:
-
- lxvb16x 15, 0, 14 # load block
- lxvb16x 16, 15, 14 # load block
- lxvb16x 17, 16, 14 # load block
- lxvb16x 18, 17, 14 # load block
- lxvb16x 19, 18, 14 # load block
- lxvb16x 20, 19, 14 # load block
- lxvb16x 21, 20, 14 # load block
- lxvb16x 22, 21, 14 # load block
- addi 14, 14, 128
-
- Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_ghash
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_ghash
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_ghash
- b aes_gcm_out
-
-Do_next_ghash:
-
- #
- # last round
- vcipherlast 15, 15, 23
- vcipherlast 16, 16, 23
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9 # store output
-
- vcipherlast 17, 17, 23
- vcipherlast 18, 18, 23
-
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9 # store output
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9 # store output
-
- vcipherlast 19, 19, 23
- vcipherlast 20, 20, 23
-
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9 # store output
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9 # store output
-
- vcipherlast 21, 21, 23
- vcipherlast 22, 22, 23
-
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9 # store output
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9 # store output
-
- addi 9, 9, 128
-
- # ghash here
- ppc_aes_gcm_ghash2_4x
-
- xxlor 27+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vmr 29, 30
- vxor 15, 30, 27 # add round key
- vaddudm 30, 30, 31
- vxor 16, 30, 27
- vaddudm 30, 30, 31
- vxor 17, 30, 27
- vaddudm 30, 30, 31
- vxor 18, 30, 27
- vaddudm 30, 30, 31
- vxor 19, 30, 27
- vaddudm 30, 30, 31
- vxor 20, 30, 27
- vaddudm 30, 30, 31
- vxor 21, 30, 27
- vaddudm 30, 30, 31
- vxor 22, 30, 27
-
- addi 12, 12, -128
- addi 11, 11, 128
-
- bdnz Loop_8x_block
-
- vmr 30, 29
- stxvb16x 30+32, 0, 7 # update IV
-
-Loop_last_block:
- cmpdi 12, 0
- beq aes_gcm_out
-
- # loop last few blocks
+################################################################################
+# Compute AES and ghash one block at a time.
+# r23: AES rounds
+# v30: current IV
+# vs0: roundkey 0
+#
+################################################################################
+SYM_FUNC_START_LOCAL(aes_gcm_crypt_1x)
+
+ cmpdi 5, 16
+ bge __More_1x
+ blr
+__More_1x:
li 10, 16
- divdu 10, 12, 10
-
- mtctr 10
-
- lwz 10, 240(6)
-
- cmpdi 12, 16
- blt Final_block
-
-Next_rem_block:
- lxvb16x 15, 0, 14 # load block
-
- Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_1x
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 12, 12
+ divdu 12, 5, 10
+
+ xxlxor 32+15, 32+30, 0
+
+	# Pre-load 8 AES round keys to scratch vectors.
+ xxlor 32+16, 1, 1
+ xxlor 32+17, 2, 2
+ xxlor 32+18, 3, 3
+ xxlor 32+19, 4, 4
+ xxlor 32+20, 5, 5
+ xxlor 32+21, 6, 6
+ xxlor 32+28, 7, 7
+ xxlor 32+29, 8, 8
+ lwz 23, 240(6) # n rounds
+	addi	22, 23, -9		# remaining AES rounds
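+	# round keys 1-8 are applied via the scratch vectors above and the last
+	# round by vcipherlast, leaving 1/3/5 middle rounds for AES-128/192/256.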
- cmpdi 10, 12
- beq Do_next_1x
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_1x
-
-Do_next_1x:
- vcipherlast 15, 15, 23
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- addi 14, 14, 16
- addi 9, 9, 16
-
- vmr 28, 15
- ppc_update_hash_1x
-
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vxor 15, 30, 19 # add round key
-
- bdnz Next_rem_block
-
- li 15, 0
- std 15, 56(7) # clear partial?
- stxvb16x 30+32, 0, 7 # update IV
cmpdi 12, 0
- beq aes_gcm_out
-
-Final_block:
- lwz 10, 240(6)
- Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_final_1x
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_final_1x
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
+ bgt __Loop_1x
+ blr
- xxlor 23+32, 14, 14
+__Loop_1x:
+ mtctr 22
+ addi 10, 6, 144
+ vcipher 15, 15, 16
+ vcipher 15, 15, 17
+ vcipher 15, 15, 18
+ vcipher 15, 15, 19
+ vcipher 15, 15, 20
+ vcipher 15, 15, 21
+ vcipher 15, 15, 28
+ vcipher 15, 15, 29
- cmpdi 10, 14
- beq Do_final_1x
+__Loop_aes_1state:
+ lxv 32+1, 0(10)
+ vcipher 15, 15, 1
+ addi 10, 10, 16
+ bdnz __Loop_aes_1state
+ lxv 32+1, 0(10) # last round key
+ lxvb16x 11, 0, 14 # load input block
+ vcipherlast 15, 15, 1
+
+ xxlxor 32+15, 32+15, 11
+ stxvb16x 32+15, 0, 9 # store output
+ addi 14, 14, 16
+ addi 9, 9, 16
-Do_final_1x:
- vcipherlast 15, 15, 23
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_1x
+ xxlor 15+32, 11, 11
+__Encrypt_1x:
+ vxor 15, 15, 0
+ PPC_GHASH1x 0, 15
- # check partial block
- li 21, 0 # encrypt
- ld 15, 56(7) # partial?
- cmpdi 15, 0
- beq Normal_block
- bl Do_partial_block
+ addi 5, 5, -16
+ addi 11, 11, 16
+ vadduwm 30, 30, 31 # IV + counter
+ xxlxor 32+15, 32+30, 0
+ addi 12, 12, -1
cmpdi 12, 0
- ble aes_gcm_out
+ bgt __Loop_1x
- b Continue_partial_check
-
-Normal_block:
- lxvb16x 15, 0, 14 # load last block
- xxlxor 47, 47, 15
-
- # create partial block mask
- li 15, 16
- sub 15, 15, 12 # index to the mask
-
- vspltisb 16, -1 # first 16 bytes - 0xffff...ff
- vspltisb 17, 0 # second 16 bytes - 0x0000...00
- li 10, 192
- stvx 16, 10, 1
+ stxvb16x 32+30, 0, 7 # update IV
+ stxvb16x 32+0, 0, 8 # update Xi
+ blr
+SYM_FUNC_END(aes_gcm_crypt_1x)
+
+################################################################################
+# Process a normal partial block when we come here.
+# Compute the partial mask, load and store the partial block to the stack.
+# Update partial_len and pblock.
+# pblock is (encrypted ^ AES state) for encrypt
+# and (input ^ AES state) for decrypt.
+#
+################################################################################
+SYM_FUNC_START_LOCAL(__Process_partial)
+
+ # create partial mask
+ vspltisb 16, -1
+ li 12, 16
+ sub 12, 12, 5
+ sldi 12, 12, 3
+ mtvsrdd 32+17, 0, 12
+ vslo 16, 16, 17 # partial block mask
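+	# e.g. len (r5) = 5: shift the all-ones vector left by (16 - 5) = 11
+	# bytes, leaving 0xff in the top 5 bytes; since lxvb16x loads the block
+	# big-endian, this keeps the first 5 input bytes and clears the rest.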
+
+ lxvb16x 11, 0, 14 # load partial block
+ xxland 11, 11, 32+16
+
+ # AES crypt partial
+ xxlxor 32+15, 32+30, 0
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -1 # loop - 1
+ mtctr 22
+ addi 10, 6, 16
+
+__Loop_aes_pstate:
+ lxv 32+1, 0(10)
+ vcipher 15, 15, 1
addi 10, 10, 16
- stvx 17, 10, 1
-
- addi 10, 1, 192
- lxvb16x 16, 15, 10 # load partial block mask
- xxland 47, 47, 16
-
- vmr 28, 15
- ppc_update_hash_1x
+ bdnz __Loop_aes_pstate
+ lxv 32+1, 0(10) # last round key
+ vcipherlast 15, 15, 1
- # * should store only the remaining bytes.
- bl Write_partial_block
-
- stxvb16x 30+32, 0, 7 # update IV
- std 12, 56(7) # update partial?
- li 16, 16
+ xxlxor 32+15, 32+15, 11
+ vand 15, 15, 16
- stxvb16x 32, 0, 8 # write out Xi
- stxvb16x 32, 16, 8 # write out Xi
- b aes_gcm_out
-
- #
- # Compute data mask
- #
-.macro GEN_MASK _mask _start _end
- vspltisb 16, -1 # first 16 bytes - 0xffff...ff
- vspltisb 17, 0 # second 16 bytes - 0x0000...00
- li 10, 192
- stxvb16x 17+32, 10, 1
- add 10, 10, \_start
- stxvb16x 16+32, 10, 1
- add 10, 10, \_end
- stxvb16x 17+32, 10, 1
-
- addi 10, 1, 192
- lxvb16x \_mask, 0, 10 # load partial block mask
-.endm
+ # AES crypt output v15
+ # Write partial
+ li 10, 224
+ stxvb16x 15+32, 10, 1 # write v15 to stack
+ addi 10, 1, 223
+ addi 12, 9, -1
+ mtctr 5 # partial block len
+__Write_partial:
+ lbzu 22, 1(10)
+ stbu 22, 1(12)
+ bdnz __Write_partial
+
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_partial
+ xxlor 32+15, 11, 11 # decrypt using the input block
+__Encrypt_partial:
+ #vxor 15, 15, 0 # ^ previous hash
+ #PPC_GHASH1x 0, 15
+
+ add 14, 14, 5
+ add 9, 9, 5
+ std 5, 56(7) # update partial
+ sub 11, 11, 5
+ li 5, 0 # done last byte
- #
- # Handle multiple partial blocks for encrypt and decrypt
- # operations.
- #
-SYM_FUNC_START_LOCAL(Do_partial_block)
- add 17, 15, 5
- cmpdi 17, 16
- bgt Big_block
- GEN_MASK 18, 15, 5
- b _Partial
-SYM_FUNC_END(Do_partial_block)
-Big_block:
+ #
+ # Don't increase IV since this is the last partial.
+	# It should get updated in gcm_update if there are no more data blocks.
+ #vadduwm 30, 30, 31 # increase IV
+ stxvb16x 32+30, 0, 7 # update IV
+ li 10, 64
+	stxvb16x 32+0, 0, 8		# update Xi
+ stxvb16x 32+15, 10, 7 # Update pblock
+ blr
+SYM_FUNC_END(__Process_partial)
+
+################################################################################
+# Combine partial blocks and ghash when we come here.
+#
+# The partial block has to be shifted to the right location to encrypt/decrypt
+# and compute ghash if combining with the previous partial block is needed.
+# - Compute ghash for a full block. Clear Partial_len and pblock. Update IV.
+# Write Xi.
+# - Don't compute ghash if not a full block. gcm_update will take care of it
+#   if it is the last block. Update Partial_len and pblock.
+#
+################################################################################
+SYM_FUNC_START_LOCAL(__Combine_partial)
+
+ ld 12, 56(7)
+ mr 21, 5 # these bytes to be processed
+
+ li 17, 0
li 16, 16
- GEN_MASK 18, 15, 16
-
-_Partial:
- lxvb16x 17+32, 0, 14 # load last block
- sldi 16, 15, 3
- mtvsrdd 32+16, 0, 16
- vsro 17, 17, 16
- xxlxor 47, 47, 17+32
- xxland 47, 47, 18
-
- vxor 0, 0, 0 # clear Xi
- vmr 28, 15
-
- cmpdi 21, 0 # encrypt/decrypt ops?
- beq Skip_decrypt
- xxland 32+28, 32+17, 18
-
-Skip_decrypt:
-
- ppc_update_hash_1x
+ sub 22, 16, 12 # bytes to complete a block
+ sub 17, 22, 5 # remaining bytes in a block
+ cmpdi 5, 16
+ ble __Inp_msg_less16
+ li 17, 0
+ mr 21, 22
+ b __Combine_continue
+__Inp_msg_less16:
+ cmpd 22, 5
+ bgt __Combine_continue
+ li 17, 0
+ mr 21, 22 # these bytes to be processed
+
+__Combine_continue:
+ # load msg and shift to the proper location and mask
+ vspltisb 16, -1
+ sldi 15, 12, 3
+ mtvsrdd 32+17, 0, 15
+ vslo 16, 16, 17
+ vsro 16, 16, 17
+ sldi 15, 17, 3
+ mtvsrdd 32+17, 0, 15
+ vsro 16, 16, 17
+ vslo 16, 16, 17 # mask
+
+ lxvb16x 32+19, 0, 14 # load partial block
+ sldi 15, 12, 3
+ mtvsrdd 32+17, 0, 15
+ vsro 19, 19, 17 # 0x00..xxxx??..??
+ sldi 15, 17, 3
+ mtvsrdd 32+17, 0, 15
+ vsro 19, 19, 17 # 0x00..xxxx
+ vslo 19, 19, 17 # shift back to form 0x00..xxxx00..00
+
+ # AES crypt partial
+ xxlxor 32+15, 32+30, 0
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -1 # loop - 1
+ mtctr 22
+ addi 10, 6, 16
+
+__Loop_aes_cpstate:
+ lxv 32+1, 0(10)
+ vcipher 15, 15, 1
+ addi 10, 10, 16
+ bdnz __Loop_aes_cpstate
+ lxv 32+1, 0(10) # last round key
+ vcipherlast 15, 15, 1
- li 16, 16
- lxvb16x 32+29, 16, 8
- vxor 0, 0, 29
- stxvb16x 32, 0, 8 # save Xi
- stxvb16x 32, 16, 8 # save Xi
-
- # store partial block
- # loop the rest of the stream if any
- sldi 16, 15, 3
- mtvsrdd 32+16, 0, 16
- vslo 15, 15, 16
- #stxvb16x 15+32, 0, 9 # last block
+ vxor 15, 15, 19
+ vand 15, 15, 16
- li 16, 16
- sub 17, 16, 15 # 16 - partial
-
- add 16, 15, 5
- cmpdi 16, 16
- bgt Larger_16
- mr 17, 5
-Larger_16:
-
- # write partial
- li 10, 192
- stxvb16x 15+32, 10, 1 # save current block
-
- addi 10, 9, -1
- addi 16, 1, 191
- mtctr 17 # move partial byte count
-
-Write_last_partial:
- lbzu 18, 1(16)
- stbu 18, 1(10)
- bdnz Write_last_partial
- # Complete loop partial
-
- add 14, 14, 17
- add 9, 9, 17
- sub 12, 12, 17
- add 11, 11, 17
-
- add 15, 15, 5
- cmpdi 15, 16
- blt Save_partial
-
- vaddudm 30, 30, 31
- stxvb16x 30+32, 0, 7 # update IV
- xxlor 32+29, 0, 0
- vxor 15, 30, 29 # IV + round key - add round key 0
- li 15, 0
- std 15, 56(7) # partial done - clear
- b Partial_done
-Save_partial:
- std 15, 56(7) # partial
-
-Partial_done:
+ # AES crypt output v15
+ # Write partial
+ li 10, 224
+ stxvb16x 15+32, 10, 1 # write v15 to stack
+ addi 10, 1, 223
+ add 10, 10, 12 # add offset
+ addi 15, 9, -1
+ mtctr 21 # partial block len
+__Write_combine_partial:
+ lbzu 22, 1(10)
+ stbu 22, 1(15)
+ bdnz __Write_combine_partial
+
+ add 14, 14, 21
+ add 11, 11, 21
+ add 9, 9, 21
+ sub 5, 5, 21
+
+ # Encrypt/Decrypt?
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_combine_partial
+ vmr 15, 19 # decrypt using the input block
+
+__Encrypt_combine_partial:
+ #
+ # Update partial flag and combine ghash.
+__Update_partial_ghash:
+ li 10, 64
+ lxvb16x 32+17, 10, 7 # load previous pblock
+	add	12, 12, 21		# combined processed bytes
+ vxor 15, 15, 17 # combined pblock
+
+ cmpdi 12, 16
+ beq __Clear_partial_flag
+ std 12, 56(7) # update partial len
+ stxvb16x 32+15, 10, 7 # Update current pblock
blr
- #
- # Write partial block
- # r9 - output
- # r12 - remaining bytes
- # v15 - partial input data
- #
-SYM_FUNC_START_LOCAL(Write_partial_block)
- li 10, 192
- stxvb16x 15+32, 10, 1 # last block
-
- addi 10, 9, -1
- addi 16, 1, 191
-
- mtctr 12 # remaining bytes
- li 15, 0
-
-Write_last_byte:
- lbzu 14, 1(16)
- stbu 14, 1(10)
- bdnz Write_last_byte
+__Clear_partial_flag:
+ li 12, 0
+ std 12, 56(7)
+ # Update IV and ghash here
+ vadduwm 30, 30, 31 # increase IV
+ stxvb16x 32+30, 0, 7 # update IV
+
+	# v15 is either (input block or encrypted block) ^ (AES state)
+ vxor 15, 15, 0
+ PPC_GHASH1x 0, 15
+ stxvb16x 32+0, 10, 7 # update pblock for debug?
+ stxvb16x 32+0, 0, 8 # update Xi
blr
-SYM_FUNC_END(Write_partial_block)
+SYM_FUNC_END(__Combine_partial)
-aes_gcm_out:
- # out = state
- stxvb16x 32, 0, 8 # write out Xi
- add 3, 11, 12 # return count
+################################################################################
+# gcm_update(iv, Xi) - compute last hash
+#
+################################################################################
+SYM_FUNC_START(gcm_update)
- RESTORE_REGS
- blr
+ ld 10, 56(3)
+ cmpdi 10, 0
+ beq __no_update
- #
- # 8x Decrypt
- #
-_GLOBAL(aes_p10_gcm_decrypt)
-.align 5
+ lxvb16x 32, 0, 4 # load Xi
+ # load Hash - h^4, h^3, h^2, h
+ li 10, 32
+	lxvd2x	2+32, 10, 4	# H Poly
+ li 10, 48
+ lxvd2x 3+32, 10, 4 # Hl
+ li 10, 64
+ lxvd2x 4+32, 10, 4 # H
+ li 10, 80
+ lxvd2x 5+32, 10, 4 # Hh
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+
+ li 9, 64
+ lxvb16x 32+6, 9, 3 # load pblock
+ vxor 6, 6, 0
+
+ vxor 1, 1, 1
+ vpmsumd 12, 3, 6 # L
+ vpmsumd 13, 4, 6 # M
+ vpmsumd 14, 5, 6 # H
+ vpmsumd 17, 12, 2 # reduction
+ vsldoi 15, 13, 1, 8 # mL
+ vsldoi 16, 1, 13, 8 # mH
+ vxor 12, 12, 15 # LL + LL
+ vxor 14, 14, 16 # HH + HH
+ xxlor 32+15, 10, 10
+ vpermxor 12, 12, 17, 15
+ vsldoi 13, 12, 12, 8 # swap
+ vpmsumd 12, 12, 2 # reduction
+ vxor 13, 13, 14
+ vxor 7, 12, 13
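+	# Single GF(2^128) multiply of (pblock ^ Xi) by the hash key: vpmsumd
+	# forms the low/middle/high partial products, the middle term is folded
+	# into L and H, and two reductions by the poly in v2 leave the hash in v7.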
+
+ #vxor 0, 0, 0
+ #stxvb16x 32+0, 9, 3
+ li 10, 0
+ std 10, 56(3)
+ stxvb16x 32+7, 0, 4
+
+__no_update:
+ blr
+SYM_FUNC_END(gcm_update)
+
+################################################################################
+# aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+#
+# r3 - inp
+# r4 - out
+# r5 - len
+# r6 - AES round keys
+# r7 - iv and other data
+# r8 - Xi, H Poly, hash keys
+#
+# rounds is at offset 240 in rk
+# Xi is at 0 in gcm_table (Xip).
+#
+################################################################################
+SYM_FUNC_START(aes_p10_gcm_encrypt)
+
+ cmpdi 5, 0
+ ble __Invalid_msg_len
SAVE_REGS
-
LOAD_HASH_TABLE
# initialize ICB: GHASH( IV ), IV - r7
lxvb16x 30+32, 0, 7 # load IV - v30
- mr 12, 5 # length
- li 11, 0 # block index
+ mr 14, 3
+ mr 9, 4
# counter 1
vxor 31, 31, 31
vspltisb 22, 1
vsldoi 31, 31, 22,1 # counter 1
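+	# v31 = 0x00..01; adding it word-wise (vadduwm) bumps the low-order
+	# 32-bit word, i.e. the big-endian counter word of the IV from lxvb16x.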
- # load round key to VSR
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
+
+ # load 9 round keys to VSR
+ lxv 0, 0(6) # round key 0
+ lxv 1, 16(6) # round key 1
+ lxv 2, 32(6) # round key 2
+ lxv 3, 48(6) # round key 3
+ lxv 4, 64(6) # round key 4
+ lxv 5, 80(6) # round key 5
+ lxv 6, 96(6) # round key 6
+ lxv 7, 112(6) # round key 7
+ lxv 8, 128(6) # round key 8
# load rounds - 10 (128), 12 (192), 14 (256)
- lwz 9,240(6)
+ lwz 23, 240(6) # n rounds
+ li 24, 1 # encrypt
+__Process_encrypt:
#
- # vxor state, state, w # addroundkey
- xxlor 32+29, 0, 0
- vxor 15, 30, 29 # IV + round key - add round key 0
-
- cmpdi 9, 10
- beq Loop_aes_gcm_8x_dec
-
- # load 2 more round keys (v11, v12)
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
-
- cmpdi 9, 12
- beq Loop_aes_gcm_8x_dec
-
- # load 2 more round keys (v11, v12, v13, v14)
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq Loop_aes_gcm_8x_dec
+ # Process different blocks
+ #
+ ld 12, 56(7)
+ cmpdi 12, 0
+ bgt __Do_combine_enc
+ cmpdi 5, 128
+ blt __Process_more_enc
+
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_enc:
+	# 8x blocks
+ li 10, 128
+ divdu 12, 5, 10 # n 128 bytes-blocks
- b aes_gcm_out
+ addi 12, 12, -1 # loop - 1
-.align 5
-Loop_aes_gcm_8x_dec:
- mr 14, 3
- mr 9, 4
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
- #
- # check partial block
- #
-Continue_partial_check_dec:
- ld 15, 56(7)
- cmpdi 15, 0
- beq Continue_dec
- bgt Final_block_dec
- cmpdi 15, 16
- blt Final_block_dec
-
-Continue_dec:
- # n blcoks
- li 10, 128
- divdu 10, 12, 10 # n 128 bytes-blocks
- cmpdi 10, 0
- beq Loop_last_block_dec
-
- vaddudm 30, 30, 31 # IV + counter
- vxor 16, 30, 29
- vaddudm 30, 30, 31
- vxor 17, 30, 29
- vaddudm 30, 30, 31
- vxor 18, 30, 29
- vaddudm 30, 30, 31
- vxor 19, 30, 29
- vaddudm 30, 30, 31
- vxor 20, 30, 29
- vaddudm 30, 30, 31
- vxor 21, 30, 29
- vaddudm 30, 30, 31
- vxor 22, 30, 29
-
- mtctr 10
+ # vxor state, state, w # addroundkey
+ xxlor 32+29, 0, 0
+ vxor 15, 15, 29 # IV + round key - add round key 0
+ vxor 16, 16, 29
+ vxor 17, 17, 29
+ vxor 18, 18, 29
+ vxor 19, 19, 29
+ vxor 20, 20, 29
+ vxor 21, 21, 29
+ vxor 22, 22, 29
li 15, 16
li 16, 32
@@ -1217,305 +735,502 @@ Continue_dec:
li 20, 96
li 21, 112
- lwz 10, 240(6)
-
-Loop_8x_block_dec:
-
- lxvb16x 15, 0, 14 # load block
- lxvb16x 16, 15, 14 # load block
- lxvb16x 17, 16, 14 # load block
- lxvb16x 18, 17, 14 # load block
- lxvb16x 19, 18, 14 # load block
- lxvb16x 20, 19, 14 # load block
- lxvb16x 21, 20, 14 # load block
- lxvb16x 22, 21, 14 # load block
- addi 14, 14, 128
-
- Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_ghash_dec
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_ghash_dec
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_ghash_dec
- b aes_gcm_out
+ #
+	# Pre-compute the first 8 AES states and leave 1/3/5 more rounds
+ # for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
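+	# Keys 1-8 are applied by LOOP_8AES_STATE below, so the counted loop
+	# runs the remaining 1/3/5 middle rounds (AES-128/192/256) starting at
+	# round key 9 (offset 9 * 16 = 144).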
-Do_next_ghash_dec:
+ LOOP_8AES_STATE # process 8 AES keys
- #
- # last round
- vcipherlast 15, 15, 23
- vcipherlast 16, 16, 23
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9 # store output
-
- vcipherlast 17, 17, 23
- vcipherlast 18, 18, 23
-
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9 # store output
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9 # store output
-
- vcipherlast 19, 19, 23
- vcipherlast 20, 20, 23
-
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9 # store output
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9 # store output
-
- vcipherlast 21, 21, 23
- vcipherlast 22, 22, 23
-
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9 # store output
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9 # store output
-
- addi 9, 9, 128
-
- xxlor 15+32, 15, 15
- xxlor 16+32, 16, 16
- xxlor 17+32, 17, 17
- xxlor 18+32, 18, 18
- xxlor 19+32, 19, 19
- xxlor 20+32, 20, 20
- xxlor 21+32, 21, 21
- xxlor 22+32, 22, 22
+__PreLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state
+ lxv 32+1, 0(10) # last round key (v1)
+
+	cmpdi	12, 0			# Only one loop (8 blocks)
+ beq __Finish_ghash
+
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_enc:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
# ghash here
- ppc_aes_gcm_ghash2_4x
-
- xxlor 27+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vmr 29, 30
- vxor 15, 30, 27 # add round key
- vaddudm 30, 30, 31
- vxor 16, 30, 27
- vaddudm 30, 30, 31
- vxor 17, 30, 27
- vaddudm 30, 30, 31
- vxor 18, 30, 27
- vaddudm 30, 30, 31
- vxor 19, 30, 27
- vaddudm 30, 30, 31
- vxor 20, 30, 27
- vaddudm 30, 30, 31
- vxor 21, 30, 27
- vaddudm 30, 30, 31
- vxor 22, 30, 27
-
- addi 12, 12, -128
+ vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
+
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
+
+ xxlor 32+15, 9, 9 # last state
+ vadduwm 15, 15, 31 # state + counter
+ vadduwm 16, 15, 31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ xxlor 32+27, 0, 0 # restore roundkey 0
+ vxor 15, 15, 27 # IV + round key - add round key 0
+ vxor 16, 16, 27
+ vxor 17, 17, 27
+ vxor 18, 18, 27
+ vxor 19, 19, 27
+ vxor 20, 20, 27
+ vxor 21, 21, 27
+ vxor 22, 22, 27
+
+ addi 5, 5, -128
addi 11, 11, 128
- bdnz Loop_8x_block_dec
-
- vmr 30, 29
- stxvb16x 30+32, 0, 7 # update IV
-
-Loop_last_block_dec:
- cmpdi 12, 0
- beq aes_gcm_out
-
- # loop last few blocks
- li 10, 16
- divdu 10, 12, 10
-
- mtctr 10
-
- lwz 10, 240(6)
-
- cmpdi 12, 16
- blt Final_block_dec
-
-Next_rem_block_dec:
- lxvb16x 15, 0, 14 # load block
-
- Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+__LastLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state
+ lxv 32+1, 0(10) # last round key (v1)
- cmpdi 10, 10
- beq Do_next_1x_dec
+ addi 12, 12, -1
+ cmpdi 12, 0
+ bne __Loop_8x_block_enc
+
+__Finish_ghash:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
+
+ vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
+
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
+
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+30, 0, 7 # update IV
+ stxvb16x 32+0, 0, 8 # update Xi
+
+ addi 5, 5, -128
+ addi 11, 11, 128
- # 192 bits
- xxlor 24+32, 11, 11
+ #
+ # Done 8x blocks
+ #
- vcipher 15, 15, 23
- vcipher 15, 15, 24
+ cmpdi 5, 0
+ beq aes_gcm_out
- xxlor 23+32, 12, 12
+__Process_more_enc:
+ li 24, 1 # encrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
- cmpdi 10, 12
- beq Do_next_1x_dec
+ bl __Process_partial
+ cmpdi 5, 0
+ beq aes_gcm_out
+__Do_combine_enc:
+ bl __Combine_partial
+ cmpdi 5, 0
+ bgt __Process_encrypt
+ b aes_gcm_out
- # 256 bits
- xxlor 24+32, 13, 13
+SYM_FUNC_END(aes_p10_gcm_encrypt)
- vcipher 15, 15, 23
- vcipher 15, 15, 24
+################################################################################
+# aes_p10_gcm_decrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+# 8x Decrypt
+#
+################################################################################
+SYM_FUNC_START(aes_p10_gcm_decrypt)
- xxlor 23+32, 14, 14
+ cmpdi 5, 0
+ ble __Invalid_msg_len
- cmpdi 10, 14
- beq Do_next_1x_dec
+ SAVE_REGS
+ LOAD_HASH_TABLE
-Do_next_1x_dec:
- vcipherlast 15, 15, 23
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- addi 14, 14, 16
- addi 9, 9, 16
+ mr 14, 3
+ mr 9, 4
- xxlor 28+32, 15, 15
- #vmr 28, 15
- ppc_update_hash_1x
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vxor 15, 30, 19 # add round key
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
+
+ # load 9 round keys to VSR
+ lxv 0, 0(6) # round key 0
+ lxv 1, 16(6) # round key 1
+ lxv 2, 32(6) # round key 2
+ lxv 3, 48(6) # round key 3
+ lxv 4, 64(6) # round key 4
+ lxv 5, 80(6) # round key 5
+ lxv 6, 96(6) # round key 6
+ lxv 7, 112(6) # round key 7
+ lxv 8, 128(6) # round key 8
- bdnz Next_rem_block_dec
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 23, 240(6) # n rounds
+ li 24, 0 # decrypt
- li 15, 0
- std 15, 56(7) # clear partial?
- stxvb16x 30+32, 0, 7 # update IV
+__Process_decrypt:
+ #
+ # Process different blocks
+ #
+ ld 12, 56(7)
cmpdi 12, 0
- beq aes_gcm_out
-
-Final_block_dec:
- lwz 10, 240(6)
- Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_final_1x_dec
+ bgt __Do_combine_dec
+ cmpdi 5, 128
+ blt __Process_more_dec
+
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_dec:
+	# 8x blocks
+ li 10, 128
+ divdu 12, 5, 10 # n 128 bytes-blocks
- # 192 bits
- xxlor 24+32, 11, 11
+ addi 12, 12, -1 # loop - 1
- vcipher 15, 15, 23
- vcipher 15, 15, 24
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
- xxlor 23+32, 12, 12
+ # vxor state, state, w # addroundkey
+ xxlor 32+29, 0, 0
+ vxor 15, 15, 29 # IV + round key - add round key 0
+ vxor 16, 16, 29
+ vxor 17, 17, 29
+ vxor 18, 18, 29
+ vxor 19, 19, 29
+ vxor 20, 20, 29
+ vxor 21, 21, 29
+ vxor 22, 22, 29
- cmpdi 10, 12
- beq Do_final_1x_dec
+ li 15, 16
+ li 16, 32
+ li 17, 48
+ li 18, 64
+ li 19, 80
+ li 20, 96
+ li 21, 112
- # 256 bits
- xxlor 24+32, 13, 13
+ #
+	# Pre-compute the first 8 AES states and leave 1/3/5 more rounds
+ # for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
- vcipher 15, 15, 23
- vcipher 15, 15, 24
+ LOOP_8AES_STATE # process 8 AES keys
- xxlor 23+32, 14, 14
+__PreLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
+
+	cmpdi	12, 0			# Only one loop (8 blocks)
+ beq __Finish_ghash_dec
+
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+
+ addi 9, 9, 128
+
+ vmr 15, 23
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
- cmpdi 10, 14
- beq Do_final_1x_dec
+ # ghash here
+ vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
+
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
+
+ xxlor 32+15, 9, 9 # last state
+ vadduwm 15, 15, 31 # state + counter
+ vadduwm 16, 15, 31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ xxlor 32+27, 0, 0 # restore roundkey 0
+ vxor 15, 15, 27 # IV + round key - add round key 0
+ vxor 16, 16, 27
+ vxor 17, 17, 27
+ vxor 18, 18, 27
+ vxor 19, 19, 27
+ vxor 20, 20, 27
+ vxor 21, 21, 27
+ vxor 22, 22, 27
+
+ addi 5, 5, -128
+ addi 11, 11, 128
-Do_final_1x_dec:
- vcipherlast 15, 15, 23
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+__LastLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
- # check partial block
- li 21, 1 # decrypt
- ld 15, 56(7) # partial?
- cmpdi 15, 0
- beq Normal_block_dec
- bl Do_partial_block
+ addi 12, 12, -1
cmpdi 12, 0
- ble aes_gcm_out
-
- b Continue_partial_check_dec
+ bne __Loop_8x_block_dec
+
+__Finish_ghash_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
+
+ #vmr 15, 23
+ vxor 15, 23, 0
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
+
+ #vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
+
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
+
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+30, 0, 7 # update IV
+ stxvb16x 32+0, 0, 8 # update Xi
+
+ addi 5, 5, -128
+ addi 11, 11, 128
-Normal_block_dec:
- lxvb16x 15, 0, 14 # load last block
- xxlxor 47, 47, 15
+ #
+ # Done 8x blocks
+ #
- # create partial block mask
- li 15, 16
- sub 15, 15, 12 # index to the mask
+ cmpdi 5, 0
+ beq aes_gcm_out
- vspltisb 16, -1 # first 16 bytes - 0xffff...ff
- vspltisb 17, 0 # second 16 bytes - 0x0000...00
- li 10, 192
- stvx 16, 10, 1
- addi 10, 10, 16
- stvx 17, 10, 1
+__Process_more_dec:
+ li 24, 0 # decrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
- addi 10, 1, 192
- lxvb16x 16, 15, 10 # load partial block mask
- xxland 47, 47, 16
+ bl __Process_partial
+ cmpdi 5, 0
+ beq aes_gcm_out
+__Do_combine_dec:
+ bl __Combine_partial
+ cmpdi 5, 0
+ bgt __Process_decrypt
+ b aes_gcm_out
+SYM_FUNC_END(aes_p10_gcm_decrypt)
- xxland 32+28, 15, 16
- #vmr 28, 15
- ppc_update_hash_1x
+SYM_FUNC_START_LOCAL(aes_gcm_out)
- # * should store only the remaining bytes.
- bl Write_partial_block
+ mr 3, 11 # return count
- stxvb16x 30+32, 0, 7 # update IV
- std 12, 56(7) # update partial?
- li 16, 16
+ RESTORE_REGS
+ blr
- stxvb16x 32, 0, 8 # write out Xi
- stxvb16x 32, 16, 8 # write out Xi
- b aes_gcm_out
+__Invalid_msg_len:
+ li 3, 0
+ blr
+SYM_FUNC_END(aes_gcm_out)
+
+SYM_DATA_START_LOCAL(PERMX)
+.align 4
+# for vector permute and xor
+permx:
+.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3
+SYM_DATA_END(permx)
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 7b1bebed879d..3d2e38ba5240 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -363,7 +363,7 @@ config CRYPTO_CHACHA20_X86_64
- AVX-512VL (Advanced Vector Extensions-512VL)
config CRYPTO_AEGIS128_AESNI_SSE2
- tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)"
+ tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)"
depends on X86 && 64BIT
select CRYPTO_AEAD
select CRYPTO_SIMD
@@ -372,7 +372,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2
Architecture: x86_64 using:
- AES-NI (AES New Instructions)
- - SSE2 (Streaming SIMD Extensions 2)
+ - SSE4.1 (Streaming SIMD Extensions 4.1)
config CRYPTO_NHPOLY1305_SSE2
tristate "Hash functions: NHPoly1305 (SSE2)"
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index ad7f4c891625..7294dc0ee7ba 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -1,14 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * AES-NI + SSE2 implementation of AEGIS-128
+ * AES-NI + SSE4.1 implementation of AEGIS-128
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ * Copyright 2024 Google LLC
*/
#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-#include <asm/frame.h>
#define STATE0 %xmm0
#define STATE1 %xmm1
@@ -20,11 +19,6 @@
#define T0 %xmm6
#define T1 %xmm7
-#define STATEP %rdi
-#define LEN %rsi
-#define SRC %rdx
-#define DST %rcx
-
.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
@@ -34,11 +28,11 @@
.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
-.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
-.align 16
-.Laegis128_counter:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32
+.align 32
+.Lzeropad_mask:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0
.text
@@ -61,140 +55,102 @@
.endm
/*
- * __load_partial: internal ABI
- * input:
- * LEN - bytes
- * SRC - src
- * output:
- * MSG - message block
- * changed:
- * T0
- * %r8
- * %r9
+ * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register
+ * MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8.
*/
-SYM_FUNC_START_LOCAL(__load_partial)
- xor %r9d, %r9d
- pxor MSG, MSG
-
- mov LEN, %r8
- and $0x1, %r8
- jz .Lld_partial_1
-
- mov LEN, %r8
- and $0x1E, %r8
- add SRC, %r8
- mov (%r8), %r9b
-
-.Lld_partial_1:
- mov LEN, %r8
- and $0x2, %r8
- jz .Lld_partial_2
-
- mov LEN, %r8
- and $0x1C, %r8
- add SRC, %r8
- shl $0x10, %r9
- mov (%r8), %r9w
-
-.Lld_partial_2:
- mov LEN, %r8
- and $0x4, %r8
- jz .Lld_partial_4
-
- mov LEN, %r8
- and $0x18, %r8
- add SRC, %r8
- shl $32, %r9
- mov (%r8), %r8d
- xor %r8, %r9
-
-.Lld_partial_4:
- movq %r9, MSG
-
- mov LEN, %r8
- and $0x8, %r8
- jz .Lld_partial_8
-
- mov LEN, %r8
- and $0x10, %r8
- add SRC, %r8
- pslldq $8, MSG
- movq (%r8), T0
- pxor T0, MSG
-
-.Lld_partial_8:
- RET
-SYM_FUNC_END(__load_partial)
+.macro load_partial
+ sub $8, %ecx /* LEN - 8 */
+ jle .Lle8\@
+
+ /* Load 9 <= LEN <= 15 bytes: */
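+	/* e.g. LEN = 11: the first movq loads bytes 0-7; the load at SRC + 3 */
+	/* fetches bytes 3-10, and shifting right by (16 - LEN) * 8 = 40 bits */
+	/* drops the 5 overlapping bytes, so pinsrq places bytes 8-10 in the  */
+	/* upper half of MSG.                                                 */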
+ movq (SRC), MSG /* Load first 8 bytes */
+ mov (SRC, %rcx), %rax /* Load last 8 bytes */
+ neg %ecx
+ shl $3, %ecx
+ shr %cl, %rax /* Discard overlapping bytes */
+ pinsrq $1, %rax, MSG
+ jmp .Ldone\@
+
+.Lle8\@:
+ add $4, %ecx /* LEN - 4 */
+ jl .Llt4\@
+
+ /* Load 4 <= LEN <= 8 bytes: */
+ mov (SRC), %eax /* Load first 4 bytes */
+ mov (SRC, %rcx), %r8d /* Load last 4 bytes */
+ jmp .Lcombine\@
+
+.Llt4\@:
+ /* Load 1 <= LEN <= 3 bytes: */
+ add $2, %ecx /* LEN - 2 */
+ movzbl (SRC), %eax /* Load first byte */
+ jl .Lmovq\@
+ movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */
+.Lcombine\@:
+ shl $3, %ecx
+ shl %cl, %r8
+ or %r8, %rax /* Combine the two parts */
+.Lmovq\@:
+ movq %rax, MSG
+.Ldone\@:
+.endm
/*
- * __store_partial: internal ABI
- * input:
- * LEN - bytes
- * DST - dst
- * output:
- * T0 - message block
- * changed:
- * %r8
- * %r9
- * %r10
+ * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer
+ * DST. Clobbers %rax, %rcx, and %r8.
*/
-SYM_FUNC_START_LOCAL(__store_partial)
- mov LEN, %r8
- mov DST, %r9
-
- movq T0, %r10
-
- cmp $8, %r8
- jl .Lst_partial_8
-
- mov %r10, (%r9)
- psrldq $8, T0
- movq T0, %r10
-
- sub $8, %r8
- add $8, %r9
-
-.Lst_partial_8:
- cmp $4, %r8
- jl .Lst_partial_4
-
- mov %r10d, (%r9)
- shr $32, %r10
-
- sub $4, %r8
- add $4, %r9
-
-.Lst_partial_4:
- cmp $2, %r8
- jl .Lst_partial_2
-
- mov %r10w, (%r9)
- shr $0x10, %r10
-
- sub $2, %r8
- add $2, %r9
-
-.Lst_partial_2:
- cmp $1, %r8
- jl .Lst_partial_1
-
- mov %r10b, (%r9)
-
-.Lst_partial_1:
- RET
-SYM_FUNC_END(__store_partial)
+.macro store_partial msg
+ sub $8, %ecx /* LEN - 8 */
+ jl .Llt8\@
+
+ /* Store 8 <= LEN <= 15 bytes: */
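+	/* The rotate moves the LEN - 8 valid tail bytes to the top of %rax,  */
+	/* so the 8-byte store at DST + LEN - 8 lands them at DST[8..LEN-1];  */
+	/* whatever it writes below DST + 8 is then overwritten by the movq   */
+	/* of the first 8 bytes.                                              */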
+ pextrq $1, \msg, %rax
+ mov %ecx, %r8d
+ shl $3, %ecx
+ ror %cl, %rax
+ mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */
+ movq \msg, (DST) /* Store first 8 bytes */
+ jmp .Ldone\@
+
+.Llt8\@:
+ add $4, %ecx /* LEN - 4 */
+ jl .Llt4\@
+
+ /* Store 4 <= LEN <= 7 bytes: */
+ pextrd $1, \msg, %eax
+ mov %ecx, %r8d
+ shl $3, %ecx
+ ror %cl, %eax
+ mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */
+ movd \msg, (DST) /* Store first 4 bytes */
+ jmp .Ldone\@
+
+.Llt4\@:
+ /* Store 1 <= LEN <= 3 bytes: */
+ pextrb $0, \msg, 0(DST)
+ cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? */
+ jl .Ldone\@
+ pextrb $1, \msg, 1(DST)
+ je .Ldone\@
+ pextrb $2, \msg, 2(DST)
+.Ldone\@:
+.endm
/*
- * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
+ * void aegis128_aesni_init(struct aegis_state *state,
+ * const struct aegis_block *key,
+ * const u8 iv[AEGIS128_NONCE_SIZE]);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_init)
- FRAME_BEGIN
+SYM_FUNC_START(aegis128_aesni_init)
+ .set STATEP, %rdi
+ .set KEYP, %rsi
+ .set IVP, %rdx
/* load IV: */
- movdqu (%rdx), T1
+ movdqu (IVP), T1
/* load key: */
- movdqa (%rsi), KEY
+ movdqa (KEYP), KEY
pxor KEY, T1
movdqa T1, STATE0
movdqa KEY, STATE3
@@ -224,20 +180,22 @@ SYM_FUNC_START(crypto_aegis128_aesni_init)
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
-
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_init)
+SYM_FUNC_END(aegis128_aesni_init)
/*
- * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
- * const void *data);
+ * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
+ * unsigned int len);
+ *
+ * len must be a multiple of 16.
*/
-SYM_FUNC_START(crypto_aegis128_aesni_ad)
- FRAME_BEGIN
+SYM_FUNC_START(aegis128_aesni_ad)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set LEN, %edx
- cmp $0x10, LEN
- jb .Lad_out
+ test LEN, LEN
+ jz .Lad_out
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -246,89 +204,40 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
- mov SRC, %r8
- and $0xF, %r8
- jnz .Lad_u_loop
-
-.align 8
-.Lad_a_loop:
- movdqa 0x00(SRC), MSG
- aegis128_update
- pxor MSG, STATE4
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_1
-
- movdqa 0x10(SRC), MSG
- aegis128_update
- pxor MSG, STATE3
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_2
-
- movdqa 0x20(SRC), MSG
- aegis128_update
- pxor MSG, STATE2
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_3
-
- movdqa 0x30(SRC), MSG
- aegis128_update
- pxor MSG, STATE1
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_4
-
- movdqa 0x40(SRC), MSG
- aegis128_update
- pxor MSG, STATE0
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_0
-
- add $0x50, SRC
- jmp .Lad_a_loop
-
.align 8
-.Lad_u_loop:
+.Lad_loop:
movdqu 0x00(SRC), MSG
aegis128_update
pxor MSG, STATE4
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_1
+ jz .Lad_out_1
movdqu 0x10(SRC), MSG
aegis128_update
pxor MSG, STATE3
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_2
+ jz .Lad_out_2
movdqu 0x20(SRC), MSG
aegis128_update
pxor MSG, STATE2
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_3
+ jz .Lad_out_3
movdqu 0x30(SRC), MSG
aegis128_update
pxor MSG, STATE1
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_4
+ jz .Lad_out_4
movdqu 0x40(SRC), MSG
aegis128_update
pxor MSG, STATE0
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_0
+ jz .Lad_out_0
add $0x50, SRC
- jmp .Lad_u_loop
+ jmp .Lad_loop
/* store the state: */
.Lad_out_0:
@@ -337,7 +246,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_1:
@@ -346,7 +254,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_2:
@@ -355,7 +262,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE0, 0x20(STATEP)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_3:
@@ -364,7 +270,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE4, 0x20(STATEP)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_4:
@@ -373,41 +278,38 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE3, 0x20(STATEP)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
- FRAME_END
- RET
-
.Lad_out:
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_ad)
+SYM_FUNC_END(aegis128_aesni_ad)
-.macro encrypt_block a s0 s1 s2 s3 s4 i
- movdq\a (\i * 0x10)(SRC), MSG
+.macro encrypt_block s0 s1 s2 s3 s4 i
+ movdqu (\i * 0x10)(SRC), MSG
movdqa MSG, T0
pxor \s1, T0
pxor \s4, T0
movdqa \s2, T1
pand \s3, T1
pxor T1, T0
- movdq\a T0, (\i * 0x10)(DST)
+ movdqu T0, (\i * 0x10)(DST)
aegis128_update
pxor MSG, \s4
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lenc_out_\i
+ jz .Lenc_out_\i
.endm
/*
- * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst,
+ * unsigned int len);
+ *
+ * len must be nonzero and a multiple of 16.
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
- FRAME_BEGIN
-
- cmp $0x10, LEN
- jb .Lenc_out
+SYM_FUNC_START(aegis128_aesni_enc)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -416,34 +318,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
- mov SRC, %r8
- or DST, %r8
- and $0xF, %r8
- jnz .Lenc_u_loop
-
.align 8
-.Lenc_a_loop:
- encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
- encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
- encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
- encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
- encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+.Lenc_loop:
+ encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
+ encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
+ encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
+ encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
+ encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4
add $0x50, SRC
add $0x50, DST
- jmp .Lenc_a_loop
-
-.align 8
-.Lenc_u_loop:
- encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
- encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
- encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
- encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
- encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
-
- add $0x50, SRC
- add $0x50, DST
- jmp .Lenc_u_loop
+ jmp .Lenc_loop
/* store the state: */
.Lenc_out_0:
@@ -452,7 +337,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_1:
@@ -461,7 +345,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE0, 0x20(STATEP)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_2:
@@ -470,7 +353,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE4, 0x20(STATEP)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_3:
@@ -479,7 +361,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE3, 0x20(STATEP)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_4:
@@ -488,20 +369,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
- FRAME_END
- RET
-
.Lenc_out:
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_enc)
+SYM_FUNC_END(aegis128_aesni_enc)
/*
- * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src,
+ * u8 *dst, unsigned int len);
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail)
- FRAME_BEGIN
+SYM_FUNC_START(aegis128_aesni_enc_tail)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -511,7 +391,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail)
movdqu 0x40(STATEP), STATE4
/* encrypt message: */
- call __load_partial
+ mov LEN, %r9d
+ load_partial
movdqa MSG, T0
pxor STATE1, T0
@@ -520,7 +401,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail)
pand STATE3, T1
pxor T1, T0
- call __store_partial
+ mov %r9d, LEN
+ store_partial T0
aegis128_update
pxor MSG, STATE4
@@ -531,37 +413,36 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail)
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
-
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
+SYM_FUNC_END(aegis128_aesni_enc_tail)
-.macro decrypt_block a s0 s1 s2 s3 s4 i
- movdq\a (\i * 0x10)(SRC), MSG
+.macro decrypt_block s0 s1 s2 s3 s4 i
+ movdqu (\i * 0x10)(SRC), MSG
pxor \s1, MSG
pxor \s4, MSG
movdqa \s2, T1
pand \s3, T1
pxor T1, MSG
- movdq\a MSG, (\i * 0x10)(DST)
+ movdqu MSG, (\i * 0x10)(DST)
aegis128_update
pxor MSG, \s4
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Ldec_out_\i
+ jz .Ldec_out_\i
.endm
/*
- * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst,
+ * unsigned int len);
+ *
+ * len must be nonzero and a multiple of 16.
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
- FRAME_BEGIN
-
- cmp $0x10, LEN
- jb .Ldec_out
+SYM_FUNC_START(aegis128_aesni_dec)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -570,34 +451,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
- mov SRC, %r8
- or DST, %r8
- and $0xF, %r8
- jnz .Ldec_u_loop
-
.align 8
-.Ldec_a_loop:
- decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
- decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
- decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
- decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
- decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+.Ldec_loop:
+ decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
+ decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
+ decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
+ decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
+ decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4
add $0x50, SRC
add $0x50, DST
- jmp .Ldec_a_loop
-
-.align 8
-.Ldec_u_loop:
- decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
- decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
- decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
- decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
- decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
-
- add $0x50, SRC
- add $0x50, DST
- jmp .Ldec_u_loop
+ jmp .Ldec_loop
/* store the state: */
.Ldec_out_0:
@@ -606,7 +470,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_1:
@@ -615,7 +478,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE0, 0x20(STATEP)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_2:
@@ -624,7 +486,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE4, 0x20(STATEP)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_3:
@@ -633,7 +494,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE3, 0x20(STATEP)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_4:
@@ -642,20 +502,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
- FRAME_END
- RET
-
.Ldec_out:
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_dec)
+SYM_FUNC_END(aegis128_aesni_dec)
/*
- * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src,
+ * u8 *dst, unsigned int len);
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
- FRAME_BEGIN
+SYM_FUNC_START(aegis128_aesni_dec_tail)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -665,7 +524,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
movdqu 0x40(STATEP), STATE4
/* decrypt message: */
- call __load_partial
+ mov LEN, %r9d
+ load_partial
pxor STATE1, MSG
pxor STATE4, MSG
@@ -673,17 +533,13 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
pand STATE3, T1
pxor T1, MSG
- movdqa MSG, T0
- call __store_partial
+ mov %r9d, LEN
+ store_partial MSG
/* mask with byte count: */
- movq LEN, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- movdqa .Laegis128_counter(%rip), T1
- pcmpgtb T1, T0
+ lea .Lzeropad_mask+16(%rip), %rax
+ sub %r9, %rax
+ movdqu (%rax), T0
pand T0, MSG
aegis128_update
@@ -695,17 +551,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
-
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)
+SYM_FUNC_END(aegis128_aesni_dec_tail)
/*
- * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
- * u64 assoclen, u64 cryptlen);
+ * void aegis128_aesni_final(struct aegis_state *state,
+ * struct aegis_block *tag_xor,
+ * unsigned int assoclen, unsigned int cryptlen);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_final)
- FRAME_BEGIN
+SYM_FUNC_START(aegis128_aesni_final)
+ .set STATEP, %rdi
+ .set TAG_XOR, %rsi
+ .set ASSOCLEN, %edx
+ .set CRYPTLEN, %ecx
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -715,10 +573,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
movdqu 0x40(STATEP), STATE4
/* prepare length block: */
- movq %rdx, MSG
- movq %rcx, T0
- pslldq $8, T0
- pxor T0, MSG
+ movd ASSOCLEN, MSG
+ pinsrd $2, CRYPTLEN, MSG
psllq $3, MSG /* multiply by 8 (to get bit count) */
pxor STATE3, MSG
@@ -733,7 +589,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
aegis128_update; pxor MSG, STATE3
/* xor tag: */
- movdqu (%rsi), MSG
+ movdqu (TAG_XOR), MSG
pxor STATE0, MSG
pxor STATE1, MSG
@@ -741,8 +597,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
pxor STATE3, MSG
pxor STATE4, MSG
- movdqu MSG, (%rsi)
-
- FRAME_END
+ movdqu MSG, (TAG_XOR)
RET
-SYM_FUNC_END(crypto_aegis128_aesni_final)
+SYM_FUNC_END(aegis128_aesni_final)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 4623189000d8..c19d8e3d96a3 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* The AEGIS-128 Authenticated-Encryption Algorithm
- * Glue for AES-NI + SSE2 implementation
+ * Glue for AES-NI + SSE4.1 implementation
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
@@ -23,27 +23,6 @@
#define AEGIS128_MIN_AUTH_SIZE 8
#define AEGIS128_MAX_AUTH_SIZE 16
-asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv);
-
-asmlinkage void crypto_aegis128_aesni_ad(
- void *state, unsigned int length, const void *data);
-
-asmlinkage void crypto_aegis128_aesni_enc(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_dec(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_enc_tail(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_dec_tail(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_final(
- void *state, void *tag_xor, unsigned int cryptlen,
- unsigned int assoclen);
-
struct aegis_block {
u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN);
};
@@ -56,15 +35,31 @@ struct aegis_ctx {
struct aegis_block key;
};
-struct aegis_crypt_ops {
- int (*skcipher_walk_init)(struct skcipher_walk *walk,
- struct aead_request *req, bool atomic);
+asmlinkage void aegis128_aesni_init(struct aegis_state *state,
+ const struct aegis_block *key,
+ const u8 iv[AEGIS128_NONCE_SIZE]);
- void (*crypt_blocks)(void *state, unsigned int length, const void *src,
- void *dst);
- void (*crypt_tail)(void *state, unsigned int length, const void *src,
- void *dst);
-};
+asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src,
+ u8 *dst, unsigned int len);
+
+asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src,
+ u8 *dst, unsigned int len);
+
+asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state,
+ const u8 *src, u8 *dst,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state,
+ const u8 *src, u8 *dst,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_final(struct aegis_state *state,
+ struct aegis_block *tag_xor,
+ unsigned int assoclen,
+ unsigned int cryptlen);
static void crypto_aegis128_aesni_process_ad(
struct aegis_state *state, struct scatterlist *sg_src,
@@ -85,16 +80,15 @@ static void crypto_aegis128_aesni_process_ad(
if (pos > 0) {
unsigned int fill = AEGIS128_BLOCK_SIZE - pos;
memcpy(buf.bytes + pos, src, fill);
- crypto_aegis128_aesni_ad(state,
- AEGIS128_BLOCK_SIZE,
- buf.bytes);
+ aegis128_aesni_ad(state, buf.bytes,
+ AEGIS128_BLOCK_SIZE);
pos = 0;
left -= fill;
src += fill;
}
- crypto_aegis128_aesni_ad(state, left, src);
-
+ aegis128_aesni_ad(state, src,
+ left & ~(AEGIS128_BLOCK_SIZE - 1));
src += left & ~(AEGIS128_BLOCK_SIZE - 1);
left &= AEGIS128_BLOCK_SIZE - 1;
}
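The rounding to whole 16-byte blocks now happens on the C side before calling the asm helper. A quick worked example of the mask arithmetic (not part of the patch):

	#include <assert.h>

	#define AEGIS128_BLOCK_SIZE 16

	int main(void)
	{
		unsigned int left = 53;

		/* 53 & ~15 == 48: three full blocks go to aegis128_aesni_ad() */
		assert((left & ~(AEGIS128_BLOCK_SIZE - 1)) == 48);
		/* 53 & 15 == 5: leftover bytes are buffered for the next call */
		assert((left & (AEGIS128_BLOCK_SIZE - 1)) == 5);
		return 0;
	}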
@@ -110,24 +104,37 @@ static void crypto_aegis128_aesni_process_ad(
if (pos > 0) {
memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
- crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes);
+ aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE);
}
}
-static void crypto_aegis128_aesni_process_crypt(
- struct aegis_state *state, struct skcipher_walk *walk,
- const struct aegis_crypt_ops *ops)
+static __always_inline void
+crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
+ struct skcipher_walk *walk, bool enc)
{
while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
- ops->crypt_blocks(state,
- round_down(walk->nbytes, AEGIS128_BLOCK_SIZE),
- walk->src.virt.addr, walk->dst.virt.addr);
+ if (enc)
+ aegis128_aesni_enc(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes,
+ AEGIS128_BLOCK_SIZE));
+ else
+ aegis128_aesni_dec(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes,
+ AEGIS128_BLOCK_SIZE));
skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
}
if (walk->nbytes) {
- ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
- walk->dst.virt.addr);
+ if (enc)
+ aegis128_aesni_enc_tail(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ walk->nbytes);
+ else
+ aegis128_aesni_dec_tail(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ walk->nbytes);
skcipher_walk_done(walk, 0);
}
}
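With the aegis_crypt_ops table gone, encrypt/decrypt selection becomes a compile-time-constant bool passed to an __always_inline helper, so the compiler emits two specialized copies with direct calls instead of indirect calls through function pointers. A self-contained user-space sketch of the pattern (helper names are invented; the XOR bodies are stand-ins for the real SIMD routines):

	#include <stdbool.h>
	#include <stddef.h>

	#define __always_inline inline __attribute__((__always_inline__))

	static void enc_blocks(unsigned char *dst, const unsigned char *src,
			       size_t n)
	{
		for (size_t i = 0; i < n; i++)
			dst[i] = src[i] ^ 0xaa;	/* stand-in for the enc routine */
	}

	static void dec_blocks(unsigned char *dst, const unsigned char *src,
			       size_t n)
	{
		for (size_t i = 0; i < n; i++)
			dst[i] = src[i] ^ 0xaa;	/* stand-in for the dec routine */
	}

	static __always_inline void crypt(bool enc, unsigned char *dst,
					  const unsigned char *src, size_t n)
	{
		if (enc)
			enc_blocks(dst, src, n);
		else
			dec_blocks(dst, src, n);
	}

	/* Each caller passes a constant, so the branch folds away after inlining. */
	void do_encrypt(unsigned char *dst, const unsigned char *src, size_t n)
	{
		crypt(true, dst, src, n);
	}

	void do_decrypt(unsigned char *dst, const unsigned char *src, size_t n)
	{
		crypt(false, dst, src, n);
	}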
@@ -162,42 +169,39 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
return 0;
}
-static void crypto_aegis128_aesni_crypt(struct aead_request *req,
- struct aegis_block *tag_xor,
- unsigned int cryptlen,
- const struct aegis_crypt_ops *ops)
+static __always_inline void
+crypto_aegis128_aesni_crypt(struct aead_request *req,
+ struct aegis_block *tag_xor,
+ unsigned int cryptlen, bool enc)
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
struct skcipher_walk walk;
struct aegis_state state;
- ops->skcipher_walk_init(&walk, req, true);
+ if (enc)
+ skcipher_walk_aead_encrypt(&walk, req, true);
+ else
+ skcipher_walk_aead_decrypt(&walk, req, true);
kernel_fpu_begin();
- crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
+ aegis128_aesni_init(&state, &ctx->key, req->iv);
crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
- crypto_aegis128_aesni_process_crypt(&state, &walk, ops);
- crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+ crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
+ aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
}
static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
{
- static const struct aegis_crypt_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_encrypt,
- .crypt_blocks = crypto_aegis128_aesni_enc,
- .crypt_tail = crypto_aegis128_aesni_enc_tail,
- };
-
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_block tag = {};
unsigned int authsize = crypto_aead_authsize(tfm);
unsigned int cryptlen = req->cryptlen;
- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true);
scatterwalk_map_and_copy(tag.bytes, req->dst,
req->assoclen + cryptlen, authsize, 1);
@@ -208,12 +212,6 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
{
static const struct aegis_block zeros = {};
- static const struct aegis_crypt_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_decrypt,
- .crypt_blocks = crypto_aegis128_aesni_dec,
- .crypt_tail = crypto_aegis128_aesni_dec_tail,
- };
-
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_block tag;
unsigned int authsize = crypto_aead_authsize(tfm);
@@ -222,27 +220,16 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
scatterwalk_map_and_copy(tag.bytes, req->src,
req->assoclen + cryptlen, authsize, 0);
- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false);
return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
}
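The comparison against a block of zeros works because aegis128_aesni_final() XORs the computed tag into the expected tag copied out of the request, so a correct tag leaves only zero bytes behind. A sketch of a constant-time check over that result (crypto_memneq is the real helper; this only illustrates the idea):

	#include <stddef.h>
	#include <stdint.h>

	int tag_mismatch(const uint8_t *xored_tag, size_t authsize)
	{
		uint8_t acc = 0;
		size_t i;

		for (i = 0; i < authsize; i++)
			acc |= xored_tag[i];	/* no data-dependent early exit */
		return acc != 0;		/* nonzero means -EBADMSG */
	}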
-static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead)
-{
- return 0;
-}
-
-static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
-{
-}
-
static struct aead_alg crypto_aegis128_aesni_alg = {
.setkey = crypto_aegis128_aesni_setkey,
.setauthsize = crypto_aegis128_aesni_setauthsize,
.encrypt = crypto_aegis128_aesni_encrypt,
.decrypt = crypto_aegis128_aesni_decrypt,
- .init = crypto_aegis128_aesni_init_tfm,
- .exit = crypto_aegis128_aesni_exit_tfm,
.ivsize = AEGIS128_NONCE_SIZE,
.maxauthsize = AEGIS128_MAX_AUTH_SIZE,
@@ -267,7 +254,7 @@ static struct simd_aead_alg *simd_alg;
static int __init crypto_aegis128_aesni_module_init(void)
{
- if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ if (!boot_cpu_has(X86_FEATURE_XMM4_1) ||
!boot_cpu_has(X86_FEATURE_AES) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
@@ -286,6 +273,6 @@ module_exit(crypto_aegis128_aesni_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
-MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation");
MODULE_ALIAS_CRYPTO("aegis128");
MODULE_ALIAS_CRYPTO("aegis128-aesni");
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index b0dd83555499..fbf43482e1f5 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1747,7 +1747,7 @@ static void __exit aesni_exit(void)
unregister_avx_algs();
}
-late_initcall(aesni_init);
+module_init(aesni_init);
module_exit(aesni_exit);
MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions");
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index b4e460a87f18..fb95a614249d 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -487,79 +487,3 @@ SYM_FUNC_START(cast5_cbc_dec_16way)
FRAME_END
RET;
SYM_FUNC_END(cast5_cbc_dec_16way)
-
-SYM_FUNC_START(cast5_ctr_16way)
- /* input:
- * %rdi: ctx
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (big endian, 64bit)
- */
- FRAME_BEGIN
- pushq %r12;
- pushq %r15;
-
- movq %rdi, CTX;
- movq %rsi, %r11;
- movq %rdx, %r12;
-
- vpcmpeqd RTMP, RTMP, RTMP;
- vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
-
- vpcmpeqd RKR, RKR, RKR;
- vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
- vmovdqa .Lbswap_iv_mask(%rip), R1ST;
- vmovdqa .Lbswap128_mask(%rip), RKM;
-
- /* load IV and byteswap */
- vmovq (%rcx), RX;
- vpshufb R1ST, RX, RX;
-
- /* construct IVs */
- vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
- vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
-
- /* store last IV */
- vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
- vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
- vmovq RX, (%rcx);
-
- call __cast5_enc_blk16;
-
- /* dst = src ^ iv */
- vpxor (0*16)(%r12), RR1, RR1;
- vpxor (1*16)(%r12), RL1, RL1;
- vpxor (2*16)(%r12), RR2, RR2;
- vpxor (3*16)(%r12), RL2, RL2;
- vpxor (4*16)(%r12), RR3, RR3;
- vpxor (5*16)(%r12), RL3, RL3;
- vpxor (6*16)(%r12), RR4, RR4;
- vpxor (7*16)(%r12), RL4, RL4;
- vmovdqu RR1, (0*16)(%r11);
- vmovdqu RL1, (1*16)(%r11);
- vmovdqu RR2, (2*16)(%r11);
- vmovdqu RL2, (3*16)(%r11);
- vmovdqu RR3, (4*16)(%r11);
- vmovdqu RL3, (5*16)(%r11);
- vmovdqu RR4, (6*16)(%r11);
- vmovdqu RL4, (7*16)(%r11);
-
- popq %r15;
- popq %r12;
- FRAME_END
- RET;
-SYM_FUNC_END(cast5_ctr_16way)
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index feccb5254c7e..52c5d47ef5a1 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -41,7 +41,7 @@
*/
#define CRC32C_PCL_BREAKEVEN 512
-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
+asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
unsigned int crc_init);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index bbcff1fb78cb..752812bc4991 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -7,6 +7,7 @@
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
*
* Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
*
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
@@ -44,185 +45,129 @@
*/
#include <linux/linkage.h>
-#include <asm/nospec-branch.h>
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
-.macro LABEL prefix n
-.L\prefix\n\():
-.endm
-
-.macro JMPTBL_ENTRY i
-.quad .Lcrc_\i
-.endm
-
-.macro JNC_LESS_THAN j
- jnc .Lless_than_\j
-.endm
-
-# Define threshold where buffers are considered "small" and routed to more
-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
-# SMALL_SIZE can be no larger than 255.
-
+# Define threshold below which buffers are considered "small" and routed to
+# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
-.if (SMALL_SIZE > 255)
-.error "SMALL_ SIZE must be < 256"
-.endif
-
-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
+# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
.text
SYM_FUNC_START(crc_pcl)
-#define bufp rdi
-#define bufp_dw %edi
-#define bufp_w %di
-#define bufp_b %dil
-#define bufptmp %rcx
-#define block_0 %rcx
-#define block_1 %rdx
-#define block_2 %r11
-#define len %rsi
-#define len_dw %esi
-#define len_w %si
-#define len_b %sil
-#define crc_init_arg %rdx
-#define tmp %rbx
-#define crc_init %r8
-#define crc_init_dw %r8d
-#define crc1 %r9
-#define crc2 %r10
-
- pushq %rbx
- pushq %rdi
- pushq %rsi
-
- ## Move crc_init for Linux to a different
- mov crc_init_arg, crc_init
+#define bufp %rdi
+#define bufp_d %edi
+#define len %esi
+#define crc_init %edx
+#define crc_init_q %rdx
+#define n_misaligned %ecx /* overlaps chunk_bytes! */
+#define n_misaligned_q %rcx
+#define chunk_bytes %ecx /* overlaps n_misaligned! */
+#define chunk_bytes_q %rcx
+#define crc1 %r8
+#define crc2 %r9
+
+ cmp $SMALL_SIZE, len
+ jb .Lsmall
################################################################
## 1) ALIGN:
################################################################
-
- mov %bufp, bufptmp # rdi = *buf
- neg %bufp
- and $7, %bufp # calculate the unalignment amount of
+ mov bufp_d, n_misaligned
+ neg n_misaligned
+ and $7, n_misaligned # calculate the misalignment amount of
# the address
- je .Lproc_block # Skip if aligned
-
- ## If len is less than 8 and we're unaligned, we need to jump
- ## to special code to avoid reading beyond the end of the buffer
- cmp $8, len
- jae .Ldo_align
- # less_than_8 expects length in upper 3 bits of len_dw
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
- shl $32-3+1, len_dw
- jmp .Lless_than_8_post_shl1
+ je .Laligned # Skip if aligned
+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align
+ # the remaining data to an 8-byte boundary.
.Ldo_align:
- #### Calculate CRC of unaligned bytes of the buffer (if any)
- movq (bufptmp), tmp # load a quadword from the buffer
- add %bufp, bufptmp # align buffer pointer for quadword
- # processing
- sub %bufp, len # update buffer length
+ movq (bufp), %rax
+ add n_misaligned_q, bufp
+ sub n_misaligned, len
.Lalign_loop:
- crc32b %bl, crc_init_dw # compute crc32 of 1-byte
- shr $8, tmp # get next byte
- dec %bufp
+ crc32b %al, crc_init # compute crc32 of 1-byte
+ shr $8, %rax # get next byte
+ dec n_misaligned
jne .Lalign_loop
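The neg/and $7 pair computes how many leading bytes must be consumed one at a time before bufp reaches an 8-byte boundary. The same arithmetic as a small C helper (illustration only):

	#include <stdint.h>

	/* Bytes needed to reach the next 8-byte boundary; 0 if already aligned. */
	unsigned int misalignment(const void *p)
	{
		return (unsigned int)(-(uintptr_t)p & 7);
	}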
-
-.Lproc_block:
+.Laligned:
################################################################
- ## 2) PROCESS BLOCKS:
+ ## 2) PROCESS BLOCK:
################################################################
- ## compute num of bytes to be processed
- movq len, tmp # save num bytes in tmp
-
- cmpq $128*24, len
+ cmp $128*24, len
jae .Lfull_block
-.Lcontinue_block:
- cmpq $SMALL_SIZE, len
- jb .Lsmall
-
- ## len < 128*24
- movq $2731, %rax # 2731 = ceil(2^16 / 24)
- mul len_dw
- shrq $16, %rax
-
- ## eax contains floor(bytes / 24) = num 24-byte chunks to do
-
- ## process rax 24-byte chunks (128 >= rax >= 0)
-
- ## compute end address of each block
- ## block 0 (base addr + RAX * 8)
- ## block 1 (base addr + RAX * 16)
- ## block 2 (base addr + RAX * 24)
- lea (bufptmp, %rax, 8), block_0
- lea (block_0, %rax, 8), block_1
- lea (block_1, %rax, 8), block_2
+.Lpartial_block:
+ # Compute floor(len / 24) to get num qwords to process from each lane.
+ imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
+ shr $16, %eax
+ jmp .Lcrc_3lanes
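The imul/shr pair divides by 24 without a div instruction: 2731 = ceil(2^16 / 24), and for every length that can reach .Lpartial_block (len < 128*24) the rounding error stays below one, so the shifted product equals floor(len / 24). A user-space check of that identity:

	#include <assert.h>

	int main(void)
	{
		unsigned int len;

		for (len = 0; len < 128 * 24; len++)
			assert(((len * 2731) >> 16) == len / 24);
		return 0;
	}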
- xor crc1, crc1
- xor crc2, crc2
-
- ## branch into array
- leaq jump_table(%rip), %bufp
- mov (%bufp,%rax,8), %bufp
- JMP_NOSPEC bufp
-
- ################################################################
- ## 2a) PROCESS FULL BLOCKS:
- ################################################################
.Lfull_block:
- movl $128,%eax
- lea 128*8*2(block_0), block_1
- lea 128*8*3(block_0), block_2
- add $128*8*1, block_0
-
- xor crc1,crc1
- xor crc2,crc2
-
- # Fall through into top of crc array (crc_128)
+ # Processing 128 qwords from each lane.
+ mov $128, %eax
################################################################
- ## 3) CRC Array:
+ ## 3) CRC each of three lanes:
################################################################
- i=128
-.rept 128-1
-.altmacro
-LABEL crc_ %i
-.noaltmacro
- ENDBR
- crc32q -i*8(block_0), crc_init
- crc32q -i*8(block_1), crc1
- crc32q -i*8(block_2), crc2
- i=(i-1)
-.endr
-
-.altmacro
-LABEL crc_ %i
-.noaltmacro
- ENDBR
- crc32q -i*8(block_0), crc_init
- crc32q -i*8(block_1), crc1
-# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
-
- mov block_2, block_0
+.Lcrc_3lanes:
+ xor crc1,crc1
+ xor crc2,crc2
+ mov %eax, chunk_bytes
+ shl $3, chunk_bytes # num bytes to process from each lane
+ sub $5, %eax # 4 for 4x_loop, 1 for special last iter
+ jl .Lcrc_3lanes_4x_done
+
+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop
+ # bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+ crc32q (bufp), crc_init_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ crc32q 8(bufp), crc_init_q
+ crc32q 8(bufp,chunk_bytes_q), crc1
+ crc32q 8(bufp,chunk_bytes_q,2), crc2
+ crc32q 16(bufp), crc_init_q
+ crc32q 16(bufp,chunk_bytes_q), crc1
+ crc32q 16(bufp,chunk_bytes_q,2), crc2
+ crc32q 24(bufp), crc_init_q
+ crc32q 24(bufp,chunk_bytes_q), crc1
+ crc32q 24(bufp,chunk_bytes_q,2), crc2
+ add $32, bufp
+ sub $4, %eax
+ jge .Lcrc_3lanes_4x_loop
+
+.Lcrc_3lanes_4x_done:
+ add $4, %eax
+ jz .Lcrc_3lanes_last_qword
+
+.Lcrc_3lanes_1x_loop:
+ crc32q (bufp), crc_init_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ add $8, bufp
+ dec %eax
+ jnz .Lcrc_3lanes_1x_loop
+
+.Lcrc_3lanes_last_qword:
+ crc32q (bufp), crc_init_q
+ crc32q (bufp,chunk_bytes_q), crc1
+# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
################################################################
## 4) Combine three results:
################################################################
- lea (K_table-8)(%rip), %bufp # first entry is for idx 1
- shlq $3, %rax # rax *= 8
- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
- leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
- subq %rax, tmp # tmp -= rax*24
+ lea (K_table-8)(%rip), %rax # first entry is for idx 1
+ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
+ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+ sub %eax, len # len -= chunk_bytes * 3
- movq crc_init, %xmm1 # CRC for block 1
+ movq crc_init_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
@@ -230,103 +175,54 @@ LABEL crc_ %i
pxor %xmm2,%xmm1
movq %xmm1, %rax
- xor -i*8(block_2), %rax
- mov crc2, crc_init
- crc32 %rax, crc_init
+ xor (bufp,chunk_bytes_q,2), %rax
+ mov crc2, crc_init_q
+ crc32 %rax, crc_init_q
+ lea 8(bufp,chunk_bytes_q,2), bufp
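Step 4 folds the three per-lane CRCs together: a lane's CRC must first be advanced by x^(8*chunk_bytes) mod P, which the pclmulqdq constants from K_table do in one multiply, before it can be XORed into the next lane. The sketch below (not the kernel code) demonstrates the underlying identity with two lanes, using a slow bitwise CRC and zero-byte feeding in place of the carryless multiply:

	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>

	#define CRC32C_POLY_LE 0x82f63b78u	/* bit-reflected CRC-32C polynomial */

	static uint32_t crc32c_update(uint32_t crc, const uint8_t *p, size_t len)
	{
		while (len--) {
			crc ^= *p++;
			for (int k = 0; k < 8; k++)
				crc = (crc >> 1) ^ ((crc & 1) ? CRC32C_POLY_LE : 0);
		}
		return crc;
	}

	int main(void)
	{
		uint8_t msg[48], zeros[24] = { 0 };
		uint32_t whole, lane_a, lane_b;
		size_t i;

		for (i = 0; i < sizeof(msg); i++)
			msg[i] = (uint8_t)(i * 37 + 1);

		whole  = crc32c_update(0xffffffffu, msg, 48);
		lane_a = crc32c_update(0xffffffffu, msg, 24);	/* first lane  */
		lane_b = crc32c_update(0, msg + 24, 24);	/* second lane */

		/* Advance lane_a past 24 bytes of data, then fold in lane_b. */
		assert(whole == (crc32c_update(lane_a, zeros, 24) ^ lane_b));
		return 0;
	}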
################################################################
- ## 5) Check for end:
+ ## 5) If more blocks remain, goto (2):
################################################################
-LABEL crc_ 0
- ENDBR
- mov tmp, len
- cmp $128*24, tmp
- jae .Lfull_block
- cmp $24, tmp
- jae .Lcontinue_block
-
-.Lless_than_24:
- shl $32-4, len_dw # less_than_16 expects length
- # in upper 4 bits of len_dw
- jnc .Lless_than_16
- crc32q (bufptmp), crc_init
- crc32q 8(bufptmp), crc_init
- jz .Ldo_return
- add $16, bufptmp
- # len is less than 8 if we got here
- # less_than_8 expects length in upper 3 bits of len_dw
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
- shl $2, len_dw
- jmp .Lless_than_8_post_shl1
+ cmp $128*24, len
+ jae .Lfull_block
+ cmp $SMALL_SIZE, len
+ jae .Lpartial_block
#######################################################################
- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
+ ## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
- shl $32-8, len_dw # Prepare len_dw for less_than_256
- j=256
-.rept 5 # j = {256, 128, 64, 32, 16}
-.altmacro
-LABEL less_than_ %j # less_than_j: Length should be in
- # upper lg(j) bits of len_dw
- j=(j/2)
- shl $1, len_dw # Get next MSB
- JNC_LESS_THAN %j
-.noaltmacro
- i=0
-.rept (j/8)
- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
- i=i+8
-.endr
- jz .Ldo_return # Return if remaining length is zero
- add $j, bufptmp # Advance buf
-.endr
-
-.Lless_than_8: # Length should be stored in
- # upper 3 bits of len_dw
- shl $1, len_dw
-.Lless_than_8_post_shl1:
- jnc .Lless_than_4
- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
- jz .Ldo_return # return if remaining data is zero
- add $4, bufptmp
-.Lless_than_4: # Length should be stored in
- # upper 2 bits of len_dw
- shl $1, len_dw
- jnc .Lless_than_2
- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
- jz .Ldo_return # return if remaining data is zero
- add $2, bufptmp
-.Lless_than_2: # Length should be stored in the MSB
- # of len_dw
- shl $1, len_dw
- jnc .Lless_than_1
- crc32b (bufptmp), crc_init_dw # CRC of 1 byte
-.Lless_than_1: # Length should be zero
-.Ldo_return:
- movq crc_init, %rax
- popq %rsi
- popq %rdi
- popq %rbx
+ test len, len
+ jz .Ldone
+ mov len, %eax
+ shr $3, %eax
+ jz .Ldo_dword
+.Ldo_qwords:
+ crc32q (bufp), crc_init_q
+ add $8, bufp
+ dec %eax
+ jnz .Ldo_qwords
+.Ldo_dword:
+ test $4, len
+ jz .Ldo_word
+ crc32l (bufp), crc_init
+ add $4, bufp
+.Ldo_word:
+ test $2, len
+ jz .Ldo_byte
+ crc32w (bufp), crc_init
+ add $2, bufp
+.Ldo_byte:
+ test $1, len
+ jz .Ldone
+ crc32b (bufp), crc_init
+.Ldone:
+ mov crc_init, %eax
RET
SYM_FUNC_END(crc_pcl)
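For reference, the new .Lsmall path in C-like form (an illustration, with a slow bitwise CRC standing in for the crc32q/crc32l/crc32w/crc32b instructions): whole qwords are consumed first, then bits 2, 1 and 0 of the remaining length select at most one 4-, 2- and 1-byte step each:

	#include <stdint.h>

	#define CRC32C_POLY_LE 0x82f63b78u

	/* Stand-in for the hardware crc32 instructions. */
	static uint32_t crc32c_bytes(uint32_t crc, const uint8_t *p, unsigned int n)
	{
		while (n--) {
			crc ^= *p++;
			for (int k = 0; k < 8; k++)
				crc = (crc >> 1) ^ ((crc & 1) ? CRC32C_POLY_LE : 0);
		}
		return crc;
	}

	uint32_t crc_small(uint32_t crc, const uint8_t *p, unsigned int len)
	{
		for (unsigned int n = len / 8; n; n--, p += 8)
			crc = crc32c_bytes(crc, p, 8);			/* crc32q */
		if (len & 4) { crc = crc32c_bytes(crc, p, 4); p += 4; }	/* crc32l */
		if (len & 2) { crc = crc32c_bytes(crc, p, 2); p += 2; }	/* crc32w */
		if (len & 1)
			crc = crc32c_bytes(crc, p, 1);			/* crc32b */
		return crc;
	}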
.section .rodata, "a", @progbits
- ################################################################
- ## jump table Table is 129 entries x 2 bytes each
- ################################################################
-.align 4
-jump_table:
- i=0
-.rept 129
-.altmacro
-JMPTBL_ENTRY %i
-.noaltmacro
- i=i+1
-.endr
-
-
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each