Diffstat (limited to 'arch/powerpc/crypto')
-rw-r--r--  arch/powerpc/crypto/Kconfig            |   17
-rw-r--r--  arch/powerpc/crypto/Makefile           |   13
-rw-r--r--  arch/powerpc/crypto/aes-gcm-p10-glue.c |  343
-rw-r--r--  arch/powerpc/crypto/aes-gcm-p10.S      | 1521
-rw-r--r--  arch/powerpc/crypto/aesp8-ppc.pl       |  585
-rw-r--r--  arch/powerpc/crypto/ghashp8-ppc.pl     |  370
-rw-r--r--  arch/powerpc/crypto/ppc-xlate.pl       |  229
7 files changed, 3078 insertions, 0 deletions
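
The patch below registers a new AEAD transform under the generic name "gcm(aes)" (driver name "aes_gcm_p10", priority 2100). For orientation only — this sketch is not part of the patch — the following shows roughly how a kernel caller would reach the driver through the standard AEAD API; the buffer sizes, the all-zero key and IV, and the function name are made up for illustration:

/* Illustrative-only AEAD usage sketch; not part of this patch. */
#include <crypto/aead.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

static int example_gcm_encrypt(void)
{
	struct crypto_aead *tfm;
	struct aead_request *req = NULL;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	u8 key[32] = { 0 };	/* example 256-bit key (all zero, illustration only) */
	u8 iv[12] = { 0 };	/* 96-bit GCM IV */
	u8 *buf = NULL;
	int err;

	/* Resolves to aes_gcm_p10 on P10 when it is the highest-priority gcm(aes). */
	tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, sizeof(key));
	if (!err)
		err = crypto_aead_setauthsize(tfm, 16);
	if (err)
		goto out_tfm;

	/* In-place buffer: 16 bytes AAD + 64 bytes plaintext + 16 bytes tag. */
	buf = kzalloc(16 + 64 + 16, GFP_KERNEL);
	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!buf || !req) {
		err = -ENOMEM;
		goto out_free;
	}

	sg_init_one(&sg, buf, 16 + 64 + 16);
	aead_request_set_callback(req,
				  CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
				  crypto_req_done, &wait);
	aead_request_set_ad(req, 16);			/* first 16 bytes are AAD */
	aead_request_set_crypt(req, &sg, &sg, 64, iv);	/* encrypt 64 bytes; tag appended */

	err = crypto_wait_req(crypto_aead_encrypt(req), &wait);

out_free:
	aead_request_free(req);
	kfree(buf);
out_tfm:
	crypto_free_aead(tfm);
	return err;
}
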
| diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index c1b964447401..7113f9355165 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -94,4 +94,21 @@ config CRYPTO_AES_PPC_SPE  	  architecture specific assembler implementations that work on 1KB  	  tables or 256 bytes S-boxes. +config CRYPTO_AES_GCM_P10 +	tristate "Stitched AES/GCM acceleration support on P10 or later CPU (PPC)" +	depends on PPC64 && CPU_LITTLE_ENDIAN +	select CRYPTO_LIB_AES +	select CRYPTO_ALGAPI +	select CRYPTO_AEAD +	default m +	help +	  AEAD cipher: AES cipher algorithms (FIPS-197) +	  GCM (Galois/Counter Mode) authenticated encryption mode (NIST SP800-38D) +	  Architecture: powerpc64 using: +	    - little-endian +	    - Power10 or later features + +	  Support for cryptographic acceleration instructions on Power10 or +	  later CPU. This module supports stitched acceleration for AES/GCM. +  endmenu diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 4808d97fede5..05c7486f42c5 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o  obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o  obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o  obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o +obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o  aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o  md5-ppc-y := md5-asm.o md5-glue.o @@ -21,3 +22,15 @@ sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o  sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o  crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o  crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o +aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp8-ppc.o aesp8-ppc.o + +quiet_cmd_perl = PERL    $@ +      cmd_perl = $(PERL) $< $(if $(CONFIG_CPU_LITTLE_ENDIAN), linux-ppc64le, linux-ppc64) > $@ + +targets += aesp8-ppc.S ghashp8-ppc.S + +$(obj)/aesp8-ppc.S $(obj)/ghashp8-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE +	$(call if_changed,perl) + +OBJECT_FILES_NON_STANDARD_aesp8-ppc.o := y +OBJECT_FILES_NON_STANDARD_ghashp8-ppc.o := y diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c new file mode 100644 index 000000000000..bd3475f5348d --- /dev/null +++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Glue code for accelerated AES-GCM stitched implementation for ppc64le. + * + * Copyright 2022- IBM Inc. 
All rights reserved + */ + +#include <asm/unaligned.h> +#include <asm/simd.h> +#include <asm/switch_to.h> +#include <crypto/aes.h> +#include <crypto/algapi.h> +#include <crypto/b128ops.h> +#include <crypto/gf128mul.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> +#include <linux/types.h> + +#define	PPC_ALIGN		16 +#define GCM_IV_SIZE		12 + +MODULE_DESCRIPTION("PPC64le AES-GCM with Stitched implementation"); +MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("aes"); + +asmlinkage int aes_p8_set_encrypt_key(const u8 *userKey, const int bits, +				      void *key); +asmlinkage void aes_p8_encrypt(const u8 *in, u8 *out, const void *key); +asmlinkage void aes_p10_gcm_encrypt(u8 *in, u8 *out, size_t len, +				    void *rkey, u8 *iv, void *Xi); +asmlinkage void aes_p10_gcm_decrypt(u8 *in, u8 *out, size_t len, +				    void *rkey, u8 *iv, void *Xi); +asmlinkage void gcm_init_htable(unsigned char htable[256], unsigned char Xi[16]); +asmlinkage void gcm_ghash_p8(unsigned char *Xi, unsigned char *Htable, +		unsigned char *aad, unsigned int alen); + +struct aes_key { +	u8 key[AES_MAX_KEYLENGTH]; +	u64 rounds; +}; + +struct gcm_ctx { +	u8 iv[16]; +	u8 ivtag[16]; +	u8 aad_hash[16]; +	u64 aadLen; +	u64 Plen;	/* offset 56 - used in aes_p10_gcm_{en/de}crypt */ +}; +struct Hash_ctx { +	u8 H[16];	/* subkey */ +	u8 Htable[256];	/* Xi, Hash table(offset 32) */ +}; + +struct p10_aes_gcm_ctx { +	struct aes_key enc_key; +}; + +static void vsx_begin(void) +{ +	preempt_disable(); +	enable_kernel_vsx(); +} + +static void vsx_end(void) +{ +	disable_kernel_vsx(); +	preempt_enable(); +} + +static void set_subkey(unsigned char *hash) +{ +	*(u64 *)&hash[0] = be64_to_cpup((__be64 *)&hash[0]); +	*(u64 *)&hash[8] = be64_to_cpup((__be64 *)&hash[8]); +} + +/* + * Compute aad if any. + *   - Hash aad and copy to Xi. + */ +static void set_aad(struct gcm_ctx *gctx, struct Hash_ctx *hash, +		    unsigned char *aad, int alen) +{ +	int i; +	u8 nXi[16] = {0, }; + +	gctx->aadLen = alen; +	i = alen & ~0xf; +	if (i) { +		gcm_ghash_p8(nXi, hash->Htable+32, aad, i); +		aad += i; +		alen -= i; +	} +	if (alen) { +		for (i = 0; i < alen; i++) +			nXi[i] ^= aad[i]; + +		memset(gctx->aad_hash, 0, 16); +		gcm_ghash_p8(gctx->aad_hash, hash->Htable+32, nXi, 16); +	} else { +		memcpy(gctx->aad_hash, nXi, 16); +	} + +	memcpy(hash->Htable, gctx->aad_hash, 16); +} + +static void gcmp10_init(struct gcm_ctx *gctx, u8 *iv, unsigned char *rdkey, +			struct Hash_ctx *hash, u8 *assoc, unsigned int assoclen) +{ +	__be32 counter = cpu_to_be32(1); + +	aes_p8_encrypt(hash->H, hash->H, rdkey); +	set_subkey(hash->H); +	gcm_init_htable(hash->Htable+32, hash->H); + +	*((__be32 *)(iv+12)) = counter; + +	gctx->Plen = 0; + +	/* +	 * Encrypt counter vector as iv tag and increment counter. 
+	 */ +	aes_p8_encrypt(iv, gctx->ivtag, rdkey); + +	counter = cpu_to_be32(2); +	*((__be32 *)(iv+12)) = counter; +	memcpy(gctx->iv, iv, 16); + +	gctx->aadLen = assoclen; +	memset(gctx->aad_hash, 0, 16); +	if (assoclen) +		set_aad(gctx, hash, assoc, assoclen); +} + +static void finish_tag(struct gcm_ctx *gctx, struct Hash_ctx *hash, int len) +{ +	int i; +	unsigned char len_ac[16 + PPC_ALIGN]; +	unsigned char *aclen = PTR_ALIGN((void *)len_ac, PPC_ALIGN); +	__be64 clen = cpu_to_be64(len << 3); +	__be64 alen = cpu_to_be64(gctx->aadLen << 3); + +	if (len == 0 && gctx->aadLen == 0) { +		memcpy(hash->Htable, gctx->ivtag, 16); +		return; +	} + +	/* +	 * Len is in bits. +	 */ +	*((__be64 *)(aclen)) = alen; +	*((__be64 *)(aclen+8)) = clen; + +	/* +	 * hash (AAD len and len) +	 */ +	gcm_ghash_p8(hash->Htable, hash->Htable+32, aclen, 16); + +	for (i = 0; i < 16; i++) +		hash->Htable[i] ^= gctx->ivtag[i]; +} + +static int set_authsize(struct crypto_aead *tfm, unsigned int authsize) +{ +	switch (authsize) { +	case 4: +	case 8: +	case 12: +	case 13: +	case 14: +	case 15: +	case 16: +		break; +	default: +		return -EINVAL; +	} + +	return 0; +} + +static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key, +			     unsigned int keylen) +{ +	struct crypto_tfm *tfm = crypto_aead_tfm(aead); +	struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm); +	int ret; + +	vsx_begin(); +	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key); +	vsx_end(); + +	return ret ? -EINVAL : 0; +} + +static int p10_aes_gcm_crypt(struct aead_request *req, int enc) +{ +	struct crypto_tfm *tfm = req->base.tfm; +	struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm); +	u8 databuf[sizeof(struct gcm_ctx) + PPC_ALIGN]; +	struct gcm_ctx *gctx = PTR_ALIGN((void *)databuf, PPC_ALIGN); +	u8 hashbuf[sizeof(struct Hash_ctx) + PPC_ALIGN]; +	struct Hash_ctx *hash = PTR_ALIGN((void *)hashbuf, PPC_ALIGN); +	struct scatter_walk assoc_sg_walk; +	struct skcipher_walk walk; +	u8 *assocmem = NULL; +	u8 *assoc; +	unsigned int assoclen = req->assoclen; +	unsigned int cryptlen = req->cryptlen; +	unsigned char ivbuf[AES_BLOCK_SIZE+PPC_ALIGN]; +	unsigned char *iv = PTR_ALIGN((void *)ivbuf, PPC_ALIGN); +	int ret; +	unsigned long auth_tag_len = crypto_aead_authsize(__crypto_aead_cast(tfm)); +	u8 otag[16]; +	int total_processed = 0; + +	memset(databuf, 0, sizeof(databuf)); +	memset(hashbuf, 0, sizeof(hashbuf)); +	memset(ivbuf, 0, sizeof(ivbuf)); +	memcpy(iv, req->iv, GCM_IV_SIZE); + +	/* Linearize assoc, if not already linear */ +	if (req->src->length >= assoclen && req->src->length) { +		scatterwalk_start(&assoc_sg_walk, req->src); +		assoc = scatterwalk_map(&assoc_sg_walk); +	} else { +		gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? 
+			      GFP_KERNEL : GFP_ATOMIC; + +		/* assoc can be any length, so must be on heap */ +		assocmem = kmalloc(assoclen, flags); +		if (unlikely(!assocmem)) +			return -ENOMEM; +		assoc = assocmem; + +		scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); +	} + +	vsx_begin(); +	gcmp10_init(gctx, iv, (unsigned char *) &ctx->enc_key, hash, assoc, assoclen); +	vsx_end(); + +	if (!assocmem) +		scatterwalk_unmap(assoc); +	else +		kfree(assocmem); + +	if (enc) +		ret = skcipher_walk_aead_encrypt(&walk, req, false); +	else +		ret = skcipher_walk_aead_decrypt(&walk, req, false); +	if (ret) +		return ret; + +	while (walk.nbytes > 0 && ret == 0) { + +		vsx_begin(); +		if (enc) +			aes_p10_gcm_encrypt(walk.src.virt.addr, +					    walk.dst.virt.addr, +					    walk.nbytes, +					    &ctx->enc_key, gctx->iv, hash->Htable); +		else +			aes_p10_gcm_decrypt(walk.src.virt.addr, +					    walk.dst.virt.addr, +					    walk.nbytes, +					    &ctx->enc_key, gctx->iv, hash->Htable); +		vsx_end(); + +		total_processed += walk.nbytes; +		ret = skcipher_walk_done(&walk, 0); +	} + +	if (ret) +		return ret; + +	/* Finalize hash */ +	vsx_begin(); +	finish_tag(gctx, hash, total_processed); +	vsx_end(); + +	/* copy Xi to end of dst */ +	if (enc) +		scatterwalk_map_and_copy(hash->Htable, req->dst, req->assoclen + cryptlen, +					 auth_tag_len, 1); +	else { +		scatterwalk_map_and_copy(otag, req->src, +					 req->assoclen + cryptlen - auth_tag_len, +					 auth_tag_len, 0); + +		if (crypto_memneq(otag, hash->Htable, auth_tag_len)) { +			memzero_explicit(hash->Htable, 16); +			return -EBADMSG; +		} +	} + +	return 0; +} + +static int p10_aes_gcm_encrypt(struct aead_request *req) +{ +	return p10_aes_gcm_crypt(req, 1); +} + +static int p10_aes_gcm_decrypt(struct aead_request *req) +{ +	return p10_aes_gcm_crypt(req, 0); +} + +static struct aead_alg gcm_aes_alg = { +	.ivsize			= GCM_IV_SIZE, +	.maxauthsize		= 16, + +	.setauthsize		= set_authsize, +	.setkey			= p10_aes_gcm_setkey, +	.encrypt		= p10_aes_gcm_encrypt, +	.decrypt		= p10_aes_gcm_decrypt, + +	.base.cra_name		= "gcm(aes)", +	.base.cra_driver_name	= "aes_gcm_p10", +	.base.cra_priority	= 2100, +	.base.cra_blocksize	= 1, +	.base.cra_ctxsize	= sizeof(struct p10_aes_gcm_ctx), +	.base.cra_module	= THIS_MODULE, +}; + +static int __init p10_init(void) +{ +	return crypto_register_aead(&gcm_aes_alg); +} + +static void __exit p10_exit(void) +{ +	crypto_unregister_aead(&gcm_aes_alg); +} + +module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init); +module_exit(p10_exit); diff --git a/arch/powerpc/crypto/aes-gcm-p10.S b/arch/powerpc/crypto/aes-gcm-p10.S new file mode 100644 index 000000000000..a51f4b265308 --- /dev/null +++ b/arch/powerpc/crypto/aes-gcm-p10.S @@ -0,0 +1,1521 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + # + # Accelerated AES-GCM stitched implementation for ppc64le. + # + # Copyright 2022- IBM Inc. All rights reserved + # + #=================================================================================== + # Written by Danny Tsen <dtsen@linux.ibm.com> + # + # GHASH is based on the Karatsuba multiplication method. 
+ # + #    Xi xor X1 + # + #    X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H = + #      (X1.h * H4.h + xX.l * H4.l + X1 * H4) + + #      (X2.h * H3.h + X2.l * H3.l + X2 * H3) + + #      (X3.h * H2.h + X3.l * H2.l + X3 * H2) + + #      (X4.h * H.h + X4.l * H.l + X4 * H) + # + # Xi = v0 + # H Poly = v2 + # Hash keys = v3 - v14 + #     ( H.l, H, H.h) + #     ( H^2.l, H^2, H^2.h) + #     ( H^3.l, H^3, H^3.h) + #     ( H^4.l, H^4, H^4.h) + # + # v30 is IV + # v31 - counter 1 + # + # AES used, + #     vs0 - vs14 for round keys + #     v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted) + # + # This implementation uses stitched AES-GCM approach to improve overall performance. + # AES is implemented with 8x blocks and GHASH is using 2 4x blocks. + # + # =================================================================================== + # + +#include <asm/ppc_asm.h> +#include <linux/linkage.h> + +.machine        "any" +.text + + # 4x loops + # v15 - v18 - input states + # vs1 - vs9 - round keys + # +.macro Loop_aes_middle4x +	xxlor	19+32, 1, 1 +	xxlor	20+32, 2, 2 +	xxlor	21+32, 3, 3 +	xxlor	22+32, 4, 4 + +	vcipher	15, 15, 19 +	vcipher	16, 16, 19 +	vcipher	17, 17, 19 +	vcipher	18, 18, 19 + +	vcipher	15, 15, 20 +	vcipher	16, 16, 20 +	vcipher	17, 17, 20 +	vcipher	18, 18, 20 + +	vcipher	15, 15, 21 +	vcipher	16, 16, 21 +	vcipher	17, 17, 21 +	vcipher	18, 18, 21 + +	vcipher	15, 15, 22 +	vcipher	16, 16, 22 +	vcipher	17, 17, 22 +	vcipher	18, 18, 22 + +	xxlor	19+32, 5, 5 +	xxlor	20+32, 6, 6 +	xxlor	21+32, 7, 7 +	xxlor	22+32, 8, 8 + +	vcipher	15, 15, 19 +	vcipher	16, 16, 19 +	vcipher	17, 17, 19 +	vcipher	18, 18, 19 + +	vcipher	15, 15, 20 +	vcipher	16, 16, 20 +	vcipher	17, 17, 20 +	vcipher	18, 18, 20 + +	vcipher	15, 15, 21 +	vcipher	16, 16, 21 +	vcipher	17, 17, 21 +	vcipher	18, 18, 21 + +	vcipher	15, 15, 22 +	vcipher	16, 16, 22 +	vcipher	17, 17, 22 +	vcipher	18, 18, 22 + +	xxlor	23+32, 9, 9 +	vcipher	15, 15, 23 +	vcipher	16, 16, 23 +	vcipher	17, 17, 23 +	vcipher	18, 18, 23 +.endm + + # 8x loops + # v15 - v22 - input states + # vs1 - vs9 - round keys + # +.macro Loop_aes_middle8x +	xxlor	23+32, 1, 1 +	xxlor	24+32, 2, 2 +	xxlor	25+32, 3, 3 +	xxlor	26+32, 4, 4 + +	vcipher	15, 15, 23 +	vcipher	16, 16, 23 +	vcipher	17, 17, 23 +	vcipher	18, 18, 23 +	vcipher	19, 19, 23 +	vcipher	20, 20, 23 +	vcipher	21, 21, 23 +	vcipher	22, 22, 23 + +	vcipher	15, 15, 24 +	vcipher	16, 16, 24 +	vcipher	17, 17, 24 +	vcipher	18, 18, 24 +	vcipher	19, 19, 24 +	vcipher	20, 20, 24 +	vcipher	21, 21, 24 +	vcipher	22, 22, 24 + +	vcipher	15, 15, 25 +	vcipher	16, 16, 25 +	vcipher	17, 17, 25 +	vcipher	18, 18, 25 +	vcipher	19, 19, 25 +	vcipher	20, 20, 25 +	vcipher	21, 21, 25 +	vcipher	22, 22, 25 + +	vcipher	15, 15, 26 +	vcipher	16, 16, 26 +	vcipher	17, 17, 26 +	vcipher	18, 18, 26 +	vcipher	19, 19, 26 +	vcipher	20, 20, 26 +	vcipher	21, 21, 26 +	vcipher	22, 22, 26 + +	xxlor	23+32, 5, 5 +	xxlor	24+32, 6, 6 +	xxlor	25+32, 7, 7 +	xxlor	26+32, 8, 8 + +	vcipher	15, 15, 23 +	vcipher	16, 16, 23 +	vcipher	17, 17, 23 +	vcipher	18, 18, 23 +	vcipher	19, 19, 23 +	vcipher	20, 20, 23 +	vcipher	21, 21, 23 +	vcipher	22, 22, 23 + +	vcipher	15, 15, 24 +	vcipher	16, 16, 24 +	vcipher	17, 17, 24 +	vcipher	18, 18, 24 +	vcipher	19, 19, 24 +	vcipher	20, 20, 24 +	vcipher	21, 21, 24 +	vcipher	22, 22, 24 + +	vcipher	15, 15, 25 +	vcipher	16, 16, 25 +	vcipher	17, 17, 25 +	vcipher	18, 18, 25 +	vcipher	19, 19, 25 +	vcipher	20, 20, 25 +	vcipher	21, 21, 25 +	vcipher	22, 22, 25 + +	vcipher	15, 15, 26 +	vcipher	16, 16, 26 +	vcipher	17, 17, 26 +	vcipher	18, 18, 26 +	vcipher	
19, 19, 26 +	vcipher	20, 20, 26 +	vcipher	21, 21, 26 +	vcipher	22, 22, 26 + +	xxlor	23+32, 9, 9 +	vcipher	15, 15, 23 +	vcipher	16, 16, 23 +	vcipher	17, 17, 23 +	vcipher	18, 18, 23 +	vcipher	19, 19, 23 +	vcipher	20, 20, 23 +	vcipher	21, 21, 23 +	vcipher	22, 22, 23 +.endm + +.macro Loop_aes_middle_1x +	xxlor	19+32, 1, 1 +	xxlor	20+32, 2, 2 +	xxlor	21+32, 3, 3 +	xxlor	22+32, 4, 4 + +	vcipher 15, 15, 19 +	vcipher 15, 15, 20 +	vcipher 15, 15, 21 +	vcipher 15, 15, 22 + +	xxlor	19+32, 5, 5 +	xxlor	20+32, 6, 6 +	xxlor	21+32, 7, 7 +	xxlor	22+32, 8, 8 + +	vcipher 15, 15, 19 +	vcipher 15, 15, 20 +	vcipher 15, 15, 21 +	vcipher 15, 15, 22 + +	xxlor	19+32, 9, 9 +	vcipher 15, 15, 19 +.endm + + # + # Compute 4x hash values based on Karatsuba method. + # +.macro ppc_aes_gcm_ghash +	vxor		15, 15, 0 + +	vpmsumd		23, 12, 15		# H4.L * X.L +	vpmsumd		24, 9, 16 +	vpmsumd		25, 6, 17 +	vpmsumd		26, 3, 18 + +	vxor		23, 23, 24 +	vxor		23, 23, 25 +	vxor		23, 23, 26		# L + +	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L +	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L +	vpmsumd		26, 7, 17 +	vpmsumd		27, 4, 18 + +	vxor		24, 24, 25 +	vxor		24, 24, 26 +	vxor		24, 24, 27		# M + +	# sum hash and reduction with H Poly +	vpmsumd		28, 23, 2		# reduction + +	vxor		29, 29, 29 +	vsldoi		26, 24, 29, 8		# mL +	vsldoi		29, 29, 24, 8		# mH +	vxor		23, 23, 26		# mL + L + +	vsldoi		23, 23, 23, 8		# swap +	vxor		23, 23, 28 + +	vpmsumd		24, 14, 15		# H4.H * X.H +	vpmsumd		25, 11, 16 +	vpmsumd		26, 8, 17 +	vpmsumd		27, 5, 18 + +	vxor		24, 24, 25 +	vxor		24, 24, 26 +	vxor		24, 24, 27 + +	vxor		24, 24, 29 + +	# sum hash and reduction with H Poly +	vsldoi		27, 23, 23, 8		# swap +	vpmsumd		23, 23, 2 +	vxor		27, 27, 24 +	vxor		23, 23, 27 + +	xxlor		32, 23+32, 23+32		# update hash + +.endm + + # + # Combine two 4x ghash + # v15 - v22 - input blocks + # +.macro ppc_aes_gcm_ghash2_4x +	# first 4x hash +	vxor		15, 15, 0		# Xi + X + +	vpmsumd		23, 12, 15		# H4.L * X.L +	vpmsumd		24, 9, 16 +	vpmsumd		25, 6, 17 +	vpmsumd		26, 3, 18 + +	vxor		23, 23, 24 +	vxor		23, 23, 25 +	vxor		23, 23, 26		# L + +	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L +	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L +	vpmsumd		26, 7, 17 +	vpmsumd		27, 4, 18 + +	vxor		24, 24, 25 +	vxor		24, 24, 26 + +	# sum hash and reduction with H Poly +	vpmsumd		28, 23, 2		# reduction + +	vxor		29, 29, 29 + +	vxor		24, 24, 27		# M +	vsldoi		26, 24, 29, 8		# mL +	vsldoi		29, 29, 24, 8		# mH +	vxor		23, 23, 26		# mL + L + +	vsldoi		23, 23, 23, 8		# swap +	vxor		23, 23, 28 + +	vpmsumd		24, 14, 15		# H4.H * X.H +	vpmsumd		25, 11, 16 +	vpmsumd		26, 8, 17 +	vpmsumd		27, 5, 18 + +	vxor		24, 24, 25 +	vxor		24, 24, 26 +	vxor		24, 24, 27		# H + +	vxor		24, 24, 29		# H + mH + +	# sum hash and reduction with H Poly +	vsldoi		27, 23, 23, 8		# swap +	vpmsumd		23, 23, 2 +	vxor		27, 27, 24 +	vxor		27, 23, 27		# 1st Xi + +	# 2nd 4x hash +	vpmsumd		24, 9, 20 +	vpmsumd		25, 6, 21 +	vpmsumd		26, 3, 22 +	vxor		19, 19, 27		# Xi + X +	vpmsumd		23, 12, 19		# H4.L * X.L + +	vxor		23, 23, 24 +	vxor		23, 23, 25 +	vxor		23, 23, 26		# L + +	vpmsumd		24, 13, 19		# H4.L * X.H + H4.H * X.L +	vpmsumd		25, 10, 20		# H3.L * X1.H + H3.H * X1.L +	vpmsumd		26, 7, 21 +	vpmsumd		27, 4, 22 + +	vxor		24, 24, 25 +	vxor		24, 24, 26 + +	# sum hash and reduction with H Poly +	vpmsumd		28, 23, 2		# reduction + +	vxor		29, 29, 29 + +	vxor		24, 24, 27		# M +	vsldoi		26, 24, 29, 8		# mL +	vsldoi		29, 29, 24, 8		# mH +	vxor		23, 23, 26		# mL + L + +	vsldoi		23, 23, 23, 8		# swap +	vxor		23, 23, 28 + +	vpmsumd		24, 14, 19		# H4.H * 
X.H +	vpmsumd		25, 11, 20 +	vpmsumd		26, 8, 21 +	vpmsumd		27, 5, 22 + +	vxor		24, 24, 25 +	vxor		24, 24, 26 +	vxor		24, 24, 27		# H + +	vxor		24, 24, 29		# H + mH + +	# sum hash and reduction with H Poly +	vsldoi		27, 23, 23, 8		# swap +	vpmsumd		23, 23, 2 +	vxor		27, 27, 24 +	vxor		23, 23, 27 + +	xxlor		32, 23+32, 23+32		# update hash + +.endm + + # + # Compute update single hash + # +.macro ppc_update_hash_1x +	vxor		28, 28, 0 + +	vxor		19, 19, 19 + +	vpmsumd		22, 3, 28		# L +	vpmsumd		23, 4, 28		# M +	vpmsumd		24, 5, 28		# H + +	vpmsumd		27, 22, 2		# reduction + +	vsldoi		25, 23, 19, 8		# mL +	vsldoi		26, 19, 23, 8		# mH +	vxor		22, 22, 25		# LL + LL +	vxor		24, 24, 26		# HH + HH + +	vsldoi		22, 22, 22, 8		# swap +	vxor		22, 22, 27 + +	vsldoi		20, 22, 22, 8		# swap +	vpmsumd		22, 22, 2		# reduction +	vxor		20, 20, 24 +	vxor		22, 22, 20 + +	vmr		0, 22			# update hash + +.endm + +.macro SAVE_REGS +	stdu 1,-640(1) +	mflr 0 + +	std	14,112(1) +	std	15,120(1) +	std	16,128(1) +	std	17,136(1) +	std	18,144(1) +	std	19,152(1) +	std	20,160(1) +	std	21,168(1) +	li	9, 256 +	stvx	20, 9, 1 +	addi	9, 9, 16 +	stvx	21, 9, 1 +	addi	9, 9, 16 +	stvx	22, 9, 1 +	addi	9, 9, 16 +	stvx	23, 9, 1 +	addi	9, 9, 16 +	stvx	24, 9, 1 +	addi	9, 9, 16 +	stvx	25, 9, 1 +	addi	9, 9, 16 +	stvx	26, 9, 1 +	addi	9, 9, 16 +	stvx	27, 9, 1 +	addi	9, 9, 16 +	stvx	28, 9, 1 +	addi	9, 9, 16 +	stvx	29, 9, 1 +	addi	9, 9, 16 +	stvx	30, 9, 1 +	addi	9, 9, 16 +	stvx	31, 9, 1 +	stxv	14, 464(1) +	stxv	15, 480(1) +	stxv	16, 496(1) +	stxv	17, 512(1) +	stxv	18, 528(1) +	stxv	19, 544(1) +	stxv	20, 560(1) +	stxv	21, 576(1) +	stxv	22, 592(1) +	std	0, 656(1) +.endm + +.macro RESTORE_REGS +	lxv	14, 464(1) +	lxv	15, 480(1) +	lxv	16, 496(1) +	lxv	17, 512(1) +	lxv	18, 528(1) +	lxv	19, 544(1) +	lxv	20, 560(1) +	lxv	21, 576(1) +	lxv	22, 592(1) +	li	9, 256 +	lvx	20, 9, 1 +	addi	9, 9, 16 +	lvx	21, 9, 1 +	addi	9, 9, 16 +	lvx	22, 9, 1 +	addi	9, 9, 16 +	lvx	23, 9, 1 +	addi	9, 9, 16 +	lvx	24, 9, 1 +	addi	9, 9, 16 +	lvx	25, 9, 1 +	addi	9, 9, 16 +	lvx	26, 9, 1 +	addi	9, 9, 16 +	lvx	27, 9, 1 +	addi	9, 9, 16 +	lvx	28, 9, 1 +	addi	9, 9, 16 +	lvx	29, 9, 1 +	addi	9, 9, 16 +	lvx	30, 9, 1 +	addi	9, 9, 16 +	lvx	31, 9, 1 + +	ld	0, 656(1) +	ld      14,112(1) +	ld      15,120(1) +	ld      16,128(1) +	ld      17,136(1) +	ld      18,144(1) +	ld      19,152(1) +	ld      20,160(1) +	ld	21,168(1) + +	mtlr	0 +	addi	1, 1, 640 +.endm + +.macro LOAD_HASH_TABLE +	# Load Xi +	lxvb16x	32, 0, 8	# load Xi + +	# load Hash - h^4, h^3, h^2, h +	li	10, 32 +	lxvd2x	2+32, 10, 8	# H Poli +	li	10, 48 +	lxvd2x	3+32, 10, 8	# Hl +	li	10, 64 +	lxvd2x	4+32, 10, 8	# H +	li	10, 80 +	lxvd2x	5+32, 10, 8	# Hh + +	li	10, 96 +	lxvd2x	6+32, 10, 8	# H^2l +	li	10, 112 +	lxvd2x	7+32, 10, 8	# H^2 +	li	10, 128 +	lxvd2x	8+32, 10, 8	# H^2h + +	li	10, 144 +	lxvd2x	9+32, 10, 8	# H^3l +	li	10, 160 +	lxvd2x	10+32, 10, 8	# H^3 +	li	10, 176 +	lxvd2x	11+32, 10, 8	# H^3h + +	li	10, 192 +	lxvd2x	12+32, 10, 8	# H^4l +	li	10, 208 +	lxvd2x	13+32, 10, 8	# H^4 +	li	10, 224 +	lxvd2x	14+32, 10, 8	# H^4h +.endm + + # + # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len, + #               const char *rk, unsigned char iv[16], void *Xip); + # + #    r3 - inp + #    r4 - out + #    r5 - len + #    r6 - AES round keys + #    r7 - iv and other data + #    r8 - Xi, HPoli, hash keys + # + #    rounds is at offset 240 in rk + #    Xi is at 0 in gcm_table (Xip). 
+ # +_GLOBAL(aes_p10_gcm_encrypt) +.align 5 + +	SAVE_REGS + +	LOAD_HASH_TABLE + +	# initialize ICB: GHASH( IV ), IV - r7 +	lxvb16x	30+32, 0, 7	# load IV  - v30 + +	mr	12, 5		# length +	li	11, 0		# block index + +	# counter 1 +	vxor	31, 31, 31 +	vspltisb 22, 1 +	vsldoi	31, 31, 22,1	# counter 1 + +	# load round key to VSR +	lxv	0, 0(6) +	lxv	1, 0x10(6) +	lxv	2, 0x20(6) +	lxv	3, 0x30(6) +	lxv	4, 0x40(6) +	lxv	5, 0x50(6) +	lxv	6, 0x60(6) +	lxv	7, 0x70(6) +	lxv	8, 0x80(6) +	lxv	9, 0x90(6) +	lxv	10, 0xa0(6) + +	# load rounds - 10 (128), 12 (192), 14 (256) +	lwz	9,240(6) + +	# +	# vxor	state, state, w # addroundkey +	xxlor	32+29, 0, 0 +	vxor	15, 30, 29	# IV + round key - add round key 0 + +	cmpdi	9, 10 +	beq	Loop_aes_gcm_8x + +	# load 2 more round keys (v11, v12) +	lxv	11, 0xb0(6) +	lxv	12, 0xc0(6) + +	cmpdi	9, 12 +	beq	Loop_aes_gcm_8x + +	# load 2 more round keys (v11, v12, v13, v14) +	lxv	13, 0xd0(6) +	lxv	14, 0xe0(6) +	cmpdi	9, 14 +	beq	Loop_aes_gcm_8x + +	b	aes_gcm_out + +.align 5 +Loop_aes_gcm_8x: +	mr	14, 3 +	mr	9, 4 + +	# +	# check partial block +	# +Continue_partial_check: +	ld	15, 56(7) +	cmpdi	15, 0 +	beq	Continue +	bgt	Final_block +	cmpdi	15, 16 +	blt	Final_block + +Continue: +	# n blcoks +	li	10, 128 +	divdu	10, 12, 10	# n 128 bytes-blocks +	cmpdi	10, 0 +	beq	Loop_last_block + +	vaddudm	30, 30, 31	# IV + counter +	vxor	16, 30, 29 +	vaddudm	30, 30, 31 +	vxor	17, 30, 29 +	vaddudm	30, 30, 31 +	vxor	18, 30, 29 +	vaddudm	30, 30, 31 +	vxor	19, 30, 29 +	vaddudm	30, 30, 31 +	vxor	20, 30, 29 +	vaddudm	30, 30, 31 +	vxor	21, 30, 29 +	vaddudm	30, 30, 31 +	vxor	22, 30, 29 + +	mtctr	10 + +	li	15, 16 +	li	16, 32 +	li	17, 48 +	li	18, 64 +	li	19, 80 +	li	20, 96 +	li	21, 112 + +	lwz	10, 240(6) + +Loop_8x_block: + +	lxvb16x		15, 0, 14	# load block +	lxvb16x		16, 15, 14	# load block +	lxvb16x		17, 16, 14	# load block +	lxvb16x		18, 17, 14	# load block +	lxvb16x		19, 18, 14	# load block +	lxvb16x		20, 19, 14	# load block +	lxvb16x		21, 20, 14	# load block +	lxvb16x		22, 21, 14	# load block +	addi		14, 14, 128 + +	Loop_aes_middle8x + +	xxlor	23+32, 10, 10 + +	cmpdi	10, 10 +	beq	Do_next_ghash + +	# 192 bits +	xxlor	24+32, 11, 11 + +	vcipher	15, 15, 23 +	vcipher	16, 16, 23 +	vcipher	17, 17, 23 +	vcipher	18, 18, 23 +	vcipher	19, 19, 23 +	vcipher	20, 20, 23 +	vcipher	21, 21, 23 +	vcipher	22, 22, 23 + +	vcipher	15, 15, 24 +	vcipher	16, 16, 24 +	vcipher	17, 17, 24 +	vcipher	18, 18, 24 +	vcipher	19, 19, 24 +	vcipher	20, 20, 24 +	vcipher	21, 21, 24 +	vcipher	22, 22, 24 + +	xxlor	23+32, 12, 12 + +	cmpdi	10, 12 +	beq	Do_next_ghash + +	# 256 bits +	xxlor	24+32, 13, 13 + +	vcipher	15, 15, 23 +	vcipher	16, 16, 23 +	vcipher	17, 17, 23 +	vcipher	18, 18, 23 +	vcipher	19, 19, 23 +	vcipher	20, 20, 23 +	vcipher	21, 21, 23 +	vcipher	22, 22, 23 + +	vcipher	15, 15, 24 +	vcipher	16, 16, 24 +	vcipher	17, 17, 24 +	vcipher	18, 18, 24 +	vcipher	19, 19, 24 +	vcipher	20, 20, 24 +	vcipher	21, 21, 24 +	vcipher	22, 22, 24 + +	xxlor	23+32, 14, 14 + +	cmpdi	10, 14 +	beq	Do_next_ghash +	b	aes_gcm_out + +Do_next_ghash: + +	# +	# last round +	vcipherlast     15, 15, 23 +	vcipherlast     16, 16, 23 + +	xxlxor		47, 47, 15 +	stxvb16x        47, 0, 9	# store output +	xxlxor		48, 48, 16 +	stxvb16x        48, 15, 9	# store output + +	vcipherlast     17, 17, 23 +	vcipherlast     18, 18, 23 + +	xxlxor		49, 49, 17 +	stxvb16x        49, 16, 9	# store output +	xxlxor		50, 50, 18 +	stxvb16x        50, 17, 9	# store output + +	vcipherlast     19, 19, 23 +	vcipherlast     20, 20, 23 + +	xxlxor		51, 51, 19 +	stxvb16x        51, 18, 9	# store output +	xxlxor	
	52, 52, 20 +	stxvb16x        52, 19, 9	# store output + +	vcipherlast     21, 21, 23 +	vcipherlast     22, 22, 23 + +	xxlxor		53, 53, 21 +	stxvb16x        53, 20, 9	# store output +	xxlxor		54, 54, 22 +	stxvb16x        54, 21, 9	# store output + +	addi		9, 9, 128 + +	# ghash here +	ppc_aes_gcm_ghash2_4x + +	xxlor	27+32, 0, 0 +	vaddudm 30, 30, 31		# IV + counter +	vmr	29, 30 +	vxor    15, 30, 27		# add round key +	vaddudm 30, 30, 31 +	vxor    16, 30, 27 +	vaddudm 30, 30, 31 +	vxor    17, 30, 27 +	vaddudm 30, 30, 31 +	vxor    18, 30, 27 +	vaddudm 30, 30, 31 +	vxor    19, 30, 27 +	vaddudm 30, 30, 31 +	vxor    20, 30, 27 +	vaddudm 30, 30, 31 +	vxor    21, 30, 27 +	vaddudm 30, 30, 31 +	vxor    22, 30, 27 + +	addi    12, 12, -128 +	addi    11, 11, 128 + +	bdnz	Loop_8x_block + +	vmr	30, 29 +	stxvb16x 30+32, 0, 7		# update IV + +Loop_last_block: +	cmpdi   12, 0 +	beq     aes_gcm_out + +	# loop last few blocks +	li      10, 16 +	divdu   10, 12, 10 + +	mtctr   10 + +	lwz	10, 240(6) + +	cmpdi   12, 16 +	blt     Final_block + +Next_rem_block: +	lxvb16x 15, 0, 14		# load block + +	Loop_aes_middle_1x + +	xxlor	23+32, 10, 10 + +	cmpdi	10, 10 +	beq	Do_next_1x + +	# 192 bits +	xxlor	24+32, 11, 11 + +	vcipher	15, 15, 23 +	vcipher	15, 15, 24 + +	xxlor	23+32, 12, 12 + +	cmpdi	10, 12 +	beq	Do_next_1x + +	# 256 bits +	xxlor	24+32, 13, 13 + +	vcipher	15, 15, 23 +	vcipher	15, 15, 24 + +	xxlor	23+32, 14, 14 + +	cmpdi	10, 14 +	beq	Do_next_1x + +Do_next_1x: +	vcipherlast     15, 15, 23 + +	xxlxor		47, 47, 15 +	stxvb16x	47, 0, 9	# store output +	addi		14, 14, 16 +	addi		9, 9, 16 + +	vmr		28, 15 +	ppc_update_hash_1x + +	addi		12, 12, -16 +	addi		11, 11, 16 +	xxlor		19+32, 0, 0 +	vaddudm		30, 30, 31		# IV + counter +	vxor		15, 30, 19		# add round key + +	bdnz	Next_rem_block + +	li	15, 0 +	std	15, 56(7)		# clear partial? +	stxvb16x 30+32, 0, 7		# update IV +	cmpdi	12, 0 +	beq	aes_gcm_out + +Final_block: +	lwz	10, 240(6) +	Loop_aes_middle_1x + +	xxlor	23+32, 10, 10 + +	cmpdi	10, 10 +	beq	Do_final_1x + +	# 192 bits +	xxlor	24+32, 11, 11 + +	vcipher	15, 15, 23 +	vcipher	15, 15, 24 + +	xxlor	23+32, 12, 12 + +	cmpdi	10, 12 +	beq	Do_final_1x + +	# 256 bits +	xxlor	24+32, 13, 13 + +	vcipher	15, 15, 23 +	vcipher	15, 15, 24 + +	xxlor	23+32, 14, 14 + +	cmpdi	10, 14 +	beq	Do_final_1x + +Do_final_1x: +	vcipherlast     15, 15, 23 + +	# check partial block +	li	21, 0			# encrypt +	ld	15, 56(7)		# partial? +	cmpdi	15, 0 +	beq	Normal_block +	bl	Do_partial_block + +	cmpdi	12, 0 +	ble aes_gcm_out + +	b Continue_partial_check + +Normal_block: +	lxvb16x	15, 0, 14		# load last block +	xxlxor	47, 47, 15 + +	# create partial block mask +	li	15, 16 +	sub	15, 15, 12		# index to the mask + +	vspltisb	16, -1		# first 16 bytes - 0xffff...ff +	vspltisb	17, 0		# second 16 bytes - 0x0000...00 +	li	10, 192 +	stvx	16, 10, 1 +	addi	10, 10, 16 +	stvx	17, 10, 1 + +	addi	10, 1, 192 +	lxvb16x	16, 15, 10		# load partial block mask +	xxland	47, 47, 16 + +	vmr	28, 15 +	ppc_update_hash_1x + +	# * should store only the remaining bytes. +	bl	Write_partial_block + +	stxvb16x 30+32, 0, 7		# update IV +	std	12, 56(7)		# update partial? 
+	li	16, 16 + +	stxvb16x	32, 0, 8		# write out Xi +	stxvb16x	32, 16, 8		# write out Xi +	b aes_gcm_out + + # + # Compute data mask + # +.macro GEN_MASK _mask _start _end +	vspltisb	16, -1		# first 16 bytes - 0xffff...ff +	vspltisb	17, 0		# second 16 bytes - 0x0000...00 +	li	10, 192 +	stxvb16x	17+32, 10, 1 +	add	10, 10, \_start +	stxvb16x	16+32, 10, 1 +	add	10, 10, \_end +	stxvb16x	17+32, 10, 1 + +	addi	10, 1, 192 +	lxvb16x	\_mask, 0, 10		# load partial block mask +.endm + + # + # Handle multiple partial blocks for encrypt and decrypt + #   operations. + # +SYM_FUNC_START_LOCAL(Do_partial_block) +	add	17, 15, 5 +	cmpdi	17, 16 +	bgt	Big_block +	GEN_MASK 18, 15, 5 +	b	_Partial +SYM_FUNC_END(Do_partial_block) +Big_block: +	li	16, 16 +	GEN_MASK 18, 15, 16 + +_Partial: +	lxvb16x	17+32, 0, 14		# load last block +	sldi	16, 15, 3 +	mtvsrdd	32+16, 0, 16 +	vsro	17, 17, 16 +	xxlxor	47, 47, 17+32 +	xxland	47, 47, 18 + +	vxor	0, 0, 0			# clear Xi +	vmr	28, 15 + +	cmpdi	21, 0			# encrypt/decrypt ops? +	beq	Skip_decrypt +	xxland	32+28, 32+17, 18 + +Skip_decrypt: + +	ppc_update_hash_1x + +	li	16, 16 +	lxvb16x 32+29, 16, 8 +	vxor	0, 0, 29 +	stxvb16x 32, 0, 8		# save Xi +	stxvb16x 32, 16, 8		# save Xi + +	# store partial block +	# loop the rest of the stream if any +	sldi	16, 15, 3 +	mtvsrdd	32+16, 0, 16 +	vslo	15, 15, 16 +	#stxvb16x 15+32, 0, 9		# last block + +	li	16, 16 +	sub	17, 16, 15		# 16 - partial + +	add	16, 15, 5 +	cmpdi	16, 16 +	bgt	Larger_16 +	mr	17, 5 +Larger_16: + +	# write partial +	li		10, 192 +	stxvb16x	15+32, 10, 1	# save current block + +	addi		10, 9, -1 +	addi		16, 1, 191 +	mtctr		17		# move partial byte count + +Write_last_partial: +        lbzu		18, 1(16) +	stbu		18, 1(10) +        bdnz		Write_last_partial +	# Complete loop partial + +	add	14, 14, 17 +	add	9, 9, 17 +	sub	12, 12, 17 +	add	11, 11, 17 + +	add	15, 15, 5 +	cmpdi	15, 16 +	blt	Save_partial + +	vaddudm	30, 30, 31 +	stxvb16x 30+32, 0, 7		# update IV +	xxlor	32+29, 0, 0 +	vxor	15, 30, 29		# IV + round key - add round key 0 +	li	15, 0 +	std	15, 56(7)		# partial done - clear +	b	Partial_done +Save_partial: +	std	15, 56(7)		# partial + +Partial_done: +	blr + + # + # Write partial block + # r9 - output + # r12 - remaining bytes + # v15 - partial input data + # +SYM_FUNC_START_LOCAL(Write_partial_block) +	li		10, 192 +	stxvb16x	15+32, 10, 1		# last block + +	addi		10, 9, -1 +	addi		16, 1, 191 + +        mtctr		12			# remaining bytes +	li		15, 0 + +Write_last_byte: +        lbzu		14, 1(16) +	stbu		14, 1(10) +        bdnz		Write_last_byte +	blr +SYM_FUNC_END(Write_partial_block) + +aes_gcm_out: +	# out = state +	stxvb16x	32, 0, 8		# write out Xi +	add	3, 11, 12		# return count + +	RESTORE_REGS +	blr + + # + # 8x Decrypt + # +_GLOBAL(aes_p10_gcm_decrypt) +.align 5 + +	SAVE_REGS + +	LOAD_HASH_TABLE + +	# initialize ICB: GHASH( IV ), IV - r7 +	lxvb16x	30+32, 0, 7	# load IV  - v30 + +	mr	12, 5		# length +	li	11, 0		# block index + +	# counter 1 +	vxor	31, 31, 31 +	vspltisb 22, 1 +	vsldoi	31, 31, 22,1	# counter 1 + +	# load round key to VSR +	lxv	0, 0(6) +	lxv	1, 0x10(6) +	lxv	2, 0x20(6) +	lxv	3, 0x30(6) +	lxv	4, 0x40(6) +	lxv	5, 0x50(6) +	lxv	6, 0x60(6) +	lxv	7, 0x70(6) +	lxv	8, 0x80(6) +	lxv	9, 0x90(6) +	lxv	10, 0xa0(6) + +	# load rounds - 10 (128), 12 (192), 14 (256) +	lwz	9,240(6) + +	# +	# vxor	state, state, w # addroundkey +	xxlor	32+29, 0, 0 +	vxor	15, 30, 29	# IV + round key - add round key 0 + +	cmpdi	9, 10 +	beq	Loop_aes_gcm_8x_dec + +	# load 2 more round keys (v11, v12) +	lxv	11, 0xb0(6) +	lxv	12, 0xc0(6) + +	cmpdi	9, 12 +	beq	
Loop_aes_gcm_8x_dec + +	# load 2 more round keys (v11, v12, v13, v14) +	lxv	13, 0xd0(6) +	lxv	14, 0xe0(6) +	cmpdi	9, 14 +	beq	Loop_aes_gcm_8x_dec + +	b	aes_gcm_out + +.align 5 +Loop_aes_gcm_8x_dec: +	mr	14, 3 +	mr	9, 4 + +	# +	# check partial block +	# +Continue_partial_check_dec: +	ld	15, 56(7) +	cmpdi	15, 0 +	beq	Continue_dec +	bgt	Final_block_dec +	cmpdi	15, 16 +	blt	Final_block_dec + +Continue_dec: +	# n blcoks +	li	10, 128 +	divdu	10, 12, 10	# n 128 bytes-blocks +	cmpdi	10, 0 +	beq	Loop_last_block_dec + +	vaddudm	30, 30, 31	# IV + counter +	vxor	16, 30, 29 +	vaddudm	30, 30, 31 +	vxor	17, 30, 29 +	vaddudm	30, 30, 31 +	vxor	18, 30, 29 +	vaddudm	30, 30, 31 +	vxor	19, 30, 29 +	vaddudm	30, 30, 31 +	vxor	20, 30, 29 +	vaddudm	30, 30, 31 +	vxor	21, 30, 29 +	vaddudm	30, 30, 31 +	vxor	22, 30, 29 + +	mtctr	10 + +	li	15, 16 +	li	16, 32 +	li	17, 48 +	li	18, 64 +	li	19, 80 +	li	20, 96 +	li	21, 112 + +	lwz	10, 240(6) + +Loop_8x_block_dec: + +	lxvb16x		15, 0, 14	# load block +	lxvb16x		16, 15, 14	# load block +	lxvb16x		17, 16, 14	# load block +	lxvb16x		18, 17, 14	# load block +	lxvb16x		19, 18, 14	# load block +	lxvb16x		20, 19, 14	# load block +	lxvb16x		21, 20, 14	# load block +	lxvb16x		22, 21, 14	# load block +	addi		14, 14, 128 + +	Loop_aes_middle8x + +	xxlor	23+32, 10, 10 + +	cmpdi	10, 10 +	beq	Do_next_ghash_dec + +	# 192 bits +	xxlor	24+32, 11, 11 + +	vcipher	15, 15, 23 +	vcipher	16, 16, 23 +	vcipher	17, 17, 23 +	vcipher	18, 18, 23 +	vcipher	19, 19, 23 +	vcipher	20, 20, 23 +	vcipher	21, 21, 23 +	vcipher	22, 22, 23 + +	vcipher	15, 15, 24 +	vcipher	16, 16, 24 +	vcipher	17, 17, 24 +	vcipher	18, 18, 24 +	vcipher	19, 19, 24 +	vcipher	20, 20, 24 +	vcipher	21, 21, 24 +	vcipher	22, 22, 24 + +	xxlor	23+32, 12, 12 + +	cmpdi	10, 12 +	beq	Do_next_ghash_dec + +	# 256 bits +	xxlor	24+32, 13, 13 + +	vcipher	15, 15, 23 +	vcipher	16, 16, 23 +	vcipher	17, 17, 23 +	vcipher	18, 18, 23 +	vcipher	19, 19, 23 +	vcipher	20, 20, 23 +	vcipher	21, 21, 23 +	vcipher	22, 22, 23 + +	vcipher	15, 15, 24 +	vcipher	16, 16, 24 +	vcipher	17, 17, 24 +	vcipher	18, 18, 24 +	vcipher	19, 19, 24 +	vcipher	20, 20, 24 +	vcipher	21, 21, 24 +	vcipher	22, 22, 24 + +	xxlor	23+32, 14, 14 + +	cmpdi	10, 14 +	beq	Do_next_ghash_dec +	b	aes_gcm_out + +Do_next_ghash_dec: + +	# +	# last round +	vcipherlast     15, 15, 23 +	vcipherlast     16, 16, 23 + +	xxlxor		47, 47, 15 +	stxvb16x        47, 0, 9	# store output +	xxlxor		48, 48, 16 +	stxvb16x        48, 15, 9	# store output + +	vcipherlast     17, 17, 23 +	vcipherlast     18, 18, 23 + +	xxlxor		49, 49, 17 +	stxvb16x        49, 16, 9	# store output +	xxlxor		50, 50, 18 +	stxvb16x        50, 17, 9	# store output + +	vcipherlast     19, 19, 23 +	vcipherlast     20, 20, 23 + +	xxlxor		51, 51, 19 +	stxvb16x        51, 18, 9	# store output +	xxlxor		52, 52, 20 +	stxvb16x        52, 19, 9	# store output + +	vcipherlast     21, 21, 23 +	vcipherlast     22, 22, 23 + +	xxlxor		53, 53, 21 +	stxvb16x        53, 20, 9	# store output +	xxlxor		54, 54, 22 +	stxvb16x        54, 21, 9	# store output + +	addi		9, 9, 128 + +	xxlor           15+32, 15, 15 +	xxlor           16+32, 16, 16 +	xxlor           17+32, 17, 17 +	xxlor           18+32, 18, 18 +	xxlor           19+32, 19, 19 +	xxlor           20+32, 20, 20 +	xxlor           21+32, 21, 21 +	xxlor           22+32, 22, 22 + +	# ghash here +	ppc_aes_gcm_ghash2_4x + +	xxlor	27+32, 0, 0 +	vaddudm 30, 30, 31		# IV + counter +	vmr	29, 30 +	vxor    15, 30, 27		# add round key +	vaddudm 30, 30, 31 +	vxor    16, 30, 27 +	vaddudm 30, 30, 31 +	vxor    17, 30, 27 +	vaddudm 
30, 30, 31 +	vxor    18, 30, 27 +	vaddudm 30, 30, 31 +	vxor    19, 30, 27 +	vaddudm 30, 30, 31 +	vxor    20, 30, 27 +	vaddudm 30, 30, 31 +	vxor    21, 30, 27 +	vaddudm 30, 30, 31 +	vxor    22, 30, 27 + +	addi    12, 12, -128 +	addi    11, 11, 128 + +	bdnz	Loop_8x_block_dec + +	vmr	30, 29 +	stxvb16x 30+32, 0, 7		# update IV + +Loop_last_block_dec: +	cmpdi   12, 0 +	beq     aes_gcm_out + +	# loop last few blocks +	li      10, 16 +	divdu   10, 12, 10 + +	mtctr   10 + +	lwz	10, 240(6) + +	cmpdi   12, 16 +	blt     Final_block_dec + +Next_rem_block_dec: +	lxvb16x 15, 0, 14		# load block + +	Loop_aes_middle_1x + +	xxlor	23+32, 10, 10 + +	cmpdi	10, 10 +	beq	Do_next_1x_dec + +	# 192 bits +	xxlor	24+32, 11, 11 + +	vcipher	15, 15, 23 +	vcipher	15, 15, 24 + +	xxlor	23+32, 12, 12 + +	cmpdi	10, 12 +	beq	Do_next_1x_dec + +	# 256 bits +	xxlor	24+32, 13, 13 + +	vcipher	15, 15, 23 +	vcipher	15, 15, 24 + +	xxlor	23+32, 14, 14 + +	cmpdi	10, 14 +	beq	Do_next_1x_dec + +Do_next_1x_dec: +	vcipherlast     15, 15, 23 + +	xxlxor		47, 47, 15 +	stxvb16x	47, 0, 9	# store output +	addi		14, 14, 16 +	addi		9, 9, 16 + +	xxlor           28+32, 15, 15 +	#vmr		28, 15 +	ppc_update_hash_1x + +	addi		12, 12, -16 +	addi		11, 11, 16 +	xxlor		19+32, 0, 0 +	vaddudm		30, 30, 31		# IV + counter +	vxor		15, 30, 19		# add round key + +	bdnz	Next_rem_block_dec + +	li	15, 0 +	std	15, 56(7)		# clear partial? +	stxvb16x 30+32, 0, 7		# update IV +	cmpdi	12, 0 +	beq	aes_gcm_out + +Final_block_dec: +	lwz	10, 240(6) +	Loop_aes_middle_1x + +	xxlor	23+32, 10, 10 + +	cmpdi	10, 10 +	beq	Do_final_1x_dec + +	# 192 bits +	xxlor	24+32, 11, 11 + +	vcipher	15, 15, 23 +	vcipher	15, 15, 24 + +	xxlor	23+32, 12, 12 + +	cmpdi	10, 12 +	beq	Do_final_1x_dec + +	# 256 bits +	xxlor	24+32, 13, 13 + +	vcipher	15, 15, 23 +	vcipher	15, 15, 24 + +	xxlor	23+32, 14, 14 + +	cmpdi	10, 14 +	beq	Do_final_1x_dec + +Do_final_1x_dec: +	vcipherlast     15, 15, 23 + +	# check partial block +	li	21, 1			# decrypt +	ld	15, 56(7)		# partial? +	cmpdi	15, 0 +	beq	Normal_block_dec +	bl	Do_partial_block +	cmpdi	12, 0 +	ble aes_gcm_out + +	b Continue_partial_check_dec + +Normal_block_dec: +	lxvb16x	15, 0, 14		# load last block +	xxlxor	47, 47, 15 + +	# create partial block mask +	li	15, 16 +	sub	15, 15, 12		# index to the mask + +	vspltisb	16, -1		# first 16 bytes - 0xffff...ff +	vspltisb	17, 0		# second 16 bytes - 0x0000...00 +	li	10, 192 +	stvx	16, 10, 1 +	addi	10, 10, 16 +	stvx	17, 10, 1 + +	addi	10, 1, 192 +	lxvb16x	16, 15, 10		# load partial block mask +	xxland	47, 47, 16 + +	xxland	32+28, 15, 16 +	#vmr	28, 15 +	ppc_update_hash_1x + +	# * should store only the remaining bytes. +	bl	Write_partial_block + +	stxvb16x 30+32, 0, 7		# update IV +	std	12, 56(7)		# update partial? +	li	16, 16 + +	stxvb16x	32, 0, 8		# write out Xi +	stxvb16x	32, 16, 8		# write out Xi +	b aes_gcm_out diff --git a/arch/powerpc/crypto/aesp8-ppc.pl b/arch/powerpc/crypto/aesp8-ppc.pl new file mode 100644 index 000000000000..1f22aec27d79 --- /dev/null +++ b/arch/powerpc/crypto/aesp8-ppc.pl @@ -0,0 +1,585 @@ +#! /usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from CRYPTOGAMs[1] and is included here using the option +# in the license to distribute the code under the GPL. Therefore this program +# is free software; you can redistribute it and/or modify it under the terms of +# the GNU General Public License version 2 as published by the Free Software +# Foundation. 
+# +# [1] https://www.openssl.org/~appro/cryptogams/ + +# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org> +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +#       * Redistributions of source code must retain copyright notices, +#         this list of conditions and the following disclaimer. +# +#       * Redistributions in binary form must reproduce the above +#         copyright notice, this list of conditions and the following +#         disclaimer in the documentation and/or other materials +#         provided with the distribution. +# +#       * Neither the name of the CRYPTOGAMS nor the names of its +#         copyright holder and contributors may be used to endorse or +#         promote products derived from this software without specific +#         prior written permission. +# +# ALTERNATIVELY, provided that this notice is retained in full, this +# product may be distributed under the terms of the GNU General Public +# License (GPL), in which case the provisions of the GPL apply INSTEAD OF +# those given above. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for AES instructions as per PowerISA +# specification version 2.07, first implemented by POWER8 processor. +# The module is endian-agnostic in sense that it supports both big- +# and little-endian cases. Data alignment in parallelizable modes is +# handled with VSX loads and stores, which implies MSR.VSX flag being +# set. It should also be noted that ISA specification doesn't prohibit +# alignment exceptions for these instructions on page boundaries. +# Initially alignment was handled in pure AltiVec/VMX way [when data +# is aligned programmatically, which in turn guarantees exception- +# free execution], but it turned to hamper performance when vcipher +# instructions are interleaved. It's reckoned that eventual +# misalignment penalties at page boundaries are in average lower +# than additional overhead in pure AltiVec approach. +# +# May 2016 +# +# Add XTS subroutine, 9x on little- and 12x improvement on big-endian +# systems were measured. +# +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). 
+# +#		CBC en-/decrypt	CTR	XTS +# POWER8[le]	3.96/0.72	0.74	1.1 +# POWER8[be]	3.75/0.65	0.66	1.0 + +$flavour = shift; + +if ($flavour =~ /64/) { +	$SIZE_T	=8; +	$LRSAVE	=2*$SIZE_T; +	$STU	="stdu"; +	$POP	="ld"; +	$PUSH	="std"; +	$UCMP	="cmpld"; +	$SHL	="sldi"; +} elsif ($flavour =~ /32/) { +	$SIZE_T	=4; +	$LRSAVE	=$SIZE_T; +	$STU	="stwu"; +	$POP	="lwz"; +	$PUSH	="stw"; +	$UCMP	="cmplw"; +	$SHL	="slwi"; +} else { die "nonsense $flavour"; } + +$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$FRAME=8*$SIZE_T; +$prefix="aes_p8"; + +$sp="r1"; +$vrsave="r12"; + +######################################################################### +{{{	# Key setup procedures						# +my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8)); +my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6)); +my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11)); + +$code.=<<___; +.machine	"any" + +.text + +.align	7 +rcon: +.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev +.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev +.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev +.long	0,0,0,0						?asis +Lconsts: +	mflr	r0 +	bcl	20,31,\$+4 +	mflr	$ptr	 #vvvvv "distance between . and rcon +	addi	$ptr,$ptr,-0x48 +	mtlr	r0 +	blr +	.long	0 +	.byte	0,12,0x14,0,0,0,0,0 +.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>" + +.globl	.${prefix}_set_encrypt_key +Lset_encrypt_key: +	mflr		r11 +	$PUSH		r11,$LRSAVE($sp) + +	li		$ptr,-1 +	${UCMP}i	$inp,0 +	beq-		Lenc_key_abort		# if ($inp==0) return -1; +	${UCMP}i	$out,0 +	beq-		Lenc_key_abort		# if ($out==0) return -1; +	li		$ptr,-2 +	cmpwi		$bits,128 +	blt-		Lenc_key_abort +	cmpwi		$bits,256 +	bgt-		Lenc_key_abort +	andi.		
r0,$bits,0x3f +	bne-		Lenc_key_abort + +	lis		r0,0xfff0 +	mfspr		$vrsave,256 +	mtspr		256,r0 + +	bl		Lconsts +	mtlr		r11 + +	neg		r9,$inp +	lvx		$in0,0,$inp +	addi		$inp,$inp,15		# 15 is not typo +	lvsr		$key,0,r9		# borrow $key +	li		r8,0x20 +	cmpwi		$bits,192 +	lvx		$in1,0,$inp +	le?vspltisb	$mask,0x0f		# borrow $mask +	lvx		$rcon,0,$ptr +	le?vxor		$key,$key,$mask		# adjust for byte swap +	lvx		$mask,r8,$ptr +	addi		$ptr,$ptr,0x10 +	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE] +	li		$cnt,8 +	vxor		$zero,$zero,$zero +	mtctr		$cnt + +	?lvsr		$outperm,0,$out +	vspltisb	$outmask,-1 +	lvx		$outhead,0,$out +	?vperm		$outmask,$zero,$outmask,$outperm + +	blt		Loop128 +	addi		$inp,$inp,8 +	beq		L192 +	addi		$inp,$inp,8 +	b		L256 + +.align	4 +Loop128: +	vperm		$key,$in0,$in0,$mask	# rotate-n-splat +	vsldoi		$tmp,$zero,$in0,12	# >>32 +	 vperm		$outtail,$in0,$in0,$outperm	# rotate +	 vsel		$stage,$outhead,$outtail,$outmask +	 vmr		$outhead,$outtail +	vcipherlast	$key,$key,$rcon +	 stvx		$stage,0,$out +	 addi		$out,$out,16 + +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in0,$in0,$tmp +	 vadduwm	$rcon,$rcon,$rcon +	vxor		$in0,$in0,$key +	bdnz		Loop128 + +	lvx		$rcon,0,$ptr		# last two round keys + +	vperm		$key,$in0,$in0,$mask	# rotate-n-splat +	vsldoi		$tmp,$zero,$in0,12	# >>32 +	 vperm		$outtail,$in0,$in0,$outperm	# rotate +	 vsel		$stage,$outhead,$outtail,$outmask +	 vmr		$outhead,$outtail +	vcipherlast	$key,$key,$rcon +	 stvx		$stage,0,$out +	 addi		$out,$out,16 + +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in0,$in0,$tmp +	 vadduwm	$rcon,$rcon,$rcon +	vxor		$in0,$in0,$key + +	vperm		$key,$in0,$in0,$mask	# rotate-n-splat +	vsldoi		$tmp,$zero,$in0,12	# >>32 +	 vperm		$outtail,$in0,$in0,$outperm	# rotate +	 vsel		$stage,$outhead,$outtail,$outmask +	 vmr		$outhead,$outtail +	vcipherlast	$key,$key,$rcon +	 stvx		$stage,0,$out +	 addi		$out,$out,16 + +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in0,$in0,$tmp +	vxor		$in0,$in0,$key +	 vperm		$outtail,$in0,$in0,$outperm	# rotate +	 vsel		$stage,$outhead,$outtail,$outmask +	 vmr		$outhead,$outtail +	 stvx		$stage,0,$out + +	addi		$inp,$out,15		# 15 is not typo +	addi		$out,$out,0x50 + +	li		$rounds,10 +	b		Ldone + +.align	4 +L192: +	lvx		$tmp,0,$inp +	li		$cnt,4 +	 vperm		$outtail,$in0,$in0,$outperm	# rotate +	 vsel		$stage,$outhead,$outtail,$outmask +	 vmr		$outhead,$outtail +	 stvx		$stage,0,$out +	 addi		$out,$out,16 +	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE] +	vspltisb	$key,8			# borrow $key +	mtctr		$cnt +	vsububm		$mask,$mask,$key	# adjust the mask + +Loop192: +	vperm		$key,$in1,$in1,$mask	# roate-n-splat +	vsldoi		$tmp,$zero,$in0,12	# >>32 +	vcipherlast	$key,$key,$rcon + +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in0,$in0,$tmp + +	 vsldoi		$stage,$zero,$in1,8 +	vspltw		$tmp,$in0,3 +	vxor		$tmp,$tmp,$in1 +	vsldoi		$in1,$zero,$in1,12	# >>32 +	 vadduwm	$rcon,$rcon,$rcon +	vxor		$in1,$in1,$tmp +	vxor		$in0,$in0,$key +	vxor		$in1,$in1,$key +	 vsldoi		$stage,$stage,$in0,8 + +	vperm		$key,$in1,$in1,$mask	# rotate-n-splat +	vsldoi		$tmp,$zero,$in0,12	# >>32 +	 vperm		$outtail,$stage,$stage,$outperm	# rotate +	 vsel		$stage,$outhead,$outtail,$outmask +	 vmr		$outhead,$outtail +	vcipherlast	$key,$key,$rcon +	
 stvx		$stage,0,$out +	 addi		$out,$out,16 + +	 vsldoi		$stage,$in0,$in1,8 +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	 vperm		$outtail,$stage,$stage,$outperm	# rotate +	 vsel		$stage,$outhead,$outtail,$outmask +	 vmr		$outhead,$outtail +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in0,$in0,$tmp +	 stvx		$stage,0,$out +	 addi		$out,$out,16 + +	vspltw		$tmp,$in0,3 +	vxor		$tmp,$tmp,$in1 +	vsldoi		$in1,$zero,$in1,12	# >>32 +	 vadduwm	$rcon,$rcon,$rcon +	vxor		$in1,$in1,$tmp +	vxor		$in0,$in0,$key +	vxor		$in1,$in1,$key +	 vperm		$outtail,$in0,$in0,$outperm	# rotate +	 vsel		$stage,$outhead,$outtail,$outmask +	 vmr		$outhead,$outtail +	 stvx		$stage,0,$out +	 addi		$inp,$out,15		# 15 is not typo +	 addi		$out,$out,16 +	bdnz		Loop192 + +	li		$rounds,12 +	addi		$out,$out,0x20 +	b		Ldone + +.align	4 +L256: +	lvx		$tmp,0,$inp +	li		$cnt,7 +	li		$rounds,14 +	 vperm		$outtail,$in0,$in0,$outperm	# rotate +	 vsel		$stage,$outhead,$outtail,$outmask +	 vmr		$outhead,$outtail +	 stvx		$stage,0,$out +	 addi		$out,$out,16 +	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE] +	mtctr		$cnt + +Loop256: +	vperm		$key,$in1,$in1,$mask	# rotate-n-splat +	vsldoi		$tmp,$zero,$in0,12	# >>32 +	 vperm		$outtail,$in1,$in1,$outperm	# rotate +	 vsel		$stage,$outhead,$outtail,$outmask +	 vmr		$outhead,$outtail +	vcipherlast	$key,$key,$rcon +	 stvx		$stage,0,$out +	 addi		$out,$out,16 + +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in0,$in0,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in0,$in0,$tmp +	 vadduwm	$rcon,$rcon,$rcon +	vxor		$in0,$in0,$key +	 vperm		$outtail,$in0,$in0,$outperm	# rotate +	 vsel		$stage,$outhead,$outtail,$outmask +	 vmr		$outhead,$outtail +	 stvx		$stage,0,$out +	 addi		$inp,$out,15		# 15 is not typo +	 addi		$out,$out,16 +	bdz		Ldone + +	vspltw		$key,$in0,3		# just splat +	vsldoi		$tmp,$zero,$in1,12	# >>32 +	vsbox		$key,$key + +	vxor		$in1,$in1,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in1,$in1,$tmp +	vsldoi		$tmp,$zero,$tmp,12	# >>32 +	vxor		$in1,$in1,$tmp + +	vxor		$in1,$in1,$key +	b		Loop256 + +.align	4 +Ldone: +	lvx		$in1,0,$inp		# redundant in aligned case +	vsel		$in1,$outhead,$in1,$outmask +	stvx		$in1,0,$inp +	li		$ptr,0 +	mtspr		256,$vrsave +	stw		$rounds,0($out) + +Lenc_key_abort: +	mr		r3,$ptr +	blr +	.long		0 +	.byte		0,12,0x14,1,0,0,3,0 +	.long		0 +.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key + +.globl	.${prefix}_set_decrypt_key +	$STU		$sp,-$FRAME($sp) +	mflr		r10 +	$PUSH		r10,$FRAME+$LRSAVE($sp) +	bl		Lset_encrypt_key +	mtlr		r10 + +	cmpwi		r3,0 +	bne-		Ldec_key_abort + +	slwi		$cnt,$rounds,4 +	subi		$inp,$out,240		# first round key +	srwi		$rounds,$rounds,1 +	add		$out,$inp,$cnt		# last round key +	mtctr		$rounds + +Ldeckey: +	lwz		r0, 0($inp) +	lwz		r6, 4($inp) +	lwz		r7, 8($inp) +	lwz		r8, 12($inp) +	addi		$inp,$inp,16 +	lwz		r9, 0($out) +	lwz		r10,4($out) +	lwz		r11,8($out) +	lwz		r12,12($out) +	stw		r0, 0($out) +	stw		r6, 4($out) +	stw		r7, 8($out) +	stw		r8, 12($out) +	subi		$out,$out,16 +	stw		r9, -16($inp) +	stw		r10,-12($inp) +	stw		r11,-8($inp) +	stw		r12,-4($inp) +	bdnz		Ldeckey + +	xor		r3,r3,r3		# return value +Ldec_key_abort: +	addi		$sp,$sp,$FRAME +	blr +	.long		0 +	.byte		0,12,4,1,0x80,0,3,0 +	.long		0 +.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key +___ +}}} +######################################################################### +{{{	# Single block en- and decrypt procedures			# +sub gen_block () { +my $dir = shift; +my $n   = $dir eq "de" ? 
"n" : ""; +my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7)); + +$code.=<<___; +.globl	.${prefix}_${dir}crypt +	lwz		$rounds,240($key) +	lis		r0,0xfc00 +	mfspr		$vrsave,256 +	li		$idx,15			# 15 is not typo +	mtspr		256,r0 + +	lvx		v0,0,$inp +	neg		r11,$out +	lvx		v1,$idx,$inp +	lvsl		v2,0,$inp		# inpperm +	le?vspltisb	v4,0x0f +	?lvsl		v3,0,r11		# outperm +	le?vxor		v2,v2,v4 +	li		$idx,16 +	vperm		v0,v0,v1,v2		# align [and byte swap in LE] +	lvx		v1,0,$key +	?lvsl		v5,0,$key		# keyperm +	srwi		$rounds,$rounds,1 +	lvx		v2,$idx,$key +	addi		$idx,$idx,16 +	subi		$rounds,$rounds,1 +	?vperm		v1,v1,v2,v5		# align round key + +	vxor		v0,v0,v1 +	lvx		v1,$idx,$key +	addi		$idx,$idx,16 +	mtctr		$rounds + +Loop_${dir}c: +	?vperm		v2,v2,v1,v5 +	v${n}cipher	v0,v0,v2 +	lvx		v2,$idx,$key +	addi		$idx,$idx,16 +	?vperm		v1,v1,v2,v5 +	v${n}cipher	v0,v0,v1 +	lvx		v1,$idx,$key +	addi		$idx,$idx,16 +	bdnz		Loop_${dir}c + +	?vperm		v2,v2,v1,v5 +	v${n}cipher	v0,v0,v2 +	lvx		v2,$idx,$key +	?vperm		v1,v1,v2,v5 +	v${n}cipherlast	v0,v0,v1 + +	vspltisb	v2,-1 +	vxor		v1,v1,v1 +	li		$idx,15			# 15 is not typo +	?vperm		v2,v1,v2,v3		# outmask +	le?vxor		v3,v3,v4 +	lvx		v1,0,$out		# outhead +	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE] +	vsel		v1,v1,v0,v2 +	lvx		v4,$idx,$out +	stvx		v1,0,$out +	vsel		v0,v0,v4,v2 +	stvx		v0,$idx,$out + +	mtspr		256,$vrsave +	blr +	.long		0 +	.byte		0,12,0x14,0,0,0,3,0 +	.long		0 +.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} + +my $consts=1; +foreach(split("\n",$code)) { +        s/\`([^\`]*)\`/eval($1)/geo; + +	# constants table endian-specific conversion +	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) { +	    my $conv=$3; +	    my @bytes=(); + +	    # convert to endian-agnostic format +	    if ($1 eq "long") { +	      foreach (split(/,\s*/,$2)) { +		my $l = /^0/?oct:int; +		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff; +	      } +	    } else { +		@bytes = map(/^0/?oct:int,split(/,\s*/,$2)); +	    } + +	    # little-endian conversion +	    if ($flavour =~ /le$/o) { +		SWITCH: for($conv)  { +		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; }; +		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; }; +		} +	    } + +	    #emit +	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n"; +	    next; +	} +	$consts=0 if (m/Lconsts:/o);	# end of table + +	# instructions prefixed with '?' are endian-specific and need +	# to be adjusted accordingly... +	if ($flavour =~ /le$/o) {	# little-endian +	    s/le\?//o		or +	    s/be\?/#be#/o	or +	    s/\?lvsr/lvsl/o	or +	    s/\?lvsl/lvsr/o	or +	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or +	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or +	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o; +	} else {			# big-endian +	    s/le\?/#le#/o	or +	    s/be\?//o		or +	    s/\?([a-z]+)/$1/o; +	} + +        print $_,"\n"; +} + +close STDOUT; diff --git a/arch/powerpc/crypto/ghashp8-ppc.pl b/arch/powerpc/crypto/ghashp8-ppc.pl new file mode 100644 index 000000000000..b56603b4a893 --- /dev/null +++ b/arch/powerpc/crypto/ghashp8-ppc.pl @@ -0,0 +1,370 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. 
Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for PowerISA v2.07. +# +# July 2014 +# +# Accurate performance measurements are problematic, because it's +# always virtualized setup with possibly throttled processor. +# Relative comparison is therefore more informative. This initial +# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x +# faster than "4-bit" integer-only compiler-generated 64-bit code. +# "Initial version" means that there is room for futher improvement. + +$flavour=shift; +$output =shift; + +if ($flavour =~ /64/) { +	$SIZE_T=8; +	$LRSAVE=2*$SIZE_T; +	$STU="stdu"; +	$POP="ld"; +	$PUSH="std"; +} elsif ($flavour =~ /32/) { +	$SIZE_T=4; +	$LRSAVE=$SIZE_T; +	$STU="stwu"; +	$POP="lwz"; +	$PUSH="stw"; +} else { die "nonsense $flavour"; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; + +my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block + +my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3)); +my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12)); +my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19)); +my $vrsave="r12"; +my ($t4,$t5,$t6) = ($Hl,$H,$Hh); + +$code=<<___; +.machine	"any" + +.text + +.globl	.gcm_init_p8 +	lis		r0,0xfff0 +	li		r8,0x10 +	mfspr		$vrsave,256 +	li		r9,0x20 +	mtspr		256,r0 +	li		r10,0x30 +	lvx_u		$H,0,r4			# load H +	le?xor		r7,r7,r7 +	le?addi		r7,r7,0x8		# need a vperm start with 08 +	le?lvsr		5,0,r7 +	le?vspltisb	6,0x0f +	le?vxor		5,5,6			# set a b-endian mask +	le?vperm	$H,$H,$H,5 + +	vspltisb	$xC2,-16		# 0xf0 +	vspltisb	$t0,1			# one +	vaddubm		$xC2,$xC2,$xC2		# 0xe0 +	vxor		$zero,$zero,$zero +	vor		$xC2,$xC2,$t0		# 0xe1 +	vsldoi		$xC2,$xC2,$zero,15	# 0xe1... +	vsldoi		$t1,$zero,$t0,1		# ...1 +	vaddubm		$xC2,$xC2,$xC2		# 0xc2... +	vspltisb	$t2,7 +	vor		$xC2,$xC2,$t1		# 0xc2....01 +	vspltb		$t1,$H,0		# most significant byte +	vsl		$H,$H,$t0		# H<<=1 +	vsrab		$t1,$t1,$t2		# broadcast carry bit +	vand		$t1,$t1,$xC2 +	vxor		$H,$H,$t1		# twisted H + +	vsldoi		$H,$H,$H,8		# twist even more ... +	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0 +	vsldoi		$Hl,$zero,$H,8		# ... and split +	vsldoi		$Hh,$H,$zero,8 + +	stvx_u		$xC2,0,r3		# save pre-computed table +	stvx_u		$Hl,r8,r3 +	stvx_u		$H, r9,r3 +	stvx_u		$Hh,r10,r3 + +	mtspr		256,$vrsave +	blr +	.long		0 +	.byte		0,12,0x14,0,0,0,2,0 +	.long		0 +.size	.gcm_init_p8,.-.gcm_init_p8 + +.globl	.gcm_init_htable +	lis		r0,0xfff0 +	li		r8,0x10 +	mfspr		$vrsave,256 +	li		r9,0x20 +	mtspr		256,r0 +	li		r10,0x30 +	lvx_u		$H,0,r4			# load H + +	vspltisb	$xC2,-16		# 0xf0 +	vspltisb	$t0,1			# one +	vaddubm		$xC2,$xC2,$xC2		# 0xe0 +	vxor		$zero,$zero,$zero +	vor		$xC2,$xC2,$t0		# 0xe1 +	vsldoi		$xC2,$xC2,$zero,15	# 0xe1... 
+	vsldoi		$t1,$zero,$t0,1		# ...1 +	vaddubm		$xC2,$xC2,$xC2		# 0xc2... +	vspltisb	$t2,7 +	vor		$xC2,$xC2,$t1		# 0xc2....01 +	vspltb		$t1,$H,0		# most significant byte +	vsl		$H,$H,$t0		# H<<=1 +	vsrab		$t1,$t1,$t2		# broadcast carry bit +	vand		$t1,$t1,$xC2 +	vxor		$IN,$H,$t1		# twisted H + +	vsldoi		$H,$IN,$IN,8		# twist even more ... +	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0 +	vsldoi		$Hl,$zero,$H,8		# ... and split +	vsldoi		$Hh,$H,$zero,8 + +	stvx_u		$xC2,0,r3		# save pre-computed table +	stvx_u		$Hl,r8,r3 +	li		r8,0x40 +	stvx_u		$H, r9,r3 +	li		r9,0x50 +	stvx_u		$Hh,r10,r3 +	li		r10,0x60 + +	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo +	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi +	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi + +	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase + +	vsldoi		$t0,$Xm,$zero,8 +	vsldoi		$t1,$zero,$Xm,8 +	vxor		$Xl,$Xl,$t0 +	vxor		$Xh,$Xh,$t1 + +	vsldoi		$Xl,$Xl,$Xl,8 +	vxor		$Xl,$Xl,$t2 + +	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase +	vpmsumd		$Xl,$Xl,$xC2 +	vxor		$t1,$t1,$Xh +	vxor		$IN1,$Xl,$t1 + +	vsldoi		$H2,$IN1,$IN1,8 +	vsldoi		$H2l,$zero,$H2,8 +	vsldoi		$H2h,$H2,$zero,8 + +	stvx_u		$H2l,r8,r3		# save H^2 +	li		r8,0x70 +	stvx_u		$H2,r9,r3 +	li		r9,0x80 +	stvx_u		$H2h,r10,r3 +	li		r10,0x90 + +	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo +	 vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo +	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi +	 vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi +	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi +	 vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi + +	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase +	 vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase + +	vsldoi		$t0,$Xm,$zero,8 +	vsldoi		$t1,$zero,$Xm,8 +	 vsldoi		$t4,$Xm1,$zero,8 +	 vsldoi		$t5,$zero,$Xm1,8 +	vxor		$Xl,$Xl,$t0 +	vxor		$Xh,$Xh,$t1 +	 vxor		$Xl1,$Xl1,$t4 +	 vxor		$Xh1,$Xh1,$t5 + +	vsldoi		$Xl,$Xl,$Xl,8 +	 vsldoi		$Xl1,$Xl1,$Xl1,8 +	vxor		$Xl,$Xl,$t2 +	 vxor		$Xl1,$Xl1,$t6 + +	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase +	 vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase +	vpmsumd		$Xl,$Xl,$xC2 +	 vpmsumd	$Xl1,$Xl1,$xC2 +	vxor		$t1,$t1,$Xh +	 vxor		$t5,$t5,$Xh1 +	vxor		$Xl,$Xl,$t1 +	 vxor		$Xl1,$Xl1,$t5 + +	vsldoi		$H,$Xl,$Xl,8 +	 vsldoi		$H2,$Xl1,$Xl1,8 +	vsldoi		$Hl,$zero,$H,8 +	vsldoi		$Hh,$H,$zero,8 +	 vsldoi		$H2l,$zero,$H2,8 +	 vsldoi		$H2h,$H2,$zero,8 + +	stvx_u		$Hl,r8,r3		# save H^3 +	li		r8,0xa0 +	stvx_u		$H,r9,r3 +	li		r9,0xb0 +	stvx_u		$Hh,r10,r3 +	li		r10,0xc0 +	 stvx_u		$H2l,r8,r3		# save H^4 +	 stvx_u		$H2,r9,r3 +	 stvx_u		$H2h,r10,r3 + +	mtspr		256,$vrsave +	blr +	.long		0 +	.byte		0,12,0x14,0,0,0,2,0 +	.long		0 +.size	.gcm_init_htable,.-.gcm_init_htable + +.globl	.gcm_gmult_p8 +	lis		r0,0xfff8 +	li		r8,0x10 +	mfspr		$vrsave,256 +	li		r9,0x20 +	mtspr		256,r0 +	li		r10,0x30 +	lvx_u		$IN,0,$Xip		# load Xi + +	lvx_u		$Hl,r8,$Htbl		# load pre-computed table +	 le?lvsl	$lemask,r0,r0 +	lvx_u		$H, r9,$Htbl +	 le?vspltisb	$t0,0x07 +	lvx_u		$Hh,r10,$Htbl +	 le?vxor	$lemask,$lemask,$t0 +	lvx_u		$xC2,0,$Htbl +	 le?vperm	$IN,$IN,$IN,$lemask +	vxor		$zero,$zero,$zero + +	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo +	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi +	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi + +	vpmsumd		$t2,$Xl,$xC2		# 1st phase + +	vsldoi		$t0,$Xm,$zero,8 +	vsldoi		$t1,$zero,$Xm,8 +	vxor		$Xl,$Xl,$t0 +	vxor		$Xh,$Xh,$t1 + +	vsldoi		$Xl,$Xl,$Xl,8 +	vxor		$Xl,$Xl,$t2 + +	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase +	vpmsumd		$Xl,$Xl,$xC2 +	vxor		$t1,$t1,$Xh +	vxor		$Xl,$Xl,$t1 + +	le?vperm	$Xl,$Xl,$Xl,$lemask +	stvx_u		$Xl,0,$Xip		# write out Xi + +	mtspr		256,$vrsave +	blr +	.long		0 +	.byte		0,12,0x14,0,0,0,2,0 +	.long		0 
+.size	.gcm_gmult_p8,.-.gcm_gmult_p8 + +.globl	.gcm_ghash_p8 +	lis		r0,0xfff8 +	li		r8,0x10 +	mfspr		$vrsave,256 +	li		r9,0x20 +	mtspr		256,r0 +	li		r10,0x30 +	lvx_u		$Xl,0,$Xip		# load Xi + +	lvx_u		$Hl,r8,$Htbl		# load pre-computed table +	 le?lvsl	$lemask,r0,r0 +	lvx_u		$H, r9,$Htbl +	 le?vspltisb	$t0,0x07 +	lvx_u		$Hh,r10,$Htbl +	 le?vxor	$lemask,$lemask,$t0 +	lvx_u		$xC2,0,$Htbl +	 le?vperm	$Xl,$Xl,$Xl,$lemask +	vxor		$zero,$zero,$zero + +	lvx_u		$IN,0,$inp +	addi		$inp,$inp,16 +	subi		$len,$len,16 +	 le?vperm	$IN,$IN,$IN,$lemask +	vxor		$IN,$IN,$Xl +	b		Loop + +.align	5 +Loop: +	 subic		$len,$len,16 +	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo +	 subfe.		r0,r0,r0		# borrow?-1:0 +	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi +	 and		r0,r0,$len +	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi +	 add		$inp,$inp,r0 + +	vpmsumd		$t2,$Xl,$xC2		# 1st phase + +	vsldoi		$t0,$Xm,$zero,8 +	vsldoi		$t1,$zero,$Xm,8 +	vxor		$Xl,$Xl,$t0 +	vxor		$Xh,$Xh,$t1 + +	vsldoi		$Xl,$Xl,$Xl,8 +	vxor		$Xl,$Xl,$t2 +	 lvx_u		$IN,0,$inp +	 addi		$inp,$inp,16 + +	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase +	vpmsumd		$Xl,$Xl,$xC2 +	 le?vperm	$IN,$IN,$IN,$lemask +	vxor		$t1,$t1,$Xh +	vxor		$IN,$IN,$t1 +	vxor		$IN,$IN,$Xl +	beq		Loop			# did $len-=16 borrow? + +	vxor		$Xl,$Xl,$t1 +	le?vperm	$Xl,$Xl,$Xl,$lemask +	stvx_u		$Xl,0,$Xip		# write out Xi + +	mtspr		256,$vrsave +	blr +	.long		0 +	.byte		0,12,0x14,0,0,0,4,0 +	.long		0 +.size	.gcm_ghash_p8,.-.gcm_ghash_p8 + +.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>" +.align  2 +___ + +foreach (split("\n",$code)) { +	if ($flavour =~ /le$/o) {	# little-endian +	    s/le\?//o		or +	    s/be\?/#be#/o; +	} else { +	    s/le\?/#le#/o	or +	    s/be\?//o; +	} +	print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/arch/powerpc/crypto/ppc-xlate.pl b/arch/powerpc/crypto/ppc-xlate.pl new file mode 100644 index 000000000000..23cca703ce29 --- /dev/null +++ b/arch/powerpc/crypto/ppc-xlate.pl @@ -0,0 +1,229 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# PowerPC assembler distiller by <appro>. + +my $flavour = shift; +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +my %GLOBALS; +my $dotinlocallabels=($flavour=~/linux/)?1:0; + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $globl = sub { +    my $junk = shift; +    my $name = shift; +    my $global = \$GLOBALS{$name}; +    my $ret; + +    $name =~ s|^[\.\_]||; + +    SWITCH: for ($flavour) { +	/aix/		&& do { $name = ".$name"; +				last; +			      }; +	/osx/		&& do { $name = "_$name"; +				last; +			      }; +	/linux/ +			&& do {	$ret = "_GLOBAL($name)"; +				last; +			      }; +    } + +    $ret = ".globl	$name\nalign 5\n$name:" if (!$ret); +    $$global = $name; +    $ret; +}; +my $text = sub { +    my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text"; +    $ret = ".abiversion	2\n".$ret	if ($flavour =~ /linux.*64le/); +    $ret; +}; +my $machine = sub { +    my $junk = shift; +    my $arch = shift; +    if ($flavour =~ /osx/) +    {	$arch =~ s/\"//g; +	$arch = ($flavour=~/64/) ? 
"ppc970-64" : "ppc970" if ($arch eq "any"); +    } +    ".machine	$arch"; +}; +my $size = sub { +    if ($flavour =~ /linux/) +    {	shift; +	my $name = shift; $name =~ s|^[\.\_]||; +	my $ret  = ".size	$name,.-".($flavour=~/64$/?".":"").$name; +	$ret .= "\n.size	.$name,.-.$name" if ($flavour=~/64$/); +	$ret; +    } +    else +    {	"";	} +}; +my $asciz = sub { +    shift; +    my $line = join(",",@_); +    if ($line =~ /^"(.*)"$/) +    {	".byte	" . join(",",unpack("C*",$1),0) . "\n.align	2";	} +    else +    {	"";	} +}; +my $quad = sub { +    shift; +    my @ret; +    my ($hi,$lo); +    for (@_) { +	if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io) +	{  $hi=$1?"0x$1":"0"; $lo="0x$2";  } +	elsif (/^([0-9]+)$/o) +	{  $hi=$1>>32; $lo=$1&0xffffffff;  } # error-prone with 32-bit perl +	else +	{  $hi=undef; $lo=$_; } + +	if (defined($hi)) +	{  push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo");  } +	else +	{  push(@ret,".quad	$lo");  } +    } +    join("\n",@ret); +}; + +################################################################ +# simplified mnemonics not handled by at least one assembler +################################################################ +my $cmplw = sub { +    my $f = shift; +    my $cr = 0; $cr = shift if ($#_>1); +    # Some out-of-date 32-bit GNU assembler just can't handle cmplw... +    ($flavour =~ /linux.*32/) ? +	"	.long	".sprintf "0x%x",31<<26|$cr<<23|$_[0]<<16|$_[1]<<11|64 : +	"	cmplw	".join(',',$cr,@_); +}; +my $bdnz = sub { +    my $f = shift; +    my $bo = $f=~/[\+\-]/ ? 16+9 : 16;	# optional "to be taken" hint +    "	bc	$bo,0,".shift; +} if ($flavour!~/linux/); +my $bltlr = sub { +    my $f = shift; +    my $bo = $f=~/\-/ ? 12+2 : 12;	# optional "not to be taken" hint +    ($flavour =~ /linux/) ?		# GNU as doesn't allow most recent hints +	"	.long	".sprintf "0x%x",19<<26|$bo<<21|16<<1 : +	"	bclr	$bo,0"; +}; +my $bnelr = sub { +    my $f = shift; +    my $bo = $f=~/\-/ ? 4+2 : 4;	# optional "not to be taken" hint +    ($flavour =~ /linux/) ?		# GNU as doesn't allow most recent hints +	"	.long	".sprintf "0x%x",19<<26|$bo<<21|2<<16|16<<1 : +	"	bclr	$bo,2"; +}; +my $beqlr = sub { +    my $f = shift; +    my $bo = $f=~/-/ ? 12+2 : 12;	# optional "not to be taken" hint +    ($flavour =~ /linux/) ?		# GNU as doesn't allow most recent hints +	"	.long	".sprintf "0x%X",19<<26|$bo<<21|2<<16|16<<1 : +	"	bclr	$bo,2"; +}; +# GNU assembler can't handle extrdi rA,rS,16,48, or when sum of last two +# arguments is 64, with "operand out of range" error. +my $extrdi = sub { +    my ($f,$ra,$rs,$n,$b) = @_; +    $b = ($b+$n)&63; $n = 64-$n; +    "	rldicl	$ra,$rs,$b,$n"; +}; +my $vmr = sub { +    my ($f,$vx,$vy) = @_; +    "	vor	$vx,$vy,$vy"; +}; + +# Some ABIs specify vrsave, special-purpose register #256, as reserved +# for system use. 
+my $no_vrsave = ($flavour =~ /linux-ppc64le/); +my $mtspr = sub { +    my ($f,$idx,$ra) = @_; +    if ($idx == 256 && $no_vrsave) { +	"	or	$ra,$ra,$ra"; +    } else { +	"	mtspr	$idx,$ra"; +    } +}; +my $mfspr = sub { +    my ($f,$rd,$idx) = @_; +    if ($idx == 256 && $no_vrsave) { +	"	li	$rd,-1"; +    } else { +	"	mfspr	$rd,$idx"; +    } +}; + +# PowerISA 2.06 stuff +sub vsxmem_op { +    my ($f, $vrt, $ra, $rb, $op) = @_; +    "	.long	".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1); +} +# made-up unaligned memory reference AltiVec/VMX instructions +my $lvx_u	= sub {	vsxmem_op(@_, 844); };	# lxvd2x +my $stvx_u	= sub {	vsxmem_op(@_, 972); };	# stxvd2x +my $lvdx_u	= sub {	vsxmem_op(@_, 588); };	# lxsdx +my $stvdx_u	= sub {	vsxmem_op(@_, 716); };	# stxsdx +my $lvx_4w	= sub { vsxmem_op(@_, 780); };	# lxvw4x +my $stvx_4w	= sub { vsxmem_op(@_, 908); };	# stxvw4x + +# PowerISA 2.07 stuff +sub vcrypto_op { +    my ($f, $vrt, $vra, $vrb, $op) = @_; +    "	.long	".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op; +} +my $vcipher	= sub { vcrypto_op(@_, 1288); }; +my $vcipherlast	= sub { vcrypto_op(@_, 1289); }; +my $vncipher	= sub { vcrypto_op(@_, 1352); }; +my $vncipherlast= sub { vcrypto_op(@_, 1353); }; +my $vsbox	= sub { vcrypto_op(@_, 0, 1480); }; +my $vshasigmad	= sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); }; +my $vshasigmaw	= sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); }; +my $vpmsumb	= sub { vcrypto_op(@_, 1032); }; +my $vpmsumd	= sub { vcrypto_op(@_, 1224); }; +my $vpmsubh	= sub { vcrypto_op(@_, 1096); }; +my $vpmsumw	= sub { vcrypto_op(@_, 1160); }; +my $vaddudm	= sub { vcrypto_op(@_, 192);  }; +my $vadduqm	= sub { vcrypto_op(@_, 256);  }; + +my $mtsle	= sub { +    my ($f, $arg) = @_; +    "	.long	".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2); +}; + +print "#include <asm/ppc_asm.h>\n" if $flavour =~ /linux/; + +while($line=<>) { + +    $line =~ s|[#!;].*$||;	# get rid of asm-style comments... +    $line =~ s|/\*.*\*/||;	# ... and C-style comments... +    $line =~ s|^\s+||;		# ... and skip white spaces in beginning... +    $line =~ s|\s+$||;		# ... and at the end + +    { +	$line =~ s|\b\.L(\w+)|L$1|g;	# common denominator for Locallabel +	$line =~ s|\bL(\w+)|\.L$1|g	if ($dotinlocallabels); +    } + +    { +	$line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||; +	my $c = $1; $c = "\t" if ($c eq ""); +	my $mnemonic = $2; +	my $f = $3; +	my $opcode = eval("\$$mnemonic"); +	$line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." and $flavour !~ /osx/); +	if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); } +	elsif ($mnemonic)           { $line = $c.$mnemonic.$f."\t".$line; } +    } + +    print $line if ($line); +    print "\n"; +} + +close STDOUT; | 
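The made-up mnemonics collected above (lvx_u, stvx_u, vpmsumd, vcipher and so on) are never passed to the assembler by name: ppc-xlate.pl expands each one into a raw .long word built from the primary opcode, the three register fields and an extended opcode, presumably so the generated source still assembles on toolchains that lack the PowerISA 2.06/2.07 mnemonics. As a minimal standalone sketch, the Perl below mirrors the vcrypto_op helper; the name vcrypto_op_word, the dropped flag argument and the register numbers are mine, chosen only for illustration.

#!/usr/bin/env perl
# Sketch only: rebuild the 32-bit word that ppc-xlate.pl's vcrypto_op
# helper emits for a PowerISA 2.07 crypto instruction (primary opcode 4,
# VRT/VRA/VRB register fields, extended opcode in the low 11 bits).
use strict;
use warnings;

sub vcrypto_op_word {
	my ($vrt, $vra, $vrb, $op) = @_;
	return sprintf "0x%08X",
		(4 << 26) | ($vrt << 21) | ($vra << 16) | ($vrb << 11) | $op;
}

# vpmsumd uses extended opcode 1224 in the table above.
printf ".long\t%s\t# vpmsumd v0,v1,v2\n", vcrypto_op_word(0, 1, 2, 1224);
# Prints: .long   0x100114C8      # vpmsumd v0,v1,v2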

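For reference, the H "twisting" at the top of gcm_init_p8 and gcm_init_htable in ghashp8-ppc.pl above (the vspltb/vsl/vsrab/vand/vxor sequence) amounts to shifting the 128-bit hash key left by one bit and, when the bit shifted out was set, XORing in the 0xc2000...0001 constant the code assembles in $xC2. The Perl sketch below redoes only that arithmetic on two 64-bit halves; it ignores the vector lane and endianness handling of the real routine, the helper name and sample key are made up, and it assumes a 64-bit perl.

#!/usr/bin/env perl
# Sketch only: the "twisted H" arithmetic from gcm_init_p8/gcm_init_htable,
# done on H held as two 64-bit words ($hi = most significant half).
use strict;
use warnings;

sub twist_h {
	my ($hi, $lo) = @_;
	my $carry  = ($hi >> 63) & 1;                  # bit that falls out of H<<1
	my $new_hi = (($hi & 0x7fffffffffffffff) << 1) | (($lo >> 63) & 1);
	my $new_lo =  ($lo & 0x7fffffffffffffff) << 1;
	if ($carry) {                                  # fold in 0xc2....01
		$new_hi ^= 0xc200000000000000;
		$new_lo ^= 0x0000000000000001;
	}
	return ($new_hi, $new_lo);
}

my ($hi, $lo) = (0x8899aabbccddeeff, 0x0011223344556677); # arbitrary sample H
my ($thi, $tlo) = twist_h($hi, $lo);
printf "twisted H = 0x%016x%016x\n", $thi, $tlo;

The twisted value is what gcm_init_htable then multiplies with vpmsumd to precompute H^2, H^3 and H^4 at the table offsets stored above.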