summaryrefslogtreecommitdiff
path: root/arch/x86/entry/vdso/vgetrandom-chacha.S
blob: bcba5639b8ee9a33556ba0d81287a4ef953b3446 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * First row of the ChaCha state: the 16 ASCII bytes "expand 32-byte k".
 * Kept on a 16-byte boundary because it is fetched with an aligned load
 * (movaps).
 */
.section	.rodata, "a"
.balign 16
CONSTANTS:	.ascii "expand 32-byte k"
.text

/*
 * chacha_mix src, acc, tgt, bits
 *
 * One add/xor/rotate step, applied to the four 32-bit lanes in parallel:
 *
 *	acc += src
 *	tgt  = rotl32(tgt ^ acc, bits)
 *
 * The rotate is assembled from a left shift by bits, a right shift by
 * (32 - bits) and an OR of the two halves. Clobbers scratch.
 */
.macro	chacha_mix	src, acc, tgt, bits
	paddd		\src,\acc
	pxor		\acc,\tgt
	movdqa		\tgt,scratch
	pslld		$\bits,scratch
	psrld		$(32-\bits),\tgt
	por		scratch,\tgt
.endm

/*
 * chacha_quarter_rounds
 *
 * The four mixing steps (rotations 16, 12, 8, 7) over row_a..row_d. Each
 * register holds one row of the state, so every lane goes through its own
 * quarter round at the same time.
 */
.macro	chacha_quarter_rounds
	chacha_mix	row_b, row_a, row_d, 16
	chacha_mix	row_d, row_c, row_b, 12
	chacha_mix	row_b, row_a, row_d, 8
	chacha_mix	row_d, row_c, row_b, 7
.endm

/*
 * chacha_store_row init, row, off
 *
 * Feed-forward and write-out of one row: row += init, then store the 16
 * bytes at out_ptr + off with an unaligned store.
 */
.macro	chacha_store_row	init, row, off
	paddd		\init,\row
	movups		\row,\off(out_ptr)
.endm

/*
 * Minimal SSE2 ChaCha20 block function that keeps all of its working data in
 * registers and never touches the stack. The nonce is fixed at 0; the caller
 * provides the key and a 64-bit block counter, which is advanced by the number
 * of blocks generated and written back on return. The block count must be
 * positive: the loop decrements before it tests, so 0 is not handled.
 *
 *	rdi: output bytes
 *	rsi: 32-byte key input
 *	rdx: 8-byte counter input/output
 *	rcx: number of 64-byte blocks to write to output
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* Argument registers. */
.set	out_ptr,	%rdi
.set	key_ptr,	%rsi
.set	ctr_ptr,	%rdx
.set	blocks_left,	%rcx
/* Double-round countdown, kept in the low byte of rax. */
.set	rounds_left,	%al
/* Vector registers; xmm registers are *not* callee-save. */
.set	scratch,	%xmm0
/* Working state, one 4x32-bit row per register. */
.set	row_a,		%xmm1
.set	row_b,		%xmm2
.set	row_c,		%xmm3
.set	row_d,		%xmm4
/* Input state for the current block, reloaded into row_* every block. */
.set	init_a,		%xmm5
.set	init_b,		%xmm6
.set	init_c,		%xmm7
.set	init_d,		%xmm8
/* Per-block counter increment. */
.set	ctr_step,	%xmm9

	/* init_a = "expand 32-byte k" */
	movaps		CONSTANTS(%rip),init_a
	/* init_b,init_c = the two 16-byte halves of the key (unaligned loads) */
	movups		0x00(key_ptr),init_b
	movups		0x10(key_ptr),init_c
	/* init_d = 64-bit counter in the low half, zero nonce in the high half */
	movq		0x00(ctr_ptr),init_d
	/* ctr_step = 1 in the low quadword, 0 in the high quadword */
	movq		$1,%rax
	movq		%rax,ctr_step

.Lnext_block:
	/* Begin each block from the saved input rows. */
	movdqa		init_a,row_a
	movdqa		init_b,row_b
	movdqa		init_c,row_c
	movdqa		init_d,row_d

	/* 10 iterations, each one doing two rounds. */
	movb		$10,rounds_left
.Ldouble_round:
	/* First round of the pair: rows as loaded. */
	chacha_quarter_rounds

	/*
	 * Rotate the lanes so the second round mixes a different grouping:
	 *	row_b[0,1,2,3] = row_b[1,2,3,0]
	 *	row_c[0,1,2,3] = row_c[2,3,0,1]
	 *	row_d[0,1,2,3] = row_d[3,0,1,2]
	 */
	pshufd		$0x39,row_b,row_b
	pshufd		$0x4e,row_c,row_c
	pshufd		$0x93,row_d,row_d

	/* Second round of the pair: rows with rotated lanes. */
	chacha_quarter_rounds

	/*
	 * Undo the lane rotation:
	 *	row_b[0,1,2,3] = row_b[3,0,1,2]
	 *	row_c[0,1,2,3] = row_c[2,3,0,1]
	 *	row_d[0,1,2,3] = row_d[1,2,3,0]
	 */
	pshufd		$0x93,row_b,row_b
	pshufd		$0x4e,row_c,row_c
	pshufd		$0x39,row_d,row_d

	decb		rounds_left
	jnz		.Ldouble_round

	/* output[0..63] = permuted rows + input rows */
	chacha_store_row	init_a, row_a, 0x00
	chacha_store_row	init_b, row_b, 0x10
	chacha_store_row	init_c, row_c, 0x20
	chacha_store_row	init_d, row_d, 0x30

	/* Bump the 64-bit counter held in the low half of init_d. */
	paddq		ctr_step,init_d

	/* Advance to the next 64-byte output slot; loop while blocks remain. */
	addq		$64,out_ptr
	decq		blocks_left
	jnz		.Lnext_block

	/* Hand the advanced counter back to the caller. */
	movq		init_d,0x00(ctr_ptr)

	/*
	 * Scrub the registers that held key or keystream material, in case
	 * nothing overwrites them afterwards.
	 */
	.irp	reg,row_a,row_b,row_c,row_d,init_b,init_c,scratch
	pxor		\reg,\reg
	.endr

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)