1 | /* $NetBSD: cprng_fast.c,v 1.13 2015/04/13 22:43:41 riastradh Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 2014 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Taylor R. Campbell. |
9 | * |
10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions |
12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. |
18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ |
31 | |
32 | #include <sys/cdefs.h> |
33 | __KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.13 2015/04/13 22:43:41 riastradh Exp $" ); |
34 | |
35 | #include <sys/types.h> |
36 | #include <sys/param.h> |
37 | #include <sys/bitops.h> |
38 | #include <sys/cprng.h> |
39 | #include <sys/cpu.h> |
40 | #include <sys/intr.h> |
41 | #include <sys/percpu.h> |
42 | #include <sys/rnd.h> /* rnd_initial_entropy */ |
43 | |
44 | /* ChaCha core */ |
45 | |
46 | #define crypto_core_OUTPUTWORDS 16 |
47 | #define crypto_core_INPUTWORDS 4 |
48 | #define crypto_core_KEYWORDS 8 |
49 | #define crypto_core_CONSTWORDS 4 |
50 | |
51 | #define crypto_core_ROUNDS 8 |
52 | |
53 | static uint32_t |
54 | rotate(uint32_t u, unsigned c) |
55 | { |
56 | |
57 | return (u << c) | (u >> (32 - c)); |
58 | } |
59 | |
60 | #define QUARTERROUND(a, b, c, d) do { \ |
61 | (a) += (b); (d) ^= (a); (d) = rotate((d), 16); \ |
62 | (c) += (d); (b) ^= (c); (b) = rotate((b), 12); \ |
63 | (a) += (b); (d) ^= (a); (d) = rotate((d), 8); \ |
64 | (c) += (d); (b) ^= (c); (b) = rotate((b), 7); \ |
65 | } while (0) |
66 | |
67 | static void |
68 | crypto_core(uint32_t *out, const uint32_t *in, const uint32_t *k, |
69 | const uint32_t *c) |
70 | { |
71 | uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; |
72 | int i; |
73 | |
74 | x0 = c[0]; |
75 | x1 = c[1]; |
76 | x2 = c[2]; |
77 | x3 = c[3]; |
78 | x4 = k[0]; |
79 | x5 = k[1]; |
80 | x6 = k[2]; |
81 | x7 = k[3]; |
82 | x8 = k[4]; |
83 | x9 = k[5]; |
84 | x10 = k[6]; |
85 | x11 = k[7]; |
86 | x12 = in[0]; |
87 | x13 = in[1]; |
88 | x14 = in[2]; |
89 | x15 = in[3]; |
90 | |
91 | for (i = crypto_core_ROUNDS; i > 0; i -= 2) { |
92 | QUARTERROUND( x0, x4, x8,x12); |
93 | QUARTERROUND( x1, x5, x9,x13); |
94 | QUARTERROUND( x2, x6,x10,x14); |
95 | QUARTERROUND( x3, x7,x11,x15); |
96 | QUARTERROUND( x0, x5,x10,x15); |
97 | QUARTERROUND( x1, x6,x11,x12); |
98 | QUARTERROUND( x2, x7, x8,x13); |
99 | QUARTERROUND( x3, x4, x9,x14); |
100 | } |
101 | |
102 | out[0] = x0 + c[0]; |
103 | out[1] = x1 + c[1]; |
104 | out[2] = x2 + c[2]; |
105 | out[3] = x3 + c[3]; |
106 | out[4] = x4 + k[0]; |
107 | out[5] = x5 + k[1]; |
108 | out[6] = x6 + k[2]; |
109 | out[7] = x7 + k[3]; |
110 | out[8] = x8 + k[4]; |
111 | out[9] = x9 + k[5]; |
112 | out[10] = x10 + k[6]; |
113 | out[11] = x11 + k[7]; |
114 | out[12] = x12 + in[0]; |
115 | out[13] = x13 + in[1]; |
116 | out[14] = x14 + in[2]; |
117 | out[15] = x15 + in[3]; |
118 | } |
119 | |
120 | /* `expand 32-byte k' */ |
121 | static const uint32_t crypto_core_constant32[4] = { |
122 | 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U, |
123 | }; |
124 | |
125 | /* |
126 | * Test vector for ChaCha20 from |
127 | * <http://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-00>, |
128 | * test vectors for ChaCha12 and ChaCha8 generated by the same |
129 | * crypto_core code with crypto_core_ROUNDS varied. |
130 | */ |
131 | |
132 | #define check(E) do \ |
133 | { \ |
134 | if (!(E)) \ |
135 | panic("crypto self-test failed: %s", #E); \ |
136 | } while (0) |
137 | |
138 | static void |
139 | crypto_core_selftest(void) |
140 | { |
141 | const uint32_t zero32[8] = {0}; |
142 | const uint8_t sigma[] = "expand 32-byte k" ; |
143 | uint32_t block[16]; |
144 | unsigned i; |
145 | |
146 | #if crypto_core_ROUNDS == 8 |
147 | static const uint8_t out[64] = { |
148 | 0x3e,0x00,0xef,0x2f,0x89,0x5f,0x40,0xd6, |
149 | 0x7f,0x5b,0xb8,0xe8,0x1f,0x09,0xa5,0xa1, |
150 | 0x2c,0x84,0x0e,0xc3,0xce,0x9a,0x7f,0x3b, |
151 | 0x18,0x1b,0xe1,0x88,0xef,0x71,0x1a,0x1e, |
152 | 0x98,0x4c,0xe1,0x72,0xb9,0x21,0x6f,0x41, |
153 | 0x9f,0x44,0x53,0x67,0x45,0x6d,0x56,0x19, |
154 | 0x31,0x4a,0x42,0xa3,0xda,0x86,0xb0,0x01, |
155 | 0x38,0x7b,0xfd,0xb8,0x0e,0x0c,0xfe,0x42, |
156 | }; |
157 | #elif crypto_core_ROUNDS == 12 |
158 | static const uint8_t out[64] = { |
159 | 0x9b,0xf4,0x9a,0x6a,0x07,0x55,0xf9,0x53, |
160 | 0x81,0x1f,0xce,0x12,0x5f,0x26,0x83,0xd5, |
161 | 0x04,0x29,0xc3,0xbb,0x49,0xe0,0x74,0x14, |
162 | 0x7e,0x00,0x89,0xa5,0x2e,0xae,0x15,0x5f, |
163 | 0x05,0x64,0xf8,0x79,0xd2,0x7a,0xe3,0xc0, |
164 | 0x2c,0xe8,0x28,0x34,0xac,0xfa,0x8c,0x79, |
165 | 0x3a,0x62,0x9f,0x2c,0xa0,0xde,0x69,0x19, |
166 | 0x61,0x0b,0xe8,0x2f,0x41,0x13,0x26,0xbe, |
167 | }; |
168 | #elif crypto_core_ROUNDS == 20 |
169 | static const uint8_t out[64] = { |
170 | 0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90, |
171 | 0x40,0x5d,0x6a,0xe5,0x53,0x86,0xbd,0x28, |
172 | 0xbd,0xd2,0x19,0xb8,0xa0,0x8d,0xed,0x1a, |
173 | 0xa8,0x36,0xef,0xcc,0x8b,0x77,0x0d,0xc7, |
174 | 0xda,0x41,0x59,0x7c,0x51,0x57,0x48,0x8d, |
175 | 0x77,0x24,0xe0,0x3f,0xb8,0xd8,0x4a,0x37, |
176 | 0x6a,0x43,0xb8,0xf4,0x15,0x18,0xa1,0x1c, |
177 | 0xc3,0x87,0xb6,0x69,0xb2,0xee,0x65,0x86, |
178 | }; |
179 | #else |
180 | #error crypto_core_ROUNDS must be 8, 12, or 20. |
181 | #endif |
182 | |
183 | check(crypto_core_constant32[0] == le32dec(&sigma[0])); |
184 | check(crypto_core_constant32[1] == le32dec(&sigma[4])); |
185 | check(crypto_core_constant32[2] == le32dec(&sigma[8])); |
186 | check(crypto_core_constant32[3] == le32dec(&sigma[12])); |
187 | |
188 | crypto_core(block, zero32, zero32, crypto_core_constant32); |
189 | for (i = 0; i < 16; i++) |
190 | check(block[i] == le32dec(&out[i*4])); |
191 | } |
192 | |
193 | #undef check |
194 | |
195 | #define CPRNG_FAST_SEED_BYTES (crypto_core_KEYWORDS * sizeof(uint32_t)) |
196 | |
197 | struct cprng_fast { |
198 | uint32_t buffer[crypto_core_OUTPUTWORDS]; |
199 | uint32_t key[crypto_core_KEYWORDS]; |
200 | uint32_t nonce[crypto_core_INPUTWORDS]; |
201 | bool have_initial; |
202 | }; |
203 | |
204 | __CTASSERT(sizeof ((struct cprng_fast *)0)->key == CPRNG_FAST_SEED_BYTES); |
205 | |
206 | static void cprng_fast_init_cpu(void *, void *, struct cpu_info *); |
207 | static void cprng_fast_schedule_reseed(struct cprng_fast *); |
208 | static void cprng_fast_intr(void *); |
209 | |
210 | static void cprng_fast_seed(struct cprng_fast *, const void *); |
211 | static void cprng_fast_buf(struct cprng_fast *, void *, unsigned); |
212 | |
213 | static void cprng_fast_buf_short(void *, size_t); |
214 | static void cprng_fast_buf_long(void *, size_t); |
215 | |
216 | static percpu_t *cprng_fast_percpu __read_mostly; |
217 | static void *cprng_fast_softint __read_mostly; |
218 | |
219 | void |
220 | cprng_fast_init(void) |
221 | { |
222 | |
223 | crypto_core_selftest(); |
224 | cprng_fast_percpu = percpu_alloc(sizeof(struct cprng_fast)); |
225 | percpu_foreach(cprng_fast_percpu, &cprng_fast_init_cpu, NULL); |
226 | cprng_fast_softint = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE, |
227 | &cprng_fast_intr, NULL); |
228 | } |
229 | |
230 | static void |
231 | cprng_fast_init_cpu(void *p, void *arg __unused, struct cpu_info *ci __unused) |
232 | { |
233 | struct cprng_fast *const cprng = p; |
234 | uint8_t seed[CPRNG_FAST_SEED_BYTES]; |
235 | |
236 | cprng_strong(kern_cprng, seed, sizeof seed, 0); |
237 | cprng_fast_seed(cprng, seed); |
238 | cprng->have_initial = rnd_initial_entropy; |
239 | (void)explicit_memset(seed, 0, sizeof seed); |
240 | } |
241 | |
242 | static inline int |
243 | cprng_fast_get(struct cprng_fast **cprngp) |
244 | { |
245 | struct cprng_fast *cprng; |
246 | int s; |
247 | |
248 | *cprngp = cprng = percpu_getref(cprng_fast_percpu); |
249 | s = splvm(); |
250 | |
251 | if (__predict_false(!cprng->have_initial)) |
252 | cprng_fast_schedule_reseed(cprng); |
253 | |
254 | return s; |
255 | } |
256 | |
257 | static inline void |
258 | cprng_fast_put(struct cprng_fast *cprng, int s) |
259 | { |
260 | |
261 | KASSERT((cprng == percpu_getref(cprng_fast_percpu)) && |
262 | (percpu_putref(cprng_fast_percpu), true)); |
263 | splx(s); |
264 | percpu_putref(cprng_fast_percpu); |
265 | } |
266 | |
267 | static void |
268 | cprng_fast_schedule_reseed(struct cprng_fast *cprng __unused) |
269 | { |
270 | |
271 | softint_schedule(cprng_fast_softint); |
272 | } |
273 | |
274 | static void |
275 | cprng_fast_intr(void *cookie __unused) |
276 | { |
277 | struct cprng_fast *cprng; |
278 | uint8_t seed[CPRNG_FAST_SEED_BYTES]; |
279 | int s; |
280 | |
281 | cprng_strong(kern_cprng, seed, sizeof(seed), 0); |
282 | |
283 | cprng = percpu_getref(cprng_fast_percpu); |
284 | s = splvm(); |
285 | cprng_fast_seed(cprng, seed); |
286 | cprng->have_initial = rnd_initial_entropy; |
287 | splx(s); |
288 | percpu_putref(cprng_fast_percpu); |
289 | |
290 | explicit_memset(seed, 0, sizeof(seed)); |
291 | } |
292 | |
293 | /* CPRNG algorithm */ |
294 | |
295 | /* |
296 | * The state consists of a key, the current nonce, and a 64-byte buffer |
297 | * of output. Since we fill the buffer only when we need output, and |
298 | * eat a 32-bit word at a time, one 32-bit word of the buffer would be |
299 | * wasted. Instead, we repurpose it to count the number of entries in |
300 | * the buffer remaining, counting from high to low in order to allow |
301 | * comparison to zero to detect when we need to refill it. |
302 | */ |
303 | #define CPRNG_FAST_BUFIDX (crypto_core_OUTPUTWORDS - 1) |
304 | |
305 | static void |
306 | cprng_fast_seed(struct cprng_fast *cprng, const void *seed) |
307 | { |
308 | |
309 | (void)memset(cprng->buffer, 0, sizeof cprng->buffer); |
310 | (void)memcpy(cprng->key, seed, sizeof cprng->key); |
311 | (void)memset(cprng->nonce, 0, sizeof cprng->nonce); |
312 | } |
313 | |
314 | static inline uint32_t |
315 | cprng_fast_word(struct cprng_fast *cprng) |
316 | { |
317 | uint32_t v; |
318 | |
319 | if (__predict_true(0 < cprng->buffer[CPRNG_FAST_BUFIDX])) { |
320 | v = cprng->buffer[--cprng->buffer[CPRNG_FAST_BUFIDX]]; |
321 | } else { |
322 | /* If we don't have enough words, refill the buffer. */ |
323 | crypto_core(cprng->buffer, cprng->nonce, cprng->key, |
324 | crypto_core_constant32); |
325 | if (__predict_false(++cprng->nonce[0] == 0)) { |
326 | cprng->nonce[1]++; |
327 | cprng_fast_schedule_reseed(cprng); |
328 | } |
329 | v = cprng->buffer[CPRNG_FAST_BUFIDX]; |
330 | cprng->buffer[CPRNG_FAST_BUFIDX] = CPRNG_FAST_BUFIDX; |
331 | } |
332 | |
333 | return v; |
334 | } |
335 | |
336 | static inline void |
337 | cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned n) |
338 | { |
339 | uint8_t *p = buf; |
340 | uint32_t v; |
341 | unsigned w, r; |
342 | |
343 | w = n / sizeof(uint32_t); |
344 | while (w--) { |
345 | v = cprng_fast_word(cprng); |
346 | (void)memcpy(p, &v, 4); |
347 | p += 4; |
348 | } |
349 | |
350 | r = n % sizeof(uint32_t); |
351 | if (r) { |
352 | v = cprng_fast_word(cprng); |
353 | while (r--) { |
354 | *p++ = (v & 0xff); |
355 | v >>= 8; |
356 | } |
357 | } |
358 | } |
359 | |
360 | /* |
361 | * crypto_onetimestream: Expand a short unpredictable one-time seed |
362 | * into a long unpredictable output. |
363 | */ |
364 | static void |
365 | crypto_onetimestream(const uint32_t seed[crypto_core_KEYWORDS], void *buf, |
366 | size_t n) |
367 | { |
368 | uint32_t block[crypto_core_OUTPUTWORDS]; |
369 | uint32_t nonce[crypto_core_INPUTWORDS] = {0}; |
370 | uint8_t *p8; |
371 | uint32_t *p32; |
372 | size_t ni, nb, nf; |
373 | |
374 | /* |
375 | * Guarantee we can generate up to n bytes. We have |
376 | * 2^(32*INPUTWORDS) possible inputs yielding output of |
377 | * 4*OUTPUTWORDS*2^(32*INPUTWORDS) bytes. It suffices to |
378 | * require that sizeof n > (1/CHAR_BIT) log_2 n be less than |
379 | * (1/CHAR_BIT) log_2 of the total output stream length. We |
380 | * have |
381 | * |
382 | * log_2 (4 o 2^(32 i)) = log_2 (4 o) + log_2 2^(32 i) |
383 | * = 2 + log_2 o + 32 i. |
384 | */ |
385 | __CTASSERT(CHAR_BIT*sizeof n <= |
386 | (2 + ilog2(crypto_core_OUTPUTWORDS) + 32*crypto_core_INPUTWORDS)); |
387 | |
388 | p8 = buf; |
389 | p32 = (uint32_t *)roundup2((uintptr_t)p8, sizeof(uint32_t)); |
390 | ni = (uint8_t *)p32 - p8; |
391 | if (n < ni) |
392 | ni = n; |
393 | nb = (n - ni) / sizeof block; |
394 | nf = (n - ni) % sizeof block; |
395 | |
396 | KASSERT(((uintptr_t)p32 & 3) == 0); |
397 | KASSERT(ni <= n); |
398 | KASSERT(nb <= (n / sizeof block)); |
399 | KASSERT(nf <= n); |
400 | KASSERT(n == (ni + (nb * sizeof block) + nf)); |
401 | KASSERT(ni < sizeof(uint32_t)); |
402 | KASSERT(nf < sizeof block); |
403 | |
404 | if (ni) { |
405 | crypto_core(block, nonce, seed, crypto_core_constant32); |
406 | nonce[0]++; |
407 | (void)memcpy(p8, block, ni); |
408 | } |
409 | while (nb--) { |
410 | crypto_core(p32, nonce, seed, crypto_core_constant32); |
411 | if (++nonce[0] == 0) |
412 | nonce[1]++; |
413 | p32 += crypto_core_OUTPUTWORDS; |
414 | } |
415 | if (nf) { |
416 | crypto_core(block, nonce, seed, crypto_core_constant32); |
417 | if (++nonce[0] == 0) |
418 | nonce[1]++; |
419 | (void)memcpy(p32, block, nf); |
420 | } |
421 | |
422 | if (ni | nf) |
423 | (void)explicit_memset(block, 0, sizeof block); |
424 | } |
425 | |
426 | /* Public API */ |
427 | |
428 | uint32_t |
429 | cprng_fast32(void) |
430 | { |
431 | struct cprng_fast *cprng; |
432 | uint32_t v; |
433 | int s; |
434 | |
435 | s = cprng_fast_get(&cprng); |
436 | v = cprng_fast_word(cprng); |
437 | cprng_fast_put(cprng, s); |
438 | |
439 | return v; |
440 | } |
441 | |
442 | uint64_t |
443 | cprng_fast64(void) |
444 | { |
445 | struct cprng_fast *cprng; |
446 | uint32_t hi, lo; |
447 | int s; |
448 | |
449 | s = cprng_fast_get(&cprng); |
450 | hi = cprng_fast_word(cprng); |
451 | lo = cprng_fast_word(cprng); |
452 | cprng_fast_put(cprng, s); |
453 | |
454 | return ((uint64_t)hi << 32) | lo; |
455 | } |
456 | |
457 | static void |
458 | cprng_fast_buf_short(void *buf, size_t len) |
459 | { |
460 | struct cprng_fast *cprng; |
461 | int s; |
462 | |
463 | s = cprng_fast_get(&cprng); |
464 | cprng_fast_buf(cprng, buf, len); |
465 | cprng_fast_put(cprng, s); |
466 | } |
467 | |
468 | static __noinline void |
469 | cprng_fast_buf_long(void *buf, size_t len) |
470 | { |
471 | uint32_t seed[crypto_core_KEYWORDS]; |
472 | struct cprng_fast *cprng; |
473 | int s; |
474 | |
475 | s = cprng_fast_get(&cprng); |
476 | cprng_fast_buf(cprng, seed, sizeof seed); |
477 | cprng_fast_put(cprng, s); |
478 | |
479 | crypto_onetimestream(seed, buf, len); |
480 | |
481 | (void)explicit_memset(seed, 0, sizeof seed); |
482 | } |
483 | |
484 | size_t |
485 | cprng_fast(void *buf, size_t len) |
486 | { |
487 | |
488 | /* |
489 | * We don't want to hog the CPU, so we use the short version, |
490 | * to generate output without preemption, only if we can do it |
491 | * with at most one crypto_core. |
492 | */ |
493 | if (len <= (sizeof(uint32_t) * crypto_core_OUTPUTWORDS)) |
494 | cprng_fast_buf_short(buf, len); |
495 | else |
496 | cprng_fast_buf_long(buf, len); |
497 | |
498 | return len; |
499 | } |
500 | |