[PATCH v3 1/9] stdlib: Add arc4random, arc4random_buf, and arc4random_uniform (BZ #4417)
Adhemerval Zanella
adhemerval.zanella@linaro.org
Mon Apr 25 12:26:57 GMT 2022
On 24/04/2022 23:22, Mark Harris wrote:
> On Tue, Apr 19, 2022 at 2:29 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> The implementation is based on scalar Chacha20, with global cache and
>> locking. It uses getrandom or /dev/urandom as fallback to get the
>> initial entropy, and reseeds the internal state on every 16MB of
>> consumed buffer.
>>
>> It maintains an internal buffer which consumes at maximum one page on
>> most systems (assuming minimum of 4k pages). The internal buf optimizes
>> the cipher encrypt calls, by amortize arc4random calls (where both
>
> s/amortize/amortizing/
>
Ack.
>> function call and locks cost are the dominating factor).
>
> s/locks/lock/
Ack.
>
>>
>> The ChaCha20 implementation is based on the RFC8439 [1], with last
>> step that XOR with the input omited. Since the input stream will either
>> zero bytes (initial state) or the PRNG output itself this step does not
>> add any extra entropy.
>
> The src argument to chacha20_crypt is always zeros, never PRNG output.
> Perhaps it would be clearer to say something like this:
>
> The ChaCha20 implementation is based on RFC8439 [1], omitting the final
> XOR of the keystream with the plaintext because the plaintext is a
> stream of zeros.
Ack, I have also added a remark it follow OpenBSD strategy:
The ChaCha20 implementation is based on RFC8439 [1], omitting the final
XOR of the keystream with the plaintext because the plaintext is a
stream of zeros. This strategy is similar to what OpenBSD arc4random
does.
>> diff --git a/include/stdlib.h b/include/stdlib.h
>> index 1c6f70b082..055f9d2965 100644
>> --- a/include/stdlib.h
>> +++ b/include/stdlib.h
>> @@ -144,6 +144,19 @@ libc_hidden_proto (__ptsname_r)
>> libc_hidden_proto (grantpt)
>> libc_hidden_proto (unlockpt)
>>
>> +__typeof (arc4random) __arc4random;
>> +libc_hidden_proto (__arc4random);
>> +__typeof (arc4random_buf) __arc4random_buf;
>> +libc_hidden_proto (__arc4random_buf);
>> +__typeof (arc4random_uniform) __arc4random_uniform;
>> +libc_hidden_proto (__arc4random_uniform);
>> +extern void __arc4random_buf_internal (void *buffer, size_t len)
>> + attribute_hidden;
>> +/* Called from the fork function to reinitialize the internal lock in thte
>
> s/thte/the/
Ack.
>> +/* Besides the cipher state 'ctx', it keeps two counters: 'have' is the
>> + current valid bytes not yet consumed in 'buf', while 'count' is the maximum
>> + number of bytes until a reseed.
>> +
>> + Both the initial seed an reseed tries to obtain entropy from the kernel
>> + and abort the process if none could be obtained.
>> +
>> + The state 'buf' improves the usage of the cipher call, allowing to call
>> + optimized implementations (if the archictecture provides it) and optimize
>> + arc4random calls (since only multiple call it will encrypt the next block).
>> + */
>> +
>> +/* Maximum number bytes until reseed (16 MB). */
>> +#define CHACHE_RESEED_SIZE (16 * 1024 * 1024)
It should, I changed it.
>
> Should this be CHACHA20_RESEED_SIZE?
>
>> +/* Internal buffer size in bytes (1KB). */
>> +#define CHACHA20_BUFSIZE (8 * CHACHA20_BLOCK_SIZE)
>
> 8 * 64 = 512; should this be (16 * CHACHA20_BLOCK_SIZE)?
I updated the comment in fact. I changed to 512 on v3 the optimize the
lockless optimization TCB buffer size.
>> +/* Fork detection is done by checking if MADV_WIPEONFORK supported. If not
>> + the fork callback will reset the state on the fork call. It does not
>> + handle direct clone calls, nor vfork or _Fork (arc4random is not
>> + async-signal-safe due the internal lock usage). */
>> +static void
>> +arc4random_init (uint8_t *buf, size_t len)
>
> len is not used in this function.
Indeed, I removed it.
>
>> +{
>> + state = __mmap (NULL, sizeof (struct arc4random_state),
>> + PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>> + if (state == MAP_FAILED)
>> + arc4random_allocate_failure ();
>> +
>> +#ifdef MADV_WIPEONFORK
>> + int r = __madvise (state, sizeof (struct arc4random_state), MADV_WIPEONFORK);
>> + if (r == 0)
>> + __arc4random_wipeonfork = true;
>> + else if (errno != EINVAL)
>> + arc4random_allocate_failure ();
>> +#endif
>> +
>> + chacha20_init (state->ctx, buf, buf + CHACHA20_KEY_SIZE);
>> +}
>> +
>> +#define min(x,y) (((x) > (y)) ? (y) : (x))
>> +
>> +static void
>> +arc4random_rekey (uint8_t *rnd, size_t rndlen)
>> +{
>> + memset (state->buf, 0, sizeof state->buf);
>> + chacha20_crypt (state->ctx, state->buf, state->buf, sizeof state->buf);
>> +
>> + /* Mix some extra entropy if provided. */
>> + if (rnd != NULL)
>> + {
>> + size_t m = min (rndlen, CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE);
>> + for (size_t i = 0; i < m; i++)
>> + state->buf[i] ^= rnd[i];
>> + }
>> +
>> + /* Immediately reinit for backtracking resistance. */
>> + chacha20_init (state->ctx, state->buf, state->buf + CHACHA20_KEY_SIZE);
>> + memset (state->buf, 0, CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE);
>> + state->have = sizeof (state->buf) - (CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE);
>> +}
>> +
>> +static void
>> +arc4random_getentropy (uint8_t *rnd, size_t len)
>> +{
>> + if (__getrandomn_nocancel (rnd, len, GRND_NONBLOCK) == len)
>> + return;
>> +
>> + int fd = __open64_nocancel ("/dev/urandom", O_RDONLY);
>
> Should this be O_RDONLY | O_CLOEXEC?
It should, I have change it.
>
>> + if (fd != -1)
>> + {
>> + unsigned char *p = rnd;
>> + unsigned char *end = p + len;
>
> uint8_t * would be consistent with the declaration of md.
Ack.
>
>> + do
>> + {
>> + ssize_t ret = TEMP_FAILURE_RETRY (__read_nocancel (fd, p, end - p));
>> + if (ret <= 0)
>> + arc4random_getrandom_failure ();
>> + p += ret;
>> + }
>> + while (p < end);
>> +
>> + if (__close_nocancel (fd) != 0)
>
> Should this be == 0?
Ack.
>> +static uint32_t
>> +compute_uniform (uint32_t n)
>> +{
>> + if (n <= 1)
>> + /* There is no valid return value for a zero limit, and 0 is the
>> + only possible result for limit 1. */
>> + return 0;
>> +
>> + /* The bits variable serves as a source for bits. Prefetch the
>> + minimum number of bytes needed. */
>> + unsigned count = byte_count (n);
>
> uint32_t would be consistent with the declaration of byte_count.
Ack.
>
>> + uint32_t bits_length = count * CHAR_BIT;
>> + uint32_t bits;
>> + random_bytes (&bits, count);
>> +
>> + /* Powers of two are easy. */
>> + if (powerof2 (n))
>> + return bits & (n - 1);
>> +
>> + /* The general case. This algorithm follows Jérémie Lumbroso,
>> + Optimal Discrete Uniform Generation from Coin Flips, and
>> + Applications (2013), who credits Donald E. Knuth and Andrew
>> + C. Yao, The complexity of nonuniform random number generation
>> + (1976), for solving the general case.
>> +
>> + The implementation below unrolls the initialization stage of the
>> + loop, where v is less than n. */
>> +
>> + /* Use 64-bit variables even though the intermediate results are
>> + never larger that 33 bits. This ensures the code easier to
>
> s/that/than/
> s/the code/that the code is/
Ack.
>> +
>> +/* 32-bit stream position, then 96-bit nonce. */
>> +#define CHACHA20_IV_SIZE 16
>> +#define CHACHA20_KEY_SIZE 32
>> +
>> +#define CHACHA20_BLOCK_SIZE 64
>> +#define CHACHA20_BLOCK_WORDS (CHACHA20_BLOCK_SIZE / sizeof (uint32_t))
>> +
>> +#define CHACHA20_STATE_LEN 16
>> +
>> +/* Defining CHACHA20_XOR_FINAL issues the final XOR using the input as defined
>> + Sby RFC8439. Since the input stream will either zero bytes (initial state)
>
> s/Sby/by/
Ack.
>
>> + or the PRNG output itself this step does not add any extra entropy. */
>
> The plaintext input stream (src argument to chacha20_crypt) is always
> zeros, never PRNG output.
Indeed, I have change to the suggestion you gave above.
>
>> +
>> +enum chacha20_constants
>> +{
>> + CHACHA20_CONSTANT_EXPA = 0x61707865U,
>> + CHACHA20_CONSTANT_ND_3 = 0x3320646eU,
>> + CHACHA20_CONSTANT_2_BY = 0x79622d32U,
>> + CHACHA20_CONSTANT_TE_K = 0x6b206574U
>> +};
>> +
>> +static inline uint32_t
>> +read_unaligned_32 (const uint8_t *p)
>> +{
>> + uint32_t r;
>> + memcpy (&r, p, sizeof (r));
>> + return r;
>> +}
>> +
>> +static inline void
>> +write_unaligned_32 (uint8_t *p, uint32_t v)
>> +{
>> + memcpy (p, &v, sizeof (v));
>> +}
>> +
>> +#if __BYTE_ORDER == __BIG_ENDIAN
>> +# define read_unaligned_le32(p) __builtin_bswap32 (read_unaligned_32 (p))
>> +# define set_state(v) __builtin_bswap32 ((v))
>> +#else
>> +# define read_unaligned_le32(p) read_unaligned_32 ((p))
>> +# define set_state(v) (v)
>> +#endif
>> +
>> +static inline void
>> +chacha20_init (uint32_t *state, const uint8_t *key, const uint8_t *iv)
>> +{
>> + state[0] = CHACHA20_CONSTANT_EXPA;
>> + state[1] = CHACHA20_CONSTANT_ND_3;
>> + state[2] = CHACHA20_CONSTANT_2_BY;
>> + state[3] = CHACHA20_CONSTANT_TE_K;
>> +
>> + state[4] = read_unaligned_le32 (key + 0 * sizeof (uint32_t));
>> + state[5] = read_unaligned_le32 (key + 1 * sizeof (uint32_t));
>> + state[6] = read_unaligned_le32 (key + 2 * sizeof (uint32_t));
>> + state[7] = read_unaligned_le32 (key + 3 * sizeof (uint32_t));
>> + state[8] = read_unaligned_le32 (key + 4 * sizeof (uint32_t));
>> + state[9] = read_unaligned_le32 (key + 5 * sizeof (uint32_t));
>> + state[10] = read_unaligned_le32 (key + 6 * sizeof (uint32_t));
>> + state[11] = read_unaligned_le32 (key + 7 * sizeof (uint32_t));
>> +
>> + state[12] = read_unaligned_le32 (iv + 0 * sizeof (uint32_t));
>> + state[13] = read_unaligned_le32 (iv + 1 * sizeof (uint32_t));
>> + state[14] = read_unaligned_le32 (iv + 2 * sizeof (uint32_t));
>> + state[15] = read_unaligned_le32 (iv + 3 * sizeof (uint32_t));
>> +}
>> +
>> +static inline uint32_t
>> +rotl32 (unsigned int shift, uint32_t word)
>> +{
>> + return (word << (shift & 31)) | (word >> ((-shift) & 31));
>> +}
>> +
>> +#define QROUND(x0, x1, x2, x3) \
>> + do { \
>> + x0 = x0 + x1; x3 = rotl32 (16, (x0 ^ x3)); \
>> + x2 = x2 + x3; x1 = rotl32 (12, (x1 ^ x2)); \
>> + x0 = x0 + x1; x3 = rotl32 (8, (x0 ^ x3)); \
>> + x2 = x2 + x3; x1 = rotl32 (7, (x1 ^ x2)); \
>> + } while(0)
>> +
>> +static inline void
>> +chacha20_block (uint32_t *state, uint32_t *stream)
>> +{
>> + uint32_t x[CHACHA20_STATE_LEN];
>> + memcpy (x, state, sizeof x);
>> +
>> + for (int i = 0; i < 20; i += 2)
>> + {
>> + QROUND (x[0], x[4], x[8], x[12]);
>> + QROUND (x[1], x[5], x[9], x[13]);
>> + QROUND (x[2], x[6], x[10], x[14]);
>> + QROUND (x[3], x[7], x[11], x[15]);
>> +
>> + QROUND (x[0], x[5], x[10], x[15]);
>> + QROUND (x[1], x[6], x[11], x[12]);
>> + QROUND (x[2], x[7], x[8], x[13]);
>> + QROUND (x[3], x[4], x[9], x[14]);
>> + }
>> +
>> + /* Unroll the loop a bit. */
>> + for (int i = 0; i < CHACHA20_BLOCK_WORDS / 4; i++)
>> + {
>> + stream[i*4+0] = set_state (x[i*4+0] + state[i*4+0]);
>> + stream[i*4+1] = set_state (x[i*4+1] + state[i*4+1]);
>> + stream[i*4+2] = set_state (x[i*4+2] + state[i*4+2]);
>> + stream[i*4+3] = set_state (x[i*4+3] + state[i*4+3]);
>> + }
>> +
>> + state[12]++;
>> +}
>> +
>> +static void
>> +chacha20_crypt (uint32_t *state, uint8_t *dst, const uint8_t *src,
>> + size_t bytes)
>> +{
>> + uint32_t stream[CHACHA20_BLOCK_WORDS];
>> +
>> + while (bytes >= CHACHA20_BLOCK_SIZE)
>> + {
>> + chacha20_block (state, stream);
>> +#ifdef CHACHA20_XOR_FINAL
>> + for (int i = 0; i < CHACHA20_BLOCK_WORDS; i++)
>> + stream[i] ^= read_unaligned_32 (&src[i * sizeof (uint32_t)]);
>> +#endif
>> + memcpy (dst, stream, CHACHA20_BLOCK_SIZE);
>> + bytes -= CHACHA20_BLOCK_SIZE;
>> + dst += CHACHA20_BLOCK_SIZE;
>> + src += CHACHA20_BLOCK_SIZE;
>> + }
>> + if (bytes != 0)
>> + {
>> + chacha20_block (state, stream);
>> +#ifdef CHACHA20_XOR_FINAL
>> + for (int i = 0; i < CHACHA20_BLOCK_WORDS; i++)
>> + stream[i] ^= read_unaligned_32 (&src[i * sizeof (uint32_t)]);
>> +#endif
>> + memcpy (dst, stream, bytes);
>> + }
>> +}
>> diff --git a/stdlib/stdlib.h b/stdlib/stdlib.h
>> index bf7cd438e1..f2b0c83c12 100644
>> --- a/stdlib/stdlib.h
>> +++ b/stdlib/stdlib.h
>> @@ -485,6 +485,7 @@ extern unsigned short int *seed48 (unsigned short int __seed16v[3])
>> extern void lcong48 (unsigned short int __param[7]) __THROW __nonnull ((1));
>>
>> # ifdef __USE_MISC
>> +# include <bits/stdint-uintn.h>
>> /* Data structure for communication with thread safe versions. This
>> type is to be regarded as opaque. It's only exported because users
>> have to allocate objects of this type. */
>> @@ -533,6 +534,19 @@ extern int seed48_r (unsigned short int __seed16v[3],
>> extern int lcong48_r (unsigned short int __param[7],
>> struct drand48_data *__buffer)
>> __THROW __nonnull ((1, 2));
>> +
>> +/* Return a random integer between zero and 2**31-1 (inclusive). */
>
> 2**32-1
>
Ack.
More information about the Libc-alpha
mailing list