[PATCH v3 1/9] stdlib: Add arc4random, arc4random_buf, and arc4random_uniform (BZ #4417)

Adhemerval Zanella adhemerval.zanella@linaro.org
Mon Apr 25 12:26:57 GMT 2022



On 24/04/2022 23:22, Mark Harris wrote:
> On Tue, Apr 19, 2022 at 2:29 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> The implementation is based on scalar Chacha20, with global cache and
>> locking.  It uses getrandom or /dev/urandom as fallback to get the
>> initial entropy, and reseeds the internal state on every 16MB of
>> consumed buffer.
>>
>> It maintains an internal buffer which consumes at maximum one page on
>> most systems (assuming minimum of 4k pages).  The internal buf optimizes
>> the cipher encrypt calls, by amortize arc4random calls (where both
> 
> s/amortize/amortizing/
> 

Ack.

>> function call and locks cost are the dominating factor).
> 
> s/locks/lock/

Ack.

> 
>>
>> The ChaCha20 implementation is based on the RFC8439 [1], with last
>> step that XOR with the input omited.  Since the input stream will either
>> zero bytes (initial state) or the PRNG output itself this step does not
>> add any extra entropy.
> 
> The src argument to chacha20_crypt is always zeros, never PRNG output.
> Perhaps it would be clearer to say something like this:
> 
> The ChaCha20 implementation is based on RFC8439 [1], omitting the final
> XOR of the keystream with the plaintext because the plaintext is a
> stream of zeros.

Ack, I have also added a remark it follow OpenBSD strategy:

  The ChaCha20 implementation is based on RFC8439 [1], omitting the final                       
  XOR of the keystream with the plaintext because the plaintext is a                            
  stream of zeros.  This strategy is similar to what OpenBSD arc4random                         
  does.  

>> diff --git a/include/stdlib.h b/include/stdlib.h
>> index 1c6f70b082..055f9d2965 100644
>> --- a/include/stdlib.h
>> +++ b/include/stdlib.h
>> @@ -144,6 +144,19 @@ libc_hidden_proto (__ptsname_r)
>>  libc_hidden_proto (grantpt)
>>  libc_hidden_proto (unlockpt)
>>
>> +__typeof (arc4random) __arc4random;
>> +libc_hidden_proto (__arc4random);
>> +__typeof (arc4random_buf) __arc4random_buf;
>> +libc_hidden_proto (__arc4random_buf);
>> +__typeof (arc4random_uniform) __arc4random_uniform;
>> +libc_hidden_proto (__arc4random_uniform);
>> +extern void __arc4random_buf_internal (void *buffer, size_t len)
>> +     attribute_hidden;
>> +/* Called from the fork function to reinitialize the internal lock in thte
> 
> s/thte/the/

Ack.


>> +/* Besides the cipher state 'ctx', it keeps two counters: 'have' is the
>> +   current valid bytes not yet consumed in 'buf', while 'count' is the maximum
>> +   number of bytes until a reseed.
>> +
>> +   Both the initial seed an reseed tries to obtain entropy from the kernel
>> +   and abort the process if none could be obtained.
>> +
>> +   The state 'buf' improves the usage of the cipher call, allowing to call
>> +   optimized implementations (if the archictecture provides it) and optimize
>> +   arc4random calls (since only multiple call it will encrypt the next block).
>> + */
>> +
>> +/* Maximum number bytes until reseed (16 MB).  */
>> +#define CHACHE_RESEED_SIZE     (16 * 1024 * 1024)

It should, I changed it.

> 
> Should this be CHACHA20_RESEED_SIZE?
> 
>> +/* Internal buffer size in bytes (1KB).  */
>> +#define CHACHA20_BUFSIZE        (8 * CHACHA20_BLOCK_SIZE)
> 
> 8 * 64 = 512; should this be (16 * CHACHA20_BLOCK_SIZE)?

I updated the comment in fact.  I changed to 512 on v3 the optimize the
lockless optimization TCB buffer size.


>> +/* Fork detection is done by checking if MADV_WIPEONFORK supported.  If not
>> +   the fork callback will reset the state on the fork call.  It does not
>> +   handle direct clone calls, nor vfork or _Fork (arc4random is not
>> +   async-signal-safe due the internal lock usage).  */
>> +static void
>> +arc4random_init (uint8_t *buf, size_t len)
> 
> len is not used in this function.

Indeed, I removed it.

> 
>> +{
>> +  state = __mmap (NULL, sizeof (struct arc4random_state),
>> +                 PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>> +  if (state == MAP_FAILED)
>> +    arc4random_allocate_failure ();
>> +
>> +#ifdef MADV_WIPEONFORK
>> +  int r = __madvise (state, sizeof (struct arc4random_state), MADV_WIPEONFORK);
>> +  if (r == 0)
>> +    __arc4random_wipeonfork = true;
>> +  else if (errno != EINVAL)
>> +    arc4random_allocate_failure ();
>> +#endif
>> +
>> +  chacha20_init (state->ctx, buf, buf + CHACHA20_KEY_SIZE);
>> +}
>> +
>> +#define min(x,y) (((x) > (y)) ? (y) : (x))
>> +
>> +static void
>> +arc4random_rekey (uint8_t *rnd, size_t rndlen)
>> +{
>> +  memset (state->buf, 0, sizeof state->buf);
>> +  chacha20_crypt (state->ctx, state->buf, state->buf, sizeof state->buf);
>> +
>> +  /* Mix some extra entropy if provided.  */
>> +  if (rnd != NULL)
>> +    {
>> +      size_t m = min (rndlen, CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE);
>> +      for (size_t i = 0; i < m; i++)
>> +       state->buf[i] ^= rnd[i];
>> +    }
>> +
>> +  /* Immediately reinit for backtracking resistance.  */
>> +  chacha20_init (state->ctx, state->buf, state->buf + CHACHA20_KEY_SIZE);
>> +  memset (state->buf, 0, CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE);
>> +  state->have = sizeof (state->buf) - (CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE);
>> +}
>> +
>> +static void
>> +arc4random_getentropy (uint8_t *rnd, size_t len)
>> +{
>> +  if (__getrandomn_nocancel (rnd, len, GRND_NONBLOCK) == len)
>> +    return;
>> +
>> +  int fd = __open64_nocancel ("/dev/urandom", O_RDONLY);
> 
> Should this be O_RDONLY | O_CLOEXEC?

It should, I have change it.

> 
>> +  if (fd != -1)
>> +    {
>> +      unsigned char *p = rnd;
>> +      unsigned char *end = p + len;
> 
> uint8_t * would be consistent with the declaration of md.

Ack.

> 
>> +      do
>> +       {
>> +         ssize_t ret = TEMP_FAILURE_RETRY (__read_nocancel (fd, p, end - p));
>> +         if (ret <= 0)
>> +           arc4random_getrandom_failure ();
>> +         p += ret;
>> +       }
>> +      while (p < end);
>> +
>> +      if (__close_nocancel (fd) != 0)
> 
> Should this be == 0?

Ack.

>> +static uint32_t
>> +compute_uniform (uint32_t n)
>> +{
>> +  if (n <= 1)
>> +    /* There is no valid return value for a zero limit, and 0 is the
>> +       only possible result for limit 1.  */
>> +    return 0;
>> +
>> +  /* The bits variable serves as a source for bits.  Prefetch the
>> +     minimum number of bytes needed.  */
>> +  unsigned count = byte_count (n);
> 
> uint32_t would be consistent with the declaration of byte_count.

Ack.

> 
>> +  uint32_t bits_length = count * CHAR_BIT;
>> +  uint32_t bits;
>> +  random_bytes (&bits, count);
>> +
>> +  /* Powers of two are easy.  */
>> +  if (powerof2 (n))
>> +    return bits & (n - 1);
>> +
>> +  /* The general case.  This algorithm follows Jérémie Lumbroso,
>> +     Optimal Discrete Uniform Generation from Coin Flips, and
>> +     Applications (2013), who credits Donald E. Knuth and Andrew
>> +     C. Yao, The complexity of nonuniform random number generation
>> +     (1976), for solving the general case.
>> +
>> +     The implementation below unrolls the initialization stage of the
>> +     loop, where v is less than n.  */
>> +
>> +  /* Use 64-bit variables even though the intermediate results are
>> +     never larger that 33 bits.  This ensures the code easier to
> 
> s/that/than/
> s/the code/that the code is/

Ack.

>> +
>> +/* 32-bit stream position, then 96-bit nonce.  */
>> +#define CHACHA20_IV_SIZE       16
>> +#define CHACHA20_KEY_SIZE      32
>> +
>> +#define CHACHA20_BLOCK_SIZE     64
>> +#define CHACHA20_BLOCK_WORDS    (CHACHA20_BLOCK_SIZE / sizeof (uint32_t))
>> +
>> +#define CHACHA20_STATE_LEN     16
>> +
>> +/* Defining CHACHA20_XOR_FINAL issues the final XOR using the input as defined
>> +   Sby RFC8439.  Since the input stream will either zero bytes (initial state)
> 
> s/Sby/by/

Ack.

> 
>> +   or the PRNG output itself this step does not add any extra entropy.   */
> 
> The plaintext input stream (src argument to chacha20_crypt) is always
> zeros, never PRNG output.

Indeed, I have change to the suggestion you gave above.

> 
>> +
>> +enum chacha20_constants
>> +{
>> +  CHACHA20_CONSTANT_EXPA = 0x61707865U,
>> +  CHACHA20_CONSTANT_ND_3 = 0x3320646eU,
>> +  CHACHA20_CONSTANT_2_BY = 0x79622d32U,
>> +  CHACHA20_CONSTANT_TE_K = 0x6b206574U
>> +};
>> +
>> +static inline uint32_t
>> +read_unaligned_32 (const uint8_t *p)
>> +{
>> +  uint32_t r;
>> +  memcpy (&r, p, sizeof (r));
>> +  return r;
>> +}
>> +
>> +static inline void
>> +write_unaligned_32 (uint8_t *p, uint32_t v)
>> +{
>> +  memcpy (p, &v, sizeof (v));
>> +}
>> +
>> +#if __BYTE_ORDER == __BIG_ENDIAN
>> +# define read_unaligned_le32(p) __builtin_bswap32 (read_unaligned_32 (p))
>> +# define set_state(v)          __builtin_bswap32 ((v))
>> +#else
>> +# define read_unaligned_le32(p) read_unaligned_32 ((p))
>> +# define set_state(v)          (v)
>> +#endif
>> +
>> +static inline void
>> +chacha20_init (uint32_t *state, const uint8_t *key, const uint8_t *iv)
>> +{
>> +  state[0]  = CHACHA20_CONSTANT_EXPA;
>> +  state[1]  = CHACHA20_CONSTANT_ND_3;
>> +  state[2]  = CHACHA20_CONSTANT_2_BY;
>> +  state[3]  = CHACHA20_CONSTANT_TE_K;
>> +
>> +  state[4]  = read_unaligned_le32 (key + 0 * sizeof (uint32_t));
>> +  state[5]  = read_unaligned_le32 (key + 1 * sizeof (uint32_t));
>> +  state[6]  = read_unaligned_le32 (key + 2 * sizeof (uint32_t));
>> +  state[7]  = read_unaligned_le32 (key + 3 * sizeof (uint32_t));
>> +  state[8]  = read_unaligned_le32 (key + 4 * sizeof (uint32_t));
>> +  state[9]  = read_unaligned_le32 (key + 5 * sizeof (uint32_t));
>> +  state[10] = read_unaligned_le32 (key + 6 * sizeof (uint32_t));
>> +  state[11] = read_unaligned_le32 (key + 7 * sizeof (uint32_t));
>> +
>> +  state[12] = read_unaligned_le32 (iv + 0 * sizeof (uint32_t));
>> +  state[13] = read_unaligned_le32 (iv + 1 * sizeof (uint32_t));
>> +  state[14] = read_unaligned_le32 (iv + 2 * sizeof (uint32_t));
>> +  state[15] = read_unaligned_le32 (iv + 3 * sizeof (uint32_t));
>> +}
>> +
>> +static inline uint32_t
>> +rotl32 (unsigned int shift, uint32_t word)
>> +{
>> +  return (word << (shift & 31)) | (word >> ((-shift) & 31));
>> +}
>> +
>> +#define QROUND(x0, x1, x2, x3)                         \
>> +  do {                                         \
>> +   x0 = x0 + x1; x3 = rotl32 (16, (x0 ^ x3));  \
>> +   x2 = x2 + x3; x1 = rotl32 (12, (x1 ^ x2));  \
>> +   x0 = x0 + x1; x3 = rotl32 (8,  (x0 ^ x3));  \
>> +   x2 = x2 + x3; x1 = rotl32 (7,  (x1 ^ x2));  \
>> +  } while(0)
>> +
>> +static inline void
>> +chacha20_block (uint32_t *state, uint32_t *stream)
>> +{
>> +  uint32_t x[CHACHA20_STATE_LEN];
>> +  memcpy (x, state, sizeof x);
>> +
>> +  for (int i = 0; i < 20; i += 2)
>> +    {
>> +      QROUND (x[0], x[4], x[8],  x[12]);
>> +      QROUND (x[1], x[5], x[9],  x[13]);
>> +      QROUND (x[2], x[6], x[10], x[14]);
>> +      QROUND (x[3], x[7], x[11], x[15]);
>> +
>> +      QROUND (x[0], x[5], x[10], x[15]);
>> +      QROUND (x[1], x[6], x[11], x[12]);
>> +      QROUND (x[2], x[7], x[8],  x[13]);
>> +      QROUND (x[3], x[4], x[9],  x[14]);
>> +    }
>> +
>> +  /* Unroll the loop a bit.  */
>> +  for (int i = 0; i < CHACHA20_BLOCK_WORDS / 4; i++)
>> +    {
>> +      stream[i*4+0] = set_state (x[i*4+0] + state[i*4+0]);
>> +      stream[i*4+1] = set_state (x[i*4+1] + state[i*4+1]);
>> +      stream[i*4+2] = set_state (x[i*4+2] + state[i*4+2]);
>> +      stream[i*4+3] = set_state (x[i*4+3] + state[i*4+3]);
>> +    }
>> +
>> +  state[12]++;
>> +}
>> +
>> +static void
>> +chacha20_crypt (uint32_t *state, uint8_t *dst, const uint8_t *src,
>> +               size_t bytes)
>> +{
>> +  uint32_t stream[CHACHA20_BLOCK_WORDS];
>> +
>> +  while (bytes >= CHACHA20_BLOCK_SIZE)
>> +    {
>> +      chacha20_block (state, stream);
>> +#ifdef CHACHA20_XOR_FINAL
>> +      for (int i = 0; i < CHACHA20_BLOCK_WORDS; i++)
>> +       stream[i] ^= read_unaligned_32 (&src[i * sizeof (uint32_t)]);
>> +#endif
>> +      memcpy (dst, stream, CHACHA20_BLOCK_SIZE);
>> +      bytes -= CHACHA20_BLOCK_SIZE;
>> +      dst += CHACHA20_BLOCK_SIZE;
>> +      src += CHACHA20_BLOCK_SIZE;
>> +    }
>> +  if (bytes != 0)
>> +    {
>> +      chacha20_block (state, stream);
>> +#ifdef CHACHA20_XOR_FINAL
>> +      for (int i = 0; i < CHACHA20_BLOCK_WORDS; i++)
>> +       stream[i] ^= read_unaligned_32 (&src[i * sizeof (uint32_t)]);
>> +#endif
>> +      memcpy (dst, stream, bytes);
>> +    }
>> +}
>> diff --git a/stdlib/stdlib.h b/stdlib/stdlib.h
>> index bf7cd438e1..f2b0c83c12 100644
>> --- a/stdlib/stdlib.h
>> +++ b/stdlib/stdlib.h
>> @@ -485,6 +485,7 @@ extern unsigned short int *seed48 (unsigned short int __seed16v[3])
>>  extern void lcong48 (unsigned short int __param[7]) __THROW __nonnull ((1));
>>
>>  # ifdef __USE_MISC
>> +#  include <bits/stdint-uintn.h>
>>  /* Data structure for communication with thread safe versions.  This
>>     type is to be regarded as opaque.  It's only exported because users
>>     have to allocate objects of this type.  */
>> @@ -533,6 +534,19 @@ extern int seed48_r (unsigned short int __seed16v[3],
>>  extern int lcong48_r (unsigned short int __param[7],
>>                       struct drand48_data *__buffer)
>>       __THROW __nonnull ((1, 2));
>> +
>> +/* Return a random integer between zero and 2**31-1 (inclusive).  */
> 
> 2**32-1
> 

Ack.


More information about the Libc-alpha mailing list