This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH v2 1/2] iconv_prog: Don't slurp whole input at once [BZ #6050]
- From: OndÅej BÃlka <neleai at seznam dot cz>
- To: Benjamin Herr <ben at 0x539 dot de>
- Cc: libc-alpha at sourceware dot org
- Date: Wed, 12 Aug 2015 00:53:22 +0200
- Subject: Re: [PATCH v2 1/2] iconv_prog: Don't slurp whole input at once [BZ #6050]
- Authentication-results: sourceware.org; auth=none
- References: <1439332066-13664-1-git-send-email-ben at 0x539 dot de> <1439332066-13664-2-git-send-email-ben at 0x539 dot de>
On Wed, Aug 12, 2015 at 12:27:45AM +0200, Benjamin Herr wrote:
> As described in the bug report comments, contrary to the comment in
> 'process_fd' , iconv can deal with being passed invalid multibyte
> sequences, as it leaves the input pointer at the start of the sequence
> and we can just call it again once more bytes are availabe.
>
> Therefore we do not need to read the whole input at once when reading
> from an fd, and can process it in chunks instead. This enables the iconv
> program to be used in pipelines sensibly, and to not choke on very large
> input files.
>
> To still support reporting the position of invalid sequences in the
> input, we count consumed input bytes explicitly. As overflow of that
> counter is now possible, we use saturating addition and just mention in
> the error message when (size_t) -1 has been reached.
> ---
> 2015-08-08 Benjamin Herr <ben@0x539.de>
>
> [BZ #6050]
> * iconv/iconv_prog.c: Don't slurp the whole input at once, instead
> pass chunks to iconv and carry over incomplete multibyte sequences.
>
Could you next time split patches like that to first do refactoring to
shuffle lines around and then actual patch? It simplifies review.
I read this patch and it look reasonable but is quite big so I would
prefer second set of eyes to also check that.
>
> iconv/iconv_prog.c | 247 +++++++++++++++++++++++++++++------------------------
> 1 file changed, 135 insertions(+), 112 deletions(-)
>
> diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c
> index e249bce..63b8e33 100644
> --- a/iconv/iconv_prog.c
> +++ b/iconv/iconv_prog.c
> @@ -465,29 +465,93 @@ conversion stopped due to problem in writing the output"));
> return 0;
> }
>
> +static void
> +report_iconv_error (size_t position)
> +{
> + switch (errno)
> + {
> + case EILSEQ:
> + if (position < (size_t) -1)
> + error (0, 0,
> + _("illegal input sequence at position %zu"), position);
> + else
> + error (0, 0,
> + _("illegal input sequence past position %zu"),
> + position - 1);
> + break;
> + case EINVAL:
> + error (0, 0, _("\
> +incomplete character or shift sequence at end of buffer"));
> + break;
> + case EBADF:
> + error (0, 0, _("internal error (illegal descriptor)"));
> + break;
> + default:
> + error (0, 0, _("unknown iconv() error %d"), errno);
> + break;
> + }
> +}
>
ok, as its moved from below.
> +#define BUF_SIZE 32768
> static int
> -process_block (iconv_t cd, char *addr, size_t len, FILE **output,
> - const char *output_file)
> +flush_state (iconv_t cd, FILE **output, const char *output_file, size_t offset)
> {
> -#define OUTBUF_SIZE 32768
> - const char *start = addr;
> - char outbuf[OUTBUF_SIZE];
> + char outbuf[BUF_SIZE];
> + char *outptr;
> + size_t outlen;
> + size_t n;
> +
> + /* All the input test is processed. For state-dependent
> + character sets we have to flush the state now. */
> + outptr = outbuf;
> + outlen = BUF_SIZE;
> + n = iconv (cd, NULL, NULL, &outptr, &outlen);
> +
> + if (outptr != outbuf)
> + {
> + if (write_output (outbuf, outptr, output, output_file) == -1)
> + return -1;
> + }
> +
> + if (n == (size_t) -1)
> + {
> + report_iconv_error (offset);
> + return 1;
> + }
> +
> + return 0;
> +}
> +
ok
> +static size_t
> +saturating_add (size_t a, size_t b)
> +{
> + if ((size_t) -1 - a > b)
> + return a + b;
> + else
> + return -1;
> +}
> +
ok
> +static int
> +process_part (iconv_t cd, char **addr, size_t *len, size_t offset,
> + FILE **output, const char *output_file)
> +{
> + const char *start = *addr;
> + char outbuf[BUF_SIZE];
> char *outptr;
> size_t outlen;
> size_t n;
> int ret = 0;
>
> - while (len > 0)
> + while (*len > 0)
> {
> outptr = outbuf;
> - outlen = OUTBUF_SIZE;
> - n = iconv (cd, &addr, &len, &outptr, &outlen);
> + outlen = BUF_SIZE;
> + n = iconv (cd, addr, len, &outptr, &outlen);
>
> if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
> {
> ret = 1;
> - if (len == 0)
> + if (*len == 0)
> n = 0;
> else
> errno = E2BIG;
> @@ -500,52 +564,18 @@ process_block (iconv_t cd, char *addr, size_t len, FILE **output,
> break;
> }
>
> - if (n != (size_t) -1)
> + /* Incomplete multibyte characters might be completed by the next
> + chunk, so do not treat them as an error here. */
> + if (n != (size_t) -1 || errno == EINVAL)
> {
> - /* All the input test is processed. For state-dependent
> - character sets we have to flush the state now. */
> - outptr = outbuf;
> - outlen = OUTBUF_SIZE;
> - n = iconv (cd, NULL, NULL, &outptr, &outlen);
> -
> - if (outptr != outbuf)
> - {
> - ret = write_output (outbuf, outptr, output, output_file);
> - if (ret != 0)
> - break;
> - }
> -
> - if (n != (size_t) -1)
> - break;
> -
> - if (omit_invalid && errno == EILSEQ)
> - {
> - ret = 1;
> - break;
> - }
> + ret = 0;
> + break;
> }
>
> if (errno != E2BIG)
> {
> /* iconv() ran into a problem. */
> - switch (errno)
> - {
> - case EILSEQ:
> - if (! omit_invalid)
> - error (0, 0, _("illegal input sequence at position %ld"),
> - (long int) (addr - start));
> - break;
> - case EINVAL:
> - error (0, 0, _("\
> -incomplete character or shift sequence at end of buffer"));
> - break;
> - case EBADF:
> - error (0, 0, _("internal error (illegal descriptor)"));
> - break;
> - default:
> - error (0, 0, _("unknown iconv() error %d"), errno);
> - break;
> - }
> + report_iconv_error (saturating_add (offset, (*addr - start)));
>
> return -1;
> }
> @@ -554,83 +584,76 @@ incomplete character or shift sequence at end of buffer"));
> return ret;
> }
>
ok
> -
> static int
> -process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
> +process_block (iconv_t cd, char *addr, size_t len, FILE **output,
> + const char *output_file)
> {
> - /* we have a problem with reading from a desriptor since we must not
> - provide the iconv() function an incomplete character or shift
> - sequence at the end of the buffer. Since we have to deal with
> - arbitrary encodings we must read the whole text in a buffer and
> - process it in one step. */
> - static char *inbuf = NULL;
> - static size_t maxlen = 0;
> - char *inptr = NULL;
> - size_t actlen = 0;
> -
> - while (actlen < maxlen)
> - {
> - ssize_t n = read (fd, inptr, maxlen - actlen);
> + char *start = addr;
>
> - if (n == 0)
> - /* No more text to read. */
> - break;
> + /* Process everything in one go. */
> + int ret = process_part (cd, &addr, &len, 0, output, output_file);
>
> - if (n == -1)
> - {
> - /* Error while reading. */
> - error (0, errno, _("error while reading the input"));
> - return -1;
> - }
> + if (ret != 0)
> + return ret;
>
> - inptr += n;
> - actlen += n;
> + /* If there is input left over, there is an incomplete multibyte
> + sequence at the end. */
> + if (len > 0)
> + {
> + errno = EINVAL;
> + report_iconv_error (addr - start);
> + return -1;
> }
>
> - if (actlen == maxlen)
> - while (1)
> - {
> - ssize_t n;
> - char *new_inbuf;
> + return flush_state (cd, output, output_file, addr - start);
> +}
>
> - /* Increase the buffer. */
> - new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
> - if (new_inbuf == NULL)
> - {
> - error (0, errno, _("unable to allocate buffer for input"));
> - return -1;
> - }
> - inbuf = new_inbuf;
> - maxlen += 32768;
> - inptr = inbuf + actlen;
>
> - do
> - {
> - n = read (fd, inptr, maxlen - actlen);
> +static int
> +process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
> +{
> + char inbuf[BUF_SIZE];
> + size_t len = 0;
> + size_t offset = 0;
> + ssize_t n;
>
> - if (n == 0)
> - /* No more text to read. */
> - break;
> + /* Read into the buffer past unconsumed bytes from the last iteration. */
> + while ((n = read (fd, inbuf + len, BUF_SIZE - len)) > 0)
> + {
> + int ret;
> + char *inptr;
>
> - if (n == -1)
> - {
> - /* Error while reading. */
> - error (0, errno, _("error while reading the input"));
> - return -1;
> - }
> + len += n;
>
> - inptr += n;
> - actlen += n;
> - }
> - while (actlen < maxlen);
> + inptr = inbuf;
> + /* Process what we have read. */
> + ret = process_part (cd, &inptr, &len, offset, output, output_file);
> + if (ret != 0)
> + return ret;
> + /* Keep track of overall position in the input for error reporting. */
> + offset = saturating_add (offset, inptr - inbuf);
>
> - if (n == 0)
> - /* Break again so we leave both loops. */
> - break;
> - }
> + /* Keep incomplete multibyte characters, if any. */
> + memmove (inbuf, inptr, len);
> + }
> +
> + if (n == -1)
> + {
> + /* Error while reading. */
> + error (0, errno, _("error while reading the input"));
> + return -1;
> + }
> +
> + /* No more input available, so incomplete multibyte sequences are
> + not going to get completed at this point. */
> + if (len > 0)
> + {
> + errno = EINVAL;
> + report_iconv_error (offset);
> + return -1;
> + }
>
> - /* Now we have all the input in the buffer. Process it in one run. */
> - return process_block (cd, inbuf, actlen, output, output_file);
> + return flush_state (cd, output, output_file, offset);
> }
>
>