Message ID | 1439332066-13664-2-git-send-email-ben@0x539.de |
---|---|
State | New |
Headers | show |
On Wed, Aug 12, 2015 at 12:27:45AM +0200, Benjamin Herr wrote: > As described in the bug report comments, contrary to the comment in > 'process_fd' , iconv can deal with being passed invalid multibyte > sequences, as it leaves the input pointer at the start of the sequence > and we can just call it again once more bytes are availabe. > > Therefore we do not need to read the whole input at once when reading > from an fd, and can process it in chunks instead. This enables the iconv > program to be used in pipelines sensibly, and to not choke on very large > input files. > > To still support reporting the position of invalid sequences in the > input, we count consumed input bytes explicitly. As overflow of that > counter is now possible, we use saturating addition and just mention in > the error message when (size_t) -1 has been reached. > --- > 2015-08-08 Benjamin Herr <ben@0x539.de> > > [BZ #6050] > * iconv/iconv_prog.c: Don't slurp the whole input at once, instead > pass chunks to iconv and carry over incomplete multibyte sequences. > Could you next time split patches like that to first do refactoring to shuffle lines around and then actual patch? It simplifies review. I read this patch and it look reasonable but is quite big so I would prefer second set of eyes to also check that. > > iconv/iconv_prog.c | 247 +++++++++++++++++++++++++++++------------------------ > 1 file changed, 135 insertions(+), 112 deletions(-) > > diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c > index e249bce..63b8e33 100644 > --- a/iconv/iconv_prog.c > +++ b/iconv/iconv_prog.c > @@ -465,29 +465,93 @@ conversion stopped due to problem in writing the output")); > return 0; > } > > +static void > +report_iconv_error (size_t position) > +{ > + switch (errno) > + { > + case EILSEQ: > + if (position < (size_t) -1) > + error (0, 0, > + _("illegal input sequence at position %zu"), position); > + else > + error (0, 0, > + _("illegal input sequence past position %zu"), > + position - 1); > + break; > + case EINVAL: > + error (0, 0, _("\ > +incomplete character or shift sequence at end of buffer")); > + break; > + case EBADF: > + error (0, 0, _("internal error (illegal descriptor)")); > + break; > + default: > + error (0, 0, _("unknown iconv() error %d"), errno); > + break; > + } > +} > ok, as its moved from below. > +#define BUF_SIZE 32768 > static int > -process_block (iconv_t cd, char *addr, size_t len, FILE **output, > - const char *output_file) > +flush_state (iconv_t cd, FILE **output, const char *output_file, size_t offset) > { > -#define OUTBUF_SIZE 32768 > - const char *start = addr; > - char outbuf[OUTBUF_SIZE]; > + char outbuf[BUF_SIZE]; > + char *outptr; > + size_t outlen; > + size_t n; > + > + /* All the input test is processed. For state-dependent > + character sets we have to flush the state now. */ > + outptr = outbuf; > + outlen = BUF_SIZE; > + n = iconv (cd, NULL, NULL, &outptr, &outlen); > + > + if (outptr != outbuf) > + { > + if (write_output (outbuf, outptr, output, output_file) == -1) > + return -1; > + } > + > + if (n == (size_t) -1) > + { > + report_iconv_error (offset); > + return 1; > + } > + > + return 0; > +} > + ok > +static size_t > +saturating_add (size_t a, size_t b) > +{ > + if ((size_t) -1 - a > b) > + return a + b; > + else > + return -1; > +} > + ok > +static int > +process_part (iconv_t cd, char **addr, size_t *len, size_t offset, > + FILE **output, const char *output_file) > +{ > + const char *start = *addr; > + char outbuf[BUF_SIZE]; > char *outptr; > size_t outlen; > size_t n; > int ret = 0; > > - while (len > 0) > + while (*len > 0) > { > outptr = outbuf; > - outlen = OUTBUF_SIZE; > - n = iconv (cd, &addr, &len, &outptr, &outlen); > + outlen = BUF_SIZE; > + n = iconv (cd, addr, len, &outptr, &outlen); > > if (n == (size_t) -1 && omit_invalid && errno == EILSEQ) > { > ret = 1; > - if (len == 0) > + if (*len == 0) > n = 0; > else > errno = E2BIG; > @@ -500,52 +564,18 @@ process_block (iconv_t cd, char *addr, size_t len, FILE **output, > break; > } > > - if (n != (size_t) -1) > + /* Incomplete multibyte characters might be completed by the next > + chunk, so do not treat them as an error here. */ > + if (n != (size_t) -1 || errno == EINVAL) > { > - /* All the input test is processed. For state-dependent > - character sets we have to flush the state now. */ > - outptr = outbuf; > - outlen = OUTBUF_SIZE; > - n = iconv (cd, NULL, NULL, &outptr, &outlen); > - > - if (outptr != outbuf) > - { > - ret = write_output (outbuf, outptr, output, output_file); > - if (ret != 0) > - break; > - } > - > - if (n != (size_t) -1) > - break; > - > - if (omit_invalid && errno == EILSEQ) > - { > - ret = 1; > - break; > - } > + ret = 0; > + break; > } > > if (errno != E2BIG) > { > /* iconv() ran into a problem. */ > - switch (errno) > - { > - case EILSEQ: > - if (! omit_invalid) > - error (0, 0, _("illegal input sequence at position %ld"), > - (long int) (addr - start)); > - break; > - case EINVAL: > - error (0, 0, _("\ > -incomplete character or shift sequence at end of buffer")); > - break; > - case EBADF: > - error (0, 0, _("internal error (illegal descriptor)")); > - break; > - default: > - error (0, 0, _("unknown iconv() error %d"), errno); > - break; > - } > + report_iconv_error (saturating_add (offset, (*addr - start))); > > return -1; > } > @@ -554,83 +584,76 @@ incomplete character or shift sequence at end of buffer")); > return ret; > } > ok > - > static int > -process_fd (iconv_t cd, int fd, FILE **output, const char *output_file) > +process_block (iconv_t cd, char *addr, size_t len, FILE **output, > + const char *output_file) > { > - /* we have a problem with reading from a desriptor since we must not > - provide the iconv() function an incomplete character or shift > - sequence at the end of the buffer. Since we have to deal with > - arbitrary encodings we must read the whole text in a buffer and > - process it in one step. */ > - static char *inbuf = NULL; > - static size_t maxlen = 0; > - char *inptr = NULL; > - size_t actlen = 0; > - > - while (actlen < maxlen) > - { > - ssize_t n = read (fd, inptr, maxlen - actlen); > + char *start = addr; > > - if (n == 0) > - /* No more text to read. */ > - break; > + /* Process everything in one go. */ > + int ret = process_part (cd, &addr, &len, 0, output, output_file); > > - if (n == -1) > - { > - /* Error while reading. */ > - error (0, errno, _("error while reading the input")); > - return -1; > - } > + if (ret != 0) > + return ret; > > - inptr += n; > - actlen += n; > + /* If there is input left over, there is an incomplete multibyte > + sequence at the end. */ > + if (len > 0) > + { > + errno = EINVAL; > + report_iconv_error (addr - start); > + return -1; > } > > - if (actlen == maxlen) > - while (1) > - { > - ssize_t n; > - char *new_inbuf; > + return flush_state (cd, output, output_file, addr - start); > +} > > - /* Increase the buffer. */ > - new_inbuf = (char *) realloc (inbuf, maxlen + 32768); > - if (new_inbuf == NULL) > - { > - error (0, errno, _("unable to allocate buffer for input")); > - return -1; > - } > - inbuf = new_inbuf; > - maxlen += 32768; > - inptr = inbuf + actlen; > > - do > - { > - n = read (fd, inptr, maxlen - actlen); > +static int > +process_fd (iconv_t cd, int fd, FILE **output, const char *output_file) > +{ > + char inbuf[BUF_SIZE]; > + size_t len = 0; > + size_t offset = 0; > + ssize_t n; > > - if (n == 0) > - /* No more text to read. */ > - break; > + /* Read into the buffer past unconsumed bytes from the last iteration. */ > + while ((n = read (fd, inbuf + len, BUF_SIZE - len)) > 0) > + { > + int ret; > + char *inptr; > > - if (n == -1) > - { > - /* Error while reading. */ > - error (0, errno, _("error while reading the input")); > - return -1; > - } > + len += n; > > - inptr += n; > - actlen += n; > - } > - while (actlen < maxlen); > + inptr = inbuf; > + /* Process what we have read. */ > + ret = process_part (cd, &inptr, &len, offset, output, output_file); > + if (ret != 0) > + return ret; > + /* Keep track of overall position in the input for error reporting. */ > + offset = saturating_add (offset, inptr - inbuf); > > - if (n == 0) > - /* Break again so we leave both loops. */ > - break; > - } > + /* Keep incomplete multibyte characters, if any. */ > + memmove (inbuf, inptr, len); > + } > + > + if (n == -1) > + { > + /* Error while reading. */ > + error (0, errno, _("error while reading the input")); > + return -1; > + } > + > + /* No more input available, so incomplete multibyte sequences are > + not going to get completed at this point. */ > + if (len > 0) > + { > + errno = EINVAL; > + report_iconv_error (offset); > + return -1; > + } > > - /* Now we have all the input in the buffer. Process it in one run. */ > - return process_block (cd, inbuf, actlen, output, output_file); > + return flush_state (cd, output, output_file, offset); > } > >
diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c index e249bce..63b8e33 100644 --- a/iconv/iconv_prog.c +++ b/iconv/iconv_prog.c @@ -465,29 +465,93 @@ conversion stopped due to problem in writing the output")); return 0; } +static void +report_iconv_error (size_t position) +{ + switch (errno) + { + case EILSEQ: + if (position < (size_t) -1) + error (0, 0, + _("illegal input sequence at position %zu"), position); + else + error (0, 0, + _("illegal input sequence past position %zu"), + position - 1); + break; + case EINVAL: + error (0, 0, _("\ +incomplete character or shift sequence at end of buffer")); + break; + case EBADF: + error (0, 0, _("internal error (illegal descriptor)")); + break; + default: + error (0, 0, _("unknown iconv() error %d"), errno); + break; + } +} +#define BUF_SIZE 32768 static int -process_block (iconv_t cd, char *addr, size_t len, FILE **output, - const char *output_file) +flush_state (iconv_t cd, FILE **output, const char *output_file, size_t offset) { -#define OUTBUF_SIZE 32768 - const char *start = addr; - char outbuf[OUTBUF_SIZE]; + char outbuf[BUF_SIZE]; + char *outptr; + size_t outlen; + size_t n; + + /* All the input test is processed. For state-dependent + character sets we have to flush the state now. */ + outptr = outbuf; + outlen = BUF_SIZE; + n = iconv (cd, NULL, NULL, &outptr, &outlen); + + if (outptr != outbuf) + { + if (write_output (outbuf, outptr, output, output_file) == -1) + return -1; + } + + if (n == (size_t) -1) + { + report_iconv_error (offset); + return 1; + } + + return 0; +} + +static size_t +saturating_add (size_t a, size_t b) +{ + if ((size_t) -1 - a > b) + return a + b; + else + return -1; +} + +static int +process_part (iconv_t cd, char **addr, size_t *len, size_t offset, + FILE **output, const char *output_file) +{ + const char *start = *addr; + char outbuf[BUF_SIZE]; char *outptr; size_t outlen; size_t n; int ret = 0; - while (len > 0) + while (*len > 0) { outptr = outbuf; - outlen = OUTBUF_SIZE; - n = iconv (cd, &addr, &len, &outptr, &outlen); + outlen = BUF_SIZE; + n = iconv (cd, addr, len, &outptr, &outlen); if (n == (size_t) -1 && omit_invalid && errno == EILSEQ) { ret = 1; - if (len == 0) + if (*len == 0) n = 0; else errno = E2BIG; @@ -500,52 +564,18 @@ process_block (iconv_t cd, char *addr, size_t len, FILE **output, break; } - if (n != (size_t) -1) + /* Incomplete multibyte characters might be completed by the next + chunk, so do not treat them as an error here. */ + if (n != (size_t) -1 || errno == EINVAL) { - /* All the input test is processed. For state-dependent - character sets we have to flush the state now. */ - outptr = outbuf; - outlen = OUTBUF_SIZE; - n = iconv (cd, NULL, NULL, &outptr, &outlen); - - if (outptr != outbuf) - { - ret = write_output (outbuf, outptr, output, output_file); - if (ret != 0) - break; - } - - if (n != (size_t) -1) - break; - - if (omit_invalid && errno == EILSEQ) - { - ret = 1; - break; - } + ret = 0; + break; } if (errno != E2BIG) { /* iconv() ran into a problem. */ - switch (errno) - { - case EILSEQ: - if (! omit_invalid) - error (0, 0, _("illegal input sequence at position %ld"), - (long int) (addr - start)); - break; - case EINVAL: - error (0, 0, _("\ -incomplete character or shift sequence at end of buffer")); - break; - case EBADF: - error (0, 0, _("internal error (illegal descriptor)")); - break; - default: - error (0, 0, _("unknown iconv() error %d"), errno); - break; - } + report_iconv_error (saturating_add (offset, (*addr - start))); return -1; } @@ -554,83 +584,76 @@ incomplete character or shift sequence at end of buffer")); return ret; } - static int -process_fd (iconv_t cd, int fd, FILE **output, const char *output_file) +process_block (iconv_t cd, char *addr, size_t len, FILE **output, + const char *output_file) { - /* we have a problem with reading from a desriptor since we must not - provide the iconv() function an incomplete character or shift - sequence at the end of the buffer. Since we have to deal with - arbitrary encodings we must read the whole text in a buffer and - process it in one step. */ - static char *inbuf = NULL; - static size_t maxlen = 0; - char *inptr = NULL; - size_t actlen = 0; - - while (actlen < maxlen) - { - ssize_t n = read (fd, inptr, maxlen - actlen); + char *start = addr; - if (n == 0) - /* No more text to read. */ - break; + /* Process everything in one go. */ + int ret = process_part (cd, &addr, &len, 0, output, output_file); - if (n == -1) - { - /* Error while reading. */ - error (0, errno, _("error while reading the input")); - return -1; - } + if (ret != 0) + return ret; - inptr += n; - actlen += n; + /* If there is input left over, there is an incomplete multibyte + sequence at the end. */ + if (len > 0) + { + errno = EINVAL; + report_iconv_error (addr - start); + return -1; } - if (actlen == maxlen) - while (1) - { - ssize_t n; - char *new_inbuf; + return flush_state (cd, output, output_file, addr - start); +} - /* Increase the buffer. */ - new_inbuf = (char *) realloc (inbuf, maxlen + 32768); - if (new_inbuf == NULL) - { - error (0, errno, _("unable to allocate buffer for input")); - return -1; - } - inbuf = new_inbuf; - maxlen += 32768; - inptr = inbuf + actlen; - do - { - n = read (fd, inptr, maxlen - actlen); +static int +process_fd (iconv_t cd, int fd, FILE **output, const char *output_file) +{ + char inbuf[BUF_SIZE]; + size_t len = 0; + size_t offset = 0; + ssize_t n; - if (n == 0) - /* No more text to read. */ - break; + /* Read into the buffer past unconsumed bytes from the last iteration. */ + while ((n = read (fd, inbuf + len, BUF_SIZE - len)) > 0) + { + int ret; + char *inptr; - if (n == -1) - { - /* Error while reading. */ - error (0, errno, _("error while reading the input")); - return -1; - } + len += n; - inptr += n; - actlen += n; - } - while (actlen < maxlen); + inptr = inbuf; + /* Process what we have read. */ + ret = process_part (cd, &inptr, &len, offset, output, output_file); + if (ret != 0) + return ret; + /* Keep track of overall position in the input for error reporting. */ + offset = saturating_add (offset, inptr - inbuf); - if (n == 0) - /* Break again so we leave both loops. */ - break; - } + /* Keep incomplete multibyte characters, if any. */ + memmove (inbuf, inptr, len); + } + + if (n == -1) + { + /* Error while reading. */ + error (0, errno, _("error while reading the input")); + return -1; + } + + /* No more input available, so incomplete multibyte sequences are + not going to get completed at this point. */ + if (len > 0) + { + errno = EINVAL; + report_iconv_error (offset); + return -1; + } - /* Now we have all the input in the buffer. Process it in one run. */ - return process_block (cd, inbuf, actlen, output, output_file); + return flush_state (cd, output, output_file, offset); }