diff mbox

[1/2] iconv_prog: Don't slurp whole input at once [BZ #6050]

Message ID 1439158464-18443-2-git-send-email-ben@0x539.de
State New
Headers show

Commit Message

Benjamin Herr Aug. 9, 2015, 10:14 p.m. UTC
As described in the bug report comments, contrary to the comment in
'process_fd' , iconv can deal with being passed invalid multibyte
sequences, as it leaves the input pointer at the start of the sequence
and we can just call it again once more bytes are availabe.

Therefore we do not need to read the whole input at once when reading
from an fd, and can process it in chunks instead. This enables the iconv
program to be used in pipelines sensibly, and to not choke on very large
input files.

To still support reporting the position of invalid sequences in the
input, we count consumed input bytes explicitly. As overflow of that
counter is now possible, we use saturating addition and just mention in
the error message when (size_t) -1 has been reached.
---
2015-08-08  Benjamin Herr  <ben@0x539.de>

	[BZ #6050]
	* iconv/iconv_prog.c: Don't slurp the whole input at once, instead
	pass chunks to iconv and carry over incomplete multibyte sequences.


 iconv/iconv_prog.c | 247 +++++++++++++++++++++++++++++------------------------
 1 file changed, 135 insertions(+), 112 deletions(-)

Comments

Andreas Schwab Aug. 9, 2015, 10:31 p.m. UTC | #1
Benjamin Herr <ben@0x539.de> writes:

> +static int

s/int/size_t/

> +saturating_add (size_t a, size_t b)
> +{
> +  if ((size_t) -1 - a > b)
> +    return a + b;
> +  else
> +    return -1;
> +}

Andreas.
Rich Felker Aug. 9, 2015, 10:46 p.m. UTC | #2
On Mon, Aug 10, 2015 at 12:14:23AM +0200, Benjamin Herr wrote:
> To still support reporting the position of invalid sequences in the
> input, we count consumed input bytes explicitly. As overflow of that
> counter is now possible, we use saturating addition and just mention in
> the error message when (size_t) -1 has been reached.

Why not use uint64_t? Then it's reasonable to assume overflow does not
happen -- it would take something like a century to hit -- and you
don't underreport in the case of huge files that fit in off_t but not
size_t.

Rich
Joseph Myers Aug. 10, 2015, 2:29 p.m. UTC | #3
On Sun, 9 Aug 2015, Rich Felker wrote:

> On Mon, Aug 10, 2015 at 12:14:23AM +0200, Benjamin Herr wrote:
> > To still support reporting the position of invalid sequences in the
> > input, we count consumed input bytes explicitly. As overflow of that
> > counter is now possible, we use saturating addition and just mention in
> > the error message when (size_t) -1 has been reached.
> 
> Why not use uint64_t? Then it's reasonable to assume overflow does not
> happen -- it would take something like a century to hit -- and you
> don't underreport in the case of huge files that fit in off_t but not
> size_t.

off64_t seems logically right for file offsets (with casts to intmax_t for 
printing).  But right now there are several places in iconv_prog.c that 
use non-large-file interfaces for I/O (including trying to mmap the whole 
file into memory without regard to st.st_size - from fstat64 - possibly 
overflowing size_t).  I think large file issues can reasonably be 
considered a separate bug (filed in Bugzilla and fixed separately).
diff mbox

Patch

diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c
index e249bce..a935402 100644
--- a/iconv/iconv_prog.c
+++ b/iconv/iconv_prog.c
@@ -465,29 +465,93 @@  conversion stopped due to problem in writing the output"));
   return 0;
 }
 
+static void
+report_iconv_error (size_t position)
+{
+  switch (errno)
+    {
+    case EILSEQ:
+      if (position < (size_t) -1)
+	error (0, 0,
+	       _("illegal input sequence at position %zu"), position);
+      else
+	error (0, 0,
+	       _("illegal input sequence past position %zu"),
+	       position - 1);
+      break;
+    case EINVAL:
+      error (0, 0, _("\
+incomplete character or shift sequence at end of buffer"));
+      break;
+    case EBADF:
+      error (0, 0, _("internal error (illegal descriptor)"));
+      break;
+    default:
+      error (0, 0, _("unknown iconv() error %d"), errno);
+      break;
+    }
+}
 
+#define BUF_SIZE	32768
 static int
-process_block (iconv_t cd, char *addr, size_t len, FILE **output,
-	       const char *output_file)
+flush_state (iconv_t cd, FILE **output, const char *output_file, size_t offset)
 {
-#define OUTBUF_SIZE	32768
-  const char *start = addr;
-  char outbuf[OUTBUF_SIZE];
+  char outbuf[BUF_SIZE];
+  char *outptr;
+  size_t outlen;
+  size_t n;
+
+  /* All the input test is processed.  For state-dependent
+     character sets we have to flush the state now.  */
+  outptr = outbuf;
+  outlen = BUF_SIZE;
+  n = iconv (cd, NULL, NULL, &outptr, &outlen);
+
+  if (outptr != outbuf)
+    {
+      if (write_output (outbuf, outptr, output, output_file) == -1)
+	return -1;
+    }
+
+  if (n == (size_t) -1)
+    {
+      report_iconv_error (offset);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+saturating_add (size_t a, size_t b)
+{
+  if ((size_t) -1 - a > b)
+    return a + b;
+  else
+    return -1;
+}
+
+static int
+process_part (iconv_t cd, char **addr, size_t *len, size_t offset,
+	      FILE **output, const char *output_file)
+{
+  const char *start = *addr;
+  char outbuf[BUF_SIZE];
   char *outptr;
   size_t outlen;
   size_t n;
   int ret = 0;
 
-  while (len > 0)
+  while (*len > 0)
     {
       outptr = outbuf;
-      outlen = OUTBUF_SIZE;
-      n = iconv (cd, &addr, &len, &outptr, &outlen);
+      outlen = BUF_SIZE;
+      n = iconv (cd, addr, len, &outptr, &outlen);
 
       if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
 	{
 	  ret = 1;
-	  if (len == 0)
+	  if (*len == 0)
 	    n = 0;
 	  else
 	    errno = E2BIG;
@@ -500,52 +564,18 @@  process_block (iconv_t cd, char *addr, size_t len, FILE **output,
 	    break;
 	}
 
-      if (n != (size_t) -1)
+      /* Incomplete multibyte characters might be completed by the next
+         chunk, so do not treat them as an error here. */
+      if (n != (size_t) -1 || errno == EINVAL)
 	{
-	  /* All the input test is processed.  For state-dependent
-	     character sets we have to flush the state now.  */
-	  outptr = outbuf;
-	  outlen = OUTBUF_SIZE;
-	  n = iconv (cd, NULL, NULL, &outptr, &outlen);
-
-	  if (outptr != outbuf)
-	    {
-	      ret = write_output (outbuf, outptr, output, output_file);
-	      if (ret != 0)
-		break;
-	    }
-
-	  if (n != (size_t) -1)
-	    break;
-
-	  if (omit_invalid && errno == EILSEQ)
-	    {
-	      ret = 1;
-	      break;
-	    }
+	  ret = 0;
+	  break;
 	}
 
       if (errno != E2BIG)
 	{
 	  /* iconv() ran into a problem.  */
-	  switch (errno)
-	    {
-	    case EILSEQ:
-	      if (! omit_invalid)
-		error (0, 0, _("illegal input sequence at position %ld"),
-		       (long int) (addr - start));
-	      break;
-	    case EINVAL:
-	      error (0, 0, _("\
-incomplete character or shift sequence at end of buffer"));
-	      break;
-	    case EBADF:
-	      error (0, 0, _("internal error (illegal descriptor)"));
-	      break;
-	    default:
-	      error (0, 0, _("unknown iconv() error %d"), errno);
-	      break;
-	    }
+          report_iconv_error (saturating_add (offset, (*addr - start)));
 
 	  return -1;
 	}
@@ -554,83 +584,76 @@  incomplete character or shift sequence at end of buffer"));
   return ret;
 }
 
-
 static int
-process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
+process_block (iconv_t cd, char *addr, size_t len, FILE **output,
+	       const char *output_file)
 {
-  /* we have a problem with reading from a desriptor since we must not
-     provide the iconv() function an incomplete character or shift
-     sequence at the end of the buffer.  Since we have to deal with
-     arbitrary encodings we must read the whole text in a buffer and
-     process it in one step.  */
-  static char *inbuf = NULL;
-  static size_t maxlen = 0;
-  char *inptr = NULL;
-  size_t actlen = 0;
-
-  while (actlen < maxlen)
-    {
-      ssize_t n = read (fd, inptr, maxlen - actlen);
+  char *start = addr;
 
-      if (n == 0)
-	/* No more text to read.  */
-	break;
+  /* Process everything in one go.  */
+  int ret = process_part (cd, &addr, &len, 0, output, output_file);
 
-      if (n == -1)
-	{
-	  /* Error while reading.  */
-	  error (0, errno, _("error while reading the input"));
-	  return -1;
-	}
+  if (ret != 0)
+    return ret;
 
-      inptr += n;
-      actlen += n;
+  /* If there is input left over, there is an incomplete multibyte
+     sequence at the end.  */
+  if (len > 0)
+    {
+      errno = EINVAL;
+      report_iconv_error (addr - start);
+      return -1;
     }
 
-  if (actlen == maxlen)
-    while (1)
-      {
-	ssize_t n;
-	char *new_inbuf;
+  return flush_state (cd, output, output_file, addr - start);
+}
 
-	/* Increase the buffer.  */
-	new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
-	if (new_inbuf == NULL)
-	  {
-	    error (0, errno, _("unable to allocate buffer for input"));
-	    return -1;
-	  }
-	inbuf = new_inbuf;
-	maxlen += 32768;
-	inptr = inbuf + actlen;
 
-	do
-	  {
-	    n = read (fd, inptr, maxlen - actlen);
+static int
+process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
+{
+  char inbuf[BUF_SIZE];
+  size_t len = 0;
+  size_t offset = 0;
+  ssize_t n;
 
-	    if (n == 0)
-	      /* No more text to read.  */
-	      break;
+  /* Read into the buffer past unconsumed bytes from the last iteration.  */
+  while ((n = read (fd, inbuf + len, BUF_SIZE - len)) > 0)
+    {
+      int ret;
+      char *inptr;
 
-	    if (n == -1)
-	      {
-		/* Error while reading.  */
-		error (0, errno, _("error while reading the input"));
-		return -1;
-	      }
+      len += n;
 
-	    inptr += n;
-	    actlen += n;
-	  }
-	while (actlen < maxlen);
+      inptr = inbuf;
+      /* Process what we have read.  */
+      ret = process_part (cd, &inptr, &len, offset, output, output_file);
+      if (ret != 0)
+	return ret;
+      /* Keep track of overall position in the input for error reporting.  */
+      offset = saturating_add (offset, inptr - inbuf);
 
-	if (n == 0)
-	  /* Break again so we leave both loops.  */
-	  break;
-      }
+      /* Keep incomplete multibyte characters, if any.  */
+      memmove (inbuf, inptr, len);
+    }
+
+  if (n == -1)
+    {
+      /* Error while reading.  */
+      error (0, errno, _("error while reading the input"));
+      return -1;
+    }
+
+  /* No more input available, so incomplete multibyte sequences are
+     not going to get completed at this point. */
+  if (len > 0)
+    {
+      errno = EINVAL;
+      report_iconv_error (offset);
+      return -1;
+    }
 
-  /* Now we have all the input in the buffer.  Process it in one run.  */
-  return process_block (cd, inbuf, actlen, output, output_file);
+  return flush_state (cd, output, output_file, offset);
 }