[38/49] analyzer: new file: sm-taint.cc
diff mbox series

Message ID 1573867416-55618-39-git-send-email-dmalcolm@redhat.com
State New
Headers show
Series
  • RFC: Add a static analysis framework to GCC
Related show

Commit Message

David Malcolm Nov. 16, 2019, 1:23 a.m. UTC
This patch adds a state machine checker for tracking "taint",
where data potentially under an attacker's control is used for
things like array indices without sanitization (CWE-129).

This checker isn't ready for production, and is presented as a
proof-of-concept of the sm-based approach.

gcc/ChangeLog:
	* analyzer/sm-taint.cc: New file.
---
 gcc/analyzer/sm-taint.cc | 338 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 338 insertions(+)
 create mode 100644 gcc/analyzer/sm-taint.cc

Comments

Jeff Law Dec. 7, 2019, 3:22 p.m. UTC | #1
On Fri, 2019-11-15 at 20:23 -0500, David Malcolm wrote:
> This patch adds a state machine checker for tracking "taint",
> where data potentially under an attacker's control is used for
> things like array indices without sanitization (CWE-129).
> 
> This checker isn't ready for production, and is presented as a
> proof-of-concept of the sm-based approach.
> 
> gcc/ChangeLog:
> 	* analyzer/sm-taint.cc: New file.
As you know, I'm a big fan of getting some kind of taint analysis into
GCC.  So I'd certainly encourage someone to pick up on this as well as
the more localized work.  I'm particularly interested in things like a
tainted length/count which is then used to calculate how much memory to
allocate in malloc/alloca.  If an attacker can control that they can do
some nasty things.

I do think we're likely to end up with some more traditional warnings
in that space as well.  So figuring out the balance/guidelines for when
to improve the traditional warnings vs those from the static analyzer
will need to be figured out.

jeff
>

Patch
diff mbox series

diff --git a/gcc/analyzer/sm-taint.cc b/gcc/analyzer/sm-taint.cc
new file mode 100644
index 0000000..c664a54
--- /dev/null
+++ b/gcc/analyzer/sm-taint.cc
@@ -0,0 +1,338 @@ 
+/* An experimental state machine, for tracking "taint": unsanitized uses
+   of data potentially under an attacker's control.
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   Contributed by David Malcolm <dmalcolm@redhat.com>.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "gcc-plugin.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tree.h"
+#include "gimple.h"
+#include "diagnostic-path.h"
+#include "diagnostic-metadata.h"
+#include "analyzer/analyzer.h"
+#include "analyzer/pending-diagnostic.h"
+#include "analyzer/sm.h"
+
+namespace {
+
+/* An experimental state machine, for tracking "taint": unsanitized uses
+   of data potentially under an attacker's control.  */
+
+class taint_state_machine : public state_machine
+{
+public:
+  taint_state_machine (logger *logger);
+
+  bool inherited_state_p () const FINAL OVERRIDE { return true; }
+
+  bool on_stmt (sm_context *sm_ctxt,
+		const supernode *node,
+		const gimple *stmt) const FINAL OVERRIDE;
+
+  void on_condition (sm_context *sm_ctxt,
+		     const supernode *node,
+		     const gimple *stmt,
+		     tree lhs,
+		     enum tree_code op,
+		     tree rhs) const FINAL OVERRIDE;
+
+  void on_leak (sm_context *sm_ctxt,
+		const supernode *node,
+		const gimple *stmt,
+		tree var,
+		state_machine::state_t state) const FINAL OVERRIDE;
+  bool can_purge_p (state_t s) const FINAL OVERRIDE;
+
+  /* Start state.  */
+  state_t m_start;
+
+  /* State for a "tainted" value: unsanitized data potentially under an
+     attacker's control.  */
+  state_t m_tainted;
+
+  /* State for a "tainted" value that has a lower bound.  */
+  state_t m_has_lb;
+
+  /* State for a "tainted" value that has an upper bound.  */
+  state_t m_has_ub;
+
+  /* Stop state, for a value we don't want to track any more.  */
+  state_t m_stop;
+};
+
+////////////////////////////////////////////////////////////////////////////
+
+enum bounds
+{
+  BOUNDS_NONE,
+  BOUNDS_UPPER,
+  BOUNDS_LOWER
+};
+
+class tainted_array_index
+  : public pending_diagnostic_subclass<tainted_array_index>
+{
+public:
+  tainted_array_index (const taint_state_machine &sm, tree arg,
+		       enum bounds has_bounds)
+  : m_sm (sm), m_arg (arg), m_has_bounds (has_bounds) {}
+
+  const char *get_kind () const FINAL OVERRIDE { return "tainted_array_index"; }
+
+  bool operator== (const tainted_array_index &other) const
+  {
+    return m_arg == other.m_arg;
+  }
+
+  bool emit (rich_location *rich_loc) FINAL OVERRIDE
+  {
+    diagnostic_metadata m;
+    m.add_cwe (129);
+    switch (m_has_bounds)
+      {
+      default:
+	gcc_unreachable ();
+      case BOUNDS_NONE:
+	return warning_at (rich_loc, m, OPT_Wanalyzer_tainted_array_index,
+			   "use of tainted value %qE in array lookup"
+			   " without bounds checking",
+			   m_arg);
+	break;
+      case BOUNDS_UPPER:
+	return warning_at (rich_loc, m, OPT_Wanalyzer_tainted_array_index,
+			   "use of tainted value %qE in array lookup"
+			   " without lower-bounds checking",
+			   m_arg);
+	break;
+      case BOUNDS_LOWER:
+	return warning_at (rich_loc, m, OPT_Wanalyzer_tainted_array_index,
+			   "use of tainted value %qE in array lookup"
+			   " without upper-bounds checking",
+			   m_arg);
+	break;
+      }
+  }
+
+  label_text describe_state_change (const evdesc::state_change &change)
+    FINAL OVERRIDE
+  {
+    if (change.m_new_state == m_sm.m_tainted)
+      {
+	if (change.m_origin)
+	  return change.formatted_print ("%qE has an unchecked value here"
+					 " (from %qE)",
+					 change.m_expr, change.m_origin);
+	else
+	  return change.formatted_print ("%qE gets an unchecked value here",
+					 change.m_expr);
+      }
+    else if (change.m_new_state == m_sm.m_has_lb)
+      return change.formatted_print ("%qE has its lower bound checked here",
+				     change.m_expr);
+    else if (change.m_new_state == m_sm.m_has_ub)
+      return change.formatted_print ("%qE has its upper bound checked here",
+				     change.m_expr);
+    return label_text ();
+  }
+
+  label_text describe_final_event (const evdesc::final_event &ev) FINAL OVERRIDE
+  {
+    switch (m_has_bounds)
+      {
+      default:
+	gcc_unreachable ();
+      case BOUNDS_NONE:
+	return ev.formatted_print ("use of tainted value %qE in array lookup"
+				   " without bounds checking",
+				   m_arg);
+      case BOUNDS_UPPER:
+	return ev.formatted_print ("use of tainted value %qE in array lookup"
+				   " without lower-bounds checking",
+				   m_arg);
+      case BOUNDS_LOWER:
+	return ev.formatted_print ("use of tainted value %qE in array lookup"
+				   " without upper-bounds checking",
+				   m_arg);
+      }
+  }
+
+private:
+  const taint_state_machine &m_sm;
+  tree m_arg;
+  enum bounds m_has_bounds;
+};
+
+////////////////////////////////////////////////////////////////////////////
+
+/* taint_state_machine's ctor.  */
+
+taint_state_machine::taint_state_machine (logger *logger)
+: state_machine ("taint", logger)
+{
+  m_start = add_state ("start");
+  m_tainted = add_state ("tainted");
+  m_has_lb = add_state ("has_lb");
+  m_has_ub = add_state ("has_ub");
+  m_stop = add_state ("stop");
+}
+
+/* Implementation of state_machine::on_stmt vfunc for taint_state_machine.  */
+
+bool
+taint_state_machine::on_stmt (sm_context *sm_ctxt,
+			       const supernode *node,
+			       const gimple *stmt) const
+{
+  if (const gcall *call = dyn_cast <const gcall *> (stmt))
+    {
+      if (is_named_call_p (call, "fread", 4))
+	{
+	  tree arg = gimple_call_arg (call, 0);
+	  arg = sm_ctxt->get_readable_tree (arg);
+
+	  sm_ctxt->on_transition (node, stmt, arg, m_start, m_tainted);
+
+	  /* Dereference an ADDR_EXPR.  */
+	  // TODO: should the engine do this?
+	  if (TREE_CODE (arg) == ADDR_EXPR)
+	    sm_ctxt->on_transition (node, stmt, TREE_OPERAND (arg, 0),
+				    m_start, m_tainted);
+	  return true;
+	}
+    }
+  // TODO: ...etc; many other sources of untrusted data
+
+  if (const gassign *assign = dyn_cast <const gassign *> (stmt))
+    {
+      tree rhs1 = gimple_assign_rhs1 (assign);
+      enum tree_code op = gimple_assign_rhs_code (assign);
+
+      /* Check array accesses.  */
+      if (op == ARRAY_REF)
+	{
+	  tree arg = TREE_OPERAND (rhs1, 1);
+	  arg = sm_ctxt->get_readable_tree (arg);
+
+	  /* Unsigned types have an implicit lower bound.  */
+	  bool is_unsigned = false;
+	  if (INTEGRAL_TYPE_P (TREE_TYPE (arg)))
+	    is_unsigned = TYPE_UNSIGNED (TREE_TYPE (arg));
+
+	  /* Complain about missing bounds.  */
+	  sm_ctxt->warn_for_state
+	    (node, stmt, arg, m_tainted,
+	     new tainted_array_index (*this, arg,
+				      is_unsigned
+				      ? BOUNDS_LOWER : BOUNDS_NONE));
+	  sm_ctxt->on_transition (node, stmt, arg, m_tainted, m_stop);
+
+	  /* Complain about missing upper bound.  */
+	  sm_ctxt->warn_for_state  (node, stmt, arg, m_has_lb,
+				    new tainted_array_index (*this, arg,
+							     BOUNDS_LOWER));
+	  sm_ctxt->on_transition (node, stmt, arg, m_has_lb, m_stop);
+
+	  /* Complain about missing lower bound.  */
+	  if (!is_unsigned)
+	    {
+	      sm_ctxt->warn_for_state  (node, stmt, arg, m_has_ub,
+					new tainted_array_index (*this, arg,
+								 BOUNDS_UPPER));
+	      sm_ctxt->on_transition (node, stmt, arg, m_has_ub, m_stop);
+	    }
+	}
+    }
+
+  return false;
+}
+
+/* Implementation of state_machine::on_condition vfunc for taint_state_machine.
+   Potentially transition state 'tainted' to 'has_ub' or 'has_lb',
+   and states 'has_ub' and 'has_lb' to 'stop'.  */
+
+void
+taint_state_machine::on_condition (sm_context *sm_ctxt,
+				   const supernode *node,
+				   const gimple *stmt,
+				   tree lhs,
+				   enum tree_code op,
+				   tree rhs ATTRIBUTE_UNUSED) const
+{
+  if (stmt == NULL)
+    return;
+
+  // TODO: this doesn't use the RHS; should we make it symmetric?
+
+  // TODO
+  switch (op)
+    {
+      //case NE_EXPR:
+      //case EQ_EXPR:
+    case GE_EXPR:
+    case GT_EXPR:
+      {
+	sm_ctxt->on_transition (node, stmt, lhs, m_tainted,
+				m_has_lb);
+	sm_ctxt->on_transition (node, stmt, lhs, m_has_ub,
+				m_stop);
+      }
+      break;
+    case LE_EXPR:
+    case LT_EXPR:
+      {
+	sm_ctxt->on_transition (node, stmt, lhs, m_tainted,
+				m_has_ub);
+	sm_ctxt->on_transition (node, stmt, lhs, m_has_lb,
+				m_stop);
+      }
+      break;
+    default:
+      break;
+    }
+}
+
+void
+taint_state_machine::on_leak (sm_context *sm_ctxt ATTRIBUTE_UNUSED,
+			      const supernode *node ATTRIBUTE_UNUSED,
+			      const gimple *stmt ATTRIBUTE_UNUSED,
+			      tree var ATTRIBUTE_UNUSED,
+			      state_machine::state_t state ATTRIBUTE_UNUSED)
+  const
+{
+  /* Empty.  */
+}
+
+bool
+taint_state_machine::can_purge_p (state_t s ATTRIBUTE_UNUSED) const
+{
+  return true;
+}
+
+} // anonymous namespace
+
+/* Internal interface to this file. */
+
+state_machine *
+make_taint_state_machine (logger *logger)
+{
+  return new taint_state_machine (logger);
+}