diff mbox series

[2/2] Capture locations of bidi chars and underline ranges

Message ID 20211102205801.1202228-3-dmalcolm@redhat.com
State New
Headers show
Series Re: [PATCH] libcpp: Implement -Wbidirectional for CVE-2021-42574 [PR103026] | expand

Commit Message

David Malcolm Nov. 2, 2021, 8:58 p.m. UTC
This patch converts the bidi::vec to use a struct so that we can
capture location_t values for the bidirectional control characters,
and uses these to label sources ranges in the diagnostics.

The effect on the output can be seen in the new testcase.

gcc/testsuite/ChangeLog:
	* c-c++-common/Wbidirectional-ranges.c: New test.

libcpp/ChangeLog:
	* lex.c (struct bidi::context): New.
	(bidi::vec): Convert to a vec of context rather than unsigned char.
	(bidi::current_ctx): Update for above change.
	(bidi::current_ctx_ucn_p): Likewise.
	(bidi::current_ctx_loc): New.
	(bidi::on_char): Update for usage of context struct.  Add "loc"
	param and pass it when pushing contexts.
	(get_location_for_byte_range_in_cur_line): New.
	(get_bidi_utf8): Rename to...
	(get_bidi_utf8_1): ...this, reintroducing...
	(get_bidi_utf8): ...as a wrapper, setting *OUT when the result is
	not NONE.
	(get_bidi_ucn): Rename to...
	(get_bidi_ucn_1): ...this, reintroducing...
	(get_bidi_ucn): ...as a wrapper, setting *OUT when the result is
	not NONE.
	(class unpaired_bidi_rich_location): New.
	(maybe_warn_bidi_on_close): Use unpaired_bidi_rich_location when
	reporting on unpaired bidi chars.  Split into singular vs plural
	spellings.
	(maybe_warn_bidi_on_char): Pass in a location_t rather than a
	const uchar * and use it when emitting warnings, and when calling
	bidi::on_char.
	(_cpp_skip_block_comment): Capture location when kind is not NONE
	and pass it to maybe_warn_bidi_on_char.
	(skip_line_comment): Likewise.
	(forms_identifier_p): Likewise.
	(lex_raw_string): Likewise.
	(lex_string): Likewise.

Signed-off-by: David Malcolm <dmalcolm@redhat.com>
---
 .../c-c++-common/Wbidirectional-ranges.c      |  54 ++++
 libcpp/lex.c                                  | 241 ++++++++++++++----
 2 files changed, 252 insertions(+), 43 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/Wbidirectional-ranges.c
diff mbox series

Patch

diff --git a/gcc/testsuite/c-c++-common/Wbidirectional-ranges.c b/gcc/testsuite/c-c++-common/Wbidirectional-ranges.c
new file mode 100644
index 00000000000..a41ae47dc30
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/Wbidirectional-ranges.c
@@ -0,0 +1,54 @@ 
+/* PR preprocessor/103026 */
+/* { dg-do compile } */
+/* { dg-options "-Wbidirectional=unpaired -fdiagnostics-show-caret" } */
+/* Verify that we escape and underline pertinent bidirectional characters
+   when quoting the source.  */
+
+int test_unpaired_bidi () {
+    int isAdmin = 0;
+    /*‮ } ⁦if (isAdmin)⁩ ⁦ begin admins only */
+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
+#if 0
+   { dg-begin-multiline-output "" }
+     /*<U+202E> } <U+2066>if (isAdmin)<U+2069> <U+2066> begin admins only */
+       ~~~~~~~~                                ~~~~~~~~                    ^
+       |                                       |                           |
+       |                                       |                           end of bidirectional context
+       U+202E (RIGHT-TO-LEFT OVERRIDE)         U+2066 (LEFT-TO-RIGHT ISOLATE)
+   { dg-end-multiline-output "" }
+#endif
+
+        __builtin_printf("You are an admin.\n");
+    /* end admins only ‮ { ⁦*/
+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
+#if 0
+   { dg-begin-multiline-output "" }
+     /* end admins only <U+202E> { <U+2066>*/
+                        ~~~~~~~~   ~~~~~~~~ ^
+                        |          |        |
+                        |          |        end of bidirectional context
+                        |          U+2066 (LEFT-TO-RIGHT ISOLATE)
+                        U+202E (RIGHT-TO-LEFT OVERRIDE)
+   { dg-end-multiline-output "" }
+#endif
+
+    return 0;
+}
+
+int LRE_‪_PDF_\u202c;
+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
+#if 0
+   { dg-begin-multiline-output "" }
+ int LRE_<U+202A>_PDF_\u202c;
+         ~~~~~~~~     ^~~~~~
+   { dg-end-multiline-output "" }
+#endif
+
+const char *s1 = "LRE_‪_PDF_\u202c";
+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
+#if 0
+   { dg-begin-multiline-output "" }
+ const char *s1 = "LRE_<U+202A>_PDF_\u202c";
+                       ~~~~~~~~     ^~~~~~
+   { dg-end-multiline-output "" }
+#endif
diff --git a/libcpp/lex.c b/libcpp/lex.c
index 88aba307991..9e5531fb125 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -1172,11 +1172,34 @@  namespace bidi {
   /* All the UTF-8 encodings of bidi characters start with E2.  */
   constexpr uchar utf8_start = 0xe2;
 
+  struct context
+  {
+    context () {}
+    context (location_t loc, kind k, bool pdf, bool ucn)
+    : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
+    {
+    }
+
+    kind get_pop_kind () const
+    {
+      return m_pdf ? kind::PDF : kind::PDI;
+    }
+    bool ucn_p () const
+    {
+      return m_ucn;
+    }
+
+    location_t m_loc;
+    kind m_kind;
+    unsigned m_pdf : 1;
+    unsigned m_ucn : 1;
+  };
+
   /* A vector holding currently open bidi contexts.  We use a char for
      each context, its LSB is 1 if it represents a PDF context, 0 if it
      represents a PDI context.  The next bit is 1 if this context was open
      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
-  semi_embedded_vec <unsigned char, 16> vec;
+  semi_embedded_vec <context, 16> vec;
 
   /* Close the whole comment/identifier/string literal/character constant
      context.  */
@@ -1199,7 +1222,7 @@  namespace bidi {
     unsigned int len = vec.count ();
     if (len == 0)
       return kind::NONE;
-    return (vec[len - 1] & 1) ? kind::PDF : kind::PDI;
+    return vec[len - 1].get_pop_kind ();
   }
 
   /* Return true if the current context comes from a UCN origin, that is,
@@ -1208,11 +1231,19 @@  namespace bidi {
   {
     unsigned int len = vec.count ();
     gcc_checking_assert (len > 0);
-    return (vec[len - 1] >> 1) & 1;
+    return vec[len - 1].m_ucn;
   }
 
-  /* We've read a bidi char, update the current vector as necessary.  */
-  void on_char (kind k, bool ucn_p)
+  location_t current_ctx_loc ()
+  {
+    unsigned int len = vec.count ();
+    gcc_checking_assert (len > 0);
+    return vec[len - 1].m_loc;
+  }
+
+  /* We've read a bidi char, update the current vector as necessary.
+     LOC is only valid when K is not kind::NONE.  */
+  void on_char (kind k, bool ucn_p, location_t loc)
   {
     switch (k)
       {
@@ -1220,12 +1251,12 @@  namespace bidi {
       case kind::RLE:
       case kind::LRO:
       case kind::RLO:
-	vec.push (ucn_p ? 3u : 1u);
+	vec.push (context (loc, k, true, ucn_p));
 	break;
       case kind::LRI:
       case kind::RLI:
       case kind::FSI:
-	vec.push (ucn_p ? 2u : 0u);
+	vec.push (context (loc, k, false, ucn_p));
 	break;
       case kind::PDF:
 	if (current_ctx () == kind::PDF)
@@ -1271,10 +1302,47 @@  namespace bidi {
   }
 }
 
+/* Get location_t for the range of bytes [START, START + NUM_BYTES)
+   within the current line in FILE, with the caret at START.  */
+
+static location_t
+get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
+					 const unsigned char *const start,
+					 size_t num_bytes)
+{
+  gcc_checking_assert (num_bytes > 0);
+
+  /* CPP_BUF_COLUMN and linemap_position_for_column both refer
+     to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
+     whereas linemap_position_for_column is 1-based.  */
+
+  /* Get 0-based offsets within the line.  */
+  size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
+  size_t end_offset = start_offset + num_bytes - 1;
+
+  /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
+  location_t start_loc = linemap_position_for_column (pfile->line_table,
+						      start_offset + 1);
+  location_t end_loc = linemap_position_for_column (pfile->line_table,
+						     end_offset + 1);
+
+  if (start_loc == end_loc)
+    return start_loc;
+
+  source_range src_range;
+  src_range.m_start = start_loc;
+  src_range.m_finish = end_loc;
+  location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
+						   start_loc,
+						   src_range,
+						   NULL);
+  return combined_loc;
+}
+
 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
 
 static bidi::kind
-get_bidi_utf8 (const unsigned char *const p)
+get_bidi_utf8_1 (const unsigned char *const p)
 {
   gcc_checking_assert (p[0] == bidi::utf8_start);
 
@@ -1312,10 +1380,25 @@  get_bidi_utf8 (const unsigned char *const p)
   return bidi::kind::NONE;
 }
 
+/* Parse a sequence of 3 bytes starting with P and return its bidi code.
+   If the kind is not NONE, write the location to *OUT.*/
+
+static bidi::kind
+get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
+{
+  bidi::kind result = get_bidi_utf8_1 (p);
+  if (result != bidi::kind::NONE)
+    {
+      /* We have a sequence of 3 bytes starting at P.  */
+      *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
+    }
+  return result;
+}
+
 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
 
 static bidi::kind
-get_bidi_ucn (const unsigned char *p, bool is_U)
+get_bidi_ucn_1 (const unsigned char *p, bool is_U)
 {
   /* 6.4.3 Universal Character Names
       \u hex-quad
@@ -1372,6 +1455,62 @@  get_bidi_ucn (const unsigned char *p, bool is_U)
   return bidi::kind::NONE;
 }
 
+/* Parse a UCN where P points just past \u or \U and return its bidi code.
+   If the kind is not NONE, write the location to *OUT.*/
+
+static bidi::kind
+get_bidi_ucn (cpp_reader *pfile,  const unsigned char *p, bool is_U,
+	      location_t *out)
+{
+  bidi::kind result = get_bidi_ucn_1 (p, is_U);
+  if (result != bidi::kind::NONE)
+    {
+      const unsigned char *start = p - 2;
+      size_t num_bytes = 2 + (is_U ? 8 : 4);
+      *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
+    }
+  return result;
+}
+
+/* Subclass of rich_location for reporting on unpaired UTF-8
+   bidirectional character(s).
+   Escape the source lines on output, and show all unclosed
+   bidi context, labelling everything.  */
+
+class unpaired_bidi_rich_location : public rich_location
+{
+ public:
+  class custom_range_label : public range_label
+  {
+   public:
+     label_text get_text (unsigned range_idx) const FINAL OVERRIDE
+     {
+       /* range 0 is the primary location; each subsequent range i + 1
+	  is for bidi::vec[i].  */
+       if (range_idx > 0)
+	 {
+	   const bidi::context &ctxt (bidi::vec[range_idx - 1]);
+	   return label_text::borrow (bidi::to_str (ctxt.m_kind));
+	 }
+       else
+	 return label_text::borrow (_("end of bidirectional context"));
+     }
+  };
+
+  unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
+  : rich_location (pfile->line_table, loc, &m_custom_label)
+  {
+    set_escape_on_output (true);
+    for (unsigned i = 0; i < bidi::vec.count (); i++)
+      add_range (bidi::vec[i].m_loc,
+		 SHOW_RANGE_WITHOUT_CARET,
+		 &m_custom_label);
+  }
+
+ private:
+   custom_range_label m_custom_label;
+};
+
 /* We're closing a bidi context, that is, we've encountered a newline,
    are closing a C-style comment, or are at the end of a string literal,
    character constant, or identifier.  Warn if this context was not
@@ -1387,11 +1526,17 @@  maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
       const location_t loc
 	= linemap_position_for_column (pfile->line_table,
 				       CPP_BUF_COLUMN (pfile->buffer, p));
-      rich_location rich_loc (pfile->line_table, loc);
-      rich_loc.set_escape_on_output (true);
-      cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
-		      "unpaired UTF-8 bidirectional character "
-		      "detected");
+      unpaired_bidi_rich_location rich_loc (pfile, loc);
+      /* cpp_callbacks doesn't yet have a way to handle singular vs plural
+	 forms of a diagnostic, so fake it for now.  */
+      if (bidi::vec.count () > 1)
+	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
+			"unpaired UTF-8 bidirectional characters "
+			"detected");
+      else
+	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
+			"unpaired UTF-8 bidirectional character "
+			"detected");
     }
   /* We're done with this context.  */
   bidi::on_close ();
@@ -1399,12 +1544,13 @@  maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
 
 /* We're at the beginning or in the middle of an identifier/comment/string
    literal/character constant.  Warn if we've encountered a bidi character.
-   KIND says which bidi character it was; P points to it in the character
-   stream.  UCN_P is true iff this bidi character was written as a UCN.  */
+   KIND says which bidi character it was; UCN_P is true iff this bidi
+   character was written as a UCN.  LOC is the location of the character,
+   but is only valid if KIND != bidi::kind::NONE.  */
 
 static void
-maybe_warn_bidi_on_char (cpp_reader *pfile, const uchar *p, bidi::kind kind,
-			 bool ucn_p)
+maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
+			 bool ucn_p, location_t loc)
 {
   if (__builtin_expect (kind == bidi::kind::NONE, 1))
     return;
@@ -1413,9 +1559,6 @@  maybe_warn_bidi_on_char (cpp_reader *pfile, const uchar *p, bidi::kind kind,
 
   if (warn_bidi != bidirectional_none)
     {
-      const location_t loc
-	= linemap_position_for_column (pfile->line_table,
-				       CPP_BUF_COLUMN (pfile->buffer, p));
       rich_location rich_loc (pfile->line_table, loc);
       rich_loc.set_escape_on_output (true);
 
@@ -1427,9 +1570,12 @@  maybe_warn_bidi_on_char (cpp_reader *pfile, const uchar *p, bidi::kind kind,
 	{
 	  if (warn_bidi == bidirectional_unpaired
 	      && bidi::current_ctx_ucn_p () != ucn_p)
-	    cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
-			    "UTF-8 vs UCN mismatch when closing "
-			    "a context by \"%s\"", bidi::to_str (kind));
+	    {
+	      rich_loc.add_range (bidi::current_ctx_loc ());
+	      cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
+			      "UTF-8 vs UCN mismatch when closing "
+			      "a context by \"%s\"", bidi::to_str (kind));
+	    }
 	}
       else if (warn_bidi == bidirectional_any)
 	{
@@ -1444,7 +1590,7 @@  maybe_warn_bidi_on_char (cpp_reader *pfile, const uchar *p, bidi::kind kind,
 	}
     }
   /* We're done with this context.  */
-  bidi::on_char (kind, ucn_p);
+  bidi::on_char (kind, ucn_p, loc);
 }
 
 /* Skip a C-style block comment.  We find the end of the comment by
@@ -1512,8 +1658,9 @@  _cpp_skip_block_comment (cpp_reader *pfile)
 	 a bidirectional character.  */
       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
 	{
-	  bidi::kind kind = get_bidi_utf8 (cur - 1);
-	  maybe_warn_bidi_on_char (pfile, cur, kind, /*ucn_p=*/false);
+	  location_t loc;
+	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
+	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
 	}
     }
 
@@ -1547,9 +1694,9 @@  skip_line_comment (cpp_reader *pfile)
 	    {
 	      if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
 		{
-		  bidi::kind kind = get_bidi_utf8 (buffer->cur);
-		  maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
-					   /*ucn_p=*/false);
+		  location_t loc;
+		  bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
+		  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
 		}
 	      buffer->cur++;
 	    }
@@ -1699,9 +1846,9 @@  forms_identifier_p (cpp_reader *pfile, int first,
 	  if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
 	      && warn_bidi_p)
 	    {
-	      bidi::kind kind = get_bidi_utf8 (buffer->cur);
-	      maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
-				       /*ucn_p=*/false);
+	      location_t loc;
+	      bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
+	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
 	    }
 	  if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 			       state, &s))
@@ -1713,10 +1860,12 @@  forms_identifier_p (cpp_reader *pfile, int first,
 	  buffer->cur += 2;
 	  if (warn_bidi_p)
 	    {
-	      bidi::kind kind = get_bidi_ucn (buffer->cur,
-					      buffer->cur[-1] == 'U');
-	      maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
-				       /*ucn_p=*/true);
+	      location_t loc;
+	      bidi::kind kind = get_bidi_ucn (pfile,
+					      buffer->cur,
+					      buffer->cur[-1] == 'U',
+					      &loc);
+	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
 	    }
 	  if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 			      state, &s, NULL, NULL))
@@ -2341,8 +2490,11 @@  lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 	}
       else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
 	       && warn_bidi_p)
-	maybe_warn_bidi_on_char (pfile, pos - 1, get_bidi_utf8 (pos - 1),
-				 /*ucn_p=*/false);
+	{
+	  location_t loc;
+	  bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
+	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
+	}
     }
 
   if (warn_bidi_p)
@@ -2453,8 +2605,10 @@  lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 	{
 	  if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
 	    {
-	      bidi::kind kind = get_bidi_ucn (cur + 1, cur[0] == 'U');
-	      maybe_warn_bidi_on_char (pfile, cur, kind, /*ucn_p=*/true);
+	      location_t loc;
+	      bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
+					      &loc);
+	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
 	    }
 	  cur++;
 	}
@@ -2482,8 +2636,9 @@  lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 	saw_NUL = true;
       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
 	{
-	  bidi::kind kind = get_bidi_utf8 (cur - 1);
-	  maybe_warn_bidi_on_char (pfile, cur - 1, kind, /*ucn_p=*/false);
+	  location_t loc;
+	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
+	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
 	}
     }