diff mbox

[libstdc++/77356] Support escape in regex bracket expression

Message ID CAG4ZjN=y6agzBoaZfVNq_k52FR=uNzRp0Q0SSEB2zr9VpN+fXw@mail.gmail.com
State New
Headers show

Commit Message

Tim Shen Aug. 24, 2016, 7:48 p.m. UTC
On Wed, Aug 24, 2016 at 1:41 AM, Jonathan Wakely <jwakely@redhat.com> wrote:
> On 24/08/16 00:18 -0700, Tim Shen wrote:
>>
>> I didn't realized that we can actually escape a dash inside bracket
>> expression: R"([\-])", in which case the dash should be treated
>> literally.
>
>
> With this patch applied I no longer get a match for:
>
>  regex_match("-", std::regex{"[a-]", std::regex_constants::basic})

Nice catch. I'm surprised that there is no test for it. Added one.

> I wonder if we want to give _S_token_unknown an initializer with a
> specific value, so it doesn't change whenever we add new tokens.
>
> If we use -1 it would change the underlying type of
> _ScannerBase::_TokenT from unsigned int to signed int, so we should
> give it a fixed type too:
>
>  struct _ScannerBase
>  {
>  public:
>    /// Token types returned from the scanner.
>    enum _TokenT : unsigned
>    {
>      ...
>      _S_token_unknown = -1u

Done.

Comments

Jonathan Wakely Aug. 26, 2016, 10:09 a.m. UTC | #1
On 24/08/16 12:48 -0700, Tim Shen wrote:
>On Wed, Aug 24, 2016 at 1:41 AM, Jonathan Wakely <jwakely@redhat.com> wrote:
>> On 24/08/16 00:18 -0700, Tim Shen wrote:
>>>
>>> I didn't realized that we can actually escape a dash inside bracket
>>> expression: R"([\-])", in which case the dash should be treated
>>> literally.
>>
>>
>> With this patch applied I no longer get a match for:
>>
>>  regex_match("-", std::regex{"[a-]", std::regex_constants::basic})
>
>Nice catch. I'm surprised that there is no test for it. Added one.
>
>> I wonder if we want to give _S_token_unknown an initializer with a
>> specific value, so it doesn't change whenever we add new tokens.
>>
>> If we use -1 it would change the underlying type of
>> _ScannerBase::_TokenT from unsigned int to signed int, so we should
>> give it a fixed type too:
>>
>>  struct _ScannerBase
>>  {
>>  public:
>>    /// Token types returned from the scanner.
>>    enum _TokenT : unsigned
>>    {
>>      ...
>>      _S_token_unknown = -1u
>
>Done.

OK for trunk, thanks.
Tim Shen Aug. 27, 2016, 2:04 a.m. UTC | #2
On Fri, Aug 26, 2016 at 3:09 AM, Jonathan Wakely <jwakely@redhat.com> wrote:
> OK for trunk, thanks.

Committed as r239794.

Thanks!
diff mbox

Patch

diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc
index ff69e16..ef6ebdd 100644
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -426,13 +426,21 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
       pair<bool, _CharT> __last_char; // Optional<_CharT>
       __last_char.first = false;
       if (!(_M_flags & regex_constants::ECMAScript))
-	if (_M_try_char())
-	  {
-	    __matcher._M_add_char(_M_value[0]);
-	    __last_char.first = true;
-	    __last_char.second = _M_value[0];
-	  }
+	{
+	  if (_M_try_char())
+	    {
+	      __last_char.first = true;
+	      __last_char.second = _M_value[0];
+	    }
+	  else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
+	    {
+	      __last_char.first = true;
+	      __last_char.second = '-';
+	    }
+	}
       while (_M_expression_term(__last_char, __matcher));
+      if (__last_char.first)
+	__matcher._M_add_char(__last_char.second);
       __matcher._M_ready();
       _M_stack.push(_StateSeqT(
 		      *_M_nfa,
@@ -449,19 +457,43 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
       if (_M_match_token(_ScannerT::_S_token_bracket_end))
 	return false;
 
+      const auto __push_char = [&](_CharT __ch)
+      {
+	if (__last_char.first)
+	  __matcher._M_add_char(__last_char.second);
+	else
+	  __last_char.first = true;
+	__last_char.second = __ch;
+      };
+      const auto __flush = [&]
+      {
+	if (__last_char.first)
+	  {
+	    __matcher._M_add_char(__last_char.second);
+	    __last_char.first = false;
+	  }
+      };
+
       if (_M_match_token(_ScannerT::_S_token_collsymbol))
 	{
 	  auto __symbol = __matcher._M_add_collate_element(_M_value);
 	  if (__symbol.size() == 1)
-	    {
-	      __last_char.first = true;
-	      __last_char.second = __symbol[0];
-	    }
+	    __push_char(__symbol[0]);
+	  else
+	    __flush();
 	}
       else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
-	__matcher._M_add_equivalence_class(_M_value);
+	{
+	  __flush();
+	  __matcher._M_add_equivalence_class(_M_value);
+	}
       else if (_M_match_token(_ScannerT::_S_token_char_class_name))
-	__matcher._M_add_character_class(_M_value, false);
+	{
+	  __flush();
+	  __matcher._M_add_character_class(_M_value, false);
+	}
+      else if (_M_try_char())
+	__push_char(_M_value[0]);
       // POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
       // except when the '-' is the first or last character in the bracket
       // expression ([--0]). ECMAScript treats all '-' after a range as a
@@ -472,55 +504,55 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
       // Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
       //
       // It turns out that no one reads BNFs ;)
-      else if (_M_try_char())
+      else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
 	{
 	  if (!__last_char.first)
 	    {
-	      __matcher._M_add_char(_M_value[0]);
-	      if (_M_value[0] == '-'
-		  && !(_M_flags & regex_constants::ECMAScript))
+	      if (!(_M_flags & regex_constants::ECMAScript))
 		{
 		  if (_M_match_token(_ScannerT::_S_token_bracket_end))
-		    return false;
+		    {
+		      __push_char('-');
+		      return false;
+		    }
 		  __throw_regex_error(
 		    regex_constants::error_range,
 		    "Unexpected dash in bracket expression. For POSIX syntax, "
 		    "a dash is not treated literally only when it is at "
 		    "beginning or end.");
 		}
-	      __last_char.first = true;
-	      __last_char.second = _M_value[0];
+	      __push_char('-');
 	    }
 	  else
 	    {
-	      if (_M_value[0] == '-')
+	      if (_M_try_char())
 		{
-		  if (_M_try_char())
-		    {
-		      __matcher._M_make_range(__last_char.second , _M_value[0]);
-		      __last_char.first = false;
-		    }
-		  else
-		    {
-		      if (_M_scanner._M_get_token()
-			  != _ScannerT::_S_token_bracket_end)
-			__throw_regex_error(
-			  regex_constants::error_range,
-			  "Unexpected end of bracket expression.");
-		      __matcher._M_add_char(_M_value[0]);
-		    }
+		  __matcher._M_make_range(__last_char.second, _M_value[0]);
+		  __last_char.first = false;
+		}
+	      else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
+		{
+		  __matcher._M_make_range(__last_char.second, '-');
+		  __last_char.first = false;
 		}
 	      else
 		{
-		  __matcher._M_add_char(_M_value[0]);
-		  __last_char.second = _M_value[0];
+		  if (_M_scanner._M_get_token()
+		      != _ScannerT::_S_token_bracket_end)
+		    __throw_regex_error(
+		      regex_constants::error_range,
+		      "Character is expected after a dash.");
+		  __push_char('-');
 		}
 	    }
 	}
       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
-	__matcher._M_add_character_class(_M_value,
-					 _M_ctype.is(_CtypeT::upper,
-						     _M_value[0]));
+	{
+	  __flush();
+	  __matcher._M_add_character_class(_M_value,
+					   _M_ctype.is(_CtypeT::upper,
+						       _M_value[0]));
+	}
       else
 	__throw_regex_error(regex_constants::error_brack,
 			    "Unexpected character in bracket expression.");
diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h
index 37dea84..ed0b723 100644
--- a/libstdc++-v3/include/bits/regex_scanner.h
+++ b/libstdc++-v3/include/bits/regex_scanner.h
@@ -43,7 +43,7 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
   {
   public:
     /// Token types returned from the scanner.
-    enum _TokenT
+    enum _TokenT : unsigned
     {
       _S_token_anychar,
       _S_token_ord_char,
@@ -73,7 +73,8 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
       _S_token_comma,
       _S_token_dup_count,
       _S_token_eof,
-      _S_token_unknown
+      _S_token_bracket_dash,
+      _S_token_unknown = -1u
     };
 
   protected:
diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc
index fedba09..a734bb1 100644
--- a/libstdc++-v3/include/bits/regex_scanner.tcc
+++ b/libstdc++-v3/include/bits/regex_scanner.tcc
@@ -210,7 +210,9 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
       auto __c = *_M_current++;
 
-      if (__c == '[')
+      if (__c == '-')
+	_M_token = _S_token_bracket_dash;
+      else if (__c == '[')
 	{
 	  if (_M_current == _M_end)
 	    __throw_regex_error(regex_constants::error_brack,
diff --git a/libstdc++-v3/testsuite/28_regex/regression.cc b/libstdc++-v3/testsuite/28_regex/regression.cc
index d367c8b..fac7fa2 100644
--- a/libstdc++-v3/testsuite/28_regex/regression.cc
+++ b/libstdc++-v3/testsuite/28_regex/regression.cc
@@ -61,12 +61,35 @@  test03()
   VERIFY(!regex_search_debug("a", regex(R"(\b$)"), regex_constants::match_not_eow));
 }
 
+// PR libstdc++/77356
+void
+test04()
+{
+  bool test __attribute__((unused)) = true;
+
+  static const char* kNumericAnchor ="(\\$|usd)(usd|\\$|to|and|up to|[0-9,\\.\\-\\sk])+";
+  const std::regex re(kNumericAnchor);
+  (void)re;
+}
+
+void
+test05()
+{
+  bool test __attribute__((unused)) = true;
+
+  VERIFY(regex_match_debug("!", std::regex("[![:alnum:]]")));
+  VERIFY(regex_match_debug("-", std::regex("[a-]", regex_constants::basic)));
+  VERIFY(regex_match_debug("-", std::regex("[a-]")));
+}
+
 int
 main()
 {
   test01();
   test02();
   test03();
+  test04();
+  test05();
   return 0;
 }