diff mbox

[libstdc++/67015] Fix regex POSIX bracket parsing

Message ID CAG4ZjN=99FjiPnG_FaUVKnpY65qgSKUqaFC-x42=198Y0A+Kew@mail.gmail.com
State New
Headers show

Commit Message

Tim Shen July 26, 2015, 12:20 p.m. UTC
On Sun, Jul 26, 2015 at 5:19 AM, Tim Shen <timshen@google.com> wrote:
> Kinda important, since "[a-z0-9-]" may be a common case.
>
> Bootstrapped and tested.

Actual patch...

Comments

Jonathan Wakely July 27, 2015, 11:45 a.m. UTC | #1
On 26/07/15 05:20 -0700, Tim Shen wrote:
>@@ -389,7 +391,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> #endif
>       }
>
>-      void
>+      _StringT
>       _M_add_collating_element(const _StringT& __s)
>       {
> 	auto __st = _M_traits.lookup_collatename(__s.data(),

Changing this return type is an ABI change. When compiled with older
versions of GCC this function will not return anything, so code
compiled with newer versions that links to that older code will read
garbage off the stack if linked to the old version.

(This isn't a problem for _M_expression_term because the return type
is part of the mangled name for function templates).

One solution would be to rename the function, so that code compiled
with the new GCC will use the new function, not link to the old
version that doesn't return anything.
diff mbox

Patch

diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h
index 4472116..6d9799e 100644
--- a/libstdc++-v3/include/bits/regex_compiler.h
+++ b/libstdc++-v3/include/bits/regex_compiler.h
@@ -116,8 +116,10 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	void
 	_M_insert_bracket_matcher(bool __neg);
 
+      // Returns true if successfully matched one term and should continue.
+      // Returns false if the compiler should move on.
       template<bool __icase, bool __collate>
-	void
+	bool
 	_M_expression_term(pair<bool, _CharT>& __last_char,
 			   _BracketMatcher<_TraitsT, __icase, __collate>&
 			   __matcher);
@@ -389,7 +391,7 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #endif
       }
 
-      void
+      _StringT
       _M_add_collating_element(const _StringT& __s)
       {
 	auto __st = _M_traits.lookup_collatename(__s.data(),
@@ -400,6 +402,7 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #ifdef _GLIBCXX_DEBUG
 	_M_is_ready = false;
 #endif
+	return __st;
       }
 
       void
diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc
index 33d7118..f48e3c1 100644
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -424,8 +424,7 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	    __last_char.first = true;
 	    __last_char.second = _M_value[0];
 	  }
-      while (!_M_match_token(_ScannerT::_S_token_bracket_end))
-	_M_expression_term(__last_char, __matcher);
+      while (_M_expression_term(__last_char, __matcher));
       __matcher._M_ready();
       _M_stack.push(_StateSeqT(
 		      *_M_nfa,
@@ -434,21 +433,31 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
   template<typename _TraitsT>
   template<bool __icase, bool __collate>
-    void
+    bool
     _Compiler<_TraitsT>::
     _M_expression_term(pair<bool, _CharT>& __last_char,
 		       _BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
     {
+      if (_M_match_token(_ScannerT::_S_token_bracket_end))
+	return false;
+
       if (_M_match_token(_ScannerT::_S_token_collsymbol))
-	__matcher._M_add_collating_element(_M_value);
+	{
+	  auto __symbol = __matcher._M_add_collating_element(_M_value);
+	  if (__symbol.size() == 1)
+	    {
+	      __last_char.first = true;
+	      __last_char.second = __symbol[0];
+	    }
+	}
       else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
 	__matcher._M_add_equivalence_class(_M_value);
       else if (_M_match_token(_ScannerT::_S_token_char_class_name))
 	__matcher._M_add_character_class(_M_value, false);
-      // POSIX doesn't permit '-' as a start-range char (say [a-z--0]),
-      // except when the '-' is the first character in the bracket expression
-      // ([--0]). ECMAScript treats all '-' after a range as a normal character.
-      // Also see above, where _M_expression_term gets called.
+      // POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
+      // except when the '-' is the first or last character in the bracket
+      // expression ([--0]). ECMAScript treats all '-' after a range as a
+      // normal character. Also see above, where _M_expression_term gets called.
       //
       // As a result, POSIX rejects [-----], but ECMAScript doesn't.
       // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
@@ -459,10 +468,14 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	{
 	  if (!__last_char.first)
 	    {
+	      __matcher._M_add_char(_M_value[0]);
 	      if (_M_value[0] == '-'
 		  && !(_M_flags & regex_constants::ECMAScript))
-		__throw_regex_error(regex_constants::error_range);
-	      __matcher._M_add_char(_M_value[0]);
+		{
+		  if (_M_match_token(_ScannerT::_S_token_bracket_end))
+		    return false;
+		  __throw_regex_error(regex_constants::error_range);
+		}
 	      __last_char.first = true;
 	      __last_char.second = _M_value[0];
 	    }
@@ -496,6 +509,8 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 						     _M_value[0]));
       else
 	__throw_regex_error(regex_constants::error_brack);
+
+      return true;
     }
 
   template<typename _TraitsT>
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc
index f7653c6..62131a0 100644
--- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc
+++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc
@@ -82,6 +82,22 @@  test02()
     VERIFY(e.code() == std::regex_constants::error_range);
   }
   std::regex re("[-----]", std::regex::ECMAScript);
+
+  VERIFY(!regex_match("b", regex("[-ac]", regex_constants::extended)));
+  VERIFY(!regex_match("b", regex("[ac-]", regex_constants::extended)));
+  VERIFY(regex_match("b", regex("[^-ac]", regex_constants::extended)));
+  VERIFY(regex_match("b", regex("[^ac-]", regex_constants::extended)));
+  VERIFY(regex_match("&", regex("[%--]", regex_constants::extended)));
+  VERIFY(regex_match(".", regex("[--@]", regex_constants::extended)));
+  try
+  {
+    regex("[a--@]", regex_constants::extended);
+    VERIFY(false);
+  }
+  catch (const std::regex_error& e)
+  {
+  }
+  VERIFY(regex_match("].", regex("[][.hyphen.]-0]*", regex_constants::extended)));
 }
 
 void
@@ -115,6 +131,44 @@  test04()
   VERIFY(regex_match_debug("w", re));
 }
 
+// libstdc++/67015
+void
+test05()
+{
+  bool test __attribute__((unused)) = true;
+
+  regex lanana_namespace("^[a-z0-9]+$", regex::extended);
+  regex lsb_namespace("^_?([a-z0-9_.]+-, regex::extended)+[a-z0-9]+$");
+  regex debian_dpkg_conffile_cruft("dpkg-(old|dist|new|tmp, regex::extended)$");
+  regex debian_cron_namespace("^[a-z0-9][a-z0-9-]*$", regex::extended);
+  VERIFY(regex_match("test", debian_cron_namespace));
+  VERIFY(!regex_match("-a", debian_cron_namespace));
+  VERIFY(regex_match("a-", debian_cron_namespace));
+  regex debian_cron_namespace_ok("^[a-z0-9][-a-z0-9]*$", regex::extended);
+  VERIFY(regex_match("test", debian_cron_namespace_ok));
+  VERIFY(!regex_match("-a", debian_cron_namespace_ok));
+  VERIFY(regex_match("a-", debian_cron_namespace_ok));
+}
+
+// libstdc++/67015
+void
+test06()
+{
+  bool test __attribute__((unused)) = true;
+
+  regex lanana_namespace("^[a-z0-9]+$");
+  regex lsb_namespace("^_?([a-z0-9_.]+-)+[a-z0-9]+$");
+  regex debian_dpkg_conffile_cruft("dpkg-(old|dist|new|tmp)$");
+  regex debian_cron_namespace("^[a-z0-9][a-z0-9-]*$");
+  VERIFY(regex_match("test", debian_cron_namespace));
+  VERIFY(!regex_match("-a", debian_cron_namespace));
+  VERIFY(regex_match("a-", debian_cron_namespace));
+  regex debian_cron_namespace_ok("^[a-z0-9][-a-z0-9]*$");
+  VERIFY(regex_match("test", debian_cron_namespace_ok));
+  VERIFY(!regex_match("-a", debian_cron_namespace_ok));
+  VERIFY(regex_match("a-", debian_cron_namespace_ok));
+}
+
 int
 main()
 {
@@ -122,5 +176,8 @@  main()
   test02();
   test03();
   test04();
+  test05();
+  test06();
+
   return 0;
 }