diff mbox

[PR,61424] std::regex matches right to left, not leftmost longest

Message ID CAPrifDnWns+QWcJgXe7Aw0NnTnLp4+UmwSL0hHcEhHhxXrDbdw@mail.gmail.com
State New
Headers show

Commit Message

Tim Shen June 28, 2014, 7:18 a.m. UTC
On Tue, Jun 10, 2014 at 1:56 PM, Tim Shen <timshen91@gmail.com> wrote:
> On Tue, Jun 10, 2014 at 9:54 AM, Jonathan Wakely <jwakely@redhat.com> wrote:
>> I'm sure this is because I still don't understand all the regex code,
>> but doesn't this change mean that for an "extended" mode regex with
>> backrefs, the user could define _GLIBCXX_REGEX_USE_THOMPSON_NFA and
>> backrefs wouldn't work?
>
> Sorry I missed that basic POSIX (BRE) has back-references (damn!), but
> extended POSIX (ERE) doesn't. So it should look like:
> -      if (!__re._M_automaton->_M_has_backref
> +      if (!(__re._M_automaton->_M_has_backref || (__re._M_flags &
> regex_constants::ECMAScript))
> ...and all deleted _M_has_backref lines should be undeleted.
>
> This patch is a temporary (I'm not sure how long though) workaround;
> BFS's support for ECMAScript with no back-references shall be done
> finally.

Sorry for late; this is the complete patch.

Bootstrapped and tested.

Thanks!

Comments

Jonathan Wakely June 28, 2014, 9:45 a.m. UTC | #1
On 28/06/14 00:18 -0700, Tim Shen wrote:
>diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
>index 1991c00..e02fa65 100644
>--- a/libstdc++-v3/include/bits/regex_executor.h
>+++ b/libstdc++-v3/include/bits/regex_executor.h
>@@ -173,6 +173,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> 	  void _M_queue(_StateIdT __i, const _ResultsVec& __res)
> 	  { _M_match_queue.emplace_back(__i, __res); }
>
>+	  _BiIter* _M_get_sol_pos() { return nullptr; }
>+
> 	  // Saves states that need to be considered for the next character.
> 	  vector<pair<_StateIdT, _ResultsVec>>	_M_match_queue;
> 	  // Indicates which states are already visited.
>@@ -191,12 +193,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> 	  // Dummy implementations for DFS mode.
> 	  bool _M_visited(_StateIdT) const { return false; }
> 	  void _M_queue(_StateIdT, const _ResultsVec&) { }
>+	  _BiIter* _M_get_sol_pos() { return &_M_sol_pos; }

Please put a space above this new function, otherwise it looks like
the "Dummy implementations" comment applies to this function, but in
fact that function is used for dfs mode.

Similarly, maybe there should be a "Dummy implementation" comment on
the _State_info<__bfs, RV>::_M_get_sol_pos function.

OK with that change, thanks for the fix.
Tim Shen July 1, 2014, 2:17 a.m. UTC | #2
On Sat, Jun 28, 2014 at 2:45 AM, Jonathan Wakely <jwakely@redhat.com> wrote:
> On 28/06/14 00:18 -0700, Tim Shen wrote:
> Please put a space above this new function, otherwise it looks like
> the "Dummy implementations" comment applies to this function, but in
> fact that function is used for dfs mode.
>
> Similarly, maybe there should be a "Dummy implementation" comment on
> the _State_info<__bfs, RV>::_M_get_sol_pos function.
>
> OK with that change, thanks for the fix.
>

Committed.

Thanks!
Jonathan Wakely July 1, 2014, 8:40 a.m. UTC | #3
On 30/06/14 19:17 -0700, Tim Shen wrote:
>On Sat, Jun 28, 2014 at 2:45 AM, Jonathan Wakely <jwakely@redhat.com> wrote:
>> On 28/06/14 00:18 -0700, Tim Shen wrote:
>> Please put a space above this new function, otherwise it looks like
>> the "Dummy implementations" comment applies to this function, but in
>> fact that function is used for dfs mode.
>>
>> Similarly, maybe there should be a "Dummy implementation" comment on
>> the _State_info<__bfs, RV>::_M_get_sol_pos function.
>>
>> OK with that change, thanks for the fix.
>>
>
>Committed.

Great, thanks again.
diff mbox

Patch

diff --git a/libstdc++-v3/include/bits/regex.tcc b/libstdc++-v3/include/bits/regex.tcc
index a81f517..3322379 100644
--- a/libstdc++-v3/include/bits/regex.tcc
+++ b/libstdc++-v3/include/bits/regex.tcc
@@ -71,6 +71,7 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
       // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach.
       bool __ret;
       if (!__re._M_automaton->_M_has_backref
+	  && !(__re._M_flags & regex_constants::ECMAScript)
 #ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA
 	  && __policy == _RegexExecutorPolicy::_S_alternate
 #endif
diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc
index 0df10cc..f15f7dd 100644
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -103,9 +103,12 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  auto __end = _M_nfa._M_insert_dummy();
 	  __alt1._M_append(__end);
 	  __alt2._M_append(__end);
+	  // __alt2 is state._M_next, __alt1 is state._M_alt. The executor
+	  // executes _M_alt before _M_next, as well as executing left
+	  // alternative before right one.
 	  _M_stack.push(_StateSeqT(_M_nfa,
-				   _M_nfa._M_insert_alt(__alt1._M_start,
-							__alt2._M_start, false),
+				   _M_nfa._M_insert_alt(__alt2._M_start,
+							__alt1._M_start, false),
 				   __end));
 	}
     }
diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
index 1991c00..e02fa65 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -173,6 +173,8 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  void _M_queue(_StateIdT __i, const _ResultsVec& __res)
 	  { _M_match_queue.emplace_back(__i, __res); }
 
+	  _BiIter* _M_get_sol_pos() { return nullptr; }
+
 	  // Saves states that need to be considered for the next character.
 	  vector<pair<_StateIdT, _ResultsVec>>	_M_match_queue;
 	  // Indicates which states are already visited.
@@ -191,12 +193,13 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  // Dummy implementations for DFS mode.
 	  bool _M_visited(_StateIdT) const { return false; }
 	  void _M_queue(_StateIdT, const _ResultsVec&) { }
+	  _BiIter* _M_get_sol_pos() { return &_M_sol_pos; }
 
 	  // To record current solution.
 	  _StateIdT _M_start;
+	  _BiIter   _M_sol_pos;
 	};
 
-
     public:
       _ResultsVec                                           _M_cur_results;
       _BiIter                                               _M_current;
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index aefa8f4..38b8ff2 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -82,6 +82,7 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
     _M_main_dispatch(_Match_mode __match_mode, __dfs)
     {
       _M_has_sol = false;
+      *_M_states._M_get_sol_pos() = _BiIter();
       _M_cur_results = _M_results;
       _M_dfs(__match_mode, _M_states._M_start);
       return _M_has_sol;
@@ -338,7 +339,29 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 		  && (_M_flags & regex_constants::match_not_null))
 		_M_has_sol = false;
 	      if (_M_has_sol)
-		_M_results = _M_cur_results;
+		{
+		  if (_M_nfa._M_flags & regex_constants::ECMAScript)
+		    _M_results = _M_cur_results;
+		  else // POSIX
+		    {
+		      _GLIBCXX_DEBUG_ASSERT(_M_states._M_get_sol_pos());
+		      // Here's POSIX's logic: match the longest one. However
+		      // we never know which one (lhs or rhs of "|") is longer
+		      // unless we try both of them and compare the results.
+		      // The member variable _M_sol_pos records the end
+		      // position of the last successful match. It's better
+		      // to be larger, because POSIX regex is always greedy.
+		      // TODO: This could be slow.
+		      if (*_M_states._M_get_sol_pos() == _BiIter()
+			  || std::distance(_M_begin,
+					   *_M_states._M_get_sol_pos())
+			     < std::distance(_M_begin, _M_current))
+			{
+			  *_M_states._M_get_sol_pos() = _M_current;
+			  _M_results = _M_cur_results;
+			}
+		    }
+		}
 	    }
 	  else
 	    {
@@ -354,9 +377,25 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	    }
 	  break;
 	case _S_opcode_alternative:
-	  _M_dfs(__match_mode, __state._M_alt);
-	  if (!__dfs_mode || !_M_has_sol)
-	    _M_dfs(__match_mode, __state._M_next);
+	  if (_M_nfa._M_flags & regex_constants::ECMAScript)
+	    {
+	      // TODO: Let DFS support ECMAScript's alternative operation.
+	      _GLIBCXX_DEBUG_ASSERT(!__dfs_mode);
+	      _M_dfs(__match_mode, __state._M_alt);
+	      // Pick lhs if it matches. Only try rhs if it doesn't.
+	      if (!_M_has_sol)
+		_M_dfs(__match_mode, __state._M_next);
+	    }
+	  else
+	    {
+	      // Try both and compare the result.
+	      // See "case _S_opcode_accept:" handling above.
+	      _M_dfs(__match_mode, __state._M_alt);
+	      auto __has_sol = _M_has_sol;
+	      _M_has_sol = false;
+	      _M_dfs(__match_mode, __state._M_next);
+	      _M_has_sol |= __has_sol;
+	    }
 	  break;
 	default:
 	  _GLIBCXX_DEBUG_ASSERT(false);
diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h
index 6627db9..5552226 100644
--- a/libstdc++-v3/include/bits/regex_scanner.h
+++ b/libstdc++-v3/include/bits/regex_scanner.h
@@ -67,7 +67,6 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
       _S_token_or,
       _S_token_closure0,
       _S_token_closure1,
-      _S_token_ungreedy,
       _S_token_line_begin,
       _S_token_line_end,
       _S_token_word_bound, // neg if _M_value[0] == 'n'
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc
new file mode 100644
index 0000000..bdccb4a
--- /dev/null
+++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc
@@ -0,0 +1,52 @@ 
+// { dg-options "-std=gnu++11" }
+
+// Copyright (C) 2014 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// PR libstdc++/61424
+
+#include <regex>
+#include <testsuite_hooks.h>
+#include <testsuite_regex.h>
+
+using namespace std;
+using namespace __gnu_test;
+
+int main()
+{
+  regex_constants::syntax_option_type grammar[] = {
+    regex_constants::ECMAScript, regex_constants::extended,
+    regex_constants::awk, regex_constants::egrep
+  };
+
+  string sol[] = {
+      "tour",
+      "tournament",
+      "tournament",
+      "tournament",
+  };
+  int i = 0;
+  for (auto g : grammar)
+  {
+    regex re("tour|tournament|tourn", g);
+    const char str[] = "tournament";
+    cmatch m;
+    VERIFY(regex_search_debug(str, m, re));
+    VERIFY(sol[i] == m[0]);
+    i++;
+  }
+}