Reduce regex _M_dfs frame size

Message ID	CAG4ZjNnH25cpSXPgFFb3GDHRDyBD2wDdRSgZTbKuVk5oze=9CA@mail.gmail.com
State	New
Headers	show Return-Path: <gcc-patches-return-434344-incoming=patchwork.ozlabs.org@gcc.gnu.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :mime-version:from:date:message-id:subject:to:content-type; q= dns; s=default; b=Qd7geE7XSIrtpDkwwpdzng85ZaaS84WRZKuXqwPXd5nMt+ YEZsDD6O9km8fbpOY2LfeUg00eRhAl2sVE7Maxd6q8wbxYVkAgsqfrwRwnmfjtDh srDrGzKMUvDqardiw0bPT1gVmR5aES/hv7A0W3+JzAwfA05BBz09MGG1RxsAc= Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org MIME-Version: 1.0 From: Tim Shen <timshen@google.com> Date: Sat, 20 Aug 2016 03:28:24 -0700 Message-ID: <CAG4ZjNnH25cpSXPgFFb3GDHRDyBD2wDdRSgZTbKuVk5oze=9CA@mail.gmail.com> Subject: [Patch] Reduce regex _M_dfs frame size To: gcc-patches <gcc-patches@gcc.gnu.org>, "libstdc++" <libstdc++@gcc.gnu.org> Content-Type: multipart/mixed; boundary=94eb2c12317067a89c053a7e4675

diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index ef8aa91..33a68dd 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -109,6 +109,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_rep_once_more(_Match_mode __match_mode, _StateIdT); void + _M_handle_repeat(_Match_mode, _StateIdT); + + void + _M_handle_subexpr_begin(_Match_mode, _StateIdT); + + void + _M_handle_subexpr_end(_Match_mode, _StateIdT); + + void + _M_handle_line_begin_assertion(_Match_mode, _StateIdT); + + void + _M_handle_line_end_assertion(_Match_mode, _StateIdT); + + void + _M_handle_word_boundary(_Match_mode, _StateIdT); + + void + _M_handle_subexpr_lookahead(_Match_mode, _StateIdT); + + void + _M_handle_match(_Match_mode, _StateIdT); + + void + _M_handle_backref(_Match_mode, _StateIdT); + + void + _M_handle_accept(_Match_mode, _StateIdT); + + void + _M_handle_alternative(_Match_mode, _StateIdT); + + void _M_dfs(_Match_mode __match_mode, _StateIdT __start); bool diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index 6bbcb1b..382909f 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -195,213 +195,295 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } }; + // _M_alt branch is "match once more", while _M_next is "get me out + // of this quantifier". Executing _M_next first or _M_alt first don't + // mean the same thing, and we need to choose the correct order under + // given greedy mode. template<typename _BiIter, typename _Alloc, typename _TraitsT, bool __dfs_mode> void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: - _M_dfs(_Match_mode __match_mode, _StateIdT __i) + _M_handle_repeat(_Match_mode __match_mode, _StateIdT __i) { - if (_M_states._M_visited(__i)) - return; - const auto& __state = _M_nfa[__i]; - // Every change on _M_cur_results and _M_current will be rolled back after - // finishing the recursion step. - switch (__state._M_opcode()) + + // Greedy. + if (!__state._M_neg) { - // _M_alt branch is "match once more", while _M_next is "get me out - // of this quantifier". Executing _M_next first or _M_alt first don't - // mean the same thing, and we need to choose the correct order under - // given greedy mode. - case _S_opcode_repeat: - { - // Greedy. - if (!__state._M_neg) - { - _M_rep_once_more(__match_mode, __i); - // If it's DFS executor and already accepted, we're done. - if (!__dfs_mode || !_M_has_sol) - _M_dfs(__match_mode, __state._M_next); - } - else // Non-greedy mode - { - if (__dfs_mode) - { - // vice-versa. - _M_dfs(__match_mode, __state._M_next); - if (!_M_has_sol) - _M_rep_once_more(__match_mode, __i); - } - else - { - // DON'T attempt anything, because there's already another - // state with higher priority accepted. This state cannot - // be better by attempting its next node. - if (!_M_has_sol) - { - _M_dfs(__match_mode, __state._M_next); - // DON'T attempt anything if it's already accepted. An - // accepted state *must* be better than a solution that - // matches a non-greedy quantifier one more time. - if (!_M_has_sol) - _M_rep_once_more(__match_mode, __i); - } - } - } - } - break; - case _S_opcode_subexpr_begin: - { - auto& __res = _M_cur_results[__state._M_subexpr]; - auto __back = __res.first; - __res.first = _M_current; - _M_dfs(__match_mode, __state._M_next); - __res.first = __back; - } - break; - case _S_opcode_subexpr_end: - { - auto& __res = _M_cur_results[__state._M_subexpr]; - auto __back = __res; - __res.second = _M_current; - __res.matched = true; - _M_dfs(__match_mode, __state._M_next); - __res = __back; - } - break; - case _S_opcode_line_begin_assertion: - if (_M_at_begin()) + _M_rep_once_more(__match_mode, __i); + // If it's DFS executor and already accepted, we're done. + if (!__dfs_mode || !_M_has_sol) _M_dfs(__match_mode, __state._M_next); - break; - case _S_opcode_line_end_assertion: - if (_M_at_end()) - _M_dfs(__match_mode, __state._M_next); - break; - case _S_opcode_word_boundary: - if (_M_word_boundary() == !__state._M_neg) - _M_dfs(__match_mode, __state._M_next); - break; - // Here __state._M_alt offers a single start node for a sub-NFA. - // We recursively invoke our algorithm to match the sub-NFA. - case _S_opcode_subexpr_lookahead: - if (_M_lookahead(__state._M_alt) == !__state._M_neg) - _M_dfs(__match_mode, __state._M_next); - break; - case _S_opcode_match: - if (_M_current == _M_end) - break; + } + else // Non-greedy mode + { if (__dfs_mode) { - if (__state._M_matches(*_M_current)) - { - ++_M_current; - _M_dfs(__match_mode, __state._M_next); - --_M_current; - } + // vice-versa. + _M_dfs(__match_mode, __state._M_next); + if (!_M_has_sol) + _M_rep_once_more(__match_mode, __i); } else - if (__state._M_matches(*_M_current)) - _M_states._M_queue(__state._M_next, _M_cur_results); - break; - // First fetch the matched result from _M_cur_results as __submatch; - // then compare it with - // (_M_current, _M_current + (__submatch.second - __submatch.first)). - // If matched, keep going; else just return and try another state. - case _S_opcode_backref: - { - __glibcxx_assert(__dfs_mode); - auto& __submatch = _M_cur_results[__state._M_backref_index]; - if (!__submatch.matched) - break; - auto __last = _M_current; - for (auto __tmp = __submatch.first; - __last != _M_end && __tmp != __submatch.second; - ++__tmp) - ++__last; - if (_M_re._M_automaton->_M_traits.transform(__submatch.first, - __submatch.second) - == _M_re._M_automaton->_M_traits.transform(_M_current, __last)) - { - if (__last != _M_current) - { - auto __backup = _M_current; - _M_current = __last; - _M_dfs(__match_mode, __state._M_next); - _M_current = __backup; - } - else - _M_dfs(__match_mode, __state._M_next); - } - } - break; - case _S_opcode_accept: - if (__dfs_mode) { - __glibcxx_assert(!_M_has_sol); - if (__match_mode == _Match_mode::_Exact) - _M_has_sol = _M_current == _M_end; - else - _M_has_sol = true; - if (_M_current == _M_begin - && (_M_flags & regex_constants::match_not_null)) - _M_has_sol = false; - if (_M_has_sol) + // DON'T attempt anything, because there's already another + // state with higher priority accepted. This state cannot + // be better by attempting its next node. + if (!_M_has_sol) { - if (_M_nfa._M_flags & regex_constants::ECMAScript) - _M_results = _M_cur_results; - else // POSIX - { - __glibcxx_assert(_M_states._M_get_sol_pos()); - // Here's POSIX's logic: match the longest one. However - // we never know which one (lhs or rhs of "|") is longer - // unless we try both of them and compare the results. - // The member variable _M_sol_pos records the end - // position of the last successful match. It's better - // to be larger, because POSIX regex is always greedy. - // TODO: This could be slow. - if (*_M_states._M_get_sol_pos() == _BiIter() - || std::distance(_M_begin, - *_M_states._M_get_sol_pos()) - < std::distance(_M_begin, _M_current)) - { - *_M_states._M_get_sol_pos() = _M_current; - _M_results = _M_cur_results; - } - } + _M_dfs(__match_mode, __state._M_next); + // DON'T attempt anything if it's already accepted. An + // accepted state *must* be better than a solution that + // matches a non-greedy quantifier one more time. + if (!_M_has_sol) + _M_rep_once_more(__match_mode, __i); } } - else + } + } + + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_handle_subexpr_begin(_Match_mode __match_mode, _StateIdT __i) + { + const auto& __state = _M_nfa[__i]; + + auto& __res = _M_cur_results[__state._M_subexpr]; + auto __back = __res.first; + __res.first = _M_current; + _M_dfs(__match_mode, __state._M_next); + __res.first = __back; + } + + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_handle_subexpr_end(_Match_mode __match_mode, _StateIdT __i) + { + const auto& __state = _M_nfa[__i]; + + auto& __res = _M_cur_results[__state._M_subexpr]; + auto __back = __res; + __res.second = _M_current; + __res.matched = true; + _M_dfs(__match_mode, __state._M_next); + __res = __back; + } + + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_handle_line_begin_assertion(_Match_mode __match_mode, _StateIdT __i) + { + const auto& __state = _M_nfa[__i]; + if (_M_at_begin()) + _M_dfs(__match_mode, __state._M_next); + } + + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_handle_line_end_assertion(_Match_mode __match_mode, _StateIdT __i) + { + const auto& __state = _M_nfa[__i]; + if (_M_at_end()) + _M_dfs(__match_mode, __state._M_next); + } + + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_handle_word_boundary(_Match_mode __match_mode, _StateIdT __i) + { + const auto& __state = _M_nfa[__i]; + if (_M_word_boundary() == !__state._M_neg) + _M_dfs(__match_mode, __state._M_next); + } + + // Here __state._M_alt offers a single start node for a sub-NFA. + // We recursively invoke our algorithm to match the sub-NFA. + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_handle_subexpr_lookahead(_Match_mode __match_mode, _StateIdT __i) + { + const auto& __state = _M_nfa[__i]; + if (_M_lookahead(__state._M_alt) == !__state._M_neg) + _M_dfs(__match_mode, __state._M_next); + } + + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_handle_match(_Match_mode __match_mode, _StateIdT __i) + { + const auto& __state = _M_nfa[__i]; + + if (_M_current == _M_end) + return; + if (__dfs_mode) + { + if (__state._M_matches(*_M_current)) { - if (_M_current == _M_begin - && (_M_flags & regex_constants::match_not_null)) - break; - if (__match_mode == _Match_mode::_Prefix || _M_current == _M_end) - if (!_M_has_sol) - { - _M_has_sol = true; - _M_results = _M_cur_results; - } + ++_M_current; + _M_dfs(__match_mode, __state._M_next); + --_M_current; } - break; - case _S_opcode_alternative: - if (_M_nfa._M_flags & regex_constants::ECMAScript) + } + else + if (__state._M_matches(*_M_current)) + _M_states._M_queue(__state._M_next, _M_cur_results); + } + + // First fetch the matched result from _M_cur_results as __submatch; + // then compare it with + // (_M_current, _M_current + (__submatch.second - __submatch.first)). + // If matched, keep going; else just return and try another state. + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_handle_backref(_Match_mode __match_mode, _StateIdT __i) + { + __glibcxx_assert(__dfs_mode); + + const auto& __state = _M_nfa[__i]; + auto& __submatch = _M_cur_results[__state._M_backref_index]; + if (!__submatch.matched) + return; + auto __last = _M_current; + for (auto __tmp = __submatch.first; + __last != _M_end && __tmp != __submatch.second; + ++__tmp) + ++__last; + if (_M_re._M_automaton->_M_traits.transform(__submatch.first, + __submatch.second) + == _M_re._M_automaton->_M_traits.transform(_M_current, __last)) + { + if (__last != _M_current) { - // TODO: Fix BFS support. It is wrong. - _M_dfs(__match_mode, __state._M_alt); - // Pick lhs if it matches. Only try rhs if it doesn't. - if (!_M_has_sol) - _M_dfs(__match_mode, __state._M_next); + auto __backup = _M_current; + _M_current = __last; + _M_dfs(__match_mode, __state._M_next); + _M_current = __backup; } else + _M_dfs(__match_mode, __state._M_next); + } + } + + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_handle_accept(_Match_mode __match_mode, _StateIdT __i) + { + if (__dfs_mode) + { + __glibcxx_assert(!_M_has_sol); + if (__match_mode == _Match_mode::_Exact) + _M_has_sol = _M_current == _M_end; + else + _M_has_sol = true; + if (_M_current == _M_begin + && (_M_flags & regex_constants::match_not_null)) + _M_has_sol = false; + if (_M_has_sol) { - // Try both and compare the result. - // See "case _S_opcode_accept:" handling above. - _M_dfs(__match_mode, __state._M_alt); - auto __has_sol = _M_has_sol; - _M_has_sol = false; - _M_dfs(__match_mode, __state._M_next); - _M_has_sol |= __has_sol; + if (_M_nfa._M_flags & regex_constants::ECMAScript) + _M_results = _M_cur_results; + else // POSIX + { + __glibcxx_assert(_M_states._M_get_sol_pos()); + // Here's POSIX's logic: match the longest one. However + // we never know which one (lhs or rhs of "|") is longer + // unless we try both of them and compare the results. + // The member variable _M_sol_pos records the end + // position of the last successful match. It's better + // to be larger, because POSIX regex is always greedy. + // TODO: This could be slow. + if (*_M_states._M_get_sol_pos() == _BiIter() + || std::distance(_M_begin, + *_M_states._M_get_sol_pos()) + < std::distance(_M_begin, _M_current)) + { + *_M_states._M_get_sol_pos() = _M_current; + _M_results = _M_cur_results; + } + } } - break; + } + else + { + if (_M_current == _M_begin + && (_M_flags & regex_constants::match_not_null)) + return; + if (__match_mode == _Match_mode::_Prefix || _M_current == _M_end) + if (!_M_has_sol) + { + _M_has_sol = true; + _M_results = _M_cur_results; + } + } + } + + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_handle_alternative(_Match_mode __match_mode, _StateIdT __i) + { + const auto& __state = _M_nfa[__i]; + + if (_M_nfa._M_flags & regex_constants::ECMAScript) + { + // TODO: Fix BFS support. It is wrong. + _M_dfs(__match_mode, __state._M_alt); + // Pick lhs if it matches. Only try rhs if it doesn't. + if (!_M_has_sol) + _M_dfs(__match_mode, __state._M_next); + } + else + { + // Try both and compare the result. + // See "case _S_opcode_accept:" handling above. + _M_dfs(__match_mode, __state._M_alt); + auto __has_sol = _M_has_sol; + _M_has_sol = false; + _M_dfs(__match_mode, __state._M_next); + _M_has_sol |= __has_sol; + } + } + + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_dfs(_Match_mode __match_mode, _StateIdT __i) + { + if (_M_states._M_visited(__i)) + return; + + switch (_M_nfa[__i]._M_opcode()) + { + case _S_opcode_repeat: + _M_handle_repeat(__match_mode, __i); break; + case _S_opcode_subexpr_begin: + _M_handle_subexpr_begin(__match_mode, __i); break; + case _S_opcode_subexpr_end: + _M_handle_subexpr_end(__match_mode, __i); break; + case _S_opcode_line_begin_assertion: + _M_handle_line_begin_assertion(__match_mode, __i); break; + case _S_opcode_line_end_assertion: + _M_handle_line_end_assertion(__match_mode, __i); break; + case _S_opcode_word_boundary: + _M_handle_word_boundary(__match_mode, __i); break; + case _S_opcode_subexpr_lookahead: + _M_handle_subexpr_lookahead(__match_mode, __i); break; + case _S_opcode_match: + _M_handle_match(__match_mode, __i); break; + case _S_opcode_backref: + _M_handle_backref(__match_mode, __i); break; + case _S_opcode_accept: + _M_handle_accept(__match_mode, __i); break; + case _S_opcode_alternative: + _M_handle_alternative(__match_mode, __i); break; default: __glibcxx_assert(false); }

Reduce regex _M_dfs frame size

Commit Message

Comments

Patch