From patchwork Fri Apr 25 22:04:20 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tim Shen X-Patchwork-Id: 343011 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id EB285140143 for ; Sat, 26 Apr 2014 08:04:41 +1000 (EST) DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :mime-version:in-reply-to:references:date:message-id:subject :from:to:content-type; q=dns; s=default; b=LhfqwiTnMXi1btzesPvES V16JNnX42wGQkQGLbiL+FFUzSE2L0GGiM0I4iK0M3w4ZhIx/9kIzaHRLcWk3J4J4 OuGPt3BAsvz8Oe59DQthHa9A98VMriYskW657h7WImjrtM5Sa7xemkPC3e8TFh1r bZB7iqoi+jQKX+Qtn7ob8I= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :mime-version:in-reply-to:references:date:message-id:subject :from:to:content-type; s=default; bh=neXjLcPr2s6CTKBnzrX+NeLwo6k =; b=Ba88P+M0NjMTiJG6wxTkbdmFO/UGU2txxRI1y3HQDOuZFIaS4+bGBHYTJ7r 38eSG/AeBxAfSqmoCl0OOtfl9MGvVMCaoRNHZrWQtvPzKuD/1D4HO68fI6R/ggNu MIRizi0q9bmm5sEQzqPFoxwj7bQom41m4JNgX0x6JPr3Yt7w= Received: (qmail 2893 invoked by alias); 25 Apr 2014 22:04:24 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 2868 invoked by uid 89); 25 Apr 2014 22:04:23 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-2.2 required=5.0 tests=AWL, BAYES_00, FREEMAIL_ENVFROM_END_DIGIT, FREEMAIL_FROM, RCVD_IN_DNSWL_LOW, SPF_PASS autolearn=ham version=3.3.2 X-Spam-User: qpsmtpd, 2 recipients X-HELO: mail-oa0-f48.google.com Received: from mail-oa0-f48.google.com (HELO mail-oa0-f48.google.com) (209.85.219.48) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with (AES128-SHA encrypted) ESMTPS; Fri, 25 Apr 2014 22:04:22 +0000 Received: by mail-oa0-f48.google.com with SMTP id m1so4879850oag.35 for ; Fri, 25 Apr 2014 15:04:20 -0700 (PDT) MIME-Version: 1.0 X-Received: by 10.60.58.7 with SMTP id m7mr4196217oeq.59.1398463460866; Fri, 25 Apr 2014 15:04:20 -0700 (PDT) Received: by 10.182.250.65 with HTTP; Fri, 25 Apr 2014 15:04:20 -0700 (PDT) In-Reply-To: References: Date: Fri, 25 Apr 2014 18:04:20 -0400 Message-ID: Subject: Re: [Patch 3/3] Separate `repeating` node from `alternative` node in regex From: Tim Shen To: "libstdc++" , gcc-patches X-IsSubscribed: yes Patch attached. diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h index 64ecd6d..1b0da14 100644 --- a/libstdc++-v3/include/bits/regex_automaton.h +++ b/libstdc++-v3/include/bits/regex_automaton.h @@ -52,6 +52,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { _S_opcode_unknown, _S_opcode_alternative, + _S_opcode_repeat, _S_opcode_backref, _S_opcode_line_begin_assertion, _S_opcode_line_end_assertion, @@ -74,7 +75,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION size_t _M_backref_index; // for _S_opcode_backref struct { - // for _S_opcode_alternative or _S_opcode_subexpr_lookahead + // for _S_opcode_alternative, _S_opcode_repeat and + // _S_opcode_subexpr_lookahead _StateIdT _M_alt; // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or // quantifiers (ungreedy if set true) @@ -174,6 +176,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // approach. __tmp._M_next = __next; __tmp._M_alt = __alt; + return _M_insert_state(std::move(__tmp)); + } + + _StateIdT + _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg) + { + _StateT __tmp(_S_opcode_repeat); + // It labels every quantifier to make greedy comparison easier in BFS + // approach. + __tmp._M_next = __next; + __tmp._M_alt = __alt; __tmp._M_neg = __neg; return _M_insert_state(std::move(__tmp)); } diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc index 38787fa..e0ac3f9 100644 --- a/libstdc++-v3/include/bits/regex_automaton.tcc +++ b/libstdc++-v3/include/bits/regex_automaton.tcc @@ -41,6 +41,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION switch (_M_opcode) { case _S_opcode_alternative: + case _S_opcode_repeat: ostr << "alt next=" << _M_next << " alt=" << _M_alt; break; case _S_opcode_subexpr_begin: @@ -72,6 +73,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION switch (_M_opcode) { case _S_opcode_alternative: + case _S_opcode_repeat: __ostr << __id << " [label=\"" << __id << "\\nALT\"];\n" << __id << " -> " << _M_next << " [label=\"next\", tailport=\"s\"];\n" @@ -174,6 +176,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION == _S_opcode_dummy) __it._M_next = (*this)[__it._M_next]._M_next; if (__it._M_opcode == _S_opcode_alternative + || __it._M_opcode == _S_opcode_repeat || __it._M_opcode == _S_opcode_subexpr_lookahead) while (__it._M_alt >= 0 && (*this)[__it._M_alt]._M_opcode == _S_opcode_dummy) @@ -198,6 +201,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __id = _M_nfa._M_insert_state(__dup); __m[__u] = __id; if (__dup._M_opcode == _S_opcode_alternative + || __dup._M_opcode == _S_opcode_repeat || __dup._M_opcode == _S_opcode_subexpr_lookahead) if (__dup._M_alt != _S_invalid_state_id && __m[__dup._M_alt] == -1) __stack.push(__dup._M_alt); @@ -217,6 +221,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __ref._M_next = __m[__ref._M_next]; } if (__ref._M_opcode == _S_opcode_alternative + || __ref._M_opcode == _S_opcode_repeat || __ref._M_opcode == _S_opcode_subexpr_lookahead) if (__ref._M_alt != _S_invalid_state_id) { diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index f5a198f..d7e2162 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -421,7 +421,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _M_make_cache(true_type) { - for (int __i = 0; __i < _M_cache.size(); __i++) + for (size_t __i = 0; __i < _M_cache.size(); __i++) _M_cache[static_cast<_UnsignedCharT>(__i)] = _M_apply(__i, false_type()); } diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 128dac1..3cf9e457 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -188,8 +188,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { __init(); auto __e = _M_pop(); - _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, - __e._M_start, __neg)); + _StateSeqT __r(_M_nfa, _M_nfa._M_insert_repeat(_S_invalid_state_id, + __e._M_start, __neg)); __e._M_append(__r); _M_stack.push(__r); } @@ -197,8 +197,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { __init(); auto __e = _M_pop(); - __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start, - __neg)); + __e._M_append(_M_nfa._M_insert_repeat(_S_invalid_state_id, + __e._M_start, __neg)); _M_stack.push(__e); } else if (_M_match_token(_ScannerT::_S_token_opt)) @@ -206,8 +206,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __init(); auto __e = _M_pop(); auto __end = _M_nfa._M_insert_dummy(); - _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, - __e._M_start, __neg)); + _StateSeqT __r(_M_nfa, _M_nfa._M_insert_repeat(_S_invalid_state_id, + __e._M_start, __neg)); __e._M_append(__end); __r._M_append(__end); _M_stack.push(__r); @@ -244,8 +244,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { auto __tmp = __r._M_clone(); _StateSeqT __s(_M_nfa, - _M_nfa._M_insert_alt(_S_invalid_state_id, - __tmp._M_start, __neg)); + _M_nfa._M_insert_repeat(_S_invalid_state_id, + __tmp._M_start, __neg)); __tmp._M_append(__s); __e._M_append(__s); } @@ -261,8 +261,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION for (long __i = 0; __i < __n; ++__i) { auto __tmp = __r._M_clone(); - auto __alt = _M_nfa._M_insert_alt(__tmp._M_start, - __end, __neg); + auto __alt = _M_nfa._M_insert_repeat(__tmp._M_start, + __end, __neg); __stack.push(__alt); __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end)); } diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index bb10a72..6dd4d25 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -194,7 +194,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } }; - // TODO: Use a function vector to dispatch, instead of using switch-case. template template @@ -217,7 +216,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // of this quantifier". Executing _M_next first or _M_alt first don't // mean the same thing, and we need to choose the correct order under // given greedy mode. - case _S_opcode_alternative: + case _S_opcode_repeat: { // Greedy. if (!__state._M_neg) @@ -364,6 +363,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } } break; + case _S_opcode_alternative: + _M_dfs<__match_mode>(__state._M_alt); + if (!__dfs_mode || !_M_has_sol) + _M_dfs<__match_mode>(__state._M_next); + break; default: _GLIBCXX_DEBUG_ASSERT(false); } diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc index 5332d2e..f501ff6 100644 --- a/libstdc++-v3/include/bits/regex_scanner.tcc +++ b/libstdc++-v3/include/bits/regex_scanner.tcc @@ -335,7 +335,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION else if (__c == 'x' || __c == 'u') { _M_value.erase(); - for (int i = 0; i < (__c == 'x' ? 2 : 4); i++) + for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++) { if (_M_current == _M_end || !_M_ctype.is(_CtypeT::xdigit, *_M_current))