From patchwork Sun Oct 27 20:12:11 2013 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tim Shen X-Patchwork-Id: 286364 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (Client did not present a certificate) by ozlabs.org (Postfix) with ESMTPS id 5D2C42C00C7 for ; Mon, 28 Oct 2013 07:12:26 +1100 (EST) DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :mime-version:date:message-id:subject:from:to:content-type; q= dns; s=default; b=h5A8EPGBeR6dcfvzqd37AKvmnDt3tpRhk1l4gW5lrraPsl BIKsayvzdUgbi+aq1sjR8Ng6XKF7soDs29yVxABG76hMwjzFNqGj7lJ206vNOTS9 qymOjH7TsmUWCv/9KLy7foufMothpc4CKzPTPVf40luZR/gVtPZC4jHzWfYYE= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :mime-version:date:message-id:subject:from:to:content-type; s= default; bh=LA533ckMl7NfFSoOkcGGcamWLTg=; b=gqT8ebXwKGlh3/2R9yE+ xJS7s2DNErkfsRWypzcQxL/buhMFTlwbTXFXlwO3Dp5tRS0yh4joFs4MgV/p71aK iI9uFWqXbvOrdf/5PqdRZGqzJgKYDibT2Vg29CGD00/2IEJ7bCgD/xKBEhrWtsqf MC0/8FfY36zJLEeUNXJuSHA= Received: (qmail 14866 invoked by alias); 27 Oct 2013 20:12:17 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 14843 invoked by uid 89); 27 Oct 2013 20:12:16 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-2.3 required=5.0 tests=AWL, BAYES_00, FREEMAIL_ENVFROM_END_DIGIT, FREEMAIL_FROM, RCVD_IN_DNSWL_LOW, SPF_PASS autolearn=ham version=3.3.2 X-Spam-User: qpsmtpd, 2 recipients X-HELO: mail-ob0-f170.google.com Received: from mail-ob0-f170.google.com (HELO mail-ob0-f170.google.com) (209.85.214.170) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with (AES128-SHA encrypted) ESMTPS; Sun, 27 Oct 2013 20:12:13 +0000 Received: by mail-ob0-f170.google.com with SMTP id wp18so2854578obc.29 for ; Sun, 27 Oct 2013 13:12:11 -0700 (PDT) MIME-Version: 1.0 X-Received: by 10.60.68.10 with SMTP id r10mr175961oet.62.1382904731335; Sun, 27 Oct 2013 13:12:11 -0700 (PDT) Received: by 10.182.202.9 with HTTP; Sun, 27 Oct 2013 13:12:11 -0700 (PDT) Date: Sun, 27 Oct 2013 16:12:11 -0400 Message-ID: Subject: [Patch] Regex comments From: Tim Shen To: "libstdc++" , gcc-patches X-IsSubscribed: yes Add comments for last regex commit. Thanks! commit b4aa16a08992866ca6fd60f40e422f551d0d6b2a Author: tim Date: Sun Oct 27 15:50:44 2013 -0400 2013- diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index d3b9a04..2f677de 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -53,6 +53,52 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } + // This function serves in different modes, DFS mode or BFS mode, indicated + // by template variable __dfs_mode. See _M_main for details. + // + // ------------------------------------------------------------ + // + // DFS mode: + // + // It applys a Depth-First-Search (aka backtracking) on given NFA and input + // string. + // At the very beginning the executor stands in the start state, then it try + // every possible state transition in current state recursively. Some state + // transitions consume input string, say, a single-char-matcher or a + // back-reference matcher; some don't, like assertion or other anchor nodes. + // When the input is exhausted and/or the current state is an accepting state, + // the whole executor returns true. + // + // TODO: This approach is exponentially slow for certain input. + // Try to compile the NFA to a DFA. + // + // Time complexity: o(match_length), O(2^(_M_nfa.size())) + // Space complexity: \theta(match_results.size() + match_length) + // + // ------------------------------------------------------------ + // + // BFS mode: + // + // Russ Cox's article (http://swtch.com/~rsc/regexp/regexp1.html) + // explained this algorithm clearly. + // + // It first computes epsilon clousure for every state that's still matching, + // using the same DFS algorithm, but doesn't reenter states (set true in + // _M_visited), nor follow _S_opcode_match. + // + // Then apply DFS to every _S_opcode_match one by one (in _M_match_queue). + // + // The order of which states needs to be recursively applied DFS matters, + // depend on which greedy mode we use. See _S_opcode_alternative branch in + // _M_dfs. + // + // It significantly reduces potential duplicate states, so have a better + // upper bound; but it deserves more overhead. + // + // Time complexity: o(match_length * match_results.size()) + // O(match_length * _M_nfa.size() * match_results.size()) + // Space complexity: o(_M_nfa.size() + match_results.size()) + // O(_M_nfa.size() * match_results.size()) template template @@ -68,18 +114,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } else { - // Like the DFS approach, it try every possible state transition; - // Unlike DFS, it uses a queue instead of a stack to store matching - // states. It's a BFS approach. - // - // Russ Cox's article(http://swtch.com/~rsc/regexp/regexp1.html) - // explained this algorithm clearly. - // - // Time complexity: o(match_length * match_results.size()) - // O(match_length * _M_nfa.size() - // * match_results.size()) - // Space complexity: o(_M_nfa.size() + match_results.size()) - // O(_M_nfa.size() * match_results.size()) _M_match_queue->push(make_pair(_M_start_state, _M_results)); bool __ret = false; while (1) @@ -132,20 +166,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } - // A _DFSExecutor perform a DFS on given NFA and input string. At the very - // beginning the executor stands in the start state, then it try every - // possible state transition in current state recursively. Some state - // transitions consume input string, say, a single-char-matcher or a - // back-reference matcher; some not, like assertion or other anchor nodes. - // When the input is exhausted and the current state is an accepting state, - // the whole executor return true. - // - // TODO: This approach is exponentially slow for certain input. - // Try to compile the NFA to a DFA. - // - // Time complexity: o(match_length), O(2^(_M_nfa.size())) - // Space complexity: \theta(match_results.size() + match_length) - // template template @@ -164,25 +184,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { case _S_opcode_alternative: // Greedy or not, this is a question ;) + // + // _M_alt branch is "match once more", while _M_next is "get me out + // of this quantifier". + + // Greedy. if (!__state._M_neg) { + // Once more. _M_dfs<__match_mode>(__state._M_alt); + // If it's DFS executor and already accepted, we're done. if (!__dfs_mode || !_M_has_sol) _M_dfs<__match_mode>(__state._M_next); } - else + else // Ungreedy mode { if (__dfs_mode) { + // vice-versa. _M_dfs<__match_mode>(__state._M_next); if (!_M_has_sol) _M_dfs<__match_mode>(__state._M_alt); } else { + // DON't attempt anything, because there's already someone + // with higher priority accepted. This state cannot be better + // by attempting its next node. if (!_M_has_sol) { _M_dfs<__match_mode>(__state._M_next); + // DON'T attempt anything if it's already accepted. An + // accepted state *must* be better than a solution that + // has one more time ungreedy quantifier loop. if (!_M_has_sol) _M_dfs<__match_mode>(__state._M_alt); } @@ -190,9 +224,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } break; case _S_opcode_subexpr_begin: - // Here's the critical part: if there's nothing changed since last - // visit, do NOT continue. This prevents the executor from get into - // infinite loop when use "()*" to match "". + // If there's nothing changed since last visit, do NOT continue. + // This prevents the executor from get into infinite loop when use + // "()*" to match "". // // Every change on _M_cur_results will be roll back after the // recursion step finished. @@ -232,8 +266,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (_M_word_boundry(__state) == !__state._M_neg) _M_dfs<__match_mode>(__state._M_next); break; - // Here __state._M_alt offers a single start node for a sub-NFA. - // We recursivly invoke our algorithm to match the sub-NFA. + // Here __state._M_alt offers a single start node for a sub-NFA. + // We recursively invoke our algorithm to match the sub-NFA. case _S_opcode_subexpr_lookahead: if (_M_lookahead(__state) == !__state._M_neg) _M_dfs<__match_mode>(__state._M_next);