From patchwork Wed Aug 24 07:18:01 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tim Shen X-Patchwork-Id: 662153 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3sJzDd0rzKz9sCp for ; Wed, 24 Aug 2016 17:18:22 +1000 (AEST) Authentication-Results: ozlabs.org; dkim=pass (1024-bit key; unprotected) header.d=gcc.gnu.org header.i=@gcc.gnu.org header.b=TaMCf7c9; dkim-atps=neutral DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :mime-version:from:date:message-id:subject:to:content-type; q= dns; s=default; b=DczqTXKgLVzAMTgYO7lynUTTCXlWdmgjzMzL4Bi9kpI7uA NmF6Paz9x0Z1omQDyvfzR17banvwHZHJYVMwydyIBq02o2nxyYaoqKyqXq38TjOU AsOjGuO41/onV7cQ726R+X88CcXCyZ1tSyDFEjhpNOv04dGqO9R1TAG5xx3gI= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :mime-version:from:date:message-id:subject:to:content-type; s= default; bh=DS6ZGJtcjovPyJ7IFhxpyVHZPJQ=; b=TaMCf7c9rak+23JjWXdU QdRs9jKDKKriNyiXMBUu53nnpAW06dFOuDw7tF7DsEsog0zBrwHFmOI8Pqtg6JjF Udlrs3C/Z5nbNgexiQuozhlAI9ktV+3L1ribi8h9ifEmSXaRKIofEsNhKczTOwIh qq7q2gcb4FrbTpB4PZbPe/0= Received: (qmail 98343 invoked by alias); 24 Aug 2016 07:18:14 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 98319 invoked by uid 89); 24 Aug 2016 07:18:13 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=1.4 
required=5.0 tests=AWL, BAYES_50, LIKELY_SPAM_BODY, RCVD_IN_DNSWL_LOW, RP_MATCHES_RCVD, SPF_PASS autolearn=no version=3.3.2 spammy=dash, documentations, Optional, Unexpected X-HELO: mail-qk0-f181.google.com Received: from mail-qk0-f181.google.com (HELO mail-qk0-f181.google.com) (209.85.220.181) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Wed, 24 Aug 2016 07:18:03 +0000 Received: by mail-qk0-f181.google.com with SMTP id z190so7124073qkc.0 for ; Wed, 24 Aug 2016 00:18:03 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20130820; h=x-gm-message-state:mime-version:from:date:message-id:subject:to; bh=jRKgqvGF9yyph1RvyKFy10twJig6RPPT4jegnpaG1a4=; b=aP3/LI0lW2JoClFDCDK8EQKKZHZnzYsMuf1JMQraP1IYEm/huCdMCAipbgnxoOCpqE jbDlSlxFlou5Pvev61UAPVz+X6gJkNL5on5lJuHMt2xUfO46vRMY8W/KBckKFjfQK/8J 4Rc6PY37g/Fllg7cl/mrXg/nijLPQK1j1EUrFbU/5z1r8smXVGyUEdnWXWqfV+KGIpgQ RTAGQBRtmPEief6JSDxXMX4TTBm41AjHL6Gghs1wo6d31GNcD7LaVBpaEG0Xxyae0R5e tGn0AnPE1W/+I80Ot87xFCn2F+Ew3gggdFBKheC4ggMbrfbwR76eWCuRn6cxx4ROc2lJ +0Mg== X-Gm-Message-State: AE9vXwMnlhApnuEfVTXKCTgEpZaJypTBGM+LeWqFE4VS4pW26v3/s4UVV2Qcbs5O6BJi+BrvlKu0QNHX0xYwQO+I X-Received: by 10.55.19.157 with SMTP id 29mr1809240qkt.205.1472023081834; Wed, 24 Aug 2016 00:18:01 -0700 (PDT) MIME-Version: 1.0 Received: by 10.55.132.2 with HTTP; Wed, 24 Aug 2016 00:18:01 -0700 (PDT) From: Tim Shen Date: Wed, 24 Aug 2016 00:18:01 -0700 Message-ID: Subject: [Patch, libstdc++/77356] Support escape in regex bracket expression To: "libstdc++" , gcc-patches I didn't realize that we can actually escape a dash inside a bracket expression: R"([\-])", in which case the dash should be treated literally. Tell me if you feel like we need more documentation. :P Bootstrapped and tested on x86_64-linux-gnu. Thanks! 
diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index ff69e16..3ffa170 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -426,13 +426,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION pair __last_char; // Optional<_CharT> __last_char.first = false; if (!(_M_flags & regex_constants::ECMAScript)) - if (_M_try_char()) - { - __matcher._M_add_char(_M_value[0]); - __last_char.first = true; - __last_char.second = _M_value[0]; - } + { + if (_M_try_char()) + { + __last_char.first = true; + __last_char.second = _M_value[0]; + } + else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) + { + __last_char.first = true; + __last_char.second = '-'; + } + } while (_M_expression_term(__last_char, __matcher)); + if (__last_char.first) + __matcher._M_add_char(__last_char.second); __matcher._M_ready(); _M_stack.push(_StateSeqT( *_M_nfa, @@ -449,19 +457,35 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (_M_match_token(_ScannerT::_S_token_bracket_end)) return false; + const auto __push_char = [&](_CharT __ch) + { + if (__last_char.first) + __matcher._M_add_char(__last_char.second); + else + __last_char.first = true; + __last_char.second = __ch; + }; + if (_M_match_token(_ScannerT::_S_token_collsymbol)) { auto __symbol = __matcher._M_add_collate_element(_M_value); if (__symbol.size() == 1) - { - __last_char.first = true; - __last_char.second = __symbol[0]; - } + __push_char(__symbol[0]); + else + __last_char.first = false; } else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) - __matcher._M_add_equivalence_class(_M_value); + { + __last_char.first = false; + __matcher._M_add_equivalence_class(_M_value); + } else if (_M_match_token(_ScannerT::_S_token_char_class_name)) - __matcher._M_add_character_class(_M_value, false); + { + __last_char.first = false; + __matcher._M_add_character_class(_M_value, false); + } + else if (_M_try_char()) + __push_char(_M_value[0]); // POSIX 
doesn't allow '-' as a start-range char (say [a-z--0]), // except when the '-' is the first or last character in the bracket // expression ([--0]). ECMAScript treats all '-' after a range as a @@ -472,55 +496,55 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // Clang (3.5) always uses ECMAScript style even in its POSIX syntax. // // It turns out that no one reads BNFs ;) - else if (_M_try_char()) + else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) { if (!__last_char.first) { - __matcher._M_add_char(_M_value[0]); - if (_M_value[0] == '-' - && !(_M_flags & regex_constants::ECMAScript)) + if (!(_M_flags & regex_constants::ECMAScript)) { if (_M_match_token(_ScannerT::_S_token_bracket_end)) - return false; + { + __push_char('-'); + return false; + } __throw_regex_error( regex_constants::error_range, "Unexpected dash in bracket expression. For POSIX syntax, " "a dash is not treated literally only when it is at " "beginning or end."); } - __last_char.first = true; - __last_char.second = _M_value[0]; + __push_char('-'); } else { - if (_M_value[0] == '-') + if (_M_try_char()) { - if (_M_try_char()) - { - __matcher._M_make_range(__last_char.second , _M_value[0]); - __last_char.first = false; - } - else - { - if (_M_scanner._M_get_token() - != _ScannerT::_S_token_bracket_end) - __throw_regex_error( - regex_constants::error_range, - "Unexpected end of bracket expression."); - __matcher._M_add_char(_M_value[0]); - } + __matcher._M_make_range(__last_char.second, _M_value[0]); + __last_char.first = false; + } + else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) + { + __matcher._M_make_range(__last_char.second, '-'); + __last_char.first = false; } else { - __matcher._M_add_char(_M_value[0]); - __last_char.second = _M_value[0]; + if (_M_scanner._M_get_token() + != _ScannerT::_S_token_bracket_end) + __throw_regex_error( + regex_constants::error_range, + "Character is expected after a dash."); + __push_char(_M_value[0]); } } } else if 
(_M_match_token(_ScannerT::_S_token_quoted_class)) - __matcher._M_add_character_class(_M_value, - _M_ctype.is(_CtypeT::upper, - _M_value[0])); + { + __last_char.first = false; + __matcher._M_add_character_class(_M_value, + _M_ctype.is(_CtypeT::upper, + _M_value[0])); + } else __throw_regex_error(regex_constants::error_brack, "Unexpected character in bracket expression."); diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h index 37dea84..2a83d1c 100644 --- a/libstdc++-v3/include/bits/regex_scanner.h +++ b/libstdc++-v3/include/bits/regex_scanner.h @@ -73,6 +73,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _S_token_comma, _S_token_dup_count, _S_token_eof, + _S_token_bracket_dash, _S_token_unknown }; diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc index fedba09..a734bb1 100644 --- a/libstdc++-v3/include/bits/regex_scanner.tcc +++ b/libstdc++-v3/include/bits/regex_scanner.tcc @@ -210,7 +210,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __c = *_M_current++; - if (__c == '[') + if (__c == '-') + _M_token = _S_token_bracket_dash; + else if (__c == '[') { if (_M_current == _M_end) __throw_regex_error(regex_constants::error_brack, diff --git a/libstdc++-v3/testsuite/28_regex/regression.cc b/libstdc++-v3/testsuite/28_regex/regression.cc index d367c8b..0896a74 100644 --- a/libstdc++-v3/testsuite/28_regex/regression.cc +++ b/libstdc++-v3/testsuite/28_regex/regression.cc @@ -61,12 +61,23 @@ test03() VERIFY(!regex_search_debug("a", regex(R"(\b$)"), regex_constants::match_not_eow)); } +// PR libstdc++/77356 +void +test04() +{ + bool test __attribute__((unused)) = true; + static const char* kNumericAnchor ="(\\$|usd)(usd|\\$|to|and|up to|[0-9,\\.\\-\\sk])+"; + const std::regex re(kNumericAnchor); + (void)re; +} + int main() { test01(); test02(); test03(); + test04(); return 0; }