From 4aebb4e4a6444400c9592484dab9336754e185e3 Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Sat, 27 Aug 2016 02:03:23 +0000 Subject: [PATCH] re PR libstdc++/77356 (regex error for a ECMAScript syntax string) PR libstdc++/77356 * include/bits/regex_compiler.tcc(_M_insert_bracket_matcher, _M_expression_term): Modify to support dash literal. * include/bits/regex_scanner.h: Add dash as a token type to make it different from the mandated dash literal by escaping. * include/bits/regex_scanner.tcc(_M_scan_in_bracket): Emit dash token in bracket expression parsing. * testsuite/28_regex/regression.cc: Add new testcases. From-SVN: r239794 --- libstdc++-v3/ChangeLog | 11 ++ libstdc++-v3/include/bits/regex_compiler.tcc | 110 +++++++++++------- libstdc++-v3/include/bits/regex_scanner.h | 5 +- libstdc++-v3/include/bits/regex_scanner.tcc | 4 +- libstdc++-v3/testsuite/28_regex/regression.cc | 23 ++++ 5 files changed, 111 insertions(+), 42 deletions(-) diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 6b21648fcbd..8e3c466740d 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,14 @@ +2016-08-27 Tim Shen + + PR libstdc++/77356 + * include/bits/regex_compiler.tcc(_M_insert_bracket_matcher, + _M_expression_term): Modify to support dash literal. + * include/bits/regex_scanner.h: Add dash as a token type to make + it different from the mandated dash literal by escaping. + * include/bits/regex_scanner.tcc(_M_scan_in_bracket): Emit dash + token in bracket expression parsing. + * testsuite/28_regex/regression.cc: Add new testcases. 
+ 2016-08-26 Jonathan Wakely PR libstdc++/51960 diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index ff69e165511..ef6ebdd2ca0 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -426,13 +426,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION pair<bool, _CharT> __last_char; // Optional<_CharT> __last_char.first = false; if (!(_M_flags & regex_constants::ECMAScript)) - if (_M_try_char()) - { - __matcher._M_add_char(_M_value[0]); - __last_char.first = true; - __last_char.second = _M_value[0]; - } + { + if (_M_try_char()) + { + __last_char.first = true; + __last_char.second = _M_value[0]; + } + else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) + { + __last_char.first = true; + __last_char.second = '-'; + } + } while (_M_expression_term(__last_char, __matcher)); + if (__last_char.first) + __matcher._M_add_char(__last_char.second); __matcher._M_ready(); _M_stack.push(_StateSeqT( *_M_nfa, @@ -449,19 +457,43 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (_M_match_token(_ScannerT::_S_token_bracket_end)) return false; + const auto __push_char = [&](_CharT __ch) + { + if (__last_char.first) + __matcher._M_add_char(__last_char.second); + else + __last_char.first = true; + __last_char.second = __ch; + }; + const auto __flush = [&] + { + if (__last_char.first) + { + __matcher._M_add_char(__last_char.second); + __last_char.first = false; + } + }; + if (_M_match_token(_ScannerT::_S_token_collsymbol)) { auto __symbol = __matcher._M_add_collate_element(_M_value); if (__symbol.size() == 1) - { - __last_char.first = true; - __last_char.second = __symbol[0]; - } + __push_char(__symbol[0]); + else + __flush(); } else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) - __matcher._M_add_equivalence_class(_M_value); + { + __flush(); + __matcher._M_add_equivalence_class(_M_value); + } else if (_M_match_token(_ScannerT::_S_token_char_class_name)) - __matcher._M_add_character_class(_M_value, 
false); + { + __flush(); + __matcher._M_add_character_class(_M_value, false); + } + else if (_M_try_char()) + __push_char(_M_value[0]); // POSIX doesn't allow '-' as a start-range char (say [a-z--0]), // except when the '-' is the first or last character in the bracket // expression ([--0]). ECMAScript treats all '-' after a range as a @@ -472,55 +504,55 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // Clang (3.5) always uses ECMAScript style even in its POSIX syntax. // // It turns out that no one reads BNFs ;) - else if (_M_try_char()) + else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) { if (!__last_char.first) { - __matcher._M_add_char(_M_value[0]); - if (_M_value[0] == '-' - && !(_M_flags & regex_constants::ECMAScript)) + if (!(_M_flags & regex_constants::ECMAScript)) { if (_M_match_token(_ScannerT::_S_token_bracket_end)) - return false; + { + __push_char('-'); + return false; + } __throw_regex_error( regex_constants::error_range, "Unexpected dash in bracket expression. For POSIX syntax, " "a dash is not treated literally only when it is at " "beginning or end."); } - __last_char.first = true; - __last_char.second = _M_value[0]; + __push_char('-'); } else { - if (_M_value[0] == '-') + if (_M_try_char()) { - if (_M_try_char()) - { - __matcher._M_make_range(__last_char.second , _M_value[0]); - __last_char.first = false; - } - else - { - if (_M_scanner._M_get_token() - != _ScannerT::_S_token_bracket_end) - __throw_regex_error( - regex_constants::error_range, - "Unexpected end of bracket expression."); - __matcher._M_add_char(_M_value[0]); - } + __matcher._M_make_range(__last_char.second, _M_value[0]); + __last_char.first = false; + } + else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) + { + __matcher._M_make_range(__last_char.second, '-'); + __last_char.first = false; } else { - __matcher._M_add_char(_M_value[0]); - __last_char.second = _M_value[0]; + if (_M_scanner._M_get_token() + != _ScannerT::_S_token_bracket_end) + __throw_regex_error( + 
regex_constants::error_range, + "Character is expected after a dash."); + __push_char('-'); } } } else if (_M_match_token(_ScannerT::_S_token_quoted_class)) - __matcher._M_add_character_class(_M_value, - _M_ctype.is(_CtypeT::upper, - _M_value[0])); + { + __flush(); + __matcher._M_add_character_class(_M_value, + _M_ctype.is(_CtypeT::upper, + _M_value[0])); + } else __throw_regex_error(regex_constants::error_brack, "Unexpected character in bracket expression."); diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h index 37dea840d5b..ed0b723f3a5 100644 --- a/libstdc++-v3/include/bits/regex_scanner.h +++ b/libstdc++-v3/include/bits/regex_scanner.h @@ -43,7 +43,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { public: /// Token types returned from the scanner. - enum _TokenT + enum _TokenT : unsigned { _S_token_anychar, _S_token_ord_char, @@ -73,7 +73,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _S_token_comma, _S_token_dup_count, _S_token_eof, - _S_token_unknown + _S_token_bracket_dash, + _S_token_unknown = -1u }; protected: diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc index fedba09a713..a734bb175d7 100644 --- a/libstdc++-v3/include/bits/regex_scanner.tcc +++ b/libstdc++-v3/include/bits/regex_scanner.tcc @@ -210,7 +210,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __c = *_M_current++; - if (__c == '[') + if (__c == '-') + _M_token = _S_token_bracket_dash; + else if (__c == '[') { if (_M_current == _M_end) __throw_regex_error(regex_constants::error_brack, diff --git a/libstdc++-v3/testsuite/28_regex/regression.cc b/libstdc++-v3/testsuite/28_regex/regression.cc index 77ca043f851..53a19afe4c3 100644 --- a/libstdc++-v3/testsuite/28_regex/regression.cc +++ b/libstdc++-v3/testsuite/28_regex/regression.cc @@ -61,12 +61,35 @@ test03() VERIFY(!regex_search_debug("a", regex(R"(\b$)"), regex_constants::match_not_eow)); } +// PR libstdc++/77356 +void +test04() +{ + bool test __attribute__((unused)) 
= true; + + static const char* kNumericAnchor ="(\\$|usd)(usd|\\$|to|and|up to|[0-9,\\.\\-\\sk])+"; + const std::regex re(kNumericAnchor); + (void)re; +} + +void +test05() +{ + bool test __attribute__((unused)) = true; + + VERIFY(regex_match_debug("!", std::regex("[![:alnum:]]"))); + VERIFY(regex_match_debug("-", std::regex("[a-]", regex_constants::basic))); + VERIFY(regex_match_debug("-", std::regex("[a-]"))); +} + int main() { test01(); test02(); test03(); + test04(); + test05(); return 0; } -- 2.30.2