re PR libstdc++/77356 (regex error for an ECMAScript syntax string)

author Tim Shen <timshen@google.com>

Sat, 27 Aug 2016 02:03:23 +0000 (02:03 +0000)

committer Tim Shen <timshen@gcc.gnu.org>

Sat, 27 Aug 2016 02:03:23 +0000 (02:03 +0000)
author Tim Shen <timshen@google.com>
Sat, 27 Aug 2016 02:03:23 +0000 (02:03 +0000)
committer Tim Shen <timshen@gcc.gnu.org>
Sat, 27 Aug 2016 02:03:23 +0000 (02:03 +0000)
diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog

index 6b21648fcbd28a259eca1ea829040a7859d76aa9..8e3c466740d7863272af1e9940a915802b4af5b5 100644 (file)
--- a/libstdc++-v3/ChangeLog
+++ b/libstdc++-v3/ChangeLog
@@ -1,3 +1,14 @@
+2016-08-27  Tim Shen  <timshen@google.com>
+
+       PR libstdc++/77356
+       * include/bits/regex_compiler.tcc (_M_insert_bracket_matcher,
+       _M_expression_term): Modify to support dash literal.
+       * include/bits/regex_scanner.h: Add dash as a token type to make
+       it distinct from the dash literal mandated by escaping.
+       * include/bits/regex_scanner.tcc (_M_scan_in_bracket): Emit dash
+       token in bracket expression parsing.
+       * testsuite/28_regex/regression.cc: Add new testcases.
+
  2016-08-26  Jonathan Wakely  <jwakely@redhat.com>
  
         PR libstdc++/51960
diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc

index ff69e165511763e685386b6f78d4bbecf59b7ac8..ef6ebdd2ca065117ed781391f12f7e626832e7b7 100644 (file)
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -426,13 +426,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        pair<bool, _CharT> __last_char; // Optional<_CharT>
        __last_char.first = false;
        if (!(_M_flags & regex_constants::ECMAScript))
-       if (_M_try_char())
-         {
-           __matcher._M_add_char(_M_value[0]);
-           __last_char.first = true;
-           __last_char.second = _M_value[0];
-         }
+       {
+         if (_M_try_char())
+           {
+             __last_char.first = true;
+             __last_char.second = _M_value[0];
+           }
+         else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
+           {
+             __last_char.first = true;
+             __last_char.second = '-';
+           }
+       }
        while (_M_expression_term(__last_char, __matcher));
+      if (__last_char.first)
+       __matcher._M_add_char(__last_char.second);
        __matcher._M_ready();
        _M_stack.push(_StateSeqT(
                       *_M_nfa,
@@ -449,19 +457,43 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        if (_M_match_token(_ScannerT::_S_token_bracket_end))
         return false;
  
+      const auto __push_char = [&](_CharT __ch)
+      {
+       if (__last_char.first)
+         __matcher._M_add_char(__last_char.second);
+       else
+         __last_char.first = true;
+       __last_char.second = __ch;
+      };
+      const auto __flush = [&]
+      {
+       if (__last_char.first)
+         {
+           __matcher._M_add_char(__last_char.second);
+           __last_char.first = false;
+         }
+      };
+
        if (_M_match_token(_ScannerT::_S_token_collsymbol))
         {
           auto __symbol = __matcher._M_add_collate_element(_M_value);
           if (__symbol.size() == 1)
-           {
-             __last_char.first = true;
-             __last_char.second = __symbol[0];
-           }
+           __push_char(__symbol[0]);
+         else
+           __flush();
         }
        else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
-       __matcher._M_add_equivalence_class(_M_value);
+       {
+         __flush();
+         __matcher._M_add_equivalence_class(_M_value);
+       }
        else if (_M_match_token(_ScannerT::_S_token_char_class_name))
-       __matcher._M_add_character_class(_M_value, false);
+       {
+         __flush();
+         __matcher._M_add_character_class(_M_value, false);
+       }
+      else if (_M_try_char())
+       __push_char(_M_value[0]);
        // POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
        // except when the '-' is the first or last character in the bracket
        // expression ([--0]). ECMAScript treats all '-' after a range as a
@@ -472,55 +504,55 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        // Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
        //
        // It turns out that no one reads BNFs ;)
-      else if (_M_try_char())
+      else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
         {
           if (!__last_char.first)
             {
-             __matcher._M_add_char(_M_value[0]);
-             if (_M_value[0] == '-'
-                 && !(_M_flags & regex_constants::ECMAScript))
+             if (!(_M_flags & regex_constants::ECMAScript))
                 {
                   if (_M_match_token(_ScannerT::_S_token_bracket_end))
-                   return false;
+                   {
+                     __push_char('-');
+                     return false;
+                   }
                   __throw_regex_error(
                     regex_constants::error_range,
                     "Unexpected dash in bracket expression. For POSIX syntax, "
                     "a dash is not treated literally only when it is at "
                     "beginning or end.");
                 }
-             __last_char.first = true;
-             __last_char.second = _M_value[0];
+             __push_char('-');
             }
           else
             {
-             if (_M_value[0] == '-')
+             if (_M_try_char())
                 {
-                 if (_M_try_char())
-                   {
-                     __matcher._M_make_range(__last_char.second , _M_value[0]);
-                     __last_char.first = false;
-                   }
-                 else
-                   {
-                     if (_M_scanner._M_get_token()
-                         != _ScannerT::_S_token_bracket_end)
-                       __throw_regex_error(
-                         regex_constants::error_range,
-                         "Unexpected end of bracket expression.");
-                     __matcher._M_add_char(_M_value[0]);
-                   }
+                 __matcher._M_make_range(__last_char.second, _M_value[0]);
+                 __last_char.first = false;
+               }
+             else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
+               {
+                 __matcher._M_make_range(__last_char.second, '-');
+                 __last_char.first = false;
                 }
               else
                 {
-                 __matcher._M_add_char(_M_value[0]);
-                 __last_char.second = _M_value[0];
+                 if (_M_scanner._M_get_token()
+                     != _ScannerT::_S_token_bracket_end)
+                   __throw_regex_error(
+                     regex_constants::error_range,
+                     "Character is expected after a dash.");
+                 __push_char('-');
                 }
             }
         }
        else if (_M_match_token(_ScannerT::_S_token_quoted_class))
-       __matcher._M_add_character_class(_M_value,
-                                        _M_ctype.is(_CtypeT::upper,
-                                                    _M_value[0]));
+       {
+         __flush();
+         __matcher._M_add_character_class(_M_value,
+                                          _M_ctype.is(_CtypeT::upper,
+                                                      _M_value[0]));
+       }
        else
         __throw_regex_error(regex_constants::error_brack,
                             "Unexpected character in bracket expression.");
diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h

index 37dea840d5be575d25a2d69acd53565f152043bf..ed0b723f3a5cc5a0d15a2133670ecf7acfc4c560 100644 (file)
--- a/libstdc++-v3/include/bits/regex_scanner.h
+++ b/libstdc++-v3/include/bits/regex_scanner.h
@@ -43,7 +43,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
    {
    public:
      /// Token types returned from the scanner.
-    enum _TokenT
+    enum _TokenT : unsigned
      {
        _S_token_anychar,
        _S_token_ord_char,
@@ -73,7 +73,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        _S_token_comma,
        _S_token_dup_count,
        _S_token_eof,
-      _S_token_unknown
+      _S_token_bracket_dash,
+      _S_token_unknown = -1u
      };
  
    protected:
diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc

index fedba09a71396f06199b3d81b6c250629d5a3626..a734bb175d72ac0c4fd67a9ceaf0480177dfb6e2 100644 (file)
--- a/libstdc++-v3/include/bits/regex_scanner.tcc
+++ b/libstdc++-v3/include/bits/regex_scanner.tcc
@@ -210,7 +210,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  
        auto __c = *_M_current++;
  
-      if (__c == '[')
+      if (__c == '-')
+       _M_token = _S_token_bracket_dash;
+      else if (__c == '[')
         {
           if (_M_current == _M_end)
             __throw_regex_error(regex_constants::error_brack,
diff --git a/libstdc++-v3/testsuite/28_regex/regression.cc b/libstdc++-v3/testsuite/28_regex/regression.cc

index 77ca043f85137305ee9264a41e30a5a8f7fbc25d..53a19afe4c3d0cdbe42ddf4223f4076b769822d3 100644 (file)
--- a/libstdc++-v3/testsuite/28_regex/regression.cc
+++ b/libstdc++-v3/testsuite/28_regex/regression.cc
@@ -61,12 +61,35 @@ test03()
    VERIFY(!regex_search_debug("a", regex(R"(\b$)"), regex_constants::match_not_eow));
  }
  
+// PR libstdc++/77356
+void
+test04()
+{
+  bool test __attribute__((unused)) = true;
+
+  static const char* kNumericAnchor ="(\\$|usd)(usd|\\$|to|and|up to|[0-9,\\.\\-\\sk])+";
+  const std::regex re(kNumericAnchor);
+  (void)re;
+}
+
+void
+test05()
+{
+  bool test __attribute__((unused)) = true;
+
+  VERIFY(regex_match_debug("!", std::regex("[![:alnum:]]")));
+  VERIFY(regex_match_debug("-", std::regex("[a-]", regex_constants::basic)));
+  VERIFY(regex_match_debug("-", std::regex("[a-]")));
+}
+
  int
  main()
  {
    test01();
    test02();
    test03();
+  test04();
+  test05();
    return 0;
  }
author	Tim Shen <timshen@google.com>
	Sat, 27 Aug 2016 02:03:23 +0000 (02:03 +0000)
committer	Tim Shen <timshen@gcc.gnu.org>
	Sat, 27 Aug 2016 02:03:23 +0000 (02:03 +0000)
libstdc++-v3/ChangeLog		patch \| blob \| history
libstdc++-v3/include/bits/regex_compiler.tcc		patch \| blob \| history
libstdc++-v3/include/bits/regex_scanner.h		patch \| blob \| history
libstdc++-v3/include/bits/regex_scanner.tcc		patch \| blob \| history
libstdc++-v3/testsuite/28_regex/regression.cc		patch \| blob \| history