regex.h: Executor caller.
authorTim Shen <timshen91@gmail.com>
Thu, 22 Aug 2013 00:58:28 +0000 (00:58 +0000)
committerTim Shen <timshen@gcc.gnu.org>
Thu, 22 Aug 2013 00:58:28 +0000 (00:58 +0000)
2013-08-22  Tim Shen  <timshen91@gmail.com>

* include/bits/regex.h: Executor caller.
* include/bits/regex_executor.h: Fix empty grouping problem.
* include/bits/regex_executor.tcc: Same.
* testsuite/28_regex/algorithms/regex_match/ecma/cstring_emptygroup.cc:
  New.

From-SVN: r201914

libstdc++-v3/ChangeLog
libstdc++-v3/include/bits/regex.h
libstdc++-v3/include/bits/regex_executor.h
libstdc++-v3/include/bits/regex_executor.tcc
libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/cstring_emptygroup.cc [new file with mode: 0644]

index 689dc99fe17ff092810a9c0519867cddd9afacd8..c614b326e35743d727ed591ea514417529df174a 100644 (file)
@@ -1,3 +1,11 @@
+2013-08-22  Tim Shen  <timshen91@gmail.com>
+
+       * include/bits/regex.h: Executor caller.
+       * include/bits/regex_executor.h: Fix empty grouping problem.
+       * include/bits/regex_executor.tcc: Same.
+       * testsuite/28_regex/algorithms/regex_match/ecma/cstring_emptygroup.cc:
+         New.
+
 2013-08-20  Phil Muldoon  <pmuldoon@redhat.com>
 
        PR libstdc++/53477
index 8fa979fa375967dd1de1e9c847910bcc38dd0c3a..f5932edaadb0058735a9c8e2600e8852a8c6f83c 100644 (file)
@@ -2211,7 +2211,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     {
       if (__re._M_automaton == nullptr)
         return false;
-      if (__detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match())
+      __detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match();
+      if (__m.size() > 0 && __m[0].matched)
         {
           for (auto __it : __m)
             if (!__it.matched)
@@ -2371,22 +2372,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       if (__re._M_automaton == nullptr)
         return false;
       for (auto __cur = __first; __cur != __last; ++__cur) // Any KMP-like algo?
-        if (__detail::__get_executor(__cur, __last, __m, __re, __flags)
-              ->_M_search_from_first())
-          {
-            for (auto __it : __m)
-              if (!__it.matched)
-                __it.first = __it.second = __last;
-            __m.at(__m.size()).first = __first;
-            __m.at(__m.size()).second = __m[0].first;
-            __m.at(__m.size()+1).first = __m[0].second;
-            __m.at(__m.size()+1).second = __last;
-            __m.at(__m.size()).matched =
-              (__m.prefix().first != __m.prefix().second);
-            __m.at(__m.size()+1).matched =
-              (__m.suffix().first != __m.suffix().second);
-            return true;
-          }
+        {
+          __detail::__get_executor(__cur, __last, __m, __re, __flags)
+            ->_M_search_from_first();
+          if (__m.size() > 0 && __m[0].matched)
+           {
+             for (auto __it : __m)
+               if (!__it.matched)
+                 __it.first = __it.second = __last;
+             __m.at(__m.size()).first = __first;
+             __m.at(__m.size()).second = __m[0].first;
+             __m.at(__m.size()+1).first = __m[0].second;
+             __m.at(__m.size()+1).second = __last;
+             __m.at(__m.size()).matched =
+               (__m.prefix().first != __m.prefix().second);
+             __m.at(__m.size()+1).matched =
+               (__m.suffix().first != __m.suffix().second);
+             return true;
+           }
+        }
       return false;
     }
 
index 0006a29b614bbb09fcf31fc3c2f182371053c3ee..e2d6e6e412d87f911fb1279802e1b94a7d129af5 100644 (file)
  *  Do not attempt to use it directly. @headername{regex}
  */
 
+// TODO: convert comments to doxygen format.
+
 namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template<typename, typename>
     class basic_regex;
 
+  template<typename>
+    class sub_match;
+
   template<typename, typename>
     class match_results;
 _GLIBCXX_END_NAMESPACE_VERSION
@@ -52,19 +57,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     class _Executor
     {
     public:
-      typedef match_results<_BiIter, _Alloc>   _ResultsT;
-      typedef regex_constants::match_flag_type _FlagT;
+      typedef match_results<_BiIter, _Alloc>          _ResultsT;
+      typedef std::vector<sub_match<_BiIter>, _Alloc> _ResultsVec;
+      typedef regex_constants::match_flag_type        _FlagT;
 
       virtual
       ~_Executor()
       { }
 
       // Set matched when string exactly match the pattern.
-      virtual bool
+      virtual void
       _M_match() = 0;
 
       // Set matched when some prefix of the string matches the pattern.
-      virtual bool
+      virtual void
       _M_search_from_first() = 0;
 
     protected:
@@ -74,20 +80,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                 _ResultsT& __results,
                 _FlagT     __flags,
                 _SizeT     __size)
-      : _M_current(__begin), _M_end(__end),
-        _M_results(__results), _M_flags(__flags)
+      : _M_current(__begin), _M_end(__end), _M_results(__results),
+        _M_flags(__flags)
       {
-        __results.resize(__size + 2);
-        for (auto __it : __results)
-          __it.matched = false;
+        __size += 2;
+        _M_results.resize(__size);
+        for (auto __i = 0; __i < __size; __i++)
+          _M_results[__i].matched = false;
       }
 
-      _BiIter    _M_current;
-      _BiIter    _M_end;
-      _ResultsT& _M_results;
-      _FlagT     _M_flags;
+      _BiIter       _M_current;
+      _BiIter       _M_end;
+      _ResultsVec&  _M_results;
+      _FlagT        _M_flags;
     };
 
+  // A _DFSExecutor perform a DFS on given NFA and input string. At the very
+  // beginning the executor stands in the start state, then it try every
+  // possible state transition in current state recursively. Some state
+  // transitions consume input string, say, a single-char-matcher or a
+  // back-reference matcher; some not, like assertion or other anchor nodes.
+  // When the input is exhausted and the current state is an accepting state,
+  // the whole executor return true.
+  //
+  // TODO: This approach is exponentially slow for certain input.
+  //       Try to compile the NFA to a DFA.
+  //
+  // Time complexity: exponential
+  // Space complexity: O(__end - __begin)
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
     class _DFSExecutor
@@ -97,6 +117,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
       typedef _NFA<_CharT, _TraitsT>                       _RegexT;
       typedef typename _BaseT::_ResultsT                   _ResultsT;
+      typedef typename _BaseT::_ResultsVec                 _ResultsVec;
       typedef regex_constants::match_flag_type             _FlagT;
 
       _DFSExecutor(_BiIter        __begin,
@@ -105,37 +126,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                    const _RegexT& __nfa,
                    _FlagT         __flags)
       : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
-        _M_traits(_TraitsT()), _M_nfa(__nfa)
+        _M_traits(_TraitsT()), _M_nfa(__nfa), _M_results_ret(this->_M_results)
       { }
 
-      bool
+      void
       _M_match()
-      { return _M_dfs<true>(_M_nfa._M_start()); }
+      { _M_dfs<true>(_M_nfa._M_start()); }
 
-      bool
+      void
       _M_search_from_first()
-      { return _M_dfs<false>(_M_nfa._M_start()); }
+      { _M_dfs<false>(_M_nfa._M_start()); }
 
     private:
       template<bool __match_mode>
         bool
         _M_dfs(_StateIdT __i);
 
+      _ResultsVec    _M_results_ret;
       _TraitsT       _M_traits;
       const _RegexT& _M_nfa;
     };
 
-  // It's essentially a variant of Single-Source-Shortest-Path problem, where,
-  // the matching results is the final distance and should be minimized.
-  // Instead of using Dijkstra Algorithm, I pick up the queue-optimizaed
-  // (BFS-like) Bellman-Ford algorithm,
-  // SPFA(http://en.wikipedia.org/wiki/Shortest_Path_Faster_Algorithm).
+  // Like the DFS approach, it try every possible state transition; Unlike DFS,
+  // it uses a queue instead of a stack to store matching states. It's a BFS
+  // approach.
+  //
+  // Russ Cox's article(http://swtch.com/~rsc/regexp/regexp1.html) explained
+  // this algorithm clearly.
   //
   // Every entry of _M_covered saves the solution(grouping status) for every
-  // matching head. When states transfer, solutions will be compared and
+  // matching head. When states transit, solutions will be compared and
   // deduplicated(based on which greedy mode we have).
   //
-  // Time complexity: O(_M_str_cur.size() * _M_nfa.size())
+  // Time complexity: O((__end - __begin) * _M_nfa.size())
   // Space complexity: O(_M_nfa.size() * _M_nfa.mark_count())
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
@@ -146,12 +169,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
       typedef _NFA<_CharT, _TraitsT>                       _RegexT;
       typedef typename _BaseT::_ResultsT                   _ResultsT;
-      typedef std::unique_ptr<_ResultsT>                   _ResultsPtr;
+      typedef typename _BaseT::_ResultsVec                 _ResultsVec;
+      typedef std::unique_ptr<_ResultsVec>                 _ResultsPtr;
       typedef regex_constants::match_flag_type             _FlagT;
 
       _BFSExecutor(_BiIter        __begin,
                    _BiIter        __end,
-                   _ResultsT&      __results,
+                   _ResultsT&     __results,
                    const _RegexT& __nfa,
                    _FlagT         __flags)
       : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
@@ -159,21 +183,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       {
         if (_M_nfa._M_start() != _S_invalid_state_id)
           _M_covered[_M_nfa._M_start()] =
-            _ResultsPtr(new _ResultsT(this->_M_results));
+            _ResultsPtr(new _ResultsVec(this->_M_results));
         _M_e_closure();
       }
 
-      bool
+      void
       _M_match()
-      { return _M_main_loop<true>(); }
+      { _M_main_loop<true>(); }
 
-      bool
+      void
       _M_search_from_first()
-      { return _M_main_loop<false>(); }
+      { _M_main_loop<false>(); }
 
     private:
       template<bool __match_mode>
-        bool
+        void
         _M_main_loop();
 
       void
@@ -183,13 +207,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       _M_move();
 
       bool
-      _M_match_less_than(_StateIdT __u, _StateIdT __v) const;
+      _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const;
 
       bool
       _M_includes_some() const;
 
       std::map<_StateIdT, _ResultsPtr>     _M_covered;
-      const _RegexT& _M_nfa;
+      const _RegexT&                       _M_nfa;
     };
 
  //@} regex-detail
index 08b4915a3e3ba8c7749489d83f9b45917b6c99d7..9115ea0ebf2d98f89ece67613786007d4abd97d9 100644 (file)
@@ -34,19 +34,18 @@ namespace __detail
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
-  // TODO: This is too slow. Try to compile the NFA to a DFA.
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
   template<bool __match_mode>
     bool _DFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
     _M_dfs(_StateIdT __i)
     {
-      auto& __current = this->_M_current;
-      auto& __end = this->_M_end;
-      auto& __results = this->_M_results;
       if (__i == _S_invalid_state_id)
         // This is not that certain. Need deeper investigate.
         return false;
+      auto& __current = this->_M_current;
+      auto& __end = this->_M_end;
+      auto& __results = _M_results_ret;
       const auto& __state = _M_nfa[__i];
       bool __ret = false;
       switch (__state._M_opcode)
@@ -59,14 +58,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
             || _M_dfs<__match_mode>(__state._M_next);
           break;
         case _S_opcode_subexpr_begin:
-          __results.at(__state._M_subexpr).first = __current;
-          __ret = _M_dfs<__match_mode>(__state._M_next);
+          // Here's the critical part: if there's nothing changed since last
+          // visit, do NOT continue. This prevents the executor from get into
+          // infinite loop when use "()*" to match "".
+          //
+          // Every change on __results will be roll back after the recursion
+          // step finished.
+          if (!__results[__state._M_subexpr].matched
+              || __results[__state._M_subexpr].first != __current)
+            {
+              auto __back = __current;
+              __results[__state._M_subexpr].first = __current;
+              __ret = _M_dfs<__match_mode>(__state._M_next);
+              __results[__state._M_subexpr].first = __back;
+            }
           break;
         case _S_opcode_subexpr_end:
-          __results.at(__state._M_subexpr).second = __current;
-          __results.at(__state._M_subexpr).matched = true;
-          __ret = _M_dfs<__match_mode>(__state._M_next);
-          __results.at(__state._M_subexpr).matched = __ret;
+          if (__results[__state._M_subexpr].second != __current
+              || __results[__state._M_subexpr].matched != true)
+            {
+              auto __back = __results[__state._M_subexpr];
+              __results[__state._M_subexpr].second = __current;
+              __results[__state._M_subexpr].matched = true;
+              __ret = _M_dfs<__match_mode>(__state._M_next);
+              __results[__state._M_subexpr] = __back;
+            }
+          else
+            __ret = _M_dfs<__match_mode>(__state._M_next);
           break;
         case _S_opcode_match:
           if (__current != __end && __state._M_matches(*__current))
@@ -82,7 +100,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
         // If matched, keep going; else just return to try another state.
         case _S_opcode_backref:
           {
-            auto& __submatch = __results.at(__state._M_backref_index);
+            auto& __submatch = __results[__state._M_backref_index];
             if (!__submatch.matched)
               break;
             auto __last = __current;
@@ -92,12 +110,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
               ++__last;
             if (_M_traits.transform(__submatch.first, __submatch.second)
                 == _M_traits.transform(__current, __last))
-              {
-                auto __backup = __current;
-                __current = __last;
+              if (__last != __current)
+                {
+                  auto __backup = __current;
+                  __current = __last;
+                  __ret = _M_dfs<__match_mode>(__state._M_next);
+                  __current = __backup;
+                }
+              else
                 __ret = _M_dfs<__match_mode>(__state._M_next);
-                __current = __backup;
-              }
           }
           break;
         case _S_opcode_accept:
@@ -105,6 +126,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
             __ret = __current == __end;
           else
             __ret = true;
+          if (__ret)
+            this->_M_results = __results;
           break;
         default:
           _GLIBCXX_DEBUG_ASSERT(false);
@@ -115,22 +138,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
   template<bool __match_mode>
-    bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
+    void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
     _M_main_loop()
     {
       while (this->_M_current != this->_M_end)
         {
           if (!__match_mode)
             if (_M_includes_some())
-              return true;
+              return;
           _M_move();
           ++this->_M_current;
           _M_e_closure();
         }
-      return _M_includes_some();
+      _M_includes_some();
     }
 
-  // The SPFA approach.
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
     void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
@@ -152,13 +174,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
           const auto& __state = _M_nfa[__u];
 
           // Can be implemented using method, but there're too much arguments.
+          // I would use macro function before C++11, but lambda is a better
+          // choice, since hopefully compiler can inline it.
           auto __add_visited_state = [&](_StateIdT __v)
           {
             if (__v == _S_invalid_state_id)
               return;
-            if (_M_match_less_than(__u, __v))
+            if (_M_covered.count(__u) != 0
+                && (_M_covered.count(__v) == 0
+                    || _M_match_less_than(*_M_covered[__u], *_M_covered[__v])))
               {
-                _M_covered[__v] = _ResultsPtr(new _ResultsT(*_M_covered[__u]));
+                _M_covered[__v] = _ResultsPtr(new _ResultsVec(*_M_covered[__u]));
                 // if a state is updated, it's outgoing neighbors should be
                 // reconsidered too. Push them to the queue.
                 if (!__in_q[__v])
@@ -176,13 +202,23 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                 __add_visited_state(__state._M_alt);
                 break;
               case _S_opcode_subexpr_begin:
-                _M_covered[__u]->at(__state._M_subexpr).first = __current;
-                __add_visited_state(__state._M_next);
+                {
+                  auto& __cu = *_M_covered[__u];
+                  auto __back = __cu[__state._M_subexpr].first;
+                  __cu[__state._M_subexpr].first = __current;
+                  __add_visited_state(__state._M_next);
+                  __cu[__state._M_subexpr].first = __back;
+                }
                 break;
               case _S_opcode_subexpr_end:
-                _M_covered[__u]->at(__state._M_subexpr).second = __current;
-                _M_covered[__u]->at(__state._M_subexpr).matched = true;
-                __add_visited_state(__state._M_next);
+                {
+                  auto& __cu = *_M_covered[__u];
+                  auto __back = __cu[__state._M_subexpr];
+                  __cu[__state._M_subexpr].second = __current;
+                  __cu[__state._M_subexpr].matched = true;
+                  __add_visited_state(__state._M_next);
+                  __cu[__state._M_subexpr] = __back;
+                }
                 break;
               case _S_opcode_match:
                 break;
@@ -206,9 +242,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
           const auto& __state = _M_nfa[__it.first];
           if (__state._M_opcode == _S_opcode_match
               && __state._M_matches(*this->_M_current))
-            if (_M_match_less_than(__it.first, __state._M_next)
-                && __state._M_next != _S_invalid_state_id)
-              __next[__state._M_next] = move(__it.second);
+            if (__state._M_next != _S_invalid_state_id)
+              if (__next.count(__state._M_next) == 0
+                  || _M_match_less_than(*__it.second, *__next[__state._M_next]))
+                __next[__state._M_next] = move(__it.second);
         }
       _M_covered = move(__next);
     }
@@ -216,14 +253,28 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
     bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
-    _M_match_less_than(_StateIdT __u, _StateIdT __v) const
+    _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const
     {
-      if (_M_covered.count(__u) == 0)
-        return false;
-      if (_M_covered.count(__v) > 0)
-        return true;
       // TODO: Greedy and Non-greedy support
-      return true;
+      _GLIBCXX_DEBUG_ASSERT(__u.size() == __v.size());
+      auto __size = __u.size();
+      for (auto __i = 0; __i < __size; __i++)
+        {
+          auto& __uit = __u[__i], __vit = __v[__i];
+          if (__uit.matched && !__vit.matched)
+            return true;
+          if (!__uit.matched && __vit.matched)
+            return false;
+          if (__uit.matched && __vit.matched)
+            {
+              // GREEDY
+              if (__uit.first != __vit.first)
+                return __uit.first < __vit.first;
+              if (__uit.second != __vit.second)
+                return __uit.second > __vit.second;
+            }
+        }
+      return false;
     }
 
   template<typename _BiIter, typename _Alloc,
@@ -265,11 +316,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       typedef std::unique_ptr<_Executor<_BiIter, _Alloc, _CharT, _TraitsT>>
         _ExecutorPtr;
       typedef _DFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT> _DFSExecutorT;
+      typedef _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT> _BFSExecutorT;
       auto __p = std::static_pointer_cast<_NFA<_CharT, _TraitsT>>
         (__re._M_automaton);
       if (__p->_M_has_backref)
         return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags));
-      return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags));
+      return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, __flags));
     }
 
 _GLIBCXX_END_NAMESPACE_VERSION
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/cstring_emptygroup.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/cstring_emptygroup.cc
new file mode 100644 (file)
index 0000000..e112db5
--- /dev/null
@@ -0,0 +1,58 @@
+// { dg-options "-std=gnu++11" }
+
+//
+// 2013-08-22  Tim Shen <timshen91@gmail.com>
+//
+// Copyright (C) 2013 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// 28.11.2 regex_match
+// Tests ECMAScript empty-grouping against a C-string.
+
+#include <regex>
+#include <testsuite_hooks.h>
+
+using namespace std;
+
+void
+test01()
+{
+  bool test __attribute__((unused)) = true;
+
+  {
+    regex re("()*\\1");
+    cmatch m;
+    const char s[] = "";
+    VERIFY( regex_match(s, m, re) );
+    VERIFY( m.size() == 2 );
+    VERIFY( m[0].matched );
+    VERIFY( m[1].matched );
+  }
+  {
+    regex re("()*");
+    cmatch m;
+    const char s[] = "";
+    VERIFY( regex_match(s, m, re) );
+  }
+}
+
+int
+main()
+{
+  test01();
+  return 0;
+}