regex.h (regex_token_iterator<>::regex_token_iterator): Fix initialization orders...
[gcc.git] / libstdc++-v3 / include / bits / regex_executor.h
1 // class template regex -*- C++ -*-
2
3 // Copyright (C) 2013 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24
25 /**
26 * @file bits/regex_executor.h
27 * This is an internal header file, included by other library headers.
28 * Do not attempt to use it directly. @headername{regex}
29 */
30
31 // FIXME convert comments to doxygen format.
32
33 // TODO Put _DFSExecutor and _BFSExecutor into one class. They are becoming
34 // much more similar. Also, make grouping seperated. The
35 // regex_constants::nosubs enables much more simpler execution.
36
37 namespace std _GLIBCXX_VISIBILITY(default)
38 {
39 _GLIBCXX_BEGIN_NAMESPACE_VERSION
40 template<typename, typename>
41 class basic_regex;
42
43 template<typename>
44 class sub_match;
45
46 template<typename, typename>
47 class match_results;
48 _GLIBCXX_END_NAMESPACE_VERSION
49
50 namespace __detail
51 {
52 _GLIBCXX_BEGIN_NAMESPACE_VERSION
53
54 /**
55 * @addtogroup regex-detail
56 * @{
57 */
58
59 template<typename _BiIter, typename _Alloc,
60 typename _CharT, typename _TraitsT>
61 class _Executor
62 {
63 public:
64 typedef basic_regex<_CharT, _TraitsT> _RegexT;
65 typedef std::vector<sub_match<_BiIter>, _Alloc> _ResultsVec;
66 typedef regex_constants::match_flag_type _FlagT;
67 typedef typename _TraitsT::char_class_type _ClassT;
68
69 public:
70 _Executor(_BiIter __begin,
71 _BiIter __end,
72 _ResultsVec& __results,
73 const _RegexT& __re,
74 _FlagT __flags)
75 : _M_begin(__begin),
76 _M_end(__end),
77 _M_re(__re),
78 _M_results(__results),
79 _M_flags((__flags & regex_constants::match_prev_avail)
80 ? (__flags
81 & ~regex_constants::match_not_bol
82 & ~regex_constants::match_not_bow)
83 : __flags)
84 { }
85
86 virtual
87 ~_Executor()
88 { }
89
90 // Set matched when string exactly match the pattern.
91 bool
92 _M_match()
93 {
94 _M_match_mode = true;
95 _M_init(_M_begin);
96 return _M_main();
97 }
98
99 // Set matched when some prefix of the string matches the pattern.
100 bool
101 _M_search_from_first()
102 {
103 _M_match_mode = false;
104 _M_init(_M_begin);
105 return _M_main();
106 }
107
108 bool
109 _M_search();
110
111 bool
112 _M_is_word(_CharT __ch) const
113 {
114 static const _CharT __s = 'w';
115 return _M_re._M_traits.isctype
116 (__ch, _M_re._M_traits.lookup_classname(&__s, &__s+1));
117 }
118
119 bool
120 _M_at_begin() const
121 {
122 return _M_current == _M_begin
123 && !(_M_flags & (regex_constants::match_not_bol
124 | regex_constants::match_prev_avail));
125 }
126
127 bool
128 _M_at_end() const
129 {
130 return _M_current == _M_end
131 && !(_M_flags & regex_constants::match_not_eol);
132 }
133
134 bool
135 _M_word_boundry(_State<_CharT, _TraitsT> __state) const;
136
137 virtual std::unique_ptr<_Executor>
138 _M_clone() const = 0;
139
140 // Return whether now match the given sub-NFA.
141 bool
142 _M_lookahead(_State<_CharT, _TraitsT> __state) const
143 {
144 auto __sub = this->_M_clone();
145 __sub->_M_set_start(__state._M_alt);
146 return __sub->_M_search_from_first();
147 }
148
149 void
150 _M_set_results(_ResultsVec& __cur_results);
151
152 public:
153 virtual void
154 _M_init(_BiIter __cur) = 0;
155
156 virtual void
157 _M_set_start(_StateIdT __start) = 0;
158
159 virtual bool
160 _M_main() = 0;
161
162 _BiIter _M_current;
163 const _BiIter _M_begin;
164 const _BiIter _M_end;
165 const _RegexT& _M_re;
166 _ResultsVec& _M_results;
167 _FlagT _M_flags;
168 bool _M_match_mode;
169 };
170
171 // A _DFSExecutor perform a DFS on given NFA and input string. At the very
172 // beginning the executor stands in the start state, then it try every
173 // possible state transition in current state recursively. Some state
174 // transitions consume input string, say, a single-char-matcher or a
175 // back-reference matcher; some not, like assertion or other anchor nodes.
176 // When the input is exhausted and the current state is an accepting state,
177 // the whole executor return true.
178 //
179 // TODO: This approach is exponentially slow for certain input.
180 // Try to compile the NFA to a DFA.
181 //
182 // Time complexity: exponential
183 // Space complexity: O(__end - __begin)
184 template<typename _BiIter, typename _Alloc,
185 typename _CharT, typename _TraitsT>
186 class _DFSExecutor
187 : public _Executor<_BiIter, _Alloc, _CharT, _TraitsT>
188 {
189 public:
190 typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
191 typedef _NFA<_CharT, _TraitsT> _NFAT;
192 typedef typename _BaseT::_RegexT _RegexT;
193 typedef typename _BaseT::_ResultsVec _ResultsVec;
194 typedef typename _BaseT::_FlagT _FlagT;
195
196 public:
197 _DFSExecutor(_BiIter __begin,
198 _BiIter __end,
199 _ResultsVec& __results,
200 const _RegexT& __re,
201 _FlagT __flags)
202 : _BaseT(__begin, __end, __results, __re, __flags),
203 _M_nfa(*std::static_pointer_cast<_NFA<_CharT, _TraitsT>>
204 (__re._M_automaton)),
205 _M_start_state(_M_nfa._M_start())
206 { }
207
208 private:
209 void
210 _M_init(_BiIter __cur)
211 {
212 _M_cur_results.resize(_M_nfa._M_sub_count() + 2);
213 this->_M_current = __cur;
214 }
215
216 void
217 _M_set_start(_StateIdT __start)
218 { _M_start_state = __start; }
219
220 bool
221 _M_main()
222 { return _M_dfs(this->_M_start_state); }
223
224 bool
225 _M_dfs(_StateIdT __start);
226
227 std::unique_ptr<_BaseT>
228 _M_clone() const
229 {
230 return std::unique_ptr<_BaseT>(new _DFSExecutor(this->_M_current,
231 this->_M_end,
232 this->_M_results,
233 this->_M_re,
234 this->_M_flags));
235 }
236
237 // To record current solution.
238 _ResultsVec _M_cur_results;
239 const _NFAT& _M_nfa;
240 _StateIdT _M_start_state;
241 };
242
243 // Like the DFS approach, it try every possible state transition; Unlike DFS,
244 // it uses a queue instead of a stack to store matching states. It's a BFS
245 // approach.
246 //
247 // Russ Cox's article(http://swtch.com/~rsc/regexp/regexp1.html) explained
248 // this algorithm clearly.
249 //
250 // Every entry of _M_covered saves the solution(grouping status) for every
251 // matching head. When states transit, solutions will be compared and
252 // deduplicated(based on which greedy mode we have).
253 //
254 // Time complexity: O((__end - __begin) * _M_nfa.size())
255 // Space complexity: O(_M_nfa.size() * _M_nfa.mark_count())
256 template<typename _BiIter, typename _Alloc,
257 typename _CharT, typename _TraitsT>
258 class _BFSExecutor
259 : public _Executor<_BiIter, _Alloc, _CharT, _TraitsT>
260 {
261 public:
262 typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
263 typedef _NFA<_CharT, _TraitsT> _NFAT;
264 typedef typename _BaseT::_RegexT _RegexT;
265 typedef typename _BaseT::_ResultsVec _ResultsVec;
266 typedef typename _BaseT::_FlagT _FlagT;
267 // Here's a solution for greedy/ungreedy mode in BFS approach. We need to
268 // carefully work out how to compare to conflict matching states.
269 //
270 // A matching state is a pair(where, when); `where` is a NFA node; `when`
271 // is a _BiIter, indicating which char is the next to be matched. Two
272 // matching states conflict if they have equivalent `where` and `when`.
273 //
274 // Now we need to drop one and keep another, because at most one of them
275 // could be the final optimal solution. This behavior is affected by
276 // greedy policy.
277 //
278 // The definition of `greedy`:
279 // For the sequence of quantifiers in NFA sorted by their start positions,
280 // now maintain a vector in every matching state, with length equal to
281 // quantifier seq, recording repeating times of every quantifier. Now to
282 // compare two matching states, we just lexically compare these two
283 // vectors. To win the compare(to survive), one matching state needs to
284 // make its greedy quantifier count larger, and ungreedy quantifiers
285 // count smaller.
286 //
287 // In the implementation, we recorded negtive counts for greedy
288 // quantifiers and positive counts of ungreedy ones. Now the implicit
289 // operator<() for lexicographical_compare will emit the answer.
290 //
291 // When two vectors equal, it means the `where`, `when` and quantifier
292 // counts are identical, and indicates the same solution; so
293 // _ResultsEntry::operator<() just return false.
294 struct _ResultsEntry
295 : private _ResultsVec
296 {
297 public:
298 _ResultsEntry(size_t __res_sz, size_t __sz)
299 : _ResultsVec(__res_sz), _M_quant_keys(__sz)
300 { }
301
302 void
303 resize(size_t __n)
304 { _ResultsVec::resize(__n); }
305
306 size_t
307 size()
308 { return _ResultsVec::size(); }
309
310 sub_match<_BiIter>&
311 operator[](size_t __idx)
312 { return _ResultsVec::operator[](__idx); }
313
314 bool
315 operator<(const _ResultsEntry& __rhs) const
316 {
317 _GLIBCXX_DEBUG_ASSERT(_M_quant_keys.size()
318 == __rhs._M_quant_keys.size());
319 return lexicographical_compare(_M_quant_keys.begin(),
320 _M_quant_keys.end(),
321 __rhs._M_quant_keys.begin(),
322 __rhs._M_quant_keys.end());
323 }
324
325 void
326 _M_inc(size_t __idx, bool __neg)
327 { _M_quant_keys[__idx] += __neg ? 1 : -1; }
328
329 _ResultsVec&
330 _M_get()
331 { return *this; }
332
333 public:
334 std::vector<int> _M_quant_keys;
335 };
336 typedef std::unique_ptr<_ResultsEntry> _ResultsPtr;
337
338 class _TodoList
339 {
340 public:
341 explicit
342 _TodoList(size_t __sz)
343 : _M_states(), _M_exists(__sz, false)
344 { }
345
346 void _M_push(_StateIdT __u)
347 {
348 _GLIBCXX_DEBUG_ASSERT(__u < _M_exists.size());
349 if (!_M_exists[__u])
350 {
351 _M_exists[__u] = true;
352 _M_states.push_back(__u);
353 }
354 }
355
356 _StateIdT _M_pop()
357 {
358 auto __ret = _M_states.back();
359 _M_states.pop_back();
360 _M_exists[__ret] = false;
361 return __ret;
362 }
363
364 bool _M_empty() const
365 { return _M_states.empty(); }
366
367 void _M_clear()
368 {
369 _M_states.clear();
370 _M_exists.assign(_M_exists.size(), false);
371 }
372
373 private:
374 std::vector<_StateIdT> _M_states;
375 std::vector<bool> _M_exists;
376 };
377
378 public:
379 _BFSExecutor(_BiIter __begin,
380 _BiIter __end,
381 _ResultsVec& __results,
382 const _RegexT& __re,
383 _FlagT __flags)
384 : _BaseT(__begin, __end, __results, __re, __flags),
385 _M_nfa(*std::static_pointer_cast<_NFA<_CharT, _TraitsT>>
386 (__re._M_automaton)),
387 _M_match_stack(_M_nfa.size()),
388 _M_stack(_M_nfa.size()),
389 _M_start_state(_M_nfa._M_start())
390 { }
391
392 private:
393 void
394 _M_init(_BiIter __cur)
395 {
396 this->_M_current = __cur;
397 _M_covered.clear();
398 _ResultsVec& __res(this->_M_results);
399 _M_covered[this->_M_start_state] =
400 _ResultsPtr(new _ResultsEntry(__res.size(),
401 _M_nfa._M_quant_count));
402 _M_stack._M_push(this->_M_start_state);
403 }
404
405 void
406 _M_set_start(_StateIdT __start)
407 { _M_start_state = __start; }
408
409 bool
410 _M_main();
411
412 void
413 _M_e_closure();
414
415 void
416 _M_move();
417
418 bool
419 _M_includes_some();
420
421 std::unique_ptr<_BaseT>
422 _M_clone() const
423 {
424 return std::unique_ptr<_BaseT>(new _BFSExecutor(this->_M_current,
425 this->_M_end,
426 this->_M_results,
427 this->_M_re,
428 this->_M_flags));
429 }
430
431 const _NFAT& _M_nfa;
432 std::map<_StateIdT, _ResultsPtr> _M_covered;
433 _TodoList _M_match_stack;
434 _TodoList _M_stack;
435 _StateIdT _M_start_state;
436 // To record global optimal solution.
437 _ResultsPtr _M_cur_results;
438 };
439
440 //@} regex-detail
441 _GLIBCXX_END_NAMESPACE_VERSION
442 } // namespace __detail
443 } // namespace std
444
445 #include <bits/regex_executor.tcc>