libstdc++
regex_executor.h
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2023 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_executor.h
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // FIXME convert comments to doxygen format.
32 
33 namespace std _GLIBCXX_VISIBILITY(default)
34 {
35 _GLIBCXX_BEGIN_NAMESPACE_VERSION
36 
37 namespace __detail
38 {
39  /**
40  * @addtogroup regex-detail
41  * @{
42  */
43 
44  /**
45  * @brief Takes a regex and an input string and does the matching.
46  *
47  * The %_Executor class has two modes: DFS mode and BFS mode, controlled
48  * by the template parameter %__dfs_mode.
49  */
50  template<typename _BiIter, typename _Alloc, typename _TraitsT,
51  bool __dfs_mode>
52  class _Executor
53  {
55  using __dfs = true_type;
56  using __bfs = false_type;
57 
58  enum class _Match_mode : unsigned char { _Exact, _Prefix };
59 
60  public:
61  typedef typename iterator_traits<_BiIter>::value_type _CharT;
63  typedef _GLIBCXX_STD_C::vector<sub_match<_BiIter>, _Alloc> _ResultsVec;
65  typedef typename _TraitsT::char_class_type _ClassT;
66  typedef _NFA<_TraitsT> _NFAT;
67 
68  public:
69  _Executor(_BiIter __begin,
70  _BiIter __end,
71  _ResultsVec& __results,
72  const _RegexT& __re,
73  _FlagT __flags)
74  : _M_cur_results(__results.get_allocator()),
75  _M_begin(__begin),
76  _M_end(__end),
77  _M_re(__re),
78  _M_nfa(*__re._M_automaton),
79  _M_results(__results),
80  _M_rep_count(_M_nfa.size()),
81  _M_states(_M_nfa._M_start(), _M_nfa.size()),
82  _M_flags(__flags)
83  {
84  using namespace regex_constants;
85  if (__flags & match_prev_avail) // ignore not_bol and not_bow
86  _M_flags &= ~(match_not_bol | match_not_bow);
87  }
88 
89  // Set matched when string exactly matches the pattern.
90  bool
91  _M_match()
92  {
93  _M_current = _M_begin;
94  return _M_main(_Match_mode::_Exact);
95  }
96 
97  // Set matched when some prefix of the string matches the pattern.
98  bool
99  _M_search_from_first()
100  {
101  _M_current = _M_begin;
102  return _M_main(_Match_mode::_Prefix);
103  }
104 
105  bool
106  _M_search();
107 
108  private:
109  void
110  _M_rep_once_more(_Match_mode __match_mode, _StateIdT);
111 
112  void
113  _M_handle_repeat(_Match_mode, _StateIdT);
114 
115  void
116  _M_handle_subexpr_begin(_Match_mode, _StateIdT);
117 
118  void
119  _M_handle_subexpr_end(_Match_mode, _StateIdT);
120 
121  void
122  _M_handle_line_begin_assertion(_Match_mode, _StateIdT);
123 
124  void
125  _M_handle_line_end_assertion(_Match_mode, _StateIdT);
126 
127  void
128  _M_handle_word_boundary(_Match_mode, _StateIdT);
129 
130  void
131  _M_handle_subexpr_lookahead(_Match_mode, _StateIdT);
132 
133  void
134  _M_handle_match(_Match_mode, _StateIdT);
135 
136  void
137  _M_handle_backref(_Match_mode, _StateIdT);
138 
139  void
140  _M_handle_accept(_Match_mode, _StateIdT);
141 
142  void
143  _M_handle_alternative(_Match_mode, _StateIdT);
144 
145  void
146  _M_dfs(_Match_mode __match_mode, _StateIdT __start);
147 
148  bool
149  _M_main(_Match_mode __match_mode)
150  { return _M_main_dispatch(__match_mode, __search_mode{}); }
151 
152  bool
153  _M_main_dispatch(_Match_mode __match_mode, __dfs);
154 
155  bool
156  _M_main_dispatch(_Match_mode __match_mode, __bfs);
157 
158  bool
159  _M_is_word(_CharT __ch) const
160  {
161  static const _CharT __s[2] = { 'w' };
162  return _M_re._M_automaton->_M_traits.isctype
163  (__ch, _M_re._M_automaton->_M_traits.lookup_classname(__s, __s+1));
164  }
165 
166  bool
167  _M_at_begin() const
168  {
169  if (_M_current == _M_begin)
170  {
171  // match_not_bol means ^ does not match [_M_begin,_M_begin)
172  if (_M_flags & regex_constants::match_not_bol)
173  return false;
174  // match_prev_avail means _M_begin is not the start of the input.
175  if (_M_flags & regex_constants::match_prev_avail)
176  {
177  // For ECMAScript multiline matches, check if the previous
178  // character is a line terminator.
179  if (_M_match_multiline())
180  return _M_is_line_terminator(*std::prev(_M_current));
181  else
182  return false;
183  }
184  else // ^ matches at _M_begin
185  return true;
186  }
187  else if (_M_match_multiline())
188  return _M_is_line_terminator(*std::prev(_M_current));
189  else
190  return false;
191  }
192 
193  bool
194  _M_at_end() const
195  {
196  if (_M_current == _M_end)
197  return !(_M_flags & regex_constants::match_not_eol);
198  else if (_M_match_multiline())
199  return _M_is_line_terminator(*_M_current);
200  else
201  return false;
202  }
203 
204  bool
205  _M_word_boundary() const;
206 
207  bool
208  _M_lookahead(_StateIdT __next);
209 
210  bool
211  _M_is_line_terminator(_CharT __c) const
212  {
213  const auto& __traits = _M_re._M_automaton->_M_traits;
214  const auto& __ct = use_facet<ctype<_CharT>>(__traits.getloc());
215  const char __n{ __ct.narrow(__c, ' ') };
216  if (__n == '\n')
217  return true;
218  if (_M_re._M_automaton->_M_options() & regex_constants::ECMAScript)
219  {
220  if (__n == '\r')
221  return true;
222  // FIXME: U+2028 (line separator) and U+2029 (paragraph separator)
223  }
224  return false;
225  }
226 
227  bool
228  _M_match_multiline() const noexcept
229  {
230  constexpr auto __m
232  return (_M_re._M_automaton->_M_options() & __m) == __m;
233  }
234 
235  // Holds additional information used in BFS-mode.
236  template<typename _SearchMode, typename _ResultsVec>
237  struct _State_info;
238 
239  template<typename _ResultsVec>
240  struct _State_info<__bfs, _ResultsVec>
241  {
242  explicit
243  _State_info(_StateIdT __start, size_t __n)
244  : _M_visited_states(new bool[__n]()), _M_start(__start)
245  { }
246 
247  ~_State_info() { delete[] _M_visited_states; }
248 
249  _State_info(const _State_info&) = delete;
250  _State_info& operator=(const _State_info&) = delete;
251 
252  bool _M_visited(_StateIdT __i)
253  {
254  if (_M_visited_states[__i])
255  return true;
256  _M_visited_states[__i] = true;
257  return false;
258  }
259 
260  void _M_queue(_StateIdT __i, const _ResultsVec& __res)
261  { _M_match_queue.emplace_back(__i, __res); }
262 
263  // Dummy implementations for BFS mode.
264  _BiIter* _M_get_sol_pos() { return nullptr; }
265 
266  // Saves states that need to be considered for the next character.
267  _GLIBCXX_STD_C::vector<pair<_StateIdT, _ResultsVec>> _M_match_queue;
268  // Indicates which states are already visited.
269  bool* _M_visited_states;
270  // To record current solution.
271  _StateIdT _M_start;
272  };
273 
274  template<typename _ResultsVec>
275  struct _State_info<__dfs, _ResultsVec>
276  {
277  explicit
278  _State_info(_StateIdT __start, size_t) : _M_start(__start)
279  { }
280 
281  // Dummy implementations for DFS mode.
282  bool _M_visited(_StateIdT) const { return false; }
283  void _M_queue(_StateIdT, const _ResultsVec&) { }
284 
285  _BiIter* _M_get_sol_pos() { return &_M_sol_pos; }
286 
287  // To record current solution.
288  _StateIdT _M_start;
289  _BiIter _M_sol_pos;
290  };
291 
292  public:
293  _ResultsVec _M_cur_results;
294  _BiIter _M_current;
295  _BiIter _M_begin;
296  const _BiIter _M_end;
297  const _RegexT& _M_re;
298  const _NFAT& _M_nfa;
299  _ResultsVec& _M_results;
300  _GLIBCXX_STD_C::vector<pair<_BiIter, int>> _M_rep_count;
301  _State_info<__search_mode, _ResultsVec> _M_states;
302  _FlagT _M_flags;
303  // Do we have a solution so far?
304  bool _M_has_sol;
305  };
306 
307  ///@} regex-detail
308 } // namespace __detail
309 _GLIBCXX_END_NAMESPACE_VERSION
310 } // namespace std
311 
312 #include <bits/regex_executor.tcc>
integral_constant< bool, true > true_type
The type used as a compile-time boolean with true value.
Definition: type_traits:82
integral_constant< bool, false > false_type
The type used as a compile-time boolean with false value.
Definition: type_traits:85
ISO C++ entities toplevel namespace is std.
constexpr match_flag_type match_not_bow
constexpr match_flag_type match_not_bol
constexpr syntax_option_type ECMAScript
constexpr syntax_option_type __multiline
Extension: Equivalent to regex_constants::multiline for C++11 and C++14.
constexpr match_flag_type match_not_eol
match_flag_type
This is a bitmask type indicating regex matching rules.
constexpr match_flag_type match_prev_avail
integral_constant
Definition: type_traits:63
Traits class for iterators.
A regular expression.
Definition: regex.h:419
Takes a regex and an input string and does the matching.
constexpr allocator_type get_allocator() const noexcept
Get a copy of the memory allocation object.
Definition: stl_vector.h:311