1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library.  This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  *  @file bits/regex.tcc
27  *  This is an internal header file, included by other library headers.
28  *  Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // See below __regex_algo_impl to get what this is talking about. The default
32 // value 1 indicated a conservative optimization without giving up worst case
33 // performance.
34 #ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
35 #define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
36 #endif
37 
38 namespace std _GLIBCXX_VISIBILITY(default)
39 {
40 namespace __detail
41 {
42 _GLIBCXX_BEGIN_NAMESPACE_VERSION
43 
44   // Result of merging regex_match and regex_search.
45   //
46   // __policy now can be _S_auto (auto dispatch) and _S_alternate (use
47   // the other one if possible, for test purpose).
48   //
49   // That __match_mode is true means regex_match, else regex_search.
50   template<typename _BiIter, typename _Alloc,
51 	   typename _CharT, typename _TraitsT,
52 	   _RegexExecutorPolicy __policy,
53 	   bool __match_mode>
54     bool
__regex_algo_impl(_BiIter __s,_BiIter __e,match_results<_BiIter,_Alloc> & __m,const basic_regex<_CharT,_TraitsT> & __re,regex_constants::match_flag_type __flags)55     __regex_algo_impl(_BiIter                              __s,
56 		      _BiIter                              __e,
57 		      match_results<_BiIter, _Alloc>&      __m,
58 		      const basic_regex<_CharT, _TraitsT>& __re,
59 		      regex_constants::match_flag_type     __flags)
60     {
61       if (__re._M_automaton == nullptr)
62 	return false;
63 
64       typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m;
65       __m._M_begin = __s;
66       __res.resize(__re._M_automaton->_M_sub_count() + 2);
67       for (auto& __it : __res)
68 	__it.matched = false;
69 
70       // This function decide which executor to use under given circumstances.
71       // The _S_auto policy now is the following: if a NFA has no
72       // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
73       // quantifiers (*, +, ?), the BFS executor will be used, other wise
74       // DFS executor. This is because DFS executor has a exponential upper
75       // bound, but better best-case performace. Meanwhile, BFS executor can
76       // effectively prevent from exponential-long time matching (which must
77       // contains many quantifiers), but it's slower in average.
78       //
79       // For simple regex, BFS executor could be 2 or more times slower than
80       // DFS executor.
81       //
82       // Of course, BFS executor cannot handle back-references.
83       bool __ret;
84       if (!__re._M_automaton->_M_has_backref
85 	  && (__policy == _RegexExecutorPolicy::_S_alternate
86 	      || __re._M_automaton->_M_quant_count
87 		> _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
88 	{
89 	  _Executor<_BiIter, _Alloc, _TraitsT, false>
90 	    __executor(__s, __e, __m, __re, __flags);
91 	  if (__match_mode)
92 	    __ret = __executor._M_match();
93 	  else
94 	    __ret = __executor._M_search();
95 	}
96       else
97 	{
98 	  _Executor<_BiIter, _Alloc, _TraitsT, true>
99 	    __executor(__s, __e, __m, __re, __flags);
100 	  if (__match_mode)
101 	    __ret = __executor._M_match();
102 	  else
103 	    __ret = __executor._M_search();
104 	}
105       if (__ret)
106 	{
107 	  for (auto __it : __res)
108 	    if (!__it.matched)
109 	      __it.first = __it.second = __e;
110 	  auto& __pre = __res[__res.size()-2];
111 	  auto& __suf = __res[__res.size()-1];
112 	  if (__match_mode)
113 	    {
114 	      __pre.matched = false;
115 	      __pre.first = __s;
116 	      __pre.second = __s;
117 	      __suf.matched = false;
118 	      __suf.first = __e;
119 	      __suf.second = __e;
120 	    }
121 	  else
122 	    {
123 	      __pre.first = __s;
124 	      __pre.second = __res[0].first;
125 	      __pre.matched = (__pre.first != __pre.second);
126 	      __suf.first = __res[0].second;
127 	      __suf.second = __e;
128 	      __suf.matched = (__suf.first != __suf.second);
129 	    }
130 	}
131       return __ret;
132     }
133 
134 _GLIBCXX_END_NAMESPACE_VERSION
135 }
136 
137 _GLIBCXX_BEGIN_NAMESPACE_VERSION
138 
139   template<typename _Ch_type>
140   template<typename _Fwd_iter>
141     typename regex_traits<_Ch_type>::string_type
142     regex_traits<_Ch_type>::
lookup_collatename(_Fwd_iter __first,_Fwd_iter __last) const143     lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
144     {
145       typedef std::ctype<char_type> __ctype_type;
146       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
147 
148       static const char* __collatenames[] =
149 	{
150 	  "NUL",
151 	  "SOH",
152 	  "STX",
153 	  "ETX",
154 	  "EOT",
155 	  "ENQ",
156 	  "ACK",
157 	  "alert",
158 	  "backspace",
159 	  "tab",
160 	  "newline",
161 	  "vertical-tab",
162 	  "form-feed",
163 	  "carriage-return",
164 	  "SO",
165 	  "SI",
166 	  "DLE",
167 	  "DC1",
168 	  "DC2",
169 	  "DC3",
170 	  "DC4",
171 	  "NAK",
172 	  "SYN",
173 	  "ETB",
174 	  "CAN",
175 	  "EM",
176 	  "SUB",
177 	  "ESC",
178 	  "IS4",
179 	  "IS3",
180 	  "IS2",
181 	  "IS1",
182 	  "space",
183 	  "exclamation-mark",
184 	  "quotation-mark",
185 	  "number-sign",
186 	  "dollar-sign",
187 	  "percent-sign",
188 	  "ampersand",
189 	  "apostrophe",
190 	  "left-parenthesis",
191 	  "right-parenthesis",
192 	  "asterisk",
193 	  "plus-sign",
194 	  "comma",
195 	  "hyphen",
196 	  "period",
197 	  "slash",
198 	  "zero",
199 	  "one",
200 	  "two",
201 	  "three",
202 	  "four",
203 	  "five",
204 	  "six",
205 	  "seven",
206 	  "eight",
207 	  "nine",
208 	  "colon",
209 	  "semicolon",
210 	  "less-than-sign",
211 	  "equals-sign",
212 	  "greater-than-sign",
213 	  "question-mark",
214 	  "commercial-at",
215 	  "A",
216 	  "B",
217 	  "C",
218 	  "D",
219 	  "E",
220 	  "F",
221 	  "G",
222 	  "H",
223 	  "I",
224 	  "J",
225 	  "K",
226 	  "L",
227 	  "M",
228 	  "N",
229 	  "O",
230 	  "P",
231 	  "Q",
232 	  "R",
233 	  "S",
234 	  "T",
235 	  "U",
236 	  "V",
237 	  "W",
238 	  "X",
239 	  "Y",
240 	  "Z",
241 	  "left-square-bracket",
242 	  "backslash",
243 	  "right-square-bracket",
244 	  "circumflex",
245 	  "underscore",
246 	  "grave-accent",
247 	  "a",
248 	  "b",
249 	  "c",
250 	  "d",
251 	  "e",
252 	  "f",
253 	  "g",
254 	  "h",
255 	  "i",
256 	  "j",
257 	  "k",
258 	  "l",
259 	  "m",
260 	  "n",
261 	  "o",
262 	  "p",
263 	  "q",
264 	  "r",
265 	  "s",
266 	  "t",
267 	  "u",
268 	  "v",
269 	  "w",
270 	  "x",
271 	  "y",
272 	  "z",
273 	  "left-curly-bracket",
274 	  "vertical-line",
275 	  "right-curly-bracket",
276 	  "tilde",
277 	  "DEL",
278 	};
279 
280       string __s;
281       for (; __first != __last; ++__first)
282 	__s += __fctyp.narrow(*__first, 0);
283 
284       for (const auto& __it : __collatenames)
285 	if (__s == __it)
286 	  return string_type(1, __fctyp.widen(
287 	    static_cast<char>(&__it - __collatenames)));
288 
289       // TODO Add digraph support:
290       // http://boost.sourceforge.net/libs/regex/doc/collating_names.html
291 
292       return string_type();
293     }
294 
295   template<typename _Ch_type>
296   template<typename _Fwd_iter>
297     typename regex_traits<_Ch_type>::char_class_type
298     regex_traits<_Ch_type>::
lookup_classname(_Fwd_iter __first,_Fwd_iter __last,bool __icase) const299     lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const
300     {
301       typedef std::ctype<char_type> __ctype_type;
302       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
303 
304       // Mappings from class name to class mask.
305       static const pair<const char*, char_class_type> __classnames[] =
306       {
307 	{"d", ctype_base::digit},
308 	{"w", {ctype_base::alnum, _RegexMask::_S_under}},
309 	{"s", ctype_base::space},
310 	{"alnum", ctype_base::alnum},
311 	{"alpha", ctype_base::alpha},
312 	{"blank", {0, _RegexMask::_S_blank}},
313 	{"cntrl", ctype_base::cntrl},
314 	{"digit", ctype_base::digit},
315 	{"graph", ctype_base::graph},
316 	{"lower", ctype_base::lower},
317 	{"print", ctype_base::print},
318 	{"punct", ctype_base::punct},
319 	{"space", ctype_base::space},
320 	{"upper", ctype_base::upper},
321 	{"xdigit", ctype_base::xdigit},
322       };
323 
324       string __s;
325       for (; __first != __last; ++__first)
326 	__s += __fctyp.narrow(__fctyp.tolower(*__first), 0);
327 
328       for (const auto& __it : __classnames)
329 	if (__s == __it.first)
330 	  {
331 	    if (__icase
332 		&& ((__it.second
333 		     & (ctype_base::lower | ctype_base::upper)) != 0))
334 	      return ctype_base::alpha;
335 	    return __it.second;
336 	  }
337       return 0;
338     }
339 
340   template<typename _Ch_type>
341     bool
342     regex_traits<_Ch_type>::
isctype(_Ch_type __c,char_class_type __f) const343     isctype(_Ch_type __c, char_class_type __f) const
344     {
345       typedef std::ctype<char_type> __ctype_type;
346       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
347 
348       return __fctyp.is(__f._M_base, __c)
349 	// [[:w:]]
350 	|| ((__f._M_extended & _RegexMask::_S_under)
351 	    && __c == __fctyp.widen('_'))
352 	// [[:blank:]]
353 	|| ((__f._M_extended & _RegexMask::_S_blank)
354 	    && (__c == __fctyp.widen(' ')
355 		|| __c == __fctyp.widen('\t')));
356     }
357 
358   template<typename _Ch_type>
359     int
360     regex_traits<_Ch_type>::
value(_Ch_type __ch,int __radix) const361     value(_Ch_type __ch, int __radix) const
362     {
363       std::basic_istringstream<char_type> __is(string_type(1, __ch));
364       long __v;
365       if (__radix == 8)
366 	__is >> std::oct;
367       else if (__radix == 16)
368 	__is >> std::hex;
369       __is >> __v;
370       return __is.fail() ? -1 : __v;
371     }
372 
373   template<typename _Bi_iter, typename _Alloc>
374   template<typename _Out_iter>
375     _Out_iter match_results<_Bi_iter, _Alloc>::
format(_Out_iter __out,const match_results<_Bi_iter,_Alloc>::char_type * __fmt_first,const match_results<_Bi_iter,_Alloc>::char_type * __fmt_last,match_flag_type __flags) const376     format(_Out_iter __out,
377 	   const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first,
378 	   const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last,
379 	   match_flag_type __flags) const
380     {
381       _GLIBCXX_DEBUG_ASSERT( ready() );
382       regex_traits<char_type> __traits;
383       typedef std::ctype<char_type> __ctype_type;
384       const __ctype_type&
385 	__fctyp(use_facet<__ctype_type>(__traits.getloc()));
386 
387       auto __output = [&](size_t __idx)
388 	{
389 	  auto& __sub = _Base_type::operator[](__idx);
390 	  if (__sub.matched)
391 	    __out = std::copy(__sub.first, __sub.second, __out);
392 	};
393 
394       if (__flags & regex_constants::format_sed)
395 	{
396 	  for (; __fmt_first != __fmt_last;)
397 	    if (*__fmt_first == '&')
398 	      {
399 		__output(0);
400 		++__fmt_first;
401 	      }
402 	    else if (*__fmt_first == '\\')
403 	      {
404 		if (++__fmt_first != __fmt_last
405 		    && __fctyp.is(__ctype_type::digit, *__fmt_first))
406 		  __output(__traits.value(*__fmt_first++, 10));
407 		else
408 		  *__out++ = '\\';
409 	      }
410 	    else
411 	      *__out++ = *__fmt_first++;
412 	}
413       else
414 	{
415 	  while (1)
416 	    {
417 	      auto __next = std::find(__fmt_first, __fmt_last, '$');
418 	      if (__next == __fmt_last)
419 		break;
420 
421 	      __out = std::copy(__fmt_first, __next, __out);
422 
423 	      auto __eat = [&](char __ch) -> bool
424 		{
425 		  if (*__next == __ch)
426 		    {
427 		      ++__next;
428 		      return true;
429 		    }
430 		  return false;
431 		};
432 
433 	      if (++__next == __fmt_last)
434 		*__out++ = '$';
435 	      else if (__eat('$'))
436 		*__out++ = '$';
437 	      else if (__eat('&'))
438 		__output(0);
439 	      else if (__eat('`'))
440 		__output(_Base_type::size()-2);
441 	      else if (__eat('\''))
442 		__output(_Base_type::size()-1);
443 	      else if (__fctyp.is(__ctype_type::digit, *__next))
444 		{
445 		  long __num = __traits.value(*__next, 10);
446 		  if (++__next != __fmt_last
447 		      && __fctyp.is(__ctype_type::digit, *__next))
448 		    {
449 		      __num *= 10;
450 		      __num += __traits.value(*__next++, 10);
451 		    }
452 		  if (0 <= __num && __num < this->size())
453 		    __output(__num);
454 		}
455 	      else
456 		*__out++ = '$';
457 	      __fmt_first = __next;
458 	    }
459 	  __out = std::copy(__fmt_first, __fmt_last, __out);
460 	}
461       return __out;
462     }
463 
464   template<typename _Out_iter, typename _Bi_iter,
465 	   typename _Rx_traits, typename _Ch_type>
466     _Out_iter
467     regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last,
468 		  const basic_regex<_Ch_type, _Rx_traits>& __e,
469 		  const _Ch_type* __fmt,
470 		  regex_constants::match_flag_type __flags)
471     {
472       typedef regex_iterator<_Bi_iter, _Ch_type, _Rx_traits> _IterT;
473       _IterT __i(__first, __last, __e, __flags);
474       _IterT __end;
475       if (__i == __end)
476 	{
477 	  if (!(__flags & regex_constants::format_no_copy))
478 	    __out = std::copy(__first, __last, __out);
479 	}
480       else
481 	{
482 	  sub_match<_Bi_iter> __last;
483 	  auto __len = char_traits<_Ch_type>::length(__fmt);
484 	  for (; __i != __end; ++__i)
485 	    {
486 	      if (!(__flags & regex_constants::format_no_copy))
487 		__out = std::copy(__i->prefix().first, __i->prefix().second,
488 				  __out);
489 	      __out = __i->format(__out, __fmt, __fmt + __len, __flags);
490 	      __last = __i->suffix();
491 	      if (__flags & regex_constants::format_first_only)
492 		break;
493 	    }
494 	  if (!(__flags & regex_constants::format_no_copy))
495 	    __out = std::copy(__last.first, __last.second, __out);
496 	}
497       return __out;
498     }
499 
500   template<typename _Bi_iter,
501 	   typename _Ch_type,
502 	   typename _Rx_traits>
503     bool
504     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
operator ==(const regex_iterator & __rhs) const505     operator==(const regex_iterator& __rhs) const
506     {
507       return (_M_match.empty() && __rhs._M_match.empty())
508 	|| (_M_begin == __rhs._M_begin
509 	    && _M_end == __rhs._M_end
510 	    && _M_pregex == __rhs._M_pregex
511 	    && _M_flags == __rhs._M_flags
512 	    && _M_match[0] == __rhs._M_match[0]);
513     }
514 
515   template<typename _Bi_iter,
516 	   typename _Ch_type,
517 	   typename _Rx_traits>
518     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
519     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
operator ++()520     operator++()
521     {
522       // In all cases in which the call to regex_search returns true,
523       // match.prefix().first shall be equal to the previous value of
524       // match[0].second, and for each index i in the half-open range
525       // [0, match.size()) for which match[i].matched is true,
526       // match[i].position() shall return distance(begin, match[i].first).
527       // [28.12.1.4.5]
528       if (_M_match[0].matched)
529 	{
530 	  auto __start = _M_match[0].second;
531 	  auto __prefix_first = _M_match[0].second;
532 	  if (_M_match[0].first == _M_match[0].second)
533 	    {
534 	      if (__start == _M_end)
535 		{
536 		  _M_match = value_type();
537 		  return *this;
538 		}
539 	      else
540 		{
541 		  if (regex_search(__start, _M_end, _M_match, *_M_pregex,
542 				   _M_flags
543 				   | regex_constants::match_not_null
544 				   | regex_constants::match_continuous))
545 		    {
546 		      _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
547 		      auto& __prefix = _M_match.at(_M_match.size());
548 		      __prefix.first = __prefix_first;
549 		      __prefix.matched = __prefix.first != __prefix.second;
550 		      // [28.12.1.4.5]
551 		      _M_match._M_begin = _M_begin;
552 		      return *this;
553 		    }
554 		  else
555 		    ++__start;
556 		}
557 	    }
558 	  _M_flags |= regex_constants::match_prev_avail;
559 	  if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags))
560 	    {
561 	      _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
562 	      auto& __prefix = _M_match.at(_M_match.size());
563 	      __prefix.first = __prefix_first;
564 	      __prefix.matched = __prefix.first != __prefix.second;
565 	      // [28.12.1.4.5]
566 	      _M_match._M_begin = _M_begin;
567 	    }
568 	  else
569 	    _M_match = value_type();
570 	}
571       return *this;
572     }
573 
574   template<typename _Bi_iter,
575 	   typename _Ch_type,
576 	   typename _Rx_traits>
577     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
578     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
operator =(const regex_token_iterator & __rhs)579     operator=(const regex_token_iterator& __rhs)
580     {
581       _M_position = __rhs._M_position;
582       _M_subs = __rhs._M_subs;
583       _M_n = __rhs._M_n;
584       _M_suffix = __rhs._M_suffix;
585       _M_has_m1 = __rhs._M_has_m1;
586       _M_normalize_result();
587       return *this;
588     }
589 
590   template<typename _Bi_iter,
591 	   typename _Ch_type,
592 	   typename _Rx_traits>
593     bool
594     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
operator ==(const regex_token_iterator & __rhs) const595     operator==(const regex_token_iterator& __rhs) const
596     {
597       if (_M_end_of_seq() && __rhs._M_end_of_seq())
598 	return true;
599       if (_M_suffix.matched && __rhs._M_suffix.matched
600 	  && _M_suffix == __rhs._M_suffix)
601 	return true;
602       if (_M_end_of_seq() || _M_suffix.matched
603 	  || __rhs._M_end_of_seq() || __rhs._M_suffix.matched)
604 	return false;
605       return _M_position == __rhs._M_position
606 	&& _M_n == __rhs._M_n
607 	&& _M_subs == __rhs._M_subs;
608     }
609 
610   template<typename _Bi_iter,
611 	   typename _Ch_type,
612 	   typename _Rx_traits>
613     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
614     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
operator ++()615     operator++()
616     {
617       _Position __prev = _M_position;
618       if (_M_suffix.matched)
619 	*this = regex_token_iterator();
620       else if (_M_n + 1 < _M_subs.size())
621 	{
622 	  _M_n++;
623 	  _M_result = &_M_current_match();
624 	}
625       else
626 	{
627 	  _M_n = 0;
628 	  ++_M_position;
629 	  if (_M_position != _Position())
630 	    _M_result = &_M_current_match();
631 	  else if (_M_has_m1 && __prev->suffix().length() != 0)
632 	    {
633 	      _M_suffix.matched = true;
634 	      _M_suffix.first = __prev->suffix().first;
635 	      _M_suffix.second = __prev->suffix().second;
636 	      _M_result = &_M_suffix;
637 	    }
638 	  else
639 	    *this = regex_token_iterator();
640 	}
641       return *this;
642     }
643 
644   template<typename _Bi_iter,
645 	   typename _Ch_type,
646 	   typename _Rx_traits>
647     void
648     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
_M_init(_Bi_iter __a,_Bi_iter __b)649     _M_init(_Bi_iter __a, _Bi_iter __b)
650     {
651       _M_has_m1 = false;
652       for (auto __it : _M_subs)
653 	if (__it == -1)
654 	  {
655 	    _M_has_m1 = true;
656 	    break;
657 	  }
658       if (_M_position != _Position())
659 	_M_result = &_M_current_match();
660       else if (_M_has_m1)
661 	{
662 	  _M_suffix.matched = true;
663 	  _M_suffix.first = __a;
664 	  _M_suffix.second = __b;
665 	  _M_result = &_M_suffix;
666 	}
667       else
668 	_M_result = nullptr;
669     }
670 
671 _GLIBCXX_END_NAMESPACE_VERSION
672 } // namespace
673 
674