1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2015 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library.  This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  *  @file bits/regex.tcc
27  *  This is an internal header file, included by other library headers.
28  *  Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // A non-standard switch to let the user pick the matching algorithm.
32 // If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA
33 // algorithm will be used. This algorithm is not enabled by default,
34 // and cannot be used if the regex contains back-references, but has better
35 // (polynomial instead of exponential) worst case performance.
36 // See __regex_algo_impl below.
37 
38 namespace std _GLIBCXX_VISIBILITY(default)
39 {
40 namespace __detail
41 {
42 _GLIBCXX_BEGIN_NAMESPACE_VERSION
43 
44   // Result of merging regex_match and regex_search.
45   //
46   // __policy now can be _S_auto (auto dispatch) and _S_alternate (use
47   // the other one if possible, for test purpose).
48   //
49   // That __match_mode is true means regex_match, else regex_search.
50   template<typename _BiIter, typename _Alloc,
51 	   typename _CharT, typename _TraitsT,
52 	   _RegexExecutorPolicy __policy,
53 	   bool __match_mode>
54     bool
__regex_algo_impl(_BiIter __s,_BiIter __e,match_results<_BiIter,_Alloc> & __m,const basic_regex<_CharT,_TraitsT> & __re,regex_constants::match_flag_type __flags)55     __regex_algo_impl(_BiIter                              __s,
56 		      _BiIter                              __e,
57 		      match_results<_BiIter, _Alloc>&      __m,
58 		      const basic_regex<_CharT, _TraitsT>& __re,
59 		      regex_constants::match_flag_type     __flags)
60     {
61       if (__re._M_automaton == nullptr)
62 	return false;
63 
64       typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m;
65       __m._M_begin = __s;
66       __m._M_resize(__re._M_automaton->_M_sub_count());
67       for (auto& __it : __res)
68 	__it.matched = false;
69 
70       // __policy is used by testsuites so that they can use Thompson NFA
71       // without defining a macro. Users should define
72       // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach.
73       bool __ret;
74       if (!__re._M_automaton->_M_has_backref
75 	  && !(__re._M_flags & regex_constants::ECMAScript)
76 #ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA
77 	  && __policy == _RegexExecutorPolicy::_S_alternate
78 #endif
79 	  )
80 	{
81 	  _Executor<_BiIter, _Alloc, _TraitsT, false>
82 	    __executor(__s, __e, __m, __re, __flags);
83 	  if (__match_mode)
84 	    __ret = __executor._M_match();
85 	  else
86 	    __ret = __executor._M_search();
87 	}
88       else
89 	{
90 	  _Executor<_BiIter, _Alloc, _TraitsT, true>
91 	    __executor(__s, __e, __m, __re, __flags);
92 	  if (__match_mode)
93 	    __ret = __executor._M_match();
94 	  else
95 	    __ret = __executor._M_search();
96 	}
97       if (__ret)
98 	{
99 	  for (auto& __it : __res)
100 	    if (!__it.matched)
101 	      __it.first = __it.second = __e;
102 	  auto& __pre = __m._M_prefix();
103 	  auto& __suf = __m._M_suffix();
104 	  if (__match_mode)
105 	    {
106 	      __pre.matched = false;
107 	      __pre.first = __s;
108 	      __pre.second = __s;
109 	      __suf.matched = false;
110 	      __suf.first = __e;
111 	      __suf.second = __e;
112 	    }
113 	  else
114 	    {
115 	      __pre.first = __s;
116 	      __pre.second = __res[0].first;
117 	      __pre.matched = (__pre.first != __pre.second);
118 	      __suf.first = __res[0].second;
119 	      __suf.second = __e;
120 	      __suf.matched = (__suf.first != __suf.second);
121 	    }
122 	}
123       else
124 	{
125 	  __m._M_resize(0);
126 	  for (auto& __it : __res)
127 	    {
128 	      __it.matched = false;
129 	      __it.first = __it.second = __e;
130 	    }
131 	}
132       return __ret;
133     }
134 
135 _GLIBCXX_END_NAMESPACE_VERSION
136 }
137 
138 _GLIBCXX_BEGIN_NAMESPACE_VERSION
139 
140   template<typename _Ch_type>
141   template<typename _Fwd_iter>
142     typename regex_traits<_Ch_type>::string_type
143     regex_traits<_Ch_type>::
lookup_collatename(_Fwd_iter __first,_Fwd_iter __last) const144     lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
145     {
146       typedef std::ctype<char_type> __ctype_type;
147       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
148 
149       static const char* __collatenames[] =
150 	{
151 	  "NUL",
152 	  "SOH",
153 	  "STX",
154 	  "ETX",
155 	  "EOT",
156 	  "ENQ",
157 	  "ACK",
158 	  "alert",
159 	  "backspace",
160 	  "tab",
161 	  "newline",
162 	  "vertical-tab",
163 	  "form-feed",
164 	  "carriage-return",
165 	  "SO",
166 	  "SI",
167 	  "DLE",
168 	  "DC1",
169 	  "DC2",
170 	  "DC3",
171 	  "DC4",
172 	  "NAK",
173 	  "SYN",
174 	  "ETB",
175 	  "CAN",
176 	  "EM",
177 	  "SUB",
178 	  "ESC",
179 	  "IS4",
180 	  "IS3",
181 	  "IS2",
182 	  "IS1",
183 	  "space",
184 	  "exclamation-mark",
185 	  "quotation-mark",
186 	  "number-sign",
187 	  "dollar-sign",
188 	  "percent-sign",
189 	  "ampersand",
190 	  "apostrophe",
191 	  "left-parenthesis",
192 	  "right-parenthesis",
193 	  "asterisk",
194 	  "plus-sign",
195 	  "comma",
196 	  "hyphen",
197 	  "period",
198 	  "slash",
199 	  "zero",
200 	  "one",
201 	  "two",
202 	  "three",
203 	  "four",
204 	  "five",
205 	  "six",
206 	  "seven",
207 	  "eight",
208 	  "nine",
209 	  "colon",
210 	  "semicolon",
211 	  "less-than-sign",
212 	  "equals-sign",
213 	  "greater-than-sign",
214 	  "question-mark",
215 	  "commercial-at",
216 	  "A",
217 	  "B",
218 	  "C",
219 	  "D",
220 	  "E",
221 	  "F",
222 	  "G",
223 	  "H",
224 	  "I",
225 	  "J",
226 	  "K",
227 	  "L",
228 	  "M",
229 	  "N",
230 	  "O",
231 	  "P",
232 	  "Q",
233 	  "R",
234 	  "S",
235 	  "T",
236 	  "U",
237 	  "V",
238 	  "W",
239 	  "X",
240 	  "Y",
241 	  "Z",
242 	  "left-square-bracket",
243 	  "backslash",
244 	  "right-square-bracket",
245 	  "circumflex",
246 	  "underscore",
247 	  "grave-accent",
248 	  "a",
249 	  "b",
250 	  "c",
251 	  "d",
252 	  "e",
253 	  "f",
254 	  "g",
255 	  "h",
256 	  "i",
257 	  "j",
258 	  "k",
259 	  "l",
260 	  "m",
261 	  "n",
262 	  "o",
263 	  "p",
264 	  "q",
265 	  "r",
266 	  "s",
267 	  "t",
268 	  "u",
269 	  "v",
270 	  "w",
271 	  "x",
272 	  "y",
273 	  "z",
274 	  "left-curly-bracket",
275 	  "vertical-line",
276 	  "right-curly-bracket",
277 	  "tilde",
278 	  "DEL",
279 	};
280 
281       string __s;
282       for (; __first != __last; ++__first)
283 	__s += __fctyp.narrow(*__first, 0);
284 
285       for (const auto& __it : __collatenames)
286 	if (__s == __it)
287 	  return string_type(1, __fctyp.widen(
288 	    static_cast<char>(&__it - __collatenames)));
289 
290       // TODO Add digraph support:
291       // http://boost.sourceforge.net/libs/regex/doc/collating_names.html
292 
293       return string_type();
294     }
295 
296   template<typename _Ch_type>
297   template<typename _Fwd_iter>
298     typename regex_traits<_Ch_type>::char_class_type
299     regex_traits<_Ch_type>::
lookup_classname(_Fwd_iter __first,_Fwd_iter __last,bool __icase) const300     lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const
301     {
302       typedef std::ctype<char_type> __ctype_type;
303       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
304 
305       // Mappings from class name to class mask.
306       static const pair<const char*, char_class_type> __classnames[] =
307       {
308 	{"d", ctype_base::digit},
309 	{"w", {ctype_base::alnum, _RegexMask::_S_under}},
310 	{"s", ctype_base::space},
311 	{"alnum", ctype_base::alnum},
312 	{"alpha", ctype_base::alpha},
313 	{"blank", ctype_base::blank},
314 	{"cntrl", ctype_base::cntrl},
315 	{"digit", ctype_base::digit},
316 	{"graph", ctype_base::graph},
317 	{"lower", ctype_base::lower},
318 	{"print", ctype_base::print},
319 	{"punct", ctype_base::punct},
320 	{"space", ctype_base::space},
321 	{"upper", ctype_base::upper},
322 	{"xdigit", ctype_base::xdigit},
323       };
324 
325       string __s;
326       for (; __first != __last; ++__first)
327 	__s += __fctyp.narrow(__fctyp.tolower(*__first), 0);
328 
329       for (const auto& __it : __classnames)
330 	if (__s == __it.first)
331 	  {
332 	    if (__icase
333 		&& ((__it.second
334 		     & (ctype_base::lower | ctype_base::upper)) != 0))
335 	      return ctype_base::alpha;
336 	    return __it.second;
337 	  }
338       return 0;
339     }
340 
341   template<typename _Ch_type>
342     bool
343     regex_traits<_Ch_type>::
isctype(_Ch_type __c,char_class_type __f) const344     isctype(_Ch_type __c, char_class_type __f) const
345     {
346       typedef std::ctype<char_type> __ctype_type;
347       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
348 
349       return __fctyp.is(__f._M_base, __c)
350 	// [[:w:]]
351 	|| ((__f._M_extended & _RegexMask::_S_under)
352 	    && __c == __fctyp.widen('_'));
353     }
354 
355   template<typename _Ch_type>
356     int
357     regex_traits<_Ch_type>::
value(_Ch_type __ch,int __radix) const358     value(_Ch_type __ch, int __radix) const
359     {
360       std::basic_istringstream<char_type> __is(string_type(1, __ch));
361       long __v;
362       if (__radix == 8)
363 	__is >> std::oct;
364       else if (__radix == 16)
365 	__is >> std::hex;
366       __is >> __v;
367       return __is.fail() ? -1 : __v;
368     }
369 
370   template<typename _Bi_iter, typename _Alloc>
371   template<typename _Out_iter>
372     _Out_iter match_results<_Bi_iter, _Alloc>::
format(_Out_iter __out,const match_results<_Bi_iter,_Alloc>::char_type * __fmt_first,const match_results<_Bi_iter,_Alloc>::char_type * __fmt_last,match_flag_type __flags) const373     format(_Out_iter __out,
374 	   const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first,
375 	   const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last,
376 	   match_flag_type __flags) const
377     {
378       _GLIBCXX_DEBUG_ASSERT( ready() );
379       regex_traits<char_type> __traits;
380       typedef std::ctype<char_type> __ctype_type;
381       const __ctype_type&
382 	__fctyp(use_facet<__ctype_type>(__traits.getloc()));
383 
384       auto __output = [&](size_t __idx)
385 	{
386 	  auto& __sub = (*this)[__idx];
387 	  if (__sub.matched)
388 	    __out = std::copy(__sub.first, __sub.second, __out);
389 	};
390 
391       if (__flags & regex_constants::format_sed)
392 	{
393 	  for (; __fmt_first != __fmt_last;)
394 	    if (*__fmt_first == '&')
395 	      {
396 		__output(0);
397 		++__fmt_first;
398 	      }
399 	    else if (*__fmt_first == '\\')
400 	      {
401 		if (++__fmt_first != __fmt_last
402 		    && __fctyp.is(__ctype_type::digit, *__fmt_first))
403 		  __output(__traits.value(*__fmt_first++, 10));
404 		else
405 		  *__out++ = '\\';
406 	      }
407 	    else
408 	      *__out++ = *__fmt_first++;
409 	}
410       else
411 	{
412 	  while (1)
413 	    {
414 	      auto __next = std::find(__fmt_first, __fmt_last, '$');
415 	      if (__next == __fmt_last)
416 		break;
417 
418 	      __out = std::copy(__fmt_first, __next, __out);
419 
420 	      auto __eat = [&](char __ch) -> bool
421 		{
422 		  if (*__next == __ch)
423 		    {
424 		      ++__next;
425 		      return true;
426 		    }
427 		  return false;
428 		};
429 
430 	      if (++__next == __fmt_last)
431 		*__out++ = '$';
432 	      else if (__eat('$'))
433 		*__out++ = '$';
434 	      else if (__eat('&'))
435 		__output(0);
436 	      else if (__eat('`'))
437 		{
438 		  auto& __sub = _M_prefix();
439 		  if (__sub.matched)
440 		    __out = std::copy(__sub.first, __sub.second, __out);
441 		}
442 	      else if (__eat('\''))
443 		{
444 		  auto& __sub = _M_suffix();
445 		  if (__sub.matched)
446 		    __out = std::copy(__sub.first, __sub.second, __out);
447 		}
448 	      else if (__fctyp.is(__ctype_type::digit, *__next))
449 		{
450 		  long __num = __traits.value(*__next, 10);
451 		  if (++__next != __fmt_last
452 		      && __fctyp.is(__ctype_type::digit, *__next))
453 		    {
454 		      __num *= 10;
455 		      __num += __traits.value(*__next++, 10);
456 		    }
457 		  if (0 <= __num && __num < this->size())
458 		    __output(__num);
459 		}
460 	      else
461 		*__out++ = '$';
462 	      __fmt_first = __next;
463 	    }
464 	  __out = std::copy(__fmt_first, __fmt_last, __out);
465 	}
466       return __out;
467     }
468 
469   template<typename _Out_iter, typename _Bi_iter,
470 	   typename _Rx_traits, typename _Ch_type>
471     _Out_iter
472     regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last,
473 		  const basic_regex<_Ch_type, _Rx_traits>& __e,
474 		  const _Ch_type* __fmt,
475 		  regex_constants::match_flag_type __flags)
476     {
477       typedef regex_iterator<_Bi_iter, _Ch_type, _Rx_traits> _IterT;
478       _IterT __i(__first, __last, __e, __flags);
479       _IterT __end;
480       if (__i == __end)
481 	{
482 	  if (!(__flags & regex_constants::format_no_copy))
483 	    __out = std::copy(__first, __last, __out);
484 	}
485       else
486 	{
487 	  sub_match<_Bi_iter> __last;
488 	  auto __len = char_traits<_Ch_type>::length(__fmt);
489 	  for (; __i != __end; ++__i)
490 	    {
491 	      if (!(__flags & regex_constants::format_no_copy))
492 		__out = std::copy(__i->prefix().first, __i->prefix().second,
493 				  __out);
494 	      __out = __i->format(__out, __fmt, __fmt + __len, __flags);
495 	      __last = __i->suffix();
496 	      if (__flags & regex_constants::format_first_only)
497 		break;
498 	    }
499 	  if (!(__flags & regex_constants::format_no_copy))
500 	    __out = std::copy(__last.first, __last.second, __out);
501 	}
502       return __out;
503     }
504 
505   template<typename _Bi_iter,
506 	   typename _Ch_type,
507 	   typename _Rx_traits>
508     bool
509     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
operator ==(const regex_iterator & __rhs) const510     operator==(const regex_iterator& __rhs) const
511     {
512       if (_M_pregex == nullptr && __rhs._M_pregex == nullptr)
513 	return true;
514       return _M_pregex == __rhs._M_pregex
515 	  && _M_begin == __rhs._M_begin
516 	  && _M_end == __rhs._M_end
517 	  && _M_flags == __rhs._M_flags
518 	  && _M_match[0] == __rhs._M_match[0];
519     }
520 
521   template<typename _Bi_iter,
522 	   typename _Ch_type,
523 	   typename _Rx_traits>
524     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
525     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
operator ++()526     operator++()
527     {
528       // In all cases in which the call to regex_search returns true,
529       // match.prefix().first shall be equal to the previous value of
530       // match[0].second, and for each index i in the half-open range
531       // [0, match.size()) for which match[i].matched is true,
532       // match[i].position() shall return distance(begin, match[i].first).
533       // [28.12.1.4.5]
534       if (_M_match[0].matched)
535 	{
536 	  auto __start = _M_match[0].second;
537 	  auto __prefix_first = _M_match[0].second;
538 	  if (_M_match[0].first == _M_match[0].second)
539 	    {
540 	      if (__start == _M_end)
541 		{
542 		  _M_pregex = nullptr;
543 		  return *this;
544 		}
545 	      else
546 		{
547 		  if (regex_search(__start, _M_end, _M_match, *_M_pregex,
548 				   _M_flags
549 				   | regex_constants::match_not_null
550 				   | regex_constants::match_continuous))
551 		    {
552 		      _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
553 		      auto& __prefix = _M_match._M_prefix();
554 		      __prefix.first = __prefix_first;
555 		      __prefix.matched = __prefix.first != __prefix.second;
556 		      // [28.12.1.4.5]
557 		      _M_match._M_begin = _M_begin;
558 		      return *this;
559 		    }
560 		  else
561 		    ++__start;
562 		}
563 	    }
564 	  _M_flags |= regex_constants::match_prev_avail;
565 	  if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags))
566 	    {
567 	      _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
568 	      auto& __prefix = _M_match._M_prefix();
569 	      __prefix.first = __prefix_first;
570 	      __prefix.matched = __prefix.first != __prefix.second;
571 	      // [28.12.1.4.5]
572 	      _M_match._M_begin = _M_begin;
573 	    }
574 	  else
575 	    _M_pregex = nullptr;
576 	}
577       return *this;
578     }
579 
580   template<typename _Bi_iter,
581 	   typename _Ch_type,
582 	   typename _Rx_traits>
583     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
584     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
operator =(const regex_token_iterator & __rhs)585     operator=(const regex_token_iterator& __rhs)
586     {
587       _M_position = __rhs._M_position;
588       _M_subs = __rhs._M_subs;
589       _M_n = __rhs._M_n;
590       _M_suffix = __rhs._M_suffix;
591       _M_has_m1 = __rhs._M_has_m1;
592       _M_normalize_result();
593       return *this;
594     }
595 
596   template<typename _Bi_iter,
597 	   typename _Ch_type,
598 	   typename _Rx_traits>
599     bool
600     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
operator ==(const regex_token_iterator & __rhs) const601     operator==(const regex_token_iterator& __rhs) const
602     {
603       if (_M_end_of_seq() && __rhs._M_end_of_seq())
604 	return true;
605       if (_M_suffix.matched && __rhs._M_suffix.matched
606 	  && _M_suffix == __rhs._M_suffix)
607 	return true;
608       if (_M_end_of_seq() || _M_suffix.matched
609 	  || __rhs._M_end_of_seq() || __rhs._M_suffix.matched)
610 	return false;
611       return _M_position == __rhs._M_position
612 	&& _M_n == __rhs._M_n
613 	&& _M_subs == __rhs._M_subs;
614     }
615 
616   template<typename _Bi_iter,
617 	   typename _Ch_type,
618 	   typename _Rx_traits>
619     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
620     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
operator ++()621     operator++()
622     {
623       _Position __prev = _M_position;
624       if (_M_suffix.matched)
625 	*this = regex_token_iterator();
626       else if (_M_n + 1 < _M_subs.size())
627 	{
628 	  _M_n++;
629 	  _M_result = &_M_current_match();
630 	}
631       else
632 	{
633 	  _M_n = 0;
634 	  ++_M_position;
635 	  if (_M_position != _Position())
636 	    _M_result = &_M_current_match();
637 	  else if (_M_has_m1 && __prev->suffix().length() != 0)
638 	    {
639 	      _M_suffix.matched = true;
640 	      _M_suffix.first = __prev->suffix().first;
641 	      _M_suffix.second = __prev->suffix().second;
642 	      _M_result = &_M_suffix;
643 	    }
644 	  else
645 	    *this = regex_token_iterator();
646 	}
647       return *this;
648     }
649 
650   template<typename _Bi_iter,
651 	   typename _Ch_type,
652 	   typename _Rx_traits>
653     void
654     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
_M_init(_Bi_iter __a,_Bi_iter __b)655     _M_init(_Bi_iter __a, _Bi_iter __b)
656     {
657       _M_has_m1 = false;
658       for (auto __it : _M_subs)
659 	if (__it == -1)
660 	  {
661 	    _M_has_m1 = true;
662 	    break;
663 	  }
664       if (_M_position != _Position())
665 	_M_result = &_M_current_match();
666       else if (_M_has_m1)
667 	{
668 	  _M_suffix.matched = true;
669 	  _M_suffix.first = __a;
670 	  _M_suffix.second = __b;
671 	  _M_result = &_M_suffix;
672 	}
673       else
674 	_M_result = nullptr;
675     }
676 
677 _GLIBCXX_END_NAMESPACE_VERSION
678 } // namespace
679 
680