1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2013-2015 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** 26 * @file bits/regex_scanner.tcc 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 // FIXME make comments doxygen format. 32 33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep 34 // and awk 35 // 1) grep is basic except '\n' is treated as '|' 36 // 2) egrep is extended except '\n' is treated as '|' 37 // 3) awk is extended except special escaping rules, and there's no 38 // back-reference. 39 // 40 // References: 41 // 42 // ECMAScript: ECMA-262 15.10 43 // 44 // basic, extended: 45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 46 // 47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 48 49 namespace std _GLIBCXX_VISIBILITY(default) 50 { 51 namespace __detail 52 { 53 _GLIBCXX_BEGIN_NAMESPACE_VERSION 54 55 template<typename _CharT> 56 _Scanner<_CharT>:: _Scanner(typename _Scanner::_IterT __begin,typename _Scanner::_IterT __end,_FlagT __flags,std::locale __loc)57 _Scanner(typename _Scanner::_IterT __begin, 58 typename _Scanner::_IterT __end, 59 _FlagT __flags, std::locale __loc) 60 : _ScannerBase(__flags), 61 _M_current(__begin), _M_end(__end), 62 _M_ctype(std::use_facet<_CtypeT>(__loc)), 63 _M_eat_escape(_M_is_ecma() 64 ? &_Scanner::_M_eat_escape_ecma 65 : &_Scanner::_M_eat_escape_posix) 66 { _M_advance(); } 67 68 template<typename _CharT> 69 void 70 _Scanner<_CharT>:: _M_advance()71 _M_advance() 72 { 73 if (_M_current == _M_end) 74 { 75 _M_token = _S_token_eof; 76 return; 77 } 78 79 if (_M_state == _S_state_normal) 80 _M_scan_normal(); 81 else if (_M_state == _S_state_in_bracket) 82 _M_scan_in_bracket(); 83 else if (_M_state == _S_state_in_brace) 84 _M_scan_in_brace(); 85 else 86 { 87 _GLIBCXX_DEBUG_ASSERT(false); 88 } 89 } 90 91 // Differences between styles: 92 // 1) "\(", "\)", "\{" in basic. It's not escaping. 93 // 2) "(?:", "(?=", "(?!" in ECMAScript. 94 template<typename _CharT> 95 void 96 _Scanner<_CharT>:: _M_scan_normal()97 _M_scan_normal() 98 { 99 auto __c = *_M_current++; 100 101 if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr) 102 { 103 _M_token = _S_token_ord_char; 104 _M_value.assign(1, __c); 105 return; 106 } 107 if (__c == '\\') 108 { 109 if (_M_current == _M_end) 110 __throw_regex_error(regex_constants::error_escape); 111 112 if (!_M_is_basic() 113 || (*_M_current != '(' 114 && *_M_current != ')' 115 && *_M_current != '{')) 116 { 117 (this->*_M_eat_escape)(); 118 return; 119 } 120 __c = *_M_current++; 121 } 122 if (__c == '(') 123 { 124 if (_M_is_ecma() && *_M_current == '?') 125 { 126 if (++_M_current == _M_end) 127 __throw_regex_error(regex_constants::error_paren); 128 129 if (*_M_current == ':') 130 { 131 ++_M_current; 132 _M_token = _S_token_subexpr_no_group_begin; 133 } 134 else if (*_M_current == '=') 135 { 136 ++_M_current; 137 _M_token = _S_token_subexpr_lookahead_begin; 138 _M_value.assign(1, 'p'); 139 } 140 else if (*_M_current == '!') 141 { 142 ++_M_current; 143 _M_token = _S_token_subexpr_lookahead_begin; 144 _M_value.assign(1, 'n'); 145 } 146 else 147 __throw_regex_error(regex_constants::error_paren); 148 } 149 else if (_M_flags & regex_constants::nosubs) 150 _M_token = _S_token_subexpr_no_group_begin; 151 else 152 _M_token = _S_token_subexpr_begin; 153 } 154 else if (__c == ')') 155 _M_token = _S_token_subexpr_end; 156 else if (__c == '[') 157 { 158 _M_state = _S_state_in_bracket; 159 _M_at_bracket_start = true; 160 if (_M_current != _M_end && *_M_current == '^') 161 { 162 _M_token = _S_token_bracket_neg_begin; 163 ++_M_current; 164 } 165 else 166 _M_token = _S_token_bracket_begin; 167 } 168 else if (__c == '{') 169 { 170 _M_state = _S_state_in_brace; 171 _M_token = _S_token_interval_begin; 172 } 173 else if (__c != ']' && __c != '}') 174 { 175 auto __it = _M_token_tbl; 176 auto __narrowc = _M_ctype.narrow(__c, '\0'); 177 for (; __it->first != '\0'; ++__it) 178 if (__it->first == __narrowc) 179 { 180 _M_token = __it->second; 181 return; 182 } 183 _GLIBCXX_DEBUG_ASSERT(false); 184 } 185 else 186 { 187 _M_token = _S_token_ord_char; 188 _M_value.assign(1, __c); 189 } 190 } 191 192 // Differences between styles: 193 // 1) different semantics of "[]" and "[^]". 194 // 2) Escaping in bracket expr. 195 template<typename _CharT> 196 void 197 _Scanner<_CharT>:: _M_scan_in_bracket()198 _M_scan_in_bracket() 199 { 200 if (_M_current == _M_end) 201 __throw_regex_error(regex_constants::error_brack); 202 203 auto __c = *_M_current++; 204 205 if (__c == '[') 206 { 207 if (_M_current == _M_end) 208 __throw_regex_error(regex_constants::error_brack); 209 210 if (*_M_current == '.') 211 { 212 _M_token = _S_token_collsymbol; 213 _M_eat_class(*_M_current++); 214 } 215 else if (*_M_current == ':') 216 { 217 _M_token = _S_token_char_class_name; 218 _M_eat_class(*_M_current++); 219 } 220 else if (*_M_current == '=') 221 { 222 _M_token = _S_token_equiv_class_name; 223 _M_eat_class(*_M_current++); 224 } 225 else 226 { 227 _M_token = _S_token_ord_char; 228 _M_value.assign(1, __c); 229 } 230 } 231 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 232 // literally. So "[]]" and "[^]]" are valid regexes. See the testcases 233 // `*/empty_range.cc`. 234 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 235 { 236 _M_token = _S_token_bracket_end; 237 _M_state = _S_state_normal; 238 } 239 // ECMAScript and awk permits escaping in bracket. 240 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 241 (this->*_M_eat_escape)(); 242 else 243 { 244 _M_token = _S_token_ord_char; 245 _M_value.assign(1, __c); 246 } 247 _M_at_bracket_start = false; 248 } 249 250 // Differences between styles: 251 // 1) "\}" in basic style. 252 template<typename _CharT> 253 void 254 _Scanner<_CharT>:: _M_scan_in_brace()255 _M_scan_in_brace() 256 { 257 if (_M_current == _M_end) 258 __throw_regex_error(regex_constants::error_brace); 259 260 auto __c = *_M_current++; 261 262 if (_M_ctype.is(_CtypeT::digit, __c)) 263 { 264 _M_token = _S_token_dup_count; 265 _M_value.assign(1, __c); 266 while (_M_current != _M_end 267 && _M_ctype.is(_CtypeT::digit, *_M_current)) 268 _M_value += *_M_current++; 269 } 270 else if (__c == ',') 271 _M_token = _S_token_comma; 272 // basic use \}. 273 else if (_M_is_basic()) 274 { 275 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 276 { 277 _M_state = _S_state_normal; 278 _M_token = _S_token_interval_end; 279 ++_M_current; 280 } 281 else 282 __throw_regex_error(regex_constants::error_badbrace); 283 } 284 else if (__c == '}') 285 { 286 _M_state = _S_state_normal; 287 _M_token = _S_token_interval_end; 288 } 289 else 290 __throw_regex_error(regex_constants::error_badbrace); 291 } 292 293 template<typename _CharT> 294 void 295 _Scanner<_CharT>:: _M_eat_escape_ecma()296 _M_eat_escape_ecma() 297 { 298 if (_M_current == _M_end) 299 __throw_regex_error(regex_constants::error_escape); 300 301 auto __c = *_M_current++; 302 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 303 304 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 305 { 306 _M_token = _S_token_ord_char; 307 _M_value.assign(1, *__pos); 308 } 309 else if (__c == 'b') 310 { 311 _M_token = _S_token_word_bound; 312 _M_value.assign(1, 'p'); 313 } 314 else if (__c == 'B') 315 { 316 _M_token = _S_token_word_bound; 317 _M_value.assign(1, 'n'); 318 } 319 // N3376 28.13 320 else if (__c == 'd' 321 || __c == 'D' 322 || __c == 's' 323 || __c == 'S' 324 || __c == 'w' 325 || __c == 'W') 326 { 327 _M_token = _S_token_quoted_class; 328 _M_value.assign(1, __c); 329 } 330 else if (__c == 'c') 331 { 332 if (_M_current == _M_end) 333 __throw_regex_error(regex_constants::error_escape); 334 _M_token = _S_token_ord_char; 335 _M_value.assign(1, *_M_current++); 336 } 337 else if (__c == 'x' || __c == 'u') 338 { 339 _M_value.erase(); 340 for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++) 341 { 342 if (_M_current == _M_end 343 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 344 __throw_regex_error(regex_constants::error_escape); 345 _M_value += *_M_current++; 346 } 347 _M_token = _S_token_hex_num; 348 } 349 // ECMAScript recognizes multi-digit back-references. 350 else if (_M_ctype.is(_CtypeT::digit, __c)) 351 { 352 _M_value.assign(1, __c); 353 while (_M_current != _M_end 354 && _M_ctype.is(_CtypeT::digit, *_M_current)) 355 _M_value += *_M_current++; 356 _M_token = _S_token_backref; 357 } 358 else 359 { 360 _M_token = _S_token_ord_char; 361 _M_value.assign(1, __c); 362 } 363 } 364 365 // Differences between styles: 366 // 1) Extended doesn't support backref, but basic does. 367 template<typename _CharT> 368 void 369 _Scanner<_CharT>:: _M_eat_escape_posix()370 _M_eat_escape_posix() 371 { 372 if (_M_current == _M_end) 373 __throw_regex_error(regex_constants::error_escape); 374 375 auto __c = *_M_current; 376 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 377 378 if (__pos != nullptr && *__pos != '\0') 379 { 380 _M_token = _S_token_ord_char; 381 _M_value.assign(1, __c); 382 } 383 // We MUST judge awk before handling backrefs. There's no backref in awk. 384 else if (_M_is_awk()) 385 { 386 _M_eat_escape_awk(); 387 return; 388 } 389 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 390 { 391 _M_token = _S_token_backref; 392 _M_value.assign(1, __c); 393 } 394 else 395 { 396 #ifdef __STRICT_ANSI__ 397 // POSIX says it is undefined to escape ordinary characters 398 __throw_regex_error(regex_constants::error_escape); 399 #else 400 _M_token = _S_token_ord_char; 401 _M_value.assign(1, __c); 402 #endif 403 } 404 ++_M_current; 405 } 406 407 template<typename _CharT> 408 void 409 _Scanner<_CharT>:: _M_eat_escape_awk()410 _M_eat_escape_awk() 411 { 412 auto __c = *_M_current++; 413 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 414 415 if (__pos != nullptr) 416 { 417 _M_token = _S_token_ord_char; 418 _M_value.assign(1, *__pos); 419 } 420 // \ddd for oct representation 421 else if (_M_ctype.is(_CtypeT::digit, __c) 422 && __c != '8' 423 && __c != '9') 424 { 425 _M_value.assign(1, __c); 426 for (int __i = 0; 427 __i < 2 428 && _M_current != _M_end 429 && _M_ctype.is(_CtypeT::digit, *_M_current) 430 && *_M_current != '8' 431 && *_M_current != '9'; 432 __i++) 433 _M_value += *_M_current++; 434 _M_token = _S_token_oct_num; 435 return; 436 } 437 else 438 __throw_regex_error(regex_constants::error_escape); 439 } 440 441 // Eats a character class or throws an exception. 442 // __ch could be ':', '.' or '=', _M_current is the char after ']' when 443 // returning. 444 template<typename _CharT> 445 void 446 _Scanner<_CharT>:: _M_eat_class(char __ch)447 _M_eat_class(char __ch) 448 { 449 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 450 _M_value += *_M_current++; 451 if (_M_current == _M_end 452 || *_M_current++ != __ch 453 || _M_current == _M_end // skip __ch 454 || *_M_current++ != ']') // skip ']' 455 { 456 if (__ch == ':') 457 __throw_regex_error(regex_constants::error_ctype); 458 else 459 __throw_regex_error(regex_constants::error_collate); 460 } 461 } 462 463 #ifdef _GLIBCXX_DEBUG 464 template<typename _CharT> 465 std::ostream& 466 _Scanner<_CharT>:: _M_print(std::ostream & ostr)467 _M_print(std::ostream& ostr) 468 { 469 switch (_M_token) 470 { 471 case _S_token_anychar: 472 ostr << "any-character\n"; 473 break; 474 case _S_token_backref: 475 ostr << "backref\n"; 476 break; 477 case _S_token_bracket_begin: 478 ostr << "bracket-begin\n"; 479 break; 480 case _S_token_bracket_neg_begin: 481 ostr << "bracket-neg-begin\n"; 482 break; 483 case _S_token_bracket_end: 484 ostr << "bracket-end\n"; 485 break; 486 case _S_token_char_class_name: 487 ostr << "char-class-name \"" << _M_value << "\"\n"; 488 break; 489 case _S_token_closure0: 490 ostr << "closure0\n"; 491 break; 492 case _S_token_closure1: 493 ostr << "closure1\n"; 494 break; 495 case _S_token_collsymbol: 496 ostr << "collsymbol \"" << _M_value << "\"\n"; 497 break; 498 case _S_token_comma: 499 ostr << "comma\n"; 500 break; 501 case _S_token_dup_count: 502 ostr << "dup count: " << _M_value << "\n"; 503 break; 504 case _S_token_eof: 505 ostr << "EOF\n"; 506 break; 507 case _S_token_equiv_class_name: 508 ostr << "equiv-class-name \"" << _M_value << "\"\n"; 509 break; 510 case _S_token_interval_begin: 511 ostr << "interval begin\n"; 512 break; 513 case _S_token_interval_end: 514 ostr << "interval end\n"; 515 break; 516 case _S_token_line_begin: 517 ostr << "line begin\n"; 518 break; 519 case _S_token_line_end: 520 ostr << "line end\n"; 521 break; 522 case _S_token_opt: 523 ostr << "opt\n"; 524 break; 525 case _S_token_or: 526 ostr << "or\n"; 527 break; 528 case _S_token_ord_char: 529 ostr << "ordinary character: \"" << _M_value << "\"\n"; 530 break; 531 case _S_token_subexpr_begin: 532 ostr << "subexpr begin\n"; 533 break; 534 case _S_token_subexpr_no_group_begin: 535 ostr << "no grouping subexpr begin\n"; 536 break; 537 case _S_token_subexpr_lookahead_begin: 538 ostr << "lookahead subexpr begin\n"; 539 break; 540 case _S_token_subexpr_end: 541 ostr << "subexpr end\n"; 542 break; 543 case _S_token_unknown: 544 ostr << "-- unknown token --\n"; 545 break; 546 case _S_token_oct_num: 547 ostr << "oct number " << _M_value << "\n"; 548 break; 549 case _S_token_hex_num: 550 ostr << "hex number " << _M_value << "\n"; 551 break; 552 case _S_token_quoted_class: 553 ostr << "quoted class " << "\\" << _M_value << "\n"; 554 break; 555 default: 556 _GLIBCXX_DEBUG_ASSERT(false); 557 } 558 return ostr; 559 } 560 #endif 561 562 _GLIBCXX_END_NAMESPACE_VERSION 563 } // namespace __detail 564 } // namespace 565