1 // Locale support (codecvt) -*- C++ -*-
2
3 // Copyright (C) 2015-2017 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24
25 #include <codecvt>
26 #include <cstring> // std::memcpy, std::memcmp
27 #include <bits/stl_algobase.h> // std::min
28
29 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
30 namespace std _GLIBCXX_VISIBILITY(default)
31 {
32 _GLIBCXX_BEGIN_NAMESPACE_VERSION
33
34 // The standard doesn't define these operators, which is annoying.
35 static underlying_type<codecvt_mode>::type
to_integer(codecvt_mode m)36 to_integer(codecvt_mode m)
37 { return static_cast<underlying_type<codecvt_mode>::type>(m); }
38
operator &=(codecvt_mode & m,codecvt_mode n)39 static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
40 { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
41
operator |=(codecvt_mode & m,codecvt_mode n)42 static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
43 { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
44
operator ~(codecvt_mode m)45 static codecvt_mode operator~(codecvt_mode m)
46 { return codecvt_mode(~to_integer(m)); }
47
48 namespace
49 {
// Largest code point that fits in a single UTF-16 code unit.
const char32_t max_single_utf16_unit = 0xFFFF;

// Upper bound used as the default maxcode by the conversion functions.
const char32_t max_code_point = 0x10FFFF;

// Sentinel values returned by the read_*_code_point functions.
// The functions below rely on maxcode < incomplete_mb_character
// (which is enforced by the codecvt_utf* classes on construction),
// so both sentinels always compare greater than maxcode.
const char32_t incomplete_mb_character = static_cast<char32_t>(-2);
const char32_t invalid_mb_sequence = static_cast<char32_t>(-1);
59
// Cursor over the half-open range [next, end) of code units of type Elem.
// Reading and writing go through `next`; `end` is never modified.
// None of the members check bounds, callers consult size() first.
template<typename Elem, bool Aligned = true>
  struct range
  {
    Elem* next;
    Elem* end;

    // Store one code unit at the cursor and step past it.
    range& operator=(Elem e)
    {
      *next = e;
      ++next;
      return *this;
    }

    // The code unit at the cursor.
    Elem operator*() const { return next[0]; }

    // The code unit N positions after the cursor.
    Elem operator[](size_t n) const { return *(next + n); }

    // Step the cursor forward by one code unit.
    range& operator++()
    {
      next += 1;
      return *this;
    }

    // Step the cursor forward by N code units.
    range& operator+=(size_t n)
    {
      next = next + n;
      return *this;
    }

    // How many code units are left between the cursor and the end.
    size_t size() const
    { return static_cast<size_t>(end - next); }

    // How many bytes are left between the cursor and the end.
    size_t nbytes() const
    {
      const char* const last = (const char*)end;
      const char* const first = (const char*)next;
      return last - first;
    }
  };
101
102 // This specialization is used when accessing char16_t values through
103 // pointers to char, which might not be correctly aligned for char16_t.
104 template<typename Elem>
105 struct range<Elem, false>
106 {
107 using value_type = typename remove_const<Elem>::type;
108
109 using char_pointer = typename
110 conditional<is_const<Elem>::value, const char*, char*>::type;
111
112 char_pointer next;
113 char_pointer end;
114
115 // Write a code unit.
operator =std::__anon1c46682b0111::range116 range& operator=(Elem e)
117 {
118 memcpy(next, &e, sizeof(Elem));
119 ++*this;
120 return *this;
121 }
122
123 // Read the next code unit.
operator *std::__anon1c46682b0111::range124 Elem operator*() const
125 {
126 value_type e;
127 memcpy(&e, next, sizeof(Elem));
128 return e;
129 }
130
131 // Read the Nth code unit.
operator []std::__anon1c46682b0111::range132 Elem operator[](size_t n) const
133 {
134 value_type e;
135 memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
136 return e;
137 }
138
139 // Move to the next code unit.
operator ++std::__anon1c46682b0111::range140 range& operator++()
141 {
142 next += sizeof(Elem);
143 return *this;
144 }
145
146 // Move to the Nth code unit.
operator +=std::__anon1c46682b0111::range147 range& operator+=(size_t n)
148 {
149 next += n * sizeof(Elem);
150 return *this;
151 }
152
153 // The number of code units remaining.
sizestd::__anon1c46682b0111::range154 size_t size() const { return nbytes() / sizeof(Elem); }
155
156 // The number of bytes remaining.
nbytesstd::__anon1c46682b0111::range157 size_t nbytes() const { return end - next; }
158 };
159
// A multibyte sequence can start with a "header" consisting of a
// Byte Order Mark.  These are the BOM byte patterns for UTF-8,
// big-endian UTF-16 and little-endian UTF-16 respectively.
const unsigned char utf8_bom[] = { 0xEF, 0xBB, 0xBF };
const unsigned char utf16_bom[] = { 0xFE, 0xFF };
const unsigned char utf16le_bom[] = { 0xFF, 0xFE };
164
165 // Write a BOM (space permitting).
166 template<typename C, bool A, size_t N>
167 bool
write_bom(range<C,A> & to,const unsigned char (& bom)[N])168 write_bom(range<C, A>& to, const unsigned char (&bom)[N])
169 {
170 static_assert( (N / sizeof(C)) != 0, "" );
171 static_assert( (N % sizeof(C)) == 0, "" );
172
173 if (to.nbytes() < N)
174 return false;
175 memcpy(to.next, bom, N);
176 to += (N / sizeof(C));
177 return true;
178 }
179
180 // Try to read a BOM.
181 template<typename C, bool A, size_t N>
182 bool
read_bom(range<C,A> & from,const unsigned char (& bom)[N])183 read_bom(range<C, A>& from, const unsigned char (&bom)[N])
184 {
185 static_assert( (N / sizeof(C)) != 0, "" );
186 static_assert( (N % sizeof(C)) == 0, "" );
187
188 if (from.nbytes() >= N && !memcmp(from.next, bom, N))
189 {
190 from += (N / sizeof(C));
191 return true;
192 }
193 return false;
194 }
195
196 // If generate_header is set in mode write out UTF-8 BOM.
197 bool
write_utf8_bom(range<char> & to,codecvt_mode mode)198 write_utf8_bom(range<char>& to, codecvt_mode mode)
199 {
200 if (mode & generate_header)
201 return write_bom(to, utf8_bom);
202 return true;
203 }
204
205 // If generate_header is set in mode write out the UTF-16 BOM indicated
206 // by whether little_endian is set in mode.
207 template<bool Aligned>
208 bool
write_utf16_bom(range<char16_t,Aligned> & to,codecvt_mode mode)209 write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
210 {
211 if (mode & generate_header)
212 {
213 if (mode & little_endian)
214 return write_bom(to, utf16le_bom);
215 else
216 return write_bom(to, utf16_bom);
217 }
218 return true;
219 }
220
221 // If consume_header is set in mode update from.next to after any BOM.
222 void
read_utf8_bom(range<const char> & from,codecvt_mode mode)223 read_utf8_bom(range<const char>& from, codecvt_mode mode)
224 {
225 if (mode & consume_header)
226 read_bom(from, utf8_bom);
227 }
228
229 // If consume_header is not set in mode, no effects.
230 // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
231 // - if the UTF-16BE BOM was found unset little_endian in mode, or
232 // - if the UTF-16LE BOM was found set little_endian in mode.
233 template<bool Aligned>
234 void
read_utf16_bom(range<const char16_t,Aligned> & from,codecvt_mode & mode)235 read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
236 {
237 if (mode & consume_header)
238 {
239 if (read_bom(from, utf16_bom))
240 mode &= ~little_endian;
241 else if (read_bom(from, utf16le_bom))
242 mode |= little_endian;
243 }
244 }
245
246 // Read a codepoint from a UTF-8 multibyte sequence.
247 // Updates from.next if the codepoint is not greater than maxcode.
248 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
249 char32_t
read_utf8_code_point(range<const char> & from,unsigned long maxcode)250 read_utf8_code_point(range<const char>& from, unsigned long maxcode)
251 {
252 const size_t avail = from.size();
253 if (avail == 0)
254 return incomplete_mb_character;
255 unsigned char c1 = from[0];
256 // https://en.wikipedia.org/wiki/UTF-8#Sample_code
257 if (c1 < 0x80)
258 {
259 ++from;
260 return c1;
261 }
262 else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
263 return invalid_mb_sequence;
264 else if (c1 < 0xE0) // 2-byte sequence
265 {
266 if (avail < 2)
267 return incomplete_mb_character;
268 unsigned char c2 = from[1];
269 if ((c2 & 0xC0) != 0x80)
270 return invalid_mb_sequence;
271 char32_t c = (c1 << 6) + c2 - 0x3080;
272 if (c <= maxcode)
273 from += 2;
274 return c;
275 }
276 else if (c1 < 0xF0) // 3-byte sequence
277 {
278 if (avail < 3)
279 return incomplete_mb_character;
280 unsigned char c2 = from[1];
281 if ((c2 & 0xC0) != 0x80)
282 return invalid_mb_sequence;
283 if (c1 == 0xE0 && c2 < 0xA0) // overlong
284 return invalid_mb_sequence;
285 unsigned char c3 = from[2];
286 if ((c3 & 0xC0) != 0x80)
287 return invalid_mb_sequence;
288 char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
289 if (c <= maxcode)
290 from += 3;
291 return c;
292 }
293 else if (c1 < 0xF5) // 4-byte sequence
294 {
295 if (avail < 4)
296 return incomplete_mb_character;
297 unsigned char c2 = from[1];
298 if ((c2 & 0xC0) != 0x80)
299 return invalid_mb_sequence;
300 if (c1 == 0xF0 && c2 < 0x90) // overlong
301 return invalid_mb_sequence;
302 if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
303 return invalid_mb_sequence;
304 unsigned char c3 = from[2];
305 if ((c3 & 0xC0) != 0x80)
306 return invalid_mb_sequence;
307 unsigned char c4 = from[3];
308 if ((c4 & 0xC0) != 0x80)
309 return invalid_mb_sequence;
310 char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
311 if (c <= maxcode)
312 from += 4;
313 return c;
314 }
315 else // > U+10FFFF
316 return invalid_mb_sequence;
317 }
318
319 bool
write_utf8_code_point(range<char> & to,char32_t code_point)320 write_utf8_code_point(range<char>& to, char32_t code_point)
321 {
322 if (code_point < 0x80)
323 {
324 if (to.size() < 1)
325 return false;
326 to = code_point;
327 }
328 else if (code_point <= 0x7FF)
329 {
330 if (to.size() < 2)
331 return false;
332 to = (code_point >> 6) + 0xC0;
333 to = (code_point & 0x3F) + 0x80;
334 }
335 else if (code_point <= 0xFFFF)
336 {
337 if (to.size() < 3)
338 return false;
339 to = (code_point >> 12) + 0xE0;
340 to = ((code_point >> 6) & 0x3F) + 0x80;
341 to = (code_point & 0x3F) + 0x80;
342 }
343 else if (code_point <= 0x10FFFF)
344 {
345 if (to.size() < 4)
346 return false;
347 to = (code_point >> 18) + 0xF0;
348 to = ((code_point >> 12) & 0x3F) + 0x80;
349 to = ((code_point >> 6) & 0x3F) + 0x80;
350 to = (code_point & 0x3F) + 0x80;
351 }
352 else
353 return false;
354 return true;
355 }
356
357 inline char16_t
adjust_byte_order(char16_t c,codecvt_mode mode)358 adjust_byte_order(char16_t c, codecvt_mode mode)
359 {
360 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
361 return (mode & little_endian) ? __builtin_bswap16(c) : c;
362 #else
363 return (mode & little_endian) ? c : __builtin_bswap16(c);
364 #endif
365 }
366
// Return true if c is a high-surrogate (aka leading) code point,
// i.e. lies in [0xD800, 0xDBFF].
inline bool
is_high_surrogate(char32_t c)
{
  // All values in that range, and only those, share the prefix 0x36
  // once the low ten bits are dropped.
  return (c >> 10) == 0x36;
}
373
// Return true if c is a low-surrogate (aka trailing) code point,
// i.e. lies in [0xDC00, 0xDFFF].
inline bool
is_low_surrogate(char32_t c)
{
  // All values in that range, and only those, share the prefix 0x37
  // once the low ten bits are dropped.
  return (c >> 10) == 0x37;
}
380
// Combine a high and a low surrogate into the code point they encode.
// The arguments are not validated here.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  const char32_t upper_bits = (high - 0xD800) << 10;
  const char32_t lower_bits = low - 0xDC00;
  return 0x10000 + upper_bits + lower_bits;
}
386
387 // Read a codepoint from a UTF-16 multibyte sequence.
388 // The sequence's endianness is indicated by (mode & little_endian).
389 // Updates from.next if the codepoint is not greater than maxcode.
390 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
391 template<bool Aligned>
392 char32_t
read_utf16_code_point(range<const char16_t,Aligned> & from,unsigned long maxcode,codecvt_mode mode)393 read_utf16_code_point(range<const char16_t, Aligned>& from,
394 unsigned long maxcode, codecvt_mode mode)
395 {
396 const size_t avail = from.size();
397 if (avail == 0)
398 return incomplete_mb_character;
399 int inc = 1;
400 char32_t c = adjust_byte_order(from[0], mode);
401 if (is_high_surrogate(c))
402 {
403 if (avail < 2)
404 return incomplete_mb_character;
405 const char16_t c2 = adjust_byte_order(from[1], mode);
406 if (is_low_surrogate(c2))
407 {
408 c = surrogate_pair_to_code_point(c, c2);
409 inc = 2;
410 }
411 else
412 return invalid_mb_sequence;
413 }
414 else if (is_low_surrogate(c))
415 return invalid_mb_sequence;
416 if (c <= maxcode)
417 from += inc;
418 return c;
419 }
420
421 template<typename C, bool A>
422 bool
write_utf16_code_point(range<C,A> & to,char32_t codepoint,codecvt_mode mode)423 write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
424 {
425 static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
426
427 if (codepoint <= max_single_utf16_unit)
428 {
429 if (to.size() > 0)
430 {
431 to = adjust_byte_order(codepoint, mode);
432 return true;
433 }
434 }
435 else if (to.size() > 1)
436 {
437 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
438 const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
439 char16_t lead = LEAD_OFFSET + (codepoint >> 10);
440 char16_t trail = 0xDC00 + (codepoint & 0x3FF);
441 to = adjust_byte_order(lead, mode);
442 to = adjust_byte_order(trail, mode);
443 return true;
444 }
445 return false;
446 }
447
448 // utf8 -> ucs4
449 codecvt_base::result
ucs4_in(range<const char> & from,range<char32_t> & to,unsigned long maxcode=max_code_point,codecvt_mode mode={})450 ucs4_in(range<const char>& from, range<char32_t>& to,
451 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
452 {
453 read_utf8_bom(from, mode);
454 while (from.size() && to.size())
455 {
456 const char32_t codepoint = read_utf8_code_point(from, maxcode);
457 if (codepoint == incomplete_mb_character)
458 return codecvt_base::partial;
459 if (codepoint > maxcode)
460 return codecvt_base::error;
461 to = codepoint;
462 }
463 return from.size() ? codecvt_base::partial : codecvt_base::ok;
464 }
465
466 // ucs4 -> utf8
467 codecvt_base::result
ucs4_out(range<const char32_t> & from,range<char> & to,unsigned long maxcode=max_code_point,codecvt_mode mode={})468 ucs4_out(range<const char32_t>& from, range<char>& to,
469 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
470 {
471 if (!write_utf8_bom(to, mode))
472 return codecvt_base::partial;
473 while (from.size())
474 {
475 const char32_t c = from[0];
476 if (c > maxcode)
477 return codecvt_base::error;
478 if (!write_utf8_code_point(to, c))
479 return codecvt_base::partial;
480 ++from;
481 }
482 return codecvt_base::ok;
483 }
484
485 // utf16 -> ucs4
486 codecvt_base::result
ucs4_in(range<const char16_t,false> & from,range<char32_t> & to,unsigned long maxcode=max_code_point,codecvt_mode mode={})487 ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
488 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
489 {
490 read_utf16_bom(from, mode);
491 while (from.size() && to.size())
492 {
493 const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
494 if (codepoint == incomplete_mb_character)
495 return codecvt_base::partial;
496 if (codepoint > maxcode)
497 return codecvt_base::error;
498 to = codepoint;
499 }
500 return from.size() ? codecvt_base::partial : codecvt_base::ok;
501 }
502
503 // ucs4 -> utf16
504 codecvt_base::result
ucs4_out(range<const char32_t> & from,range<char16_t,false> & to,unsigned long maxcode=max_code_point,codecvt_mode mode={})505 ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
506 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
507 {
508 if (!write_utf16_bom(to, mode))
509 return codecvt_base::partial;
510 while (from.size())
511 {
512 const char32_t c = from[0];
513 if (c > maxcode)
514 return codecvt_base::error;
515 if (!write_utf16_code_point(to, c, mode))
516 return codecvt_base::partial;
517 ++from;
518 }
519 return codecvt_base::ok;
520 }
521
// Flag indicating whether to process UTF-16 (surrogate pairs allowed)
// or UCS2 (surrogate pairs disallowed).
enum class surrogates
{
  allowed,
  disallowed
};
524
525 // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
526 template<typename C>
527 codecvt_base::result
utf16_in(range<const char> & from,range<C> & to,unsigned long maxcode=max_code_point,codecvt_mode mode={},surrogates s=surrogates::allowed)528 utf16_in(range<const char>& from, range<C>& to,
529 unsigned long maxcode = max_code_point, codecvt_mode mode = {},
530 surrogates s = surrogates::allowed)
531 {
532 read_utf8_bom(from, mode);
533 while (from.size() && to.size())
534 {
535 auto orig = from;
536 const char32_t codepoint = read_utf8_code_point(from, maxcode);
537 if (codepoint == incomplete_mb_character)
538 {
539 if (s == surrogates::allowed)
540 return codecvt_base::partial;
541 else
542 return codecvt_base::error; // No surrogates in UCS2
543 }
544 if (codepoint > maxcode)
545 return codecvt_base::error;
546 if (!write_utf16_code_point(to, codepoint, mode))
547 {
548 from = orig; // rewind to previous position
549 return codecvt_base::partial;
550 }
551 }
552 return codecvt_base::ok;
553 }
554
555 // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
556 template<typename C>
557 codecvt_base::result
utf16_out(range<const C> & from,range<char> & to,unsigned long maxcode=max_code_point,codecvt_mode mode={},surrogates s=surrogates::allowed)558 utf16_out(range<const C>& from, range<char>& to,
559 unsigned long maxcode = max_code_point, codecvt_mode mode = {},
560 surrogates s = surrogates::allowed)
561 {
562 if (!write_utf8_bom(to, mode))
563 return codecvt_base::partial;
564 while (from.size())
565 {
566 char32_t c = from[0];
567 int inc = 1;
568 if (is_high_surrogate(c))
569 {
570 if (s == surrogates::disallowed)
571 return codecvt_base::error; // No surrogates in UCS-2
572
573 if (from.size() < 2)
574 return codecvt_base::ok; // stop converting at this point
575
576 const char32_t c2 = from[1];
577 if (is_low_surrogate(c2))
578 {
579 c = surrogate_pair_to_code_point(c, c2);
580 inc = 2;
581 }
582 else
583 return codecvt_base::error;
584 }
585 else if (is_low_surrogate(c))
586 return codecvt_base::error;
587 if (c > maxcode)
588 return codecvt_base::error;
589 if (!write_utf8_code_point(to, c))
590 return codecvt_base::partial;
591 from += inc;
592 }
593 return codecvt_base::ok;
594 }
595
596 // return pos such that [begin,pos) is valid UTF-16 string no longer than max
597 const char*
utf16_span(const char * begin,const char * end,size_t max,char32_t maxcode=max_code_point,codecvt_mode mode={})598 utf16_span(const char* begin, const char* end, size_t max,
599 char32_t maxcode = max_code_point, codecvt_mode mode = {})
600 {
601 range<const char> from{ begin, end };
602 read_utf8_bom(from, mode);
603 size_t count = 0;
604 while (count+1 < max)
605 {
606 char32_t c = read_utf8_code_point(from, maxcode);
607 if (c > maxcode)
608 return from.next;
609 else if (c > max_single_utf16_unit)
610 ++count;
611 ++count;
612 }
613 if (count+1 == max) // take one more character if it fits in a single unit
614 read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
615 return from.next;
616 }
617
618 // utf8 -> ucs2
619 codecvt_base::result
ucs2_in(range<const char> & from,range<char16_t> & to,char32_t maxcode=max_code_point,codecvt_mode mode={})620 ucs2_in(range<const char>& from, range<char16_t>& to,
621 char32_t maxcode = max_code_point, codecvt_mode mode = {})
622 {
623 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
624 maxcode = std::min(max_single_utf16_unit, maxcode);
625 return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
626 }
627
628 // ucs2 -> utf8
629 codecvt_base::result
ucs2_out(range<const char16_t> & from,range<char> & to,char32_t maxcode=max_code_point,codecvt_mode mode={})630 ucs2_out(range<const char16_t>& from, range<char>& to,
631 char32_t maxcode = max_code_point, codecvt_mode mode = {})
632 {
633 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
634 maxcode = std::min(max_single_utf16_unit, maxcode);
635 return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
636 }
637
638 // ucs2 -> utf16
639 codecvt_base::result
ucs2_out(range<const char16_t> & from,range<char16_t,false> & to,char32_t maxcode=max_code_point,codecvt_mode mode={})640 ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
641 char32_t maxcode = max_code_point, codecvt_mode mode = {})
642 {
643 if (!write_utf16_bom(to, mode))
644 return codecvt_base::partial;
645 while (from.size() && to.size())
646 {
647 char16_t c = from[0];
648 if (is_high_surrogate(c))
649 return codecvt_base::error;
650 if (c > maxcode)
651 return codecvt_base::error;
652 to = adjust_byte_order(c, mode);
653 ++from;
654 }
655 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
656 }
657
658 // utf16 -> ucs2
659 codecvt_base::result
ucs2_in(range<const char16_t,false> & from,range<char16_t> & to,char32_t maxcode=max_code_point,codecvt_mode mode={})660 ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
661 char32_t maxcode = max_code_point, codecvt_mode mode = {})
662 {
663 read_utf16_bom(from, mode);
664 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
665 maxcode = std::min(max_single_utf16_unit, maxcode);
666 while (from.size() && to.size())
667 {
668 const char32_t c = read_utf16_code_point(from, maxcode, mode);
669 if (c == incomplete_mb_character)
670 return codecvt_base::error; // UCS-2 only supports single units.
671 if (c > maxcode)
672 return codecvt_base::error;
673 to = c;
674 }
675 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
676 }
677
678 const char16_t*
ucs2_span(range<const char16_t,false> & from,size_t max,char32_t maxcode,codecvt_mode mode)679 ucs2_span(range<const char16_t, false>& from, size_t max,
680 char32_t maxcode, codecvt_mode mode)
681 {
682 read_utf16_bom(from, mode);
683 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
684 maxcode = std::min(max_single_utf16_unit, maxcode);
685 char32_t c = 0;
686 while (max-- && c <= maxcode)
687 c = read_utf16_code_point(from, maxcode, mode);
688 return reinterpret_cast<const char16_t*>(from.next);
689 }
690
691 const char*
ucs2_span(const char * begin,const char * end,size_t max,char32_t maxcode,codecvt_mode mode)692 ucs2_span(const char* begin, const char* end, size_t max,
693 char32_t maxcode, codecvt_mode mode)
694 {
695 range<const char> from{ begin, end };
696 read_utf8_bom(from, mode);
697 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
698 maxcode = std::min(max_single_utf16_unit, maxcode);
699 char32_t c = 0;
700 while (max-- && c <= maxcode)
701 c = read_utf8_code_point(from, maxcode);
702 return from.next;
703 }
704
705 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
706 const char*
ucs4_span(const char * begin,const char * end,size_t max,char32_t maxcode=max_code_point,codecvt_mode mode={})707 ucs4_span(const char* begin, const char* end, size_t max,
708 char32_t maxcode = max_code_point, codecvt_mode mode = {})
709 {
710 range<const char> from{ begin, end };
711 read_utf8_bom(from, mode);
712 char32_t c = 0;
713 while (max-- && c <= maxcode)
714 c = read_utf8_code_point(from, maxcode);
715 return from.next;
716 }
717
718 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
719 const char16_t*
ucs4_span(range<const char16_t,false> & from,size_t max,char32_t maxcode=max_code_point,codecvt_mode mode={})720 ucs4_span(range<const char16_t, false>& from, size_t max,
721 char32_t maxcode = max_code_point, codecvt_mode mode = {})
722 {
723 read_utf16_bom(from, mode);
724 char32_t c = 0;
725 while (max-- && c <= maxcode)
726 c = read_utf16_code_point(from, maxcode, mode);
727 return reinterpret_cast<const char16_t*>(from.next);
728 }
729 }
730
731 // Define members of codecvt<char16_t, char, mbstate_t> specialization.
732 // Converts from UTF-8 to UTF-16.
733
734 locale::id codecvt<char16_t, char, mbstate_t>::id;
735
~codecvt()736 codecvt<char16_t, char, mbstate_t>::~codecvt() { }
737
738 codecvt_base::result
739 codecvt<char16_t, char, mbstate_t>::
do_out(state_type &,const intern_type * __from,const intern_type * __from_end,const intern_type * & __from_next,extern_type * __to,extern_type * __to_end,extern_type * & __to_next) const740 do_out(state_type&,
741 const intern_type* __from,
742 const intern_type* __from_end, const intern_type*& __from_next,
743 extern_type* __to, extern_type* __to_end,
744 extern_type*& __to_next) const
745 {
746 range<const char16_t> from{ __from, __from_end };
747 range<char> to{ __to, __to_end };
748 auto res = utf16_out(from, to);
749 __from_next = from.next;
750 __to_next = to.next;
751 return res;
752 }
753
754 codecvt_base::result
755 codecvt<char16_t, char, mbstate_t>::
do_unshift(state_type &,extern_type * __to,extern_type *,extern_type * & __to_next) const756 do_unshift(state_type&, extern_type* __to, extern_type*,
757 extern_type*& __to_next) const
758 {
759 __to_next = __to;
760 return noconv; // we don't use mbstate_t for the unicode facets
761 }
762
763 codecvt_base::result
764 codecvt<char16_t, char, mbstate_t>::
do_in(state_type &,const extern_type * __from,const extern_type * __from_end,const extern_type * & __from_next,intern_type * __to,intern_type * __to_end,intern_type * & __to_next) const765 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
766 const extern_type*& __from_next,
767 intern_type* __to, intern_type* __to_end,
768 intern_type*& __to_next) const
769 {
770 range<const char> from{ __from, __from_end };
771 range<char16_t> to{ __to, __to_end };
772 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
773 codecvt_mode mode = {};
774 #else
775 codecvt_mode mode = little_endian;
776 #endif
777 auto res = utf16_in(from, to, max_code_point, mode);
778 __from_next = from.next;
779 __to_next = to.next;
780 return res;
781 }
782
783 int
do_encoding() const784 codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
785 { return 0; } // UTF-8 is not a fixed-width encoding
786
787 bool
do_always_noconv() const788 codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
789 { return false; }
790
791 int
792 codecvt<char16_t, char, mbstate_t>::
do_length(state_type &,const extern_type * __from,const extern_type * __end,size_t __max) const793 do_length(state_type&, const extern_type* __from,
794 const extern_type* __end, size_t __max) const
795 {
796 __end = utf16_span(__from, __end, __max);
797 return __end - __from;
798 }
799
800 int
do_max_length() const801 codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
802 {
803 // A single character (one or two UTF-16 code units) requires
804 // up to four UTF-8 code units.
805 return 4;
806 }
807
808 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
809 // Converts from UTF-8 to UTF-32 (aka UCS-4).
810
811 locale::id codecvt<char32_t, char, mbstate_t>::id;
812
~codecvt()813 codecvt<char32_t, char, mbstate_t>::~codecvt() { }
814
815 codecvt_base::result
816 codecvt<char32_t, char, mbstate_t>::
do_out(state_type &,const intern_type * __from,const intern_type * __from_end,const intern_type * & __from_next,extern_type * __to,extern_type * __to_end,extern_type * & __to_next) const817 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
818 const intern_type*& __from_next,
819 extern_type* __to, extern_type* __to_end,
820 extern_type*& __to_next) const
821 {
822 range<const char32_t> from{ __from, __from_end };
823 range<char> to{ __to, __to_end };
824 auto res = ucs4_out(from, to);
825 __from_next = from.next;
826 __to_next = to.next;
827 return res;
828 }
829
830 codecvt_base::result
831 codecvt<char32_t, char, mbstate_t>::
do_unshift(state_type &,extern_type * __to,extern_type *,extern_type * & __to_next) const832 do_unshift(state_type&, extern_type* __to, extern_type*,
833 extern_type*& __to_next) const
834 {
835 __to_next = __to;
836 return noconv;
837 }
838
839 codecvt_base::result
840 codecvt<char32_t, char, mbstate_t>::
do_in(state_type &,const extern_type * __from,const extern_type * __from_end,const extern_type * & __from_next,intern_type * __to,intern_type * __to_end,intern_type * & __to_next) const841 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
842 const extern_type*& __from_next,
843 intern_type* __to, intern_type* __to_end,
844 intern_type*& __to_next) const
845 {
846 range<const char> from{ __from, __from_end };
847 range<char32_t> to{ __to, __to_end };
848 auto res = ucs4_in(from, to);
849 __from_next = from.next;
850 __to_next = to.next;
851 return res;
852 }
853
854 int
do_encoding() const855 codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
856 { return 0; } // UTF-8 is not a fixed-width encoding
857
858 bool
do_always_noconv() const859 codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
860 { return false; }
861
862 int
863 codecvt<char32_t, char, mbstate_t>::
do_length(state_type &,const extern_type * __from,const extern_type * __end,size_t __max) const864 do_length(state_type&, const extern_type* __from,
865 const extern_type* __end, size_t __max) const
866 {
867 __end = ucs4_span(__from, __end, __max);
868 return __end - __from;
869 }
870
871 int
do_max_length() const872 codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
873 {
874 // A single character (one UTF-32 code unit) requires
875 // up to 4 UTF-8 code units.
876 return 4;
877 }
878
879 // Define members of codecvt_utf8<char16_t> base class implementation.
880 // Converts from UTF-8 to UCS-2.
881
~__codecvt_utf8_base()882 __codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
883
884 codecvt_base::result
885 __codecvt_utf8_base<char16_t>::
do_out(state_type &,const intern_type * __from,const intern_type * __from_end,const intern_type * & __from_next,extern_type * __to,extern_type * __to_end,extern_type * & __to_next) const886 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
887 const intern_type*& __from_next,
888 extern_type* __to, extern_type* __to_end,
889 extern_type*& __to_next) const
890 {
891 range<const char16_t> from{ __from, __from_end };
892 range<char> to{ __to, __to_end };
893 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
894 __from_next = from.next;
895 __to_next = to.next;
896 return res;
897 }
898
899 codecvt_base::result
900 __codecvt_utf8_base<char16_t>::
do_unshift(state_type &,extern_type * __to,extern_type *,extern_type * & __to_next) const901 do_unshift(state_type&, extern_type* __to, extern_type*,
902 extern_type*& __to_next) const
903 {
904 __to_next = __to;
905 return noconv;
906 }
907
908 codecvt_base::result
909 __codecvt_utf8_base<char16_t>::
do_in(state_type &,const extern_type * __from,const extern_type * __from_end,const extern_type * & __from_next,intern_type * __to,intern_type * __to_end,intern_type * & __to_next) const910 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
911 const extern_type*& __from_next,
912 intern_type* __to, intern_type* __to_end,
913 intern_type*& __to_next) const
914 {
915 range<const char> from{ __from, __from_end };
916 range<char16_t> to{ __to, __to_end };
917 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
918 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
919 mode = codecvt_mode(mode | little_endian);
920 #endif
921 auto res = ucs2_in(from, to, _M_maxcode, mode);
922 __from_next = from.next;
923 __to_next = to.next;
924 return res;
925 }
926
927 int
do_encoding() const928 __codecvt_utf8_base<char16_t>::do_encoding() const throw()
929 { return 0; } // UTF-8 is not a fixed-width encoding
930
931 bool
do_always_noconv() const932 __codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
933 { return false; }
934
935 int
936 __codecvt_utf8_base<char16_t>::
do_length(state_type &,const extern_type * __from,const extern_type * __end,size_t __max) const937 do_length(state_type&, const extern_type* __from,
938 const extern_type* __end, size_t __max) const
939 {
940 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
941 return __end - __from;
942 }
943
944 int
do_max_length() const945 __codecvt_utf8_base<char16_t>::do_max_length() const throw()
946 {
947 // A single UCS-2 character requires up to three UTF-8 code units.
948 // (UCS-2 cannot represent characters that use four UTF-8 code units).
949 int max = 3;
950 if (_M_mode & consume_header)
951 max += sizeof(utf8_bom);
952 return max;
953 }
954
955 // Define members of codecvt_utf8<char32_t> base class implementation.
956 // Converts from UTF-8 to UTF-32 (aka UCS-4).
957
~__codecvt_utf8_base()958 __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
959
960 codecvt_base::result
961 __codecvt_utf8_base<char32_t>::
do_out(state_type &,const intern_type * __from,const intern_type * __from_end,const intern_type * & __from_next,extern_type * __to,extern_type * __to_end,extern_type * & __to_next) const962 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
963 const intern_type*& __from_next,
964 extern_type* __to, extern_type* __to_end,
965 extern_type*& __to_next) const
966 {
967 range<const char32_t> from{ __from, __from_end };
968 range<char> to{ __to, __to_end };
969 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
970 __from_next = from.next;
971 __to_next = to.next;
972 return res;
973 }
974
975 codecvt_base::result
976 __codecvt_utf8_base<char32_t>::
do_unshift(state_type &,extern_type * __to,extern_type *,extern_type * & __to_next) const977 do_unshift(state_type&, extern_type* __to, extern_type*,
978 extern_type*& __to_next) const
979 {
980 __to_next = __to;
981 return noconv;
982 }
983
984 codecvt_base::result
985 __codecvt_utf8_base<char32_t>::
do_in(state_type &,const extern_type * __from,const extern_type * __from_end,const extern_type * & __from_next,intern_type * __to,intern_type * __to_end,intern_type * & __to_next) const986 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
987 const extern_type*& __from_next,
988 intern_type* __to, intern_type* __to_end,
989 intern_type*& __to_next) const
990 {
991 range<const char> from{ __from, __from_end };
992 range<char32_t> to{ __to, __to_end };
993 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
994 __from_next = from.next;
995 __to_next = to.next;
996 return res;
997 }
998
999 int
do_encoding() const1000 __codecvt_utf8_base<char32_t>::do_encoding() const throw()
1001 { return 0; } // UTF-8 is not a fixed-width encoding
1002
1003 bool
do_always_noconv() const1004 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1005 { return false; }
1006
1007 int
1008 __codecvt_utf8_base<char32_t>::
do_length(state_type &,const extern_type * __from,const extern_type * __end,size_t __max) const1009 do_length(state_type&, const extern_type* __from,
1010 const extern_type* __end, size_t __max) const
1011 {
1012 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1013 return __end - __from;
1014 }
1015
1016 int
do_max_length() const1017 __codecvt_utf8_base<char32_t>::do_max_length() const throw()
1018 {
1019 // A single UCS-4 character requires up to four UTF-8 code units.
1020 int max = 4;
1021 if (_M_mode & consume_header)
1022 max += sizeof(utf8_bom);
1023 return max;
1024 }
1025
1026 #ifdef _GLIBCXX_USE_WCHAR_T
1027
// The wchar_t code below reinterprets wchar_t buffers as char16_t or
// char32_t buffers depending on __SIZEOF_WCHAR_T__; check that the sizes
// really do match for the two supported cases.
#if __SIZEOF_WCHAR_T__ == 2
  static_assert(sizeof(wchar_t) == sizeof(char16_t), "");
#elif __SIZEOF_WCHAR_T__ == 4
  static_assert(sizeof(wchar_t) == sizeof(char32_t), "");
#endif
1033
1034 // Define members of codecvt_utf8<wchar_t> base class implementation.
1035 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1036
~__codecvt_utf8_base()1037 __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
1038
1039 codecvt_base::result
1040 __codecvt_utf8_base<wchar_t>::
do_out(state_type &,const intern_type * __from,const intern_type * __from_end,const intern_type * & __from_next,extern_type * __to,extern_type * __to_end,extern_type * & __to_next) const1041 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1042 const intern_type*& __from_next,
1043 extern_type* __to, extern_type* __to_end,
1044 extern_type*& __to_next) const
1045 {
1046 range<char> to{ __to, __to_end };
1047 #if __SIZEOF_WCHAR_T__ == 2
1048 range<const char16_t> from{
1049 reinterpret_cast<const char16_t*>(__from),
1050 reinterpret_cast<const char16_t*>(__from_end)
1051 };
1052 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1053 #elif __SIZEOF_WCHAR_T__ == 4
1054 range<const char32_t> from{
1055 reinterpret_cast<const char32_t*>(__from),
1056 reinterpret_cast<const char32_t*>(__from_end)
1057 };
1058 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1059 #else
1060 return codecvt_base::error;
1061 #endif
1062 __from_next = reinterpret_cast<const wchar_t*>(from.next);
1063 __to_next = to.next;
1064 return res;
1065 }
1066
1067 codecvt_base::result
1068 __codecvt_utf8_base<wchar_t>::
do_unshift(state_type &,extern_type * __to,extern_type *,extern_type * & __to_next) const1069 do_unshift(state_type&, extern_type* __to, extern_type*,
1070 extern_type*& __to_next) const
1071 {
1072 __to_next = __to;
1073 return noconv;
1074 }
1075
1076 codecvt_base::result
1077 __codecvt_utf8_base<wchar_t>::
do_in(state_type &,const extern_type * __from,const extern_type * __from_end,const extern_type * & __from_next,intern_type * __to,intern_type * __to_end,intern_type * & __to_next) const1078 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1079 const extern_type*& __from_next,
1080 intern_type* __to, intern_type* __to_end,
1081 intern_type*& __to_next) const
1082 {
1083 range<const char> from{ __from, __from_end };
1084 #if __SIZEOF_WCHAR_T__ == 2
1085 range<char16_t> to{
1086 reinterpret_cast<char16_t*>(__to),
1087 reinterpret_cast<char16_t*>(__to_end)
1088 };
1089 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1090 #elif __SIZEOF_WCHAR_T__ == 4
1091 range<char32_t> to{
1092 reinterpret_cast<char32_t*>(__to),
1093 reinterpret_cast<char32_t*>(__to_end)
1094 };
1095 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1096 #else
1097 return codecvt_base::error;
1098 #endif
1099 __from_next = from.next;
1100 __to_next = reinterpret_cast<wchar_t*>(to.next);
1101 return res;
1102 }
1103
1104 int
do_encoding() const1105 __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
1106 { return 0; } // UTF-8 is not a fixed-width encoding
1107
1108 bool
do_always_noconv() const1109 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1110 { return false; }
1111
1112 int
1113 __codecvt_utf8_base<wchar_t>::
do_length(state_type &,const extern_type * __from,const extern_type * __end,size_t __max) const1114 do_length(state_type&, const extern_type* __from,
1115 const extern_type* __end, size_t __max) const
1116 {
1117 #if __SIZEOF_WCHAR_T__ == 2
1118 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1119 #elif __SIZEOF_WCHAR_T__ == 4
1120 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1121 #else
1122 __end = __from;
1123 #endif
1124 return __end - __from;
1125 }
1126
1127 int
do_max_length() const1128 __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
1129 {
1130 #if __SIZEOF_WCHAR_T__ == 2
1131 int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1132 #else
1133 int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1134 #endif
1135 if (_M_mode & consume_header)
1136 max += sizeof(utf8_bom);
1137 return max;
1138 }
1139 #endif
1140
1141 // Define members of codecvt_utf16<char16_t> base class implementation.
1142 // Converts from UTF-16 to UCS-2.
1143
~__codecvt_utf16_base()1144 __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
1145
1146 codecvt_base::result
1147 __codecvt_utf16_base<char16_t>::
do_out(state_type &,const intern_type * __from,const intern_type * __from_end,const intern_type * & __from_next,extern_type * __to,extern_type * __to_end,extern_type * & __to_next) const1148 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1149 const intern_type*& __from_next,
1150 extern_type* __to, extern_type* __to_end,
1151 extern_type*& __to_next) const
1152 {
1153 range<const char16_t> from{ __from, __from_end };
1154 range<char16_t, false> to{ __to, __to_end };
1155 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1156 __from_next = from.next;
1157 __to_next = reinterpret_cast<char*>(to.next);
1158 return res;
1159 }
1160
1161 codecvt_base::result
1162 __codecvt_utf16_base<char16_t>::
do_unshift(state_type &,extern_type * __to,extern_type *,extern_type * & __to_next) const1163 do_unshift(state_type&, extern_type* __to, extern_type*,
1164 extern_type*& __to_next) const
1165 {
1166 __to_next = __to;
1167 return noconv;
1168 }
1169
1170 codecvt_base::result
1171 __codecvt_utf16_base<char16_t>::
do_in(state_type &,const extern_type * __from,const extern_type * __from_end,const extern_type * & __from_next,intern_type * __to,intern_type * __to_end,intern_type * & __to_next) const1172 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1173 const extern_type*& __from_next,
1174 intern_type* __to, intern_type* __to_end,
1175 intern_type*& __to_next) const
1176 {
1177 range<const char16_t, false> from{ __from, __from_end };
1178 range<char16_t> to{ __to, __to_end };
1179 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1180 __from_next = reinterpret_cast<const char*>(from.next);
1181 __to_next = to.next;
1182 if (res == codecvt_base::ok && __from_next != __from_end)
1183 res = codecvt_base::error;
1184 return res;
1185 }
1186
1187 int
do_encoding() const1188 __codecvt_utf16_base<char16_t>::do_encoding() const throw()
1189 { return 0; } // UTF-16 is not a fixed-width encoding
1190
1191 bool
do_always_noconv() const1192 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1193 { return false; }
1194
1195 int
1196 __codecvt_utf16_base<char16_t>::
do_length(state_type &,const extern_type * __from,const extern_type * __end,size_t __max) const1197 do_length(state_type&, const extern_type* __from,
1198 const extern_type* __end, size_t __max) const
1199 {
1200 range<const char16_t, false> from{ __from, __end };
1201 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1202 return reinterpret_cast<const char*>(next) - __from;
1203 }
1204
1205 int
do_max_length() const1206 __codecvt_utf16_base<char16_t>::do_max_length() const throw()
1207 {
1208 // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1209 // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1210 int max = 2;
1211 if (_M_mode & consume_header)
1212 max += sizeof(utf16_bom);
1213 return max;
1214 }
1215
1216 // Define members of codecvt_utf16<char32_t> base class implementation.
1217 // Converts from UTF-16 to UTF-32 (aka UCS-4).
1218
~__codecvt_utf16_base()1219 __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
1220
1221 codecvt_base::result
1222 __codecvt_utf16_base<char32_t>::
do_out(state_type &,const intern_type * __from,const intern_type * __from_end,const intern_type * & __from_next,extern_type * __to,extern_type * __to_end,extern_type * & __to_next) const1223 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1224 const intern_type*& __from_next,
1225 extern_type* __to, extern_type* __to_end,
1226 extern_type*& __to_next) const
1227 {
1228 range<const char32_t> from{ __from, __from_end };
1229 range<char16_t, false> to{ __to, __to_end };
1230 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1231 __from_next = from.next;
1232 __to_next = reinterpret_cast<char*>(to.next);
1233 return res;
1234 }
1235
1236 codecvt_base::result
1237 __codecvt_utf16_base<char32_t>::
do_unshift(state_type &,extern_type * __to,extern_type *,extern_type * & __to_next) const1238 do_unshift(state_type&, extern_type* __to, extern_type*,
1239 extern_type*& __to_next) const
1240 {
1241 __to_next = __to;
1242 return noconv;
1243 }
1244
1245 codecvt_base::result
1246 __codecvt_utf16_base<char32_t>::
do_in(state_type &,const extern_type * __from,const extern_type * __from_end,const extern_type * & __from_next,intern_type * __to,intern_type * __to_end,intern_type * & __to_next) const1247 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1248 const extern_type*& __from_next,
1249 intern_type* __to, intern_type* __to_end,
1250 intern_type*& __to_next) const
1251 {
1252 range<const char16_t, false> from{ __from, __from_end };
1253 range<char32_t> to{ __to, __to_end };
1254 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1255 __from_next = reinterpret_cast<const char*>(from.next);
1256 __to_next = to.next;
1257 if (res == codecvt_base::ok && __from_next != __from_end)
1258 res = codecvt_base::error;
1259 return res;
1260 }
1261
1262 int
do_encoding() const1263 __codecvt_utf16_base<char32_t>::do_encoding() const throw()
1264 { return 0; } // UTF-16 is not a fixed-width encoding
1265
1266 bool
do_always_noconv() const1267 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1268 { return false; }
1269
1270 int
1271 __codecvt_utf16_base<char32_t>::
do_length(state_type &,const extern_type * __from,const extern_type * __end,size_t __max) const1272 do_length(state_type&, const extern_type* __from,
1273 const extern_type* __end, size_t __max) const
1274 {
1275 range<const char16_t, false> from{ __from, __end };
1276 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1277 return reinterpret_cast<const char*>(next) - __from;
1278 }
1279
1280 int
do_max_length() const1281 __codecvt_utf16_base<char32_t>::do_max_length() const throw()
1282 {
1283 // A single UCS-4 character requires one or two UTF-16 code units
1284 // (so up to four chars).
1285 int max = 4;
1286 if (_M_mode & consume_header)
1287 max += sizeof(utf16_bom);
1288 return max;
1289 }
1290
1291 #ifdef _GLIBCXX_USE_WCHAR_T
1292 // Define members of codecvt_utf16<wchar_t> base class implementation.
// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1294
~__codecvt_utf16_base()1295 __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
1296
1297 codecvt_base::result
1298 __codecvt_utf16_base<wchar_t>::
do_out(state_type &,const intern_type * __from,const intern_type * __from_end,const intern_type * & __from_next,extern_type * __to,extern_type * __to_end,extern_type * & __to_next) const1299 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1300 const intern_type*& __from_next,
1301 extern_type* __to, extern_type* __to_end,
1302 extern_type*& __to_next) const
1303 {
1304 range<char16_t, false> to{ __to, __to_end };
1305 #if __SIZEOF_WCHAR_T__ == 2
1306 range<const char16_t> from{
1307 reinterpret_cast<const char16_t*>(__from),
1308 reinterpret_cast<const char16_t*>(__from_end),
1309 };
1310 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1311 #elif __SIZEOF_WCHAR_T__ == 4
1312 range<const char32_t> from{
1313 reinterpret_cast<const char32_t*>(__from),
1314 reinterpret_cast<const char32_t*>(__from_end),
1315 };
1316 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1317 #else
1318 return codecvt_base::error;
1319 #endif
1320 __from_next = reinterpret_cast<const wchar_t*>(from.next);
1321 __to_next = reinterpret_cast<char*>(to.next);
1322 return res;
1323 }
1324
1325 codecvt_base::result
1326 __codecvt_utf16_base<wchar_t>::
do_unshift(state_type &,extern_type * __to,extern_type *,extern_type * & __to_next) const1327 do_unshift(state_type&, extern_type* __to, extern_type*,
1328 extern_type*& __to_next) const
1329 {
1330 __to_next = __to;
1331 return noconv;
1332 }
1333
1334 codecvt_base::result
1335 __codecvt_utf16_base<wchar_t>::
do_in(state_type &,const extern_type * __from,const extern_type * __from_end,const extern_type * & __from_next,intern_type * __to,intern_type * __to_end,intern_type * & __to_next) const1336 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1337 const extern_type*& __from_next,
1338 intern_type* __to, intern_type* __to_end,
1339 intern_type*& __to_next) const
1340 {
1341 range<const char16_t, false> from{ __from, __from_end };
1342 #if __SIZEOF_WCHAR_T__ == 2
1343 range<char16_t> to{
1344 reinterpret_cast<char16_t*>(__to),
1345 reinterpret_cast<char16_t*>(__to_end),
1346 };
1347 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1348 #elif __SIZEOF_WCHAR_T__ == 4
1349 range<char32_t> to{
1350 reinterpret_cast<char32_t*>(__to),
1351 reinterpret_cast<char32_t*>(__to_end),
1352 };
1353 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1354 #else
1355 return codecvt_base::error;
1356 #endif
1357 __from_next = reinterpret_cast<const char*>(from.next);
1358 __to_next = reinterpret_cast<wchar_t*>(to.next);
1359 if (res == codecvt_base::ok && __from_next != __from_end)
1360 res = codecvt_base::error;
1361 return res;
1362 }
1363
1364 int
do_encoding() const1365 __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
1366 { return 0; } // UTF-16 is not a fixed-width encoding
1367
1368 bool
do_always_noconv() const1369 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1370 { return false; }
1371
1372 int
1373 __codecvt_utf16_base<wchar_t>::
do_length(state_type &,const extern_type * __from,const extern_type * __end,size_t __max) const1374 do_length(state_type&, const extern_type* __from,
1375 const extern_type* __end, size_t __max) const
1376 {
1377 range<const char16_t, false> from{ __from, __end };
1378 #if __SIZEOF_WCHAR_T__ == 2
1379 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1380 #elif __SIZEOF_WCHAR_T__ == 4
1381 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1382 #endif
1383 return reinterpret_cast<const char*>(next) - __from;
1384 }
1385
1386 int
do_max_length() const1387 __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
1388 {
1389 #if __SIZEOF_WCHAR_T__ == 2
1390 int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1391 #else
1392 int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1393 #endif
1394 if (_M_mode & consume_header)
1395 max += sizeof(utf16_bom);
1396 return max;
1397 }
1398 #endif
1399
1400 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1401 // Converts from UTF-8 to UTF-16.
1402
~__codecvt_utf8_utf16_base()1403 __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
1404
1405 codecvt_base::result
1406 __codecvt_utf8_utf16_base<char16_t>::
do_out(state_type &,const intern_type * __from,const intern_type * __from_end,const intern_type * & __from_next,extern_type * __to,extern_type * __to_end,extern_type * & __to_next) const1407 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1408 const intern_type*& __from_next,
1409 extern_type* __to, extern_type* __to_end,
1410 extern_type*& __to_next) const
1411 {
1412 range<const char16_t> from{ __from, __from_end };
1413 range<char> to{ __to, __to_end };
1414 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1415 __from_next = from.next;
1416 __to_next = to.next;
1417 return res;
1418 }
1419
1420 codecvt_base::result
1421 __codecvt_utf8_utf16_base<char16_t>::
do_unshift(state_type &,extern_type * __to,extern_type *,extern_type * & __to_next) const1422 do_unshift(state_type&, extern_type* __to, extern_type*,
1423 extern_type*& __to_next) const
1424 {
1425 __to_next = __to;
1426 return noconv;
1427 }
1428
1429 codecvt_base::result
1430 __codecvt_utf8_utf16_base<char16_t>::
do_in(state_type &,const extern_type * __from,const extern_type * __from_end,const extern_type * & __from_next,intern_type * __to,intern_type * __to_end,intern_type * & __to_next) const1431 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1432 const extern_type*& __from_next,
1433 intern_type* __to, intern_type* __to_end,
1434 intern_type*& __to_next) const
1435 {
1436 range<const char> from{ __from, __from_end };
1437 range<char16_t> to{ __to, __to_end };
1438 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1439 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1440 mode = codecvt_mode(mode | little_endian);
1441 #endif
1442 auto res = utf16_in(from, to, _M_maxcode, mode);
1443 __from_next = from.next;
1444 __to_next = to.next;
1445 return res;
1446 }
1447
1448 int
do_encoding() const1449 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
1450 { return 0; } // UTF-8 is not a fixed-width encoding
1451
1452 bool
do_always_noconv() const1453 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1454 { return false; }
1455
1456 int
1457 __codecvt_utf8_utf16_base<char16_t>::
do_length(state_type &,const extern_type * __from,const extern_type * __end,size_t __max) const1458 do_length(state_type&, const extern_type* __from,
1459 const extern_type* __end, size_t __max) const
1460 {
1461 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1462 return __end - __from;
1463 }
1464
1465 int
do_max_length() const1466 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1467 {
1468 // A single character can be 1 or 2 UTF-16 code units,
1469 // requiring up to 4 UTF-8 code units.
1470 int max = 4;
1471 if (_M_mode & consume_header)
1472 max += sizeof(utf8_bom);
1473 return max;
1474 }
1475
1476 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1477 // Converts from UTF-8 to UTF-16.
1478
~__codecvt_utf8_utf16_base()1479 __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
1480
1481 codecvt_base::result
1482 __codecvt_utf8_utf16_base<char32_t>::
do_out(state_type &,const intern_type * __from,const intern_type * __from_end,const intern_type * & __from_next,extern_type * __to,extern_type * __to_end,extern_type * & __to_next) const1483 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1484 const intern_type*& __from_next,
1485 extern_type* __to, extern_type* __to_end,
1486 extern_type*& __to_next) const
1487 {
1488 range<const char32_t> from{ __from, __from_end };
1489 range<char> to{ __to, __to_end };
1490 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1491 __from_next = from.next;
1492 __to_next = to.next;
1493 return res;
1494 }
1495
1496 codecvt_base::result
1497 __codecvt_utf8_utf16_base<char32_t>::
do_unshift(state_type &,extern_type * __to,extern_type *,extern_type * & __to_next) const1498 do_unshift(state_type&, extern_type* __to, extern_type*,
1499 extern_type*& __to_next) const
1500 {
1501 __to_next = __to;
1502 return noconv;
1503 }
1504
1505 codecvt_base::result
1506 __codecvt_utf8_utf16_base<char32_t>::
do_in(state_type &,const extern_type * __from,const extern_type * __from_end,const extern_type * & __from_next,intern_type * __to,intern_type * __to_end,intern_type * & __to_next) const1507 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1508 const extern_type*& __from_next,
1509 intern_type* __to, intern_type* __to_end,
1510 intern_type*& __to_next) const
1511 {
1512 range<const char> from{ __from, __from_end };
1513 range<char32_t> to{ __to, __to_end };
1514 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1515 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1516 mode = codecvt_mode(mode | little_endian);
1517 #endif
1518 auto res = utf16_in(from, to, _M_maxcode, mode);
1519 __from_next = from.next;
1520 __to_next = to.next;
1521 return res;
1522 }
1523
1524 int
do_encoding() const1525 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
1526 { return 0; } // UTF-8 is not a fixed-width encoding
1527
1528 bool
do_always_noconv() const1529 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1530 { return false; }
1531
1532 int
1533 __codecvt_utf8_utf16_base<char32_t>::
do_length(state_type &,const extern_type * __from,const extern_type * __end,size_t __max) const1534 do_length(state_type&, const extern_type* __from,
1535 const extern_type* __end, size_t __max) const
1536 {
1537 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1538 return __end - __from;
1539 }
1540
1541 int
do_max_length() const1542 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1543 {
1544 // A single character can be 1 or 2 UTF-16 code units,
1545 // requiring up to 4 UTF-8 code units.
1546 int max = 4;
1547 if (_M_mode & consume_header)
1548 max += sizeof(utf8_bom);
1549 return max;
1550 }
1551
1552 #ifdef _GLIBCXX_USE_WCHAR_T
1553 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1554 // Converts from UTF-8 to UTF-16.
1555
~__codecvt_utf8_utf16_base()1556 __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
1557
1558 codecvt_base::result
1559 __codecvt_utf8_utf16_base<wchar_t>::
do_out(state_type &,const intern_type * __from,const intern_type * __from_end,const intern_type * & __from_next,extern_type * __to,extern_type * __to_end,extern_type * & __to_next) const1560 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1561 const intern_type*& __from_next,
1562 extern_type* __to, extern_type* __to_end,
1563 extern_type*& __to_next) const
1564 {
1565 range<const wchar_t> from{ __from, __from_end };
1566 range<char> to{ __to, __to_end };
1567 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1568 __from_next = from.next;
1569 __to_next = to.next;
1570 return res;
1571 }
1572
1573 codecvt_base::result
1574 __codecvt_utf8_utf16_base<wchar_t>::
do_unshift(state_type &,extern_type * __to,extern_type *,extern_type * & __to_next) const1575 do_unshift(state_type&, extern_type* __to, extern_type*,
1576 extern_type*& __to_next) const
1577 {
1578 __to_next = __to;
1579 return noconv;
1580 }
1581
1582 codecvt_base::result
1583 __codecvt_utf8_utf16_base<wchar_t>::
do_in(state_type &,const extern_type * __from,const extern_type * __from_end,const extern_type * & __from_next,intern_type * __to,intern_type * __to_end,intern_type * & __to_next) const1584 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1585 const extern_type*& __from_next,
1586 intern_type* __to, intern_type* __to_end,
1587 intern_type*& __to_next) const
1588 {
1589 range<const char> from{ __from, __from_end };
1590 range<wchar_t> to{ __to, __to_end };
1591 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1592 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1593 mode = codecvt_mode(mode | little_endian);
1594 #endif
1595 auto res = utf16_in(from, to, _M_maxcode, mode);
1596 __from_next = from.next;
1597 __to_next = to.next;
1598 return res;
1599 }
1600
1601 int
do_encoding() const1602 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
1603 { return 0; } // UTF-8 is not a fixed-width encoding
1604
1605 bool
do_always_noconv() const1606 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1607 { return false; }
1608
1609 int
1610 __codecvt_utf8_utf16_base<wchar_t>::
do_length(state_type &,const extern_type * __from,const extern_type * __end,size_t __max) const1611 do_length(state_type&, const extern_type* __from,
1612 const extern_type* __end, size_t __max) const
1613 {
1614 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1615 return __end - __from;
1616 }
1617
1618 int
do_max_length() const1619 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1620 {
1621 // A single character can be 1 or 2 UTF-16 code units,
1622 // requiring up to 4 UTF-8 code units.
1623 int max = 4;
1624 if (_M_mode & consume_header)
1625 max += sizeof(utf8_bom);
1626 return max;
1627 }
1628 #endif
1629
1630 inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
1631 inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
1632 template class codecvt_byname<char16_t, char, mbstate_t>;
1633 template class codecvt_byname<char32_t, char, mbstate_t>;
1634
1635 _GLIBCXX_END_NAMESPACE_VERSION
1636 }
1637 #endif // _GLIBCXX_USE_C99_STDINT_TR1
1638