/* Conversion module for UTF-7.
   Copyright (C) 2000-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* UTF-7 is a legacy encoding used for transmitting Unicode within the
   ASCII character set, used primarily by mail agents.  New programs
   are encouraged to use UTF-8 instead.

   UTF-7 is specified in RFC 2152 (and the older RFC 1641 and RFC 1642).
   The original Base64 encoding is defined in RFC 2045.  */

#include <dlfcn.h>
#include <gconv.h>
#include <stdint.h>
#include <stdlib.h>


/* Define this to 1 if you want the so-called "optional direct" characters
      ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
   to be encoded in base64.  Define it to 0 if you want them to be passed
   straight through, like the so-called "direct" characters.
   We set this to 1 because it's safer.  */
#define UTF7_ENCODE_OPTIONAL_CHARS 1


/* The set of "direct characters":
   A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr

   The set is stored as a 128-bit bitmap: bit (ch & 7) of byte (ch >> 3)
   is set when the ASCII character ch belongs to the set.  */

static const unsigned char direct_tab[128 / 8] =
  {
    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
  };

/* Return 1 if CH is a "direct character" and 0 otherwise.  Values of
   128 and above are never direct; the range test comes first so that the
   table is only indexed with 0..15.  */
static int
isdirect (uint32_t ch)
{
  return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
}


/* The set of "direct and optional direct characters":
   A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
   ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }

   The set is stored as a 128-bit bitmap: bit (ch & 7) of byte (ch >> 3)
   is set when the ASCII character ch belongs to the set.  */

static const unsigned char xdirect_tab[128 / 8] =
  {
    0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
  };

/* Return 1 if CH is a "direct" or "optional direct" character and 0
   otherwise.  Values of 128 and above are never in the set; the range
   test comes first so that the table is only indexed with 0..15.  */
static int
isxdirect (uint32_t ch)
{
  return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
}


/* The set of "extended base64 characters":
   A-Z a-z 0-9 + / -

   The set is stored as a 128-bit bitmap: bit (ch & 7) of byte (ch >> 3)
   is set when the ASCII character ch belongs to the set.  */

static const unsigned char xbase64_tab[128 / 8] =
  {
    0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
  };

/* Return 1 if CH is an "extended base64 character" (a base64 digit or
   '-') and 0 otherwise.  Values of 128 and above are never in the set;
   the range test comes first so that the table is only indexed with
   0..15.  */
static int
isxbase64 (uint32_t ch)
{
  return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
}


/* Converts a value in the range 0..63 to a base64 encoded char:
   0..25 -> 'A'..'Z', 26..51 -> 'a'..'z', 52..61 -> '0'..'9',
   62 -> '+', 63 -> '/'.  Any other value aborts the program.  */
static unsigned char
base64 (unsigned int i)
{
  if (i < 26)
    return i + 'A';
  else if (i < 52)
    return i - 26 + 'a';
  else if (i < 62)
    return i - 52 + '0';
  else if (i == 62)
    return '+';
  else if (i == 63)
    return '/';
  else
    abort ();
}


/* Definitions used in the body of the `gconv' function.

   The *_FROM sizes describe the UTF-7 side (the encoder below writes
   between 1 and 6 bytes per pass), the *_TO sizes describe the 4-byte
   side that the loops access with get32/put32.  PREPARE_LOOP declares
   the saved_state and statep variables used by SAVE_RESET_STATE, and
   EXTRA_LOOP_ARGS passes statep on to both loop functions.  */
#define CHARSET_NAME		"UTF-7//"
#define DEFINE_INIT		1
#define DEFINE_FINI		1
#define FROM_LOOP		from_utf7_loop
#define TO_LOOP			to_utf7_loop
#define MIN_NEEDED_FROM		1
#define MAX_NEEDED_FROM		6
#define MIN_NEEDED_TO		4
#define MAX_NEEDED_TO		4
#define ONE_DIRECTION		0
#define PREPARE_LOOP \
  mbstate_t saved_state;						      \
  mbstate_t *statep = data->__statep;
#define EXTRA_LOOP_ARGS		, statep


/* Since we might have to reset input pointer we must be able to save
   and restore the state.  A nonzero SAVE copies *statep into saved_state;
   a zero SAVE copies saved_state back into *statep.  Both variables are
   the ones declared by PREPARE_LOOP.  The expansion deliberately has no
   trailing semicolon; the user of the macro supplies it.  */
#define SAVE_RESET_STATE(Save) \
  if (Save)								      \
    saved_state = *statep;						      \
  else									      \
    *statep = saved_state


138 /* First define the conversion function from UTF-7 to UCS4.
139    The state is structured as follows:
140      __count bit 2..0: zero
141      __count bit 8..3: shift
142      __wch: data
143    Precise meaning:
144      shift      data
145        0         --          not inside base64 encoding
146      1..32  XX..XX00..00     inside base64, (32 - shift) bits pending
147    This state layout is simpler than relying on STORE_REST/UNPACK_BYTES.
148 
149    When shift = 0, __wch needs to store at most one lookahead byte (see
150    __GCONV_INCOMPLETE_INPUT below).
151 */
152 #define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
153 #define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
154 #define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
155 #define MAX_NEEDED_OUTPUT	MAX_NEEDED_TO
156 #define LOOPFCT			FROM_LOOP
157 #define BODY \
158   {									      \
159     uint_fast8_t ch = *inptr;						      \
160 									      \
161     if ((statep->__count >> 3) == 0)					      \
162       {									      \
163 	/* base64 encoding inactive.  */				      \
164 	if (isxdirect (ch))						      \
165 	  {								      \
166 	    inptr++;							      \
167 	    put32 (outptr, ch);						      \
168 	    outptr += 4;						      \
169 	  }								      \
170 	else if (__glibc_likely (ch == '+'))				      \
171 	  {								      \
172 	    if (__glibc_unlikely (inptr + 2 > inend))			      \
173 	      {								      \
174 		/* Not enough input available.  */			      \
175 		result = __GCONV_INCOMPLETE_INPUT;			      \
176 		break;							      \
177 	      }								      \
178 	    if (inptr[1] == '-')					      \
179 	      {								      \
180 		inptr += 2;						      \
181 		put32 (outptr, ch);					      \
182 		outptr += 4;						      \
183 	      }								      \
184 	    else							      \
185 	      {								      \
186 		/* Switch into base64 mode.  */				      \
187 		inptr++;						      \
188 		statep->__count = (32 << 3);				      \
189 		statep->__value.__wch = 0;				      \
190 	      }								      \
191 	  }								      \
192 	else								      \
193 	  {								      \
194 	    /* The input is invalid.  */				      \
195 	    STANDARD_FROM_LOOP_ERR_HANDLER (1);				      \
196 	  }								      \
197       }									      \
198     else								      \
199       {									      \
200 	/* base64 encoding active.  */					      \
201 	uint32_t i;							      \
202 	int shift;							      \
203 									      \
204 	if (ch >= 'A' && ch <= 'Z')					      \
205 	  i = ch - 'A';							      \
206 	else if (ch >= 'a' && ch <= 'z')				      \
207 	  i = ch - 'a' + 26;						      \
208 	else if (ch >= '0' && ch <= '9')				      \
209 	  i = ch - '0' + 52;						      \
210 	else if (ch == '+')						      \
211 	  i = 62;							      \
212 	else if (ch == '/')						      \
213 	  i = 63;							      \
214 	else								      \
215 	  {								      \
216 	    /* Terminate base64 encoding.  */				      \
217 									      \
218 	    /* If accumulated data is nonzero, the input is invalid.  */      \
219 	    /* Also, partial UTF-16 characters are invalid.  */		      \
220 	    if (__builtin_expect (statep->__value.__wch != 0, 0)	      \
221 		|| __builtin_expect ((statep->__count >> 3) <= 26, 0))	      \
222 	      {								      \
223 		STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));    \
224 	      }								      \
225 									      \
226 	    if (ch == '-')						      \
227 	      inptr++;							      \
228 									      \
229 	    statep->__count = 0;					      \
230 	    continue;							      \
231 	  }								      \
232 									      \
233 	/* Concatenate the base64 integer i to the accumulator.  */	      \
234 	shift = (statep->__count >> 3);					      \
235 	if (shift > 6)							      \
236 	  {								      \
237 	    uint32_t wch;						      \
238 									      \
239 	    shift -= 6;							      \
240 	    wch = statep->__value.__wch | (i << shift);			      \
241 									      \
242 	    if (shift <= 16 && shift > 10)				      \
243 	      {								      \
244 		/* An UTF-16 character has just been completed.  */	      \
245 		uint32_t wc1 = wch >> 16;				      \
246 									      \
247 		/* UTF-16: When we see a High Surrogate, we must also decode  \
248 		   the following Low Surrogate. */			      \
249 		if (!(wc1 >= 0xd800 && wc1 < 0xdc00))			      \
250 		  {							      \
251 		    wch = wch << 16;					      \
252 		    shift += 16;					      \
253 		    put32 (outptr, wc1);				      \
254 		    outptr += 4;					      \
255 		  }							      \
256 	      }								      \
257 	    else if (shift <= 10 && shift > 4)				      \
258 	      {								      \
259 		/* After a High Surrogate, verify that the next 16 bit	      \
260 		   indeed form a Low Surrogate.  */			      \
261 		uint32_t wc2 = wch & 0xffff;				      \
262 									      \
263 		if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1))    \
264 		  {							      \
265 		    STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));\
266 		  }							      \
267 	      }								      \
268 									      \
269 	    statep->__value.__wch = wch;				      \
270 	  }								      \
271 	else								      \
272 	  {								      \
273 	    /* An UTF-16 surrogate pair has just been completed.  */	      \
274 	    uint32_t wc1 = (uint32_t) statep->__value.__wch >> 16;	      \
275 	    uint32_t wc2 = ((uint32_t) statep->__value.__wch & 0xffff)	      \
276 			   | (i >> (6 - shift));			      \
277 									      \
278 	    statep->__value.__wch = (i << shift) << 26;			      \
279 	    shift += 26;						      \
280 									      \
281 	    assert (wc1 >= 0xd800 && wc1 < 0xdc00);			      \
282 	    assert (wc2 >= 0xdc00 && wc2 < 0xe000);			      \
283 	    put32 (outptr,						      \
284 		   0x10000 + ((wc1 - 0xd800) << 10) + (wc2 - 0xdc00));	      \
285 	    outptr += 4;						      \
286 	  }								      \
287 									      \
288 	statep->__count = shift << 3;					      \
289 									      \
290 	/* Now that we digested the input increment the input pointer.  */    \
291 	inptr++;							      \
292       }									      \
293   }
294 #define LOOP_NEED_FLAGS
295 #define EXTRA_LOOP_DECLS	, mbstate_t *statep
296 #include <iconv/loop.c>
297 
298 
299 /* Next, define the conversion from UCS4 to UTF-7.
300    The state is structured as follows:
301      __count bit 2..0: zero
302      __count bit 4..3: shift
303      __count bit 8..5: data
304    Precise meaning:
305      shift      data
306        0         0           not inside base64 encoding
307        1         0           inside base64, no pending bits
308        2       XX00          inside base64, 2 bits known for next byte
309        3       XXXX          inside base64, 4 bits known for next byte
310 
311    __count bit 2..0 and __wch are always zero, because this direction
312    never returns __GCONV_INCOMPLETE_INPUT.
313 */
314 #define MIN_NEEDED_INPUT	MIN_NEEDED_TO
315 #define MAX_NEEDED_INPUT	MAX_NEEDED_TO
316 #define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
317 #define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
318 #define LOOPFCT			TO_LOOP
319 #define BODY \
320   {									      \
321     uint32_t ch = get32 (inptr);					      \
322 									      \
323     if ((statep->__count & 0x18) == 0)					      \
324       {									      \
325 	/* base64 encoding inactive */					      \
326 	if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch))      \
327 	  {								      \
328 	    *outptr++ = (unsigned char) ch;				      \
329 	  }								      \
330 	else								      \
331 	  {								      \
332 	    size_t count;						      \
333 									      \
334 	    if (ch == '+')						      \
335 	      count = 2;						      \
336 	    else if (ch < 0x10000)					      \
337 	      count = 3;						      \
338 	    else if (ch < 0x110000)					      \
339 	      count = 6;						      \
340 	    else							      \
341 	      STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
342 									      \
343 	    if (__glibc_unlikely (outptr + count > outend))		      \
344 	      {								      \
345 		result = __GCONV_FULL_OUTPUT;				      \
346 		break;							      \
347 	      }								      \
348 									      \
349 	    *outptr++ = '+';						      \
350 	    if (ch == '+')						      \
351 	      *outptr++ = '-';						      \
352 	    else if (ch < 0x10000)					      \
353 	      {								      \
354 		*outptr++ = base64 (ch >> 10);				      \
355 		*outptr++ = base64 ((ch >> 4) & 0x3f);			      \
356 		statep->__count = ((ch & 15) << 5) | (3 << 3);		      \
357 	      }								      \
358 	    else if (ch < 0x110000)					      \
359 	      {								      \
360 		uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10);		      \
361 		uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff);	      \
362 									      \
363 		ch = (ch1 << 16) | ch2;					      \
364 		*outptr++ = base64 (ch >> 26);				      \
365 		*outptr++ = base64 ((ch >> 20) & 0x3f);			      \
366 		*outptr++ = base64 ((ch >> 14) & 0x3f);			      \
367 		*outptr++ = base64 ((ch >> 8) & 0x3f);			      \
368 		*outptr++ = base64 ((ch >> 2) & 0x3f);			      \
369 		statep->__count = ((ch & 3) << 7) | (2 << 3);		      \
370 	      }								      \
371 	    else							      \
372 	      abort ();							      \
373 	  }								      \
374       }									      \
375     else								      \
376       {									      \
377 	/* base64 encoding active */					      \
378 	if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch))      \
379 	  {								      \
380 	    /* deactivate base64 encoding */				      \
381 	    size_t count;						      \
382 									      \
383 	    count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1;  \
384 	    if (__glibc_unlikely (outptr + count > outend))		      \
385 	      {								      \
386 		result = __GCONV_FULL_OUTPUT;				      \
387 		break;							      \
388 	      }								      \
389 									      \
390 	    if ((statep->__count & 0x18) >= 0x10)			      \
391 	      *outptr++ = base64 ((statep->__count >> 3) & ~3);		      \
392 	    if (isxbase64 (ch))						      \
393 	      *outptr++ = '-';						      \
394 	    *outptr++ = (unsigned char) ch;				      \
395 	    statep->__count = 0;					      \
396 	  }								      \
397 	else								      \
398 	  {								      \
399 	    size_t count;						      \
400 									      \
401 	    if (ch < 0x10000)						      \
402 	      count = ((statep->__count & 0x18) >= 0x10 ? 3 : 2);	      \
403 	    else if (ch < 0x110000)					      \
404 	      count = ((statep->__count & 0x18) >= 0x18 ? 6 : 5);	      \
405 	    else							      \
406 	      STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
407 									      \
408 	    if (__glibc_unlikely (outptr + count > outend))		      \
409 	      {								      \
410 		result = __GCONV_FULL_OUTPUT;				      \
411 		break;							      \
412 	      }								      \
413 									      \
414 	    if (ch < 0x10000)						      \
415 	      {								      \
416 		switch ((statep->__count >> 3) & 3)			      \
417 		  {							      \
418 		  case 1:						      \
419 		    *outptr++ = base64 (ch >> 10);			      \
420 		    *outptr++ = base64 ((ch >> 4) & 0x3f);		      \
421 		    statep->__count = ((ch & 15) << 5) | (3 << 3);	      \
422 		    break;						      \
423 		  case 2:						      \
424 		    *outptr++ =						      \
425 		      base64 (((statep->__count >> 3) & ~3) | (ch >> 12));    \
426 		    *outptr++ = base64 ((ch >> 6) & 0x3f);		      \
427 		    *outptr++ = base64 (ch & 0x3f);			      \
428 		    statep->__count = (1 << 3);				      \
429 		    break;						      \
430 		  case 3:						      \
431 		    *outptr++ =						      \
432 		      base64 (((statep->__count >> 3) & ~3) | (ch >> 14));    \
433 		    *outptr++ = base64 ((ch >> 8) & 0x3f);		      \
434 		    *outptr++ = base64 ((ch >> 2) & 0x3f);		      \
435 		    statep->__count = ((ch & 3) << 7) | (2 << 3);	      \
436 		    break;						      \
437 		  default:						      \
438 		    abort ();						      \
439 		  }							      \
440 	      }								      \
441 	    else if (ch < 0x110000)					      \
442 	      {								      \
443 		uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10);		      \
444 		uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff);	      \
445 									      \
446 		ch = (ch1 << 16) | ch2;					      \
447 		switch ((statep->__count >> 3) & 3)			      \
448 		  {							      \
449 		  case 1:						      \
450 		    *outptr++ = base64 (ch >> 26);			      \
451 		    *outptr++ = base64 ((ch >> 20) & 0x3f);		      \
452 		    *outptr++ = base64 ((ch >> 14) & 0x3f);		      \
453 		    *outptr++ = base64 ((ch >> 8) & 0x3f);		      \
454 		    *outptr++ = base64 ((ch >> 2) & 0x3f);		      \
455 		    statep->__count = ((ch & 3) << 7) | (2 << 3);	      \
456 		    break;						      \
457 		  case 2:						      \
458 		    *outptr++ =						      \
459 		      base64 (((statep->__count >> 3) & ~3) | (ch >> 28));    \
460 		    *outptr++ = base64 ((ch >> 22) & 0x3f);		      \
461 		    *outptr++ = base64 ((ch >> 16) & 0x3f);		      \
462 		    *outptr++ = base64 ((ch >> 10) & 0x3f);		      \
463 		    *outptr++ = base64 ((ch >> 4) & 0x3f);		      \
464 		    statep->__count = ((ch & 15) << 5) | (3 << 3);	      \
465 		    break;						      \
466 		  case 3:						      \
467 		    *outptr++ =						      \
468 		      base64 (((statep->__count >> 3) & ~3) | (ch >> 30));    \
469 		    *outptr++ = base64 ((ch >> 24) & 0x3f);		      \
470 		    *outptr++ = base64 ((ch >> 18) & 0x3f);		      \
471 		    *outptr++ = base64 ((ch >> 12) & 0x3f);		      \
472 		    *outptr++ = base64 ((ch >> 6) & 0x3f);		      \
473 		    *outptr++ = base64 (ch & 0x3f);			      \
474 		    statep->__count = (1 << 3);				      \
475 		    break;						      \
476 		  default:						      \
477 		    abort ();						      \
478 		  }							      \
479 	      }								      \
480 	    else							      \
481 	      abort ();							      \
482 	  }								      \
483       }									      \
484 									      \
485     /* Now that we wrote the output increment the input pointer.  */	      \
486     inptr += 4;								      \
487   }
488 #define LOOP_NEED_FLAGS
489 #define EXTRA_LOOP_DECLS	, mbstate_t *statep
490 #include <iconv/loop.c>
491 
492 
493 /* Since this is a stateful encoding we have to provide code which resets
494    the output state to the initial state.  This has to be done during the
495    flushing.  */
496 #define EMIT_SHIFT_TO_INIT \
497   if (FROM_DIRECTION)							      \
498     /* Nothing to emit.  */						      \
499     memset (data->__statep, '\0', sizeof (mbstate_t));			      \
500   else									      \
501     {									      \
502       /* The "to UTF-7" direction.  Flush the remaining bits and terminate    \
503 	 with a '-' byte.  This will guarantee correct decoding if more	      \
504 	 UTF-7 encoded text is added afterwards.  */			      \
505       int state = data->__statep->__count;				      \
506 									      \
507       if (state & 0x18)							      \
508 	{								      \
509 	  /* Deactivate base64 encoding.  */				      \
510 	  size_t count = ((state & 0x18) >= 0x10) + 1;			      \
511 									      \
512 	  if (__glibc_unlikely (outbuf + count > outend))		      \
513 	    /* We don't have enough room in the output buffer.  */	      \
514 	    status = __GCONV_FULL_OUTPUT;				      \
515 	  else								      \
516 	    {								      \
517 	      /* Write out the shift sequence.  */			      \
518 	      if ((state & 0x18) >= 0x10)				      \
519 		*outbuf++ = base64 ((state >> 3) & ~3);			      \
520 	      *outbuf++ = '-';						      \
521 									      \
522 	      data->__statep->__count = 0;				      \
523 	    }								      \
524 	}								      \
525       else								      \
526 	data->__statep->__count = 0;					      \
527     }
528 
529 
530 /* Now define the toplevel functions.  */
531 #include <iconv/skeleton.c>
532