1 /*
2 * Unicode utilities
3 *
4 * Copyright (c) 2017-2018 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <stdarg.h>
27 #include <string.h>
28 #include <assert.h>
29
30 #include "cutils.h"
31 #include "libunicode.h"
32 #include "libunicode-table.h"
33
/* Run types stored in the 4-bit 'type' field of each case_conv_table1
   entry; lre_case_conv() and unicode_case1() dispatch on them.
   The enumerator order is significant: the decoders use the numeric
   values directly (e.g. 'type & 1', 'type >= RUN_TYPE_UF',
   'type - RUN_TYPE_U_EXT' and 'type - RUN_TYPE_U_EXT2'). */
enum {
    RUN_TYPE_U,
    RUN_TYPE_L,
    RUN_TYPE_UF,
    RUN_TYPE_LF,
    RUN_TYPE_UL,
    RUN_TYPE_LSU,
    RUN_TYPE_U2L_399_EXT2,
    RUN_TYPE_UF_D20,
    RUN_TYPE_UF_D1_EXT,
    RUN_TYPE_U_EXT,
    RUN_TYPE_LF_EXT,
    RUN_TYPE_U_EXT2,
    RUN_TYPE_L_EXT2,
    RUN_TYPE_U_EXT3,
};
50
/* Convert the case of code point 'c' and store the result in 'res'.

   conv_type:
   0 = to upper
   1 = to lower
   2 = case folding (= to lower with modifications)

   Return the number of code points written to 'res' (1, 2 or 3), so
   'res' must have room for 3 entries. When no conversion applies, 'c'
   itself is stored and 1 is returned.
*/
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
{
    if (c < 128) {
        /* ASCII fast path: any non-zero conv_type lowercases */
        if (conv_type) {
            if (c >= 'A' && c <= 'Z') {
                c = c - 'A' + 'a';
            }
        } else {
            if (c >= 'a' && c <= 'z') {
                c = c - 'a' + 'A';
            }
        }
    } else {
        uint32_t v, code, data, type, len, a, is_lower;
        int idx, idx_min, idx_max;

        /* lowercasing and case folding share most of the rules */
        is_lower = (conv_type != 0);
        /* binary search for the run [code, code + len) containing 'c' */
        idx_min = 0;
        idx_max = countof(case_conv_table1) - 1;
        while (idx_min <= idx_max) {
            idx = (unsigned)(idx_max + idx_min) / 2;
            v = case_conv_table1[idx];
            /* entry layout: 17 bits start code, 7 bits run length,
               4 bits run type, 4 bits high part of 'data' */
            code = v >> (32 - 17);
            len = (v >> (32 - 17 - 7)) & 0x7f;
            if (c < code) {
                idx_max = idx - 1;
            } else if (c >= code + len) {
                idx_min = idx + 1;
            } else {
                type = (v >> (32 - 17 - 7 - 4)) & 0xf;
                /* 12 bit payload: low 8 bits come from case_conv_table2 */
                data = ((v & 0xf) << 8) | case_conv_table2[idx];
                switch(type) {
                case RUN_TYPE_U:
                case RUN_TYPE_L:
                case RUN_TYPE_UF:
                case RUN_TYPE_LF:
                    /* 'data' is the index of another table1 entry whose
                       start code is the start of the converted run. The
                       low bit of 'type' selects upper (0) / lower (1);
                       the *F variants also apply to case folding. */
                    if (conv_type == (type & 1) ||
                        (type >= RUN_TYPE_UF && conv_type == 2)) {
                        c = c - code + (case_conv_table1[data] >> (32 - 17));
                    }
                    break;
                case RUN_TYPE_UL:
                    /* pairs of adjacent code points: only the member
                       at offset parity (1 - is_lower) is converted, by
                       flipping the low bit of its offset */
                    a = c - code;
                    if ((a & 1) != (1 - is_lower))
                        break;
                    c = (a ^ 1) + code;
                    break;
                case RUN_TYPE_LSU:
                    /* run of three code points: the middle one moves by
                       +/-1 and one of the outer ones moves by +/-2,
                       the direction depending on is_lower */
                    a = c - code;
                    if (a == 1) {
                        c += 2 * is_lower - 1;
                    } else if (a == (1 - is_lower) * 2) {
                        c += (2 * is_lower - 1) * 2;
                    }
                    break;
                case RUN_TYPE_U2L_399_EXT2:
                    /* 'data' packs two 6 bit indexes into case_conv_ext:
                       uppercasing yields two code points, the second
                       one being always 0x399 */
                    if (!is_lower) {
                        res[0] = c - code + case_conv_ext[data >> 6];
                        res[1] = 0x399;
                        return 2;
                    } else {
                        c = c - code + case_conv_ext[data & 0x3f];
                    }
                    break;
                case RUN_TYPE_UF_D20:
                    /* 'data' is the upper case code point; the folded
                       form is 0x20 above it; lowercasing is a no-op */
                    if (conv_type == 1)
                        break;
                    c = data + (conv_type == 2) * 0x20;
                    break;
                case RUN_TYPE_UF_D1_EXT:
                    /* same as above with the upper case value taken from
                       case_conv_ext and a distance of 1 */
                    if (conv_type == 1)
                        break;
                    c = case_conv_ext[data] + (conv_type == 2);
                    break;
                case RUN_TYPE_U_EXT:
                case RUN_TYPE_LF_EXT:
                    /* single result in case_conv_ext; U_EXT applies to
                       uppercasing, LF_EXT to lowercasing and folding */
                    if (is_lower != (type - RUN_TYPE_U_EXT))
                        break;
                    c = case_conv_ext[data];
                    break;
                case RUN_TYPE_U_EXT2:
                case RUN_TYPE_L_EXT2:
                    /* two code point result; note that the test uses
                       conv_type (not is_lower), so case folding
                       (conv_type == 2) never matches here */
                    if (conv_type != (type - RUN_TYPE_U_EXT2))
                        break;
                    res[0] = c - code + case_conv_ext[data >> 6];
                    res[1] = case_conv_ext[data & 0x3f];
                    return 2;
                default:
                case RUN_TYPE_U_EXT3:
                    /* three code point uppercase result: 'data' packs
                       three 4 bit indexes into case_conv_ext */
                    if (conv_type != 0)
                        break;
                    res[0] = case_conv_ext[data >> 8];
                    res[1] = case_conv_ext[(data >> 4) & 0xf];
                    res[2] = case_conv_ext[data & 0xf];
                    return 3;
                }
                /* the run was found: stop the binary search */
                break;
            }
        }
    }
    res[0] = c;
    return 1;
}
159
/* Read a 24-bit little-endian unsigned integer from the three bytes at
   'ptr'.

   The bytes are always assembled one at a time: the former x86 fast
   path read the first two bytes through a 'uint16_t *' cast, which
   violates the strict aliasing rule, may be unaligned and discards the
   'const' qualifier. Compilers turn the byte-wise form into an
   equivalent load anyway. */
static uint32_t get_le24(const uint8_t *ptr)
{
    return (uint32_t)ptr[0] |
        ((uint32_t)ptr[1] << 8) |
        ((uint32_t)ptr[2] << 16);
}
168
#define UNICODE_INDEX_BLOCK_LEN 32

/* Look up 'c' in an index table made of 'index_table_len' 3 byte
   little-endian entries (21 bit code point in the low bits, 3 bit
   offset above them).

   Return -1 if 'c' is at or beyond the last entry (not in table).
   Otherwise return the position in the associated data table where
   decoding must start and store in '*pcode' the code point matching
   that position. */
static int get_index_pos(uint32_t *pcode, uint32_t c,
                         const uint8_t *index_table, int index_table_len)
{
    const uint32_t code_mask = (1 << 21) - 1;
    uint32_t entry;
    int lo, hi, mid;

    /* before the first indexed code point: decode from the very start */
    entry = get_le24(index_table);
    if (c < (entry & code_mask)) {
        *pcode = 0;
        return 0;
    }

    /* the last entry is compared as a whole (no masking) */
    hi = index_table_len - 1;
    if (c >= get_le24(index_table + hi * 3))
        return -1;

    /* bisection keeping: code(lo) <= c < code(hi) */
    lo = 0;
    while (hi - lo > 1) {
        mid = (hi + lo) / 2;
        entry = get_le24(index_table + mid * 3);
        if (c < (entry & code_mask))
            hi = mid;
        else
            lo = mid;
    }

    entry = get_le24(index_table + lo * 3);
    *pcode = entry & code_mask;
    return (lo + 1) * UNICODE_INDEX_BLOCK_LEN + (entry >> 21);
}
204
lre_is_in_table(uint32_t c,const uint8_t * table,const uint8_t * index_table,int index_table_len)205 static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
206 const uint8_t *index_table, int index_table_len)
207 {
208 uint32_t code, b, bit;
209 int pos;
210 const uint8_t *p;
211
212 pos = get_index_pos(&code, c, index_table, index_table_len);
213 if (pos < 0)
214 return FALSE; /* outside the table */
215 p = table + pos;
216 bit = 0;
217 for(;;) {
218 b = *p++;
219 if (b < 64) {
220 code += (b >> 3) + 1;
221 if (c < code)
222 return bit;
223 bit ^= 1;
224 code += (b & 7) + 1;
225 } else if (b >= 0x80) {
226 code += b - 0x80 + 1;
227 } else if (b < 0x60) {
228 code += (((b - 0x40) << 8) | p[0]) + 1;
229 p++;
230 } else {
231 code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
232 p += 2;
233 }
234 if (c < code)
235 return bit;
236 bit ^= 1;
237 }
238 }
239
lre_is_cased(uint32_t c)240 BOOL lre_is_cased(uint32_t c)
241 {
242 uint32_t v, code, len;
243 int idx, idx_min, idx_max;
244
245 idx_min = 0;
246 idx_max = countof(case_conv_table1) - 1;
247 while (idx_min <= idx_max) {
248 idx = (unsigned)(idx_max + idx_min) / 2;
249 v = case_conv_table1[idx];
250 code = v >> (32 - 17);
251 len = (v >> (32 - 17 - 7)) & 0x7f;
252 if (c < code) {
253 idx_max = idx - 1;
254 } else if (c >= code + len) {
255 idx_min = idx + 1;
256 } else {
257 return TRUE;
258 }
259 }
260 return lre_is_in_table(c, unicode_prop_Cased1_table,
261 unicode_prop_Cased1_index,
262 sizeof(unicode_prop_Cased1_index) / 3);
263 }
264
lre_is_case_ignorable(uint32_t c)265 BOOL lre_is_case_ignorable(uint32_t c)
266 {
267 return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
268 unicode_prop_Case_Ignorable_index,
269 sizeof(unicode_prop_Case_Ignorable_index) / 3);
270 }
271
272 /* character range */
273
cr_dump(CharRange * cr)274 static __maybe_unused void cr_dump(CharRange *cr)
275 {
276 int i;
277 for(i = 0; i < cr->len; i++)
278 printf("%d: 0x%04x\n", i, cr->points[i]);
279 }
280
/* Default allocator used when cr_init() is given no realloc function.
   'opaque' is ignored.

   A zero 'size' releases 'ptr' and returns NULL: cr_free() relies on
   this, and the result of realloc(ptr, 0) is implementation-defined
   (it may return a live minimal allocation that would then leak). */
static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
{
    (void)opaque;
    if (size == 0) {
        free(ptr);
        return NULL;
    }
    return realloc(ptr, size);
}
285
/* Initialize 'cr' as an empty range using the given allocator; a NULL
   'realloc_func' selects cr_default_realloc(). */
void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
{
    if (!realloc_func)
        realloc_func = cr_default_realloc;

    cr->points = NULL;
    cr->len = 0;
    cr->size = 0;
    cr->mem_opaque = mem_opaque;
    cr->realloc_func = realloc_func;
}
293
/* Release the point array of 'cr' by asking its allocator for a zero
   sized block. The fields of 'cr' are left untouched. */
void cr_free(CharRange *cr)
{
    DynBufReallocFunc *release = cr->realloc_func;

    release(cr->mem_opaque, cr->points, 0);
}
298
/* Make sure 'cr' can hold at least 'size' points. The capacity grows
   by at least 50% to amortize repeated insertions. Return 0 if OK,
   -1 on allocation failure (in which case 'cr' is unchanged). */
int cr_realloc(CharRange *cr, int size)
{
    uint32_t *grown;
    int capacity;

    if (size <= cr->size)
        return 0; /* already large enough */

    capacity = max_int(size, cr->size * 3 / 2);
    grown = cr->realloc_func(cr->mem_opaque, cr->points,
                             capacity * sizeof(cr->points[0]));
    if (!grown)
        return -1;
    cr->points = grown;
    cr->size = capacity;
    return 0;
}
315
/* Replace the content of 'cr' with a copy of the points of 'cr1'.
   Return 0 if OK, -1 on allocation failure. */
int cr_copy(CharRange *cr, const CharRange *cr1)
{
    if (cr_realloc(cr, cr1->len))
        return -1;
    /* an empty range may have a NULL 'points' array on either side and
       passing a null pointer to memcpy() is undefined even when the
       length is zero */
    if (cr1->len > 0)
        memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
    cr->len = cr1->len;
    return 0;
}
324
325 /* merge consecutive intervals and remove empty intervals */
cr_compress(CharRange * cr)326 static void cr_compress(CharRange *cr)
327 {
328 int i, j, k, len;
329 uint32_t *pt;
330
331 pt = cr->points;
332 len = cr->len;
333 i = 0;
334 j = 0;
335 k = 0;
336 while ((i + 1) < len) {
337 if (pt[i] == pt[i + 1]) {
338 /* empty interval */
339 i += 2;
340 } else {
341 j = i;
342 while ((j + 3) < len && pt[j + 1] == pt[j + 2])
343 j += 2;
344 /* just copy */
345 pt[k] = pt[i];
346 pt[k + 1] = pt[j + 1];
347 k += 2;
348 i = j + 2;
349 }
350 }
351 cr->len = k;
352 }
353
354 /* union or intersection */
cr_op(CharRange * cr,const uint32_t * a_pt,int a_len,const uint32_t * b_pt,int b_len,int op)355 int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
356 const uint32_t *b_pt, int b_len, int op)
357 {
358 int a_idx, b_idx, is_in;
359 uint32_t v;
360
361 a_idx = 0;
362 b_idx = 0;
363 for(;;) {
364 /* get one more point from a or b in increasing order */
365 if (a_idx < a_len && b_idx < b_len) {
366 if (a_pt[a_idx] < b_pt[b_idx]) {
367 goto a_add;
368 } else if (a_pt[a_idx] == b_pt[b_idx]) {
369 v = a_pt[a_idx];
370 a_idx++;
371 b_idx++;
372 } else {
373 goto b_add;
374 }
375 } else if (a_idx < a_len) {
376 a_add:
377 v = a_pt[a_idx++];
378 } else if (b_idx < b_len) {
379 b_add:
380 v = b_pt[b_idx++];
381 } else {
382 break;
383 }
384 /* add the point if the in/out status changes */
385 switch(op) {
386 case CR_OP_UNION:
387 is_in = (a_idx & 1) | (b_idx & 1);
388 break;
389 case CR_OP_INTER:
390 is_in = (a_idx & 1) & (b_idx & 1);
391 break;
392 case CR_OP_XOR:
393 is_in = (a_idx & 1) ^ (b_idx & 1);
394 break;
395 default:
396 abort();
397 }
398 if (is_in != (cr->len & 1)) {
399 if (cr_add_point(cr, v))
400 return -1;
401 }
402 }
403 cr_compress(cr);
404 return 0;
405 }
406
/* In-place union: 'cr' becomes the union of its previous content and
   of the 'b_len' points of 'b_pt'. Return the status of cr_op(). The
   previous point array is released in every case. */
int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len)
{
    CharRange prev;
    int status;

    /* detach the current content so that 'cr' can receive the result */
    prev = *cr;
    cr->points = NULL;
    cr->size = 0;
    cr->len = 0;

    status = cr_op(cr, prev.points, prev.len, b_pt, b_len, CR_OP_UNION);
    cr_free(&prev);
    return status;
}
418
/* Complement 'cr' in place by surrounding its points with 0 and
   UINT32_MAX and compressing the result. Return 0 if OK, -1 on
   allocation failure. */
int cr_invert(CharRange *cr)
{
    const int old_len = cr->len;
    uint32_t *pts;

    if (cr_realloc(cr, old_len + 2) != 0)
        return -1;

    pts = cr->points;
    /* shift everything by one slot to make room for the leading 0 */
    memmove(pts + 1, pts, old_len * sizeof(pts[0]));
    pts[0] = 0;
    pts[old_len + 1] = UINT32_MAX;
    cr->len = old_len + 2;
    cr_compress(cr);
    return 0;
}
432
433 #ifdef CONFIG_ALL_UNICODE
434
lre_is_id_start(uint32_t c)435 BOOL lre_is_id_start(uint32_t c)
436 {
437 return lre_is_in_table(c, unicode_prop_ID_Start_table,
438 unicode_prop_ID_Start_index,
439 sizeof(unicode_prop_ID_Start_index) / 3);
440 }
441
lre_is_id_continue(uint32_t c)442 BOOL lre_is_id_continue(uint32_t c)
443 {
444 return lre_is_id_start(c) ||
445 lre_is_in_table(c, unicode_prop_ID_Continue1_table,
446 unicode_prop_ID_Continue1_index,
447 sizeof(unicode_prop_ID_Continue1_index) / 3);
448 }
449
/* maximum number of code points produced by the decomposition of a
   single character (see DECOMP_TYPE_B18) */
#define UNICODE_DECOMP_LEN_MAX 18

/* Encoding of a decomposition run, stored in the 6 bit 'type' field of
   each unicode_decomp_table1 entry and decoded by
   unicode_decomp_entry(). The enumerator order is significant: the
   decoder uses ranges and differences such as
   'type - DECOMP_TYPE_L1 + 1' to derive the decomposition length. */
typedef enum {
    DECOMP_TYPE_C1, /* 16 bit char */
    DECOMP_TYPE_L1, /* 16 bit char table */
    DECOMP_TYPE_L2,
    DECOMP_TYPE_L3,
    DECOMP_TYPE_L4,
    DECOMP_TYPE_L5, /* XXX: not used */
    DECOMP_TYPE_L6, /* XXX: could remove */
    DECOMP_TYPE_L7, /* XXX: could remove */
    DECOMP_TYPE_LL1, /* 18 bit char table */
    DECOMP_TYPE_LL2,
    DECOMP_TYPE_S1, /* 8 bit char table */
    DECOMP_TYPE_S2,
    DECOMP_TYPE_S3,
    DECOMP_TYPE_S4,
    DECOMP_TYPE_S5,
    DECOMP_TYPE_I1, /* increment 16 bit char value */
    DECOMP_TYPE_I2_0,
    DECOMP_TYPE_I2_1,
    DECOMP_TYPE_I3_1,
    DECOMP_TYPE_I3_2,
    DECOMP_TYPE_I4_1,
    DECOMP_TYPE_I4_2,
    DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
    DECOMP_TYPE_B2,
    DECOMP_TYPE_B3,
    DECOMP_TYPE_B4,
    DECOMP_TYPE_B5,
    DECOMP_TYPE_B6,
    DECOMP_TYPE_B7,
    DECOMP_TYPE_B8,
    DECOMP_TYPE_B18,
    DECOMP_TYPE_LS2, /* 16 bit char followed by a short code */
    DECOMP_TYPE_PAT3, /* fixed first and last char, variable middle char */
    DECOMP_TYPE_S2_UL, /* S2 entries shared by an upper/lower case pair */
    DECOMP_TYPE_LS2_UL, /* LS2 entries shared by an upper/lower case pair */
} DecompTypeEnum;
489
/* Expand an 8 bit "short code" to a code point:
   0x00..0x7f map to themselves, 0x80..0xcf map to 0x300..0x34f and
   0xd0, 0xd1 map to 0x2044, 0x2215. Larger values are not supported. */
static uint32_t unicode_get_short_code(uint32_t c)
{
    static const uint16_t extra_codes[2] = { 0x2044, 0x2215 };

    if (c >= 0x80 + 0x50)
        return extra_codes[c - (0x80 + 0x50)];
    if (c >= 0x80)
        return c + (0x300 - 0x80);
    return c;
}
501
/* Simplified lower case mapping used by the *_UL decomposition types:
   the lower case form is 0x20 above 'c' below 0x100 and in
   0x410..0x42f, and 1 above 'c' everywhere else. */
static uint32_t unicode_get_lower_simple(uint32_t c)
{
    int wide_gap = (c < 0x100) || (c >= 0x410 && c <= 0x42f);

    return c + (wide_gap ? 0x20 : 1);
}
510
/* Read a 16 bit little-endian value from the two bytes at 'p'. */
static uint16_t unicode_get16(const uint8_t *p)
{
    uint16_t lo = p[0];
    uint16_t hi = p[1];

    return (uint16_t)(lo | (hi << 8));
}
515
/* Decode the decomposition of 'c' from the decomposition run number
   'idx', which covers the code points [code, code + len) and is
   encoded with 'type' (a DecompTypeEnum value).

   The code points are stored in 'res' and their count is returned;
   0 is returned when the table holds no decomposition for 'c' (a zero
   entry) or when 'type' is not handled. 'res' must be large enough for
   the given type (at most UNICODE_DECOMP_LEN_MAX entries). */
static int unicode_decomp_entry(uint32_t *res, uint32_t c,
                                int idx, uint32_t code, uint32_t len,
                                uint32_t type)
{
    uint32_t c1;
    int l, i, p;
    const uint8_t *d;

    if (type == DECOMP_TYPE_C1) {
        /* the table2 value is the decomposition itself */
        res[0] = unicode_decomp_table2[idx];
        return 1;
    } else {
        /* otherwise table2 gives the offset of the run data */
        d = unicode_decomp_data + unicode_decomp_table2[idx];
        switch(type) {
        case DECOMP_TYPE_L1 ... DECOMP_TYPE_L7:
            /* 'l' 16 bit values per code point of the run */
            l = type - DECOMP_TYPE_L1 + 1;
            d += (c - code) * l * 2;
            for(i = 0; i < l; i++) {
                if ((res[i] = unicode_get16(d + 2 * i)) == 0)
                    return 0;
            }
            return l;
        case DECOMP_TYPE_LL1 ... DECOMP_TYPE_LL2:
            {
                /* 18 bit values: the low 16 bits are stored first for
                   the whole run, followed by the 2 high bits of each
                   value packed four per byte. Note that this 'p'
                   shadows the outer 'p'. */
                uint32_t k, p;
                l = type - DECOMP_TYPE_LL1 + 1;
                k = (c - code) * l;
                p = len * l * 2;
                for(i = 0; i < l; i++) {
                    c1 = unicode_get16(d + 2 * k) |
                        (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
                    if (!c1)
                        return 0;
                    res[i] = c1;
                    k++;
                }
            }
            return l;
        case DECOMP_TYPE_S1 ... DECOMP_TYPE_S5:
            /* 'l' 8 bit short codes per code point of the run */
            l = type - DECOMP_TYPE_S1 + 1;
            d += (c - code) * l;
            for(i = 0; i < l; i++) {
                if ((res[i] = unicode_get_short_code(d[i])) == 0)
                    return 0;
            }
            return l;
        case DECOMP_TYPE_I1:
            l = 1;
            p = 0;
            goto decomp_type_i;
        case DECOMP_TYPE_I2_0:
        case DECOMP_TYPE_I2_1:
        case DECOMP_TYPE_I3_1:
        case DECOMP_TYPE_I3_2:
        case DECOMP_TYPE_I4_1:
        case DECOMP_TYPE_I4_2:
            /* In_p: 'l' = n values, the one at index 'p' being
               incremented by the offset of 'c' in the run */
            l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
            p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
        decomp_type_i:
            /* a single template is shared by the whole run */
            for(i = 0; i < l; i++) {
                c1 = unicode_get16(d + 2 * i);
                if (i == p)
                    c1 += c - code;
                res[i] = c1;
            }
            return l;
        case DECOMP_TYPE_B18:
            l = 18;
            goto decomp_type_b;
        case DECOMP_TYPE_B1 ... DECOMP_TYPE_B8:
            l = type - DECOMP_TYPE_B1 + 1;
        decomp_type_b:
            {
                /* 16 bit base followed by 'l' 8 bit offsets per code
                   point; the offset 0xff encodes a space (0x20) */
                uint32_t c_min;
                c_min = unicode_get16(d);
                d += 2 + (c - code) * l;
                for(i = 0; i < l; i++) {
                    c1 = d[i];
                    if (c1 == 0xff)
                        c1 = 0x20;
                    else
                        c1 += c_min;
                    res[i] = c1;
                }
            }
            return l;
        case DECOMP_TYPE_LS2:
            /* 3 bytes per code point: 16 bit value + short code */
            d += (c - code) * 3;
            if (!(res[0] = unicode_get16(d)))
                return 0;
            res[1] = unicode_get_short_code(d[2]);
            return 2;
        case DECOMP_TYPE_PAT3:
            /* the first and last values are shared by the whole run,
               only the middle one is stored per code point */
            res[0] = unicode_get16(d);
            res[2] = unicode_get16(d + 2);
            d += 4 + (c - code) * 2;
            res[1] = unicode_get16(d);
            return 3;
        case DECOMP_TYPE_S2_UL:
        case DECOMP_TYPE_LS2_UL:
            /* each stored entry serves two consecutive code points:
               the even offset uses it as is, the odd offset uses the
               lower case form of its first value */
            c1 = c - code;
            if (type == DECOMP_TYPE_S2_UL) {
                d += c1 & ~1;
                c = unicode_get_short_code(*d);
                d++;
            } else {
                d += (c1 >> 1) * 3;
                c = unicode_get16(d);
                d += 2;
            }
            if (c1 & 1)
                c = unicode_get_lower_simple(c);
            res[0] = c;
            res[1] = unicode_get_short_code(*d);
            return 2;
        }
    }
    /* unhandled type */
    return 0;
}
635
636
637 /* return the length of the decomposition (length <=
638 UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
unicode_decomp_char(uint32_t * res,uint32_t c,BOOL is_compat1)639 static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
640 {
641 uint32_t v, type, is_compat, code, len;
642 int idx_min, idx_max, idx;
643
644 idx_min = 0;
645 idx_max = countof(unicode_decomp_table1) - 1;
646 while (idx_min <= idx_max) {
647 idx = (idx_max + idx_min) / 2;
648 v = unicode_decomp_table1[idx];
649 code = v >> (32 - 18);
650 len = (v >> (32 - 18 - 7)) & 0x7f;
651 // printf("idx=%d code=%05x len=%d\n", idx, code, len);
652 if (c < code) {
653 idx_max = idx - 1;
654 } else if (c >= code + len) {
655 idx_min = idx + 1;
656 } else {
657 is_compat = v & 1;
658 if (is_compat1 < is_compat)
659 break;
660 type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
661 return unicode_decomp_entry(res, c, idx, code, len, type);
662 }
663 }
664 return 0;
665 }
666
667 /* return 0 if no pair found */
unicode_compose_pair(uint32_t c0,uint32_t c1)668 static int unicode_compose_pair(uint32_t c0, uint32_t c1)
669 {
670 uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
671 int idx_min, idx_max, idx, d;
672 uint32_t pair[2];
673
674 idx_min = 0;
675 idx_max = countof(unicode_comp_table) - 1;
676 while (idx_min <= idx_max) {
677 idx = (idx_max + idx_min) / 2;
678 idx1 = unicode_comp_table[idx];
679
680 /* idx1 represent an entry of the decomposition table */
681 d_idx = idx1 >> 6;
682 d_offset = idx1 & 0x3f;
683 v = unicode_decomp_table1[d_idx];
684 code = v >> (32 - 18);
685 len = (v >> (32 - 18 - 7)) & 0x7f;
686 type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
687 ch = code + d_offset;
688 unicode_decomp_entry(pair, ch, d_idx, code, len, type);
689 d = c0 - pair[0];
690 if (d == 0)
691 d = c1 - pair[1];
692 if (d < 0) {
693 idx_max = idx - 1;
694 } else if (d > 0) {
695 idx_min = idx + 1;
696 } else {
697 return ch;
698 }
699 }
700 return 0;
701 }
702
703 /* return the combining class of character c (between 0 and 255) */
unicode_get_cc(uint32_t c)704 static int unicode_get_cc(uint32_t c)
705 {
706 uint32_t code, n, type, cc, c1, b;
707 int pos;
708 const uint8_t *p;
709
710 pos = get_index_pos(&code, c,
711 unicode_cc_index, sizeof(unicode_cc_index) / 3);
712 if (pos < 0)
713 return 0;
714 p = unicode_cc_table + pos;
715 for(;;) {
716 b = *p++;
717 type = b >> 6;
718 n = b & 0x3f;
719 if (n < 48) {
720 } else if (n < 56) {
721 n = (n - 48) << 8;
722 n |= *p++;
723 n += 48;
724 } else {
725 n = (n - 56) << 8;
726 n |= *p++ << 8;
727 n |= *p++;
728 n += 48 + (1 << 11);
729 }
730 if (type <= 1)
731 p++;
732 c1 = code + n + 1;
733 if (c < c1) {
734 switch(type) {
735 case 0:
736 cc = p[-1];
737 break;
738 case 1:
739 cc = p[-1] + c - code;
740 break;
741 case 2:
742 cc = 0;
743 break;
744 default:
745 case 3:
746 cc = 230;
747 break;
748 }
749 return cc;
750 }
751 code = c1;
752 }
753 }
754
/* Put each maximal sequence of characters with a non-zero combining
   class in increasing combining class order. The sort is a stable
   insertion sort: characters of equal class keep their order. */
static void sort_cc(int *buf, int len)
{
    int pos = 0;

    while (pos < len) {
        int run_start, next;

        if (unicode_get_cc(buf[pos]) == 0) {
            pos++;
            continue;
        }

        /* buf[pos] starts a sequence of combining characters */
        run_start = pos;
        for (next = pos + 1; next < len; next++) {
            int ch = buf[next];
            int ch_cc = unicode_get_cc(ch);
            int slot;

            if (ch_cc == 0)
                break;
            /* shift the larger classes to the right and insert 'ch' */
            for (slot = next; slot > run_start; slot--) {
                if (unicode_get_cc(buf[slot - 1]) <= ch_cc)
                    break;
                buf[slot] = buf[slot - 1];
            }
            buf[slot] = ch;
        }
        /* buf[next] (if any) has class 0: resume after it */
        pos = next + 1;
    }
}
790
/* Append to 'dbuf' (as 32 bit values) the full decomposition of the
   'src_len' code points of 'src'. The decomposition of a character is
   itself decomposed recursively. 'is_compat' enables the compatibility
   decompositions. Errors are recorded in 'dbuf' by dbuf_put_u32(). */
static void to_nfd_rec(DynBuf *dbuf,
                       const int *src, int src_len, int is_compat)
{
    uint32_t decomp[UNICODE_DECOMP_LEN_MAX];
    int pos;

    for (pos = 0; pos < src_len; pos++) {
        uint32_t c = src[pos];
        int n;

        if (c >= 0xac00 && c < 0xd7a4) {
            /* Hangul syllable: computed arithmetically */
            uint32_t s_index = c - 0xac00;
            uint32_t trail = s_index % 28;

            dbuf_put_u32(dbuf, 0x1100 + s_index / 588);
            dbuf_put_u32(dbuf, 0x1161 + (s_index % 588) / 28);
            if (trail != 0)
                dbuf_put_u32(dbuf, 0x11a7 + trail);
            continue;
        }

        n = unicode_decomp_char(decomp, c, is_compat);
        if (n != 0)
            to_nfd_rec(dbuf, (int *)decomp, n, is_compat);
        else
            dbuf_put_u32(dbuf, c);
    }
}
818
/* Return the composition of the pair (c0, c1), or 0 if not found.
   The Hangul syllables are composed arithmetically, everything else
   goes through unicode_compose_pair(). */
static int compose_pair(uint32_t c0, uint32_t c1)
{
    /* Hangul composition */
    if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
        c1 >= 0x1161 && c1 < 0x1161 + 21) {
        /* leading consonant + vowel -> LV syllable */
        return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
    } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
               (c0 - 0xac00) % 28 == 0 &&
               c1 > 0x11a7 && c1 < 0x11a7 + 28) {
        /* LV syllable + trailing consonant -> LVT syllable. The
           trailing index must be strictly positive: 0x11a7 itself is
           not a trailing consonant, and accepting it would return c0
           unchanged, making the caller drop the 0x11a7 character. */
        return c0 + c1 - 0x11a7;
    } else {
        return unicode_compose_pair(c0, c1);
    }
}
834
/* Normalize the 'src_len' code points of 'src' according to 'n_type'.

   On success the length of the result is returned and '*pdst' is set
   to a buffer allocated with 'realloc_func' ('opaque' being passed
   through dbuf_init2()); the caller owns this buffer. On error -1 is
   returned and '*pdst' is set to NULL.

   Bit 1 of 'n_type' selects the compatibility decompositions and
   bit 0 disables the final composition step. */
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
                      UnicodeNormalizationEnum n_type,
                      void *opaque, DynBufReallocFunc *realloc_func)
{
    int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
    BOOL is_compat;
    DynBuf dbuf_s, *dbuf = &dbuf_s;

    is_compat = n_type >> 1;

    dbuf_init2(dbuf, opaque, realloc_func);
    /* reserve room for the common case where the length is unchanged */
    if (dbuf_realloc(dbuf, sizeof(int) * src_len))
        goto fail;

    /* common case: latin1 is unaffected by NFC */
    if (n_type == UNICODE_NFC) {
        for(i = 0; i < src_len; i++) {
            if (src[i] >= 0x100)
                goto not_latin1;
        }
        /* plain copy into the space reserved above */
        buf = (int *)dbuf->buf;
        memcpy(buf, src, src_len * sizeof(int));
        *pdst = (uint32_t *)buf;
        return src_len;
    not_latin1: ;
    }

    /* step 1: full decomposition */
    to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
    if (dbuf_error(dbuf)) {
        /* NOTE(review): the buffer owned by 'dbuf' is not released on
           this path - confirm whether this leaks */
    fail:
        *pdst = NULL;
        return -1;
    }
    buf = (int *)dbuf->buf;
    buf_len = dbuf->size / sizeof(int);

    /* step 2: order the combining characters by combining class */
    sort_cc(buf, buf_len);

    if (buf_len <= 1 || (n_type & 1) != 0) {
        /* NFD / NFKD */
        *pdst = (uint32_t *)buf;
        return buf_len;
    }

    /* step 3: composition, done in place. buf[0..out_len-1] is the
       output built so far and 'i' is the next input character. */
    i = 1;
    out_len = 1;
    while (i < buf_len) {
        /* find the starter character and test if it is blocked from
           the character at 'i' */
        last_cc = unicode_get_cc(buf[i]);
        starter_pos = out_len - 1;
        while (starter_pos >= 0) {
            cc = unicode_get_cc(buf[starter_pos]);
            if (cc == 0)
                break;
            if (cc >= last_cc)
                goto next;
            /* only the character just before 'i' is compared with its
               class: 256 is above any class, so that the remaining
               characters never block */
            last_cc = 256;
            starter_pos--;
        }
        if (starter_pos >= 0 &&
            (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
            /* the composed character replaces the starter */
            buf[starter_pos] = p;
            i++;
        } else {
        next:
            /* no composition: keep the character */
            buf[out_len++] = buf[i++];
        }
    }
    *pdst = (uint32_t *)buf;
    return out_len;
}
907
908 /* char ranges for various unicode properties */
909
/* Search 'name' in 'name_table', a sequence of null terminated entries
   ended by an empty entry, each entry being a comma separated list of
   aliases. Return the index of the entry containing 'name' or -1 if
   not found. The comparison is exact and case sensitive. */
static int unicode_find_name(const char *name_table, const char *name)
{
    const size_t wanted = strlen(name);
    const char *cur = name_table;
    int entry;

    for (entry = 0; *cur != '\0'; entry++) {
        const char *comma;

        /* go through the aliases of the current entry */
        do {
            size_t alias_len;

            comma = strchr(cur, ',');
            alias_len = comma ? (size_t)(comma - cur) : strlen(cur);
            if (alias_len == wanted && memcmp(cur, name, wanted) == 0)
                return entry;
            /* skip the alias and its ',' or null terminator */
            cur += alias_len + 1;
        } while (comma);
    }
    return -1;
}
936
937 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
938 if not found */
unicode_script(CharRange * cr,const char * script_name,BOOL is_ext)939 int unicode_script(CharRange *cr,
940 const char *script_name, BOOL is_ext)
941 {
942 int script_idx;
943 const uint8_t *p, *p_end;
944 uint32_t c, c1, b, n, v, v_len, i, type;
945 CharRange cr1_s, *cr1;
946 CharRange cr2_s, *cr2 = &cr2_s;
947 BOOL is_common;
948
949 script_idx = unicode_find_name(unicode_script_name_table, script_name);
950 if (script_idx < 0)
951 return -2;
952 /* Note: we remove the "Unknown" Script */
953 script_idx += UNICODE_SCRIPT_Unknown + 1;
954
955 is_common = (script_idx == UNICODE_SCRIPT_Common ||
956 script_idx == UNICODE_SCRIPT_Inherited);
957 if (is_ext) {
958 cr1 = &cr1_s;
959 cr_init(cr1, cr->mem_opaque, cr->realloc_func);
960 cr_init(cr2, cr->mem_opaque, cr->realloc_func);
961 } else {
962 cr1 = cr;
963 }
964
965 p = unicode_script_table;
966 p_end = unicode_script_table + countof(unicode_script_table);
967 c = 0;
968 while (p < p_end) {
969 b = *p++;
970 type = b >> 7;
971 n = b & 0x7f;
972 if (n < 96) {
973 } else if (n < 112) {
974 n = (n - 96) << 8;
975 n |= *p++;
976 n += 96;
977 } else {
978 n = (n - 112) << 16;
979 n |= *p++ << 8;
980 n |= *p++;
981 n += 96 + (1 << 12);
982 }
983 if (type == 0)
984 v = 0;
985 else
986 v = *p++;
987 c1 = c + n + 1;
988 if (v == script_idx) {
989 if (cr_add_interval(cr1, c, c1))
990 goto fail;
991 }
992 c = c1;
993 }
994
995 if (is_ext) {
996 /* add the script extensions */
997 p = unicode_script_ext_table;
998 p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
999 c = 0;
1000 while (p < p_end) {
1001 b = *p++;
1002 if (b < 128) {
1003 n = b;
1004 } else if (b < 128 + 64) {
1005 n = (b - 128) << 8;
1006 n |= *p++;
1007 n += 128;
1008 } else {
1009 n = (b - 128 - 64) << 16;
1010 n |= *p++ << 8;
1011 n |= *p++;
1012 n += 128 + (1 << 14);
1013 }
1014 c1 = c + n + 1;
1015 v_len = *p++;
1016 if (is_common) {
1017 if (v_len != 0) {
1018 if (cr_add_interval(cr2, c, c1))
1019 goto fail;
1020 }
1021 } else {
1022 for(i = 0; i < v_len; i++) {
1023 if (p[i] == script_idx) {
1024 if (cr_add_interval(cr2, c, c1))
1025 goto fail;
1026 break;
1027 }
1028 }
1029 }
1030 p += v_len;
1031 c = c1;
1032 }
1033 if (is_common) {
1034 /* remove all the characters with script extensions */
1035 if (cr_invert(cr2))
1036 goto fail;
1037 if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1038 CR_OP_INTER))
1039 goto fail;
1040 } else {
1041 if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1042 CR_OP_UNION))
1043 goto fail;
1044 }
1045 cr_free(cr1);
1046 cr_free(cr2);
1047 }
1048 return 0;
1049 fail:
1050 if (is_ext) {
1051 cr_free(cr1);
1052 cr_free(cr2);
1053 }
1054 goto fail;
1055 }
1056
1057 #define M(id) (1U << UNICODE_GC_ ## id)
1058
unicode_general_category1(CharRange * cr,uint32_t gc_mask)1059 static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
1060 {
1061 const uint8_t *p, *p_end;
1062 uint32_t c, c0, b, n, v;
1063
1064 p = unicode_gc_table;
1065 p_end = unicode_gc_table + countof(unicode_gc_table);
1066 c = 0;
1067 while (p < p_end) {
1068 b = *p++;
1069 n = b >> 5;
1070 v = b & 0x1f;
1071 if (n == 7) {
1072 n = *p++;
1073 if (n < 128) {
1074 n += 7;
1075 } else if (n < 128 + 64) {
1076 n = (n - 128) << 8;
1077 n |= *p++;
1078 n += 7 + 128;
1079 } else {
1080 n = (n - 128 - 64) << 16;
1081 n |= *p++ << 8;
1082 n |= *p++;
1083 n += 7 + 128 + (1 << 14);
1084 }
1085 }
1086 c0 = c;
1087 c += n + 1;
1088 if (v == 31) {
1089 /* run of Lu / Ll */
1090 b = gc_mask & (M(Lu) | M(Ll));
1091 if (b != 0) {
1092 if (b == (M(Lu) | M(Ll))) {
1093 goto add_range;
1094 } else {
1095 c0 += ((gc_mask & M(Ll)) != 0);
1096 for(; c0 < c; c0 += 2) {
1097 if (cr_add_interval(cr, c0, c0 + 1))
1098 return -1;
1099 }
1100 }
1101 }
1102 } else if ((gc_mask >> v) & 1) {
1103 add_range:
1104 if (cr_add_interval(cr, c0, c))
1105 return -1;
1106 }
1107 }
1108 return 0;
1109 }
1110
/* Add to 'cr' all the characters of the property table number
   'prop_idx'. The table uses the same run length encoding as the one
   decoded by lre_is_in_table(). Return 0 if OK, -1 if an interval
   could not be added. */
static int unicode_prop1(CharRange *cr, int prop_idx)
{
    const uint8_t *rd = unicode_prop_table[prop_idx];
    const uint8_t *end = rd + unicode_prop_len_table[prop_idx];
    uint32_t pos = 0, run_start, byte;
    uint32_t inside = 0;

    while (rd < end) {
        run_start = pos;
        byte = *rd++;
        if (byte < 64) {
            /* two short runs packed in one byte (3 bits each) */
            pos += (byte >> 3) + 1;
            if (inside && cr_add_interval(cr, run_start, pos))
                return -1;
            inside ^= 1;
            run_start = pos;
            pos += (byte & 7) + 1;
        } else if (byte < 0x60) {
            /* 0x40..0x5f: 13 bit run length, one extra byte */
            pos += (((byte - 0x40) << 8) | rd[0]) + 1;
            rd += 1;
        } else if (byte < 0x80) {
            /* 0x60..0x7f: 21 bit run length, two extra bytes */
            pos += (((byte - 0x60) << 16) | (rd[0] << 8) | rd[1]) + 1;
            rd += 2;
        } else {
            /* 0x80..0xff: 7 bit run length */
            pos += byte - 0x80 + 1;
        }
        if (inside && cr_add_interval(cr, run_start, pos))
            return -1;
        inside ^= 1;
    }
    return 0;
}
1149
/* bits of the 'case_mask' argument of unicode_case1() */
#define CASE_U (1 << 0)
#define CASE_L (1 << 1)
#define CASE_F (1 << 2)

/* use the case conversion table to generate range of characters.
   CASE_U: set char if modified by uppercasing,
   CASE_L: set char if modified by lowercasing,
   CASE_F: set char if modified by case folding,

   The intervals are added to 'cr'. Return 0 if OK, -1 if an interval
   could not be added.
*/
static int unicode_case1(CharRange *cr, int case_mask)
{
#define MR(x) (1 << RUN_TYPE_ ## x)
    /* for each bit of 'case_mask' (same order as CASE_U, CASE_L,
       CASE_F), the set of run types containing characters modified by
       the corresponding conversion */
    const uint32_t tab_run_mask[3] = {
        MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
        MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3),

        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2),

        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT),
    };
#undef MR
    uint32_t mask, v, code, type, len, i, idx;

    if (case_mask == 0)
        return 0;
    /* run types to consider for the requested conversions */
    mask = 0;
    for(i = 0; i < 3; i++) {
        if ((case_mask >> i) & 1)
            mask |= tab_run_mask[i];
    }
    for(idx = 0; idx < countof(case_conv_table1); idx++) {
        /* same entry layout as in lre_case_conv() */
        v = case_conv_table1[idx];
        type = (v >> (32 - 17 - 7 - 4)) & 0xf;
        code = v >> (32 - 17);
        len = (v >> (32 - 17 - 7)) & 0x7f;
        if ((mask >> type) & 1) {
            //            printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
            switch(type) {
            case RUN_TYPE_UL:
                /* alternating run: if both directions are requested,
                   the whole run is modified */
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
                    goto def_case;
                /* otherwise every other character: the odd offsets for
                   CASE_U, the even ones for CASE_L / CASE_F */
                code += ((case_mask & CASE_U) != 0);
                for(i = 0; i < len; i += 2) {
                    if (cr_add_interval(cr, code + i, code + i + 1))
                        return -1;
                }
                break;
            case RUN_TYPE_LSU:
                /* run of three characters: the middle one is always
                   modified, the first one only by CASE_L / CASE_F and
                   the last one only by CASE_U */
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
                    goto def_case;
                if (!(case_mask & CASE_U)) {
                    if (cr_add_interval(cr, code, code + 1))
                        return -1;
                }
                if (cr_add_interval(cr, code + 1, code + 2))
                    return -1;
                if (case_mask & CASE_U) {
                    if (cr_add_interval(cr, code + 2, code + 3))
                        return -1;
                }
                break;
            default:
            def_case:
                /* the whole run is modified */
                if (cr_add_interval(cr, code, code + len))
                    return -1;
                break;
            }
        }
    }
    return 0;
}
1221
/* Opcodes of the small stack machine run by unicode_prop_ops().
   POP_GC, POP_PROP and POP_CASE are followed by one int argument and
   push a new range; POP_UNION, POP_INTER and POP_XOR replace the two
   top ranges by their combination; POP_INVERT complements the top
   range; POP_END terminates the program. The order of POP_UNION,
   POP_INTER and POP_XOR is significant: the opcode is converted with
   'op - POP_UNION + CR_OP_UNION'. */
typedef enum {
    POP_GC,
    POP_PROP,
    POP_CASE,
    POP_UNION,
    POP_INTER,
    POP_XOR,
    POP_INVERT,
    POP_END,
} PropOPEnum;

/* size of the range stack of unicode_prop_ops() (checked by assert()) */
#define POP_STACK_LEN_MAX 4
1234
/* Evaluate a program of PropOPEnum opcodes given as variadic 'int'
   arguments and terminated by POP_END, using a stack of at most
   POP_STACK_LEN_MAX ranges:
   - POP_GC, POP_PROP, POP_CASE take one int argument (general
     category mask, property index, case mask) and push the
     corresponding range;
   - POP_UNION, POP_INTER, POP_XOR pop two ranges and push their
     combination (a free stack slot is needed for the result);
   - POP_INVERT complements the range at the top of the stack.
   Exactly one range must remain at POP_END; it is copied to 'cr'.
   The allocator of 'cr' is used for all the temporary ranges.

   Return 0 if OK, -1 on error. A malformed program aborts. */
static int unicode_prop_ops(CharRange *cr, ...)
{
    va_list ap;
    CharRange stack[POP_STACK_LEN_MAX];
    int stack_len, op, ret, i;
    uint32_t a;

    va_start(ap, cr);
    stack_len = 0;
    for(;;) {
        op = va_arg(ap, int);
        switch(op) {
        case POP_GC:
            assert(stack_len < POP_STACK_LEN_MAX);
            a = va_arg(ap, int);
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
            if (unicode_general_category1(&stack[stack_len - 1], a))
                goto fail;
            break;
        case POP_PROP:
            assert(stack_len < POP_STACK_LEN_MAX);
            a = va_arg(ap, int);
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
            if (unicode_prop1(&stack[stack_len - 1], a))
                goto fail;
            break;
        case POP_CASE:
            assert(stack_len < POP_STACK_LEN_MAX);
            a = va_arg(ap, int);
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
            if (unicode_case1(&stack[stack_len - 1], a))
                goto fail;
            break;
        case POP_UNION:
        case POP_INTER:
        case POP_XOR:
            {
                CharRange *cr1, *cr2, *cr3;
                assert(stack_len >= 2);
                assert(stack_len < POP_STACK_LEN_MAX);
                cr1 = &stack[stack_len - 2];
                cr2 = &stack[stack_len - 1];
                /* the result is built in a third slot, which is
                   counted in stack_len so that it is released too if
                   the operation fails */
                cr3 = &stack[stack_len++];
                cr_init(cr3, cr->mem_opaque, cr->realloc_func);
                if (cr_op(cr3, cr1->points, cr1->len,
                          cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
                    goto fail;
                cr_free(cr1);
                cr_free(cr2);
                /* the result takes the place of the first operand */
                *cr1 = *cr3;
                stack_len -= 2;
            }
            break;
        case POP_INVERT:
            assert(stack_len >= 1);
            if (cr_invert(&stack[stack_len - 1]))
                goto fail;
            break;
        case POP_END:
            goto done;
        default:
            abort();
        }
    }
 done:
    /* every va_start() must be matched by a va_end() before returning */
    va_end(ap);
    assert(stack_len == 1);
    ret = cr_copy(cr, &stack[0]);
    cr_free(&stack[0]);
    return ret;
 fail:
    va_end(ap);
    for(i = 0; i < stack_len; i++)
        cr_free(&stack[i]);
    return -1;
}
1309
/* General category masks of the composite (multi-category) names.
   unicode_general_category() indexes this table with
   'gc_idx - UNICODE_GC_LC', so the entries presumably follow the order
   of the composite names in unicode_gc_name_table (not visible here;
   keep both in sync). */
static const uint32_t unicode_gc_mask_table[] = {
    M(Lu) | M(Ll) | M(Lt), /* LC */
    M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
    M(Mn) | M(Mc) | M(Me), /* M */
    M(Nd) | M(Nl) | M(No), /* N */
    M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
    M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
    M(Zs) | M(Zl) | M(Zp), /* Z */
    M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
};
1320
1321 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1322 if not found */
unicode_general_category(CharRange * cr,const char * gc_name)1323 int unicode_general_category(CharRange *cr, const char *gc_name)
1324 {
1325 int gc_idx;
1326 uint32_t gc_mask;
1327
1328 gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
1329 if (gc_idx < 0)
1330 return -2;
1331 if (gc_idx <= UNICODE_GC_Co) {
1332 gc_mask = (uint64_t)1 << gc_idx;
1333 } else {
1334 gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
1335 }
1336 return unicode_general_category1(cr, gc_mask);
1337 }
1338
1339
1340 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1341 if not found */
unicode_prop(CharRange * cr,const char * prop_name)1342 int unicode_prop(CharRange *cr, const char *prop_name)
1343 {
1344 int prop_idx, ret;
1345
1346 prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
1347 if (prop_idx < 0)
1348 return -2;
1349 prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
1350
1351 ret = 0;
1352 switch(prop_idx) {
1353 case UNICODE_PROP_ASCII:
1354 if (cr_add_interval(cr, 0x00, 0x7f + 1))
1355 return -1;
1356 break;
1357 case UNICODE_PROP_Any:
1358 if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
1359 return -1;
1360 break;
1361 case UNICODE_PROP_Assigned:
1362 ret = unicode_prop_ops(cr,
1363 POP_GC, M(Cn),
1364 POP_INVERT,
1365 POP_END);
1366 break;
1367 case UNICODE_PROP_Math:
1368 ret = unicode_prop_ops(cr,
1369 POP_GC, M(Sm),
1370 POP_PROP, UNICODE_PROP_Other_Math,
1371 POP_UNION,
1372 POP_END);
1373 break;
1374 case UNICODE_PROP_Lowercase:
1375 ret = unicode_prop_ops(cr,
1376 POP_GC, M(Ll),
1377 POP_PROP, UNICODE_PROP_Other_Lowercase,
1378 POP_UNION,
1379 POP_END);
1380 break;
1381 case UNICODE_PROP_Uppercase:
1382 ret = unicode_prop_ops(cr,
1383 POP_GC, M(Lu),
1384 POP_PROP, UNICODE_PROP_Other_Uppercase,
1385 POP_UNION,
1386 POP_END);
1387 break;
1388 case UNICODE_PROP_Cased:
1389 ret = unicode_prop_ops(cr,
1390 POP_GC, M(Lu) | M(Ll) | M(Lt),
1391 POP_PROP, UNICODE_PROP_Other_Uppercase,
1392 POP_UNION,
1393 POP_PROP, UNICODE_PROP_Other_Lowercase,
1394 POP_UNION,
1395 POP_END);
1396 break;
1397 case UNICODE_PROP_Alphabetic:
1398 ret = unicode_prop_ops(cr,
1399 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1400 POP_PROP, UNICODE_PROP_Other_Uppercase,
1401 POP_UNION,
1402 POP_PROP, UNICODE_PROP_Other_Lowercase,
1403 POP_UNION,
1404 POP_PROP, UNICODE_PROP_Other_Alphabetic,
1405 POP_UNION,
1406 POP_END);
1407 break;
1408 case UNICODE_PROP_Grapheme_Base:
1409 ret = unicode_prop_ops(cr,
1410 POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
1411 POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1412 POP_UNION,
1413 POP_INVERT,
1414 POP_END);
1415 break;
1416 case UNICODE_PROP_Grapheme_Extend:
1417 ret = unicode_prop_ops(cr,
1418 POP_GC, M(Me) | M(Mn),
1419 POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1420 POP_UNION,
1421 POP_END);
1422 break;
1423 case UNICODE_PROP_XID_Start:
1424 ret = unicode_prop_ops(cr,
1425 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1426 POP_PROP, UNICODE_PROP_Other_ID_Start,
1427 POP_UNION,
1428 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1429 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1430 POP_UNION,
1431 POP_PROP, UNICODE_PROP_XID_Start1,
1432 POP_UNION,
1433 POP_INVERT,
1434 POP_INTER,
1435 POP_END);
1436 break;
1437 case UNICODE_PROP_XID_Continue:
1438 ret = unicode_prop_ops(cr,
1439 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1440 M(Mn) | M(Mc) | M(Nd) | M(Pc),
1441 POP_PROP, UNICODE_PROP_Other_ID_Start,
1442 POP_UNION,
1443 POP_PROP, UNICODE_PROP_Other_ID_Continue,
1444 POP_UNION,
1445 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1446 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1447 POP_UNION,
1448 POP_PROP, UNICODE_PROP_XID_Continue1,
1449 POP_UNION,
1450 POP_INVERT,
1451 POP_INTER,
1452 POP_END);
1453 break;
1454 case UNICODE_PROP_Changes_When_Uppercased:
1455 ret = unicode_case1(cr, CASE_U);
1456 break;
1457 case UNICODE_PROP_Changes_When_Lowercased:
1458 ret = unicode_case1(cr, CASE_L);
1459 break;
1460 case UNICODE_PROP_Changes_When_Casemapped:
1461 ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
1462 break;
1463 case UNICODE_PROP_Changes_When_Titlecased:
1464 ret = unicode_prop_ops(cr,
1465 POP_CASE, CASE_U,
1466 POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
1467 POP_XOR,
1468 POP_END);
1469 break;
1470 case UNICODE_PROP_Changes_When_Casefolded:
1471 ret = unicode_prop_ops(cr,
1472 POP_CASE, CASE_F,
1473 POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
1474 POP_XOR,
1475 POP_END);
1476 break;
1477 case UNICODE_PROP_Changes_When_NFKC_Casefolded:
1478 ret = unicode_prop_ops(cr,
1479 POP_CASE, CASE_F,
1480 POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
1481 POP_XOR,
1482 POP_END);
1483 break;
1484 #if 0
1485 case UNICODE_PROP_ID_Start:
1486 ret = unicode_prop_ops(cr,
1487 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1488 POP_PROP, UNICODE_PROP_Other_ID_Start,
1489 POP_UNION,
1490 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1491 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1492 POP_UNION,
1493 POP_INVERT,
1494 POP_INTER,
1495 POP_END);
1496 break;
1497 case UNICODE_PROP_ID_Continue:
1498 ret = unicode_prop_ops(cr,
1499 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1500 M(Mn) | M(Mc) | M(Nd) | M(Pc),
1501 POP_PROP, UNICODE_PROP_Other_ID_Start,
1502 POP_UNION,
1503 POP_PROP, UNICODE_PROP_Other_ID_Continue,
1504 POP_UNION,
1505 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1506 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1507 POP_UNION,
1508 POP_INVERT,
1509 POP_INTER,
1510 POP_END);
1511 break;
1512 case UNICODE_PROP_Case_Ignorable:
1513 ret = unicode_prop_ops(cr,
1514 POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
1515 POP_PROP, UNICODE_PROP_Case_Ignorable1,
1516 POP_XOR,
1517 POP_END);
1518 break;
1519 #else
1520 /* we use the existing tables */
1521 case UNICODE_PROP_ID_Continue:
1522 ret = unicode_prop_ops(cr,
1523 POP_PROP, UNICODE_PROP_ID_Start,
1524 POP_PROP, UNICODE_PROP_ID_Continue1,
1525 POP_XOR,
1526 POP_END);
1527 break;
1528 #endif
1529 default:
1530 if (prop_idx >= countof(unicode_prop_table))
1531 return -2;
1532 ret = unicode_prop1(cr, prop_idx);
1533 break;
1534 }
1535 return ret;
1536 }
1537
1538 #endif /* CONFIG_ALL_UNICODE */
1539