1#!/usr/bin/python3
2# -*- coding: utf-8 -*-
3# Copyright (C) 2014-2021 Free Software Foundation, Inc.
4# This file is part of the GNU C Library.
5#
6# The GNU C Library is free software; you can redistribute it and/or
7# modify it under the terms of the GNU Lesser General Public
8# License as published by the Free Software Foundation; either
9# version 2.1 of the License, or (at your option) any later version.
10#
11# The GNU C Library is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14# Lesser General Public License for more details.
15#
16# You should have received a copy of the GNU Lesser General Public
17# License along with the GNU C Library; if not, see
18# <https://www.gnu.org/licenses/>.
19
20'''
21This script is useful for checking the differences between
an old LC_CTYPE file /usr/share/i18n/locales/i18n and a
23new one generated by gen_unicode_ctype.py
24
25To see how it is used, call it with the “-h” option:
26
27    $ ./ctype_compatibility.py -h
28    … prints usage message …
29'''
30
31import sys
32import re
33import unicodedata
34import argparse
35
36from ctype_compatibility_test_cases import TEST_CASES
37
def get_lines_from_file(filename):
    '''Get all non-comment lines from a i18n file

    Everything from the first “%” on a line up to the end of that line
    is treated as a comment and removed.  Lines which are continued on
    the next line because they end in “/” are merged into a single
    line; a comment which itself ends in “/” keeps such a continuation
    going.

    The file is always decoded as UTF-8, so the result does not depend
    on the locale the script happens to run in.

    This is a generator yielding one string per logical line.  Blank
    lines and lines holding only a comment are yielded as empty
    strings.
    '''
    with open(filename, encoding='utf-8') as i18n_file:
        current_line = ''
        for line in i18n_file:
            line = line.strip('\n')
            comment_start = line.find('%')
            if comment_start >= 0:
                # Remember the continuation marker before cutting the
                # comment off, a “/” at the very end of the comment
                # still continues the logical line.
                continued = line.endswith('/')
                line = line[:comment_start]
                if continued:
                    line += '/'
            line = line.strip()
            if line.endswith('/'):
                current_line += line[:-1]
            else:
                yield current_line + line
                current_line = ''
    if current_line: # file ends with a continuation line
        yield current_line
61
# Names of the LC_CTYPE character classes and mappings which are
# extracted from an i18n file.
_CHARACTER_CLASSES = (
    'upper',
    'lower',
    'alpha',
    'digit',
    'outdigit',
    'space',
    'cntrl',
    'punct',
    'graph',
    'print',
    'xdigit',
    'blank',
    'combining',
    'combining_level3',
    'toupper',
    'tolower',
    'totitle',
)


def extract_character_classes(filename):
    '''Get all Unicode code points for each character class from a file

    Store these code points in a dictionary using the character classes
    as keys and the list of code points in this character class as values.

    In case of the character classes “toupper”, “tolower”, and “totitle”,
    these are actually pairs of code points.

    A line belongs to a character class when it starts either with the
    bare class name followed by white space or with
    class "<name>"; or map "<name>"; followed by white space.  The rest
    of such a line is handed to process_chars().

    Returns the dictionary; classes which do not occur in the file have
    no entry in it.
    '''
    # Build the patterns once instead of once per line.  Raw strings are
    # used for the regexp fragments because “\s” is not a valid escape
    # sequence in an ordinary string literal.
    class_patterns = [
        (char_class,
         re.compile(r'^(?:(?:class|map)\s+"'
                    + re.escape(char_class)
                    + r'";\s+|'
                    + re.escape(char_class)
                    + r'\s+)'))
        for char_class in _CHARACTER_CLASSES]
    ctype_dict = {}
    for line in get_lines_from_file(filename):
        for char_class, pattern in class_patterns:
            match = pattern.match(line)
            if match:
                process_chars(
                    ctype_dict.setdefault(char_class, []),
                    line[match.end():])
                # The patterns are anchored at the start of the line and
                # require a delimiter after the name, therefore at most
                # one class can match a given line.
                break
    return ctype_dict
105
# Regular expressions for the notations accepted by process_chars().
# They are compiled once at import time and built from raw strings so
# that the backslashes reach the regexp engine unchanged.
_CODE_POINT_PATTERN = r'<U([0-9A-F]{4,8})>'
_SINGLE_CODE_POINT_RE = re.compile(
    r'^' + _CODE_POINT_PATTERN + r'$')
_CODE_POINT_RANGE_RE = re.compile(
    r'^' + _CODE_POINT_PATTERN + r'\.\.' + _CODE_POINT_PATTERN + r'$')
_CODE_POINT_STEP2_RANGE_RE = re.compile(
    r'^' + _CODE_POINT_PATTERN + r'\.\.\(2\)\.\.' + _CODE_POINT_PATTERN + r'$')
_CODE_POINT_PAIR_RE = re.compile(
    r'^\(' + _CODE_POINT_PATTERN + r',' + _CODE_POINT_PATTERN + r'\)$')


def process_chars(char_class_list, code_point_line):
    '''
    Extract Unicode values from code_point_line
    and add to the list of code points in a character class

    code_point_line is split at “;”, each item must have one of these
    forms (hexadecimal digits in upper case, 4 to 8 of them):

        <Uxxxx>                 a single code point
        <Uxxxx>..<Uxxxx>        an inclusive range of code points
        <Uxxxx>..(2)..<Uxxxx>   an inclusive range, every second code point
        (<Uxxxx>,<Uxxxx>)       a pair of code points, appended as a tuple

    char_class_list is modified in place, nothing is returned.  If an
    item has none of these forms, a message is written to stderr and
    the program is terminated with exit status 1 (SystemExit).
    '''
    for code_points in code_point_line.split(';'):
        code_points = code_points.strip()
        match = _SINGLE_CODE_POINT_RE.match(code_points)
        if match: # <Uxxxx>
            char_class_list.append(int(match.group(1), 16))
            continue
        match = _CODE_POINT_RANGE_RE.match(code_points)
        if match: # <Uxxxx>..<Uxxxx>
            char_class_list.extend(range(
                int(match.group(1), 16),
                int(match.group(2), 16) + 1))
            continue
        match = _CODE_POINT_STEP2_RANGE_RE.match(code_points)
        if match: # <Uxxxx>..(2)..<Uxxxx>
            char_class_list.extend(range(
                int(match.group(1), 16),
                int(match.group(2), 16) + 1,
                2))
            continue
        match = _CODE_POINT_PAIR_RE.match(code_points)
        if match: # (<Uxxxx>,<Uxxxx>)
            char_class_list.append((
                int(match.group(1), 16),
                int(match.group(2), 16)))
            continue
        sys.stderr.write(
            ('None of the regexps matched '
             'code_points=%(cp)s in code_point_line=%(cpl)s\n') % {
                 'cp': code_points,
                 'cpl': code_point_line
             })
        # sys.exit() instead of the exit() helper, the latter is only
        # installed by the “site” module for interactive use.
        sys.exit(1)
160
def compare_lists(old_ctype_dict, new_ctype_dict):
    '''Compare character classes in the old and the new LC_CTYPE

    Both parameters are dictionaries which use character class names as
    keys and lists of code points (or of pairs of code points) as
    values.

    First the classes which exist in only one of the two dictionaries
    are listed.  Then, for each class of the old dictionary, the sizes
    of the class in both dictionaries are printed and report() is
    called to show the differences.

    Everything is printed to stdout, nothing is returned.
    '''
    print('****************************************************')
    print('Character classes which are only in the new '
          'or only in the old file:')
    for char_class in sorted(old_ctype_dict):
        if char_class not in new_ctype_dict:
            print('Character class %s is in old ctype but not in new ctype'
                  % char_class)
    for char_class in sorted(new_ctype_dict):
        if char_class not in old_ctype_dict:
            print('Character class %s is in new ctype but not in old ctype'
                  % char_class)
    for char_class in sorted(old_ctype_dict):
        old_list = old_ctype_dict[char_class]
        # A class which is missing from the new file has already been
        # reported above.  Compare against an empty list in that case,
        # so that all of its characters are reported as missing instead
        # of the comparison being aborted by a KeyError.
        new_list = new_ctype_dict.get(char_class, [])
        print("****************************************************")
        print("%s: %d chars in old ctype and %d chars in new ctype" % (
            char_class,
            len(old_list),
            len(new_list)))
        print("----------------------------------------------------")
        report(char_class, old_list, new_list)
184
def report_code_points(char_class, code_point_list, text=''):
    '''Report all code points which have been added to or removed from a
    character class.

    code_point_list contains either integers (code points) or pairs of
    integers (mappings from one code point to another one).  The
    entries are printed in sorted order, one line per entry, showing
    the character class, the given text, the character(s), the code
    point(s) in hexadecimal and the Unicode name(s).  “name unknown”
    is printed for code points which have no name in unicodedata.
    '''
    for code_point in sorted(code_point_list):
        if isinstance(code_point, int):
            print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
                  % {'text': text,
                     'char': chr(code_point),
                     'char_class': char_class,
                     'code_point': hex(code_point),
                     'name': unicodedata.name(
                         chr(code_point), 'name unknown')})
        else:
            # A pair: a mapping from code_point[0] to code_point[1]
            source = code_point[0]
            target = code_point[1]
            print(('%(char_class)s: %(text)s: '
                   '%(char0)s → %(char1)s '
                   '%(code_point0)s → %(code_point1)s '
                   '%(name0)s → %(name1)s') % {
                       'text': text,
                       'char_class': char_class,
                       'char0': chr(source),
                       'code_point0': hex(source),
                       'name0': unicodedata.name(
                           chr(source), 'name unknown'),
                       'char1': chr(target),
                       'code_point1': hex(target),
                       'name1': unicodedata.name(
                           chr(target), 'name unknown')
                   })
211
def report(char_class, old_list, new_list):
    '''Print what changed in one LC_CTYPE character class

    A summary line with the number of entries which are only in
    old_list (“Missing”) and one with the number of entries which are
    only in new_list (“Added”) is always printed.  The entries
    themselves are listed only when the global command line options
    ARGS.show_missing_characters or ARGS.show_added_characters,
    respectively, are set.
    '''
    old_entries = set(old_list)
    new_entries = set(new_list)

    missing_chars = list(old_entries - new_entries)
    print('%(char_class)s: Missing %(number)d characters '
          'of old ctype in new ctype '
          % {'number': len(missing_chars), 'char_class': char_class})
    if ARGS.show_missing_characters:
        report_code_points(char_class, missing_chars, 'Missing')

    added_chars = list(new_entries - old_entries)
    print('%(char_class)s: Added %(number)d characters '
          'in new ctype which were not in old ctype'
          % {'number': len(added_chars), 'char_class': char_class})
    if ARGS.show_added_characters:
        report_code_points(char_class, added_chars, 'Added')

228
229
def cperror(error_message, errorcounter=0):
    '''Print error_message and return errorcounter increased by one'''
    print(error_message)
    errorcounter += 1
    return errorcounter
234
def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
            errorcounter=0):
    '''Check the class membership of a list of code points

    The parameter “code_point_list_with_ranges” is a list of
    integers or pairs of integers, for example:

    [0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]

    where the pairs of integers stand for all the code points in the range
    of the two integers given, including the two integers of the pair.

    “ctype_dict” maps character class names to containers which are
    queried with the “in” operator.  “char_classes” is a sequence of
    (character class name, expected membership) items.  For each code
    point and each of these items, an error is reported with cperror()
    if the actual membership differs from the expected one; the message
    shows the code point, the character, the class, the actual
    membership and “reason”.

    Returns “errorcounter” increased by the number of errors found.
    '''
    for code_point_range in code_point_list_with_ranges:
        if isinstance(code_point_range, int):
            code_points = (code_point_range,)
        else:
            code_points = range(code_point_range[0],
                                code_point_range[1] + 1)
        for code_point in code_points:
            for char_class_tuple in char_classes:
                char_class = char_class_tuple[0]
                in_char_class = char_class_tuple[1]
                if (code_point in ctype_dict[char_class]) == in_char_class:
                    continue
                # The membership differs from the expected one, so the
                # actual membership is the negation of the expected one.
                errorcounter = cperror(
                    ('error: %(code_point)s %(char)s '
                     '%(char_class)s %(in)s: %(reason)s') % {
                         'code_point': hex(code_point),
                         'char': chr(code_point),
                         'char_class': char_class,
                         'in': not in_char_class,
                         'reason': reason},
                    errorcounter)
    return errorcounter
265
# Pairs of character classes which must not have any code point in
# common, in the order in which violations are reported for each code
# point.
_DISJOINT_CLASS_PAIRS = (
    # alpha restriction: "No character specified for the keywords cntrl,
    # digit, punct or space shall be specified."
    ('alpha', 'cntrl'),
    ('alpha', 'digit'),
    ('alpha', 'punct'),
    ('alpha', 'space'),
    # space restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, graph or xdigit shall be specified."
    # upper, lower, alpha are covered by the alpha checks.
    ('space', 'digit'),
    ('space', 'graph'),
    ('space', 'xdigit'),
    # cntrl restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, punct, graph, print or xdigit shall be
    # specified."  upper, lower, alpha are covered by the alpha checks.
    # This also covers the graph and the print restriction: "No
    # character specified for the keyword cntrl shall be specified."
    ('cntrl', 'digit'),
    ('cntrl', 'punct'),
    ('cntrl', 'graph'),
    ('cntrl', 'print'),
    ('cntrl', 'xdigit'),
    # punct restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
    # be specified."  upper, lower, alpha, cntrl are covered above, the
    # <space> character is checked separately in tests().
    ('punct', 'digit'),
    ('punct', 'xdigit'),
)


def _lookup_tables(ctype_dict):
    '''Convert the lists of a character class dictionary to dictionaries

    A list of code points becomes a dictionary mapping each code point
    to 1, a list of pairs of code points becomes a dictionary mapping
    the first code point of each pair to the second one.  Which of the
    two it is, is decided by looking at the first entry of the list.
    '''
    tables = {}
    for key, values in ctype_dict.items():
        table = tables[key] = {}
        if not values:
            continue
        if isinstance(values[0], int):
            for value in values:
                table[value] = 1
        else: # key is 'toupper', 'tolower', or 'totitle'
            for value in values:
                table[value[0]] = value[1]
    return tables


def _class_error(code_point, problem, errorcounter):
    '''Report that “problem” applies to code_point, return the new count'''
    return cperror(
        'error: %(char)s %(cp)s %(problem)s' % {
            'char': chr(code_point),
            'cp': hex(code_point),
            'problem': problem
        },
        errorcounter)


def tests(ctype_dict, errorcounter=0):
    '''Test a LC_CTYPE character class dictionary for known errors

    First all test cases in TEST_CASES are checked with cpcheck(), then
    each code point from 0 to 0x10FFFF is checked against restrictions
    on the character classes (the quotes in the comments are the
    wording of these restrictions).  Each violation is reported with
    cperror().

    Returns “errorcounter” increased by the number of errors found.
    '''
    # The checks below need many membership tests, these are much
    # cheaper with dictionaries than with the lists in ctype_dict.
    ctype_dict2 = _lookup_tables(ctype_dict)

    for test_case in TEST_CASES:
        errorcounter = cpcheck(ctype_dict2,
                               test_case[0],
                               test_case[1],
                               test_case[2],
                               errorcounter=errorcounter)

    for code_point in range(0, 0x110000):
        # toupper and tolower restriction: "Only characters specified
        # for the keywords lower and upper shall be specified."
        # A code point which is mapped to itself is tolerated.
        for mapping in ('toupper', 'tolower'):
            if (code_point in ctype_dict2[mapping]
                    and code_point != ctype_dict2[mapping][code_point]
                    and not (code_point in ctype_dict2['lower']
                             or code_point in ctype_dict2['upper'])):
                mapped = ctype_dict2[mapping][code_point]
                errorcounter = cperror(
                    ('error: %(char1)s is not upper|lower '
                     'but %(mapping)s(%(cp1)s)=%(cp2)s (%(char2)s)') % {
                         'mapping': mapping,
                         'char1': chr(code_point),
                         'cp1': hex(code_point),
                         'cp2': hex(mapped),
                         'char2': chr(mapped)
                     },
                    errorcounter)
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class."
        if ((code_point in ctype_dict2['lower']
             or code_point in ctype_dict2['upper'])
                and code_point not in ctype_dict2['alpha']):
            errorcounter = _class_error(
                code_point, 'is upper|lower but not alpha', errorcounter)
        # Classes which exclude each other, see _DISJOINT_CLASS_PAIRS.
        for first_class, second_class in _DISJOINT_CLASS_PAIRS:
            if (code_point in ctype_dict2[first_class]
                    and code_point in ctype_dict2[second_class]):
                errorcounter = _class_error(
                    code_point,
                    'is %s and %s' % (first_class, second_class),
                    errorcounter)
        # Remaining part of the punct restriction: the <space> character
        # shall not be specified.
        if (code_point in ctype_dict2['punct']
                and code_point == 0x0020):
            errorcounter = _class_error(code_point, 'is punct.', errorcounter)
        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of "space characters",
        # not "space character".
        if (code_point in ctype_dict2['print']
                and not (code_point in ctype_dict2['graph']
                         or code_point in ctype_dict2['space'])):
            errorcounter = _class_error(
                code_point, 'is print but not graph|space', errorcounter)
        if (code_point not in ctype_dict2['print']
                and (code_point in ctype_dict2['graph']
                     or code_point == 0x0020)):
            errorcounter = _class_error(
                code_point, 'graph|space but not print', errorcounter)
    return errorcounter
492
if __name__ == "__main__":
    # Command line interface: compare the character classes of an old
    # and a new ctype file, then check both files for known errors.
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the contents of LC_CTYPE in two files and check for errors.
        ''')
    PARSER.add_argument(
        '-o', '--old_ctype_file',
        nargs='?',
        type=str,
        default='i18n',
        help='The old ctype file, default: %(default)s')
    PARSER.add_argument(
        '-n', '--new_ctype_file',
        nargs='?',
        type=str,
        default='unicode-ctype',
        help='The new ctype file, default: %(default)s')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help=('Show characters which were added to each '
              + 'character class in detail.'))
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help=('Show characters which were removed from each '
              + 'character class in detail.'))
    # ARGS has to stay a module level name, report() reads it.
    ARGS = PARSER.parse_args()

    OLD_CTYPE_DICT = extract_character_classes(
        ARGS.old_ctype_file)
    NEW_CTYPE_DICT = extract_character_classes(
        ARGS.new_ctype_file)
    compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
    print('============================================================')
    print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter=0)
    print('------------------------------------------------------------')
    print('Old file = %s' %ARGS.old_ctype_file)
    print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
    print('------------------------------------------------------------')
    print('============================================================')
    print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter=0)
    print('------------------------------------------------------------')
    print('New file = %s' %ARGS.new_ctype_file)
    print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
    print('------------------------------------------------------------')
    # Only errors in the new file decide about the exit status.
    # sys.exit() is used instead of the exit() helper, which is only
    # installed by the “site” module for interactive use.
    sys.exit(1 if NUMBER_OF_ERRORS_IN_NEW_FILE > 0 else 0)
547