1#!/usr/bin/python3
2# -*- coding: utf-8 -*-
3# Copyright (C) 2014-2021 Free Software Foundation, Inc.
4# This file is part of the GNU C Library.
5#
6# The GNU C Library is free software; you can redistribute it and/or
7# modify it under the terms of the GNU Lesser General Public
8# License as published by the Free Software Foundation; either
9# version 2.1 of the License, or (at your option) any later version.
10#
11# The GNU C Library is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14# Lesser General Public License for more details.
15#
16# You should have received a copy of the GNU Lesser General Public
17# License along with the GNU C Library; if not, see
18# <https://www.gnu.org/licenses/>.
19
20'''
This script is useful for checking the backward compatibility of a newly
generated UTF-8 file produced by the utf8_gen.py script.
23
24To see how this script is used, call it with the “-h” option:
25
26    $ ./utf8_compatibility.py -h
27    … prints usage message …
28'''
29
30import sys
31import re
32import argparse
33import unicode_utils
34
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    Lines between the line starting with “CHARMAP” and the line starting
    with “END CHARMAP” are parsed.  A line is either a single code point
    (“<U0041> /x41 …”) or an inclusive range (“<U3400>..<U4DBF> /xe3… …”).
    Comment lines starting with “%” and lines which do not match either
    form are skipped.

    file_name: path of the UTF-8 charmap file to read

    Returns a dictionary mapping each code point (int) to the “/x..”
    byte sequence string found on its line.  All code points of a range
    get the string of the range line.

    Writes a message to stderr and exits with status 1 if the file has
    no “CHARMAP” line or no “END CHARMAP” line.
    '''
    # Compiled once, outside the loop.  “(?:…)” is a non-capturing group
    # for the optional range end; the code point classes accept only
    # upper case hexadecimal digits.
    charmap_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')
    with open(file_name, mode='r', encoding='utf-8') as utf8_file:
        charmap_dictionary = {}
        # Skip everything up to and including the “CHARMAP” line.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        # Continue reading where the loop above stopped.
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
            match = charmap_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            # A single code point is treated as a range of length one.
            codepoint2 = match.group('codepoint2') or codepoint1
            hexutf8 = match.group('hexutf8')
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = hexutf8
    # Reaching this point means the file ended before “END CHARMAP”.
    sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                     %file_name)
    sys.exit(1)
66
def check_charmap(original_file_name, new_file_name):
    '''Print a report on how the CHARMAP section of the new file differs
    from the CHARMAP section of the old file

    Three totals are printed: code points removed from, changed in and
    added to CHARMAP.  The individual code points are listed only when
    the matching ARGS.show_missing_characters,
    ARGS.show_changed_characters or ARGS.show_added_characters option
    is set.
    '''
    def character_name(code_point):
        '''Name from UNICODE_ATTRIBUTES, or the string “None” when the
        code point has no entry there
        '''
        if code_point in unicode_utils.UNICODE_ATTRIBUTES:
            return unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        return 'None'

    rule = '------------------------------------------------------------'
    print('************************************************************')
    print('Report on CHARMAP:')
    old_map = create_charmap_dictionary(original_file_name)
    new_map = create_charmap_dictionary(new_file_name)
    removed = set(old_map) - set(new_map)
    added = set(new_map) - set(old_map)

    print(rule)
    print('Total removed characters in newly generated CHARMAP: %d'
          % len(removed))
    if ARGS.show_missing_characters:
        for code_point in sorted(removed):
            print('removed: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                old_map[code_point],
                character_name(code_point)))

    print(rule)
    # Code points present in both files whose byte sequence differs,
    # stored as (old, new) pairs.
    changed = {
        code_point: (old_map[code_point], new_map[code_point])
        for code_point in old_map.keys() & new_map.keys()
        if old_map[code_point] != new_map[code_point]}
    print('Total changed characters in newly generated CHARMAP: %d'
          % len(changed))
    if ARGS.show_changed_characters:
        for code_point in sorted(changed):
            old_bytes, new_bytes = changed[code_point]
            print('changed: {:s}     {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                old_bytes,
                new_bytes,
                character_name(code_point)))

    print(rule)
    print('Total added characters in newly generated CHARMAP: %d'
          % len(added))
    if ARGS.show_added_characters:
        for code_point in sorted(added):
            print('added: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                new_map[code_point],
                character_name(code_point)))
110
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    Lines between the line starting with “WIDTH” and the line starting
    with “END WIDTH” are parsed.  A line is either a single code point
    (“<U1100> 2”) or an inclusive range (“<U0300>...<U036F> 0”).  Only
    widths 0 and 2 are recognized; lines which do not match are skipped.

    file_name: path of the UTF-8 charmap file to read

    Returns a dictionary mapping each code point (int) to its width
    (int, 0 or 2).

    Writes a message to stderr and exits with status 1 if the file has
    no “WIDTH” line or no “END WIDTH” line, in the same way as
    create_charmap_dictionary() does for CHARMAP.
    '''
    # Compiled once, outside the loop.  “(?:…)” is a non-capturing group
    # for the optional range end; the code point classes accept only
    # upper case hexadecimal digits.
    width_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<width>[02])')
    with open(file_name, mode='r', encoding='utf-8') as utf8_file:
        width_dictionary = {}
        # Skip everything up to and including the “WIDTH” line.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        # Continue reading where the loop above stopped.
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            match = width_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            # A single code point is treated as a range of length one.
            codepoint2 = match.group('codepoint2') or codepoint1
            width = int(match.group('width'))
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = width
    # Reaching this point means the file ended before “END WIDTH”.
    sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                     %file_name)
    sys.exit(1)
138
def check_width(original_file_name, new_file_name):
    '''Print a report on how the WIDTH section of the new file differs
    from the WIDTH section of the old file

    Three totals are printed: code points removed from, changed in and
    added to WIDTH.  The individual code points are listed, together
    with their East Asian width, category, bidi class and name, only
    when the matching ARGS.show_missing_characters,
    ARGS.show_changed_characters or ARGS.show_added_characters option
    is set.
    '''
    def properties(code_point):
        '''Build the “eaw=… category=… bidi=… name=…” tail of a detail
        line; a property is shown as the string “None” when the code
        point has no entry in the corresponding unicode_utils table
        '''
        def attribute(field):
            if code_point in unicode_utils.UNICODE_ATTRIBUTES:
                return unicode_utils.UNICODE_ATTRIBUTES[code_point][field]
            return 'None'

        if code_point in unicode_utils.EAST_ASIAN_WIDTHS:
            east_asian_width = unicode_utils.EAST_ASIAN_WIDTHS[code_point]
        else:
            east_asian_width = 'None'
        return ''.join([
            'eaw={:s} '.format(east_asian_width),
            'category={:2s} '.format(attribute('category')),
            'bidi={:3s} '.format(attribute('bidi')),
            'name={:s}'.format(attribute('name')),
        ])

    rule = '------------------------------------------------------------'
    print('************************************************************')
    print('Report on WIDTH:')
    old_widths = create_width_dictionary(original_file_name)
    new_widths = create_width_dictionary(new_file_name)
    removed = set(old_widths) - set(new_widths)
    added = set(new_widths) - set(old_widths)

    print(rule)
    print('Total removed characters in newly generated WIDTH: %d'
          % len(removed))
    print('(Characters not in WIDTH get width 1 by default, '
          'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for code_point in sorted(removed):
            print('removed: {:s} '.format(
                      unicode_utils.ucs_symbol(code_point))
                  + '{:d} : '.format(old_widths[code_point])
                  + properties(code_point))

    print(rule)
    # Code points present in both files whose width differs, stored as
    # (old, new) pairs.
    changed = {
        code_point: (old_widths[code_point], new_widths[code_point])
        for code_point in old_widths.keys() & new_widths.keys()
        if old_widths[code_point] != new_widths[code_point]}
    print('Total changed characters in newly generated WIDTH: %d'
          % len(changed))
    if ARGS.show_changed_characters:
        for code_point in sorted(changed):
            old_width, new_width = changed[code_point]
            print('changed width: {:s} '.format(
                      unicode_utils.ucs_symbol(code_point))
                  + '{:d}->{:d} : '.format(old_width, new_width)
                  + properties(code_point))

    print(rule)
    print('Total added characters in newly generated WIDTH: %d'
          % len(added))
    print('(Characters not in WIDTH get width 1 by default, '
          'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for code_point in sorted(added):
            print('added: {:s} '.format(
                      unicode_utils.ucs_symbol(code_point))
                  + '{:d} : '.format(new_widths[code_point])
                  + properties(code_point))
213
if __name__ == "__main__":
    # Command line entry point: parse the options, optionally load the
    # Unicode data used to annotate the detail lines, then print the
    # CHARMAP report followed by the WIDTH report.
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the CHARMAP and WIDTH sections of two UTF-8 charmap files
        and report the characters which were removed, changed or added.
        ''')
    # The two required options take exactly one value.  With nargs='?'
    # a bare “-o” or “-n” was accepted and produced None, which only
    # failed later when the file was opened.
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    # nargs='?' is kept for the optional data files: a bare “-u” or “-e”
    # yields None and is treated below like an option which was not given.
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    # This option controls the detail output of both reports.
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help='Show characters whose encoding or width was changed in detail.')
    # ARGS is read as a module level name by check_charmap() and
    # check_width().
    ARGS = PARSER.parse_args()

    if ARGS.unicode_data_file:
        unicode_utils.fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
261