#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2021 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
This script is useful for checking backward compatibility of a newly
generated UTF-8 file from the utf8_gen.py script

To see how this script is used, call it with the “-h” option:

    $ ./utf8_compatibility.py -h
    … prints usage message …
'''

import sys
import re
import argparse
import unicode_utils

# Separator lines used in the printed reports.
MAJOR_SEPARATOR = '************************************************************'
MINOR_SEPARATOR = '------------------------------------------------------------'

# A CHARMAP entry is “<Uxxxx>” or a range “<Uxxxx>..<Uyyyy>” (two dots)
# followed by the UTF-8 byte sequence written as “/xhh” escapes.
CHARMAP_LINE = re.compile(
    r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
    r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
    r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')

# A WIDTH entry is “<Uxxxx>” or a range “<Uxxxx>...<Uyyyy>” (three dots)
# followed by the width; only the widths 0 and 2 are recognized.
WIDTH_LINE = re.compile(
    r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
    r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
    r'\s+(?P<width>[02])')


def _read_section(file_name, section, line_pattern, value_group, convert):
    '''Parse the section “section” … “END section” of a file

    Returns a dictionary mapping every code point (as an integer)
    listed in the section to convert(<text of the group value_group>).
    Lines which do not match line_pattern (comments, blank lines, …)
    are ignored.

    If the start or the end marker of the section is missing, an error
    message is written to stderr and the script exits with status 1.
    '''
    end_marker = 'END ' + section
    with open(file_name, mode='r', encoding='utf-8') as utf8_file:
        # Skip everything up to and including the line opening the section.
        for line in utf8_file:
            if line.startswith(section):
                break
        dictionary = {}
        # Continue on the same iterator, i.e. right after the opening line.
        for line in utf8_file:
            if line.startswith(end_marker):
                return dictionary
            match = line_pattern.match(line)
            if not match:
                continue
            first = match.group('codepoint1')
            # A single code point is treated as a range of length one.
            last = match.group('codepoint2') or first
            value = convert(match.group(value_group))
            for code_point in range(int(first, 16), int(last, 16) + 1):
                dictionary[code_point] = value
    sys.stderr.write('No “%s” or no “END %s” found in %s\n'
                     % (section, section, file_name))
    sys.exit(1)


def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    The keys are the code points as integers, the values are the UTF-8
    byte sequences exactly as written in the file (“/xhh/xhh…”).
    Exits with status 1 if the file has no complete CHARMAP section.
    '''
    return _read_section(file_name, 'CHARMAP', CHARMAP_LINE, 'hexutf8', str)


def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    The keys are the code points as integers, the values are the
    widths as integers.  Exits with status 1 if the file has no
    complete WIDTH section.
    '''
    return _read_section(file_name, 'WIDTH', WIDTH_LINE, 'width', int)


def _unicode_attribute(key, attribute):
    '''Return an attribute of a code point from the data loaded into
    unicode_utils.UNICODE_ATTRIBUTES, or the string 'None' if nothing
    is known about the code point
    '''
    if key in unicode_utils.UNICODE_ATTRIBUTES:
        return unicode_utils.UNICODE_ATTRIBUTES[key][attribute]
    return 'None'


def _width_details(key):
    '''Return the “eaw=… category=… bidi=… name=…” part of a line in
    the WIDTH report for a code point
    '''
    if key in unicode_utils.EAST_ASIAN_WIDTHS:
        east_asian_width = unicode_utils.EAST_ASIAN_WIDTHS[key]
    else:
        east_asian_width = 'None'
    return ('eaw={:s} '.format(east_asian_width)
            + 'category={:2s} '.format(_unicode_attribute(key, 'category'))
            + 'bidi={:3s} '.format(_unicode_attribute(key, 'bidi'))
            + 'name={:s}'.format(_unicode_attribute(key, 'name')))


def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file

    Prints the number of removed, changed and added code points.  The
    individual code points are listed as well when the corresponding
    option is set in the global ARGS, which is created when this file
    is run as a script.
    '''
    print(MAJOR_SEPARATOR)
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    old_keys = set(ocharmap)
    new_keys = set(ncharmap)
    removed = sorted(old_keys - new_keys)
    added = sorted(new_keys - old_keys)
    changed_charmap = {
        key: (ocharmap[key], ncharmap[key])
        for key in old_keys & new_keys
        if ocharmap[key] != ncharmap[key]}
    print(MINOR_SEPARATOR)
    print('Total removed characters in newly generated CHARMAP: %d'
          % len(removed))
    if ARGS.show_missing_characters:
        for key in removed:
            print('removed: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ocharmap[key],
                _unicode_attribute(key, 'name')))
    print(MINOR_SEPARATOR)
    print('Total changed characters in newly generated CHARMAP: %d'
          % len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                _unicode_attribute(key, 'name')))
    print(MINOR_SEPARATOR)
    print('Total added characters in newly generated CHARMAP: %d'
          % len(added))
    if ARGS.show_added_characters:
        for key in added:
            print('added: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ncharmap[key],
                _unicode_attribute(key, 'name')))


def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the new
    file

    Prints the number of removed, changed and added code points.  The
    individual code points are listed as well when the corresponding
    option is set in the global ARGS, which is created when this file
    is run as a script.
    '''
    print(MAJOR_SEPARATOR)
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    old_keys = set(owidth)
    new_keys = set(nwidth)
    removed = sorted(old_keys - new_keys)
    added = sorted(new_keys - old_keys)
    changed_width = {
        key: (owidth[key], nwidth[key])
        for key in old_keys & new_keys
        if owidth[key] != nwidth[key]}
    print(MINOR_SEPARATOR)
    print('Total removed characters in newly generated WIDTH: %d'
          % len(removed))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in removed:
            print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + _width_details(key))
    print(MINOR_SEPARATOR)
    print('Total changed characters in newly generated WIDTH: %d'
          % len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                  + _width_details(key))
    print(MINOR_SEPARATOR)
    print('Total added characters in newly generated WIDTH: %d'
          % len(added))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in added:
            print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + _width_details(key))


if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the CHARMAP and WIDTH sections of two UTF-8 charmap
        files and report the differences.
        ''')
    # The two charmap files are mandatory and need a value; without
    # “nargs='?'” argparse itself rejects a bare “-o” or “-n” instead
    # of passing None on to open().
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help='Show characters whose width was changed in detail.')
    ARGS = PARSER.parse_args()

    # The Unicode data files are optional; without them the reports
    # show 'None' for names, categories, etc.
    if ARGS.unicode_data_file:
        unicode_utils.fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)