3232
3333from functools import partial
3434from textwrap import dedent
35- from typing import *
35+ from typing import Iterator , List , Tuple
3636
3737SCRIPT = sys .argv [0 ]
3838VERSION = "3.3"
@@ -904,6 +904,19 @@ def open_data(template, version):
904904 return open (local , 'rb' )
905905
906906
def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges

    char_range is either a single hexadecimal code point ("00AD") or
    two hexadecimal code points joined by ".." ("0600..0605").

    Yields each code point as an int, in ascending order; for a range,
    both end points are included.

    This is a generator, so malformed input (a bound that is not valid
    hexadecimal, or more than one "..") raises ValueError only once
    iteration starts, not when the function is called.
    '''
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    # The range in the data file is inclusive, hence the "+ 1".
    yield from range(first, last + 1)
918+
919+
907920class UcdFile :
908921 '''
909922 A file in the standard format of the UCD.
@@ -929,6 +942,12 @@ def records(self) -> Iterator[List[str]]:
929942 def __iter__ (self ) -> Iterator [List [str ]]:
930943 return self .records ()
931944
def expanded(self) -> Iterator[Tuple[int, List[str]]]:
    '''
    Iterates over the records of the file, one code point at a time.

    The first field of each record is passed to expand_range(); for
    every code point that produces, yields a pair of the code point
    and the record's remaining fields.  The same list of remaining
    fields is yielded for every code point that came from one record.
    '''
    for fields in self.records():
        remaining = fields[1:]
        for code_point in expand_range(fields[0]):
            yield code_point, remaining
950+
932951
933952# --------------------------------------------------------------------
934953# the following support code is taken from the unidb utilities
@@ -955,6 +974,9 @@ def __init__(self, version, cjk_check=True):
955974 # expand first-last ranges
956975 field = None
957976 for i in range (0 , 0x110000 ):
977+ # The file UnicodeData.txt has its own distinct way of
978+ # expressing ranges. See:
979+ # https://www.unicode.org/reports/tr44/#Code_Point_Ranges
958980 s = table [i ]
959981 if s :
960982 if s [1 ][- 6 :] == "First>" :
@@ -1019,14 +1041,8 @@ def __init__(self, version, cjk_check=True):
10191041 self .exclusions [char ] = 1
10201042
10211043 widths = [None ] * 0x110000
1022- for s in UcdFile (EASTASIAN_WIDTH , version ):
1023- if '..' in s [0 ]:
1024- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1025- chars = list (range (first , last + 1 ))
1026- else :
1027- chars = [int (s [0 ], 16 )]
1028- for char in chars :
1029- widths [char ] = s [1 ]
1044+ for char , (width ,) in UcdFile (EASTASIAN_WIDTH , version ).expanded ():
1045+ widths [char ] = width
10301046
10311047 for i in range (0 , 0x110000 ):
10321048 if table [i ] is not None :
@@ -1036,26 +1052,16 @@ def __init__(self, version, cjk_check=True):
10361052 if table [i ] is not None :
10371053 table [i ].append (set ())
10381054
1039- for r , p in UcdFile (DERIVED_CORE_PROPERTIES , version ):
1040- if ".." in r :
1041- first , last = [int (c , 16 ) for c in r .split ('..' )]
1042- chars = list (range (first , last + 1 ))
1043- else :
1044- chars = [int (r , 16 )]
1045- for char in chars :
1046- if table [char ]:
1047- # Some properties (e.g. Default_Ignorable_Code_Point)
1048- # apply to unassigned code points; ignore them
1049- table [char ][- 1 ].add (p )
1050-
1051- for s in UcdFile (LINE_BREAK , version ):
1052- if len (s ) < 2 or s [1 ] not in MANDATORY_LINE_BREAKS :
1055+ for char , (p ,) in UcdFile (DERIVED_CORE_PROPERTIES , version ).expanded ():
1056+ if table [char ]:
1057+ # Some properties (e.g. Default_Ignorable_Code_Point)
1058+ # apply to unassigned code points; ignore them
1059+ table [char ][- 1 ].add (p )
1060+
1061+ for char_range , value in UcdFile (LINE_BREAK , version ):
1062+ if value not in MANDATORY_LINE_BREAKS :
10531063 continue
1054- if '..' not in s [0 ]:
1055- first = last = int (s [0 ], 16 )
1056- else :
1057- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1058- for char in range (first , last + 1 ):
1064+ for char in expand_range (char_range ):
10591065 table [char ][- 1 ].add ('Line_Break' )
10601066
10611067 # We only want the quickcheck properties
@@ -1073,11 +1079,7 @@ def __init__(self, version, cjk_check=True):
10731079 quickcheck = 'MN' .index (s [2 ]) + 1 # Maybe or No
10741080 quickcheck_shift = qc_order .index (s [1 ])* 2
10751081 quickcheck <<= quickcheck_shift
1076- if '..' not in s [0 ]:
1077- first = last = int (s [0 ], 16 )
1078- else :
1079- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1080- for char in range (first , last + 1 ):
1082+ for char in expand_range (s [0 ]):
10811083 assert not (quickchecks [char ]>> quickcheck_shift )& 3
10821084 quickchecks [char ] |= quickcheck
10831085 for i in range (0 , 0x110000 ):
0 commit comments