33// found in the LICENSE file.
44
55import 'dart:typed_data' ;
6-
7- import '../canvaskit/canvaskit_api.dart' ;
86import '../text_fragmenter.dart' ;
97
/// Hardcoded Unicode character classification for individual code units.
///
/// Every predicate is a pure function of the integer it receives and relies
/// only on the constant tables declared in this class.
class HardcodedUnicodeProperties {
  /// Whether [utf16] is treated as a control character.
  ///
  /// Matches:
  ///  * the C0 controls, i.e. everything below U+0020 (space);
  ///  * DELETE and the C1 controls, U+007F..U+009F;
  ///  * U+200D..U+200F (zero width joiner and the LTR/RTL marks);
  ///  * U+202A..U+202E (bidirectional embedding and override controls).
  static bool isControl(int utf16) {
    // The C0 range ends right before the space character, which is 0x20
    // (decimal 32). Comparing against 0x32 would also classify space,
    // punctuation and the digits '0' and '1' as control characters.
    return utf16 < 0x20 ||
        (utf16 >= 0x7f && utf16 <= 0x9f) ||
        (utf16 >= 0x200D && utf16 <= 0x200F) ||
        (utf16 >= 0x202A && utf16 <= 0x202E);
  }

  // Code units reported by [isWhitespace].
  //
  // Compared to [_spaces], this set deliberately leaves out next line
  // (U+0085), no-break space (U+00A0), figure space (U+2007) and narrow
  // no-break space (U+202F).
  static const Set<int> _whitespaces = <int>{
    0x0009, // character tabulation
    0x000A, // line feed
    0x000B, // line tabulation
    0x000C, // form feed
    0x000D, // carriage return
    0x0020, // space
    0x1680, // ogham space mark
    0x2000, // en quad
    0x2001, // em quad
    0x2002, // en space
    0x2003, // em space
    0x2004, // three-per-em space
    0x2005, // four-per-em space
    0x2006, // six-per-em space
    0x2008, // punctuation space
    0x2009, // thin space
    0x200A, // hair space
    0x2028, // line separator
    0x2029, // paragraph separator
    0x205F, // medium mathematical space
    0x3000, // ideographic space
  };

  /// Whether [utf16] belongs to the whitespace set.
  ///
  /// Next line (U+0085) and the non-breaking spaces (U+00A0, U+2007, U+202F)
  /// are not part of this set; use [isSpace] to include them.
  static bool isWhitespace(int utf16) {
    return _whitespaces.contains(utf16);
  }

  // Code units reported by [isSpace]: everything in [_whitespaces] plus the
  // four code points that set excludes. Building it from [_whitespaces] keeps
  // the two tables from drifting apart.
  static const Set<int> _spaces = <int>{
    ..._whitespaces,
    0x0085, // next line
    0x00A0, // no-break space
    0x2007, // figure space
    0x202F, // narrow no-break space
  };

  /// Whether [utf16] is a space character.
  ///
  /// This is a superset of [isWhitespace] that additionally accepts next line
  /// (U+0085), no-break space (U+00A0), figure space (U+2007) and narrow
  /// no-break space (U+202F).
  static bool isSpace(int utf16) {
    return _spaces.contains(utf16);
  }

  /// Whether [utf16] is the character tabulation (U+0009).
  static bool isTabulation(int utf16) {
    return utf16 == 0x0009;
  }

  /// Whether [utf16] forces a line break: line feed (U+000A) or line
  /// separator (U+2028).
  static bool isHardBreak(int utf16) {
    return utf16 == 0x000A || utf16 == 0x2028;
  }

  // Inclusive (first, last) code point ranges considered ideographic. Each
  // upper bound is the last code point of the corresponding Unicode block.
  static const Set<(int first, int second)> _ranges = <(int first, int second)>{
    (4352, 4607), // Hangul Jamo (U+1100..U+11FF)
    (11904, 42191), // CJK_Radicals (U+2E80..U+A4CF)
    (43072, 43135), // Phags_Pa (U+A840..U+A87F)
    (44032, 55215), // Hangul_Syllables (U+AC00..U+D7AF)
    (63744, 64255), // CJK_Compatibility_Ideographs (U+F900..U+FAFF)
    (65072, 65103), // CJK_Compatibility_Forms (U+FE30..U+FE4F)
    (65381, 65500), // Katakana_Hangul_Halfwidth (U+FF65..U+FFDC)
    (131072, 196607), // Supplementary_Ideographic_Plane (U+20000..U+2FFFF)
  };

  /// Whether [utf16] falls inside one of the ideographic ranges.
  ///
  /// Both ends of every range are inclusive, so the last code point of each
  /// block (for example U+11FF) is reported as ideographic.
  ///
  /// A single UTF-16 code unit never exceeds 0xFFFF, so the Supplementary
  /// Ideographic Plane range can only match when the caller passes a full
  /// code point rather than an individual surrogate.
  static bool isIdeographic(int utf16) {
    return _ranges.any((range) => range.$1 <= utf16 && utf16 <= range.$2);
  }
}
101+
10102class AllCodeUnitFlags {
11- AllCodeUnitFlags (this ._text) : _allFlags = Uint8List (_text.length + 1 ) {
103+ AllCodeUnitFlags (this ._text) : _allFlags = Uint16List (_text.length + 1 ) {
12104 _extract ();
13105 }
14106
15107 final String _text;
16- final Uint8List _allFlags;
108+ final Uint16List _allFlags;
17109
18110 int get length => _allFlags.length;
19111
@@ -25,18 +117,17 @@ class AllCodeUnitFlags {
25117 }
26118
27119 void _extract () {
28- // TODO(jlavrova): 1. This call to CanvasKit is not going to work with Skwasm.
29- // 2. We are only using `whitespace` flags from CanvasKit. Can we hardcode them
30- // here to avoid calling CanvasKit?
31- // 3. Do we need other flags like `control` and `space`?
32- final List <CodeUnitInfo > ckFlags = canvasKit.CodeUnits .compute (_text);
33- assert (ckFlags.length == _allFlags.length);
34-
35- for (var i = 0 ; i < _allFlags.length; i++ ) {
36- _allFlags[i] = ckFlags[i].flags;
120+ // Add whitespaces
121+ _allFlags.fillRange (0 , _allFlags.length, 0 );
122+ for (var i = 0 ; i < _allFlags.length - 1 ; i++ ) {
123+ if (HardcodedUnicodeProperties .isWhitespace (_text.codeUnitAt (i))) {
124+ _allFlags[i] = CodeUnitFlag .whitespace._bitmask;
125+ }
126+ // We can add more flags here, e.g. control characters, ideographic characters, etc.
37127 }
38128
39- // TODO(mdebbar): OPTIMIZATION: can we make `segmentText` update `codeUnitFlags` in-place?
129+ // TODO(mdebbar): OPTIMIZATION:
130+ // Can we make `segmentText` update `codeUnitFlags` in-place?
40131 // Get text segmentation results using browser APIs.
41132 final SegmentationResult result = segmentText (_text);
42133
0 commit comments