Skip to content

Commit 57abbca

Browse files
dan-blanchard and jpz authored
Rebased and cleaned up version of UTF-16/32 BE/LE PR (#206)
* support for UTF-16 and UTF-32 detection missing BOMs
* Changes per PR comments
  - Restored file suffix filter in test.py
  - Added functionality to identify valid unicode, to enhance detection
  - Generated some non-trivial unicode examples using supplementary plane 1
* clean up poorly written comments
* Run black on PR
* Fix some minor linting issues

Co-authored-by: Jason Zavaglia <jason.zavaglia@gmail.com>
1 parent eca9558 commit 57abbca

11 files changed

Lines changed: 246 additions & 3 deletions

chardet/charsetprober.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ def reset(self):
5252
def charset_name(self):
5353
return None
5454

55-
def feed(self, buf):
56-
pass
55+
def feed(self, byte_str):
56+
raise NotImplementedError
5757

5858
@property
5959
def state(self):

chardet/universaldetector.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class a user of ``chardet`` should use.
4646
from .latin1prober import Latin1Prober
4747
from .mbcsgroupprober import MBCSGroupProber
4848
from .sbcsgroupprober import SBCSGroupProber
49+
from .utf1632prober import UTF1632Prober
4950

5051

5152
class UniversalDetector:
@@ -82,6 +83,7 @@ class UniversalDetector:
8283

8384
def __init__(self, lang_filter=LanguageFilter.ALL):
8485
self._esc_charset_prober = None
86+
self._utf1632_prober = None
8587
self._charset_probers = []
8688
self.result = None
8789
self.done = None
@@ -107,6 +109,8 @@ def reset(self):
107109
self._last_char = b""
108110
if self._esc_charset_prober:
109111
self._esc_charset_prober.reset()
112+
if self._utf1632_prober:
113+
self._utf1632_prober.reset()
110114
for prober in self._charset_probers:
111115
prober.reset()
112116

@@ -127,7 +131,7 @@ def feed(self, byte_str):
127131
if self.done:
128132
return
129133

130-
if not len(byte_str):
134+
if not byte_str:
131135
return
132136

133137
if not isinstance(byte_str, bytearray):
@@ -184,6 +188,21 @@ def feed(self, byte_str):
184188

185189
self._last_char = byte_str[-1:]
186190

191+
# next we will look to see if it appears to be either a UTF-16 or
192+
# UTF-32 encoding
193+
if not self._utf1632_prober:
194+
self._utf1632_prober = UTF1632Prober()
195+
196+
if self._utf1632_prober.state == ProbingState.DETECTING:
197+
if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
198+
self.result = {
199+
"encoding": self._utf1632_prober.charset_name,
200+
"confidence": self._utf1632_prober.get_confidence(),
201+
"language": "",
202+
}
203+
self.done = True
204+
return
205+
187206
# If we've seen escape sequences, use the EscCharSetProber, which
188207
# uses a simple state machine to check for known escape sequences in
189208
# HZ and ISO-2022 encodings, since those are the only encodings that

chardet/utf1632prober.py

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
######################## BEGIN LICENSE BLOCK ########################
2+
#
3+
# Contributor(s):
4+
# Jason Zavaglia
5+
#
6+
# This library is free software; you can redistribute it and/or
7+
# modify it under the terms of the GNU Lesser General Public
8+
# License as published by the Free Software Foundation; either
9+
# version 2.1 of the License, or (at your option) any later version.
10+
#
11+
# This library is distributed in the hope that it will be useful,
12+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14+
# Lesser General Public License for more details.
15+
#
16+
# You should have received a copy of the GNU Lesser General Public
17+
# License along with this library; if not, write to the Free Software
18+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
19+
# 02110-1301 USA
20+
######################### END LICENSE BLOCK #########################
21+
from .charsetprober import CharSetProber
22+
from .enums import ProbingState
23+
24+
25+
class UTF1632Prober(CharSetProber):
    """
    Prober for UTF-16 / UTF-32 text that carries no byte-order mark.

    Each byte fed in is counted as zero or non-zero under its offset
    modulo 4.  A stream shaped like (00 00 00 xx)+ is reported as
    UTF-32BE and one shaped like (00 xx)+ as UTF-16BE; the mirrored
    layouts give the little-endian variants.  Completed 4-byte groups are
    also range-checked, and an encoding whose check has failed is never
    reported.
    """

    # Number of logical characters that must be seen before any guess is made.
    MIN_CHARS_FOR_DETECTION = 20
    # Per-character share of zero (or non-zero) bytes that every tested
    # modulo-4 position has to exceed.
    EXPECTED_RATIO = 0.94

    def __init__(self):
        super().__init__()
        # Declared here so every attribute exists from construction on;
        # reset() below assigns the same starting values once more.
        self._state = ProbingState.DETECTING
        self.position = 0
        self.quad = [0, 0, 0, 0]
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self.reset()

    def reset(self):
        """Discard all statistics and go back to the DETECTING state."""
        super().reset()
        self._state = ProbingState.DETECTING
        self.position = 0
        self.quad = [0, 0, 0, 0]
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False

    @property
    def charset_name(self):
        """
        Name of the first matching encoding, tested in the order
        UTF-32BE, UTF-32LE, UTF-16BE, UTF-16LE; "utf-16" if none match.
        """
        candidates = (
            (self.is_likely_utf32be, "utf-32be"),
            (self.is_likely_utf32le, "utf-32le"),
            (self.is_likely_utf16be, "utf-16be"),
            (self.is_likely_utf16le, "utf-16le"),
        )
        for looks_like, name in candidates:
            if looks_like():
                return name
        # Nothing matched: fall back to a generic but valid codec name.
        return "utf-16"

    @property
    def language(self):
        """Always the empty string; this prober reports no language."""
        return ""

    def approx_32bit_chars(self):
        """Bytes seen so far divided by 4, but never less than 1.0."""
        return max(1.0, self.position / 4.0)

    def approx_16bit_chars(self):
        """Bytes seen so far divided by 2, but never less than 1.0."""
        return max(1.0, self.position / 2.0)

    def _all_above_ratio(self, counts, approx_chars):
        """Return True if each count per character exceeds EXPECTED_RATIO."""
        return all(count / approx_chars > self.EXPECTED_RATIO for count in counts)

    def is_likely_utf32be(self):
        """True for a valid-so-far stream shaped like (00 00 00 xx)+."""
        approx_chars = self.approx_32bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False
        zeros, nonzeros = self.zeros_at_mod, self.nonzeros_at_mod
        expected = (zeros[0], zeros[1], zeros[2], nonzeros[3])
        return not self.invalid_utf32be and self._all_above_ratio(
            expected, approx_chars
        )

    def is_likely_utf32le(self):
        """True for a valid-so-far stream shaped like (xx 00 00 00)+."""
        approx_chars = self.approx_32bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False
        zeros, nonzeros = self.zeros_at_mod, self.nonzeros_at_mod
        expected = (nonzeros[0], zeros[1], zeros[2], zeros[3])
        return not self.invalid_utf32le and self._all_above_ratio(
            expected, approx_chars
        )

    def is_likely_utf16be(self):
        """True for a valid-so-far stream shaped like (00 xx)+."""
        approx_chars = self.approx_16bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False
        zeros, nonzeros = self.zeros_at_mod, self.nonzeros_at_mod
        # Offsets 0 and 2 are the high bytes, offsets 1 and 3 the low bytes.
        expected = (nonzeros[1] + nonzeros[3], zeros[0] + zeros[2])
        return not self.invalid_utf16be and self._all_above_ratio(
            expected, approx_chars
        )

    def is_likely_utf16le(self):
        """True for a valid-so-far stream shaped like (xx 00)+."""
        approx_chars = self.approx_16bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False
        zeros, nonzeros = self.zeros_at_mod, self.nonzeros_at_mod
        # Offsets 0 and 2 are the low bytes, offsets 1 and 3 the high bytes.
        expected = (nonzeros[0] + nonzeros[2], zeros[1] + zeros[3])
        return not self.invalid_utf16le and self._all_above_ratio(
            expected, approx_chars
        )

    def validate_utf32_characters(self, quad):
        """
        Flag UTF-32BE and/or UTF-32LE as invalid for this group of 4 bytes.

        UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
        excluding 0x0000D800 - 0x0000DFFF

        https://en.wikipedia.org/wiki/UTF-32
        """
        # Big-endian reading: quad[0] is the most significant byte.
        be_too_large = quad[0] != 0 or quad[1] > 0x10
        be_surrogate = quad[0] == 0 and quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF
        if be_too_large or be_surrogate:
            self.invalid_utf32be = True

        # Little-endian reading: quad[3] is the most significant byte.
        le_too_large = quad[3] != 0 or quad[2] > 0x10
        le_surrogate = quad[3] == 0 and quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF
        if le_too_large or le_surrogate:
            self.invalid_utf32le = True

    def validate_utf16_characters(self, pair):
        """
        Flag UTF-16BE and/or UTF-16LE as invalid for this pair of bytes.

        UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xDFFF
        with an exception for surrogate pairs, which must be in the range
        0xD800-0xDBFF followed by 0xDC00-0xDFFF

        https://en.wikipedia.org/wiki/UTF-16
        """
        # Big-endian reading: pair[0] is the high byte of the code unit.
        be_high = pair[0]
        be_is_lead = 0xD8 <= be_high <= 0xDB
        be_is_trail = 0xDC <= be_high <= 0xDF
        if self.first_half_surrogate_pair_detected_16be:
            # A lead surrogate is pending, so only a trail surrogate is legal.
            if be_is_trail:
                self.first_half_surrogate_pair_detected_16be = False
            else:
                self.invalid_utf16be = True
        elif be_is_lead:
            self.first_half_surrogate_pair_detected_16be = True
        elif be_is_trail:
            # Trail surrogate with no lead surrogate before it.
            self.invalid_utf16be = True

        # Little-endian reading: pair[1] is the high byte of the code unit.
        le_high = pair[1]
        le_is_lead = 0xD8 <= le_high <= 0xDB
        le_is_trail = 0xDC <= le_high <= 0xDF
        if self.first_half_surrogate_pair_detected_16le:
            if le_is_trail:
                self.first_half_surrogate_pair_detected_16le = False
            else:
                self.invalid_utf16le = True
        elif le_is_lead:
            self.first_half_surrogate_pair_detected_16le = True
        elif le_is_trail:
            self.invalid_utf16le = True

    def feed(self, byte_str):
        """
        Fold every byte of byte_str into the statistics and return the
        resulting probing state.
        """
        for byte in byte_str:
            slot = self.position % 4
            self.quad[slot] = byte
            if slot == 3:
                # A 4-byte group is complete: check it as one UTF-32 unit
                # and as two consecutive UTF-16 units.
                self.validate_utf32_characters(self.quad)
                self.validate_utf16_characters(self.quad[:2])
                self.validate_utf16_characters(self.quad[2:4])
            tally = self.zeros_at_mod if byte == 0 else self.nonzeros_at_mod
            tally[slot] += 1
            self.position += 1
        return self.state

    @property
    def state(self):
        """
        Current probing state.

        Reading this property may itself move the prober from DETECTING to
        FOUND_IT or NOT_ME; once either is reached it is returned unchanged.
        """
        decided = (ProbingState.NOT_ME, ProbingState.FOUND_IT)
        if self._state not in decided:
            if self.get_confidence() > 0.80:
                self._state = ProbingState.FOUND_IT
            elif self.position > 4 * 1024:
                # More than 4 KiB consumed without a conclusion: give up.
                self._state = ProbingState.NOT_ME
        return self._state

    def get_confidence(self):
        """Return 0.85 if any of the four encodings looks likely, else 0.00."""
        any_match = (
            self.is_likely_utf16le()
            or self.is_likely_utf16be()
            or self.is_likely_utf32le()
            or self.is_likely_utf32be()
        )
        return 0.85 if any_match else 0.00

tests/UTF-16BE/nobom-utf16be.txt

1.55 KB
Binary file not shown.
12.2 KB
Binary file not shown.

tests/UTF-16LE/nobom-utf16le.txt

1.55 KB
Binary file not shown.
12.2 KB
Binary file not shown.

tests/UTF-32BE/nobom-utf32be.txt

3.1 KB
Binary file not shown.
23.9 KB
Binary file not shown.

tests/UTF-32LE/nobom-utf32le.txt

3.1 KB
Binary file not shown.

0 commit comments

Comments
 (0)