-
Notifications
You must be signed in to change notification settings - Fork 291
chinese encoding to be utf8 #293
Copy link
Copy link
Closed
Description
i got a url part like this "%BD%C0%BD%D1", # GBK encoding
try the following ,it always give me utf8 in windows python 3.11.9
import urllib.parse
import chardet
def detect_and_decode(encoded_string):
# First, URL-decode the string
url_decoded = urllib.parse.unquote(encoded_string)
# Convert to bytes using UTF-8
byte_string = url_decoded.encode('utf-8')
# Detect encoding
result = chardet.detect(byte_string)
detected_encoding = result['encoding']
# If detected encoding is UTF-8 with low confidence, try GBK
if detected_encoding == 'utf-8' and result['confidence'] < 0.8:
try:
decoded_string = byte_string.decode('gbk')
return decoded_string, 'gbk'
except UnicodeDecodeError:
pass
# Decode using detected encoding
try:
decoded_string = byte_string.decode(detected_encoding)
return decoded_string, detected_encoding
except UnicodeDecodeError:
# If decoding fails, return the URL-decoded string
return url_decoded, 'unknown'
# Test the function
test_strings = [
"%BD%C0%BD%D1", # GBK encoding
"%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF", # UTF-8 encoding
"%41%42%43", # Simple ASCII
"%E6%B3%B0%E5%9D%A6", # UTF-8 encoded "泰坦"
]
for s in test_strings:
decoded, encoding = detect_and_decode(s)
print(f"Original: {s}")
print(f"Decoded: {decoded}")
print(f"Detected encoding: {encoding}")
print()
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels