Skip to content

chinese encoding to be utf8 #293

@wanghaisheng

Description

@wanghaisheng

i got a url part like this "%BD%C0%BD%D1", # GBK encoding
try the following ,it always give me utf8 in windows python 3.11.9



import urllib.parse
import chardet

def detect_and_decode(encoded_string):
    # First, URL-decode the string
    url_decoded = urllib.parse.unquote(encoded_string)
    
    # Convert to bytes using UTF-8
    byte_string = url_decoded.encode('utf-8')
    
    # Detect encoding
    result = chardet.detect(byte_string)
    detected_encoding = result['encoding']
    
    # If detected encoding is UTF-8 with low confidence, try GBK
    if detected_encoding == 'utf-8' and result['confidence'] < 0.8:
        try:
            decoded_string = byte_string.decode('gbk')
            return decoded_string, 'gbk'
        except UnicodeDecodeError:
            pass
    
    # Decode using detected encoding
    try:
        decoded_string = byte_string.decode(detected_encoding)
        return decoded_string, detected_encoding
    except UnicodeDecodeError:
        # If decoding fails, return the URL-decoded string
        return url_decoded, 'unknown'

# Test the function
test_strings = [
    "%BD%C0%BD%D1",  # GBK encoding
    "%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF",  # UTF-8 encoding
    "%41%42%43",  # Simple ASCII
    "%E6%B3%B0%E5%9D%A6",  # UTF-8 encoded "泰坦"
]

for s in test_strings:
    decoded, encoding = detect_and_decode(s)
    print(f"Original: {s}")
    print(f"Decoded: {decoded}")
    print(f"Detected encoding: {encoding}")
    print()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions