chinese encoding to be utf8

i got a url part like this     "%BD%C0%BD%D1",  # GBK encoding
try the following ,it always give me utf8 in windows python 3.11.9

```


import urllib.parse
import chardet

def detect_and_decode(encoded_string):
    # First, URL-decode the string
    url_decoded = urllib.parse.unquote(encoded_string)
    
    # Convert to bytes using UTF-8
    byte_string = url_decoded.encode('utf-8')
    
    # Detect encoding
    result = chardet.detect(byte_string)
    detected_encoding = result['encoding']
    
    # If detected encoding is UTF-8 with low confidence, try GBK
    if detected_encoding == 'utf-8' and result['confidence'] < 0.8:
        try:
            decoded_string = byte_string.decode('gbk')
            return decoded_string, 'gbk'
        except UnicodeDecodeError:
            pass
    
    # Decode using detected encoding
    try:
        decoded_string = byte_string.decode(detected_encoding)
        return decoded_string, detected_encoding
    except UnicodeDecodeError:
        # If decoding fails, return the URL-decoded string
        return url_decoded, 'unknown'

# Test the function
test_strings = [
    "%BD%C0%BD%D1",  # GBK encoding
    "%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF",  # UTF-8 encoding
    "%41%42%43",  # Simple ASCII
    "%E6%B3%B0%E5%9D%A6",  # UTF-8 encoded "泰坦"
]

for s in test_strings:
    decoded, encoding = detect_and_decode(s)
    print(f"Original: {s}")
    print(f"Decoded: {decoded}")
    print(f"Detected encoding: {encoding}")
    print()
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

chinese encoding to be utf8 #293

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

chinese encoding to be utf8 #293

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions