From f34d8289b1e88c000308ea559abb7a6dd56af487 Mon Sep 17 00:00:00 2001 From: auouymous Date: Mon, 29 Jul 2024 17:35:37 -0600 Subject: [PATCH] Limit the length of UTF-8 filenames. sanitize_filename() only counts the number of characters, not the number of bytes in their UTF-8 encoding, so names containing multi-byte characters can exceed the limit. This patch converts the string to an array of UTF-8 bytes and then removes trailing characters, one whole UTF-8 sequence at a time, until the byte length no longer exceeds the limit. --- src/gpodder/util.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/gpodder/util.py b/src/gpodder/util.py index 9747209bc..0690af65b 100644 --- a/src/gpodder/util.py +++ b/src/gpodder/util.py @@ -1653,9 +1653,19 @@ def sanitize_filename(filename, max_length): >>> sanitize_filename('Cool feed (ogg)', 1) 'C' """ - if max_length > 0 and len(filename) > max_length: + if max_length > 0 and len(filename.encode('utf-8')) > max_length: logger.info('Limiting file/folder name "%s" to %d characters.', filename, max_length) - filename = filename[:max_length] + filename = filename.encode('utf-8') + length = len(filename) + while length > max_length: + # strip continuation bytes + while (filename[-1] & 0xC0) == 0x80: + filename = filename[:-1] + length -= 1 + # strip leader byte + filename = filename[:-1] + length -= 1 + filename = filename.decode('utf-8') # see #361 - at least slash must be removed filename = re.sub(r"[\"*/:<>?\\|]", "_", filename)