Skip to content

Commit a6e956b

Browse files
elprans authored and serhiy-storchaka committed
bpo-34726: Fix handling of hash-based pycs in zipimport. (GH-10327)
Current support for hash-based bytecode files in `zipimport` is rather sparse, which leads to test failures when the test suite is run with the ``SOURCE_DATE_EPOCH`` environment variable set. This teaches zipimport to handle hash-based pycs properly.
1 parent bfe1839 commit a6e956b

File tree

4 files changed

+1147
-1049
lines changed

4 files changed

+1147
-1049
lines changed

Lib/test/test_cmd_line_script.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,10 +259,32 @@ def test_zipfile(self):
259259
self._check_script(zip_name, run_name, zip_name, zip_name, '',
260260
zipimport.zipimporter)
261261

262-
def test_zipfile_compiled(self):
262+
def test_zipfile_compiled_timestamp(self):
263263
with support.temp_dir() as script_dir:
264264
script_name = _make_test_script(script_dir, '__main__')
265-
compiled_name = py_compile.compile(script_name, doraise=True)
265+
compiled_name = py_compile.compile(
266+
script_name, doraise=True,
267+
invalidation_mode=py_compile.PycInvalidationMode.TIMESTAMP)
268+
zip_name, run_name = make_zip_script(script_dir, 'test_zip', compiled_name)
269+
self._check_script(zip_name, run_name, zip_name, zip_name, '',
270+
zipimport.zipimporter)
271+
272+
def test_zipfile_compiled_checked_hash(self):
273+
with support.temp_dir() as script_dir:
274+
script_name = _make_test_script(script_dir, '__main__')
275+
compiled_name = py_compile.compile(
276+
script_name, doraise=True,
277+
invalidation_mode=py_compile.PycInvalidationMode.CHECKED_HASH)
278+
zip_name, run_name = make_zip_script(script_dir, 'test_zip', compiled_name)
279+
self._check_script(zip_name, run_name, zip_name, zip_name, '',
280+
zipimport.zipimporter)
281+
282+
def test_zipfile_compiled_unchecked_hash(self):
283+
with support.temp_dir() as script_dir:
284+
script_name = _make_test_script(script_dir, '__main__')
285+
compiled_name = py_compile.compile(
286+
script_name, doraise=True,
287+
invalidation_mode=py_compile.PycInvalidationMode.UNCHECKED_HASH)
266288
zip_name, run_name = make_zip_script(script_dir, 'test_zip', compiled_name)
267289
self._check_script(zip_name, run_name, zip_name, zip_name, '',
268290
zipimport.zipimporter)

Lib/zipimport.py

Lines changed: 66 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -578,33 +578,53 @@ def _eq_mtime(t1, t2):
578578
# dostime only stores even seconds, so be lenient
579579
return abs(t1 - t2) <= 1
580580

581+
581582
# Given the contents of a .py[co] file, unmarshal the data
582583
# and return the code object. Return None if the magic word doesn't
583-
# match (we do this instead of raising an exception as we fall back
584+
# match, or if the recorded .py[co] metadata does not match the source,
585+
# (we do this instead of raising an exception as we fall back
584586
# to .py if available and we don't want to mask other errors).
585-
def _unmarshal_code(pathname, data, mtime):
586-
if len(data) < 16:
587-
raise ZipImportError('bad pyc data')
588-
589-
if data[:4] != _bootstrap_external.MAGIC_NUMBER:
590-
_bootstrap._verbose_message('{!r} has bad magic', pathname)
591-
return None # signal caller to try alternative
592-
593-
flags = _unpack_uint32(data[4:8])
594-
if flags != 0:
595-
# Hash-based pyc. We currently refuse to handle checked hash-based
596-
# pycs. We could validate hash-based pycs against the source, but it
597-
# seems likely that most people putting hash-based pycs in a zipfile
598-
# will use unchecked ones.
587+
def _unmarshal_code(self, pathname, fullpath, fullname, data):
588+
exc_details = {
589+
'name': fullname,
590+
'path': fullpath,
591+
}
592+
593+
try:
594+
flags = _bootstrap_external._classify_pyc(data, fullname, exc_details)
595+
except ImportError:
596+
return None
597+
598+
hash_based = flags & 0b1 != 0
599+
if hash_based:
600+
check_source = flags & 0b10 != 0
599601
if (_imp.check_hash_based_pycs != 'never' and
600-
(flags != 0x1 or _imp.check_hash_based_pycs == 'always')):
601-
return None
602-
elif mtime != 0 and not _eq_mtime(_unpack_uint32(data[8:12]), mtime):
603-
_bootstrap._verbose_message('{!r} has bad mtime', pathname)
604-
return None # signal caller to try alternative
602+
(check_source or _imp.check_hash_based_pycs == 'always')):
603+
source_bytes = _get_pyc_source(self, fullpath)
604+
if source_bytes is not None:
605+
source_hash = _imp.source_hash(
606+
_bootstrap_external._RAW_MAGIC_NUMBER,
607+
source_bytes,
608+
)
609+
610+
try:
611+
_bootstrap_external._validate_hash_pyc(
612+
data, source_hash, fullname, exc_details)
613+
except ImportError:
614+
return None
615+
else:
616+
source_mtime, source_size = \
617+
_get_mtime_and_size_of_source(self, fullpath)
618+
619+
if source_mtime:
620+
# We don't use _bootstrap_external._validate_timestamp_pyc
621+
# to allow for a more lenient timestamp check.
622+
if (not _eq_mtime(_unpack_uint32(data[8:12]), source_mtime) or
623+
_unpack_uint32(data[12:16]) != source_size):
624+
_bootstrap._verbose_message(
625+
f'bytecode is stale for {fullname!r}')
626+
return None
605627

606-
# XXX the pyc's size field is ignored; timestamp collisions are probably
607-
# unimportant with zip files.
608628
code = marshal.loads(data[16:])
609629
if not isinstance(code, _code_type):
610630
raise TypeError(f'compiled module {pathname!r} is not a code object')
@@ -639,9 +659,9 @@ def _parse_dostime(d, t):
639659
-1, -1, -1))
640660

641661
# Given a path to a .pyc file in the archive, return the
642-
# modification time of the matching .py file, or 0 if no source
643-
# is available.
644-
def _get_mtime_of_source(self, path):
662+
# modification time of the matching .py file and its size,
663+
# or (0, 0) if no source is available.
664+
def _get_mtime_and_size_of_source(self, path):
645665
try:
646666
# strip 'c' or 'o' from *.py[co]
647667
assert path[-1:] in ('c', 'o')
@@ -651,9 +671,27 @@ def _get_mtime_of_source(self, path):
651671
# with an embedded pyc time stamp
652672
time = toc_entry[5]
653673
date = toc_entry[6]
654-
return _parse_dostime(date, time)
674+
uncompressed_size = toc_entry[3]
675+
return _parse_dostime(date, time), uncompressed_size
655676
except (KeyError, IndexError, TypeError):
656-
return 0
677+
return 0, 0
678+
679+
680+
# Given a path to a .pyc file in the archive, return the
681+
# contents of the matching .py file, or None if no source
682+
# is available.
683+
def _get_pyc_source(self, path):
684+
# strip 'c' or 'o' from *.py[co]
685+
assert path[-1:] in ('c', 'o')
686+
path = path[:-1]
687+
688+
try:
689+
toc_entry = self._files[path]
690+
except KeyError:
691+
return None
692+
else:
693+
return _get_data(self.archive, toc_entry)
694+
657695

658696
# Get the code object associated with the module specified by
659697
# 'fullname'.
@@ -670,8 +708,7 @@ def _get_module_code(self, fullname):
670708
modpath = toc_entry[0]
671709
data = _get_data(self.archive, toc_entry)
672710
if isbytecode:
673-
mtime = _get_mtime_of_source(self, fullpath)
674-
code = _unmarshal_code(modpath, data, mtime)
711+
code = _unmarshal_code(self, modpath, fullpath, fullname, data)
675712
else:
676713
code = _compile_source(modpath, data)
677714
if code is None:
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix handling of hash-based bytecode files in :mod:`zipimport`.
2+
Patch by Elvis Pranskevichus.

0 commit comments

Comments
 (0)