|
def compress_identical_objects( |
|
self, |
|
remove_identicals: bool = True, |
|
remove_orphans: bool = True, |
|
) -> None: |
|
""" |
|
Parse the PDF file and merge objects that have the same hash. |
|
This will make objects common to multiple pages. |
|
Recommended to be used just before writing output. |
|
|
|
Args: |
|
remove_identicals: Remove identical objects. |
|
remove_orphans: Remove unreferenced objects. |
|
|
|
""" |
|
|
|
def replace_in_obj( |
|
obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject] |
|
) -> None: |
|
if isinstance(obj, DictionaryObject): |
|
key_val = obj.items() |
|
elif isinstance(obj, ArrayObject): |
|
key_val = enumerate(obj) # type: ignore |
|
else: |
|
return |
|
assert isinstance(obj, (DictionaryObject, ArrayObject)) |
|
for k, v in key_val: |
|
if isinstance(v, IndirectObject): |
|
orphans[v.idnum - 1] = False |
|
if v in crossref: |
|
obj[k] = crossref[v] |
|
else: |
|
"""the filtering on DictionaryObject and ArrayObject only |
|
will be performed within replace_in_obj""" |
|
replace_in_obj(v, crossref) |
|
|
|
# _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...]) |
|
self._idnum_hash = {} |
|
orphans = [True] * len(self._objects) |
|
# look for similar objects |
|
for idx, obj in enumerate(self._objects): |
|
if is_null_or_none(obj): |
|
continue |
|
assert obj is not None, "mypy" # mypy: TypeGuard of `is_null_or_none` does not help here. |
|
assert isinstance(obj.indirect_reference, IndirectObject) |
|
h = obj.hash_value() |
|
if remove_identicals and h in self._idnum_hash: |
|
self._idnum_hash[h][1].append(obj.indirect_reference) |
|
self._objects[idx] = None |
|
else: |
|
self._idnum_hash[h] = (obj.indirect_reference, []) |
|
|
|
# generate the dict converting others to 1st |
|
cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0} |
|
cnv_rev: Dict[IndirectObject, IndirectObject] = {} |
|
for k, v in cnv.items(): |
|
cnv_rev.update(zip(v, (k,) * len(v))) |
|
|
|
# replace reference to merged objects |
|
for obj in self._objects: |
|
if isinstance(obj, (DictionaryObject, ArrayObject)): |
|
replace_in_obj(obj, cnv_rev) |
|
|
|
# remove orphans (if applicable) |
|
orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore |
|
|
|
orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore |
|
|
|
try: |
|
orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore |
|
except AttributeError: |
|
pass |
|
for i in compress(range(len(self._objects)), orphans): |
|
self._objects[i] = None |
Looking at the code for
PdfWriter.compress_identical_objects(remove_orphans=False)atpypdf/pypdf/_writer.py
Lines 1612 to 1685 in 1c9eacd
remove_orphansis never evaluated and always assumed to beTrue.