Skip to content

Commit c696192

Browse files
authored
MAINT: PdfReaderProtocol (#1303)
PyPDF2 has some circular dependencies that make proper typing hard: * PdfReader has the pages property, which returns a List[PageObject]. * PageObject has the pdf property, which returns the PdfReader it belongs to. The simplest solution would be to put both classes in the same file, but that makes PRs hard to read. Additionally, bigger files mean merge conflicts happen more often. Another solution is to simply not use type annotations for one of the objects (or to use `Any` as the type). The solution implemented in this PR is to define a `Protocol` (PEP 544). A protocol just states which methods a class is expected to have (with their function signatures). It's duck typing: if it walks like a duck and it quacks like a duck, then it must be a duck. So we define the expected behavior instead of referring to the specific class. typing.Iterable is an example of a Protocol. In the Java world, one would call this an interface.
1 parent 5c76c8f commit c696192

4 files changed

Lines changed: 30 additions & 10 deletions

File tree

PyPDF2/_page.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
TextStringObject,
7272
encode_pdfdocencoding,
7373
)
74+
from .types import PdfReaderProtocol
7475

7576

7677
def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
@@ -241,13 +242,11 @@ class PageObject(DictionaryObject):
241242

242243
def __init__(
243244
self,
244-
pdf: Optional[Any] = None, # PdfReader
245+
pdf: Optional[PdfReaderProtocol] = None,
245246
indirect_ref: Optional[IndirectObject] = None,
246247
) -> None:
247-
from ._reader import PdfReader
248-
249248
DictionaryObject.__init__(self)
250-
self.pdf: Optional[PdfReader] = pdf
249+
self.pdf: Optional[PdfReaderProtocol] = pdf
251250
self.indirect_ref = indirect_ref
252251

253252
def hash_value_data(self) -> bytes:

PyPDF2/_reader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -910,9 +910,9 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
910910
return outline_item
911911

912912
@property
913-
def pages(self) -> _VirtualList:
913+
def pages(self) -> List[PageObject]:
914914
"""Read-only property that emulates a list of :py:class:`Page<PyPDF2._page.Page>` objects."""
915-
return _VirtualList(self._get_num_pages, self._get_page)
915+
return _VirtualList(self._get_num_pages, self._get_page) # type: ignore
916916

917917
@property
918918
def page_layout(self) -> Optional[str]:

PyPDF2/_writer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1460,7 +1460,7 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None:
14601460
pg_dict = cast(DictionaryObject, self.get_object(self._pages))
14611461
pages = cast(List[IndirectObject], pg_dict[PA.KIDS])
14621462
for page in pages:
1463-
page_ref = cast(Dict[str, Any], self.get_object(page))
1463+
page_ref = cast(PageObject, self.get_object(page))
14641464
content = page_ref["/Contents"].get_object()
14651465
if not isinstance(content, ContentStream):
14661466
content = ContentStream(content, page_ref)

PyPDF2/types.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
"""Helpers for working with PDF types."""
22

3-
from typing import List, Union
3+
from typing import Any, Dict, List, Optional, Union
44

55
try:
66
# Python 3.8+: https://peps.python.org/pep-0586
7-
from typing import Literal # type: ignore[attr-defined]
7+
from typing import Literal, Protocol # type: ignore[attr-defined]
88
except ImportError:
9-
from typing_extensions import Literal # type: ignore[misc]
9+
from typing_extensions import Literal, Protocol # type: ignore[misc]
1010

1111
try:
1212
# Python 3.10+: https://www.python.org/dev/peps/pep-0484/
@@ -54,3 +54,24 @@
5454
"/UseOC",
5555
"/UseAttachments",
5656
]
57+
58+
59+
class PdfReaderProtocol(Protocol): # pragma: no cover
60+
@property
61+
def pdf_header(self) -> str:
62+
...
63+
64+
@property
65+
def strict(self) -> bool:
66+
...
67+
68+
@property
69+
def xref(self) -> Dict[int, Dict[int, Any]]:
70+
...
71+
72+
@property
73+
def pages(self) -> List[Any]:
74+
...
75+
76+
def get_object(self, indirect_reference: Any) -> Optional[Any]:
77+
...

0 commit comments

Comments
 (0)