Skip to content

Commit 9daeabd

Browse files
committed
ROB : fix offset correction in revised PDF
Fixes #328. The problem is observed in PDFs where the xref tables of previous revisions do not start at 0; in that case _zero_xref was changing indices that should not have been changed.
1 parent a61ef5f commit 9daeabd

2 files changed

Lines changed: 15 additions & 8 deletions

File tree

PyPDF2/_reader.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1347,15 +1347,19 @@ def read(self, stream: StreamType) -> None:
13471347
for gen, xref_entry in self.xref.items():
13481348
if gen == 65535:
13491349
continue
1350-
for id in xref_entry:
1350+
xref_k = sorted(
1351+
xref_entry.keys()
1352+
) # must ensure ascending order to prevent damage
1353+
for id in xref_k:
13511354
stream.seek(xref_entry[id], 0)
13521355
try:
13531356
pid, _pgen = self.read_object_header(stream)
13541357
except ValueError:
13551358
break
13561359
if pid == id - self.xref_index:
1357-
self._zero_xref(gen)
1358-
break
1360+
# fixing index item per item is required for revised PDF.
1361+
self.xref[gen][pid] = self.xref[gen][id]
1362+
del self.xref[gen][id]
13591363
# if not, then either it's just plain wrong, or the
13601364
# non-zero-index is actually correct
13611365
stream.seek(loc, 0) # return to where it was
@@ -1750,11 +1754,6 @@ def _read_xref_subsections(
17501754
elif self.strict:
17511755
raise PdfReadError(f"Unknown xref type: {xref_type}")
17521756

1753-
def _zero_xref(self, generation: int) -> None:
1754-
self.xref[generation] = {
1755-
k - self.xref_index: v for (k, v) in list(self.xref[generation].items())
1756-
}
1757-
17581757
def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]:
17591758
i = 0
17601759
while True:

tests/test_reader.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1136,3 +1136,11 @@ def test_reader(caplog):
11361136
# ...and now no more required
11371137
reader.pages[0].extract_text()
11381138
assert caplog.text == ""
1139+
1140+
1141+
def test_zeroing_xref():
1142+
# iss #328
1143+
url = "https://github.com/py-pdf/PyPDF2/files/9066120/UTA_OSHA_3115_Fall_Protection_Training_09162021_.pdf"
1144+
name = "UTA_OSHA.pdf"
1145+
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
1146+
len(reader.pages)

0 commit comments

Comments
 (0)