-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Description
When a PDF has been incrementally updated (per the PDF 1.7 spec, §7.5.6) such that the same object appears in two different object streams - an older one and a newer one - _get_object_from_stream can cache the stale version.
This happens because the batch-parse optimization (#3677) caches every object found when decompressing a stream. The guard checks obj_num in self.xref_objStm, which passes for any compressed object regardless of which stream the xref actually points to. If the older stream is decompressed first (e.g. to read a co-located AcroForm dictionary), the old version of the object gets cached and the newer version is never read.
In practice this causes filled-in form field values to silently disappear when reading PDFs saved by form-filling software.
Environment
$ python -m platform
macOS-26.1-arm64-arm-64bit
$ python -c "import pypdf;print(pypdf._debug_versions)"
pypdf==6.9.2, crypt_provider=('cryptography', '44.0.2'), PIL=10.0.1
Code + PDF
This is a minimal, complete example that shows the issue. It constructs a synthetic PDF with two object streams: an old one containing the AcroForm dict and an empty form field, and a new one containing the same field with a value filled in. The xref correctly points the field to the new stream, but after resolving AcroForm the stale version is returned.
import io
from pypdf import PdfReader
def write_obj(buf, objnum, data):
    """Append an indirect object ``<objnum> 0 obj ... endobj`` to *buf*.

    Args:
        buf: Writable, seekable binary buffer (the script uses ``io.BytesIO``).
        objnum: Object number; the generation number is always written as 0.
        data: Object body, either ``bytes`` or a ``str`` that is encoded
            before being written.

    Returns:
        The position in *buf* at which the object starts, as reported by
        ``buf.tell()`` before anything is written.
    """
    offset = buf.tell()
    buf.write(f"{objnum} 0 obj\n".encode())
    buf.write(data if isinstance(data, bytes) else data.encode())
    buf.write(b"\nendobj\n")
    return offset
def write_objstm(buf, objnum, obj_contents):
    """Append an uncompressed ``/Type /ObjStm`` object stream to *buf*.

    The stream payload is a header of ``"<object number> <offset>"`` pairs
    followed by the object bodies joined by single spaces. Each offset is
    counted from the start of the first body, whose own position in the
    payload is written as ``/First``.

    Args:
        buf: Writable, seekable binary buffer (the script uses ``io.BytesIO``).
        objnum: Object number of the stream object itself (generation 0).
        obj_contents: Sequence of ``(object_number, body)`` pairs, where each
            body is ``bytes``.

    Returns:
        The position in *buf* at which the stream object starts.
    """
    header_parts, data_parts, cur = [], [], 0
    for oid, content in obj_contents:
        header_parts.append(f"{oid} {cur}")
        data_parts.append(content)
        # +1 accounts for the single-space separator added by the join below.
        cur += len(content) + 1
    header = " ".join(header_parts) + " "
    data = b" ".join(data_parts)
    stream = header.encode() + data

    offset = buf.tell()
    buf.write(f"{objnum} 0 obj\n".encode())
    buf.write(
        f"<< /Type /ObjStm /N {len(obj_contents)} "
        f"/First {len(header)} /Length {len(stream)} >>\n".encode()
    )
    buf.write(b"stream\n")
    buf.write(stream)
    buf.write(b"\nendstream\nendobj\n")
    return offset
# Build the synthetic PDF in memory.
buf = io.BytesIO()
buf.write(b"%PDF-1.5\n")

# Start offset of every object written directly into the file body,
# keyed by object number; used below to fill the cross-reference stream.
offsets = {}
offsets[1] = write_obj(buf, 1, "<< /Type /Catalog /Pages 2 0 R /AcroForm 5 0 R >>")
offsets[2] = write_obj(buf, 2, "<< /Type /Pages /Kids [3 0 R] /Count 1 >>")
offsets[3] = write_obj(
    buf,
    3,
    "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Annots [6 0 R] >>",
)

# Old object stream: AcroForm (obj 5) + field without /V (obj 6)
offsets[4] = write_objstm(buf, 4, [
    (5, b"<< /Fields [6 0 R] >>"),
    (6, b"<< /Type /Annot /Subtype /Widget /FT /Tx /T (amount) >>"),
])

# New object stream: field with /V (obj 6, updated)
offsets[7] = write_objstm(buf, 7, [
    (6, b"<< /Type /Annot /Subtype /Widget /FT /Tx /T (amount) /V (42) >>"),
])

# Cross-reference stream pointing obj 6 to the new stream
xref_offset = buf.tell()
# One (type, field 2, field 3) entry per object number 0..8.
# Per the PDF spec's cross-reference stream entry types: type 0 is a free
# entry, type 1 is an object stored at a byte offset, and type 2 is an
# object stored inside an object stream (stream object number, index).
raw_entries = [
    (0, 0, 65535), (1, offsets[1], 0), (1, offsets[2], 0), (1, offsets[3], 0),
    (1, offsets[4], 0), (2, 4, 0), (2, 7, 0), (1, offsets[7], 0), (1, xref_offset, 0),
]
# Serialize each entry as 1 + 4 + 2 big-endian bytes, matching /W [1 4 2].
stream_data = bytearray()
for typ, f1, f2 in raw_entries:
    stream_data.append(typ)
    stream_data.extend(f1.to_bytes(4, "big"))
    stream_data.extend(f2.to_bytes(2, "big"))

buf.write(
    "8 0 obj\n<< /Type /XRef /Size 9 /W [1 4 2] "
    f"/Root 1 0 R /Length {len(stream_data)} /Index [0 9] >>".encode()
)
buf.write(b"\nstream\n")
buf.write(bytes(stream_data))
buf.write(b"\nendstream\nendobj\n")
buf.write(f"startxref\n{xref_offset}\n%%EOF\n".encode())

reader = PdfReader(io.BytesIO(buf.getvalue()))
# Resolving AcroForm decompresses the old stream, caching stale obj 6
acroform = reader.trailer["/Root"].get_object()["/AcroForm"].get_object()
field = reader.get_object(6)
print(field.get("/V"))  # Expected: "42", Actual: None
Output on main (23d6683):
None
Traceback
No traceback, the stale value is returned silently.