Skip to content

Object stream batch-parse caches stale objects from non-authoritative streams #3697

@astahlman

Description

@astahlman

When a PDF has been incrementally updated (per the PDF 1.7 spec, §7.5.6) such that the same object appears in two different object streams — an older one and a newer one — `_get_object_from_stream` can cache the stale version.

This happens because the batch-parse optimization (#3677) caches every object found when decompressing a stream. The guard checks `obj_num in self.xref_objStm`, which passes for any compressed object regardless of which stream the xref actually points to. If the older stream is decompressed first (e.g. to read a co-located AcroForm dictionary), the old version of the object gets cached and the newer version is never read.

In practice this causes filled-in form field values to silently disappear when reading PDFs saved by form-filling software.

Environment

$ python -m platform
macOS-26.1-arm64-arm-64bit

$ python -c "import pypdf;print(pypdf._debug_versions)"
pypdf==6.9.2, crypt_provider=('cryptography', '44.0.2'), PIL=10.0.1

Code + PDF

This is a minimal, complete example that shows the issue. It constructs a synthetic PDF with two object streams: an old one containing the AcroForm dict and an empty form field, and a new one containing the same field with a value filled in. The xref correctly points the field to the new stream, but after resolving AcroForm the stale version is returned.

import io
from pypdf import PdfReader

def write_obj(buf, objnum, data):
    """Append an indirect object ``objnum 0 obj ... endobj`` to *buf*.

    *data* is the object body; a ``str`` is encoded to bytes, ``bytes`` are
    written unchanged. Returns the position of *buf* before anything was
    written, i.e. the byte offset at which the object starts.
    """
    start = buf.tell()
    buf.write(f"{objnum} 0 obj\n".encode())
    if isinstance(data, bytes):
        body = data
    else:
        body = data.encode()
    buf.write(body)
    buf.write(b"\nendobj\n")
    return start

def write_objstm(buf, objnum, obj_contents):
    """Append an uncompressed ``/ObjStm`` object stream to *buf*.

    *obj_contents* is a sequence of ``(object_number, body_bytes)`` pairs.
    The stream payload is an index of ``"<object_number> <relative_offset>"``
    pairs followed by the bodies joined with single spaces. Returns the
    position of *buf* before anything was written.
    """
    index_entries = []
    bodies = []
    rel_offset = 0
    for obj_id, body in obj_contents:
        index_entries.append(f"{obj_id} {rel_offset}")
        bodies.append(body)
        # +1 accounts for the single-space separator placed between bodies.
        rel_offset += len(body) + 1

    # The trailing space terminates the index; /First is the index length.
    index_text = " ".join(index_entries) + " "
    payload = index_text.encode() + b" ".join(bodies)

    start = buf.tell()
    buf.write(f"{objnum} 0 obj\n".encode())
    dictionary = (
        f"<< /Type /ObjStm /N {len(obj_contents)} "
        f"/First {len(index_text)} /Length {len(payload)} >>\n"
    )
    buf.write(dictionary.encode())
    buf.write(b"stream\n")
    buf.write(payload)
    buf.write(b"\nendstream\nendobj\n")
    return start

# Build the synthetic PDF in memory. `offsets` maps an object number to the
# byte offset returned by the write helper for that object; those offsets are
# later stored in the cross-reference stream.
buf = io.BytesIO()
buf.write(b"%PDF-1.5\n")
offsets = {}
# Uncompressed objects: catalog (1, referencing AcroForm 5), page tree (2),
# and a single page (3) whose /Annots array references the form field (6).
offsets[1] = write_obj(buf, 1, "<< /Type /Catalog /Pages 2 0 R /AcroForm 5 0 R >>")
offsets[2] = write_obj(buf, 2, "<< /Type /Pages /Kids [3 0 R] /Count 1 >>")
offsets[3] = write_obj(buf, 3,
    "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Annots [6 0 R] >>")

# Old object stream: AcroForm (obj 5) + field without /V (obj 6)
offsets[4] = write_objstm(buf, 4, [
    (5, b"<< /Fields [6 0 R] >>"),
    (6, b"<< /Type /Annot /Subtype /Widget /FT /Tx /T (amount) >>"),
])
# New object stream: field with /V (obj 6, updated)
offsets[7] = write_objstm(buf, 7, [
    (6, b"<< /Type /Annot /Subtype /Widget /FT /Tx /T (amount) /V (42) >>"),
])

# Cross-reference stream pointing obj 6 to the new stream
# One entry per object number 0..8, in order. Per the PDF cross-reference
# stream format: type 0 = free, type 1 = uncompressed object at a byte offset,
# type 2 = compressed object given as (containing stream number, index).
# Obj 5 is entry (2, 4, 0): index 0 of stream 4. Obj 6 is entry (2, 7, 0):
# index 0 of stream 7, i.e. the updated field, not the copy in stream 4.
xref_offset = buf.tell()
raw_entries = [
    (0, 0, 65535), (1, offsets[1], 0), (1, offsets[2], 0), (1, offsets[3], 0),
    (1, offsets[4], 0), (2, 4, 0), (2, 7, 0), (1, offsets[7], 0), (1, xref_offset, 0),
]
# Pack each entry as 1 + 4 + 2 big-endian bytes, matching /W [1 4 2] below.
stream_data = bytearray()
for typ, f1, f2 in raw_entries:
    stream_data.append(typ)
    stream_data.extend(f1.to_bytes(4, "big"))
    stream_data.extend(f2.to_bytes(2, "big"))
# Object 8 is the cross-reference stream itself; its dictionary also carries
# the /Root entry, and startxref below points at its offset.
buf.write(
    f"8 0 obj\n<< /Type /XRef /Size 9 /W [1 4 2] "
    f"/Root 1 0 R /Length {len(stream_data)} /Index [0 9] >>".encode()
)
buf.write(b"\nstream\n")
buf.write(bytes(stream_data))
buf.write(b"\nendstream\nendobj\n")
buf.write(f"startxref\n{xref_offset}\n%%EOF\n".encode())

# Parse the generated bytes from a fresh stream.
reader = PdfReader(io.BytesIO(buf.getvalue()))

# Resolving AcroForm decompresses the old stream, caching stale obj 6
acroform = reader.trailer["/Root"].get_object()["/AcroForm"].get_object()

# The xref maps obj 6 to stream 7, so /V should be present on the result.
field = reader.get_object(6)
print(field.get("/V"))  # Expected: "42", Actual: None

Output on main (23d6683):
None

Traceback

No traceback; the stale value is returned silently.

Metadata

Metadata

Assignees

No one assigned

    Labels

    is-regression: Regression introduced as a side-effect of another change

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions