Skip to content

SimpleCacheFileSystem does not handle "a" or "r+" properly #1977

@AllanChain

Description

@AllanChain

Currently, SimpleCacheFileSystem only get the file from underlying file system when the mode is reading ("r"). However, for appending ("a") mode, we also need to download the file, but no error should be raised if the file does not exist.

Besides, SimpleCacheFileSystem should return a LocalTempFile instance if the mode is reading and writing ("r+") because the file can be updated.

Related code:

def _open(self, path, mode="rb", **kwargs):
path = self._strip_protocol(path)
sha = self._mapper(path)
if "r" not in mode:
fn = os.path.join(self.storage[-1], sha)
user_specified_kwargs = {
k: v
for k, v in kwargs.items()
if k not in ["autocommit", "block_size", "cache_options"]
} # those were added by open()
return LocalTempFile(
self,
path,
mode=mode,
autocommit=not self._intrans,
fn=fn,
**user_specified_kwargs,
)
fn = self._check_file(path)
if fn:
return open(fn, mode)
fn = os.path.join(self.storage[-1], sha)
logger.debug("Copying %s to local cache", path)
kwargs["mode"] = mode
self._mkcache()
self._cache_size = None
if self.compression:
with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
if isinstance(f, AbstractBufferedFile):
# want no type of caching if just downloading whole thing
f.cache = BaseCache(0, f.cache.fetcher, f.size)
comp = (
infer_compression(path)
if self.compression == "infer"
else self.compression
)
f = compr[comp](f, mode="rb")
data = True
while data:
block = getattr(f, "blocksize", 5 * 2**20)
data = f.read(block)
f2.write(data)
else:
self.fs.get_file(path, fn)
return self._open(path, mode)

Suggested changes (simplified, without compression):

    def _open(self, path, mode="rb", **kwargs):
        path = self._strip_protocol(path)
        sha = self._mapper(path)

        # For read and append, (try) download from remote
        if "r" in mode or "a" in mode:
            fn = self._check_file(path)
            if fn:
                return open(fn, mode)

            fn = os.path.join(self.storage[-1], sha)

            self._mkcache()
            self._cache_size = None

            # append does not require an existing file but read does
            if self.fs.exists(path):
                self.fs.get_file(path, fn)
            elif "r" in mode:
                raise FileNotFoundError(path)

        # just reading does not need special file handling
        if "r" in mode and "+" not in mode:
            return open(fn, mode)

        fn = os.path.join(self.storage[-1], sha)
        user_specified_kwargs = {
            k: v
            for k, v in kwargs.items()
            if k not in ["autocommit", "block_size", "cache_options"]
        }  # those were added by open()
        return LocalTempFile(
            self,
            path,
            mode=mode,
            autocommit=not self._intrans,
            fn=fn,
            **user_specified_kwargs,
        )

Related docs:

I'm willing to create a PR if the maintainers like the changes.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions