Skip to content

Commit d897716

Browse files
jorisvandenbossche authored and kszucs committed
ARROW-16413: [Python] Certain dataset APIs hang with a python filesystem
Closes #13033 from jorisvandenbossche/ARROW-16413 Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com> Signed-off-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
1 parent 26f2d87 commit d897716

3 files changed

Lines changed: 61 additions & 5 deletions

File tree

python/pyarrow/_dataset.pyx

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -761,8 +761,12 @@ cdef class FileFormat(_Weakrefable):
761761
schema : Schema
762762
The schema inferred from the file
763763
"""
764-
c_source = _make_file_source(file, filesystem)
765-
c_schema = GetResultValue(self.format.Inspect(c_source))
764+
cdef:
765+
CFileSource c_source = _make_file_source(file, filesystem)
766+
CResult[shared_ptr[CSchema]] c_result
767+
with nogil:
768+
c_result = self.format.Inspect(c_source)
769+
c_schema = GetResultValue(c_result)
766770
return pyarrow_wrap_schema(move(c_schema))
767771

768772
def make_fragment(self, file, filesystem=None,

python/pyarrow/_dataset_parquet.pyx

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -788,7 +788,7 @@ cdef class ParquetDatasetFactory(DatasetFactory):
788788
FileFormat format not None,
789789
ParquetFactoryOptions options=None):
790790
cdef:
791-
c_string path
791+
c_string c_path
792792
shared_ptr[CFileSystem] c_filesystem
793793
shared_ptr[CParquetFileFormat] c_format
794794
CResult[shared_ptr[CDatasetFactory]] result
@@ -801,8 +801,9 @@ cdef class ParquetDatasetFactory(DatasetFactory):
801801
options = options or ParquetFactoryOptions()
802802
c_options = options.unwrap()
803803

804-
result = CParquetDatasetFactory.MakeFromMetaDataPath(
805-
c_path, c_filesystem, c_format, c_options)
804+
with nogil:
805+
result = CParquetDatasetFactory.MakeFromMetaDataPath(
806+
c_path, c_filesystem, c_format, c_options)
806807
self.init(GetResultValue(result))
807808

808809
cdef init(self, shared_ptr[CDatasetFactory]& sp):

python/pyarrow/tests/test_dataset.py

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
import datetime
2222
import pathlib
2323
import pickle
24+
import sys
2425
import textwrap
2526
import tempfile
2627
import threading
@@ -2582,6 +2583,32 @@ def test_open_dataset_from_fsspec(tempdir):
25822583
assert dataset.schema.equals(table.schema)
25832584

25842585

2586+
@pytest.mark.parquet
2587+
def test_file_format_inspect_fsspec(tempdir):
2588+
# https://issues.apache.org/jira/browse/ARROW-16413
2589+
fsspec = pytest.importorskip("fsspec")
2590+
2591+
# create bucket + file with pyarrow
2592+
table = pa.table({'a': [1, 2, 3]})
2593+
path = tempdir / "data.parquet"
2594+
pq.write_table(table, path)
2595+
2596+
# read using fsspec filesystem
2597+
fsspec_fs = fsspec.filesystem("file")
2598+
assert fsspec_fs.ls(tempdir)[0].endswith("data.parquet")
2599+
2600+
# inspect using dataset file format
2601+
format = ds.ParquetFileFormat()
2602+
# manually creating a PyFileSystem instead of using fs._ensure_filesystem
2603+
# which would convert an fsspec local filesystem to a native one
2604+
filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs))
2605+
schema = format.inspect(path, filesystem)
2606+
assert schema.equals(table.schema)
2607+
2608+
fragment = format.make_fragment(path, filesystem)
2609+
assert fragment.physical_schema.equals(table.schema)
2610+
2611+
25852612
@pytest.mark.pandas
25862613
def test_filter_timestamp(tempdir, dataset_reader):
25872614
# ARROW-11379
@@ -3094,6 +3121,30 @@ def test_parquet_dataset_factory(tempdir):
30943121
assert result.num_rows == 40
30953122

30963123

3124+
@pytest.mark.parquet
3125+
@pytest.mark.pandas # write_to_dataset currently requires pandas
3126+
@pytest.mark.skipif(sys.platform == 'win32',
3127+
reason="Results in FileNotFoundError on Windows")
3128+
def test_parquet_dataset_factory_fsspec(tempdir):
3129+
# https://issues.apache.org/jira/browse/ARROW-16413
3130+
fsspec = pytest.importorskip("fsspec")
3131+
3132+
# create dataset with pyarrow
3133+
root_path = tempdir / "test_parquet_dataset"
3134+
metadata_path, table = _create_parquet_dataset_simple(root_path)
3135+
3136+
# read using fsspec filesystem
3137+
fsspec_fs = fsspec.filesystem("file")
3138+
# manually creating a PyFileSystem, because passing the local fsspec
3139+
# filesystem would internally be converted to native LocalFileSystem
3140+
filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs))
3141+
dataset = ds.parquet_dataset(metadata_path, filesystem=filesystem)
3142+
assert dataset.schema.equals(table.schema)
3143+
assert len(dataset.files) == 4
3144+
result = dataset.to_table()
3145+
assert result.num_rows == 40
3146+
3147+
30973148
@pytest.mark.parquet
30983149
@pytest.mark.pandas # write_to_dataset currently requires pandas
30993150
@pytest.mark.parametrize('use_legacy_dataset', [False, True])

0 commit comments

Comments (0)