|
21 | 21 | import datetime |
22 | 22 | import pathlib |
23 | 23 | import pickle |
| 24 | +import sys |
24 | 25 | import textwrap |
25 | 26 | import tempfile |
26 | 27 | import threading |
@@ -2582,6 +2583,32 @@ def test_open_dataset_from_fsspec(tempdir): |
2582 | 2583 | assert dataset.schema.equals(table.schema) |
2583 | 2584 |
|
2584 | 2585 |
|
@pytest.mark.parquet
def test_file_format_inspect_fsspec(tempdir):
    """Inspect a Parquet file through an fsspec-backed PyFileSystem.

    Regression test for https://issues.apache.org/jira/browse/ARROW-16413
    """
    fsspec = pytest.importorskip("fsspec")

    # write a small single-column Parquet file with pyarrow
    expected = pa.table({'a': [1, 2, 3]})
    parquet_path = tempdir / "data.parquet"
    pq.write_table(expected, parquet_path)

    # the local fsspec filesystem has to list the file we just wrote
    local_fs = fsspec.filesystem("file")
    listing = local_fs.ls(tempdir)
    assert listing[0].endswith("data.parquet")

    # Wrap the fsspec filesystem by hand: going through
    # fs._ensure_filesystem would convert an fsspec local filesystem to a
    # native one, and then the fsspec handler would not be exercised.
    wrapped_fs = fs.PyFileSystem(fs.FSSpecHandler(local_fs))
    parquet_format = ds.ParquetFileFormat()

    # schema obtained by inspecting the file via the dataset file format
    inspected = parquet_format.inspect(parquet_path, wrapped_fs)
    assert inspected.equals(expected.schema)

    # a fragment created for the same file reports the same schema
    frag = parquet_format.make_fragment(parquet_path, wrapped_fs)
    assert frag.physical_schema.equals(expected.schema)
| 2610 | + |
| 2611 | + |
2585 | 2612 | @pytest.mark.pandas |
2586 | 2613 | def test_filter_timestamp(tempdir, dataset_reader): |
2587 | 2614 | # ARROW-11379 |
@@ -3094,6 +3121,30 @@ def test_parquet_dataset_factory(tempdir): |
3094 | 3121 | assert result.num_rows == 40 |
3095 | 3122 |
|
3096 | 3123 |
|
@pytest.mark.parquet
@pytest.mark.pandas  # write_to_dataset currently requires pandas
@pytest.mark.skipif(sys.platform == 'win32',
                    reason="Results in FileNotFoundError on Windows")
def test_parquet_dataset_factory_fsspec(tempdir):
    """Build a dataset from a metadata path via an fsspec-backed filesystem.

    Regression test for https://issues.apache.org/jira/browse/ARROW-16413
    """
    fsspec = pytest.importorskip("fsspec")

    # write the dataset (and its metadata file) with pyarrow
    dataset_root = tempdir / "test_parquet_dataset"
    metadata_path, expected = _create_parquet_dataset_simple(dataset_root)

    # Wrap the local fsspec filesystem by hand: passing it directly would
    # internally be converted to the native LocalFileSystem.
    local_fs = fsspec.filesystem("file")
    wrapped_fs = fs.PyFileSystem(fs.FSSpecHandler(local_fs))

    # read the dataset back through the wrapped filesystem
    dataset = ds.parquet_dataset(metadata_path, filesystem=wrapped_fs)
    assert dataset.schema.equals(expected.schema)
    assert len(dataset.files) == 4
    loaded = dataset.to_table()
    assert loaded.num_rows == 40
| 3146 | + |
| 3147 | + |
3097 | 3148 | @pytest.mark.parquet |
3098 | 3149 | @pytest.mark.pandas # write_to_dataset currently requires pandas |
3099 | 3150 | @pytest.mark.parametrize('use_legacy_dataset', [False, True]) |
|
0 commit comments