Skip to content

Commit 16e33d3

Browse files
Fix test for reading pandas metadata from the common metadata file
1 parent a8989b4 commit 16e33d3

2 files changed

Lines changed: 42 additions & 7 deletions

File tree

python/pyarrow/parquet/core.py

Lines changed: 35 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@
4242
ParquetLogicalType,
4343
FileEncryptionProperties,
4444
FileDecryptionProperties)
45-
from pyarrow.fs import (LocalFileSystem, FileSystem,
45+
from pyarrow.fs import (LocalFileSystem, FileSystem, FileType,
4646
_resolve_filesystem_and_path, _ensure_filesystem)
4747
from pyarrow import filesystem as legacyfs
4848
from pyarrow.util import guid, _is_path_like, _stringify_path, _deprecate_api
@@ -1760,7 +1760,7 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None,
17601760
)
17611761
warnings.warn(
17621762
"Passing 'use_legacy_dataset=True' to get the legacy behaviour is "
1763-
"deprecated as of pyarrow 10.0.0, and the legacy implementation "
1763+
"deprecated as of pyarrow 11.0.0, and the legacy implementation "
17641764
"will be removed in a future version.",
17651765
FutureWarning, stacklevel=2)
17661766
self = object.__new__(cls)
@@ -2419,6 +2419,7 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None,
24192419

24202420
# check for single fragment dataset
24212421
single_file = None
2422+
self._base_dir = None
24222423
if not isinstance(path_or_paths, list):
24232424
if _is_path_like(path_or_paths):
24242425
path_or_paths = _stringify_path(path_or_paths)
@@ -2429,8 +2430,11 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None,
24292430
path_or_paths)
24302431
except ValueError:
24312432
filesystem = LocalFileSystem(use_mmap=memory_map)
2432-
if filesystem.get_file_info(path_or_paths).is_file:
2433+
finfo = filesystem.get_file_info(path_or_paths)
2434+
if finfo.is_file:
24332435
single_file = path_or_paths
2436+
if finfo.type == FileType.Directory:
2437+
self._base_dir = path_or_paths
24342438
else:
24352439
single_file = path_or_paths
24362440

@@ -2554,7 +2558,16 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
25542558
"""
25552559
# if use_pandas_metadata, we need to include index columns in the
25562560
# column selection, to be able to restore those in the pandas DataFrame
2557-
metadata = self.schema.metadata
2561+
metadata = self.schema.metadata or {}
2562+
2563+
if use_pandas_metadata:
2564+
# if the dataset schema metadata itself doesn't have pandas
2565+
# then try to get this from common file (for backwards compat)
2566+
if b"pandas" not in metadata:
2567+
common_metadata = self._get_common_pandas_metadata()
2568+
if common_metadata:
2569+
metadata = common_metadata
2570+
25582571
if columns is not None and use_pandas_metadata:
25592572
if metadata and b'pandas' in metadata:
25602573
# RangeIndex can be represented as dict instead of column name
@@ -2581,6 +2594,24 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
25812594

25822595
return table
25832596

2597+
def _get_common_pandas_metadata(self):
2598+
2599+
if not self._base_dir:
2600+
return None
2601+
2602+
metadata = None
2603+
for name in ["_common_metadata", "_metadata"]:
2604+
metadata_path = os.path.join(str(self._base_dir), name)
2605+
finfo = self.filesystem.get_file_info(metadata_path)
2606+
if finfo.is_file:
2607+
pq_meta = read_metadata(
2608+
metadata_path, filesystem=self.filesystem)
2609+
metadata = pq_meta.metadata
2610+
if metadata and b'pandas' in metadata:
2611+
break
2612+
2613+
return metadata
2614+
25842615
def read_pandas(self, **kwargs):
25852616
"""
25862617
Read dataset including pandas metadata, if any. Other arguments passed

python/pyarrow/tests/parquet/test_pandas.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -625,8 +625,12 @@ def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset):
625625

626626

627627
@pytest.mark.pandas
628+
@parametrize_legacy_dataset
628629
@pytest.mark.parametrize('preserve_index', [True, False, None])
629-
def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
630+
@pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"])
631+
def test_dataset_read_pandas_common_metadata(
632+
tempdir, use_legacy_dataset, preserve_index, metadata_fname
633+
):
630634
# ARROW-1103
631635
nfiles = 5
632636
size = 5
@@ -658,9 +662,9 @@ def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
658662
table_for_metadata = pa.Table.from_pandas(
659663
df, preserve_index=preserve_index
660664
)
661-
pq.write_metadata(table_for_metadata.schema, dirpath / '_metadata')
665+
pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname)
662666

663-
dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=True)
667+
dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset)
664668
columns = ['uint8', 'strings']
665669
result = dataset.read_pandas(columns=columns).to_pandas()
666670
expected = pd.concat([x[columns] for x in frames])

0 commit comments

Comments (0)