Skip to content

Commit 16e33d3

Browse files
Fix test for reading pandas metadata from the common metadata file
1 parent a8989b4 commit 16e33d3

2 files changed

Lines changed: 42 additions & 7 deletions

File tree

python/pyarrow/parquet/core.py

Lines changed: 35 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@
4242
ParquetLogicalType,
4343
FileEncryptionProperties,
4444
FileDecryptionProperties)
45-
from pyarrow.fs import (LocalFileSystem, FileSystem,
45+
from pyarrow.fs import (LocalFileSystem, FileSystem, FileType,
4646
_resolve_filesystem_and_path, _ensure_filesystem)
4747
from pyarrow import filesystem as legacyfs
4848
from pyarrow.util import guid, _is_path_like, _stringify_path, _deprecate_api
@@ -1760,7 +1760,7 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None,
17601760
)
17611761
warnings.warn(
17621762
"Passing 'use_legacy_dataset=True' to get the legacy behaviour is "
1763-
"deprecated as of pyarrow 10.0.0, and the legacy implementation "
1763+
"deprecated as of pyarrow 11.0.0, and the legacy implementation "
17641764
"will be removed in a future version.",
17651765
FutureWarning, stacklevel=2)
17661766
self = object.__new__(cls)
@@ -2419,6 +2419,7 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None,
24192419

24202420
# check for single fragment dataset
24212421
single_file = None
2422+
self._base_dir = None
24222423
if not isinstance(path_or_paths, list):
24232424
if _is_path_like(path_or_paths):
24242425
path_or_paths = _stringify_path(path_or_paths)
@@ -2429,8 +2430,11 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None,
24292430
path_or_paths)
24302431
except ValueError:
24312432
filesystem = LocalFileSystem(use_mmap=memory_map)
2432-
if filesystem.get_file_info(path_or_paths).is_file:
2433+
finfo = filesystem.get_file_info(path_or_paths)
2434+
if finfo.is_file:
24332435
single_file = path_or_paths
2436+
if finfo.type == FileType.Directory:
2437+
self._base_dir = path_or_paths
24342438
else:
24352439
single_file = path_or_paths
24362440

@@ -2554,7 +2558,16 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
25542558
"""
25552559
# if use_pandas_metadata, we need to include index columns in the
25562560
# column selection, to be able to restore those in the pandas DataFrame
2557-
metadata = self.schema.metadata
2561+
metadata = self.schema.metadata or {}
2562+
2563+
if use_pandas_metadata:
2564+
# if the dataset schema metadata itself doesn't have pandas
2565+
# then try to get this from common file (for backwards compat)
2566+
if b"pandas" not in metadata:
2567+
common_metadata = self._get_common_pandas_metadata()
2568+
if common_metadata:
2569+
metadata = common_metadata
2570+
25582571
if columns is not None and use_pandas_metadata:
25592572
if metadata and b'pandas' in metadata:
25602573
# RangeIndex can be represented as dict instead of column name
@@ -2581,6 +2594,24 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
25812594

25822595
return table
25832596

2597+
def _get_common_pandas_metadata(self):
2598+
2599+
if not self._base_dir:
2600+
return None
2601+
2602+
metadata = None
2603+
for name in ["_common_metadata", "_metadata"]:
2604+
metadata_path = os.path.join(str(self._base_dir), name)
2605+
finfo = self.filesystem.get_file_info(metadata_path)
2606+
if finfo.is_file:
2607+
pq_meta = read_metadata(
2608+
metadata_path, filesystem=self.filesystem)
2609+
metadata = pq_meta.metadata
2610+
if metadata and b'pandas' in metadata:
2611+
break
2612+
2613+
return metadata
2614+
25842615
def read_pandas(self, **kwargs):
25852616
"""
25862617
Read dataset including pandas metadata, if any. Other arguments passed

python/pyarrow/tests/parquet/test_pandas.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -625,8 +625,12 @@ def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset):
625625

626626

627627
@pytest.mark.pandas
628+
@parametrize_legacy_dataset
628629
@pytest.mark.parametrize('preserve_index', [True, False, None])
629-
def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
630+
@pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"])
631+
def test_dataset_read_pandas_common_metadata(
632+
tempdir, use_legacy_dataset, preserve_index, metadata_fname
633+
):
630634
# ARROW-1103
631635
nfiles = 5
632636
size = 5
@@ -658,9 +662,9 @@ def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
658662
table_for_metadata = pa.Table.from_pandas(
659663
df, preserve_index=preserve_index
660664
)
661-
pq.write_metadata(table_for_metadata.schema, dirpath / '_metadata')
665+
pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname)
662666

663-
dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=True)
667+
dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset)
664668
columns = ['uint8', 'strings']
665669
result = dataset.read_pandas(columns=columns).to_pandas()
666670
expected = pd.concat([x[columns] for x in frames])

0 commit comments

Comments (0)