4242 ParquetLogicalType ,
4343 FileEncryptionProperties ,
4444 FileDecryptionProperties )
45- from pyarrow .fs import (LocalFileSystem , FileSystem ,
45+ from pyarrow .fs import (LocalFileSystem , FileSystem , FileType ,
4646 _resolve_filesystem_and_path , _ensure_filesystem )
4747from pyarrow import filesystem as legacyfs
4848from pyarrow .util import guid , _is_path_like , _stringify_path , _deprecate_api
@@ -1760,7 +1760,7 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None,
17601760 )
17611761 warnings .warn (
17621762 "Passing 'use_legacy_dataset=True' to get the legacy behaviour is "
1763- "deprecated as of pyarrow 10 .0.0, and the legacy implementation "
1763+ "deprecated as of pyarrow 11 .0.0, and the legacy implementation "
17641764 "will be removed in a future version." ,
17651765 FutureWarning , stacklevel = 2 )
17661766 self = object .__new__ (cls )
@@ -2419,6 +2419,7 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None,
24192419
24202420 # check for single fragment dataset
24212421 single_file = None
2422+ self ._base_dir = None
24222423 if not isinstance (path_or_paths , list ):
24232424 if _is_path_like (path_or_paths ):
24242425 path_or_paths = _stringify_path (path_or_paths )
@@ -2429,8 +2430,11 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None,
24292430 path_or_paths )
24302431 except ValueError :
24312432 filesystem = LocalFileSystem (use_mmap = memory_map )
2432- if filesystem .get_file_info (path_or_paths ).is_file :
2433+ finfo = filesystem .get_file_info (path_or_paths )
2434+ if finfo .is_file :
24332435 single_file = path_or_paths
2436+ if finfo .type == FileType .Directory :
2437+ self ._base_dir = path_or_paths
24342438 else :
24352439 single_file = path_or_paths
24362440
@@ -2554,7 +2558,16 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
25542558 """
25552559 # if use_pandas_metadata, we need to include index columns in the
25562560 # column selection, to be able to restore those in the pandas DataFrame
2557- metadata = self .schema .metadata
2561+ metadata = self .schema .metadata or {}
2562+
2563+ if use_pandas_metadata :
2564+ # if the dataset schema metadata itself doesn't have pandas
2565+ # then try to get this from common file (for backwards compat)
2566+ if b"pandas" not in metadata :
2567+ common_metadata = self ._get_common_pandas_metadata ()
2568+ if common_metadata :
2569+ metadata = common_metadata
2570+
25582571 if columns is not None and use_pandas_metadata :
25592572 if metadata and b'pandas' in metadata :
25602573 # RangeIndex can be represented as dict instead of column name
@@ -2581,6 +2594,24 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
25812594
25822595 return table
25832596
2597+ def _get_common_pandas_metadata (self ):
2598+
2599+ if not self ._base_dir :
2600+ return None
2601+
2602+ metadata = None
2603+ for name in ["_common_metadata" , "_metadata" ]:
2604+ metadata_path = os .path .join (str (self ._base_dir ), name )
2605+ finfo = self .filesystem .get_file_info (metadata_path )
2606+ if finfo .is_file :
2607+ pq_meta = read_metadata (
2608+ metadata_path , filesystem = self .filesystem )
2609+ metadata = pq_meta .metadata
2610+ if metadata and b'pandas' in metadata :
2611+ break
2612+
2613+ return metadata
2614+
25842615 def read_pandas (self , ** kwargs ):
25852616 """
25862617 Read dataset including pandas metadata, if any. Other arguments passed
0 commit comments