@@ -32,7 +32,7 @@ from pyarrow.includes.libarrow_dataset cimport *
3232from pyarrow._acero cimport ExecNodeOptions
3333from pyarrow._compute cimport Expression, _bind
3434from pyarrow._compute import _forbid_instantiation
35- from pyarrow._fs cimport FileSystem, FileSelector
35+ from pyarrow._fs cimport FileSystem, FileSelector, FileInfo
3636from pyarrow._csv cimport (
3737 ConvertOptions, ParseOptions, ReadOptions, WriteOptions)
3838from pyarrow.util import _is_iterable, _is_path_like, _stringify_path
@@ -96,27 +96,33 @@ def _get_parquet_symbol(name):
9696 return _dataset_pq and getattr (_dataset_pq, name)
9797
9898
99- cdef CFileSource _make_file_source(object file , FileSystem filesystem = None ):
99+ cdef CFileSource _make_file_source(object file , FileSystem filesystem = None , object file_size = None ):
100100
101101 cdef:
102102 CFileSource c_source
103103 shared_ptr[CFileSystem] c_filesystem
104+ CFileInfo c_info
104105 c_string c_path
105106 shared_ptr[CRandomAccessFile] c_file
106107 shared_ptr[CBuffer] c_buffer
108+ int64_t c_size
107109
108110 if isinstance (file , Buffer):
109111 c_buffer = pyarrow_unwrap_buffer(file )
110112 c_source = CFileSource(move(c_buffer))
111-
112113 elif _is_path_like(file ):
113114 if filesystem is None :
114115 raise ValueError (" cannot construct a FileSource from "
115116 " a path without a FileSystem" )
116117 c_filesystem = filesystem.unwrap()
117118 c_path = tobytes(_stringify_path(file ))
118- c_source = CFileSource(move(c_path), move(c_filesystem))
119119
120+ if file_size is not None :
121+ c_size = file_size
122+ c_info = FileInfo(c_path, size = c_size).unwrap()
123+ c_source = CFileSource(move(c_info), move(c_filesystem))
124+ else :
125+ c_source = CFileSource(move(c_path), move(c_filesystem))
120126 elif hasattr (file , ' read' ):
121127 # Optimistically hope this is file-like
122128 c_file = get_native_file(file , False ).get_random_access_file()
@@ -1230,15 +1236,16 @@ cdef class FileFormat(_Weakrefable):
12301236 The schema inferred from the file
12311237 """
12321238 cdef:
1233- CFileSource c_source = _make_file_source(file , filesystem)
1239+ CFileSource c_source = _make_file_source(file , filesystem, file_size = None )
12341240 CResult[shared_ptr[CSchema]] c_result
12351241 with nogil:
12361242 c_result = self .format.Inspect(c_source)
12371243 c_schema = GetResultValue(c_result)
12381244 return pyarrow_wrap_schema(move(c_schema))
12391245
12401246 def make_fragment (self , file , filesystem = None ,
1241- Expression partition_expression = None ):
1247+ Expression partition_expression = None ,
1248+ *, file_size = None ):
12421249 """
12431250 Make a FileFragment from a given file.
12441251
@@ -1252,6 +1259,9 @@ cdef class FileFormat(_Weakrefable):
12521259 partition_expression : Expression, optional
12531260 An expression that is guaranteed true for all rows in the fragment. Allows
12541261 fragment to be potentially skipped while scanning with a filter.
1262+ file_size : int, optional
1263+ The size of the file in bytes. Can improve performance with high-latency filesystems
1264+ when file size needs to be known before reading.
12551265
12561266 Returns
12571267 -------
@@ -1260,8 +1270,7 @@ cdef class FileFormat(_Weakrefable):
12601270 """
12611271 if partition_expression is None :
12621272 partition_expression = _true
1263-
1264- c_source = _make_file_source(file , filesystem)
1273+ c_source = _make_file_source(file , filesystem, file_size)
12651274 c_fragment = < shared_ptr[CFragment]> GetResultValue(
12661275 self .format.MakeFragment(move(c_source),
12671276 partition_expression.unwrap(),
0 commit comments