ArrowInvalid Traceback (most recent call last)
Input In [89], in <cell line: 7>()
4 s2 = pd.date_range("2022-01-01", "2022-02-02", periods=500)
6 pd.DataFrame({"a": s1}).to_parquet("1.pq") # Success!
----> 7 pd.DataFrame({"a": s2}).to_parquet("2.pq")
File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pandas/util/_decorators.py:207, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
205 else:
206 kwargs[new_arg_name] = new_arg_value
--> 207 return func(*args, **kwargs)
File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pandas/core/frame.py:2677, in DataFrame.to_parquet(self, path, engine, compression, index, partition_cols, storage_options, **kwargs)
2589 """
2590 Write a DataFrame to the binary parquet format.
2591
(...)
2673 >>> content = f.read()
2674 """
2675 from pandas.io.parquet import to_parquet
-> 2677 return to_parquet(
2678 self,
2679 path,
2680 engine,
2681 compression=compression,
2682 index=index,
2683 partition_cols=partition_cols,
2684 storage_options=storage_options,
2685 **kwargs,
2686 )
File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pandas/io/parquet.py:412, in to_parquet(df, path, engine, compression, index, storage_options, partition_cols, **kwargs)
408 impl = get_engine(engine)
410 path_or_buf: FilePathOrBuffer = io.BytesIO() if path is None else path
--> 412 impl.write(
413 df,
414 path_or_buf,
415 compression=compression,
416 index=index,
417 partition_cols=partition_cols,
418 storage_options=storage_options,
419 **kwargs,
420 )
422 if path is None:
423 assert isinstance(path_or_buf, io.BytesIO)
File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pandas/io/parquet.py:194, in PyArrowImpl.write(self, df, path, compression, index, storage_options, partition_cols, **kwargs)
185 self.api.parquet.write_to_dataset(
186 table,
187 path_or_handle,
(...)
190 **kwargs,
191 )
192 else:
193 # write to single output file
--> 194 self.api.parquet.write_table(
195 table, path_or_handle, compression=compression, **kwargs
196 )
197 finally:
198 if handles is not None:
File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pyarrow/parquet.py:2092, in write_table(table, where, row_group_size, version, use_dictionary, compression, write_statistics, use_deprecated_int96_timestamps, coerce_timestamps, allow_truncated_timestamps, data_page_size, flavor, filesystem, compression_level, use_byte_stream_split, column_encoding, data_page_version, use_compliant_nested_type, **kwargs)
2073 try:
2074 with ParquetWriter(
2075 where, table.schema,
2076 filesystem=filesystem,
(...)
2090 use_compliant_nested_type=use_compliant_nested_type,
2091 **kwargs) as writer:
-> 2092 writer.write_table(table, row_group_size=row_group_size)
2093 except Exception:
2094 if _is_path_like(where):
File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pyarrow/parquet.py:754, in ParquetWriter.write_table(self, table, row_group_size)
749 msg = ('Table schema does not match schema used to create file: '
750 '\ntable:\n{!s} vs. \nfile:\n{!s}'
751 .format(table.schema, self.schema))
752 raise ValueError(msg)
--> 754 self.writer.write_table(table, row_group_size=row_group_size)
File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pyarrow/_parquet.pyx:1506, in pyarrow._parquet.ParquetWriter.write_table()
File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pyarrow/error.pxi:99, in pyarrow.lib.check_status()
ArrowInvalid: Casting from timestamp[ns] to timestamp[us] would lose data: 1641000740681362725
Pandas version checks
I have checked that this issue has not already been reported.
I have confirmed this bug exists on the latest version of pandas.
I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
Issue Description
I'm not sure whether this is properly an issue with pandas or with pyarrow. It also may be user error. But I ran into this quite bizarre behavior when writing dataframes with `datetime64[ns]` series, where the outcome depended upon whether the series has a `freq` or not. With the first series, defined with a `freq`, writing to parquet happens without issue. But the second complains with an `ArrowInvalid` error:
Details
ArrowInvalid Traceback (most recent call last) Input In [89], in <cell line: 7>() 4 s2 = pd.date_range("2022-01-01", "2022-02-02", periods=500) 6 pd.DataFrame({"a": s1}).to_parquet("1.pq") # Success! ----> 7 pd.DataFrame({"a": s2}).to_parquet("2.pq") File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pandas/util/_decorators.py:207, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs) 205 else: 206 kwargs[new_arg_name] = new_arg_value --> 207 return func(*args, **kwargs) File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pandas/core/frame.py:2677, in DataFrame.to_parquet(self, path, engine, compression, index, partition_cols, storage_options, **kwargs) 2589 """ 2590 Write a DataFrame to the binary parquet format. 2591 (...) 2673 >>> content = f.read() 2674 """ 2675 from pandas.io.parquet import to_parquet -> 2677 return to_parquet( 2678 self, 2679 path, 2680 engine, 2681 compression=compression, 2682 index=index, 2683 partition_cols=partition_cols, 2684 storage_options=storage_options, 2685 **kwargs, 2686 ) File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pandas/io/parquet.py:412, in to_parquet(df, path, engine, compression, index, storage_options, partition_cols, **kwargs) 408 impl = get_engine(engine) 410 path_or_buf: FilePathOrBuffer = io.BytesIO() if path is None else path --> 412 impl.write( 413 df, 414 path_or_buf, 415 compression=compression, 416 index=index, 417 partition_cols=partition_cols, 418 storage_options=storage_options, 419 **kwargs, 420 ) 422 if path is None: 423 assert isinstance(path_or_buf, io.BytesIO) File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pandas/io/parquet.py:194, in PyArrowImpl.write(self, df, path, compression, index, storage_options, partition_cols, **kwargs) 185 self.api.parquet.write_to_dataset( 186 table, 187 path_or_handle, (...) 
190 **kwargs, 191 ) 192 else: 193 # write to single output file --> 194 self.api.parquet.write_table( 195 table, path_or_handle, compression=compression, **kwargs 196 ) 197 finally: 198 if handles is not None: File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pyarrow/parquet.py:2092, in write_table(table, where, row_group_size, version, use_dictionary, compression, write_statistics, use_deprecated_int96_timestamps, coerce_timestamps, allow_truncated_timestamps, data_page_size, flavor, filesystem, compression_level, use_byte_stream_split, column_encoding, data_page_version, use_compliant_nested_type, **kwargs) 2073 try: 2074 with ParquetWriter( 2075 where, table.schema, 2076 filesystem=filesystem, (...) 2090 use_compliant_nested_type=use_compliant_nested_type, 2091 **kwargs) as writer: -> 2092 writer.write_table(table, row_group_size=row_group_size) 2093 except Exception: 2094 if _is_path_like(where): File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pyarrow/parquet.py:754, in ParquetWriter.write_table(self, table, row_group_size) 749 msg = ('Table schema does not match schema used to create file: ' 750 '\ntable:\n{!s} vs. \nfile:\n{!s}' 751 .format(table.schema, self.schema)) 752 raise ValueError(msg) --> 754 self.writer.write_table(table, row_group_size=row_group_size) File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pyarrow/_parquet.pyx:1506, in pyarrow._parquet.ParquetWriter.write_table() File ~/miniconda3/envs/dask/lib/python3.8/site-packages/pyarrow/error.pxi:99, in pyarrow.lib.check_status() ArrowInvalid: Casting from timestamp[ns] to timestamp[us] would lose data: 1641000740681362725Now, I can fix this by passing
`allow_truncated_timestamps=True` and `coerce_timestamps="us"` down to the pyarrow engine, but I'm pretty baffled as to why it is able to implicitly handle the datetime index in the first case, but not in the second.
Expected Behavior
I expected both datetime series to be able to be written to parquet. But I would also be okay if both of the above required explicit coercion to `us` or similar. What is quite confusing to me is the inconsistency. Perhaps @mroeschke has some idea of what's going on here?
Installed Versions
Details
INSTALLED VERSIONS
commit : 224458e
python : 3.8.8.final.0
python-bits : 64
OS : Linux
OS-release : 5.15.0-46-generic
Version : #49~20.04.1-Ubuntu SMP Thu Aug 4 19:15:44 UTC 2022
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.5.0rc0
numpy : 1.21.5
pytz : 2021.3
dateutil : 2.8.2
setuptools : 65.3.0
pip : 21.3.1
Cython : 0.29.22
pytest : 6.2.2
hypothesis : None
sphinx : 4.4.0
blosc : 1.10.2
feather : None
xlsxwriter : None
lxml.etree : 4.9.1
html5lib : 1.1
pymysql : None
psycopg2 : None
jinja2 : 3.1.2
IPython : 8.4.0
pandas_datareader: None
bs4 : 4.11.1
bottleneck : 1.3.5
brotli :
fastparquet : 0.8.1
fsspec : 2022.7.1
gcsfs : 0.7.2
matplotlib : 3.3.4
numba : 0.53.1
numexpr : 2.8.0
odfpy : None
openpyxl : 3.0.9
pandas_gbq : None
pyarrow : 7.0.0
pyreadstat : None
pyxlsb : None
s3fs : 2021.07.0
scipy : 1.7.3
snappy :
sqlalchemy : 1.4.20
tables : 3.6.1
tabulate : 0.8.9
xarray : 0.21.1
xlrd : None
xlwt : None
zstandard : None
tzdata : 2021.5