-
Notifications
You must be signed in to change notification settings - Fork 4k
Closed
Description
See, e.g., https://github.com/ursacomputing/crossbow/actions/runs/8104554803/job/22151387751
___________________________ test_filters_equivalency ___________________________
tempdir = PosixPath('/tmp/pytest-of-root/pytest-0/test_filters_equivalency0')
@pytest.mark.pandas
def test_filters_equivalency(tempdir):
local = LocalFileSystem()
base_path = tempdir
integer_keys = [0, 1]
string_keys = ['a', 'b', 'c']
boolean_keys = [True, False]
partition_spec = [
['integer', integer_keys],
['string', string_keys],
['boolean', boolean_keys]
]
df = pd.DataFrame({
'integer': np.array(integer_keys, dtype='i4').repeat(15),
'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2),
'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5),
3),
}, columns=['integer', 'string', 'boolean'])
_generate_partition_directories(local, base_path, partition_spec, df)
# Old filters syntax:
# integer == 1 AND string != b AND boolean == True
dataset = pq.ParquetDataset(
base_path, filesystem=local,
filters=[('integer', '=', 1), ('string', '!=', 'b'),
('boolean', '==', 'True')],
)
table = dataset.read()
result_df = (table.to_pandas().reset_index(drop=True))
assert 0 not in result_df['integer'].values
assert 'b' not in result_df['string'].values
assert False not in result_df['boolean'].values
# filters in disjunctive normal form:
# (integer == 1 AND string != b AND boolean == True) OR
# (integer == 2 AND boolean == False)
# TODO(ARROW-3388): boolean columns are reconstructed as string
filters = [
[
('integer', '=', 1),
('string', '!=', 'b'),
('boolean', '==', 'True')
],
[('integer', '=', 0), ('boolean', '==', 'False')]
]
dataset = pq.ParquetDataset(
base_path, filesystem=local, filters=filters)
table = dataset.read()
result_df = table.to_pandas().reset_index(drop=True)
# Check that all rows in the DF fulfill the filter
df_filter_1 = (result_df['integer'] == 1) \
& (result_df['string'] != 'b') \
& (result_df['boolean'] == 'True')
df_filter_2 = (np.array(result_df['integer']) == 0) \
& (result_df['boolean'] == 'False')
> assert df_filter_1.sum() > 0
E assert np.int64(0) > 0
E + where np.int64(0) = <bound method Series.sum of Series([], dtype: bool)>()
E + where <bound method Series.sum of Series([], dtype: bool)> = Series([], dtype: bool).sum
opt/conda/envs/arrow/lib/python3.10/site-packages/pyarrow/tests/parquet/test_dataset.py:153: AssertionError
__________________________ test_filters_inclusive_set __________________________
tempdir = PosixPath('/tmp/pytest-of-root/pytest-0/test_filters_inclusive_set0')
@pytest.mark.pandas
def test_filters_inclusive_set(tempdir):
local = LocalFileSystem()
base_path = tempdir
integer_keys = [0, 1]
string_keys = ['a', 'b', 'c']
boolean_keys = [True, False]
partition_spec = [
['integer', integer_keys],
['string', string_keys],
['boolean', boolean_keys]
]
df = pd.DataFrame({
'integer': np.array(integer_keys, dtype='i4').repeat(15),
'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2),
'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5),
3),
}, columns=['integer', 'string', 'boolean'])
_generate_partition_directories(local, base_path, partition_spec, df)
dataset = pq.ParquetDataset(
base_path, filesystem=local,
filters=[('string', 'in', 'ab')],
)
table = dataset.read()
result_df = (table.to_pandas().reset_index(drop=True))
> assert 'a' in result_df['string'].values
E AssertionError: assert 'a' in [], Categories (3, object): ['a', 'b', 'c']
E + where [], Categories (3, object): ['a', 'b', 'c'] = Series([], Name: string, dtype: category\nCategories (3, object): ['a', 'b', 'c']).values
opt/conda/envs/arrow/lib/python3.10/site-packages/pyarrow/tests/parquet/test_dataset.py:328: AssertionError
From debugging the failure, it seems this is caused by a change in pandas: a filter operation now sometimes preserves a RangeIndex instead of returning an integer index (Int64Index). The conversion to Arrow differs depending on the index type: by default a RangeIndex is stored as metadata only, whereas an integer index becomes a column.