Skip to content

[Python][CI] Dataset partition filter tests failing with pandas nightly #40428

@jorisvandenbossche

Description

@jorisvandenbossche

See e.g. https://github.com/ursacomputing/crossbow/actions/runs/8104554803/job/22151387751 for the failing nightly CI run:

___________________________ test_filters_equivalency ___________________________

tempdir = PosixPath('/tmp/pytest-of-root/pytest-0/test_filters_equivalency0')

    @pytest.mark.pandas
    def test_filters_equivalency(tempdir):
        local = LocalFileSystem()
        base_path = tempdir
    
        integer_keys = [0, 1]
        string_keys = ['a', 'b', 'c']
        boolean_keys = [True, False]
        partition_spec = [
            ['integer', integer_keys],
            ['string', string_keys],
            ['boolean', boolean_keys]
        ]
    
        df = pd.DataFrame({
            'integer': np.array(integer_keys, dtype='i4').repeat(15),
            'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2),
            'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5),
                               3),
        }, columns=['integer', 'string', 'boolean'])
    
        _generate_partition_directories(local, base_path, partition_spec, df)
    
        # Old filters syntax:
        #  integer == 1 AND string != b AND boolean == True
        dataset = pq.ParquetDataset(
            base_path, filesystem=local,
            filters=[('integer', '=', 1), ('string', '!=', 'b'),
                     ('boolean', '==', 'True')],
        )
        table = dataset.read()
        result_df = (table.to_pandas().reset_index(drop=True))
    
        assert 0 not in result_df['integer'].values
        assert 'b' not in result_df['string'].values
        assert False not in result_df['boolean'].values
    
        # filters in disjunctive normal form:
        #  (integer == 1 AND string != b AND boolean == True) OR
        #  (integer == 2 AND boolean == False)
        # TODO(ARROW-3388): boolean columns are reconstructed as string
        filters = [
            [
                ('integer', '=', 1),
                ('string', '!=', 'b'),
                ('boolean', '==', 'True')
            ],
            [('integer', '=', 0), ('boolean', '==', 'False')]
        ]
        dataset = pq.ParquetDataset(
            base_path, filesystem=local, filters=filters)
        table = dataset.read()
        result_df = table.to_pandas().reset_index(drop=True)
    
        # Check that all rows in the DF fulfill the filter
        df_filter_1 = (result_df['integer'] == 1) \
            & (result_df['string'] != 'b') \
            & (result_df['boolean'] == 'True')
        df_filter_2 = (np.array(result_df['integer']) == 0) \
            & (result_df['boolean'] == 'False')
>       assert df_filter_1.sum() > 0
E       assert np.int64(0) > 0
E        +  where np.int64(0) = <bound method Series.sum of Series([], dtype: bool)>()
E        +    where <bound method Series.sum of Series([], dtype: bool)> = Series([], dtype: bool).sum

opt/conda/envs/arrow/lib/python3.10/site-packages/pyarrow/tests/parquet/test_dataset.py:153: AssertionError
__________________________ test_filters_inclusive_set __________________________

tempdir = PosixPath('/tmp/pytest-of-root/pytest-0/test_filters_inclusive_set0')

    @pytest.mark.pandas
    def test_filters_inclusive_set(tempdir):
        local = LocalFileSystem()
        base_path = tempdir
    
        integer_keys = [0, 1]
        string_keys = ['a', 'b', 'c']
        boolean_keys = [True, False]
        partition_spec = [
            ['integer', integer_keys],
            ['string', string_keys],
            ['boolean', boolean_keys]
        ]
    
        df = pd.DataFrame({
            'integer': np.array(integer_keys, dtype='i4').repeat(15),
            'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2),
            'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5),
                               3),
        }, columns=['integer', 'string', 'boolean'])
    
        _generate_partition_directories(local, base_path, partition_spec, df)
    
        dataset = pq.ParquetDataset(
            base_path, filesystem=local,
            filters=[('string', 'in', 'ab')],
        )
        table = dataset.read()
        result_df = (table.to_pandas().reset_index(drop=True))
    
>       assert 'a' in result_df['string'].values
E       AssertionError: assert 'a' in [], Categories (3, object): ['a', 'b', 'c']
E        +  where [], Categories (3, object): ['a', 'b', 'c'] = Series([], Name: string, dtype: category\nCategories (3, object): ['a', 'b', 'c']).values

opt/conda/envs/arrow/lib/python3.10/site-packages/pyarrow/tests/parquet/test_dataset.py:328: AssertionError

From debugging the failure, it seems this is caused by a change in pandas: a filter operation now sometimes preserves a RangeIndex instead of returning an integer (int64) Index. The conversion to Arrow differs depending on the index type: by default a RangeIndex is stored as metadata only, whereas an integer index is written as an actual column.

Metadata

Metadata

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions