ARROW-15870: [Python] Start to raise deprecation warnings for use_legacy_dataset=True in parquet.read_table

jorisvandenbossche · jorisvandenbossche · commit 3bf061783f4e · 2022-03-15T17:59:47.000+01:00
Currently, users can still specify `use_legacy_dataset=True` explicitly to get the old implementation/behaviour. But if we want to remove that implementation at some point (ARROW-15868), we should start deprecating that option, to further nudge people to the new implementation. Closes #12584 from jorisvandenbossche/ARROW-15870 Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com> Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
@@ -1908,7 +1908,8 @@ def partitioning(self):
     pyarrow 1.0.0. Among other things, this allows to pass `filters`
     for all columns and not only the partition keys, enables
     different partitioning schemes, etc.
-    Set to True to use the legacy behaviour.
+    Set to True to use the legacy behaviour (this option is deprecated,
+    and the legacy implementation will be removed in a future version).
 ignore_prefixes : list, optional
     Files matching any of these prefixes will be ignored by the
     discovery process if use_legacy_dataset=False.
@@ -2006,6 +2007,12 @@ def read_table(source, columns=None, use_threads=True, metadata=None,
         return dataset.read(columns=columns, use_threads=use_threads,
                             use_pandas_metadata=use_pandas_metadata)
 
+    warnings.warn(
+        "Passing 'use_legacy_dataset=True' to get the legacy behaviour is "
+        "deprecated as of pyarrow 8.0.0, and the legacy implementation will "
+        "be removed in a future version.",
+        DeprecationWarning, stacklevel=2)
+
     if ignore_prefixes is not None:
         raise ValueError(
             "The 'ignore_prefixes' keyword is only supported when "
diff --git a/python/pyarrow/tests/parquet/__init__.py b/python/pyarrow/tests/parquet/__init__.py
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+# Marks all of the tests in this package and silences the legacy deprecation
+# warning; ignore these tests with pytest ... -m 'not parquet'
+pytestmark = [
+    pytest.mark.parquet,
+    pytest.mark.filterwarnings(
+        "ignore:Passing 'use_legacy_dataset=True':DeprecationWarning"
+    ),
+]
diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py
@@ -45,9 +45,6 @@
     pd = tm = None
 
 
-pytestmark = pytest.mark.parquet
-
-
 def test_parquet_invalid_version(tempdir):
     table = pa.table({'a': [1, 2, 3]})
     with pytest.raises(ValueError, match="Unsupported Parquet format version"):
@@ -593,7 +590,10 @@ def test_read_table_doesnt_warn(datadir, use_legacy_dataset):
         pq.read_table(datadir / 'v0.7.1.parquet',
                       use_legacy_dataset=use_legacy_dataset)
 
-    assert len(record) == 0
+    if use_legacy_dataset:
+        assert len(record) == 1
+    else:
+        assert len(record) == 0
 
 
 @pytest.mark.pandas
@@ -758,3 +758,15 @@ def test_permutation_of_column_order(tempdir):
                       names=['a', 'b'])
 
     assert table == table2
+
+
+def test_read_table_legacy_deprecated(tempdir):
+    # ARROW-15870: use_legacy_dataset=True must emit a DeprecationWarning
+    table = pa.table({'a': [1, 2, 3]})
+    path = tempdir / 'data.parquet'
+    pq.write_table(table, path)
+
+    with pytest.warns(
+        DeprecationWarning, match="Passing 'use_legacy_dataset=True'"
+    ):
+        pq.read_table(path, use_legacy_dataset=True)
diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py
@@ -35,7 +35,6 @@
 except ImportError:
     pd = tm = None
 
-pytestmark = pytest.mark.parquet
 
 # Tests for ARROW-11497
 _test_data_simple = [
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
@@ -44,9 +44,6 @@
     pd = tm = None
 
 
-pytestmark = pytest.mark.parquet
-
-
 # General roundtrip of data types
 # -----------------------------------------------------------------------------
 
diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py
@@ -46,8 +46,6 @@
 except ImportError:
     pd = tm = None
 
-pytestmark = pytest.mark.parquet
-
 
 @pytest.mark.pandas
 def test_parquet_piece_read(tempdir):
diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py
@@ -41,9 +41,6 @@
     pd = tm = None
 
 
-pytestmark = pytest.mark.parquet
-
-
 @pytest.mark.pandas
 @parametrize_legacy_dataset
 def test_pandas_parquet_datetime_tz(use_legacy_dataset):
diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
@@ -40,9 +40,6 @@
     pd = tm = None
 
 
-pytestmark = pytest.mark.parquet
-
-
 @pytest.mark.pandas
 def test_parquet_metadata_api():
     df = alltypes_sample(size=10000)
diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py
@@ -46,9 +46,6 @@
     pd = tm = None
 
 
-pytestmark = pytest.mark.parquet
-
-
 @pytest.mark.pandas
 def test_pandas_parquet_custom_metadata(tempdir):
     df = alltypes_sample(size=10000)
diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py
@@ -36,8 +36,6 @@
 except ImportError:
     pd = tm = None
 
-pytestmark = pytest.mark.parquet
-
 
 @pytest.mark.pandas
 def test_pass_separate_metadata():
diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py
@@ -36,8 +36,6 @@
 except ImportError:
     pd = tm = None
 
-pytestmark = pytest.mark.parquet
-
 
 @pytest.mark.pandas
 @parametrize_legacy_dataset