Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
2cd9ab3
edit train/test_size default behavior
nelson-liu Sep 19, 2016
8e0e817
revert changes to cross_Validation
nelson-liu Sep 20, 2016
f0dddd9
fix improper merge resolution
nelson-liu Sep 20, 2016
2746367
edit default train/test_size behavior for other splitters
nelson-liu Sep 21, 2016
4aa0f77
add deprecation warnings to groupshufflesplit and train_test_split
nelson-liu Oct 4, 2016
2d1c51c
edit train/test_size default behavior
nelson-liu Sep 19, 2016
719662a
revert changes to cross_Validation
nelson-liu Sep 20, 2016
177c48d
correctly format docstrings and remove warnings of changed default va…
nelson-liu Dec 25, 2016
e0ca540
restored original behavior with added DeprecationWarnings
nelson-liu Dec 25, 2016
2d50779
add unit tests for deprecationwarnings
nelson-liu Dec 25, 2016
125844d
reset GroupShuffleSplit default test_size to 0.2
nelson-liu Dec 25, 2016
7abd7ad
remove extraneous test
nelson-liu Dec 25, 2016
18d5c1f
fix flake8 violations
nelson-liu Dec 25, 2016
71aabb0
fix indentation error overriding test size in groupsamplesplit
nelson-liu Dec 25, 2016
3707437
change DeprecationWarnings to FutureWarnings
nelson-liu Dec 25, 2016
da58d82
reword docstrings for test_size parameters
nelson-liu Dec 25, 2016
95ce853
fix flake8 error in line length
nelson-liu Dec 25, 2016
281bd53
remove extraneous newline
nelson-liu Dec 25, 2016
51e6397
edit indentation errors and clarify future test_size behavior
nelson-liu Dec 26, 2016
453ada5
ignore FutureWarnings in unrelated tests
nelson-liu Dec 26, 2016
7b3dd0f
fix flake8 error
nelson-liu Dec 27, 2016
beaf8d0
add more details about the defaults
nelson-liu Mar 7, 2017
21b4b7a
Merge branch 'master' into edit_train_test_split_api
nelson-liu Mar 7, 2017
5392f85
fix typo in GroupShuffleSplit stating default is 0.1
nelson-liu Mar 10, 2017
fd49cb9
Add what's new
jnothman Jun 14, 2017
a042aac
Merge branch 'master' into edit_train_test_split_api
jnothman Jun 14, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,11 @@ API changes summary
method ``check_decision_proba_consistency`` has been added in
**sklearn.utils.estimator_checks** to check their consistency.
:issue:`7578` by :user:`Shubham Bhardwaj <shubham0704>`

- In version 0.21, the default behavior of splitters that use the
    ``test_size`` and ``train_size`` parameters will change, such that
specifying ``train_size`` alone will cause ``test_size`` to be the
remainder. :issue:`7459` by :user:`Nelson Liu <nelson-liu>`.

- All tree based estimators now accept a ``min_impurity_decrease``
parameter in lieu of the ``min_impurity_split``, which is now deprecated.
Expand Down Expand Up @@ -506,7 +511,6 @@ API changes summary
- ``utils.stats.rankdata``
- ``neighbors.approximate.LSHForest``


.. _changes_0_18_1:

Version 0.18.1
Expand Down
112 changes: 77 additions & 35 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -1133,7 +1133,7 @@ def __init__(self, n_splits=5, n_repeats=10, random_state=None):
class BaseShuffleSplit(with_metaclass(ABCMeta)):
"""Base class for ShuffleSplit and StratifiedShuffleSplit"""

def __init__(self, n_splits=10, test_size=0.1, train_size=None,
def __init__(self, n_splits=10, test_size="default", train_size=None,
random_state=None):
_validate_shuffle_split_init(test_size, train_size)
self.n_splits = n_splits
Expand Down Expand Up @@ -1211,16 +1211,20 @@ class ShuffleSplit(BaseShuffleSplit):

Parameters
----------
n_splits : int (default 10)
n_splits : int, default 10
Number of re-shuffling & splitting iterations.

test_size : float, int, or None, default 0.1
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the test split. If
int, represents the absolute number of test samples. If None,
the value is automatically set to the complement of the train size.

train_size : float, int, or None (default is None)
test_size : float, int, None, default=0.1
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. By default (the parameter is
unspecified), the value is set to 0.1.
The default will change in version 0.21. It will remain 0.1 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.

train_size : float, int, or None, default=None
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the train split. If
int, represents the absolute number of train samples. If None,
Expand Down Expand Up @@ -1260,7 +1264,8 @@ class ShuffleSplit(BaseShuffleSplit):

def _iter_indices(self, X, y=None, groups=None):
n_samples = _num_samples(X)
n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
n_train, n_test = _validate_shuffle_split(n_samples,
self.test_size,
self.train_size)
rng = check_random_state(self.random_state)
for i in range(self.n_splits):
Expand Down Expand Up @@ -1299,13 +1304,16 @@ class GroupShuffleSplit(ShuffleSplit):
n_splits : int (default 5)
Number of re-shuffling & splitting iterations.

test_size : float (default 0.2), int, or None
If float, should be between 0.0 and 1.0 and represent the
proportion of the groups to include in the test split. If
int, represents the absolute number of test groups. If None,
the value is automatically set to the complement of the train size.
test_size : float, int, None, optional
If float, should be between 0.0 and 1.0 and represent the proportion
of the groups to include in the test split. If int, represents the
absolute number of test groups. If None, the value is set to the
complement of the train size. By default, the value is set to 0.2.
The default will change in version 0.21. It will remain 0.2 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.

train_size : float, int, or None (default is None)
train_size : float, int, or None, default is None
If float, should be between 0.0 and 1.0 and represent the
proportion of the groups to include in the train split. If
int, represents the absolute number of train groups. If None,
Expand All @@ -1319,8 +1327,16 @@ class GroupShuffleSplit(ShuffleSplit):

'''

def __init__(self, n_splits=5, test_size=0.2, train_size=None,
def __init__(self, n_splits=5, test_size="default", train_size=None,
random_state=None):
if test_size == "default":
if train_size is not None:
warnings.warn("From version 0.21, test_size will always "
"complement train_size unless both "
"are specified.",
FutureWarning)
test_size = 0.2

super(GroupShuffleSplit, self).__init__(
n_splits=n_splits,
test_size=test_size,
Expand Down Expand Up @@ -1428,16 +1444,19 @@ class StratifiedShuffleSplit(BaseShuffleSplit):

Parameters
----------
n_splits : int (default 10)
n_splits : int, default 10
Number of re-shuffling & splitting iterations.

test_size : float (default 0.1), int, or None
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the test split. If
int, represents the absolute number of test samples. If None,
the value is automatically set to the complement of the train size.
test_size : float, int, None, optional
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. By default, the value is set to 0.1.
The default will change in version 0.21. It will remain 0.1 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.

train_size : float, int, or None (default is None)
train_size : float, int, or None, default is None
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the train split. If
int, represents the absolute number of train samples. If None,
Expand Down Expand Up @@ -1468,7 +1487,7 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
TRAIN: [0 2] TEST: [3 1]
"""

def __init__(self, n_splits=10, test_size=0.1, train_size=None,
def __init__(self, n_splits=10, test_size="default", train_size=None,
random_state=None):
super(StratifiedShuffleSplit, self).__init__(
n_splits, test_size, train_size, random_state)
Expand Down Expand Up @@ -1563,6 +1582,14 @@ def _validate_shuffle_split_init(test_size, train_size):
NOTE This does not take into account the number of samples which is known
only at split
"""
if test_size == "default":
if train_size is not None:
warnings.warn("From version 0.21, test_size will always "
"complement train_size unless both "
"are specified.",
FutureWarning)
test_size = 0.1

if test_size is None and train_size is None:
raise ValueError('test_size and train_size can not both be None')

Expand Down Expand Up @@ -1597,16 +1624,21 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
Validation helper to check if the test/train sizes are meaningful wrt the
size of the data (n_samples)
"""
if (test_size is not None and np.asarray(test_size).dtype.kind == 'i' and
if (test_size is not None and
np.asarray(test_size).dtype.kind == 'i' and
test_size >= n_samples):
raise ValueError('test_size=%d should be smaller than the number of '
'samples %d' % (test_size, n_samples))

if (train_size is not None and np.asarray(train_size).dtype.kind == 'i' and
if (train_size is not None and
np.asarray(train_size).dtype.kind == 'i' and
train_size >= n_samples):
raise ValueError("train_size=%d should be smaller than the number of"
" samples %d" % (train_size, n_samples))

if test_size == "default":
test_size = 0.1

if np.asarray(test_size).dtype.kind == 'f':
n_test = ceil(test_size * n_samples)
elif np.asarray(test_size).dtype.kind == 'i':
Expand Down Expand Up @@ -1844,14 +1876,16 @@ def train_test_split(*arrays, **options):
Allowed inputs are lists, numpy arrays, scipy-sparse
matrices or pandas dataframes.

test_size : float, int, or None (default is None)
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the test split. If
int, represents the absolute number of test samples. If None,
the value is automatically set to the complement of the train size.
If train size is also None, test size is set to 0.25.
test_size : float, int, None, optional
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. By default, the value is set to 0.25.
The default will change in version 0.21. It will remain 0.25 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.

train_size : float, int, or None (default is None)
train_size : float, int, or None, default None
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the train split. If
int, represents the absolute number of train samples. If None,
Expand Down Expand Up @@ -1917,7 +1951,7 @@ def train_test_split(*arrays, **options):
n_arrays = len(arrays)
if n_arrays == 0:
raise ValueError("At least one array required as input")
test_size = options.pop('test_size', None)
test_size = options.pop('test_size', 'default')
train_size = options.pop('train_size', None)
random_state = options.pop('random_state', None)
stratify = options.pop('stratify', None)
Expand All @@ -1926,6 +1960,14 @@ def train_test_split(*arrays, **options):
if options:
raise TypeError("Invalid parameters passed: %s" % str(options))

if test_size == 'default':
test_size = None
if train_size is not None:
warnings.warn("From version 0.21, test_size will always "
"complement train_size unless both "
"are specified.",
FutureWarning)

if test_size is None and train_size is None:
test_size = 0.25

Expand Down
16 changes: 14 additions & 2 deletions sklearn/model_selection/tests/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import assert_warns
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.validation import _num_samples
Expand Down Expand Up @@ -163,8 +164,8 @@ def test_cross_validator_with_default_params():
skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)"
lolo_repr = "LeaveOneGroupOut()"
lopo_repr = "LeavePGroupsOut(n_groups=2)"
ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, "
"train_size=None)")
ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, "
"test_size='default',\n train_size=None)")
ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"

n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits,
Expand Down Expand Up @@ -527,6 +528,7 @@ def test_shuffle_split():
assert_array_equal(t3[1], t4[1])


@ignore_warnings
def test_stratified_shuffle_split_init():
X = np.arange(7)
y = np.asarray([0, 1, 1, 1, 2, 2, 2])
Expand Down Expand Up @@ -859,6 +861,7 @@ def test_leave_one_p_group_out_error_on_fewer_number_of_groups():
LeavePGroupsOut(n_groups=3).split(X, y, groups))


@ignore_warnings
def test_repeated_cv_value_errors():
# n_repeats is not integer or <= 0
for cv in (RepeatedKFold, RepeatedStratifiedKFold):
Expand Down Expand Up @@ -1070,6 +1073,7 @@ def train_test_split_list_input():
np.testing.assert_equal(y_test3, y_test2)


@ignore_warnings
def test_shufflesplit_errors():
# When the {test|train}_size is a float/invalid, error is raised at init
assert_raises(ValueError, ShuffleSplit, test_size=None, train_size=None)
Expand Down Expand Up @@ -1366,6 +1370,14 @@ def test_nested_cv():
fit_params={'groups': groups})


def test_train_test_default_warning():
    # Passing only train_size (leaving test_size at its default) must emit
    # a FutureWarning announcing the upcoming change in default behavior.
    for splitter in (ShuffleSplit, GroupShuffleSplit, StratifiedShuffleSplit):
        assert_warns(FutureWarning, splitter, train_size=0.75)

    # The functional helper has to warn in the same situation.
    assert_warns(FutureWarning, train_test_split, range(3), train_size=0.75)


def test_build_repr():
class MockSplitter:
def __init__(self, a, b=0, c=None):
Expand Down