From 2cd9ab3c6a9edd869d722aca1b3f2e6c135f4d31 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 19 Sep 2016 13:26:34 -0700 Subject: [PATCH 01/24] edit train/test_size default behavior --- sklearn/cross_validation.py | 41 ++++++++++++------ sklearn/model_selection/_split.py | 48 ++++++++++++++------- sklearn/model_selection/tests/test_split.py | 7 ++- sklearn/tests/test_cross_validation.py | 32 +++++++++----- 4 files changed, 85 insertions(+), 43 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index a4a1e3d65c7ca..1de6600ab4db6 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -239,8 +239,8 @@ def __repr__(self): ) def __len__(self): - return int(factorial(self.n) / factorial(self.n - self.p) - / factorial(self.p)) + return int(factorial(self.n) / factorial(self.n - self.p) / + factorial(self.p)) class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): @@ -767,7 +767,7 @@ def __len__(self): class BaseShuffleSplit(with_metaclass(ABCMeta)): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, + def __init__(self, n, n_iter=10, test_size=None, train_size=None, random_state=None): self.n = n self.n_iter = n_iter @@ -878,9 +878,8 @@ def __len__(self): def _validate_shuffle_split(n, test_size, train_size): if test_size is None and train_size is None: - raise ValueError( - 'test_size and train_size can not both be None') - + train_size = 0.9 + test_size = 0.1 if test_size is not None: if np.asarray(test_size).dtype.kind == 'f': if test_size >= 1.: @@ -914,21 +913,37 @@ def _validate_shuffle_split(n, test_size, train_size): else: raise ValueError("Invalid value for train_size: %r" % train_size) - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) + if test_size is None: + # only train_size set, so set test_size as + # n - n_train 
+ if np.asarray(train_size).dtype.kind == 'f': + n_train = floor(train_size * n) + elif np.asarray(train_size).dtype.kind == 'i': + n_train = float(train_size) + + # set n_test to be the complement of n_train + n_test = n - n_train + + elif train_size is None: + # only test_size was set, so set train_size as + # n - n_test + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n) + elif np.asarray(test_size).dtype.kind == 'i': + n_test = float(test_size) - if train_size is None: n_train = n - n_test else: + # both train_size and test_size set, so subsample if np.asarray(train_size).dtype.kind == 'f': n_train = floor(train_size * n) else: n_train = float(train_size) - if test_size is None: - n_test = n - n_train + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n) + else: + n_test = float(test_size) if n_train + n_test > n: raise ValueError('The sum of train_size and test_size = %d, ' diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index b2ed060e31717..cee670b097d6e 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -916,7 +916,7 @@ def get_n_splits(self, X, y, groups): class BaseShuffleSplit(with_metaclass(ABCMeta)): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - def __init__(self, n_splits=10, test_size=0.1, train_size=None, + def __init__(self, n_splits=10, test_size=None, train_size=None, random_state=None): _validate_shuffle_split_init(test_size, train_size) self.n_splits = n_splits @@ -1330,9 +1330,6 @@ def _validate_shuffle_split_init(test_size, train_size): NOTE This does not take into account the number of samples which is known only at split """ - if test_size is None and train_size is None: - raise ValueError('test_size and train_size can not both be None') - if test_size is not None: if np.asarray(test_size).dtype.kind == 'f': if test_size >= 1.: @@ -1364,7 +1361,11 @@ def _validate_shuffle_split(n_samples, test_size, 
train_size): Validation helper to check if the test/test sizes are meaningful wrt to the size of the data (n_samples) """ - if (test_size is not None and np.asarray(test_size).dtype.kind == 'i' and + if test_size is None and train_size is None: + train_size = 0.9 + test_size = 0.1 + + if (test_size is not None and np.asarray(test_size).dtype.kind == 'i'and test_size >= n_samples): raise ValueError('test_size=%d should be smaller than the number of ' 'samples %d' % (test_size, n_samples)) @@ -1374,20 +1375,37 @@ def _validate_shuffle_split(n_samples, test_size, train_size): raise ValueError("train_size=%d should be smaller than the number of" " samples %d" % (train_size, n_samples)) - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n_samples) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) + if test_size is None: + # only train_size set, so set test_size as + # n - n_train + if np.asarray(train_size).dtype.kind == 'f': + n_train = floor(train_size * n_samples) + elif np.asarray(train_size).dtype.kind == 'i': + n_train = float(train_size) + + # set n_test to be the complement of n_train + n_test = n_samples - n_train + + elif train_size is None: + # only test_size was set, so set train_size as + # n - n_test + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n_samples) + elif np.asarray(test_size).dtype.kind == 'i': + n_test = float(test_size) - if train_size is None: n_train = n_samples - n_test - elif np.asarray(train_size).dtype.kind == 'f': - n_train = floor(train_size * n_samples) else: - n_train = float(train_size) + # both train_size and test_size set, so subsample + if np.asarray(train_size).dtype.kind == 'f': + n_train = floor(train_size * n_samples) + else: + n_train = float(train_size) - if test_size is None: - n_test = n_samples - n_train + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n_samples) + else: + n_test = float(test_size) if n_train + n_test > 
n_samples: raise ValueError('The sum of train_size and test_size = %d, ' diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 601e9b259c537..46920cfdc6965 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -158,9 +158,9 @@ def test_cross_validator_with_default_params(): lpo_repr = "LeavePOut(p=2)" kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)" skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" - lolo_repr = "LeaveOneGroupOut()" - lopo_repr = "LeavePGroupsOut(n_groups=2)" - ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, " + lolo_repr = "LeaveOneLabelOut()" + lopo_repr = "LeavePLabelOut(n_labels=2)" + ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=None, " "train_size=None)") ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" @@ -922,7 +922,6 @@ def train_test_split_list_input(): def test_shufflesplit_errors(): # When the {test|train}_size is a float/invalid, error is raised at init - assert_raises(ValueError, ShuffleSplit, test_size=None, train_size=None) assert_raises(ValueError, ShuffleSplit, test_size=2.0) assert_raises(ValueError, ShuffleSplit, test_size=1.0) assert_raises(ValueError, ShuffleSplit, test_size=0.1, train_size=0.95) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 4d756bdaa0cf8..6c9b0582fe52c 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -24,10 +24,6 @@ from sklearn.utils.testing import ignore_warnings from sklearn.utils.mocking import CheckingClassifier, MockDataFrame -with warnings.catch_warnings(): - warnings.simplefilter('ignore') - from sklearn import cross_validation as cval - from sklearn.datasets import make_regression from sklearn.datasets import load_boston from sklearn.datasets import load_digits @@ -48,6 +44,10 @@ from 
sklearn.preprocessing import Imputer from sklearn.pipeline import Pipeline +with warnings.catch_warnings(): + warnings.simplefilter('ignore') + from sklearn import cross_validation as cval + class MockClassifier(object): """Dummy classifier to test the cross-validation""" @@ -493,10 +493,9 @@ def test_stratified_shuffle_split_iter(): assert_array_equal(np.unique(y[train]), np.unique(y[test])) # Checks if folds keep classes proportions p_train = (np.bincount(np.unique(y[train], - return_inverse=True)[1]) / + return_inverse=True)[1]) / float(len(y[train]))) - p_test = (np.bincount(np.unique(y[test], - return_inverse=True)[1]) / + p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1]) / float(len(y[test]))) assert_array_almost_equal(p_train, p_test, 1) assert_equal(len(train) + len(test), y.size) @@ -868,6 +867,7 @@ def train_test_split_pandas(): assert_true(isinstance(X_train, InputFeatureType)) assert_true(isinstance(X_test, InputFeatureType)) + def train_test_split_mock_pandas(): # X mock dataframe X_df = MockDataFrame(X) @@ -954,8 +954,8 @@ def test_permutation_score(): # test with custom scoring object def custom_score(y_true, y_pred): - return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) - / y_true.shape[0]) + return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) / + y_true.shape[0]) scorer = make_scorer(custom_score) score, _, pvalue = cval.permutation_test_score( @@ -1024,8 +1024,6 @@ def test_shufflesplit_errors(): assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10) assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3) assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None, - train_size=None) def test_shufflesplit_reproducible(): @@ -1035,6 +1033,18 @@ def test_shufflesplit_reproducible(): assert_array_equal(list(a for a, b in ss), list(a for a, b in ss)) +def test_shufflesplit_train_test_size(): + # check that same sequence 
of train-test is given + # when setting train_size to be the complement of test_size + # and vice-versa + ss_default = cval.ShuffleSplit(10, random_state=0) + ss_train = cval.ShuffleSplit(10, random_state=0, train_size=.9) + ss_test = cval.ShuffleSplit(10, random_state=0, test_size=.1) + assert_array_equal(list(a for a, b in ss_default), + list(a for a, b in ss_train), + list(a for a, b in ss_test)) + + def test_safe_split_with_precomputed_kernel(): clf = SVC() clfp = SVC(kernel="precomputed") From 8e0e817f18418e43eb0fb33e6e594b9393297db8 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 19 Sep 2016 21:22:44 -0700 Subject: [PATCH 02/24] revert changes to cross_Validation --- sklearn/cross_validation.py | 41 ++++++++------------------ sklearn/tests/test_cross_validation.py | 32 +++++++------------- 2 files changed, 24 insertions(+), 49 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 1de6600ab4db6..a4a1e3d65c7ca 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -239,8 +239,8 @@ def __repr__(self): ) def __len__(self): - return int(factorial(self.n) / factorial(self.n - self.p) / - factorial(self.p)) + return int(factorial(self.n) / factorial(self.n - self.p) + / factorial(self.p)) class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): @@ -767,7 +767,7 @@ def __len__(self): class BaseShuffleSplit(with_metaclass(ABCMeta)): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - def __init__(self, n, n_iter=10, test_size=None, train_size=None, + def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, random_state=None): self.n = n self.n_iter = n_iter @@ -878,8 +878,9 @@ def __len__(self): def _validate_shuffle_split(n, test_size, train_size): if test_size is None and train_size is None: - train_size = 0.9 - test_size = 0.1 + raise ValueError( + 'test_size and train_size can not both be None') + if test_size is not None: if np.asarray(test_size).dtype.kind == 'f': if 
test_size >= 1.: @@ -913,37 +914,21 @@ def _validate_shuffle_split(n, test_size, train_size): else: raise ValueError("Invalid value for train_size: %r" % train_size) - if test_size is None: - # only train_size set, so set test_size as - # n - n_train - if np.asarray(train_size).dtype.kind == 'f': - n_train = floor(train_size * n) - elif np.asarray(train_size).dtype.kind == 'i': - n_train = float(train_size) - - # set n_test to be the complement of n_train - n_test = n - n_train - - elif train_size is None: - # only test_size was set, so set train_size as - # n - n_test - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n) + elif np.asarray(test_size).dtype.kind == 'i': + n_test = float(test_size) + if train_size is None: n_train = n - n_test else: - # both train_size and test_size set, so subsample if np.asarray(train_size).dtype.kind == 'f': n_train = floor(train_size * n) else: n_train = float(train_size) - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n) - else: - n_test = float(test_size) + if test_size is None: + n_test = n - n_train if n_train + n_test > n: raise ValueError('The sum of train_size and test_size = %d, ' diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 6c9b0582fe52c..4d756bdaa0cf8 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -24,6 +24,10 @@ from sklearn.utils.testing import ignore_warnings from sklearn.utils.mocking import CheckingClassifier, MockDataFrame +with warnings.catch_warnings(): + warnings.simplefilter('ignore') + from sklearn import cross_validation as cval + from sklearn.datasets import make_regression from sklearn.datasets import load_boston from sklearn.datasets import load_digits @@ -44,10 +48,6 @@ from sklearn.preprocessing 
import Imputer from sklearn.pipeline import Pipeline -with warnings.catch_warnings(): - warnings.simplefilter('ignore') - from sklearn import cross_validation as cval - class MockClassifier(object): """Dummy classifier to test the cross-validation""" @@ -493,9 +493,10 @@ def test_stratified_shuffle_split_iter(): assert_array_equal(np.unique(y[train]), np.unique(y[test])) # Checks if folds keep classes proportions p_train = (np.bincount(np.unique(y[train], - return_inverse=True)[1]) / + return_inverse=True)[1]) / float(len(y[train]))) - p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1]) / + p_test = (np.bincount(np.unique(y[test], + return_inverse=True)[1]) / float(len(y[test]))) assert_array_almost_equal(p_train, p_test, 1) assert_equal(len(train) + len(test), y.size) @@ -867,7 +868,6 @@ def train_test_split_pandas(): assert_true(isinstance(X_train, InputFeatureType)) assert_true(isinstance(X_test, InputFeatureType)) - def train_test_split_mock_pandas(): # X mock dataframe X_df = MockDataFrame(X) @@ -954,8 +954,8 @@ def test_permutation_score(): # test with custom scoring object def custom_score(y_true, y_pred): - return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) / - y_true.shape[0]) + return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) + / y_true.shape[0]) scorer = make_scorer(custom_score) score, _, pvalue = cval.permutation_test_score( @@ -1024,6 +1024,8 @@ def test_shufflesplit_errors(): assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10) assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3) assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j) + assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None, + train_size=None) def test_shufflesplit_reproducible(): @@ -1033,18 +1035,6 @@ def test_shufflesplit_reproducible(): assert_array_equal(list(a for a, b in ss), list(a for a, b in ss)) -def test_shufflesplit_train_test_size(): - # check that same sequence of train-test is 
given - # when setting train_size to be the complement of test_size - # and vice-versa - ss_default = cval.ShuffleSplit(10, random_state=0) - ss_train = cval.ShuffleSplit(10, random_state=0, train_size=.9) - ss_test = cval.ShuffleSplit(10, random_state=0, test_size=.1) - assert_array_equal(list(a for a, b in ss_default), - list(a for a, b in ss_train), - list(a for a, b in ss_test)) - - def test_safe_split_with_precomputed_kernel(): clf = SVC() clfp = SVC(kernel="precomputed") From f0dddd9ed4f1aac40dbefa22406191324d9475e2 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 19 Sep 2016 21:27:25 -0700 Subject: [PATCH 03/24] fix improper merge resolution --- sklearn/model_selection/tests/test_split.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 46920cfdc6965..baa2d94b96385 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -158,8 +158,8 @@ def test_cross_validator_with_default_params(): lpo_repr = "LeavePOut(p=2)" kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)" skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" - lolo_repr = "LeaveOneLabelOut()" - lopo_repr = "LeavePLabelOut(n_labels=2)" + lolo_repr = "LeaveOneGroupOut()" + lopo_repr = "LeavePGroupsOut(n_groups=2)" ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=None, " "train_size=None)") ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" From 2746367a122d9baab91354f64f3055b549f16a41 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 20 Sep 2016 21:07:03 -0700 Subject: [PATCH 04/24] edit default train/test_size behavior for other splitters --- sklearn/model_selection/_split.py | 68 ++++++++++++++++++------------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 
cee670b097d6e..a913216ee7ee7 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -994,20 +994,23 @@ class ShuffleSplit(BaseShuffleSplit): Parameters ---------- - n_splits : int (default 10) + n_splits : int, default 10 Number of re-shuffling & splitting iterations. - test_size : float, int, or None, default 0.1 - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. + test_size : float, int, or None, default None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, and `train_size` is None, + the value is set to 0.1. If None and `train_size` is not None, the + value is automatically set to the complement of the train size. - train_size : float, int, or None (default is None) + train_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. + int, represents the absolute number of train samples. If None, and + `test_size` is None, the value is set to 0.9. If None and + `test_size` is not None, the value is automatically set to the + complement of the test size. random_state : int or RandomState Pseudo-random number generator state used for random sampling. @@ -1036,6 +1039,7 @@ class ShuffleSplit(BaseShuffleSplit): TRAIN: [3 1] TEST: [2] TRAIN: [2 1] TEST: [0] TRAIN: [0 2] TEST: [3] + """ def _iter_indices(self, X, y=None, groups=None): @@ -1079,23 +1083,26 @@ class GroupShuffleSplit(ShuffleSplit): n_splits : int (default 5) Number of re-shuffling & splitting iterations. 
- test_size : float (default 0.2), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the groups to include in the test split. If - int, represents the absolute number of test groups. If None, - the value is automatically set to the complement of the train size. + test_size : float, int, or None, default None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, and `train_size` is None, + the value is set to 0.1. If None and `train_size` is not None, the + value is automatically set to the complement of the train size. - train_size : float, int, or None (default is None) + train_size : float, int, or None, default is None If float, should be between 0.0 and 1.0 and represent the - proportion of the groups to include in the train split. If - int, represents the absolute number of train groups. If None, - the value is automatically set to the complement of the test size. + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, and + `test_size` is None, the value is set to 0.9. If None and + `test_size` is not None, the value is automatically set to the + complement of the test size. random_state : int or RandomState Pseudo-random number generator state used for random sampling. ''' - def __init__(self, n_splits=5, test_size=0.2, train_size=None, + def __init__(self, n_splits=5, test_size=None, train_size=None, random_state=None): super(GroupShuffleSplit, self).__init__( n_splits=n_splits, @@ -1203,20 +1210,23 @@ class StratifiedShuffleSplit(BaseShuffleSplit): Parameters ---------- - n_splits : int (default 10) + n_splits : int, default 10 Number of re-shuffling & splitting iterations. 
- test_size : float (default 0.1), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. + test_size : float, int, or None, default None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, and `train_size` is None, + the value is set to 0.1. If None and `train_size` is not None, the + value is automatically set to the complement of the train size. - train_size : float, int, or None (default is None) + train_size : float, int, or None, default is None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. + int, represents the absolute number of train samples. If None, and + `test_size` is None, the value is set to 0.9. If None and + `test_size` is not None, the value is automatically set to the + complement of the test size. random_state : int or RandomState Pseudo-random number generator state used for random sampling. 
@@ -1240,7 +1250,7 @@ class StratifiedShuffleSplit(BaseShuffleSplit): TRAIN: [0 2] TEST: [3 1] """ - def __init__(self, n_splits=10, test_size=0.1, train_size=None, + def __init__(self, n_splits=10, test_size=None, train_size=None, random_state=None): super(StratifiedShuffleSplit, self).__init__( n_splits, test_size, train_size, random_state) From 4aa0f77c6dad28283c9d532d7705ec8702f11d6e Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 3 Oct 2016 20:34:29 -0700 Subject: [PATCH 05/24] add deprecation warnings to groupshufflesplit and train_test_split --- sklearn/model_selection/_split.py | 34 +++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index a913216ee7ee7..c182acfe6c176 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1087,14 +1087,14 @@ class GroupShuffleSplit(ShuffleSplit): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, and `train_size` is None, - the value is set to 0.1. If None and `train_size` is not None, the + the value is set to 0.2. If None and `train_size` is not None, the value is automatically set to the complement of the train size. train_size : float, int, or None, default is None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, and - `test_size` is None, the value is set to 0.9. If None and + `test_size` is None, the value is set to 0.8. If None and `test_size` is not None, the value is automatically set to the complement of the test size. 
@@ -1104,6 +1104,13 @@ class GroupShuffleSplit(ShuffleSplit): def __init__(self, n_splits=5, test_size=None, train_size=None, random_state=None): + if test_size is None and train_size is None: + warnings.warn("The default value of the test_size parameter" + "will change from 0.1 to 0.2 in version 0.21.", + DeprecationWarning) + test_size = 0.2 + train_size = 0.8 + super(GroupShuffleSplit, self).__init__( n_splits=n_splits, test_size=test_size, @@ -1639,18 +1646,20 @@ def train_test_split(*arrays, **options): Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes. - test_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - If train size is also None, test size is set to 0.25. + test_size : float, int, or None, default None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, and `train_size` is None, + the value is set to 0.25. If None and `train_size` is not None, the + value is automatically set to the complement of the train size. - train_size : float, int, or None (default is None) + train_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. + int, represents the absolute number of train samples. If None, and + `test_size` is None, the value is set to 0.75. If None and + `test_size` is not None, the value is automatically set to the + complement of the test size. 
random_state : int or RandomState Pseudo-random number generator state used for random sampling. @@ -1711,6 +1720,9 @@ def train_test_split(*arrays, **options): raise TypeError("Invalid parameters passed: %s" % str(options)) if test_size is None and train_size is None: + warnings.warn("The default value of the test_size parameter" + "will change from 0.25 to 0.1 in version 0.21.", + DeprecationWarning) test_size = 0.25 arrays = indexable(*arrays) From 2d1c51cc8f9150a11a5bfa6085e1b7b871c0b9b5 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 19 Sep 2016 13:26:34 -0700 Subject: [PATCH 06/24] edit train/test_size default behavior --- sklearn/cross_validation.py | 41 ++++++++++++++------- sklearn/model_selection/tests/test_split.py | 12 ++++++ sklearn/tests/test_cross_validation.py | 32 ++++++++++------ 3 files changed, 61 insertions(+), 24 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index a4a1e3d65c7ca..1de6600ab4db6 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -239,8 +239,8 @@ def __repr__(self): ) def __len__(self): - return int(factorial(self.n) / factorial(self.n - self.p) - / factorial(self.p)) + return int(factorial(self.n) / factorial(self.n - self.p) / + factorial(self.p)) class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): @@ -767,7 +767,7 @@ def __len__(self): class BaseShuffleSplit(with_metaclass(ABCMeta)): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, + def __init__(self, n, n_iter=10, test_size=None, train_size=None, random_state=None): self.n = n self.n_iter = n_iter @@ -878,9 +878,8 @@ def __len__(self): def _validate_shuffle_split(n, test_size, train_size): if test_size is None and train_size is None: - raise ValueError( - 'test_size and train_size can not both be None') - + train_size = 0.9 + test_size = 0.1 if test_size is not None: if np.asarray(test_size).dtype.kind == 'f': if 
test_size >= 1.: @@ -914,21 +913,37 @@ def _validate_shuffle_split(n, test_size, train_size): else: raise ValueError("Invalid value for train_size: %r" % train_size) - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) + if test_size is None: + # only train_size set, so set test_size as + # n - n_train + if np.asarray(train_size).dtype.kind == 'f': + n_train = floor(train_size * n) + elif np.asarray(train_size).dtype.kind == 'i': + n_train = float(train_size) + + # set n_test to be the complement of n_train + n_test = n - n_train + + elif train_size is None: + # only test_size was set, so set train_size as + # n - n_test + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n) + elif np.asarray(test_size).dtype.kind == 'i': + n_test = float(test_size) - if train_size is None: n_train = n - n_test else: + # both train_size and test_size set, so subsample if np.asarray(train_size).dtype.kind == 'f': n_train = floor(train_size * n) else: n_train = float(train_size) - if test_size is None: - n_test = n - n_train + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n) + else: + n_test = float(test_size) if n_train + n_test > n: raise ValueError('The sum of train_size and test_size = %d, ' diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index baa2d94b96385..48740671b73bb 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -943,6 +943,18 @@ def test_shufflesplit_reproducible(): list(a for a, b in ss.split(X))) +def test_shufflesplit_train_test_size(): + # check that same sequence of train-test is given + # when setting train_size to be the complement of test_size + # and vice-versa + ss_default = ShuffleSplit(random_state=0) + ss_train = ShuffleSplit(random_state=0, train_size=.9) + ss_test = ShuffleSplit(random_state=0, 
test_size=.1) + assert_array_equal(list(a for a, b in ss_default.split(X)), + list(a for a, b in ss_train.split(X)), + list(a for a, b in ss_test.split(X))) + + def test_stratifiedshufflesplit_list_input(): # Check that when y is a list / list of string labels, it works. sss = StratifiedShuffleSplit(test_size=2, random_state=42) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 4d756bdaa0cf8..6c9b0582fe52c 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -24,10 +24,6 @@ from sklearn.utils.testing import ignore_warnings from sklearn.utils.mocking import CheckingClassifier, MockDataFrame -with warnings.catch_warnings(): - warnings.simplefilter('ignore') - from sklearn import cross_validation as cval - from sklearn.datasets import make_regression from sklearn.datasets import load_boston from sklearn.datasets import load_digits @@ -48,6 +44,10 @@ from sklearn.preprocessing import Imputer from sklearn.pipeline import Pipeline +with warnings.catch_warnings(): + warnings.simplefilter('ignore') + from sklearn import cross_validation as cval + class MockClassifier(object): """Dummy classifier to test the cross-validation""" @@ -493,10 +493,9 @@ def test_stratified_shuffle_split_iter(): assert_array_equal(np.unique(y[train]), np.unique(y[test])) # Checks if folds keep classes proportions p_train = (np.bincount(np.unique(y[train], - return_inverse=True)[1]) / + return_inverse=True)[1]) / float(len(y[train]))) - p_test = (np.bincount(np.unique(y[test], - return_inverse=True)[1]) / + p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1]) / float(len(y[test]))) assert_array_almost_equal(p_train, p_test, 1) assert_equal(len(train) + len(test), y.size) @@ -868,6 +867,7 @@ def train_test_split_pandas(): assert_true(isinstance(X_train, InputFeatureType)) assert_true(isinstance(X_test, InputFeatureType)) + def train_test_split_mock_pandas(): # X mock dataframe X_df = 
MockDataFrame(X) @@ -954,8 +954,8 @@ def test_permutation_score(): # test with custom scoring object def custom_score(y_true, y_pred): - return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) - / y_true.shape[0]) + return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) / + y_true.shape[0]) scorer = make_scorer(custom_score) score, _, pvalue = cval.permutation_test_score( @@ -1024,8 +1024,6 @@ def test_shufflesplit_errors(): assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10) assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3) assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None, - train_size=None) def test_shufflesplit_reproducible(): @@ -1035,6 +1033,18 @@ def test_shufflesplit_reproducible(): assert_array_equal(list(a for a, b in ss), list(a for a, b in ss)) +def test_shufflesplit_train_test_size(): + # check that same sequence of train-test is given + # when setting train_size to be the complement of test_size + # and vice-versa + ss_default = cval.ShuffleSplit(10, random_state=0) + ss_train = cval.ShuffleSplit(10, random_state=0, train_size=.9) + ss_test = cval.ShuffleSplit(10, random_state=0, test_size=.1) + assert_array_equal(list(a for a, b in ss_default), + list(a for a, b in ss_train), + list(a for a, b in ss_test)) + + def test_safe_split_with_precomputed_kernel(): clf = SVC() clfp = SVC(kernel="precomputed") From 719662a2ad6f213ada795f8c6873d408ba61ade4 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 19 Sep 2016 21:22:44 -0700 Subject: [PATCH 07/24] revert changes to cross_Validation --- sklearn/cross_validation.py | 41 ++++++++------------------ sklearn/tests/test_cross_validation.py | 32 +++++++------------- 2 files changed, 24 insertions(+), 49 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 1de6600ab4db6..a4a1e3d65c7ca 100644 --- a/sklearn/cross_validation.py +++ 
b/sklearn/cross_validation.py @@ -239,8 +239,8 @@ def __repr__(self): ) def __len__(self): - return int(factorial(self.n) / factorial(self.n - self.p) / - factorial(self.p)) + return int(factorial(self.n) / factorial(self.n - self.p) + / factorial(self.p)) class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): @@ -767,7 +767,7 @@ def __len__(self): class BaseShuffleSplit(with_metaclass(ABCMeta)): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - def __init__(self, n, n_iter=10, test_size=None, train_size=None, + def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, random_state=None): self.n = n self.n_iter = n_iter @@ -878,8 +878,9 @@ def __len__(self): def _validate_shuffle_split(n, test_size, train_size): if test_size is None and train_size is None: - train_size = 0.9 - test_size = 0.1 + raise ValueError( + 'test_size and train_size can not both be None') + if test_size is not None: if np.asarray(test_size).dtype.kind == 'f': if test_size >= 1.: @@ -913,37 +914,21 @@ def _validate_shuffle_split(n, test_size, train_size): else: raise ValueError("Invalid value for train_size: %r" % train_size) - if test_size is None: - # only train_size set, so set test_size as - # n - n_train - if np.asarray(train_size).dtype.kind == 'f': - n_train = floor(train_size * n) - elif np.asarray(train_size).dtype.kind == 'i': - n_train = float(train_size) - - # set n_test to be the complement of n_train - n_test = n - n_train - - elif train_size is None: - # only test_size was set, so set train_size as - # n - n_test - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n) + elif np.asarray(test_size).dtype.kind == 'i': + n_test = float(test_size) + if train_size is None: n_train = n - n_test else: - # both train_size and test_size set, so subsample if 
np.asarray(train_size).dtype.kind == 'f': n_train = floor(train_size * n) else: n_train = float(train_size) - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n) - else: - n_test = float(test_size) + if test_size is None: + n_test = n - n_train if n_train + n_test > n: raise ValueError('The sum of train_size and test_size = %d, ' diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 6c9b0582fe52c..4d756bdaa0cf8 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -24,6 +24,10 @@ from sklearn.utils.testing import ignore_warnings from sklearn.utils.mocking import CheckingClassifier, MockDataFrame +with warnings.catch_warnings(): + warnings.simplefilter('ignore') + from sklearn import cross_validation as cval + from sklearn.datasets import make_regression from sklearn.datasets import load_boston from sklearn.datasets import load_digits @@ -44,10 +48,6 @@ from sklearn.preprocessing import Imputer from sklearn.pipeline import Pipeline -with warnings.catch_warnings(): - warnings.simplefilter('ignore') - from sklearn import cross_validation as cval - class MockClassifier(object): """Dummy classifier to test the cross-validation""" @@ -493,9 +493,10 @@ def test_stratified_shuffle_split_iter(): assert_array_equal(np.unique(y[train]), np.unique(y[test])) # Checks if folds keep classes proportions p_train = (np.bincount(np.unique(y[train], - return_inverse=True)[1]) / + return_inverse=True)[1]) / float(len(y[train]))) - p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1]) / + p_test = (np.bincount(np.unique(y[test], + return_inverse=True)[1]) / float(len(y[test]))) assert_array_almost_equal(p_train, p_test, 1) assert_equal(len(train) + len(test), y.size) @@ -867,7 +868,6 @@ def train_test_split_pandas(): assert_true(isinstance(X_train, InputFeatureType)) assert_true(isinstance(X_test, InputFeatureType)) - def train_test_split_mock_pandas(): # X mock 
dataframe X_df = MockDataFrame(X) @@ -954,8 +954,8 @@ def test_permutation_score(): # test with custom scoring object def custom_score(y_true, y_pred): - return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) / - y_true.shape[0]) + return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) + / y_true.shape[0]) scorer = make_scorer(custom_score) score, _, pvalue = cval.permutation_test_score( @@ -1024,6 +1024,8 @@ def test_shufflesplit_errors(): assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10) assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3) assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j) + assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None, + train_size=None) def test_shufflesplit_reproducible(): @@ -1033,18 +1035,6 @@ def test_shufflesplit_reproducible(): assert_array_equal(list(a for a, b in ss), list(a for a, b in ss)) -def test_shufflesplit_train_test_size(): - # check that same sequence of train-test is given - # when setting train_size to be the complement of test_size - # and vice-versa - ss_default = cval.ShuffleSplit(10, random_state=0) - ss_train = cval.ShuffleSplit(10, random_state=0, train_size=.9) - ss_test = cval.ShuffleSplit(10, random_state=0, test_size=.1) - assert_array_equal(list(a for a, b in ss_default), - list(a for a, b in ss_train), - list(a for a, b in ss_test)) - - def test_safe_split_with_precomputed_kernel(): clf = SVC() clfp = SVC(kernel="precomputed") From 177c48dccbea021a2ec87d97ceee33555d2f72cf Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 24 Dec 2016 14:28:47 -1000 Subject: [PATCH 08/24] correctly format docstrings and remove warnings of changed default values --- sklearn/model_selection/_split.py | 55 ++++++++++++++----------------- 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index c182acfe6c176..b139b2c9e0b65 100644 --- a/sklearn/model_selection/_split.py 
+++ b/sklearn/model_selection/_split.py @@ -1000,17 +1000,17 @@ class ShuffleSplit(BaseShuffleSplit): test_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the - absolute number of test samples. If None, and `train_size` is None, - the value is set to 0.1. If None and `train_size` is not None, the - value is automatically set to the complement of the train size. + absolute number of test samples. If None, and ``train_size`` is None, + the value is set to 0.1. If None and ``train_size`` is not None, the + value is automatically set to the complement of ``train_size``. train_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, and - `test_size` is None, the value is set to 0.9. If None and - `test_size` is not None, the value is automatically set to the - complement of the test size. + ``test_size`` is None, the value is set to 0.9. If None and + ``test_size`` is not None, the value is automatically set to the + complement of ``test_size``. random_state : int or RandomState Pseudo-random number generator state used for random sampling. @@ -1039,7 +1039,6 @@ class ShuffleSplit(BaseShuffleSplit): TRAIN: [3 1] TEST: [2] TRAIN: [2 1] TEST: [0] TRAIN: [0 2] TEST: [3] - """ def _iter_indices(self, X, y=None, groups=None): @@ -1086,17 +1085,17 @@ class GroupShuffleSplit(ShuffleSplit): test_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the - absolute number of test samples. If None, and `train_size` is None, - the value is set to 0.2. If None and `train_size` is not None, the - value is automatically set to the complement of the train size. 
+ absolute number of test samples. If None, and ``train_size`` is None, + the value is set to 0.2. If None and ``train_size`` is not None, the + value is automatically set to the complement of ``train_size``. train_size : float, int, or None, default is None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, and - `test_size` is None, the value is set to 0.8. If None and - `test_size` is not None, the value is automatically set to the - complement of the test size. + ``test_size`` is None, the value is set to 0.8. If None and + ``test_size`` is not None, the value is automatically set to the + complement of ``test_size``. random_state : int or RandomState Pseudo-random number generator state used for random sampling. @@ -1105,9 +1104,6 @@ class GroupShuffleSplit(ShuffleSplit): def __init__(self, n_splits=5, test_size=None, train_size=None, random_state=None): if test_size is None and train_size is None: - warnings.warn("The default value of the test_size parameter" - "will change from 0.1 to 0.2 in version 0.21.", - DeprecationWarning) test_size = 0.2 train_size = 0.8 @@ -1223,17 +1219,17 @@ class StratifiedShuffleSplit(BaseShuffleSplit): test_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the - absolute number of test samples. If None, and `train_size` is None, - the value is set to 0.1. If None and `train_size` is not None, the - value is automatically set to the complement of the train size. + absolute number of test samples. If None, and ``train_size`` is None, + the value is set to 0.1. If None and ``train_size`` is not None, the + value is automatically set to the complement of ``train_size``. 
train_size : float, int, or None, default is None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, and - `test_size` is None, the value is set to 0.9. If None and - `test_size` is not None, the value is automatically set to the - complement of the test size. + ``test_size`` is None, the value is set to 0.9. If None and + ``test_size`` is not None, the value is automatically set to the + complement of ``test_size``. random_state : int or RandomState Pseudo-random number generator state used for random sampling. @@ -1649,17 +1645,17 @@ def train_test_split(*arrays, **options): test_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the - absolute number of test samples. If None, and `train_size` is None, - the value is set to 0.25. If None and `train_size` is not None, the - value is automatically set to the complement of the train size. + absolute number of test samples. If None, and ``train_size`` is None, + the value is set to 0.25. If None and ``train_size`` is not None, the + value is automatically set to the complement of ``train_size``. train_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, and - `test_size` is None, the value is set to 0.75. If None and - `test_size` is not None, the value is automatically set to the - complement of the test size. + ``test_size`` is None, the value is set to 0.75. If None and + ``test_size`` is not None, the value is automatically set to the + complement of ``test_size``. random_state : int or RandomState Pseudo-random number generator state used for random sampling. 
@@ -1720,9 +1716,6 @@ def train_test_split(*arrays, **options): raise TypeError("Invalid parameters passed: %s" % str(options)) if test_size is None and train_size is None: - warnings.warn("The default value of the test_size parameter" - "will change from 0.25 to 0.1 in version 0.21.", - DeprecationWarning) test_size = 0.25 arrays = indexable(*arrays) From e0ca540aeb32b548aa01a13896d76b4862761f34 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 24 Dec 2016 16:07:00 -1000 Subject: [PATCH 09/24] restored original behavior with added DeprecationWarnings --- sklearn/model_selection/_split.py | 154 ++++++++++++++---------------- 1 file changed, 73 insertions(+), 81 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index b139b2c9e0b65..8f537b7145b4d 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -916,7 +916,7 @@ def get_n_splits(self, X, y, groups): class BaseShuffleSplit(with_metaclass(ABCMeta)): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - def __init__(self, n_splits=10, test_size=None, train_size=None, + def __init__(self, n_splits=10, test_size="default", train_size=None, random_state=None): _validate_shuffle_split_init(test_size, train_size) self.n_splits = n_splits @@ -997,20 +997,17 @@ class ShuffleSplit(BaseShuffleSplit): n_splits : int, default 10 Number of re-shuffling & splitting iterations. - test_size : float, int, or None, default None + test_size : float, int, None, or 'default', default 'default' If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the - absolute number of test samples. If None, and ``train_size`` is None, - the value is set to 0.1. If None and ``train_size`` is not None, the - value is automatically set to the complement of ``train_size``. + absolute number of test samples. If None, the value is set to the + complement of the train size. 
If 'default', the value is set to 0.1. train_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, and - ``test_size`` is None, the value is set to 0.9. If None and - ``test_size`` is not None, the value is automatically set to the - complement of ``test_size``. + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. random_state : int or RandomState Pseudo-random number generator state used for random sampling. @@ -1043,7 +1040,8 @@ class ShuffleSplit(BaseShuffleSplit): def _iter_indices(self, X, y=None, groups=None): n_samples = _num_samples(X) - n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, + n_train, n_test = _validate_shuffle_split(n_samples, + self.test_size, self.train_size) rng = check_random_state(self.random_state) for i in range(self.n_splits): @@ -1082,31 +1080,25 @@ class GroupShuffleSplit(ShuffleSplit): n_splits : int (default 5) Number of re-shuffling & splitting iterations. - test_size : float, int, or None, default None - If float, should be between 0.0 and 1.0 and represent the proportion - of the dataset to include in the test split. If int, represents the - absolute number of test samples. If None, and ``train_size`` is None, - the value is set to 0.2. If None and ``train_size`` is not None, the - value is automatically set to the complement of ``train_size``. + test_size : float, int, None, or 'default', default 'default' + If float, should be between 0.0 and 1.0 and represent the + proportion of the groups to include in the test split. If + int, represents the absolute number of test groups. If None, + the value is automatically set to the complement of the train + size. If 'default', the value is set to 0.1. 
train_size : float, int, or None, default is None If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, and - ``test_size`` is None, the value is set to 0.8. If None and - ``test_size`` is not None, the value is automatically set to the - complement of ``test_size``. + proportion of the groups to include in the train split. If + int, represents the absolute number of train groups. If None, + the value is automatically set to the complement of the test size. random_state : int or RandomState Pseudo-random number generator state used for random sampling. ''' - def __init__(self, n_splits=5, test_size=None, train_size=None, + def __init__(self, n_splits=5, test_size="default", train_size=None, random_state=None): - if test_size is None and train_size is None: - test_size = 0.2 - train_size = 0.8 - super(GroupShuffleSplit, self).__init__( n_splits=n_splits, test_size=test_size, @@ -1216,20 +1208,18 @@ class StratifiedShuffleSplit(BaseShuffleSplit): n_splits : int, default 10 Number of re-shuffling & splitting iterations. - test_size : float, int, or None, default None - If float, should be between 0.0 and 1.0 and represent the proportion - of the dataset to include in the test split. If int, represents the - absolute number of test samples. If None, and ``train_size`` is None, - the value is set to 0.1. If None and ``train_size`` is not None, the - value is automatically set to the complement of ``train_size``. + test_size : float, int, None, or 'default', default 'default' + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If + int, represents the absolute number of test samples. If None, + the value is automatically set to the complement of the train size. + If 'default', the value is set to 0.1. 
train_size : float, int, or None, default is None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, and - ``test_size`` is None, the value is set to 0.9. If None and - ``test_size`` is not None, the value is automatically set to the - complement of ``test_size``. + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. random_state : int or RandomState Pseudo-random number generator state used for random sampling. @@ -1253,7 +1243,7 @@ class StratifiedShuffleSplit(BaseShuffleSplit): TRAIN: [0 2] TEST: [3 1] """ - def __init__(self, n_splits=10, test_size=None, train_size=None, + def __init__(self, n_splits=10, test_size="default", train_size=None, random_state=None): super(StratifiedShuffleSplit, self).__init__( n_splits, test_size, train_size, random_state) @@ -1343,13 +1333,25 @@ def _validate_shuffle_split_init(test_size, train_size): NOTE This does not take into account the number of samples which is known only at split """ + if test_size == "default": + if train_size is not None: + warnings.warn("test_size will always complement train_size " + "unless both are specified or both are unspecified " + "in version 0.21.", + DeprecationWarning) + test_size = 0.1 + + if test_size is None and train_size is None: + raise ValueError('test_size and train_size can not both be None') + if test_size is not None: if np.asarray(test_size).dtype.kind == 'f': if test_size >= 1.: raise ValueError( 'test_size=%f should be smaller ' 'than 1.0 or be an integer' % test_size) - elif np.asarray(test_size).dtype.kind != 'i': + elif (np.asarray(test_size).dtype.kind != 'i' and + test_size != "default"): # int values are checked during split based on the input raise ValueError("Invalid value for test_size: %r" % test_size) @@ -1374,51 +1376,35 @@ def 
_validate_shuffle_split(n_samples, test_size, train_size): Validation helper to check if the test/test sizes are meaningful wrt to the size of the data (n_samples) """ - if test_size is None and train_size is None: - train_size = 0.9 - test_size = 0.1 - - if (test_size is not None and np.asarray(test_size).dtype.kind == 'i'and + if (test_size is not None and + np.asarray(test_size).dtype.kind == 'i' and test_size >= n_samples): raise ValueError('test_size=%d should be smaller than the number of ' 'samples %d' % (test_size, n_samples)) - if (train_size is not None and np.asarray(train_size).dtype.kind == 'i' and + if (train_size is not None and + np.asarray(train_size).dtype.kind == 'i' and train_size >= n_samples): raise ValueError("train_size=%d should be smaller than the number of" " samples %d" % (train_size, n_samples)) - if test_size is None: - # only train_size set, so set test_size as - # n - n_train - if np.asarray(train_size).dtype.kind == 'f': - n_train = floor(train_size * n_samples) - elif np.asarray(train_size).dtype.kind == 'i': - n_train = float(train_size) - - # set n_test to be the complement of n_train - n_test = n_samples - n_train + if test_size == "default": + test_size = 0.1 - elif train_size is None: - # only test_size was set, so set train_size as - # n - n_test - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n_samples) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n_samples) + elif np.asarray(test_size).dtype.kind == 'i': + n_test = float(test_size) + if train_size is None: n_train = n_samples - n_test + elif np.asarray(train_size).dtype.kind == 'f': + n_train = floor(train_size * n_samples) else: - # both train_size and test_size set, so subsample - if np.asarray(train_size).dtype.kind == 'f': - n_train = floor(train_size * n_samples) - else: - n_train = float(train_size) + n_train = float(train_size) - if 
np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n_samples) - else: - n_test = float(test_size) + if test_size is None: + n_test = n_samples - n_train if n_train + n_test > n_samples: raise ValueError('The sum of train_size and test_size = %d, ' @@ -1642,20 +1628,18 @@ def train_test_split(*arrays, **options): Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes. - test_size : float, int, or None, default None - If float, should be between 0.0 and 1.0 and represent the proportion - of the dataset to include in the test split. If int, represents the - absolute number of test samples. If None, and ``train_size`` is None, - the value is set to 0.25. If None and ``train_size`` is not None, the - value is automatically set to the complement of ``train_size``. + test_size : float, int, None, or 'default', default 'default' + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If + int, represents the absolute number of test samples. If None or + 'default', the value is automatically set to the complement of + the train size. If train size is also None, test size is set to 0.25. train_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, and - ``test_size`` is None, the value is set to 0.75. If None and - ``test_size`` is not None, the value is automatically set to the - complement of ``test_size``. + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. random_state : int or RandomState Pseudo-random number generator state used for random sampling. 
@@ -1707,7 +1691,7 @@ def train_test_split(*arrays, **options): n_arrays = len(arrays) if n_arrays == 0: raise ValueError("At least one array required as input") - test_size = options.pop('test_size', None) + test_size = options.pop('test_size', 'default') train_size = options.pop('train_size', None) random_state = options.pop('random_state', None) stratify = options.pop('stratify', None) @@ -1715,6 +1699,14 @@ def train_test_split(*arrays, **options): if options: raise TypeError("Invalid parameters passed: %s" % str(options)) + if test_size == 'default': + test_size = None + if train_size is not None: + warnings.warn("test_size will always complement train_size " + "unless both are specified or both are unspecified " + "in version 0.21.", + DeprecationWarning) + if test_size is None and train_size is None: test_size = 0.25 From 2d507798d325dc4c3a9bd24e59582528259e8fe4 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 24 Dec 2016 16:07:20 -1000 Subject: [PATCH 10/24] add unit tests for deprecationwarnings --- sklearn/model_selection/tests/test_split.py | 31 +++++++++++++-------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 48740671b73bb..e74f412dd7c11 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -21,6 +21,7 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_warns_message +from sklearn.utils.testing import assert_warns from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import ignore_warnings from sklearn.utils.validation import _num_samples @@ -160,8 +161,8 @@ def test_cross_validator_with_default_params(): skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" lolo_repr = "LeaveOneGroupOut()" lopo_repr = 
"LeavePGroupsOut(n_groups=2)" - ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=None, " - "train_size=None)") + ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, " + "test_size='default',\n train_size=None)") ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits, @@ -922,6 +923,7 @@ def train_test_split_list_input(): def test_shufflesplit_errors(): # When the {test|train}_size is a float/invalid, error is raised at init + assert_raises(ValueError, ShuffleSplit, test_size=None, train_size=None) assert_raises(ValueError, ShuffleSplit, test_size=2.0) assert_raises(ValueError, ShuffleSplit, test_size=1.0) assert_raises(ValueError, ShuffleSplit, test_size=0.1, train_size=0.95) @@ -943,16 +945,16 @@ def test_shufflesplit_reproducible(): list(a for a, b in ss.split(X))) -def test_shufflesplit_train_test_size(): - # check that same sequence of train-test is given - # when setting train_size to be the complement of test_size - # and vice-versa - ss_default = ShuffleSplit(random_state=0) - ss_train = ShuffleSplit(random_state=0, train_size=.9) - ss_test = ShuffleSplit(random_state=0, test_size=.1) - assert_array_equal(list(a for a, b in ss_default.split(X)), - list(a for a, b in ss_train.split(X)), - list(a for a, b in ss_test.split(X))) +# def test_shufflesplit_train_test_size(): +# # check that same sequence of train-test is given +# # when setting train_size to be the complement of test_size +# # and vice-versa +# ss_default = ShuffleSplit(random_state=0) +# ss_train = ShuffleSplit(random_state=0, train_size=.9) +# ss_test = ShuffleSplit(random_state=0, test_size=.1) +# assert_array_equal(list(a for a, b in ss_default.split(X)), +# list(a for a, b in ss_train.split(X)), +# list(a for a, b in ss_test.split(X))) def test_stratifiedshufflesplit_list_input(): @@ -1203,6 +1205,11 @@ def test_nested_cv(): cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv, fit_params={'groups': 
groups}) +def test_train_test_default_warning(): + assert_warns(DeprecationWarning, ShuffleSplit, train_size=0.75) + assert_warns(DeprecationWarning, GroupShuffleSplit, train_size=0.75) + assert_warns(DeprecationWarning, StratifiedShuffleSplit, train_size=0.75) + assert_warns(DeprecationWarning, train_test_split, range(3), train_size=0.75) def test_build_repr(): class MockSplitter: From 125844dbf5de93acbee638a3777f24f315c5c1f1 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 24 Dec 2016 16:22:23 -1000 Subject: [PATCH 11/24] reset GroupShuffleSplit default test_size to 0.2 --- sklearn/model_selection/_split.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 8f537b7145b4d..faf39bffc6f77 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1085,7 +1085,7 @@ class GroupShuffleSplit(ShuffleSplit): proportion of the groups to include in the test split. If int, represents the absolute number of test groups. If None, the value is automatically set to the complement of the train - size. If 'default', the value is set to 0.1. + size. If 'default', the value is set to 0.2. 
train_size : float, int, or None, default is None If float, should be between 0.0 and 1.0 and represent the @@ -1099,6 +1099,14 @@ class GroupShuffleSplit(ShuffleSplit): def __init__(self, n_splits=5, test_size="default", train_size=None, random_state=None): + if test_size == "default": + if train_size is not None: + warnings.warn("test_size will always complement train_size " + "unless both are specified or both are " + "unspecified in version 0.21.", + DeprecationWarning) + test_size = 0.2 + super(GroupShuffleSplit, self).__init__( n_splits=n_splits, test_size=test_size, From 7abd7ad83c69383ef8735578ae16b4aca3737224 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 24 Dec 2016 20:12:06 -1000 Subject: [PATCH 12/24] remove extraneous test --- sklearn/model_selection/tests/test_split.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index e74f412dd7c11..5550005c2bc86 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -945,18 +945,6 @@ def test_shufflesplit_reproducible(): list(a for a, b in ss.split(X))) -# def test_shufflesplit_train_test_size(): -# # check that same sequence of train-test is given -# # when setting train_size to be the complement of test_size -# # and vice-versa -# ss_default = ShuffleSplit(random_state=0) -# ss_train = ShuffleSplit(random_state=0, train_size=.9) -# ss_test = ShuffleSplit(random_state=0, test_size=.1) -# assert_array_equal(list(a for a, b in ss_default.split(X)), -# list(a for a, b in ss_train.split(X)), -# list(a for a, b in ss_test.split(X))) - - def test_stratifiedshufflesplit_list_input(): # Check that when y is a list / list of string labels, it works. 
sss = StratifiedShuffleSplit(test_size=2, random_state=42) From 18d5c1f42f5e207fdf550fdc4d63274866b89031 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 24 Dec 2016 23:06:26 -0800 Subject: [PATCH 13/24] fix flake8 violations --- sklearn/model_selection/_split.py | 1 + sklearn/model_selection/tests/test_split.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index faf39bffc6f77..c0a5ef3eb99ad 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1736,6 +1736,7 @@ def train_test_split(*arrays, **options): train_test_split.__test__ = False # to avoid a pb with nosetests + def _build_repr(self): # XXX This is copied from BaseEstimator's get_params cls = self.__class__ diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 5550005c2bc86..9259d08cee317 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1193,11 +1193,14 @@ def test_nested_cv(): cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv, fit_params={'groups': groups}) + def test_train_test_default_warning(): assert_warns(DeprecationWarning, ShuffleSplit, train_size=0.75) assert_warns(DeprecationWarning, GroupShuffleSplit, train_size=0.75) assert_warns(DeprecationWarning, StratifiedShuffleSplit, train_size=0.75) - assert_warns(DeprecationWarning, train_test_split, range(3), train_size=0.75) + assert_warns(DeprecationWarning, train_test_split, range(3), + train_size=0.75) + def test_build_repr(): class MockSplitter: From 71aabb0dd975e95381b26c5bdbf0918a14753cdd Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 24 Dec 2016 23:21:46 -0800 Subject: [PATCH 14/24] fix indentation error overriding test size in groupsamplesplit --- sklearn/model_selection/_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py 
b/sklearn/model_selection/_split.py index c0a5ef3eb99ad..7388b1cdc5766 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1105,7 +1105,7 @@ def __init__(self, n_splits=5, test_size="default", train_size=None, "unless both are specified or both are " "unspecified in version 0.21.", DeprecationWarning) - test_size = 0.2 + test_size = 0.2 super(GroupShuffleSplit, self).__init__( n_splits=n_splits, From 3707437bf2441d9ab0c969d215596383c7606314 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 25 Dec 2016 09:54:00 -0800 Subject: [PATCH 15/24] change DeprecationWarnings to FutureWarnings --- sklearn/model_selection/_split.py | 27 ++++++++++----------- sklearn/model_selection/tests/test_split.py | 8 +++--- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 7388b1cdc5766..3097276341fad 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1101,10 +1101,10 @@ def __init__(self, n_splits=5, test_size="default", train_size=None, random_state=None): if test_size == "default": if train_size is not None: - warnings.warn("test_size will always complement train_size " - "unless both are specified or both are " - "unspecified in version 0.21.", - DeprecationWarning) + warnings.warn("From version 0.21, test_size will always " + "complement train_size unless both are specified " + "or both are unspecified.", + FutureWarning) test_size = 0.2 super(GroupShuffleSplit, self).__init__( @@ -1343,10 +1343,10 @@ def _validate_shuffle_split_init(test_size, train_size): """ if test_size == "default": if train_size is not None: - warnings.warn("test_size will always complement train_size " - "unless both are specified or both are unspecified " - "in version 0.21.", - DeprecationWarning) + warnings.warn("From version 0.21, test_size will always " + "complement train_size unless both are specified " + "or both are unspecified.", + 
FutureWarning) test_size = 0.1 if test_size is None and train_size is None: @@ -1358,8 +1358,7 @@ def _validate_shuffle_split_init(test_size, train_size): raise ValueError( 'test_size=%f should be smaller ' 'than 1.0 or be an integer' % test_size) - elif (np.asarray(test_size).dtype.kind != 'i' and - test_size != "default"): + elif (np.asarray(test_size).dtype.kind != 'i'): # int values are checked during split based on the input raise ValueError("Invalid value for test_size: %r" % test_size) @@ -1710,10 +1709,10 @@ def train_test_split(*arrays, **options): if test_size == 'default': test_size = None if train_size is not None: - warnings.warn("test_size will always complement train_size " - "unless both are specified or both are unspecified " - "in version 0.21.", - DeprecationWarning) + warnings.warn("From version 0.21, test_size will always " + "complement train_size unless both are specified " + "or both are unspecified.", + FutureWarning) if test_size is None and train_size is None: test_size = 0.25 diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 9259d08cee317..5740c3131b368 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1195,10 +1195,10 @@ def test_nested_cv(): def test_train_test_default_warning(): - assert_warns(DeprecationWarning, ShuffleSplit, train_size=0.75) - assert_warns(DeprecationWarning, GroupShuffleSplit, train_size=0.75) - assert_warns(DeprecationWarning, StratifiedShuffleSplit, train_size=0.75) - assert_warns(DeprecationWarning, train_test_split, range(3), + assert_warns(FutureWarning, ShuffleSplit, train_size=0.75) + assert_warns(FutureWarning, GroupShuffleSplit, train_size=0.75) + assert_warns(FutureWarning, StratifiedShuffleSplit, train_size=0.75) + assert_warns(FutureWarning, train_test_split, range(3), train_size=0.75) From da58d82da3c9d612dd5a9c7fd3c0249c621e6b9f Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 25 
Dec 2016 10:00:10 -0800 Subject: [PATCH 16/24] reword docstrings for test_size parameters --- sklearn/model_selection/_split.py | 50 ++++++++++++++++++------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 3097276341fad..64faacd3da6a4 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -997,11 +997,14 @@ class ShuffleSplit(BaseShuffleSplit): n_splits : int, default 10 Number of re-shuffling & splitting iterations. - test_size : float, int, None, or 'default', default 'default' + test_size : float, int, None, optional If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the - complement of the train size. If 'default', the value is set to 0.1. + complement of the train size. By default, the value is set to 0.1. + The default will change in version 0.21. It will remain 0.1 only + if ``train_size`` is unspecified, otherwise it will complement + the specified ``train_size``. train_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the @@ -1080,12 +1083,14 @@ class GroupShuffleSplit(ShuffleSplit): n_splits : int (default 5) Number of re-shuffling & splitting iterations. - test_size : float, int, None, or 'default', default 'default' - If float, should be between 0.0 and 1.0 and represent the - proportion of the groups to include in the test split. If - int, represents the absolute number of test groups. If None, - the value is automatically set to the complement of the train - size. If 'default', the value is set to 0.2. + test_size : float, int, None, optional + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. 
If None, the value is set to the + complement of the train size. By default, the value is set to 0.1. + The default will change in version 0.21. It will remain 0.2 only + if ``train_size`` is unspecified, otherwise it will complement + the specified ``train_size``. train_size : float, int, or None, default is None If float, should be between 0.0 and 1.0 and represent the @@ -1216,12 +1221,14 @@ class StratifiedShuffleSplit(BaseShuffleSplit): n_splits : int, default 10 Number of re-shuffling & splitting iterations. - test_size : float, int, None, or 'default', default 'default' - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - If 'default', the value is set to 0.1. + test_size : float, int, None, optional + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. By default, the value is set to 0.1. + The default will change in version 0.21. It will remain 0.1 only + if ``train_size`` is unspecified, otherwise it will complement + the specified ``train_size``. train_size : float, int, or None, default is None If float, should be between 0.0 and 1.0 and represent the @@ -1635,12 +1642,15 @@ def train_test_split(*arrays, **options): Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes. - test_size : float, int, None, or 'default', default 'default' - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None or - 'default', the value is automatically set to the complement of - the train size. 
If train size is also None, test size is set to 0.25. + test_size : float, int, None, optional + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. By default, the value is set to 0.25. + The default will change in version 0.21. It will remain 0.25 only + if ``train_size`` is unspecified, otherwise it will complement + the specified ``train_size``. + train_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the From 95ce853a9f6fda3b88358c8e4352b0d5c4fe4be7 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 25 Dec 2016 10:23:07 -0800 Subject: [PATCH 17/24] fix flake8 error in line length --- sklearn/model_selection/_split.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 64faacd3da6a4..af17fb36fa396 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1107,8 +1107,8 @@ def __init__(self, n_splits=5, test_size="default", train_size=None, if test_size == "default": if train_size is not None: warnings.warn("From version 0.21, test_size will always " - "complement train_size unless both are specified " - "or both are unspecified.", + "complement train_size unless both " + "are specified or both are unspecified.", FutureWarning) test_size = 0.2 @@ -1351,8 +1351,8 @@ def _validate_shuffle_split_init(test_size, train_size): if test_size == "default": if train_size is not None: warnings.warn("From version 0.21, test_size will always " - "complement train_size unless both are specified " - "or both are unspecified.", + "complement train_size unless both " + "are specified or both are unspecified.", FutureWarning) test_size = 0.1 @@ -1720,8 +1720,8 @@ def train_test_split(*arrays, **options): test_size = 
None if train_size is not None: warnings.warn("From version 0.21, test_size will always " - "complement train_size unless both are specified " - "or both are unspecified.", + "complement train_size unless both " + "are specified or both are unspecified.", FutureWarning) if test_size is None and train_size is None: From 281bd530e610ea1e66932162a25d85d57ecc38cc Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 25 Dec 2016 10:54:36 -0800 Subject: [PATCH 18/24] remove extraneous newline --- sklearn/model_selection/_split.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index af17fb36fa396..6bd3c7b4e1dfc 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1651,7 +1651,6 @@ def train_test_split(*arrays, **options): if ``train_size`` is unspecified, otherwise it will complement the specified ``train_size``. - train_size : float, int, or None, default None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. 
If From 51e6397110de606fdddac8cd290e24cec37aa11b Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 26 Dec 2016 15:21:59 -0800 Subject: [PATCH 19/24] edit indentation errors and clarify future test_size behavior --- sklearn/model_selection/_split.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 6bd3c7b4e1dfc..71aa7dc2c9330 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1108,7 +1108,7 @@ def __init__(self, n_splits=5, test_size="default", train_size=None, if train_size is not None: warnings.warn("From version 0.21, test_size will always " "complement train_size unless both " - "are specified or both are unspecified.", + "are specified.", FutureWarning) test_size = 0.2 @@ -1350,10 +1350,10 @@ def _validate_shuffle_split_init(test_size, train_size): """ if test_size == "default": if train_size is not None: - warnings.warn("From version 0.21, test_size will always " - "complement train_size unless both " - "are specified or both are unspecified.", - FutureWarning) + warnings.warn("From version 0.21, test_size will always " + "complement train_size unless both " + "are specified.", + FutureWarning) test_size = 0.1 if test_size is None and train_size is None: @@ -1365,7 +1365,7 @@ def _validate_shuffle_split_init(test_size, train_size): raise ValueError( 'test_size=%f should be smaller ' 'than 1.0 or be an integer' % test_size) - elif (np.asarray(test_size).dtype.kind != 'i'): + elif np.asarray(test_size).dtype.kind != 'i': # int values are checked during split based on the input raise ValueError("Invalid value for test_size: %r" % test_size) @@ -1718,10 +1718,10 @@ def train_test_split(*arrays, **options): if test_size == 'default': test_size = None if train_size is not None: - warnings.warn("From version 0.21, test_size will always " - "complement train_size unless both " - "are specified or both are 
unspecified.", - FutureWarning) + warnings.warn("From version 0.21, test_size will always " + "complement train_size unless both " + "are specified.", + FutureWarning) if test_size is None and train_size is None: test_size = 0.25 From 453ada52c1f95d72eca5f33affa8c0aa2f7302a0 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 26 Dec 2016 15:22:35 -0800 Subject: [PATCH 20/24] ignore FutureWarnings in unrelated tests --- sklearn/model_selection/tests/test_split.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 5740c3131b368..da63426a71e35 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -517,7 +517,7 @@ def test_shuffle_split(): assert_array_equal(t2[1], t3[1]) assert_array_equal(t3[1], t4[1]) - +@ignore_warnings def test_stratified_shuffle_split_init(): X = np.arange(7) y = np.asarray([0, 1, 1, 1, 2, 2, 2]) @@ -805,6 +805,7 @@ def test_leave_one_p_group_out_error_on_fewer_number_of_groups(): LeavePGroupsOut(n_groups=3).split(X, y, groups)) +@ignore_warnings def test_train_test_split_errors(): assert_raises(ValueError, train_test_split) assert_raises(ValueError, train_test_split, range(3), train_size=1.1) @@ -921,6 +922,7 @@ def train_test_split_list_input(): np.testing.assert_equal(y_test3, y_test2) +@ignore_warnings def test_shufflesplit_errors(): # When the {test|train}_size is a float/invalid, error is raised at init assert_raises(ValueError, ShuffleSplit, test_size=None, train_size=None) From 7b3dd0f52608fc4eed7a6984f8ccee658638a33f Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 26 Dec 2016 16:11:32 -0800 Subject: [PATCH 21/24] fix flake8 error --- sklearn/model_selection/tests/test_split.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index da63426a71e35..1d23c2c6713ad 100644 --- 
a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -517,6 +517,7 @@ def test_shuffle_split(): assert_array_equal(t2[1], t3[1]) assert_array_equal(t3[1], t4[1]) + @ignore_warnings def test_stratified_shuffle_split_init(): X = np.arange(7) From beaf8d043a97b7fa38dbdef9b2fc7704786274b6 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 7 Mar 2017 11:42:29 -0800 Subject: [PATCH 22/24] add more details about the defaults --- sklearn/model_selection/_split.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 71aa7dc2c9330..2282ba8414b11 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -997,16 +997,17 @@ class ShuffleSplit(BaseShuffleSplit): n_splits : int, default 10 Number of re-shuffling & splitting iterations. - test_size : float, int, None, optional + test_size : float, int, None, default=0.1 If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the - complement of the train size. By default, the value is set to 0.1. + complement of the train size. By default (the parameter is + unspecified), the value is set to 0.1. The default will change in version 0.21. It will remain 0.1 only if ``train_size`` is unspecified, otherwise it will complement the specified ``train_size``. - train_size : float, int, or None, default None + train_size : float, int, or None, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None,
If None, From 5392f8551bf17a43b4f82ea5fe1afdc1785f3fa7 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 9 Mar 2017 18:39:33 -0800 Subject: [PATCH 23/24] fix typo in GroupShuffleSplit stating default is 0.1 --- sklearn/model_selection/_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 88b3caf34b6bf..1d78d30f052d8 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1259,7 +1259,7 @@ class GroupShuffleSplit(ShuffleSplit): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the - complement of the train size. By default, the value is set to 0.1. + complement of the train size. By default, the value is set to 0.2. The default will change in version 0.21. It will remain 0.2 only if ``train_size`` is unspecified, otherwise it will complement the specified ``train_size``. From fd49cb9d226aca2c84bbbddd4c96042b54a24b8c Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 14 Jun 2017 21:27:33 +1000 Subject: [PATCH 24/24] Add what's new --- doc/whats_new.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 11d7add579e8b..83a86fde37691 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -280,6 +280,11 @@ API changes summary method ``check_decision_proba_consistency`` has been added in **sklearn.utils.estimator_checks** to check their consistency. :issue:`7578` by :user:`Shubham Bhardwaj ` + + - In version 0.21, the default behavior of splitters that use the +     ``test_size`` and ``train_size`` parameter will change, such that + specifying ``train_size`` alone will cause ``test_size`` to be the + remainder. :issue:`7459` by :user:`Nelson Liu `. .. _changes_0_18_1: