Commit 4fcf20a

TST/FIX in future, average='binary' iff 2 labels in y one of which is pos_label
1 parent fc51c06 · commit 4fcf20a

4 files changed: 108 additions & 52 deletions
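In user-facing terms, this commit makes average='binary' the documented default for precision/recall/F-score and deprecates the implicit positive-label special-casing under other averages. Below is a minimal sketch of the resulting behavior, assuming a scikit-learn checkout at this commit (the toy scores and the warning are specific to this era of the library):

import warnings
from sklearn.metrics import f1_score

y_true = [0, 1, 1]
y_pred = [0, 1, 0]

# Binary targets with the default average='binary': only the pos_label=1
# class is scored (F1 = 2/3 here) and no warning is emitted.
print(f1_score(y_true, y_pred))
print(f1_score(y_true, y_pred, average='binary'))  # identical

# Other averages on binary targets still report only the positive class,
# but per this commit they now emit a DeprecationWarning until 0.18.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    f1_score(y_true, y_pred, average='micro')
assert any(issubclass(w.category, DeprecationWarning) for w in caught)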


sklearn/metrics/classification.py

Lines changed: 75 additions & 36 deletions
@@ -501,15 +501,19 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
@@ -526,6 +530,10 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 
@@ -588,15 +596,19 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
@@ -613,6 +625,10 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 
@@ -748,14 +764,18 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : string, [None (default), 'micro', 'macro', 'samples', 'weighted']
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+    average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \
+                       'weighted']
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
@@ -772,6 +792,10 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     warn_for : tuple or set, for internal use
         This determines which warnings will be made in the case that this
         function is being used to return only one of its metrics.
@@ -832,11 +856,11 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
 
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
 
-    if average == 'binary' and y_type != 'binary':
+    if average == 'binary' and (y_type != 'binary' or pos_label is None):
         warnings.warn('The default `weighted` averaging is deprecated, '
                       'and from version 0.18, use of precision, recall or '
-                      'F-score with multiclass or multilabel data will result '
-                      'in an exception. '
+                      'F-score with multiclass or multilabel data or '
+                      'pos_label=None will result in an exception. '
                       'Please set an explicit value for `average`, one of '
                       '%s. In cross validation use, for instance, '
                       'scoring="f1_weighted" instead of scoring="f1".'
@@ -898,14 +922,12 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
     ### Select labels to keep ###
 
     if y_type == 'binary' and average is not None and pos_label is not None:
-        if average != 'binary' and label_order is not None \
-                and len(label_order) == 2:
-            warnings.warn('In the future, providing two `labels` values, as '
-                          'well as `average!=`binary`` will average over '
-                          'those labels. For now, please use `labels=None` '
-                          'with `pos_label` to evaluate precision, recall and '
-                          'F-score for the positive label only.',
-                          FutureWarning)
+        if average != 'binary':
+            warnings.warn('From version 0.18, binary input will not be '
+                          'handled specially when using averaged '
+                          'precision/recall/F-score. '
+                          'Please use average=\'binary\' to report only the '
+                          'positive class performance.', DeprecationWarning)
         if pos_label not in labels:
             if len(labels) == 1:
                 # Only negative labels
@@ -953,6 +975,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
         weights = None
 
     if average is not None:
+        assert average != 'binary' or len(precision) == 1
         precision = np.average(precision, weights=weights)
         recall = np.average(recall, weights=weights)
         f_score = np.average(f_score, weights=weights)
@@ -990,15 +1013,19 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
@@ -1015,6 +1042,10 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 
@@ -1073,15 +1104,19 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
@@ -1098,6 +1133,10 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
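Taken together, the two changes in precision_recall_fscore_support amount to a single decision rule. The sketch below is a simplified paraphrase for illustration, not library code: `y_type` stands for the output of sklearn's internal `_check_targets`, `resolve_binary_averaging` is a hypothetical name, and the fallback to 'weighted' mirrors the deprecated default named in the first warning.

import warnings

def resolve_binary_averaging(y_type, labels, average, pos_label):
    # Simplified paraphrase of the checks this commit adds/updates.
    # average='binary' only makes sense for binary targets with a pos_label;
    # otherwise fall back to the deprecated 'weighted' default and warn.
    if average == 'binary' and (y_type != 'binary' or pos_label is None):
        warnings.warn("The default `weighted` averaging is deprecated; "
                      "from version 0.18 this will raise an exception.",
                      DeprecationWarning)
        average = 'weighted'

    if y_type == 'binary' and average is not None and pos_label is not None:
        if average != 'binary':
            # Legacy special case: still report only pos_label, but warn.
            warnings.warn("From version 0.18, binary input will not be "
                          "handled specially when using averaged "
                          "precision/recall/F-score.", DeprecationWarning)
        return average, [pos_label]   # score a single class
    return average, labels            # score all classes, then average

# e.g. binary y with micro averaging: warns, then scores only pos_label:
# resolve_binary_averaging('binary', [0, 1], 'micro', 1) -> ('micro', [1])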

sklearn/metrics/tests/test_classification.py

Lines changed: 27 additions & 10 deletions
@@ -138,18 +138,23 @@ def test_precision_recall_f1_score_binary():
 
     # individual scoring function that can be used for grid search: in the
     # binary class case the score is the value of the measure for the positive
-    # class (e.g. label == 1)
-    ps = precision_score(y_true, y_pred)
-    assert_array_almost_equal(ps, 0.85, 2)
+    # class (e.g. label == 1). This is deprecated for average != 'binary'.
+    assert_dep_warning = partial(assert_warns, DeprecationWarning)
+    for kwargs, my_assert in [({}, assert_no_warnings),
+                              ({'average': 'binary'}, assert_no_warnings),
+                              ({'average': 'micro'}, assert_dep_warning)]:
+        ps = my_assert(precision_score, y_true, y_pred, **kwargs)
+        assert_array_almost_equal(ps, 0.85, 2)
 
-    rs = recall_score(y_true, y_pred)
-    assert_array_almost_equal(rs, 0.68, 2)
+        rs = my_assert(recall_score, y_true, y_pred, **kwargs)
+        assert_array_almost_equal(rs, 0.68, 2)
 
-    fs = f1_score(y_true, y_pred)
-    assert_array_almost_equal(fs, 0.76, 2)
+        fs = my_assert(f1_score, y_true, y_pred, **kwargs)
+        assert_array_almost_equal(fs, 0.76, 2)
 
-    assert_almost_equal(fbeta_score(y_true, y_pred, beta=2),
-                        (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2)
+        assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2,
+                                      **kwargs),
+                            (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2)
 
 
 @ignore_warnings
@@ -204,6 +209,7 @@ def test_average_precision_score_tied_values():
     assert_not_equal(average_precision_score(y_true, y_score), 1.)
 
 
+@ignore_warnings
 def test_precision_recall_fscore_support_errors():
     y_true, y_pred, _ = make_prediction(binary=True)
 
@@ -996,6 +1002,8 @@ def test_prf_average_compat():
     """
     y_true = [1, 2, 3, 3]
    y_pred = [1, 2, 3, 1]
+    y_true_bin = [0, 1, 1]
+    y_pred_bin = [0, 1, 0]
 
     for metric in [precision_score, recall_score, f1_score,
                    partial(fbeta_score, beta=2)]:
@@ -1006,7 +1014,16 @@ def test_prf_average_compat():
                      'average does not act like "weighted" by default')
 
         # check binary passes without warning
-        assert_no_warnings(metric, [0, 1, 1], [0, 1, 0])
+        assert_no_warnings(metric, y_true_bin, y_pred_bin)
+
+        # but binary with pos_label=None should behave like multiclass
+        score = assert_warns(DeprecationWarning, metric,
+                             y_true_bin, y_pred_bin, pos_label=None)
+        score_weighted = assert_no_warnings(metric, y_true_bin, y_pred_bin,
+                                            pos_label=None, average='weighted')
+        assert_equal(score, score_weighted,
+                     'average does not act like "weighted" by default with '
+                     'binary data and pos_label=None')
 
 
 @ignore_warnings  # sequence of sequences is deprecated
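The updated compatibility test asserts that binary input with pos_label=None now follows the multiclass path, where the deprecated default average behaves like 'weighted'. The same check can be reproduced outside the test suite with just the standard library; this is a sketch against a checkout at this commit, and the warning filter is needed because the default path is deprecated here:

import warnings
from sklearn.metrics import recall_score

y_true_bin = [0, 1, 1]
y_pred_bin = [0, 1, 0]

# Deprecated default path: pos_label=None makes binary data multiclass-like.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', DeprecationWarning)
    score = recall_score(y_true_bin, y_pred_bin, pos_label=None)

# Explicit 'weighted' averaging gives the same number, without a warning.
explicit = recall_score(y_true_bin, y_pred_bin, pos_label=None,
                        average='weighted')
assert score == explicit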

sklearn/metrics/tests/test_common.py

Lines changed: 5 additions & 4 deletions
@@ -215,6 +215,7 @@
 
     "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score",
 
+    # pos_label support deprecated; to be removed in 0.18:
     "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
     "weighted_precision_score", "weighted_recall_score",
 
@@ -226,7 +227,7 @@
 ]
 
 # Metrics with a "labels" argument
-# XXX: Handle multi_class metrics that has a labels argument as well as a
+# TODO: Handle multi_class metrics that has a labels argument as well as a
 # decision function argument. e.g hinge_loss
 METRICS_WITH_LABELS = [
     "confusion_matrix",
@@ -930,7 +931,7 @@ def check_sample_weight_invariance(name, metric, y1, y2):
         unweighted_score,
         metric(y1, y2, sample_weight=np.ones(shape=len(y1))),
         err_msg="For %s sample_weight=None is not equivalent to "
-               "sample_weight=ones" % name)
+                "sample_weight=ones" % name)
 
     # check that the weighted and unweighted scores are unequal
     weighted_score = metric(y1, y2, sample_weight=sample_weight)
@@ -945,8 +946,8 @@ def check_sample_weight_invariance(name, metric, y1, y2):
     assert_almost_equal(
         weighted_score, weighted_score_list,
         err_msg="Weighted scores for array and list sample_weight input are "
-               "not equal (%f != %f) for %s" % (
-                   weighted_score, weighted_score_list, name))
+                "not equal (%f != %f) for %s" % (
+                    weighted_score, weighted_score_list, name))
 
     # check that integer weights is the same as repeated samples
     repeat_weighted_score = metric(

sklearn/metrics/tests/test_score_objects.py

Lines changed: 1 addition & 2 deletions
@@ -172,8 +172,7 @@ def test_classification_scores():
     # test fbeta score that takes an argument
     scorer = make_scorer(fbeta_score, beta=2)
     score1 = scorer(clf, X_test, y_test)
-    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2,
-                         average='weighted')
+    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2)
    assert_almost_equal(score1, score2)
 
     # test that custom scorer can be pickled
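Why the test can drop average='weighted': the scorer built by make_scorer passes no average, so fbeta_score now uses its 'binary' default on binary targets, and the bare metric call is made with the same default. A runnable sketch of the same assertion, with a toy dataset standing in for the test fixture (clf, X, and y here are illustrative, not the original test's objects):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer

X, y = make_classification(n_samples=100, random_state=0)
clf = LogisticRegression().fit(X, y)

scorer = make_scorer(fbeta_score, beta=2)
# Scorer and bare metric agree without specifying average='weighted'.
assert abs(scorer(clf, X, y) - fbeta_score(y, clf.predict(X), beta=2)) < 1e-12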
