Add a unit test ensuring that roc_curve does not return spurious repeated thresholds caused by floating-point roundoff, plus a quick first attempt at a fix: round the scores to 6 decimal places before locating the distinct values.

jblackburne · jblackburne · commit 84cac3526a2f · 2014-06-10T22:03:30.000-07:00
diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py
@@ -725,7 +725,8 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None):
     # y_score typically has many tied values. Here we extract
     # the indices associated with the distinct values. We also
     # concatenate a value for the end of the curve.
-    distinct_value_indices = np.where(np.diff(y_score))[0]
+    y_round = np.round(y_score, 6) # a million thresholds should be enough?
+    distinct_value_indices = np.where(np.diff(y_round))[0]
     threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]
 
     # accumulate the true positives with decreasing threshold
diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py
@@ -7,6 +7,7 @@
 
 from sklearn import datasets
 from sklearn import svm
+from sklearn import ensemble
 
 from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
 from sklearn.datasets import make_multilabel_classification
@@ -466,6 +467,29 @@ def test_roc_returns_consistency():
     assert_equal(fpr.shape, thresholds.shape)
 
 
+def test_roc_nonrepeating_thresholds():
+    """Test to ensure that we don't return spurious repeating thresholds
+    due to machine precision issues
+    """
+    dataset = datasets.load_digits()
+    X = dataset['data']
+    y = dataset['target']
+
+    # With n_estimators=100 the predicted probabilities are expected to be
+    # significant to two decimal places only (presumably multiples of 0.01)
+    clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=0)
+
+    # Binary task (digit < 5): fit on even rows, predict on odd rows.
+    # Summing the first five class columns adds floating point roundoff.
+    probas_pred = clf.fit(X[::2], y[::2]).predict_proba(X[1::2])
+    probas_pred = probas_pred[:, :5].sum(axis=1)
+    y_true = [yy < 5 for yy in y[1::2]]
+
+    # Thresholds differing only by roundoff collapse when rounded to 2 places
+    fpr, tpr, thresholds = roc_curve(y_true, probas_pred)
+    assert_equal(thresholds.size, np.unique(np.round(thresholds, 2)).size)
+
+
 def test_roc_curve_multi():
     """roc_curve not applicable for multi-class problems"""
     y_true, _, probas_pred = make_prediction(binary=False)