WIP implementation of Gini coeff and Lorenz curve

ogrisel · ogrisel · commit a608c703b596 · 2019-10-10T19:29:56.000+02:00
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
@@ -14,6 +14,8 @@
 from .ranking import precision_recall_curve
 from .ranking import roc_auc_score
 from .ranking import roc_curve
+from .ranking import gini_score
+from .ranking import lorenz_curve
 
 from .classification import accuracy_score
 from .classification import balanced_accuracy_score
@@ -106,6 +108,7 @@
     'fbeta_score',
     'fowlkes_mallows_score',
     'get_scorer',
+    'gini_score',
     'hamming_loss',
     'hinge_loss',
     'homogeneity_completeness_v_measure',
@@ -114,6 +117,7 @@
     'jaccard_similarity_score',
     'label_ranking_average_precision_score',
     'label_ranking_loss',
+    'lorenz_curve',
     'log_loss',
     'make_scorer',
     'nan_euclidean_distances',
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
@@ -1390,3 +1390,148 @@ def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False):
     _check_dcg_target_type(y_true)
     gain = _ndcg_sample_scores(y_true, y_score, k=k, ignore_ties=ignore_ties)
     return np.average(gain, weights=sample_weight)
+
+
+def _gini_from_lorenz(cumulated_samples, cumulated_target):
+    """Compute the Gini coefficient from a normalized Lorenz curve.
+
+    The Lorenz curve starts at the origin by definition, so the point
+    (0, 0) is prepended before integrating. Without it the area of the
+    first segment would be dropped and the coefficient over-estimated.
+    """
+    x = np.concatenate(([0.], cumulated_samples))
+    y = np.concatenate(([0.], cumulated_target))
+    return 1 - 2 * auc(x, y)
+
+
+def lorenz_curve(y_true, y_pred, sample_weight=None,
+                 ascending_predictions=True,
+                 normalize=True,
+                 return_gini=False):
+    """Compute the Lorenz curve of the targets ranked by the predictions.
+
+    Samples are sorted by predicted value, then the sample weights and
+    the weighted observed targets are accumulated along that ordering.
+
+    Parameters
+    ----------
+    y_true : array-like of shape (n_samples,)
+        Observed non-negative target values.
+
+    y_pred : array-like of shape (n_samples,)
+        Predicted values, only used to rank the samples.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Sample weights. If None, every sample has weight 1.
+
+    ascending_predictions : bool, default=True
+        If True, rank from the lowest to the highest prediction; if
+        False, use the reverse ordering.
+
+    normalize : bool, default=True
+        If True, both returned arrays are divided by their total so that
+        they end at 1.
+
+    return_gini : bool, default=False
+        If True, also return the Gini coefficient. This requires
+        ``normalize=True`` and ``ascending_predictions=True``.
+
+    Returns
+    -------
+    cumulated_samples : ndarray of shape (n_samples,)
+        Cumulated sample weights along the ranking.
+
+    cumulated_target : ndarray of shape (n_samples,)
+        Cumulated weighted targets along the ranking.
+
+    gini : float
+        Only returned if ``return_gini=True``.
+
+    Raises
+    ------
+    ValueError
+        If ``y_true`` has negative values, if ``return_gini=True`` is
+        combined with ``normalize=False`` or
+        ``ascending_predictions=False``, or if ``normalize=True`` and the
+        total weight or total weighted target is not positive.
+    """
+    # Reject incompatible options before doing any work.
+    if return_gini and not (normalize and ascending_predictions):
+        raise ValueError("Gini coefficient requires normalize=True"
+                         " and ascending_predictions=True")
+
+    y_true = check_array(y_true, ensure_2d=False)
+    y_pred = check_array(y_pred, ensure_2d=False)
+    check_consistent_length(y_true, y_pred)
+    y_true_min = y_true.min()
+    if y_true_min < 0:
+        raise ValueError("lorenz_curve is only defined for regression problems"
+                         " with non-negative target values. Observed minimum"
+                         " target value is %f" % y_true_min)
+    if sample_weight is None:
+        sample_weight = np.ones(len(y_true), dtype=np.float64)
+    else:
+        sample_weight = check_array(sample_weight, ensure_2d=False)
+        check_consistent_length(y_true, sample_weight)
+
+    # Rank the samples based on y_pred
+    ranking = np.argsort(y_pred)
+    if not ascending_predictions:
+        ranking = ranking[::-1]
+
+    ranked_sample_weight = sample_weight[ranking]
+    ranked_target = y_true[ranking]
+
+    # Accumulate the sample weights and the weighted target values.
+    # Weighting the targets is consistent with reading an integer weight
+    # k as k repetitions of the sample. Accumulating in float64 keeps the
+    # in-place normalization below valid for integer inputs.
+    cumulated_samples = np.cumsum(ranked_sample_weight, dtype=np.float64)
+    cumulated_target = np.cumsum(ranked_target * ranked_sample_weight,
+                                 dtype=np.float64)
+
+    # Normalize to report fractions instead of absolute values.
+    # Normalization is necessary to compute the Gini index from
+    # the area under the Lorenz curve
+    if normalize:
+        total_weight = cumulated_samples[-1]
+        total_target = cumulated_target[-1]
+        if total_weight <= 0 or total_target <= 0:
+            raise ValueError("lorenz_curve cannot be normalized when the"
+                             " total sample weight or the total weighted"
+                             " target is not positive.")
+        cumulated_samples /= total_weight
+        cumulated_target /= total_target
+
+    if return_gini:
+        gini = _gini_from_lorenz(cumulated_samples, cumulated_target)
+        return cumulated_samples, cumulated_target, gini
+    return cumulated_samples, cumulated_target
+
+
+def gini_score(y_true, y_pred, sample_weight=None):
+    """Compute the Gini coefficient of targets ranked by predictions.
+
+    The coefficient is one minus twice the area under the normalized
+    Lorenz curve obtained with predictions ranked in ascending order.
+
+    Parameters
+    ----------
+    y_true : array-like of shape (n_samples,)
+        Observed non-negative target values.
+
+    y_pred : array-like of shape (n_samples,)
+        Predicted values, only used to rank the samples.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Sample weights. If None, every sample has weight 1.
+
+    Returns
+    -------
+    gini : float
+        Gini coefficient computed from the Lorenz curve.
+    """
+    _, _, gini = lorenz_curve(
+        y_true, y_pred, sample_weight=sample_weight,
+        ascending_predictions=True, normalize=True, return_gini=True)
+    return gini
diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py
@@ -31,7 +31,7 @@
                f1_score, roc_auc_score, average_precision_score,
                precision_score, recall_score, log_loss,
                balanced_accuracy_score, explained_variance_score,
-               brier_score_loss, jaccard_score)
+               brier_score_loss, jaccard_score, gini_score)
 
 from .cluster import adjusted_rand_score
 from .cluster import homogeneity_score
@@ -634,6 +634,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False,
     mean_gamma_deviance, greater_is_better=False
 )
 
+gini_scorer = make_scorer(gini_score)
+
 # Standard Classification Scores
 accuracy_scorer = make_scorer(accuracy_score)
 balanced_accuracy_scorer = make_scorer(balanced_accuracy_score)
@@ -707,7 +709,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False,
                mutual_info_score=mutual_info_scorer,
                adjusted_mutual_info_score=adjusted_mutual_info_scorer,
                normalized_mutual_info_score=normalized_mutual_info_scorer,
-               fowlkes_mallows_score=fowlkes_mallows_scorer)
+               fowlkes_mallows_score=fowlkes_mallows_scorer,
+               gini_score=gini_scorer)
 
 
 for name, metric in [('precision', precision_score),