Skip to content

Commit a608c70

Browse files
committed
WIP implementation of Gini coeff and Lorenz curve
1 parent 0ea2dce commit a608c70

3 files changed

Lines changed: 62 additions & 2 deletions

File tree

sklearn/metrics/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
from .ranking import precision_recall_curve
1515
from .ranking import roc_auc_score
1616
from .ranking import roc_curve
17+
from .ranking import gini_score
18+
from .ranking import lorenz_curve
1719

1820
from .classification import accuracy_score
1921
from .classification import balanced_accuracy_score
@@ -106,6 +108,7 @@
106108
'fbeta_score',
107109
'fowlkes_mallows_score',
108110
'get_scorer',
111+
'gini_score',
109112
'hamming_loss',
110113
'hinge_loss',
111114
'homogeneity_completeness_v_measure',
@@ -114,6 +117,7 @@
114117
'jaccard_similarity_score',
115118
'label_ranking_average_precision_score',
116119
'label_ranking_loss',
120+
'lorenz_curve',
117121
'log_loss',
118122
'make_scorer',
119123
'nan_euclidean_distances',

sklearn/metrics/ranking.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1390,3 +1390,56 @@ def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False):
13901390
_check_dcg_target_type(y_true)
13911391
gain = _ndcg_sample_scores(y_true, y_score, k=k, ignore_ties=ignore_ties)
13921392
return np.average(gain, weights=sample_weight)
1393+
1394+
1395+
def lorenz_curve(y_true, y_pred, sample_weight=None,
                 ascending_predictions=True,
                 normalize=True,
                 return_gini=False):
    """Compute the Lorenz curve of a non-negative target ranked by y_pred.

    Samples are ordered by their predicted value; the sample weights and the
    weighted target values are then accumulated along that ordering.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Observed target values. Must be non-negative.

    y_pred : array-like of shape (n_samples,)
        Predicted values. Only used to rank the samples.

    sample_weight : array-like of shape (n_samples,), default=None
        Per-sample weights. When None, every sample has weight 1.

    ascending_predictions : bool, default=True
        Rank samples from the smallest to the largest prediction. When
        False the ranking is reversed.

    normalize : bool, default=True
        Divide both cumulated arrays by their last value so that they
        report fractions instead of absolute amounts.

    return_gini : bool, default=False
        Also return the Gini coefficient, computed as one minus twice the
        area under the normalized curve. Requires ``normalize=True`` and
        ``ascending_predictions=True``.

    Returns
    -------
    cumulated_samples : ndarray
        Cumulated sample weights along the ranking.

    cumulated_target : ndarray
        Cumulated weighted target values along the ranking.

    gini : float
        Only returned when ``return_gini=True``.

    Raises
    ------
    ValueError
        If ``y_true`` contains negative values, if ``return_gini=True`` is
        combined with ``normalize=False`` or ``ascending_predictions=False``,
        or if normalization is requested while the total sample weight or
        the total weighted target is not strictly positive.
    """
    # Reject an inconsistent request before doing any work.
    if return_gini and not (normalize and ascending_predictions):
        raise ValueError("Gini coefficient requires normalize=True"
                         " and ascending_predictions=True")

    y_true = check_array(y_true, ensure_2d=False)
    y_pred = check_array(y_pred, ensure_2d=False)
    check_consistent_length(y_true, y_pred)
    y_true_min = y_true.min()
    if y_true_min < 0:
        raise ValueError("lorenz_curve is only defined for regression problems"
                         " with non-negative target values. Observed minimum"
                         " target value is %f" % y_true_min)
    if sample_weight is None:
        sample_weight = np.ones(len(y_true), dtype=np.float64)
    else:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        check_consistent_length(y_true, sample_weight)

    # Rank the samples based on y_pred
    ranking = np.argsort(y_pred)
    if not ascending_predictions:
        ranking = ranking[::-1]

    ranked_sample_weight = sample_weight[ranking]
    ranked_target = y_true[ranking]

    # Accumulate the sample weights and the *weighted* target values: a
    # sample with weight w must contribute w times its target, otherwise the
    # two axes of the curve are not expressed on the same population.
    cumulated_samples = np.cumsum(ranked_sample_weight)
    cumulated_target = np.cumsum(ranked_target * ranked_sample_weight)

    # Normalize to report fractions instead of absolute values.
    # Normalization is necessary to compute the Gini index from
    # the area under the Lorenz curve
    if normalize:
        total_weight = cumulated_samples[-1]
        total_target = cumulated_target[-1]
        # Written as "not (> 0)" so that NaN totals are rejected as well.
        if not (total_weight > 0 and total_target > 0):
            raise ValueError("lorenz_curve cannot be normalized when the"
                             " total sample weight or the total weighted"
                             " target is not strictly positive.")
        # Out-of-place division: an in-place "/=" fails on the integer
        # arrays produced by cumsum for integer targets or weights.
        cumulated_samples = cumulated_samples / total_weight
        cumulated_target = cumulated_target / total_target

    if return_gini:
        # The Lorenz curve starts at the origin; include that point so the
        # first segment is part of the area and a perfectly equal
        # distribution yields a Gini coefficient of 0.
        area = auc(np.concatenate(([0.], cumulated_samples)),
                   np.concatenate(([0.], cumulated_target)))
        gini = 1 - 2 * area
        return cumulated_samples, cumulated_target, gini
    return cumulated_samples, cumulated_target
1439+
1440+
1441+
def gini_score(y_true, y_pred, sample_weight=None):
    """Compute the Gini coefficient of y_true ranked by y_pred.

    Thin wrapper around ``lorenz_curve``: the curve is built with ascending
    predictions and normalized axes, and the coefficient computed there is
    returned as-is so that both entry points always agree.

    Parameters
    ----------
    y_true : array-like
        Observed target values, forwarded to ``lorenz_curve``.

    y_pred : array-like
        Predicted values, forwarded to ``lorenz_curve``.

    sample_weight : array-like, default=None
        Per-sample weights, forwarded to ``lorenz_curve``.

    Returns
    -------
    gini : float
        The Gini coefficient reported by ``lorenz_curve``.
    """
    # Delegate instead of repeating the "1 - 2 * auc" formula here, so there
    # is a single implementation of the coefficient.
    _, _, gini = lorenz_curve(
        y_true, y_pred, sample_weight=sample_weight,
        ascending_predictions=True, normalize=True, return_gini=True)
    return gini

sklearn/metrics/scorer.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
f1_score, roc_auc_score, average_precision_score,
3232
precision_score, recall_score, log_loss,
3333
balanced_accuracy_score, explained_variance_score,
34-
brier_score_loss, jaccard_score)
34+
brier_score_loss, jaccard_score, gini_score)
3535

3636
from .cluster import adjusted_rand_score
3737
from .cluster import homogeneity_score
@@ -634,6 +634,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False,
634634
mean_gamma_deviance, greater_is_better=False
635635
)
636636

637+
# Scorer wrapping gini_score. It is built with make_scorer's defaults, i.e.
# greater_is_better=True, so larger Gini values are treated as better.
gini_scorer = make_scorer(gini_score)
638+
637639
# Standard Classification Scores
638640
accuracy_scorer = make_scorer(accuracy_score)
639641
balanced_accuracy_scorer = make_scorer(balanced_accuracy_score)
@@ -707,7 +709,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False,
707709
mutual_info_score=mutual_info_scorer,
708710
adjusted_mutual_info_score=adjusted_mutual_info_scorer,
709711
normalized_mutual_info_score=normalized_mutual_info_scorer,
710-
fowlkes_mallows_score=fowlkes_mallows_scorer)
712+
fowlkes_mallows_score=fowlkes_mallows_scorer,
713+
gini_score=gini_scorer)
711714

712715

713716
for name, metric in [('precision', precision_score),

0 commit comments

Comments
 (0)