From 4d7978907576551c84cdbc3507904f70df4a89e4 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Tue, 3 Jun 2014 22:31:57 -0400 Subject: [PATCH 01/55] Adding new_labels argument to LabelEncoder --- sklearn/preprocessing/label.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index a99ed15973238..845f0d2101c94 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -43,6 +43,15 @@ def _check_numpy_unicode_bug(labels): class LabelEncoder(BaseEstimator, TransformerMixin): """Encode labels with value between 0 and n_classes-1. + Parameters + ---------- + + new_labels : string, optional (default: "raise") + Determines how to handle newly seen labels, i.e., data + not seen in the fit domain. If "raise", then raise ValueError; + if "map", then re-map the new labels to class N, where seen + classes are in {0, ..., N-1}. + Attributes ---------- `classes_` : array of shape (n_class,) @@ -77,6 +86,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin): ['tokyo', 'tokyo', 'paris'] """ + def __init__(self, new_labels="raise"): + """Constructor""" + self.new_labels = new_labels def _check_fitted(self): if not hasattr(self, "classes_"): @@ -134,7 +146,27 @@ def transform(self, y): _check_numpy_unicode_bug(classes) if len(np.intersect1d(classes, self.classes_)) < len(classes): diff = np.setdiff1d(classes, self.classes_) - raise ValueError("y contains new labels: %s" % str(diff)) + + # If we are mapping new labels, get "new" ID and change in copy. 
+ if self.new_labels == "map": + # Get new ID and append to class list + missing_id = len(self.classes_) + self.classes_.resize(len(self.classes_)+1) + self.classes_[-1] = missing_id + + # Reset the value in y_copy + missing_mask = np.in1d(y, diff) + y_copy = np.array(y) + y_copy[missing_mask] = missing_id + + # Return mapped encoding + return np.searchsorted(self.classes_, y_copy) + elif self.new_labels == "raise": + raise ValueError("y contains new labels: %s" % str(diff)) + else: + raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) + return np.searchsorted(self.classes_, y) def inverse_transform(self, y): From 2bc5686d8a31e2a169ffdb23327c00c14a807e6c Mon Sep 17 00:00:00 2001 From: mjbommar Date: Tue, 3 Jun 2014 22:32:10 -0400 Subject: [PATCH 02/55] Adding tests for new_labels argument. --- sklearn/preprocessing/tests/test_label.py | 24 +++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index d7e98c553fe55..757eb965ec9bb 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -177,6 +177,30 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) +def test_label_encoder_new_label(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels="map") + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), + ["c", "b", "a"]) + le.transform(["b", "c", "d"]) + + +def test_label_encoder_new_label_arg(): + """Test LabelEncoder's new_labels argument handling""" + le = LabelEncoder(new_labels="xyz") + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 
0]), + ["c", "b", "a"]) + assert_raises(ValueError, le.transform, ["c", "d"]) + + def test_label_encoder_fit_transform(): """Test fit_transform""" le = LabelEncoder() From 8c1fafe249b1309d48b1e20116eedeb2dc1531fb Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 08:01:33 -0400 Subject: [PATCH 03/55] Changing classes_ update strategy --- sklearn/preprocessing/label.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 845f0d2101c94..11d1ab5c2f3c3 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -151,8 +151,7 @@ def transform(self, y): if self.new_labels == "map": # Get new ID and append to class list missing_id = len(self.classes_) - self.classes_.resize(len(self.classes_)+1) - self.classes_[-1] = missing_id + self.classes_ = np.append(self.classes_, missing_id) # Reset the value in y_copy missing_mask = np.in1d(y, diff) From 1ffb24a58167758abaf2441a24ec61c9b0ab4031 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 08:31:33 -0400 Subject: [PATCH 04/55] Adding nan behavior, renaming to --- sklearn/preprocessing/label.py | 38 ++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 11d1ab5c2f3c3..7b3a7bfee7ba1 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -47,10 +47,13 @@ class LabelEncoder(BaseEstimator, TransformerMixin): ---------- new_labels : string, optional (default: "raise") - Determines how to handle newly seen labels, i.e., data - not seen in the fit domain. If "raise", then raise ValueError; - if "map", then re-map the new labels to class N, where seen - classes are in {0, ..., N-1}. + Determines how to handle new labels, i.e., data + not seen in the training domain. + - If "raise", then raise ValueError. 
+ - If "update", then re-map the new labels to classes + `[N, ..., N+m-1]`, where `m` is the number of new labels. + - If "nan", then re-map the new labels to numpy.nan. + Attributes ---------- @@ -148,21 +151,30 @@ def transform(self, y): diff = np.setdiff1d(classes, self.classes_) # If we are mapping new labels, get "new" ID and change in copy. - if self.new_labels == "map": - # Get new ID and append to class list - missing_id = len(self.classes_) - self.classes_ = np.append(self.classes_, missing_id) + if self.new_labels == "update": + # Update the class list with new labels + self.classes_ = np.append(self.classes_, np.sort(diff)) + + # Return mapped encoding + return np.searchsorted(self.classes_, y) + elif self.new_labels == "nan": + # Create copy of array and return + y_array = np.array(y) + z = np.zeros(y_array.shape) - # Reset the value in y_copy + # Find entries with new labels missing_mask = np.in1d(y, diff) - y_copy = np.array(y) - y_copy[missing_mask] = missing_id - # Return mapped encoding - return np.searchsorted(self.classes_, y_copy) + # Populate return array properly and return + z[-missing_mask] = np.searchsorted(self.classes_, + y_array[-missing_mask]) + z[missing_mask] = np.nan + return z elif self.new_labels == "raise": + # Return ValueError, original behavior. raise ValueError("y contains new labels: %s" % str(diff)) else: + # Raise on invalid argument. 
raise ValueError("Value of argument `new_labels`={0} " "is unknown.".format(self.new_labels)) From 99f65a9a3bcedb2b77bf04a43fa12c70d8363d0c Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 08:32:10 -0400 Subject: [PATCH 05/55] Updating tests to include nan case and update name --- sklearn/preprocessing/tests/test_label.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 757eb965ec9bb..f98239bf428dc 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -177,16 +177,30 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) -def test_label_encoder_new_label(): +def test_label_encoder_new_label_update(): """Test LabelEncoder's transform on new labels""" - le = LabelEncoder(new_labels="map") + le = LabelEncoder(new_labels="update") le.fit(["a", "b", "b", "c"]) assert_array_equal(le.classes_, ["a", "b", "c"]) assert_array_equal(le.transform(["a", "a", "c"]), [0, 0, 2]) assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) - le.transform(["b", "c", "d"]) + assert_array_equal(le.transform(["b", "c", "d"]), + [1, 2, 3]) + + +def test_label_encoder_new_label_nan(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels="nan") + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), + ["c", "b", "a"]) + assert_array_equal(le.transform(["b", "c", "d"]), + [1, 2, np.nan]) def test_label_encoder_new_label_arg(): From af8c6a9f8f583adf4f4cbe612031d68582eb4643 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 11:40:50 -0400 Subject: [PATCH 06/55] Fixing docstring for test-doc pass --- sklearn/preprocessing/label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 7b3a7bfee7ba1..472059b6d66da 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -65,7 +65,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): `LabelEncoder` can be used to normalize labels. >>> from sklearn import preprocessing - >>> le = preprocessing.LabelEncoder() + >>> le = preprocessing.LabelEncoder(new_values='raise') >>> le.fit([1, 2, 2, 6]) LabelEncoder() >>> le.classes_ @@ -78,7 +78,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels. - >>> le = preprocessing.LabelEncoder() + >>> le = preprocessing.LabelEncoder(new_values='raise') >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder() >>> list(le.classes_) From 8ffc839e2dd9002964fa6a922646ec25b1f21e7e Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 11:42:01 -0400 Subject: [PATCH 07/55] Fixing docstring for test-doc pass (for real) --- sklearn/preprocessing/label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 472059b6d66da..794bd4fb81cd3 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -65,7 +65,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): `LabelEncoder` can be used to normalize labels. >>> from sklearn import preprocessing - >>> le = preprocessing.LabelEncoder(new_values='raise') + >>> le = preprocessing.LabelEncoder(new_labels='raise') >>> le.fit([1, 2, 2, 6]) LabelEncoder() >>> le.classes_ @@ -78,7 +78,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels. 
- >>> le = preprocessing.LabelEncoder(new_values='raise') + >>> le = preprocessing.LabelEncoder(new_labels='raise') >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder() >>> list(le.classes_) From e6fbc479d9e063fb60c8bca861fb6618eb705f5a Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 13:07:34 -0400 Subject: [PATCH 08/55] Updating doctests --- doc/modules/preprocessing.rst | 4 ++-- sklearn/preprocessing/label.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 71653e9afe6b1..2664bd2428513 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -396,7 +396,7 @@ follows:: >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder() + LabelEncoder(new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) @@ -409,7 +409,7 @@ hashable and comparable) to numerical labels:: >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() + LabelEncoder(new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 794bd4fb81cd3..a96f864644f88 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -65,9 +65,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin): `LabelEncoder` can be used to normalize labels. 
>>> from sklearn import preprocessing - >>> le = preprocessing.LabelEncoder(new_labels='raise') + >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder() + LabelEncoder(new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS @@ -78,9 +78,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin): It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels. - >>> le = preprocessing.LabelEncoder(new_labels='raise') + >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() + LabelEncoder(new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS From 46118d9995b271296fdbc979b4f057a90dd59547 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 13:37:32 -0400 Subject: [PATCH 09/55] Updating constructor documentation --- sklearn/preprocessing/label.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index a96f864644f88..4e84a425c0634 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -49,10 +49,11 @@ class LabelEncoder(BaseEstimator, TransformerMixin): new_labels : string, optional (default: "raise") Determines how to handle new labels, i.e., data not seen in the training domain. - - If "raise", then raise ValueError. - - If "update", then re-map the new labels to classes - `[N, ..., N+m-1]`, where `m` is the number of new labels. - - If "nan", then re-map the new labels to numpy.nan. + + - If ``"raise"``, then raise ValueError. + - If ``"update"``, then re-map the new labels to + classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. + - If ``"nan"``, then re-map the new labels to ``numpy.nan``. 
Attributes From 8d21ec1fca8a437a294b7c249e38485f6773341b Mon Sep 17 00:00:00 2001 From: mjbommar Date: Thu, 5 Jun 2014 11:09:23 -0400 Subject: [PATCH 10/55] Adding specific "label" option to new_labels --- sklearn/preprocessing/label.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 4e84a425c0634..4bbe93dd7aea9 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -90,9 +90,10 @@ class LabelEncoder(BaseEstimator, TransformerMixin): ['tokyo', 'tokyo', 'paris'] """ - def __init__(self, new_labels="raise"): + def __init__(self, new_labels="raise", new_label_class=-1): """Constructor""" self.new_labels = new_labels + self.new_label_class = new_label_class def _check_fitted(self): if not hasattr(self, "classes_"): @@ -171,6 +172,19 @@ def transform(self, y): y_array[-missing_mask]) z[missing_mask] = np.nan return z + elif self.new_labels == "label": + # Create copy of array and return + y_array = np.array(y) + z = np.zeros(y_array.shape) + + # Find entries with new labels + missing_mask = np.in1d(y, diff) + + # Populate return array properly and return + z[-missing_mask] = np.searchsorted(self.classes_, + y_array[-missing_mask]) + z[missing_mask] = self.new_label_class + return z elif self.new_labels == "raise": # Return ValueError, original behavior. 
raise ValueError("y contains new labels: %s" % str(diff)) From 343c726a5a265ed4c2d4c0ae51169f43e204b082 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Thu, 5 Jun 2014 11:09:41 -0400 Subject: [PATCH 11/55] Adding test for "label" option to ``new_labels`` --- sklearn/preprocessing/tests/test_label.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index f98239bf428dc..4cfdd98e61591 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -203,6 +203,19 @@ def test_label_encoder_new_label_nan(): [1, 2, np.nan]) +def test_label_encoder_new_label_replace(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels="label", new_label_class=-2) + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), + ["c", "b", "a"]) + assert_array_equal(le.transform(["b", "c", "d"]), + [1, 2, -2]) + + def test_label_encoder_new_label_arg(): """Test LabelEncoder's new_labels argument handling""" le = LabelEncoder(new_labels="xyz") From be97c1403c2f637cc89b814b8aec218b57754114 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Thu, 5 Jun 2014 11:11:17 -0400 Subject: [PATCH 12/55] Updating docstring for ``new_labels="label"`` --- sklearn/preprocessing/label.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 4bbe93dd7aea9..ad6e7b2f394b1 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -54,6 +54,11 @@ class LabelEncoder(BaseEstimator, TransformerMixin): - If ``"update"``, then re-map the new labels to classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. - If ``"nan"``, then re-map the new labels to ``numpy.nan``. 
+ - If ``"label"``, then use the value of ``new_label_class``. + + new_label_class : integer, optional (default: -1) + If ``new_labels="label"``, then this value will be assigned to + as the class for any new labels that are encountered. Attributes From cdd7147ff3e2a74f564e962204edfc5ff507a628 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Thu, 5 Jun 2014 11:11:45 -0400 Subject: [PATCH 13/55] pep8 --- sklearn/preprocessing/label.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index ad6e7b2f394b1..5bd081c5b331d 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -55,7 +55,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. - If ``"nan"``, then re-map the new labels to ``numpy.nan``. - If ``"label"``, then use the value of ``new_label_class``. - + new_label_class : integer, optional (default: -1) If ``new_labels="label"``, then this value will be assigned to as the class for any new labels that are encountered. 
From 170d00c07baf2710d244bcae0e62de5d0da011a4 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Thu, 5 Jun 2014 11:34:23 -0400 Subject: [PATCH 14/55] Autodoc fix --- sklearn/preprocessing/label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 5bd081c5b331d..c3bff45fae0e6 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -73,7 +73,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder(new_labels='raise') + LabelEncoder(new_label_class=-1, new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS @@ -86,7 +86,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_labels='raise') + LabelEncoder(new_label_class=-1, new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS From 2d87e88a6fa015f979aaa03a15d3ff0a0735f466 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Sun, 8 Jun 2014 09:09:16 -0400 Subject: [PATCH 15/55] Fixing rst docs --- doc/modules/preprocessing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 2664bd2428513..492d2c425fec0 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -396,7 +396,7 @@ follows:: >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder(new_labels='raise') + LabelEncoder(new_label_class=-1, new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) @@ -409,7 +409,7 @@ hashable and comparable) to numerical labels:: >>> le = preprocessing.LabelEncoder() >>> 
le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_labels='raise') + LabelEncoder(new_label_class=-1, new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) From bb8d9a64725290dba18f5dcb8e48af3af8af7973 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Sun, 8 Jun 2014 14:04:07 -0400 Subject: [PATCH 16/55] Changing dtypes for new_labels --- sklearn/preprocessing/label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index c3bff45fae0e6..38b20bfce5f02 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -167,7 +167,7 @@ def transform(self, y): elif self.new_labels == "nan": # Create copy of array and return y_array = np.array(y) - z = np.zeros(y_array.shape) + z = np.zeros(y_array.shape, dtype=float) # Find entries with new labels missing_mask = np.in1d(y, diff) @@ -180,7 +180,7 @@ def transform(self, y): elif self.new_labels == "label": # Create copy of array and return y_array = np.array(y) - z = np.zeros(y_array.shape) + z = np.zeros(y_array.shape, dtype=int) # Find entries with new labels missing_mask = np.in1d(y, diff) From ab788f75c8cff94c64dad4d555c92c7e750396d5 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Sun, 8 Jun 2014 15:52:36 -0400 Subject: [PATCH 17/55] Adding example for new_labels argument --- doc/modules/preprocessing.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 492d2c425fec0..5f66971f96d20 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -417,6 +417,20 @@ hashable and comparable) to numerical labels:: >>> list(le.inverse_transform([2, 2, 1])) ['tokyo', 'tokyo', 'paris'] +By default, ``LabelEncoder`` will throw a ``ValueError`` in the event that +labels are passed in ``transform`` that were not seen in ``fit``. 
This +behavior can be handled with the ``new_labels`` parameter, which supports +``"raise"``, ``"nan"``, ``"update"``, and ``"label"`` strategies for +handling new labels. For example, the ``"label"`` strategy will assign +the unseen values a label of ``-1``. + + >>> le = preprocessing.LabelEncoder(new_labels="label") + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder(new_label_class=-1, new_labels='label') + >>> list(le.classes_) + ['amsterdam', 'paris', 'tokyo'] + >>> le.transform(["tokyo", "tokyo", "paris", "rome"]) + array([ 2, 2, 1, -1]) Imputation of missing values ============================ From a597fc36ba08da96e99d61d27d3b056f3dba3803 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:22:30 -0400 Subject: [PATCH 18/55] Adding new_labels handling to fit/fit_transform --- sklearn/preprocessing/label.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 38b20bfce5f02..b65fed2adfd38 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -116,6 +116,12 @@ def fit(self, y): ------- self : returns an instance of self. """ + # Check new_labels parameter + if self.new_labels not in ["update", "nan", "raise", "label"]: + # Raise on invalid argument. + raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) + y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) self.classes_ = np.unique(y) @@ -133,6 +139,12 @@ def fit_transform(self, y): ------- y : array-like of shape [n_samples] """ + # Check new_labels parameter + if self.new_labels not in ["update", "nan", "raise", "label"]: + # Raise on invalid argument. 
+ raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) + y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) self.classes_, y = np.unique(y, return_inverse=True) From 291d752c9bd4a581ea1b0b994e4b9de3f18dc340 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:23:06 -0400 Subject: [PATCH 19/55] Improving difficulty of test cases with non-increasing unseen labels --- sklearn/preprocessing/tests/test_label.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 4cfdd98e61591..c58d5bcf36935 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -186,8 +186,8 @@ def test_label_encoder_new_label_update(): [0, 0, 2]) assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) - assert_array_equal(le.transform(["b", "c", "d"]), - [1, 2, 3]) + assert_array_equal(le.transform(["_", "b", "c", "d"]), + [3, 1, 2, 4]) def test_label_encoder_new_label_nan(): @@ -199,8 +199,8 @@ def test_label_encoder_new_label_nan(): [0, 0, 2]) assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) - assert_array_equal(le.transform(["b", "c", "d"]), - [1, 2, np.nan]) + assert_array_equal(le.transform(["_", "b", "c", "d"]), + [np.nan, 1, 2, np.nan]) def test_label_encoder_new_label_replace(): From fe0141d545ec08ac680b76565c8800a346403312 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:27:21 -0400 Subject: [PATCH 20/55] Moving ValueError check to fit --- sklearn/preprocessing/tests/test_label.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index c58d5bcf36935..380da039c74b7 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -219,13 +219,7 @@ def 
test_label_encoder_new_label_replace(): def test_label_encoder_new_label_arg(): """Test LabelEncoder's new_labels argument handling""" le = LabelEncoder(new_labels="xyz") - le.fit(["a", "b", "b", "c"]) - assert_array_equal(le.classes_, ["a", "b", "c"]) - assert_array_equal(le.transform(["a", "a", "c"]), - [0, 0, 2]) - assert_array_equal(le.inverse_transform([2, 1, 0]), - ["c", "b", "a"]) - assert_raises(ValueError, le.transform, ["c", "d"]) + assert_raises(ValueError, le.fit, ["a", "b", "b", "c"]) def test_label_encoder_fit_transform(): From e1b7ed58fb253259dd3d8e31b1142dcbaa8cfdb4 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:50:14 -0400 Subject: [PATCH 21/55] Improving difficult for new_labels='update' test to include multiple transform with new labels --- sklearn/preprocessing/tests/test_label.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 380da039c74b7..090214f2a661d 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -186,8 +186,14 @@ def test_label_encoder_new_label_update(): [0, 0, 2]) assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) - assert_array_equal(le.transform(["_", "b", "c", "d"]), - [3, 1, 2, 4]) + assert_array_equal(le.transform(["b", "c", "_"]), + [1, 2, 3]) + assert_array_equal(le.classes_, ["a", "b", "c", "_"]) + print(le.classes_) + assert_array_equal(le.transform(["_", "z", "a"]), + [3, 4, 0]) + assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"]) + def test_label_encoder_new_label_nan(): From 9fd7736d9984f4d27150231fa3eb6736a0ba7434 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:51:24 -0400 Subject: [PATCH 22/55] Fixing negative indexing, renamed z->out, failing approach for new_labels=update w/ searchsorted --- sklearn/preprocessing/label.py | 45 ++++++++++++++++++++++------------ 1 
file changed, 29 insertions(+), 16 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index b65fed2adfd38..fb9a6c5d32163 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -168,40 +168,53 @@ def transform(self, y): _check_numpy_unicode_bug(classes) if len(np.intersect1d(classes, self.classes_)) < len(classes): diff = np.setdiff1d(classes, self.classes_) + # Create copy of array and return + y = np.array(y) # If we are mapping new labels, get "new" ID and change in copy. if self.new_labels == "update": + # Setup out + out = np.zeros(y.shape, dtype=int) + + # Find entries with new labels + missing_mask = np.in1d(y, diff) + new_class_values = np.sort(diff) + + # Populate return array properly and return + out[~missing_mask] = np.searchsorted(self.classes_, + y[~missing_mask]) + out[missing_mask] = np.searchsorted(new_class_values, + y[missing_mask]) + len(self.classes_) + # Update the class list with new labels - self.classes_ = np.append(self.classes_, np.sort(diff)) + self.classes_ = np.append(self.classes_, new_class_values) # Return mapped encoding - return np.searchsorted(self.classes_, y) + return out elif self.new_labels == "nan": - # Create copy of array and return - y_array = np.array(y) - z = np.zeros(y_array.shape, dtype=float) + # Setup out + out = np.zeros(y.shape, dtype=float) # Find entries with new labels missing_mask = np.in1d(y, diff) # Populate return array properly and return - z[-missing_mask] = np.searchsorted(self.classes_, - y_array[-missing_mask]) - z[missing_mask] = np.nan - return z + out[~missing_mask] = np.searchsorted(self.classes_, + y[~missing_mask]) + out[missing_mask] = np.nan + return out elif self.new_labels == "label": - # Create copy of array and return - y_array = np.array(y) - z = np.zeros(y_array.shape, dtype=int) + # Setup out + out = np.zeros(y.shape, dtype=int) # Find entries with new labels missing_mask = np.in1d(y, diff) # Populate return array 
properly and return - z[-missing_mask] = np.searchsorted(self.classes_, - y_array[-missing_mask]) - z[missing_mask] = self.new_label_class - return z + out[~missing_mask] = np.searchsorted(self.classes_, + y[~missing_mask]) + out[missing_mask] = self.new_label_class + return out elif self.new_labels == "raise": # Return ValueError, original behavior. raise ValueError("y contains new labels: %s" % str(diff)) From e3c14bbf0c07c7988b8f335b82c54a031545a785 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:56:03 -0400 Subject: [PATCH 23/55] PEP8 --- sklearn/preprocessing/label.py | 22 ++++++++++++---------- sklearn/preprocessing/tests/test_label.py | 1 - 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index fb9a6c5d32163..7e016d068035d 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -1,6 +1,6 @@ # Authors: Alexandre Gramfort -# Mathieu Blondel -# Olivier Grisel +# Mathieu Blondel +# Olivier Grisel # Andreas Mueller # License: BSD 3 clause @@ -95,6 +95,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): ['tokyo', 'tokyo', 'paris'] """ + def __init__(self, new_labels="raise", new_label_class=-1): """Constructor""" self.new_labels = new_labels @@ -119,8 +120,8 @@ def fit(self, y): # Check new_labels parameter if self.new_labels not in ["update", "nan", "raise", "label"]: # Raise on invalid argument. - raise ValueError("Value of argument `new_labels`={0} " - "is unknown.".format(self.new_labels)) + raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) @@ -142,8 +143,8 @@ def fit_transform(self, y): # Check new_labels parameter if self.new_labels not in ["update", "nan", "raise", "label"]: # Raise on invalid argument. 
- raise ValueError("Value of argument `new_labels`={0} " - "is unknown.".format(self.new_labels)) + raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) @@ -182,9 +183,10 @@ def transform(self, y): # Populate return array properly and return out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) + y[~missing_mask]) out[missing_mask] = np.searchsorted(new_class_values, - y[missing_mask]) + len(self.classes_) + y[missing_mask]) + \ + len(self.classes_) # Update the class list with new labels self.classes_ = np.append(self.classes_, new_class_values) @@ -200,7 +202,7 @@ def transform(self, y): # Populate return array properly and return out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) + y[~missing_mask]) out[missing_mask] = np.nan return out elif self.new_labels == "label": @@ -212,7 +214,7 @@ def transform(self, y): # Populate return array properly and return out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) + y[~missing_mask]) out[missing_mask] = self.new_label_class return out elif self.new_labels == "raise": diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 090214f2a661d..f04acbc86ca31 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -195,7 +195,6 @@ def test_label_encoder_new_label_update(): assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"]) - def test_label_encoder_new_label_nan(): """Test LabelEncoder's transform on new labels""" le = LabelEncoder(new_labels="nan") From fe797363a9fa7e57f44c1dc0fe2e52db20d7f91d Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Sat, 19 Jul 2014 09:02:49 -0400 Subject: [PATCH 24/55] Removing nan option and corresponding test --- sklearn/preprocessing/label.py | 17 ++--------------- sklearn/preprocessing/tests/test_label.py | 13 ------------- 2 
files changed, 2 insertions(+), 28 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 7e016d068035d..c69aa49d78d7b 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -53,7 +53,6 @@ class LabelEncoder(BaseEstimator, TransformerMixin): - If ``"raise"``, then raise ValueError. - If ``"update"``, then re-map the new labels to classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. - - If ``"nan"``, then re-map the new labels to ``numpy.nan``. - If ``"label"``, then use the value of ``new_label_class``. new_label_class : integer, optional (default: -1) @@ -118,7 +117,7 @@ def fit(self, y): self : returns an instance of self. """ # Check new_labels parameter - if self.new_labels not in ["update", "nan", "raise", "label"]: + if self.new_labels not in ["update", "raise", "label"]: # Raise on invalid argument. raise ValueError("Value of argument `new_labels`={0} " "is unknown.".format(self.new_labels)) @@ -141,7 +140,7 @@ def fit_transform(self, y): y : array-like of shape [n_samples] """ # Check new_labels parameter - if self.new_labels not in ["update", "nan", "raise", "label"]: + if self.new_labels not in ["update", "raise", "label"]: # Raise on invalid argument. 
raise ValueError("Value of argument `new_labels`={0} " "is unknown.".format(self.new_labels)) @@ -193,18 +192,6 @@ def transform(self, y): # Return mapped encoding return out - elif self.new_labels == "nan": - # Setup out - out = np.zeros(y.shape, dtype=float) - - # Find entries with new labels - missing_mask = np.in1d(y, diff) - - # Populate return array properly and return - out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) - out[missing_mask] = np.nan - return out elif self.new_labels == "label": # Setup out out = np.zeros(y.shape, dtype=int) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index f04acbc86ca31..9d028e6aaccea 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -195,19 +195,6 @@ def test_label_encoder_new_label_update(): assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"]) -def test_label_encoder_new_label_nan(): - """Test LabelEncoder's transform on new labels""" - le = LabelEncoder(new_labels="nan") - le.fit(["a", "b", "b", "c"]) - assert_array_equal(le.classes_, ["a", "b", "c"]) - assert_array_equal(le.transform(["a", "a", "c"]), - [0, 0, 2]) - assert_array_equal(le.inverse_transform([2, 1, 0]), - ["c", "b", "a"]) - assert_array_equal(le.transform(["_", "b", "c", "d"]), - [np.nan, 1, 2, np.nan]) - - def test_label_encoder_new_label_replace(): """Test LabelEncoder's transform on new labels""" le = LabelEncoder(new_labels="label", new_label_class=-2) From b83b37f4774fd3033afe4a51f6a1cb8edc83292a Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Sat, 19 Jul 2014 11:10:12 -0400 Subject: [PATCH 25/55] Handling repeated transform calls with new_class_mapping_, refactoring, cleaning after removing np.nan. 
--- sklearn/preprocessing/label.py | 100 +++++++++++++--------- sklearn/preprocessing/tests/test_label.py | 19 ++-- 2 files changed, 74 insertions(+), 45 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index c69aa49d78d7b..8680501c4a258 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -4,6 +4,7 @@ # Andreas Mueller # License: BSD 3 clause +import operator import numpy as np from ..base import BaseEstimator, TransformerMixin @@ -53,18 +54,19 @@ class LabelEncoder(BaseEstimator, TransformerMixin): - If ``"raise"``, then raise ValueError. - If ``"update"``, then re-map the new labels to classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. - - If ``"label"``, then use the value of ``new_label_class``. - - new_label_class : integer, optional (default: -1) - If ``new_labels="label"``, then this value will be assigned to - as the class for any new labels that are encountered. - + - If an integer value is passed, then use re-label with this value. + N.B. that default values are in [0, 1, ...], so caution should be + taken if a non-negative value is passed to not accidentally + intersect. Attributes ---------- `classes_` : array of shape (n_class,) Holds the label for each class. + `new_label_mapping_` : dictionary + Stores the mapping for classes not seen during original ``fit``. + Examples -------- `LabelEncoder` can be used to normalize labels. @@ -95,15 +97,34 @@ class LabelEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, new_labels="raise", new_label_class=-1): + def __init__(self, new_labels="raise"): """Constructor""" self.new_labels = new_labels - self.new_label_class = new_label_class + self.new_label_mapping_ = {} def _check_fitted(self): if not hasattr(self, "classes_"): raise ValueError("LabelEncoder was not fitted yet.") + def get_classes(self): + """Get classes that have been observed by the encoder. 
Note that this + method returns classes seen both at original ``fit`` time (i.e., + ``self.classes_``) and classes seen after ``fit`` (i.e., + ``self.new_label_mapping_.keys()``) for applicable values of + ``new_labels``. + + Returns + ------- + classes : array-like of shape [n_classes] + """ + # If we've seen updates, include them in the order they were added. + if len(self.new_label_mapping_) > 0: + sorted_new, _ = zip(*sorted(self.new_label_mapping_.iteritems(), + key=operator.itemgetter(1))) + return np.append(self.classes_, sorted_new) + else: + return self.classes_ + def fit(self, y): """Fit label encoder @@ -117,10 +138,12 @@ def fit(self, y): self : returns an instance of self. """ # Check new_labels parameter - if self.new_labels not in ["update", "raise", "label"]: + if self.new_labels not in ["update", "raise"] and \ + type(self.new_labels) not in [int]: # Raise on invalid argument. raise ValueError("Value of argument `new_labels`={0} " - "is unknown.".format(self.new_labels)) + "is unknown and not integer." + .format(self.new_labels)) y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) @@ -140,10 +163,12 @@ def fit_transform(self, y): y : array-like of shape [n_samples] """ # Check new_labels parameter - if self.new_labels not in ["update", "raise", "label"]: + if self.new_labels not in ["update", "raise"] and \ + type(self.new_labels) not in [int]: # Raise on invalid argument. raise ValueError("Value of argument `new_labels`={0} " - "is unknown.".format(self.new_labels)) + "is unknown and not integer." 
+ .format(self.new_labels)) y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) @@ -166,47 +191,42 @@ def transform(self, y): classes = np.unique(y) _check_numpy_unicode_bug(classes) - if len(np.intersect1d(classes, self.classes_)) < len(classes): - diff = np.setdiff1d(classes, self.classes_) + if len(np.intersect1d(classes, self.get_classes())) < len(classes): + # Get the new classes + diff_fit = np.setdiff1d(classes, self.classes_) + diff_new = np.setdiff1d(classes, self.get_classes()) + # Create copy of array and return y = np.array(y) # If we are mapping new labels, get "new" ID and change in copy. if self.new_labels == "update": - # Setup out - out = np.zeros(y.shape, dtype=int) - - # Find entries with new labels - missing_mask = np.in1d(y, diff) - new_class_values = np.sort(diff) + # Update the new label mapping + next_label = len(self.get_classes()) + self.new_label_mapping_.update(dict(zip(diff_new, + range(next_label, + next_label + + len(diff_new))))) - # Populate return array properly and return - out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) - out[missing_mask] = np.searchsorted(new_class_values, - y[missing_mask]) + \ - len(self.classes_) - - # Update the class list with new labels - self.classes_ = np.append(self.classes_, new_class_values) + # Find entries with new labels + missing_mask = np.in1d(y, diff_fit) - # Return mapped encoding + # Populate return array properly by mask and return + out = np.searchsorted(self.classes_, y) + out[missing_mask] = [self.new_label_mapping_[value] + for value in y[missing_mask]] return out - elif self.new_labels == "label": - # Setup out - out = np.zeros(y.shape, dtype=int) - + elif type(self.new_labels) in [int]: # Find entries with new labels - missing_mask = np.in1d(y, diff) + missing_mask = np.in1d(y, diff_fit) - # Populate return array properly and return - out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) - out[missing_mask] = self.new_label_class + # 
Populate return array properly by mask and return + out = np.searchsorted(self.classes_, y) + out[missing_mask] = self.new_labels return out elif self.new_labels == "raise": # Return ValueError, original behavior. - raise ValueError("y contains new labels: %s" % str(diff)) + raise ValueError("y contains new labels: %s" % str(diff_fit)) else: # Raise on invalid argument. raise ValueError("Value of argument `new_labels`={0} " diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 9d028e6aaccea..9b7de7ba51c42 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -177,6 +177,17 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) +def test_label_encoder_get_classes(): + """Test LabelEncoder's get_classes method.""" + le = LabelEncoder(new_labels="update") + le.fit([1, 1, 4, 5, -1, 0]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.classes_, le.get_classes()) + le.transform([10]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.get_classes(), [-1, 0, 1, 4, 5, 10]) + + def test_label_encoder_new_label_update(): """Test LabelEncoder's transform on new labels""" le = LabelEncoder(new_labels="update") @@ -188,16 +199,14 @@ def test_label_encoder_new_label_update(): ["c", "b", "a"]) assert_array_equal(le.transform(["b", "c", "_"]), [1, 2, 3]) - assert_array_equal(le.classes_, ["a", "b", "c", "_"]) - print(le.classes_) + assert_array_equal(le.get_classes(), ["a", "b", "c", "_"]) assert_array_equal(le.transform(["_", "z", "a"]), [3, 4, 0]) - assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"]) def test_label_encoder_new_label_replace(): """Test LabelEncoder's transform on new labels""" - le = LabelEncoder(new_labels="label", new_label_class=-2) + le = LabelEncoder(new_labels=-99) le.fit(["a", "b", "b", "c"]) assert_array_equal(le.classes_, ["a", "b", "c"]) 
assert_array_equal(le.transform(["a", "a", "c"]), @@ -205,7 +214,7 @@ def test_label_encoder_new_label_replace(): assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) assert_array_equal(le.transform(["b", "c", "d"]), - [1, 2, -2]) + [1, 2, -99]) def test_label_encoder_new_label_arg(): From 0b8e63cff88ecb3474fd694119d6efb380b94460 Mon Sep 17 00:00:00 2001 From: pvnguyen Date: Mon, 21 Jul 2014 14:44:42 -0700 Subject: [PATCH 26/55] Update outlier_detection.rst Add reference for One-class SVM. --- doc/modules/outlier_detection.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index ee7c483c73a7e..a99758989e195 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -53,8 +53,8 @@ coming from the same population than the initial observations. Otherwise, if they lay outside the frontier, we can say that they are abnormal with a given confidence in our assessment. -The One-Class SVM has been introduced in [1] for that purpose and -implemented in the :ref:`svm` module in the +The One-Class SVM has been introduced by Schölkopf et al. for that purpose +and implemented in the :ref:`svm` module in the :class:`svm.OneClassSVM` object. It requires the choice of a kernel and a scalar parameter to define a frontier. The RBF kernel is usually chosen although there exists no exact formula or algorithm to @@ -63,6 +63,12 @@ implementation. The :math:`\nu` parameter, also known as the margin of the One-Class SVM, corresponds to the probability of finding a new, but regular, observation outside the frontier. +.. topic:: References: + + * `Estimating the support of a high-dimensional distribution + `_ Schölkopf, + Bernhard, et al. Neural computation 13.7 (2001): 1443-1471. + .. topic:: Examples: * See :ref:`example_svm_plot_oneclass.py` for visualizing the @@ -73,7 +79,7 @@ but regular, observation outside the frontier. 
:target: ../auto_examples/svm/plot_oneclasse.html :align: center :scale: 75% - + Outlier Detection ================= From 62f1f57ce586fb8c74be6d1e0a736565c9f04b45 Mon Sep 17 00:00:00 2001 From: Kyle Kastner Date: Wed, 23 Jul 2014 16:31:23 +0200 Subject: [PATCH 27/55] Added directory checking for documentation builds, and corrected for Windows pathing --- doc/sphinxext/gen_rst.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/sphinxext/gen_rst.py b/doc/sphinxext/gen_rst.py index 213484a515481..9a8d2535a2b26 100644 --- a/doc/sphinxext/gen_rst.py +++ b/doc/sphinxext/gen_rst.py @@ -468,7 +468,11 @@ def generate_example_rst(app): examples. """ root_dir = os.path.join(app.builder.srcdir, 'auto_examples') - example_dir = os.path.abspath(app.builder.srcdir + '/../' + 'examples') + example_dir = os.path.abspath(os.path.join(app.builder.srcdir, '..', + 'examples')) + generated_dir = os.path.abspath(os.path.join(app.builder.srcdir, + 'modules', 'generated')) + try: plot_gallery = eval(app.builder.config.plot_gallery) except TypeError: @@ -477,10 +481,12 @@ def generate_example_rst(app): os.makedirs(example_dir) if not os.path.exists(root_dir): os.makedirs(root_dir) + if not os.path.exists(generated_dir): + os.makedirs(generated_dir) # we create an index.rst with all examples fhindex = open(os.path.join(root_dir, 'index.rst'), 'w') - #Note: The sidebar button has been removed from the examples page for now + # Note: The sidebar button has been removed from the examples page for now # due to how it messes up the layout. 
Will be fixed at a later point fhindex.write("""\ From d814353cc3c93536eea3df8a0dcc765ea18f0dfa Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 23 Jul 2014 14:44:13 +0200 Subject: [PATCH 28/55] MAINT More robust windows installation script --- appveyor.yml | 11 ++++++++--- continuous_integration/appveyor/install.ps1 | 21 ++++++++++++++++----- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 54bd67e5f0c26..7d91bf06eecf1 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -13,7 +13,7 @@ environment: PYTHON_VERSION: "2.7.8" PYTHON_ARCH: "32" - - PYTHON: "C:\\Python27" + - PYTHON: "C:\\Python27_64" PYTHON_VERSION: "2.7.8" PYTHON_ARCH: "64" @@ -21,10 +21,15 @@ environment: PYTHON_VERSION: "3.4.1" PYTHON_ARCH: "32" - - PYTHON: "C:\\Python34" + - PYTHON: "C:\\Python34_64" PYTHON_VERSION: "3.4.1" PYTHON_ARCH: "64" +branches: + only: + - master + - 0.15.X + install: # Install Python (from the official .msi of http://python.org) and pip when # not already installed. @@ -53,7 +58,7 @@ test_script: # Skip joblib tests that require multiprocessing as they are prone to random # slow down - - "python -c \"import nose; nose.main()\" -v -s sklearn" + - "python -c \"import nose; nose.main()\" -s sklearn" artifacts: # Archive the generated wheel package in the ci.appveyor.com build report. 
diff --git a/continuous_integration/appveyor/install.ps1 b/continuous_integration/appveyor/install.ps1 index fc06c58078965..2a96d3372ecab 100644 --- a/continuous_integration/appveyor/install.ps1 +++ b/continuous_integration/appveyor/install.ps1 @@ -52,12 +52,17 @@ function InstallPython ($python_version, $architecture, $python_home) { } else { $platform_suffix = ".amd64" } - $filepath = DownloadPython $python_version $platform_suffix - Write-Host "Installing" $filepath "to" $python_home + $msipath = DownloadPython $python_version $platform_suffix + Write-Host "Installing" $msipath "to" $python_home $install_log = $python_home + ".log" - $args = "/qn /log $install_log /i $filepath TARGETDIR=$python_home" - Write-Host "msiexec.exe" $args - Start-Process -FilePath "msiexec.exe" -ArgumentList $args -Wait -Passthru + $install_args = "/qn /log $install_log /i $msipath TARGETDIR=$python_home" + $uninstall_args = "/qn /x $msipath" + RunCommand "msiexec.exe" $install_args + if (-not(Test-Path $python_home)) { + Write-Host "Python seems to be installed else-where, reinstalling." 
+ RunCommand "msiexec.exe" $uninstall_args + RunCommand "msiexec.exe" $install_args + } if (Test-Path $python_home) { Write-Host "Python $python_version ($architecture) installation complete" } else { @@ -67,6 +72,11 @@ function InstallPython ($python_version, $architecture, $python_home) { } } +function RunCommand ($command, $command_args) { + Write-Host $command $command_args + Start-Process -FilePath $command -ArgumentList $command_args -Wait -Passthru +} + function InstallPip ($python_home) { $pip_path = $python_home + "\Scripts\pip.exe" @@ -82,6 +92,7 @@ function InstallPip ($python_home) { } } + function main () { InstallPython $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON InstallPip $env:PYTHON From 0af2d8f96f1bf424235f2587effec7afd249c045 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 23 Jul 2014 16:04:29 +0200 Subject: [PATCH 29/55] MAINT move skip for unstable 32bit to _check_transformer --- sklearn/utils/estimator_checks.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 874dae7c338a8..c114d45887794 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -136,15 +136,6 @@ def check_regressors_classifiers_sparse_data(name, Estimator): def check_transformer(name, Transformer): - if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit(): - # Those transformers yield non-deterministic output when executed on - # a 32bit Python. The same transformers are stable on 64bit Python. - # FIXME: try to isolate a minimalistic reproduction case only depending - # on numpy & scipy and/or maybe generate a test dataset that does not - # cause such unstable behaviors. 
- msg = name + ' is non deterministic on 32bit Python' - raise SkipTest(msg) - X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X = StandardScaler().fit_transform(X) @@ -166,6 +157,14 @@ def check_transformer_data_not_an_array(name, Transformer): def _check_transformer(name, Transformer, X, y): + if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit(): + # Those transformers yield non-deterministic output when executed on + # a 32bit Python. The same transformers are stable on 64bit Python. + # FIXME: try to isolate a minimalistic reproduction case only depending + # on numpy & scipy and/or maybe generate a test dataset that does not + # cause such unstable behaviors. + msg = name + ' is non deterministic on 32bit Python' + raise SkipTest(msg) n_samples, n_features = np.asarray(X).shape # catch deprecation warnings with warnings.catch_warnings(record=True): From f3afd4e5e294baae0edca2e1b5911b766224ca17 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 23 Jul 2014 15:18:16 +0200 Subject: [PATCH 30/55] FIX unstable test on 32 bit windows --- sklearn/feature_selection/tests/test_feature_select.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 2734eb5ab1729..5ef88c41a7b61 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -48,8 +48,8 @@ def test_f_oneway_ints(): # test that is gives the same result as with float f, p = f_oneway(X.astype(np.float), y) - assert_array_almost_equal(f, fint, decimal=5) - assert_array_almost_equal(p, pint, decimal=5) + assert_array_almost_equal(f, fint, decimal=4) + assert_array_almost_equal(p, pint, decimal=4) def test_f_classif(): From 4b6978e754d3dde7fa880b5f06df7247ab33ea69 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Tue, 3 Jun 2014 
22:31:57 -0400 Subject: [PATCH 31/55] Adding new_labels argument to LabelEncoder --- sklearn/preprocessing/label.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index f304bf6104cae..774aa0ccf4cd3 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -53,6 +53,15 @@ def _check_numpy_unicode_bug(labels): class LabelEncoder(BaseEstimator, TransformerMixin): """Encode labels with value between 0 and n_classes-1. + Parameters + ---------- + + new_labels : string, optional (default: "raise") + Determines how to handle newly seen labels, i.e., data + not seen in the fit domain. If "raise", then raise ValueError; + if "map", then re-map the new labels to class N, where seen + classes are in {0, ..., N-1}. + Attributes ---------- `classes_` : array of shape (n_class,) @@ -87,6 +96,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin): ['tokyo', 'tokyo', 'paris'] """ + def __init__(self, new_labels="raise"): + """Constructor""" + self.new_labels = new_labels def _check_fitted(self): if not hasattr(self, "classes_"): @@ -144,7 +156,27 @@ def transform(self, y): _check_numpy_unicode_bug(classes) if len(np.intersect1d(classes, self.classes_)) < len(classes): diff = np.setdiff1d(classes, self.classes_) - raise ValueError("y contains new labels: %s" % str(diff)) + + # If we are mapping new labels, get "new" ID and change in copy. 
+ if self.new_labels == "map": + # Get new ID and append to class list + missing_id = len(self.classes_) + self.classes_.resize(len(self.classes_)+1) + self.classes_[-1] = missing_id + + # Reset the value in y_copy + missing_mask = np.in1d(y, diff) + y_copy = np.array(y) + y_copy[missing_mask] = missing_id + + # Return mapped encoding + return np.searchsorted(self.classes_, y_copy) + elif self.new_labels == "raise": + raise ValueError("y contains new labels: %s" % str(diff)) + else: + raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) + return np.searchsorted(self.classes_, y) def inverse_transform(self, y): From d99020715163838c0bb3ff792e82b5e26b824fde Mon Sep 17 00:00:00 2001 From: mjbommar Date: Tue, 3 Jun 2014 22:32:10 -0400 Subject: [PATCH 32/55] Adding tests for new_labels argument. --- sklearn/preprocessing/tests/test_label.py | 24 +++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index dfdb2d23a2134..f8e20645e6d6e 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -210,6 +210,30 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) +def test_label_encoder_new_label(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels="map") + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), + ["c", "b", "a"]) + le.transform(["b", "c", "d"]) + + +def test_label_encoder_new_label_arg(): + """Test LabelEncoder's new_labels argument handling""" + le = LabelEncoder(new_labels="xyz") + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 
0]), + ["c", "b", "a"]) + assert_raises(ValueError, le.transform, ["c", "d"]) + + def test_label_encoder_fit_transform(): """Test fit_transform""" le = LabelEncoder() From a69840bcfcaf5e67796d741a930136bdea9ed820 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 08:01:33 -0400 Subject: [PATCH 33/55] Changing classes_ update strategy --- sklearn/preprocessing/label.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 774aa0ccf4cd3..b360b5b9f44c9 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -161,8 +161,7 @@ def transform(self, y): if self.new_labels == "map": # Get new ID and append to class list missing_id = len(self.classes_) - self.classes_.resize(len(self.classes_)+1) - self.classes_[-1] = missing_id + self.classes_ = np.append(self.classes_, missing_id) # Reset the value in y_copy missing_mask = np.in1d(y, diff) From fce9fb541c25973e6ff9dc1e578d7c84c77df396 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 08:31:33 -0400 Subject: [PATCH 34/55] Adding nan behavior, renaming to --- sklearn/preprocessing/label.py | 38 ++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index b360b5b9f44c9..36c872468f6f8 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -57,10 +57,13 @@ class LabelEncoder(BaseEstimator, TransformerMixin): ---------- new_labels : string, optional (default: "raise") - Determines how to handle newly seen labels, i.e., data - not seen in the fit domain. If "raise", then raise ValueError; - if "map", then re-map the new labels to class N, where seen - classes are in {0, ..., N-1}. + Determines how to handle new labels, i.e., data + not seen in the training domain. + - If "raise", then raise ValueError. 
+ - If "update", then re-map the new labels to classes + `[N, ..., N+m-1]`, where `m` is the number of new labels. + - If "nan", then re-map the new labels to numpy.nan. + Attributes ---------- @@ -158,21 +161,30 @@ def transform(self, y): diff = np.setdiff1d(classes, self.classes_) # If we are mapping new labels, get "new" ID and change in copy. - if self.new_labels == "map": - # Get new ID and append to class list - missing_id = len(self.classes_) - self.classes_ = np.append(self.classes_, missing_id) + if self.new_labels == "update": + # Update the class list with new labels + self.classes_ = np.append(self.classes_, np.sort(diff)) + + # Return mapped encoding + return np.searchsorted(self.classes_, y) + elif self.new_labels == "nan": + # Create copy of array and return + y_array = np.array(y) + z = np.zeros(y_array.shape) - # Reset the value in y_copy + # Find entries with new labels missing_mask = np.in1d(y, diff) - y_copy = np.array(y) - y_copy[missing_mask] = missing_id - # Return mapped encoding - return np.searchsorted(self.classes_, y_copy) + # Populate return array properly and return + z[-missing_mask] = np.searchsorted(self.classes_, + y_array[-missing_mask]) + z[missing_mask] = np.nan + return z elif self.new_labels == "raise": + # Return ValueError, original behavior. raise ValueError("y contains new labels: %s" % str(diff)) else: + # Raise on invalid argument. 
raise ValueError("Value of argument `new_labels`={0} " "is unknown.".format(self.new_labels)) From 76921e54931813c171d5612d541fbddd0539e128 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 08:32:10 -0400 Subject: [PATCH 35/55] Updating tests to include nan case and update name --- sklearn/preprocessing/tests/test_label.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index f8e20645e6d6e..14eb7c3e1fe2b 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -210,16 +210,30 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) -def test_label_encoder_new_label(): +def test_label_encoder_new_label_update(): """Test LabelEncoder's transform on new labels""" - le = LabelEncoder(new_labels="map") + le = LabelEncoder(new_labels="update") le.fit(["a", "b", "b", "c"]) assert_array_equal(le.classes_, ["a", "b", "c"]) assert_array_equal(le.transform(["a", "a", "c"]), [0, 0, 2]) assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) - le.transform(["b", "c", "d"]) + assert_array_equal(le.transform(["b", "c", "d"]), + [1, 2, 3]) + + +def test_label_encoder_new_label_nan(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels="nan") + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), + ["c", "b", "a"]) + assert_array_equal(le.transform(["b", "c", "d"]), + [1, 2, np.nan]) def test_label_encoder_new_label_arg(): From 0e39a2ae5eb136aa7376410ea219c5fda3d54c83 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 11:40:50 -0400 Subject: [PATCH 36/55] Fixing docstring for test-doc pass --- sklearn/preprocessing/label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 36c872468f6f8..e0f3a8598241c 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -75,7 +75,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): `LabelEncoder` can be used to normalize labels. >>> from sklearn import preprocessing - >>> le = preprocessing.LabelEncoder() + >>> le = preprocessing.LabelEncoder(new_values='raise') >>> le.fit([1, 2, 2, 6]) LabelEncoder() >>> le.classes_ @@ -88,7 +88,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels. - >>> le = preprocessing.LabelEncoder() + >>> le = preprocessing.LabelEncoder(new_values='raise') >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder() >>> list(le.classes_) From 1da288087f7b3c121ca68eaac18040842b34cb36 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 11:42:01 -0400 Subject: [PATCH 37/55] Fixing docstring for test-doc pass (for real) --- sklearn/preprocessing/label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index e0f3a8598241c..294d9f81ddc4e 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -75,7 +75,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): `LabelEncoder` can be used to normalize labels. >>> from sklearn import preprocessing - >>> le = preprocessing.LabelEncoder(new_values='raise') + >>> le = preprocessing.LabelEncoder(new_labels='raise') >>> le.fit([1, 2, 2, 6]) LabelEncoder() >>> le.classes_ @@ -88,7 +88,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels. 
- >>> le = preprocessing.LabelEncoder(new_values='raise') + >>> le = preprocessing.LabelEncoder(new_labels='raise') >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder() >>> list(le.classes_) From 926b1666678390899e93917ecd188b02f47cd79c Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 13:07:34 -0400 Subject: [PATCH 38/55] Updating doctests --- doc/modules/preprocessing.rst | 4 ++-- sklearn/preprocessing/label.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 4d3b04ade3c7b..fc928df3833c2 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -397,7 +397,7 @@ follows:: >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder() + LabelEncoder(new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) @@ -410,7 +410,7 @@ hashable and comparable) to numerical labels:: >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() + LabelEncoder(new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 294d9f81ddc4e..5534f3b1cbd9f 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -75,9 +75,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin): `LabelEncoder` can be used to normalize labels. 
>>> from sklearn import preprocessing - >>> le = preprocessing.LabelEncoder(new_labels='raise') + >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder() + LabelEncoder(new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS @@ -88,9 +88,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin): It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels. - >>> le = preprocessing.LabelEncoder(new_labels='raise') + >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() + LabelEncoder(new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS From 5ef9b85119ef10b6335e1f9708562c6f704c0da4 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Wed, 4 Jun 2014 13:37:32 -0400 Subject: [PATCH 39/55] Updating constructor documentation --- sklearn/preprocessing/label.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 5534f3b1cbd9f..364d2baed0096 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -59,10 +59,11 @@ class LabelEncoder(BaseEstimator, TransformerMixin): new_labels : string, optional (default: "raise") Determines how to handle new labels, i.e., data not seen in the training domain. - - If "raise", then raise ValueError. - - If "update", then re-map the new labels to classes - `[N, ..., N+m-1]`, where `m` is the number of new labels. - - If "nan", then re-map the new labels to numpy.nan. + + - If ``"raise"``, then raise ValueError. + - If ``"update"``, then re-map the new labels to + classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. + - If ``"nan"``, then re-map the new labels to ``numpy.nan``. 
Attributes From 4dfb4cb5888a3985b38d4c89e28a7cc23e77b0d9 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Thu, 5 Jun 2014 11:09:23 -0400 Subject: [PATCH 40/55] Adding specific "label" option to new_labels --- sklearn/preprocessing/label.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 364d2baed0096..7c64a6a2f22fd 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -100,9 +100,10 @@ class LabelEncoder(BaseEstimator, TransformerMixin): ['tokyo', 'tokyo', 'paris'] """ - def __init__(self, new_labels="raise"): + def __init__(self, new_labels="raise", new_label_class=-1): """Constructor""" self.new_labels = new_labels + self.new_label_class = new_label_class def _check_fitted(self): if not hasattr(self, "classes_"): @@ -181,6 +182,19 @@ def transform(self, y): y_array[-missing_mask]) z[missing_mask] = np.nan return z + elif self.new_labels == "label": + # Create copy of array and return + y_array = np.array(y) + z = np.zeros(y_array.shape) + + # Find entries with new labels + missing_mask = np.in1d(y, diff) + + # Populate return array properly and return + z[-missing_mask] = np.searchsorted(self.classes_, + y_array[-missing_mask]) + z[missing_mask] = self.new_label_class + return z elif self.new_labels == "raise": # Return ValueError, original behavior. 
raise ValueError("y contains new labels: %s" % str(diff)) From 392e54b03f71c1ac7308c856e874a409ecf2f4bc Mon Sep 17 00:00:00 2001 From: mjbommar Date: Thu, 5 Jun 2014 11:09:41 -0400 Subject: [PATCH 41/55] Adding test for "label" option to ``new_labels`` --- sklearn/preprocessing/tests/test_label.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 14eb7c3e1fe2b..7347f73c8cdd7 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -236,6 +236,19 @@ def test_label_encoder_new_label_nan(): [1, 2, np.nan]) +def test_label_encoder_new_label_replace(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels="label", new_label_class=-2) + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), + ["c", "b", "a"]) + assert_array_equal(le.transform(["b", "c", "d"]), + [1, 2, -2]) + + def test_label_encoder_new_label_arg(): """Test LabelEncoder's new_labels argument handling""" le = LabelEncoder(new_labels="xyz") From e05363553bb869d8eaef29acb796450899276dc0 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Thu, 5 Jun 2014 11:11:17 -0400 Subject: [PATCH 42/55] Updating docstring for ``new_labels="label"`` --- sklearn/preprocessing/label.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 7c64a6a2f22fd..55a289c492dfd 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -64,6 +64,11 @@ class LabelEncoder(BaseEstimator, TransformerMixin): - If ``"update"``, then re-map the new labels to classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. - If ``"nan"``, then re-map the new labels to ``numpy.nan``. 
+ - If ``"label"``, then use the value of ``new_label_class``. + + new_label_class : integer, optional (default: -1) + If ``new_labels="label"``, then this value will be assigned to + as the class for any new labels that are encountered. Attributes From 122a98fe6ce0f7cf5f9f87172b73129d5d6412d4 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Thu, 5 Jun 2014 11:11:45 -0400 Subject: [PATCH 43/55] pep8 --- sklearn/preprocessing/label.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 55a289c492dfd..e6afbac5535bb 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -65,7 +65,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. - If ``"nan"``, then re-map the new labels to ``numpy.nan``. - If ``"label"``, then use the value of ``new_label_class``. - + new_label_class : integer, optional (default: -1) If ``new_labels="label"``, then this value will be assigned to as the class for any new labels that are encountered. 
From de183728162070a8bcd9d0edbd6209dfb9c40665 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Thu, 5 Jun 2014 11:34:23 -0400 Subject: [PATCH 44/55] Autodoc fix --- sklearn/preprocessing/label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index e6afbac5535bb..c1d8403945919 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -83,7 +83,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder(new_labels='raise') + LabelEncoder(new_label_class=-1, new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS @@ -96,7 +96,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_labels='raise') + LabelEncoder(new_label_class=-1, new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS From d735ca27df70bb213e6a346a379259fe32bdba55 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Sun, 8 Jun 2014 09:09:16 -0400 Subject: [PATCH 45/55] Fixing rst docs --- doc/modules/preprocessing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index fc928df3833c2..7d4650cba9871 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -397,7 +397,7 @@ follows:: >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder(new_labels='raise') + LabelEncoder(new_label_class=-1, new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) @@ -410,7 +410,7 @@ hashable and comparable) to numerical labels:: >>> le = preprocessing.LabelEncoder() >>> 
le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_labels='raise') + LabelEncoder(new_label_class=-1, new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) From d276565787c343ffc4bd46b6a6d8879a8cf32582 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Sun, 8 Jun 2014 14:04:07 -0400 Subject: [PATCH 46/55] Changing dtypes for new_labels --- sklearn/preprocessing/label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index c1d8403945919..ec50567c84634 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -177,7 +177,7 @@ def transform(self, y): elif self.new_labels == "nan": # Create copy of array and return y_array = np.array(y) - z = np.zeros(y_array.shape) + z = np.zeros(y_array.shape, dtype=float) # Find entries with new labels missing_mask = np.in1d(y, diff) @@ -190,7 +190,7 @@ def transform(self, y): elif self.new_labels == "label": # Create copy of array and return y_array = np.array(y) - z = np.zeros(y_array.shape) + z = np.zeros(y_array.shape, dtype=int) # Find entries with new labels missing_mask = np.in1d(y, diff) From a01f8b0f1f5c353c1f0cea664a0887fee9265d49 Mon Sep 17 00:00:00 2001 From: mjbommar Date: Sun, 8 Jun 2014 15:52:36 -0400 Subject: [PATCH 47/55] Adding example for new_labels argument --- doc/modules/preprocessing.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 7d4650cba9871..2915eb9c45dd1 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -418,6 +418,20 @@ hashable and comparable) to numerical labels:: >>> list(le.inverse_transform([2, 2, 1])) ['tokyo', 'tokyo', 'paris'] +By default, ``LabelEncoder`` will throw a ``ValueError`` in the event that +labels are passed in ``transform`` that were not seen in ``fit``. 
This +behavior can be handled with the ``new_labels`` parameter, which supports +``"raise"``, ``"nan"``, ``"update"``, and ``"label"`` strategies for +handling new labels. For example, the ``"label"`` strategy will assign +the unseen values a label of ``-1``. + + >>> le = preprocessing.LabelEncoder(new_labels="label") + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder(new_label_class=-1, new_labels='label') + >>> list(le.classes_) + ['amsterdam', 'paris', 'tokyo'] + >>> le.transform(["tokyo", "tokyo", "paris", "rome"]) + array([ 2, 2, 1, -1]) Imputation of missing values ============================ From 495347c210f985879773bff981752ed3740ea7eb Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:22:30 -0400 Subject: [PATCH 48/55] Adding new_labels handling to fit/fit_transform --- sklearn/preprocessing/label.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index ec50567c84634..24e44158f4681 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -126,6 +126,12 @@ def fit(self, y): ------- self : returns an instance of self. """ + # Check new_labels parameter + if self.new_labels not in ["update", "nan", "raise", "label"]: + # Raise on invalid argument. + raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) + y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) self.classes_ = np.unique(y) @@ -143,6 +149,12 @@ def fit_transform(self, y): ------- y : array-like of shape [n_samples] """ + # Check new_labels parameter + if self.new_labels not in ["update", "nan", "raise", "label"]: + # Raise on invalid argument. 
+ raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) + y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) self.classes_, y = np.unique(y, return_inverse=True) From dee4ae0cb73d884b098735fe8a1036221ef8f1f2 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:23:06 -0400 Subject: [PATCH 49/55] Improving difficulty of test cases with non-increasing unseen labels --- sklearn/preprocessing/tests/test_label.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 7347f73c8cdd7..0798a13291c0f 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -219,8 +219,8 @@ def test_label_encoder_new_label_update(): [0, 0, 2]) assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) - assert_array_equal(le.transform(["b", "c", "d"]), - [1, 2, 3]) + assert_array_equal(le.transform(["_", "b", "c", "d"]), + [3, 1, 2, 4]) def test_label_encoder_new_label_nan(): @@ -232,8 +232,8 @@ def test_label_encoder_new_label_nan(): [0, 0, 2]) assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) - assert_array_equal(le.transform(["b", "c", "d"]), - [1, 2, np.nan]) + assert_array_equal(le.transform(["_", "b", "c", "d"]), + [np.nan, 1, 2, np.nan]) def test_label_encoder_new_label_replace(): From c29701784f4a1dd27a27dd341d6d22cb4aa20d06 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:27:21 -0400 Subject: [PATCH 50/55] Moving ValueError check to fit --- sklearn/preprocessing/tests/test_label.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 0798a13291c0f..8c1dfa868dd10 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -252,13 +252,7 @@ def 
test_label_encoder_new_label_replace(): def test_label_encoder_new_label_arg(): """Test LabelEncoder's new_labels argument handling""" le = LabelEncoder(new_labels="xyz") - le.fit(["a", "b", "b", "c"]) - assert_array_equal(le.classes_, ["a", "b", "c"]) - assert_array_equal(le.transform(["a", "a", "c"]), - [0, 0, 2]) - assert_array_equal(le.inverse_transform([2, 1, 0]), - ["c", "b", "a"]) - assert_raises(ValueError, le.transform, ["c", "d"]) + assert_raises(ValueError, le.fit, ["a", "b", "b", "c"]) def test_label_encoder_fit_transform(): From f29800b07b4772dceca7fdbec597532f41291e9d Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:50:14 -0400 Subject: [PATCH 51/55] Improving difficulty of new_labels='update' test to include multiple transforms with new labels --- sklearn/preprocessing/tests/test_label.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 8c1dfa868dd10..e76b49827dfc2 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -219,8 +219,14 @@ def test_label_encoder_new_label_update(): [0, 0, 2]) assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) - assert_array_equal(le.transform(["_", "b", "c", "d"]), - [3, 1, 2, 4]) + assert_array_equal(le.transform(["b", "c", "_"]), + [1, 2, 3]) + assert_array_equal(le.classes_, ["a", "b", "c", "_"]) + print(le.classes_) + assert_array_equal(le.transform(["_", "z", "a"]), + [3, 4, 0]) + assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"]) + def test_label_encoder_new_label_nan(): From 74b75896110364183e56f8d7840137044d357f71 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:51:24 -0400 Subject: [PATCH 52/55] Fixing negative indexing, renamed z->out, failing approach for new_labels=update w/ searchsorted --- sklearn/preprocessing/label.py | 45 ++++++++++++++++++++++------------ 1
file changed, 29 insertions(+), 16 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 24e44158f4681..26c5f1ef8fb01 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -178,40 +178,53 @@ def transform(self, y): _check_numpy_unicode_bug(classes) if len(np.intersect1d(classes, self.classes_)) < len(classes): diff = np.setdiff1d(classes, self.classes_) + # Create copy of array and return + y = np.array(y) # If we are mapping new labels, get "new" ID and change in copy. if self.new_labels == "update": + # Setup out + out = np.zeros(y.shape, dtype=int) + + # Find entries with new labels + missing_mask = np.in1d(y, diff) + new_class_values = np.sort(diff) + + # Populate return array properly and return + out[~missing_mask] = np.searchsorted(self.classes_, + y[~missing_mask]) + out[missing_mask] = np.searchsorted(new_class_values, + y[missing_mask]) + len(self.classes_) + # Update the class list with new labels - self.classes_ = np.append(self.classes_, np.sort(diff)) + self.classes_ = np.append(self.classes_, new_class_values) # Return mapped encoding - return np.searchsorted(self.classes_, y) + return out elif self.new_labels == "nan": - # Create copy of array and return - y_array = np.array(y) - z = np.zeros(y_array.shape, dtype=float) + # Setup out + out = np.zeros(y.shape, dtype=float) # Find entries with new labels missing_mask = np.in1d(y, diff) # Populate return array properly and return - z[-missing_mask] = np.searchsorted(self.classes_, - y_array[-missing_mask]) - z[missing_mask] = np.nan - return z + out[~missing_mask] = np.searchsorted(self.classes_, + y[~missing_mask]) + out[missing_mask] = np.nan + return out elif self.new_labels == "label": - # Create copy of array and return - y_array = np.array(y) - z = np.zeros(y_array.shape, dtype=int) + # Setup out + out = np.zeros(y.shape, dtype=int) # Find entries with new labels missing_mask = np.in1d(y, diff) # Populate return array 
properly and return - z[-missing_mask] = np.searchsorted(self.classes_, - y_array[-missing_mask]) - z[missing_mask] = self.new_label_class - return z + out[~missing_mask] = np.searchsorted(self.classes_, + y[~missing_mask]) + out[missing_mask] = self.new_label_class + return out elif self.new_labels == "raise": # Return ValueError, original behavior. raise ValueError("y contains new labels: %s" % str(diff)) From 3e1be5dc3318d16d21e7148a9c4594ecb9a7a7d8 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 14 Jul 2014 22:56:03 -0400 Subject: [PATCH 53/55] PEP8 --- sklearn/preprocessing/label.py | 22 ++++++++++++---------- sklearn/preprocessing/tests/test_label.py | 1 - 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 26c5f1ef8fb01..ebd900890f7bd 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -1,6 +1,6 @@ # Authors: Alexandre Gramfort -# Mathieu Blondel -# Olivier Grisel +# Mathieu Blondel +# Olivier Grisel # Andreas Mueller # Joel Nothman # Hamzeh Alsalhi @@ -105,6 +105,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): ['tokyo', 'tokyo', 'paris'] """ + def __init__(self, new_labels="raise", new_label_class=-1): """Constructor""" self.new_labels = new_labels @@ -129,8 +130,8 @@ def fit(self, y): # Check new_labels parameter if self.new_labels not in ["update", "nan", "raise", "label"]: # Raise on invalid argument. - raise ValueError("Value of argument `new_labels`={0} " - "is unknown.".format(self.new_labels)) + raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) @@ -152,8 +153,8 @@ def fit_transform(self, y): # Check new_labels parameter if self.new_labels not in ["update", "nan", "raise", "label"]: # Raise on invalid argument. 
- raise ValueError("Value of argument `new_labels`={0} " - "is unknown.".format(self.new_labels)) + raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) @@ -192,9 +193,10 @@ def transform(self, y): # Populate return array properly and return out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) + y[~missing_mask]) out[missing_mask] = np.searchsorted(new_class_values, - y[missing_mask]) + len(self.classes_) + y[missing_mask]) + \ + len(self.classes_) # Update the class list with new labels self.classes_ = np.append(self.classes_, new_class_values) @@ -210,7 +212,7 @@ def transform(self, y): # Populate return array properly and return out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) + y[~missing_mask]) out[missing_mask] = np.nan return out elif self.new_labels == "label": @@ -222,7 +224,7 @@ def transform(self, y): # Populate return array properly and return out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) + y[~missing_mask]) out[missing_mask] = self.new_label_class return out elif self.new_labels == "raise": diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index e76b49827dfc2..30e9262b1968d 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -228,7 +228,6 @@ def test_label_encoder_new_label_update(): assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"]) - def test_label_encoder_new_label_nan(): """Test LabelEncoder's transform on new labels""" le = LabelEncoder(new_labels="nan") From abf01cc93caf381770cdfcaf19936c3871d53a46 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Sat, 19 Jul 2014 09:02:49 -0400 Subject: [PATCH 54/55] Removing nan option and corresponding test --- sklearn/preprocessing/label.py | 17 ++--------------- sklearn/preprocessing/tests/test_label.py | 13 ------------- 2 
files changed, 2 insertions(+), 28 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index ebd900890f7bd..4a4caa9cda17f 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -63,7 +63,6 @@ class LabelEncoder(BaseEstimator, TransformerMixin): - If ``"raise"``, then raise ValueError. - If ``"update"``, then re-map the new labels to classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. - - If ``"nan"``, then re-map the new labels to ``numpy.nan``. - If ``"label"``, then use the value of ``new_label_class``. new_label_class : integer, optional (default: -1) @@ -128,7 +127,7 @@ def fit(self, y): self : returns an instance of self. """ # Check new_labels parameter - if self.new_labels not in ["update", "nan", "raise", "label"]: + if self.new_labels not in ["update", "raise", "label"]: # Raise on invalid argument. raise ValueError("Value of argument `new_labels`={0} " "is unknown.".format(self.new_labels)) @@ -151,7 +150,7 @@ def fit_transform(self, y): y : array-like of shape [n_samples] """ # Check new_labels parameter - if self.new_labels not in ["update", "nan", "raise", "label"]: + if self.new_labels not in ["update", "raise", "label"]: # Raise on invalid argument. 
raise ValueError("Value of argument `new_labels`={0} " "is unknown.".format(self.new_labels)) @@ -203,18 +202,6 @@ def transform(self, y): # Return mapped encoding return out - elif self.new_labels == "nan": - # Setup out - out = np.zeros(y.shape, dtype=float) - - # Find entries with new labels - missing_mask = np.in1d(y, diff) - - # Populate return array properly and return - out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) - out[missing_mask] = np.nan - return out elif self.new_labels == "label": # Setup out out = np.zeros(y.shape, dtype=int) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 30e9262b1968d..69d6c6d04ae88 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -228,19 +228,6 @@ def test_label_encoder_new_label_update(): assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"]) -def test_label_encoder_new_label_nan(): - """Test LabelEncoder's transform on new labels""" - le = LabelEncoder(new_labels="nan") - le.fit(["a", "b", "b", "c"]) - assert_array_equal(le.classes_, ["a", "b", "c"]) - assert_array_equal(le.transform(["a", "a", "c"]), - [0, 0, 2]) - assert_array_equal(le.inverse_transform([2, 1, 0]), - ["c", "b", "a"]) - assert_array_equal(le.transform(["_", "b", "c", "d"]), - [np.nan, 1, 2, np.nan]) - - def test_label_encoder_new_label_replace(): """Test LabelEncoder's transform on new labels""" le = LabelEncoder(new_labels="label", new_label_class=-2) From f26a9022f5c5aaa9c9c8b6a108c4b5f95f068c31 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Sat, 19 Jul 2014 11:10:12 -0400 Subject: [PATCH 55/55] Handling repeated transform calls with new_label_mapping_, refactoring, cleaning after removing np.nan.
--- sklearn/preprocessing/label.py | 100 +++++++++++++--------- sklearn/preprocessing/tests/test_label.py | 19 ++-- 2 files changed, 74 insertions(+), 45 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 4a4caa9cda17f..6112b8cf77925 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -10,6 +10,7 @@ import itertools import array import warnings +import operator import numpy as np import scipy.sparse as sp @@ -63,18 +64,19 @@ class LabelEncoder(BaseEstimator, TransformerMixin): - If ``"raise"``, then raise ValueError. - If ``"update"``, then re-map the new labels to classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. - - If ``"label"``, then use the value of ``new_label_class``. - - new_label_class : integer, optional (default: -1) - If ``new_labels="label"``, then this value will be assigned to - as the class for any new labels that are encountered. - + - If an integer value is passed, then use re-label with this value. + N.B. that default values are in [0, 1, ...], so caution should be + taken if a non-negative value is passed to not accidentally + intersect. Attributes ---------- `classes_` : array of shape (n_class,) Holds the label for each class. + `new_label_mapping_` : dictionary + Stores the mapping for classes not seen during original ``fit``. + Examples -------- `LabelEncoder` can be used to normalize labels. @@ -105,15 +107,34 @@ class LabelEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, new_labels="raise", new_label_class=-1): + def __init__(self, new_labels="raise"): """Constructor""" self.new_labels = new_labels - self.new_label_class = new_label_class + self.new_label_mapping_ = {} def _check_fitted(self): if not hasattr(self, "classes_"): raise ValueError("LabelEncoder was not fitted yet.") + def get_classes(self): + """Get classes that have been observed by the encoder. 
Note that this + method returns classes seen both at original ``fit`` time (i.e., + ``self.classes_``) and classes seen after ``fit`` (i.e., + ``self.new_label_mapping_.keys()``) for applicable values of + ``new_labels``. + + Returns + ------- + classes : array-like of shape [n_classes] + """ + # If we've seen updates, include them in the order they were added. + if len(self.new_label_mapping_) > 0: + sorted_new, _ = zip(*sorted(self.new_label_mapping_.iteritems(), + key=operator.itemgetter(1))) + return np.append(self.classes_, sorted_new) + else: + return self.classes_ + def fit(self, y): """Fit label encoder @@ -127,10 +148,12 @@ def fit(self, y): self : returns an instance of self. """ # Check new_labels parameter - if self.new_labels not in ["update", "raise", "label"]: + if self.new_labels not in ["update", "raise"] and \ + type(self.new_labels) not in [int]: # Raise on invalid argument. raise ValueError("Value of argument `new_labels`={0} " - "is unknown.".format(self.new_labels)) + "is unknown and not integer." + .format(self.new_labels)) y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) @@ -150,10 +173,12 @@ def fit_transform(self, y): y : array-like of shape [n_samples] """ # Check new_labels parameter - if self.new_labels not in ["update", "raise", "label"]: + if self.new_labels not in ["update", "raise"] and \ + type(self.new_labels) not in [int]: # Raise on invalid argument. raise ValueError("Value of argument `new_labels`={0} " - "is unknown.".format(self.new_labels)) + "is unknown and not integer." 
+ .format(self.new_labels)) y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) @@ -176,47 +201,42 @@ def transform(self, y): classes = np.unique(y) _check_numpy_unicode_bug(classes) - if len(np.intersect1d(classes, self.classes_)) < len(classes): - diff = np.setdiff1d(classes, self.classes_) + if len(np.intersect1d(classes, self.get_classes())) < len(classes): + # Get the new classes + diff_fit = np.setdiff1d(classes, self.classes_) + diff_new = np.setdiff1d(classes, self.get_classes()) + # Create copy of array and return y = np.array(y) # If we are mapping new labels, get "new" ID and change in copy. if self.new_labels == "update": - # Setup out - out = np.zeros(y.shape, dtype=int) - - # Find entries with new labels - missing_mask = np.in1d(y, diff) - new_class_values = np.sort(diff) + # Update the new label mapping + next_label = len(self.get_classes()) + self.new_label_mapping_.update(dict(zip(diff_new, + range(next_label, + next_label + + len(diff_new))))) - # Populate return array properly and return - out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) - out[missing_mask] = np.searchsorted(new_class_values, - y[missing_mask]) + \ - len(self.classes_) - - # Update the class list with new labels - self.classes_ = np.append(self.classes_, new_class_values) + # Find entries with new labels + missing_mask = np.in1d(y, diff_fit) - # Return mapped encoding + # Populate return array properly by mask and return + out = np.searchsorted(self.classes_, y) + out[missing_mask] = [self.new_label_mapping_[value] + for value in y[missing_mask]] return out - elif self.new_labels == "label": - # Setup out - out = np.zeros(y.shape, dtype=int) - + elif type(self.new_labels) in [int]: # Find entries with new labels - missing_mask = np.in1d(y, diff) + missing_mask = np.in1d(y, diff_fit) - # Populate return array properly and return - out[~missing_mask] = np.searchsorted(self.classes_, - y[~missing_mask]) - out[missing_mask] = self.new_label_class + # 
Populate return array properly by mask and return + out = np.searchsorted(self.classes_, y) + out[missing_mask] = self.new_labels return out elif self.new_labels == "raise": # Return ValueError, original behavior. - raise ValueError("y contains new labels: %s" % str(diff)) + raise ValueError("y contains new labels: %s" % str(diff_fit)) else: # Raise on invalid argument. raise ValueError("Value of argument `new_labels`={0} " diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 69d6c6d04ae88..70a47fcffd498 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -210,6 +210,17 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) +def test_label_encoder_get_classes(): + """Test LabelEncoder's get_classes method.""" + le = LabelEncoder(new_labels="update") + le.fit([1, 1, 4, 5, -1, 0]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.classes_, le.get_classes()) + le.transform([10]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.get_classes(), [-1, 0, 1, 4, 5, 10]) + + def test_label_encoder_new_label_update(): """Test LabelEncoder's transform on new labels""" le = LabelEncoder(new_labels="update") @@ -221,16 +232,14 @@ def test_label_encoder_new_label_update(): ["c", "b", "a"]) assert_array_equal(le.transform(["b", "c", "_"]), [1, 2, 3]) - assert_array_equal(le.classes_, ["a", "b", "c", "_"]) - print(le.classes_) + assert_array_equal(le.get_classes(), ["a", "b", "c", "_"]) assert_array_equal(le.transform(["_", "z", "a"]), [3, 4, 0]) - assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"]) def test_label_encoder_new_label_replace(): """Test LabelEncoder's transform on new labels""" - le = LabelEncoder(new_labels="label", new_label_class=-2) + le = LabelEncoder(new_labels=-99) le.fit(["a", "b", "b", "c"]) assert_array_equal(le.classes_, ["a", "b", "c"]) 
assert_array_equal(le.transform(["a", "a", "c"]), @@ -238,7 +247,7 @@ def test_label_encoder_new_label_replace(): assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) assert_array_equal(le.transform(["b", "c", "d"]), - [1, 2, -2]) + [1, 2, -99]) def test_label_encoder_new_label_arg():