From 4d7978907576551c84cdbc3507904f70df4a89e4 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Tue, 3 Jun 2014 22:31:57 -0400
Subject: [PATCH 01/55] Adding new_labels argument to LabelEncoder

---
 sklearn/preprocessing/label.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index a99ed15973238..845f0d2101c94 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -43,6 +43,15 @@ def _check_numpy_unicode_bug(labels):
 class LabelEncoder(BaseEstimator, TransformerMixin):
     """Encode labels with value between 0 and n_classes-1.
 
+    Parameters
+    ----------
+
+    new_labels : string, optional (default: "raise")
+        Determines how to handle newly seen labels, i.e., data
+        not seen in the fit domain.  If "raise", then raise ValueError;
+        if "map", then re-map the new labels to class N, where seen
+        classes are in {0, ..., N-1}.
+
     Attributes
     ----------
     `classes_` : array of shape (n_class,)
@@ -77,6 +86,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     ['tokyo', 'tokyo', 'paris']
 
     """
+    def __init__(self, new_labels="raise"):
+        """Constructor"""
+        self.new_labels = new_labels
 
     def _check_fitted(self):
         if not hasattr(self, "classes_"):
@@ -134,7 +146,27 @@ def transform(self, y):
         _check_numpy_unicode_bug(classes)
         if len(np.intersect1d(classes, self.classes_)) < len(classes):
             diff = np.setdiff1d(classes, self.classes_)
-            raise ValueError("y contains new labels: %s" % str(diff))
+
+            # If we are mapping new labels, get "new" ID and change in copy.
+            if self.new_labels == "map":
+                # Get new ID and append to class list
+                missing_id = len(self.classes_)
+                self.classes_.resize(len(self.classes_)+1)
+                self.classes_[-1] = missing_id
+
+                # Reset the value in y_copy
+                missing_mask = np.in1d(y, diff)
+                y_copy = np.array(y)
+                y_copy[missing_mask] = missing_id
+
+                # Return mapped encoding
+                return np.searchsorted(self.classes_, y_copy)
+            elif self.new_labels == "raise":
+                raise ValueError("y contains new labels: %s" % str(diff))
+            else:
+                raise ValueError("Value of argument `new_labels`={0} "
+                                 "is unknown.".format(self.new_labels))
+
         return np.searchsorted(self.classes_, y)
 
     def inverse_transform(self, y):

From 2bc5686d8a31e2a169ffdb23327c00c14a807e6c Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Tue, 3 Jun 2014 22:32:10 -0400
Subject: [PATCH 02/55] Adding tests for new_labels argument.

---
 sklearn/preprocessing/tests/test_label.py | 24 +++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index d7e98c553fe55..757eb965ec9bb 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -177,6 +177,30 @@ def test_label_encoder():
     assert_raises(ValueError, le.transform, [0, 6])
 
 
+def test_label_encoder_new_label():
+    """Test LabelEncoder's transform on new labels"""
+    le = LabelEncoder(new_labels="map")
+    le.fit(["a", "b", "b", "c"])
+    assert_array_equal(le.classes_, ["a", "b", "c"])
+    assert_array_equal(le.transform(["a", "a", "c"]),
+                       [0, 0, 2])
+    assert_array_equal(le.inverse_transform([2, 1, 0]),
+                       ["c", "b", "a"])
+    le.transform(["b", "c", "d"])
+
+
+def test_label_encoder_new_label_arg():
+    """Test LabelEncoder's  new_labels argument handling"""
+    le = LabelEncoder(new_labels="xyz")
+    le.fit(["a", "b", "b", "c"])
+    assert_array_equal(le.classes_, ["a", "b", "c"])
+    assert_array_equal(le.transform(["a", "a", "c"]),
+                       [0, 0, 2])
+    assert_array_equal(le.inverse_transform([2, 1, 0]),
+                       ["c", "b", "a"])
+    assert_raises(ValueError, le.transform, ["c", "d"])
+
+
 def test_label_encoder_fit_transform():
     """Test fit_transform"""
     le = LabelEncoder()

From 8c1fafe249b1309d48b1e20116eedeb2dc1531fb Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 08:01:33 -0400
Subject: [PATCH 03/55] Changing classes_ update strategy

---
 sklearn/preprocessing/label.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 845f0d2101c94..11d1ab5c2f3c3 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -151,8 +151,7 @@ def transform(self, y):
             if self.new_labels == "map":
                 # Get new ID and append to class list
                 missing_id = len(self.classes_)
-                self.classes_.resize(len(self.classes_)+1)
-                self.classes_[-1] = missing_id
+                self.classes_ = np.append(self.classes_, missing_id)
 
                 # Reset the value in y_copy
                 missing_mask = np.in1d(y, diff)

From 1ffb24a58167758abaf2441a24ec61c9b0ab4031 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 08:31:33 -0400
Subject: [PATCH 04/55] Adding nan behavior, renaming "map" to "update"

---
 sklearn/preprocessing/label.py | 38 ++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 11d1ab5c2f3c3..7b3a7bfee7ba1 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -47,10 +47,13 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     new_labels : string, optional (default: "raise")
-        Determines how to handle newly seen labels, i.e., data
-        not seen in the fit domain.  If "raise", then raise ValueError;
-        if "map", then re-map the new labels to class N, where seen
-        classes are in {0, ..., N-1}.
+        Determines how to handle new labels, i.e., data
+        not seen in the training domain.
+        - If "raise", then raise ValueError.
+        - If "update", then re-map the new labels to classes
+          `[N, ..., N+m-1]`, where `m` is the number of new labels.
+        - If "nan", then re-map the new labels to numpy.nan.
+
 
     Attributes
     ----------
@@ -148,21 +151,30 @@ def transform(self, y):
             diff = np.setdiff1d(classes, self.classes_)
 
             # If we are mapping new labels, get "new" ID and change in copy.
-            if self.new_labels == "map":
-                # Get new ID and append to class list
-                missing_id = len(self.classes_)
-                self.classes_ = np.append(self.classes_, missing_id)
+            if self.new_labels == "update":
+                # Update the class list with new labels
+                self.classes_ = np.append(self.classes_, np.sort(diff))
+
+                # Return mapped encoding
+                return np.searchsorted(self.classes_, y)
+            elif self.new_labels == "nan":
+                # Create copy of array and return
+                y_array = np.array(y)
+                z = np.zeros(y_array.shape)
 
-                # Reset the value in y_copy
+                # Find entries with new labels
                 missing_mask = np.in1d(y, diff)
-                y_copy = np.array(y)
-                y_copy[missing_mask] = missing_id
 
-                # Return mapped encoding
-                return np.searchsorted(self.classes_, y_copy)
+                # Populate return array properly and return
+                z[-missing_mask] = np.searchsorted(self.classes_,
+                                                   y_array[-missing_mask])
+                z[missing_mask] = np.nan
+                return z
             elif self.new_labels == "raise":
+                # Return ValueError, original behavior.
                 raise ValueError("y contains new labels: %s" % str(diff))
             else:
+                # Raise on invalid argument.
                 raise ValueError("Value of argument `new_labels`={0} "
                                  "is unknown.".format(self.new_labels))
 

From 99f65a9a3bcedb2b77bf04a43fa12c70d8363d0c Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 08:32:10 -0400
Subject: [PATCH 05/55] Updating tests to include nan case and update name

---
 sklearn/preprocessing/tests/test_label.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 757eb965ec9bb..f98239bf428dc 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -177,16 +177,30 @@ def test_label_encoder():
     assert_raises(ValueError, le.transform, [0, 6])
 
 
-def test_label_encoder_new_label():
+def test_label_encoder_new_label_update():
     """Test LabelEncoder's transform on new labels"""
-    le = LabelEncoder(new_labels="map")
+    le = LabelEncoder(new_labels="update")
     le.fit(["a", "b", "b", "c"])
     assert_array_equal(le.classes_, ["a", "b", "c"])
     assert_array_equal(le.transform(["a", "a", "c"]),
                        [0, 0, 2])
     assert_array_equal(le.inverse_transform([2, 1, 0]),
                        ["c", "b", "a"])
-    le.transform(["b", "c", "d"])
+    assert_array_equal(le.transform(["b", "c", "d"]),
+                       [1, 2, 3])
+
+
+def test_label_encoder_new_label_nan():
+    """Test LabelEncoder's transform on new labels"""
+    le = LabelEncoder(new_labels="nan")
+    le.fit(["a", "b", "b", "c"])
+    assert_array_equal(le.classes_, ["a", "b", "c"])
+    assert_array_equal(le.transform(["a", "a", "c"]),
+                       [0, 0, 2])
+    assert_array_equal(le.inverse_transform([2, 1, 0]),
+                       ["c", "b", "a"])
+    assert_array_equal(le.transform(["b", "c", "d"]),
+                       [1, 2, np.nan])
 
 
 def test_label_encoder_new_label_arg():

From af8c6a9f8f583adf4f4cbe612031d68582eb4643 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 11:40:50 -0400
Subject: [PATCH 06/55] Fixing docstring for test-doc pass

---
 sklearn/preprocessing/label.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 7b3a7bfee7ba1..472059b6d66da 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -65,7 +65,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     `LabelEncoder` can be used to normalize labels.
 
     >>> from sklearn import preprocessing
-    >>> le = preprocessing.LabelEncoder()
+    >>> le = preprocessing.LabelEncoder(new_values='raise')
     >>> le.fit([1, 2, 2, 6])
     LabelEncoder()
     >>> le.classes_
@@ -78,7 +78,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     It can also be used to transform non-numerical labels (as long as they are
     hashable and comparable) to numerical labels.
 
-    >>> le = preprocessing.LabelEncoder()
+    >>> le = preprocessing.LabelEncoder(new_values='raise')
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
     LabelEncoder()
     >>> list(le.classes_)

From 8ffc839e2dd9002964fa6a922646ec25b1f21e7e Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 11:42:01 -0400
Subject: [PATCH 07/55] Fixing docstring for test-doc pass (for real)

---
 sklearn/preprocessing/label.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 472059b6d66da..794bd4fb81cd3 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -65,7 +65,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     `LabelEncoder` can be used to normalize labels.
 
     >>> from sklearn import preprocessing
-    >>> le = preprocessing.LabelEncoder(new_values='raise')
+    >>> le = preprocessing.LabelEncoder(new_labels='raise')
     >>> le.fit([1, 2, 2, 6])
     LabelEncoder()
     >>> le.classes_
@@ -78,7 +78,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     It can also be used to transform non-numerical labels (as long as they are
     hashable and comparable) to numerical labels.
 
-    >>> le = preprocessing.LabelEncoder(new_values='raise')
+    >>> le = preprocessing.LabelEncoder(new_labels='raise')
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
     LabelEncoder()
     >>> list(le.classes_)

From e6fbc479d9e063fb60c8bca861fb6618eb705f5a Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 13:07:34 -0400
Subject: [PATCH 08/55] Updating doctests

---
 doc/modules/preprocessing.rst  | 4 ++--
 sklearn/preprocessing/label.py | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 71653e9afe6b1..2664bd2428513 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -396,7 +396,7 @@ follows::
     >>> from sklearn import preprocessing
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit([1, 2, 2, 6])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> le.classes_
     array([1, 2, 6])
     >>> le.transform([1, 1, 2, 6])
@@ -409,7 +409,7 @@ hashable and comparable) to numerical labels::
 
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> list(le.classes_)
     ['amsterdam', 'paris', 'tokyo']
     >>> le.transform(["tokyo", "tokyo", "paris"])
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 794bd4fb81cd3..a96f864644f88 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -65,9 +65,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     `LabelEncoder` can be used to normalize labels.
 
     >>> from sklearn import preprocessing
-    >>> le = preprocessing.LabelEncoder(new_labels='raise')
+    >>> le = preprocessing.LabelEncoder()
     >>> le.fit([1, 2, 2, 6])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> le.classes_
     array([1, 2, 6])
     >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
@@ -78,9 +78,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     It can also be used to transform non-numerical labels (as long as they are
     hashable and comparable) to numerical labels.
 
-    >>> le = preprocessing.LabelEncoder(new_labels='raise')
+    >>> le = preprocessing.LabelEncoder()
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> list(le.classes_)
     ['amsterdam', 'paris', 'tokyo']
     >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS

From 46118d9995b271296fdbc979b4f057a90dd59547 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 13:37:32 -0400
Subject: [PATCH 09/55] Updating constructor documentation

---
 sklearn/preprocessing/label.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index a96f864644f88..4e84a425c0634 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -49,10 +49,11 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     new_labels : string, optional (default: "raise")
         Determines how to handle new labels, i.e., data
         not seen in the training domain.
-        - If "raise", then raise ValueError.
-        - If "update", then re-map the new labels to classes
-          `[N, ..., N+m-1]`, where `m` is the number of new labels.
-        - If "nan", then re-map the new labels to numpy.nan.
+
+        - If ``"raise"``, then raise ValueError.
+        - If ``"update"``, then re-map the new labels to
+          classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
+        - If ``"nan"``, then re-map the new labels to ``numpy.nan``.
 
 
     Attributes

From 8d21ec1fca8a437a294b7c249e38485f6773341b Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Thu, 5 Jun 2014 11:09:23 -0400
Subject: [PATCH 10/55] Adding specific "label" option to new_labels

---
 sklearn/preprocessing/label.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 4e84a425c0634..4bbe93dd7aea9 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -90,9 +90,10 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     ['tokyo', 'tokyo', 'paris']
 
     """
-    def __init__(self, new_labels="raise"):
+    def __init__(self, new_labels="raise", new_label_class=-1):
         """Constructor"""
         self.new_labels = new_labels
+        self.new_label_class = new_label_class
 
     def _check_fitted(self):
         if not hasattr(self, "classes_"):
@@ -171,6 +172,19 @@ def transform(self, y):
                                                    y_array[-missing_mask])
                 z[missing_mask] = np.nan
                 return z
+            elif self.new_labels == "label":
+                # Create copy of array and return
+                y_array = np.array(y)
+                z = np.zeros(y_array.shape)
+
+                # Find entries with new labels
+                missing_mask = np.in1d(y, diff)
+
+                # Populate return array properly and return
+                z[-missing_mask] = np.searchsorted(self.classes_,
+                                                   y_array[-missing_mask])
+                z[missing_mask] = self.new_label_class
+                return z
             elif self.new_labels == "raise":
                 # Return ValueError, original behavior.
                 raise ValueError("y contains new labels: %s" % str(diff))

From 343c726a5a265ed4c2d4c0ae51169f43e204b082 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Thu, 5 Jun 2014 11:09:41 -0400
Subject: [PATCH 11/55] Adding test for "label" option to ``new_labels``

---
 sklearn/preprocessing/tests/test_label.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index f98239bf428dc..4cfdd98e61591 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -203,6 +203,19 @@ def test_label_encoder_new_label_nan():
                        [1, 2, np.nan])
 
 
+def test_label_encoder_new_label_replace():
+    """Test LabelEncoder's transform on new labels"""
+    le = LabelEncoder(new_labels="label", new_label_class=-2)
+    le.fit(["a", "b", "b", "c"])
+    assert_array_equal(le.classes_, ["a", "b", "c"])
+    assert_array_equal(le.transform(["a", "a", "c"]),
+                       [0, 0, 2])
+    assert_array_equal(le.inverse_transform([2, 1, 0]),
+                       ["c", "b", "a"])
+    assert_array_equal(le.transform(["b", "c", "d"]),
+                       [1, 2, -2])
+
+
 def test_label_encoder_new_label_arg():
     """Test LabelEncoder's  new_labels argument handling"""
     le = LabelEncoder(new_labels="xyz")

From be97c1403c2f637cc89b814b8aec218b57754114 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Thu, 5 Jun 2014 11:11:17 -0400
Subject: [PATCH 12/55] Updating docstring for ``new_labels="label"``

---
 sklearn/preprocessing/label.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 4bbe93dd7aea9..ad6e7b2f394b1 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -54,6 +54,11 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
         - If ``"update"``, then re-map the new labels to
           classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
         - If ``"nan"``, then re-map the new labels to ``numpy.nan``.
+        - If ``"label"``, then use the value of ``new_label_class``.
+    
+    new_label_class : integer, optional (default: -1)
+        If ``new_labels="label"``, then this value will be assigned to
+        as the class for any new labels that are encountered.
 
 
     Attributes

From cdd7147ff3e2a74f564e962204edfc5ff507a628 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Thu, 5 Jun 2014 11:11:45 -0400
Subject: [PATCH 13/55] pep8

---
 sklearn/preprocessing/label.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index ad6e7b2f394b1..5bd081c5b331d 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -55,7 +55,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
           classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
         - If ``"nan"``, then re-map the new labels to ``numpy.nan``.
         - If ``"label"``, then use the value of ``new_label_class``.
-    
+
     new_label_class : integer, optional (default: -1)
         If ``new_labels="label"``, then this value will be assigned to
         as the class for any new labels that are encountered.

From 170d00c07baf2710d244bcae0e62de5d0da011a4 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Thu, 5 Jun 2014 11:34:23 -0400
Subject: [PATCH 14/55] Autodoc fix

---
 sklearn/preprocessing/label.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 5bd081c5b331d..c3bff45fae0e6 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -73,7 +73,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     >>> from sklearn import preprocessing
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit([1, 2, 2, 6])
-    LabelEncoder(new_labels='raise')
+    LabelEncoder(new_label_class=-1, new_labels='raise')
     >>> le.classes_
     array([1, 2, 6])
     >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
@@ -86,7 +86,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
 
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
-    LabelEncoder(new_labels='raise')
+    LabelEncoder(new_label_class=-1, new_labels='raise')
     >>> list(le.classes_)
     ['amsterdam', 'paris', 'tokyo']
     >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS

From 2d87e88a6fa015f979aaa03a15d3ff0a0735f466 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Sun, 8 Jun 2014 09:09:16 -0400
Subject: [PATCH 15/55] Fixing rst docs

---
 doc/modules/preprocessing.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 2664bd2428513..492d2c425fec0 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -396,7 +396,7 @@ follows::
     >>> from sklearn import preprocessing
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit([1, 2, 2, 6])
-    LabelEncoder(new_labels='raise')
+    LabelEncoder(new_label_class=-1, new_labels='raise')
     >>> le.classes_
     array([1, 2, 6])
     >>> le.transform([1, 1, 2, 6])
@@ -409,7 +409,7 @@ hashable and comparable) to numerical labels::
 
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
-    LabelEncoder(new_labels='raise')
+    LabelEncoder(new_label_class=-1, new_labels='raise')
     >>> list(le.classes_)
     ['amsterdam', 'paris', 'tokyo']
     >>> le.transform(["tokyo", "tokyo", "paris"])

From bb8d9a64725290dba18f5dcb8e48af3af8af7973 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Sun, 8 Jun 2014 14:04:07 -0400
Subject: [PATCH 16/55] Changing dtypes for new_labels

---
 sklearn/preprocessing/label.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index c3bff45fae0e6..38b20bfce5f02 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -167,7 +167,7 @@ def transform(self, y):
             elif self.new_labels == "nan":
                 # Create copy of array and return
                 y_array = np.array(y)
-                z = np.zeros(y_array.shape)
+                z = np.zeros(y_array.shape, dtype=float)
 
                 # Find entries with new labels
                 missing_mask = np.in1d(y, diff)
@@ -180,7 +180,7 @@ def transform(self, y):
             elif self.new_labels == "label":
                 # Create copy of array and return
                 y_array = np.array(y)
-                z = np.zeros(y_array.shape)
+                z = np.zeros(y_array.shape, dtype=int)
 
                 # Find entries with new labels
                 missing_mask = np.in1d(y, diff)

From ab788f75c8cff94c64dad4d555c92c7e750396d5 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Sun, 8 Jun 2014 15:52:36 -0400
Subject: [PATCH 17/55] Adding example for new_labels argument

---
 doc/modules/preprocessing.rst | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 492d2c425fec0..5f66971f96d20 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -417,6 +417,20 @@ hashable and comparable) to numerical labels::
     >>> list(le.inverse_transform([2, 2, 1]))
     ['tokyo', 'tokyo', 'paris']
 
+By default, ``LabelEncoder`` will throw a ``ValueError`` in the event that
+labels are passed in ``transform`` that were not seen in ``fit``.  This
+behavior can be handled with the ``new_labels`` parameter, which supports
+``"raise"``, ``"nan"``, ``"update"``, and ``"label"`` strategies for
+handling new labels.  For example, the ``"label"`` strategy will assign
+the unseen values a label of ``-1``.
+
+    >>> le = preprocessing.LabelEncoder(new_labels="label")
+    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
+    LabelEncoder(new_label_class=-1, new_labels='label')
+    >>> list(le.classes_)
+    ['amsterdam', 'paris', 'tokyo']
+    >>> le.transform(["tokyo", "tokyo", "paris", "rome"])
+    array([ 2,  2,  1, -1])
 
 Imputation of missing values
 ============================

From a597fc36ba08da96e99d61d27d3b056f3dba3803 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:22:30 -0400
Subject: [PATCH 18/55] Adding new_labels handling to fit/fit_transform

---
 sklearn/preprocessing/label.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 38b20bfce5f02..b65fed2adfd38 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -116,6 +116,12 @@ def fit(self, y):
         -------
         self : returns an instance of self.
         """
+        # Check new_labels parameter
+        if self.new_labels not in ["update", "nan", "raise", "label"]:
+            # Raise on invalid argument.
+                raise ValueError("Value of argument `new_labels`={0} "
+                                 "is unknown.".format(self.new_labels))
+
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
         self.classes_ = np.unique(y)
@@ -133,6 +139,12 @@ def fit_transform(self, y):
         -------
         y : array-like of shape [n_samples]
         """
+        # Check new_labels parameter
+        if self.new_labels not in ["update", "nan", "raise", "label"]:
+            # Raise on invalid argument.
+                raise ValueError("Value of argument `new_labels`={0} "
+                                 "is unknown.".format(self.new_labels))
+
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
         self.classes_, y = np.unique(y, return_inverse=True)

From 291d752c9bd4a581ea1b0b994e4b9de3f18dc340 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:23:06 -0400
Subject: [PATCH 19/55] Improving difficulty of test cases with non-increasing
 unseen labels

---
 sklearn/preprocessing/tests/test_label.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 4cfdd98e61591..c58d5bcf36935 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -186,8 +186,8 @@ def test_label_encoder_new_label_update():
                        [0, 0, 2])
     assert_array_equal(le.inverse_transform([2, 1, 0]),
                        ["c", "b", "a"])
-    assert_array_equal(le.transform(["b", "c", "d"]),
-                       [1, 2, 3])
+    assert_array_equal(le.transform(["_", "b", "c", "d"]),
+                       [3, 1, 2, 4])
 
 
 def test_label_encoder_new_label_nan():
@@ -199,8 +199,8 @@ def test_label_encoder_new_label_nan():
                        [0, 0, 2])
     assert_array_equal(le.inverse_transform([2, 1, 0]),
                        ["c", "b", "a"])
-    assert_array_equal(le.transform(["b", "c", "d"]),
-                       [1, 2, np.nan])
+    assert_array_equal(le.transform(["_", "b", "c", "d"]),
+                       [np.nan, 1, 2, np.nan])
 
 
 def test_label_encoder_new_label_replace():

From fe0141d545ec08ac680b76565c8800a346403312 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:27:21 -0400
Subject: [PATCH 20/55] Moving ValueError check to fit

---
 sklearn/preprocessing/tests/test_label.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index c58d5bcf36935..380da039c74b7 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -219,13 +219,7 @@ def test_label_encoder_new_label_replace():
 def test_label_encoder_new_label_arg():
     """Test LabelEncoder's  new_labels argument handling"""
     le = LabelEncoder(new_labels="xyz")
-    le.fit(["a", "b", "b", "c"])
-    assert_array_equal(le.classes_, ["a", "b", "c"])
-    assert_array_equal(le.transform(["a", "a", "c"]),
-                       [0, 0, 2])
-    assert_array_equal(le.inverse_transform([2, 1, 0]),
-                       ["c", "b", "a"])
-    assert_raises(ValueError, le.transform, ["c", "d"])
+    assert_raises(ValueError, le.fit, ["a", "b", "b", "c"])
 
 
 def test_label_encoder_fit_transform():

From e1b7ed58fb253259dd3d8e31b1142dcbaa8cfdb4 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:50:14 -0400
Subject: [PATCH 21/55] Improving difficulty of new_labels='update' test to
 include multiple transforms with new labels

---
 sklearn/preprocessing/tests/test_label.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 380da039c74b7..090214f2a661d 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -186,8 +186,14 @@ def test_label_encoder_new_label_update():
                        [0, 0, 2])
     assert_array_equal(le.inverse_transform([2, 1, 0]),
                        ["c", "b", "a"])
-    assert_array_equal(le.transform(["_", "b", "c", "d"]),
-                       [3, 1, 2, 4])
+    assert_array_equal(le.transform(["b", "c", "_"]),
+                       [1, 2, 3])
+    assert_array_equal(le.classes_, ["a", "b", "c", "_"])
+    print(le.classes_)
+    assert_array_equal(le.transform(["_", "z", "a"]),
+                       [3, 4, 0])
+    assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"])
+
 
 
 def test_label_encoder_new_label_nan():

From 9fd7736d9984f4d27150231fa3eb6736a0ba7434 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:51:24 -0400
Subject: [PATCH 22/55] Fixing negative indexing, renamed z->out, failing
 approach for new_labels=update w/ searchsorted

---
 sklearn/preprocessing/label.py | 45 ++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index b65fed2adfd38..fb9a6c5d32163 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -168,40 +168,53 @@ def transform(self, y):
         _check_numpy_unicode_bug(classes)
         if len(np.intersect1d(classes, self.classes_)) < len(classes):
             diff = np.setdiff1d(classes, self.classes_)
+            # Create copy of array and return
+            y = np.array(y)
 
             # If we are mapping new labels, get "new" ID and change in copy.
             if self.new_labels == "update":
+                # Setup out
+                out = np.zeros(y.shape, dtype=int)
+
+                #  Find entries with new labels
+                missing_mask = np.in1d(y, diff)
+                new_class_values = np.sort(diff)
+
+                # Populate return array properly and return
+                out[~missing_mask] = np.searchsorted(self.classes_,
+                                                   y[~missing_mask])
+                out[missing_mask] = np.searchsorted(new_class_values,
+                                                   y[missing_mask]) + len(self.classes_)
+
                 # Update the class list with new labels
-                self.classes_ = np.append(self.classes_, np.sort(diff))
+                self.classes_ = np.append(self.classes_, new_class_values)
 
                 # Return mapped encoding
-                return np.searchsorted(self.classes_, y)
+                return out
             elif self.new_labels == "nan":
-                # Create copy of array and return
-                y_array = np.array(y)
-                z = np.zeros(y_array.shape, dtype=float)
+                # Setup out
+                out = np.zeros(y.shape, dtype=float)
 
                 # Find entries with new labels
                 missing_mask = np.in1d(y, diff)
 
                 # Populate return array properly and return
-                z[-missing_mask] = np.searchsorted(self.classes_,
-                                                   y_array[-missing_mask])
-                z[missing_mask] = np.nan
-                return z
+                out[~missing_mask] = np.searchsorted(self.classes_,
+                                                   y[~missing_mask])
+                out[missing_mask] = np.nan
+                return out
             elif self.new_labels == "label":
-                # Create copy of array and return
-                y_array = np.array(y)
-                z = np.zeros(y_array.shape, dtype=int)
+                # Setup out
+                out = np.zeros(y.shape, dtype=int)
 
                 # Find entries with new labels
                 missing_mask = np.in1d(y, diff)
 
                 # Populate return array properly and return
-                z[-missing_mask] = np.searchsorted(self.classes_,
-                                                   y_array[-missing_mask])
-                z[missing_mask] = self.new_label_class
-                return z
+                out[~missing_mask] = np.searchsorted(self.classes_,
+                                                   y[~missing_mask])
+                out[missing_mask] = self.new_label_class
+                return out
             elif self.new_labels == "raise":
                 # Return ValueError, original behavior.
                 raise ValueError("y contains new labels: %s" % str(diff))

From e3c14bbf0c07c7988b8f335b82c54a031545a785 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:56:03 -0400
Subject: [PATCH 23/55] PEP8

---
 sklearn/preprocessing/label.py            | 22 ++++++++++++----------
 sklearn/preprocessing/tests/test_label.py |  1 -
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index fb9a6c5d32163..7e016d068035d 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -1,6 +1,6 @@
 # Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
-#          Mathieu Blondel <mathieu@mblondel.org>
-#          Olivier Grisel <olivier.grisel@ensta.org>
+# Mathieu Blondel <mathieu@mblondel.org>
+# Olivier Grisel <olivier.grisel@ensta.org>
 #          Andreas Mueller <amueller@ais.uni-bonn.de>
 # License: BSD 3 clause
 
@@ -95,6 +95,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     ['tokyo', 'tokyo', 'paris']
 
     """
+
     def __init__(self, new_labels="raise", new_label_class=-1):
         """Constructor"""
         self.new_labels = new_labels
@@ -119,8 +120,8 @@ def fit(self, y):
         # Check new_labels parameter
         if self.new_labels not in ["update", "nan", "raise", "label"]:
             # Raise on invalid argument.
-                raise ValueError("Value of argument `new_labels`={0} "
-                                 "is unknown.".format(self.new_labels))
+            raise ValueError("Value of argument `new_labels`={0} "
+                             "is unknown.".format(self.new_labels))
 
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
@@ -142,8 +143,8 @@ def fit_transform(self, y):
         # Check new_labels parameter
         if self.new_labels not in ["update", "nan", "raise", "label"]:
             # Raise on invalid argument.
-                raise ValueError("Value of argument `new_labels`={0} "
-                                 "is unknown.".format(self.new_labels))
+            raise ValueError("Value of argument `new_labels`={0} "
+                             "is unknown.".format(self.new_labels))
 
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
@@ -182,9 +183,10 @@ def transform(self, y):
 
                 # Populate return array properly and return
                 out[~missing_mask] = np.searchsorted(self.classes_,
-                                                   y[~missing_mask])
+                                                     y[~missing_mask])
                 out[missing_mask] = np.searchsorted(new_class_values,
-                                                   y[missing_mask]) + len(self.classes_)
+                                                    y[missing_mask]) + \
+                    len(self.classes_)
 
                 # Update the class list with new labels
                 self.classes_ = np.append(self.classes_, new_class_values)
@@ -200,7 +202,7 @@ def transform(self, y):
 
                 # Populate return array properly and return
                 out[~missing_mask] = np.searchsorted(self.classes_,
-                                                   y[~missing_mask])
+                                                     y[~missing_mask])
                 out[missing_mask] = np.nan
                 return out
             elif self.new_labels == "label":
@@ -212,7 +214,7 @@ def transform(self, y):
 
                 # Populate return array properly and return
                 out[~missing_mask] = np.searchsorted(self.classes_,
-                                                   y[~missing_mask])
+                                                     y[~missing_mask])
                 out[missing_mask] = self.new_label_class
                 return out
             elif self.new_labels == "raise":
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 090214f2a661d..f04acbc86ca31 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -195,7 +195,6 @@ def test_label_encoder_new_label_update():
     assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"])
 
 
-
 def test_label_encoder_new_label_nan():
     """Test LabelEncoder's transform on new labels"""
     le = LabelEncoder(new_labels="nan")

From fe797363a9fa7e57f44c1dc0fe2e52db20d7f91d Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Sat, 19 Jul 2014 09:02:49 -0400
Subject: [PATCH 24/55] Removing nan option and corresponding test

---
 sklearn/preprocessing/label.py            | 17 ++---------------
 sklearn/preprocessing/tests/test_label.py | 13 -------------
 2 files changed, 2 insertions(+), 28 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 7e016d068035d..c69aa49d78d7b 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -53,7 +53,6 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
         - If ``"raise"``, then raise ValueError.
         - If ``"update"``, then re-map the new labels to
           classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
-        - If ``"nan"``, then re-map the new labels to ``numpy.nan``.
         - If ``"label"``, then use the value of ``new_label_class``.
 
     new_label_class : integer, optional (default: -1)
@@ -118,7 +117,7 @@ def fit(self, y):
         self : returns an instance of self.
         """
         # Check new_labels parameter
-        if self.new_labels not in ["update", "nan", "raise", "label"]:
+        if self.new_labels not in ["update", "raise", "label"]:
             # Raise on invalid argument.
             raise ValueError("Value of argument `new_labels`={0} "
                              "is unknown.".format(self.new_labels))
@@ -141,7 +140,7 @@ def fit_transform(self, y):
         y : array-like of shape [n_samples]
         """
         # Check new_labels parameter
-        if self.new_labels not in ["update", "nan", "raise", "label"]:
+        if self.new_labels not in ["update", "raise", "label"]:
             # Raise on invalid argument.
             raise ValueError("Value of argument `new_labels`={0} "
                              "is unknown.".format(self.new_labels))
@@ -193,18 +192,6 @@ def transform(self, y):
 
                 # Return mapped encoding
                 return out
-            elif self.new_labels == "nan":
-                # Setup out
-                out = np.zeros(y.shape, dtype=float)
-
-                # Find entries with new labels
-                missing_mask = np.in1d(y, diff)
-
-                # Populate return array properly and return
-                out[~missing_mask] = np.searchsorted(self.classes_,
-                                                     y[~missing_mask])
-                out[missing_mask] = np.nan
-                return out
             elif self.new_labels == "label":
                 # Setup out
                 out = np.zeros(y.shape, dtype=int)
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index f04acbc86ca31..9d028e6aaccea 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -195,19 +195,6 @@ def test_label_encoder_new_label_update():
     assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"])
 
 
-def test_label_encoder_new_label_nan():
-    """Test LabelEncoder's transform on new labels"""
-    le = LabelEncoder(new_labels="nan")
-    le.fit(["a", "b", "b", "c"])
-    assert_array_equal(le.classes_, ["a", "b", "c"])
-    assert_array_equal(le.transform(["a", "a", "c"]),
-                       [0, 0, 2])
-    assert_array_equal(le.inverse_transform([2, 1, 0]),
-                       ["c", "b", "a"])
-    assert_array_equal(le.transform(["_", "b", "c", "d"]),
-                       [np.nan, 1, 2, np.nan])
-
-
 def test_label_encoder_new_label_replace():
     """Test LabelEncoder's transform on new labels"""
     le = LabelEncoder(new_labels="label", new_label_class=-2)

From b83b37f4774fd3033afe4a51f6a1cb8edc83292a Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Sat, 19 Jul 2014 11:10:12 -0400
Subject: [PATCH 25/55] Handling repeated transform calls with
 new_class_mapping_, refactoring, cleaning after removing np.nan.

---
 sklearn/preprocessing/label.py            | 100 +++++++++++++---------
 sklearn/preprocessing/tests/test_label.py |  19 ++--
 2 files changed, 74 insertions(+), 45 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index c69aa49d78d7b..8680501c4a258 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -4,6 +4,7 @@
 #          Andreas Mueller <amueller@ais.uni-bonn.de>
 # License: BSD 3 clause
 
+import operator
 import numpy as np
 
 from ..base import BaseEstimator, TransformerMixin
@@ -53,18 +54,19 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
         - If ``"raise"``, then raise ValueError.
         - If ``"update"``, then re-map the new labels to
           classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
-        - If ``"label"``, then use the value of ``new_label_class``.
-
-    new_label_class : integer, optional (default: -1)
-        If ``new_labels="label"``, then this value will be assigned to
-        as the class for any new labels that are encountered.
-
+        - If an integer value is passed, then use re-label with this value.
+          N.B. that default values are in [0, 1, ...], so caution should be
+          taken if a non-negative value is passed to not accidentally
+          intersect.
 
     Attributes
     ----------
     `classes_` : array of shape (n_class,)
         Holds the label for each class.
 
+    `new_label_mapping_` : dictionary
+        Stores the mapping for classes not seen during original ``fit``.
+
     Examples
     --------
     `LabelEncoder` can be used to normalize labels.
@@ -95,15 +97,34 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
 
     """
 
-    def __init__(self, new_labels="raise", new_label_class=-1):
+    def __init__(self, new_labels="raise"):
         """Constructor"""
         self.new_labels = new_labels
-        self.new_label_class = new_label_class
+        self.new_label_mapping_ = {}
 
     def _check_fitted(self):
         if not hasattr(self, "classes_"):
             raise ValueError("LabelEncoder was not fitted yet.")
 
+    def get_classes(self):
+        """Get classes that have been observed by the encoder.  Note that this
+        method returns classes seen both at original ``fit`` time (i.e.,
+        ``self.classes_``) and classes seen after ``fit`` (i.e.,
+        ``self.new_label_mapping_.keys()``) for applicable values of
+        ``new_labels``.
+
+        Returns
+        -------
+        classes : array-like of shape [n_classes]
+        """
+        # If we've seen updates, include them in the order they were added.
+        if len(self.new_label_mapping_) > 0:
+            sorted_new, _ = zip(*sorted(self.new_label_mapping_.iteritems(),
+                                        key=operator.itemgetter(1)))
+            return np.append(self.classes_, sorted_new)
+        else:
+            return self.classes_
+
     def fit(self, y):
         """Fit label encoder
 
@@ -117,10 +138,12 @@ def fit(self, y):
         self : returns an instance of self.
         """
         # Check new_labels parameter
-        if self.new_labels not in ["update", "raise", "label"]:
+        if self.new_labels not in ["update", "raise"] and \
+                type(self.new_labels) not in [int]:
             # Raise on invalid argument.
             raise ValueError("Value of argument `new_labels`={0} "
-                             "is unknown.".format(self.new_labels))
+                             "is unknown and not integer."
+                             .format(self.new_labels))
 
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
@@ -140,10 +163,12 @@ def fit_transform(self, y):
         y : array-like of shape [n_samples]
         """
         # Check new_labels parameter
-        if self.new_labels not in ["update", "raise", "label"]:
+        if self.new_labels not in ["update", "raise"] and \
+                type(self.new_labels) not in [int]:
             # Raise on invalid argument.
             raise ValueError("Value of argument `new_labels`={0} "
-                             "is unknown.".format(self.new_labels))
+                             "is unknown and not integer."
+                             .format(self.new_labels))
 
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
@@ -166,47 +191,42 @@ def transform(self, y):
 
         classes = np.unique(y)
         _check_numpy_unicode_bug(classes)
-        if len(np.intersect1d(classes, self.classes_)) < len(classes):
-            diff = np.setdiff1d(classes, self.classes_)
+        if len(np.intersect1d(classes, self.get_classes())) < len(classes):
+            # Get the new classes
+            diff_fit = np.setdiff1d(classes, self.classes_)
+            diff_new = np.setdiff1d(classes, self.get_classes())
+
             # Create copy of array and return
             y = np.array(y)
 
             # If we are mapping new labels, get "new" ID and change in copy.
             if self.new_labels == "update":
-                # Setup out
-                out = np.zeros(y.shape, dtype=int)
-
-                #  Find entries with new labels
-                missing_mask = np.in1d(y, diff)
-                new_class_values = np.sort(diff)
+                # Update the new label mapping
+                next_label = len(self.get_classes())
+                self.new_label_mapping_.update(dict(zip(diff_new,
+                                                        range(next_label,
+                                                              next_label +
+                                                              len(diff_new)))))
 
-                # Populate return array properly and return
-                out[~missing_mask] = np.searchsorted(self.classes_,
-                                                     y[~missing_mask])
-                out[missing_mask] = np.searchsorted(new_class_values,
-                                                    y[missing_mask]) + \
-                    len(self.classes_)
-
-                # Update the class list with new labels
-                self.classes_ = np.append(self.classes_, new_class_values)
+                # Find entries with new labels
+                missing_mask = np.in1d(y, diff_fit)
 
-                # Return mapped encoding
+                # Populate return array properly by mask and return
+                out = np.searchsorted(self.classes_, y)
+                out[missing_mask] = [self.new_label_mapping_[value]
+                                     for value in y[missing_mask]]
                 return out
-            elif self.new_labels == "label":
-                # Setup out
-                out = np.zeros(y.shape, dtype=int)
-
+            elif type(self.new_labels) in [int]:
                 # Find entries with new labels
-                missing_mask = np.in1d(y, diff)
+                missing_mask = np.in1d(y, diff_fit)
 
-                # Populate return array properly and return
-                out[~missing_mask] = np.searchsorted(self.classes_,
-                                                     y[~missing_mask])
-                out[missing_mask] = self.new_label_class
+                # Populate return array properly by mask and return
+                out = np.searchsorted(self.classes_, y)
+                out[missing_mask] = self.new_labels
                 return out
             elif self.new_labels == "raise":
                 # Return ValueError, original behavior.
-                raise ValueError("y contains new labels: %s" % str(diff))
+                raise ValueError("y contains new labels: %s" % str(diff_fit))
             else:
                 # Raise on invalid argument.
                 raise ValueError("Value of argument `new_labels`={0} "
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 9d028e6aaccea..9b7de7ba51c42 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -177,6 +177,17 @@ def test_label_encoder():
     assert_raises(ValueError, le.transform, [0, 6])
 
 
+def test_label_encoder_get_classes():
+    """Test LabelEncoder's get_classes method."""
+    le = LabelEncoder(new_labels="update")
+    le.fit([1, 1, 4, 5, -1, 0])
+    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
+    assert_array_equal(le.classes_, le.get_classes())
+    le.transform([10])
+    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
+    assert_array_equal(le.get_classes(), [-1, 0, 1, 4, 5, 10])
+
+
 def test_label_encoder_new_label_update():
     """Test LabelEncoder's transform on new labels"""
     le = LabelEncoder(new_labels="update")
@@ -188,16 +199,14 @@ def test_label_encoder_new_label_update():
                        ["c", "b", "a"])
     assert_array_equal(le.transform(["b", "c", "_"]),
                        [1, 2, 3])
-    assert_array_equal(le.classes_, ["a", "b", "c", "_"])
-    print(le.classes_)
+    assert_array_equal(le.get_classes(), ["a", "b", "c", "_"])
     assert_array_equal(le.transform(["_", "z", "a"]),
                        [3, 4, 0])
-    assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"])
 
 
 def test_label_encoder_new_label_replace():
     """Test LabelEncoder's transform on new labels"""
-    le = LabelEncoder(new_labels="label", new_label_class=-2)
+    le = LabelEncoder(new_labels=-99)
     le.fit(["a", "b", "b", "c"])
     assert_array_equal(le.classes_, ["a", "b", "c"])
     assert_array_equal(le.transform(["a", "a", "c"]),
@@ -205,7 +214,7 @@ def test_label_encoder_new_label_replace():
     assert_array_equal(le.inverse_transform([2, 1, 0]),
                        ["c", "b", "a"])
     assert_array_equal(le.transform(["b", "c", "d"]),
-                       [1, 2, -2])
+                       [1, 2, -99])
 
 
 def test_label_encoder_new_label_arg():

From 0b8e63cff88ecb3474fd694119d6efb380b94460 Mon Sep 17 00:00:00 2001
From: pvnguyen <phuongnv87@gmail.com>
Date: Mon, 21 Jul 2014 14:44:42 -0700
Subject: [PATCH 26/55] Update outlier_detection.rst

Add reference for One-class SVM.
---
 doc/modules/outlier_detection.rst | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst
index ee7c483c73a7e..a99758989e195 100644
--- a/doc/modules/outlier_detection.rst
+++ b/doc/modules/outlier_detection.rst
@@ -53,8 +53,8 @@ coming from the same population than the initial
 observations. Otherwise, if they lay outside the frontier, we can say
 that they are abnormal with a given confidence in our assessment.
 
-The One-Class SVM has been introduced in [1] for that purpose and
-implemented in the :ref:`svm` module in the
+The One-Class SVM has been introduced by Schölkopf et al. for that purpose 
+and implemented in the :ref:`svm` module in the
 :class:`svm.OneClassSVM` object. It requires the choice of a
 kernel and a scalar parameter to define a frontier.  The RBF kernel is
 usually chosen although there exists no exact formula or algorithm to
@@ -63,6 +63,12 @@ implementation. The :math:`\nu` parameter, also known as the margin of
 the One-Class SVM, corresponds to the probability of finding a new,
 but regular, observation outside the frontier.
 
+.. topic:: References:
+
+    * `Estimating the support of a high-dimensional distribution
+      <http://dl.acm.org/citation.cfm?id=1119749>`_ Schölkopf, 
+      Bernhard, et al. Neural computation 13.7 (2001): 1443-1471.
+      
 .. topic:: Examples:
 
    * See :ref:`example_svm_plot_oneclass.py` for visualizing the
@@ -73,7 +79,7 @@ but regular, observation outside the frontier.
    :target: ../auto_examples/svm/plot_oneclasse.html
    :align: center
    :scale: 75%
-
+   
 
 Outlier Detection
 =================

From 62f1f57ce586fb8c74be6d1e0a736565c9f04b45 Mon Sep 17 00:00:00 2001
From: Kyle Kastner <kastnerkyle@gmail.com>
Date: Wed, 23 Jul 2014 16:31:23 +0200
Subject: [PATCH 27/55] Added directory checking for documentation builds, and
 corrected for Windows pathing

---
 doc/sphinxext/gen_rst.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/doc/sphinxext/gen_rst.py b/doc/sphinxext/gen_rst.py
index 213484a515481..9a8d2535a2b26 100644
--- a/doc/sphinxext/gen_rst.py
+++ b/doc/sphinxext/gen_rst.py
@@ -468,7 +468,11 @@ def generate_example_rst(app):
         examples.
     """
     root_dir = os.path.join(app.builder.srcdir, 'auto_examples')
-    example_dir = os.path.abspath(app.builder.srcdir + '/../' + 'examples')
+    example_dir = os.path.abspath(os.path.join(app.builder.srcdir, '..',
+                                               'examples'))
+    generated_dir = os.path.abspath(os.path.join(app.builder.srcdir,
+                                                 'modules', 'generated'))
+
     try:
         plot_gallery = eval(app.builder.config.plot_gallery)
     except TypeError:
@@ -477,10 +481,12 @@ def generate_example_rst(app):
         os.makedirs(example_dir)
     if not os.path.exists(root_dir):
         os.makedirs(root_dir)
+    if not os.path.exists(generated_dir):
+        os.makedirs(generated_dir)
 
     # we create an index.rst with all examples
     fhindex = open(os.path.join(root_dir, 'index.rst'), 'w')
-    #Note: The sidebar button has been removed from the examples page for now
+    # Note: The sidebar button has been removed from the examples page for now
     #      due to how it messes up the layout. Will be fixed at a later point
     fhindex.write("""\
 

From d814353cc3c93536eea3df8a0dcc765ea18f0dfa Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 23 Jul 2014 14:44:13 +0200
Subject: [PATCH 28/55] MAINT More robust windows installation script

---
 appveyor.yml                                | 11 ++++++++---
 continuous_integration/appveyor/install.ps1 | 21 ++++++++++++++++-----
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index 54bd67e5f0c26..7d91bf06eecf1 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -13,7 +13,7 @@ environment:
       PYTHON_VERSION: "2.7.8"
       PYTHON_ARCH: "32"
 
-    - PYTHON: "C:\\Python27"
+    - PYTHON: "C:\\Python27_64"
       PYTHON_VERSION: "2.7.8"
       PYTHON_ARCH: "64"
 
@@ -21,10 +21,15 @@ environment:
       PYTHON_VERSION: "3.4.1"
       PYTHON_ARCH: "32"
 
-    - PYTHON: "C:\\Python34"
+    - PYTHON: "C:\\Python34_64"
       PYTHON_VERSION: "3.4.1"
       PYTHON_ARCH: "64"
 
+branches:
+  only:
+    - master
+    - 0.15.X
+
 install:
   # Install Python (from the official .msi of http://python.org) and pip when
   # not already installed.
@@ -53,7 +58,7 @@ test_script:
 
   # Skip joblib tests that require multiprocessing as they are prone to random
   # slow down
-  - "python -c \"import nose; nose.main()\" -v -s sklearn"
+  - "python -c \"import nose; nose.main()\" -s sklearn"
 
 artifacts:
   # Archive the generated wheel package in the ci.appveyor.com build report.
diff --git a/continuous_integration/appveyor/install.ps1 b/continuous_integration/appveyor/install.ps1
index fc06c58078965..2a96d3372ecab 100644
--- a/continuous_integration/appveyor/install.ps1
+++ b/continuous_integration/appveyor/install.ps1
@@ -52,12 +52,17 @@ function InstallPython ($python_version, $architecture, $python_home) {
     } else {
         $platform_suffix = ".amd64"
     }
-    $filepath = DownloadPython $python_version $platform_suffix
-    Write-Host "Installing" $filepath "to" $python_home
+    $msipath = DownloadPython $python_version $platform_suffix
+    Write-Host "Installing" $msipath "to" $python_home
     $install_log = $python_home + ".log"
-    $args = "/qn  /log $install_log /i $filepath TARGETDIR=$python_home"
-    Write-Host "msiexec.exe" $args
-    Start-Process -FilePath "msiexec.exe" -ArgumentList $args -Wait -Passthru
+    $install_args = "/qn /log $install_log /i $msipath TARGETDIR=$python_home"
+    $uninstall_args = "/qn /x $msipath"
+    RunCommand "msiexec.exe" $install_args
+    if (-not(Test-Path $python_home)) {
+        Write-Host "Python seems to be installed else-where, reinstalling."
+        RunCommand "msiexec.exe" $uninstall_args
+        RunCommand "msiexec.exe" $install_args
+    }
     if (Test-Path $python_home) {
         Write-Host "Python $python_version ($architecture) installation complete"
     } else {
@@ -67,6 +72,11 @@ function InstallPython ($python_version, $architecture, $python_home) {
     }
 }
 
+function RunCommand ($command, $command_args) {
+    Write-Host $command $command_args
+    Start-Process -FilePath $command -ArgumentList $command_args -Wait -Passthru
+}
+
 
 function InstallPip ($python_home) {
     $pip_path = $python_home + "\Scripts\pip.exe"
@@ -82,6 +92,7 @@ function InstallPip ($python_home) {
     }
 }
 
+
 function main () {
     InstallPython $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON
     InstallPip $env:PYTHON

From 0af2d8f96f1bf424235f2587effec7afd249c045 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 23 Jul 2014 16:04:29 +0200
Subject: [PATCH 29/55] MAINT move skip for unstable 32bit to
 _check_transformer

---
 sklearn/utils/estimator_checks.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 874dae7c338a8..c114d45887794 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -136,15 +136,6 @@ def check_regressors_classifiers_sparse_data(name, Estimator):
 
 
 def check_transformer(name, Transformer):
-    if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit():
-        # Those transformers yield non-deterministic output when executed on
-        # a 32bit Python. The same transformers are stable on 64bit Python.
-        # FIXME: try to isolate a minimalistic reproduction case only depending
-        # on numpy & scipy and/or maybe generate a test dataset that does not
-        # cause such unstable behaviors.
-        msg = name + ' is non deterministic on 32bit Python'
-        raise SkipTest(msg)
-
     X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                       random_state=0, n_features=2, cluster_std=0.1)
     X = StandardScaler().fit_transform(X)
@@ -166,6 +157,14 @@ def check_transformer_data_not_an_array(name, Transformer):
 
 
 def _check_transformer(name, Transformer, X, y):
+    if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit():
+        # Those transformers yield non-deterministic output when executed on
+        # a 32bit Python. The same transformers are stable on 64bit Python.
+        # FIXME: try to isolate a minimalistic reproduction case only depending
+        # on numpy & scipy and/or maybe generate a test dataset that does not
+        # cause such unstable behaviors.
+        msg = name + ' is non deterministic on 32bit Python'
+        raise SkipTest(msg)
     n_samples, n_features = np.asarray(X).shape
     # catch deprecation warnings
     with warnings.catch_warnings(record=True):

From f3afd4e5e294baae0edca2e1b5911b766224ca17 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 23 Jul 2014 15:18:16 +0200
Subject: [PATCH 30/55] FIX unstable test on 32 bit windows

---
 sklearn/feature_selection/tests/test_feature_select.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py
index 2734eb5ab1729..5ef88c41a7b61 100644
--- a/sklearn/feature_selection/tests/test_feature_select.py
+++ b/sklearn/feature_selection/tests/test_feature_select.py
@@ -48,8 +48,8 @@ def test_f_oneway_ints():
 
     # test that is gives the same result as with float
     f, p = f_oneway(X.astype(np.float), y)
-    assert_array_almost_equal(f, fint, decimal=5)
-    assert_array_almost_equal(p, pint, decimal=5)
+    assert_array_almost_equal(f, fint, decimal=4)
+    assert_array_almost_equal(p, pint, decimal=4)
 
 
 def test_f_classif():

From 4b6978e754d3dde7fa880b5f06df7247ab33ea69 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Tue, 3 Jun 2014 22:31:57 -0400
Subject: [PATCH 31/55] Adding new_labels argument to LabelEncoder

---
 sklearn/preprocessing/label.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index f304bf6104cae..774aa0ccf4cd3 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -53,6 +53,15 @@ def _check_numpy_unicode_bug(labels):
 class LabelEncoder(BaseEstimator, TransformerMixin):
     """Encode labels with value between 0 and n_classes-1.
 
+    Parameters
+    ----------
+
+    new_labels : string, optional (default: "raise")
+        Determines how to handle newly seen labels, i.e., data
+        not seen in the fit domain.  If "raise", then raise ValueError;
+        if "map", then re-map the new labels to class N, where seen
+        classes are in {0, ..., N-1}.
+
     Attributes
     ----------
     `classes_` : array of shape (n_class,)
@@ -87,6 +96,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     ['tokyo', 'tokyo', 'paris']
 
     """
+    def __init__(self, new_labels="raise"):
+        """Constructor"""
+        self.new_labels = new_labels
 
     def _check_fitted(self):
         if not hasattr(self, "classes_"):
@@ -144,7 +156,27 @@ def transform(self, y):
         _check_numpy_unicode_bug(classes)
         if len(np.intersect1d(classes, self.classes_)) < len(classes):
             diff = np.setdiff1d(classes, self.classes_)
-            raise ValueError("y contains new labels: %s" % str(diff))
+
+            # If we are mapping new labels, get "new" ID and change in copy.
+            if self.new_labels == "map":
+                # Get new ID and append to class list
+                missing_id = len(self.classes_)
+                self.classes_.resize(len(self.classes_)+1)
+                self.classes_[-1] = missing_id
+
+                # Reset the value in y_copy
+                missing_mask = np.in1d(y, diff)
+                y_copy = np.array(y)
+                y_copy[missing_mask] = missing_id
+
+                # Return mapped encoding
+                return np.searchsorted(self.classes_, y_copy)
+            elif self.new_labels == "raise":
+                raise ValueError("y contains new labels: %s" % str(diff))
+            else:
+                raise ValueError("Value of argument `new_labels`={0} "
+                                 "is unknown.".format(self.new_labels))
+
         return np.searchsorted(self.classes_, y)
 
     def inverse_transform(self, y):

From d99020715163838c0bb3ff792e82b5e26b824fde Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Tue, 3 Jun 2014 22:32:10 -0400
Subject: [PATCH 32/55] Adding tests for new_labels argument.

---
 sklearn/preprocessing/tests/test_label.py | 24 +++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index dfdb2d23a2134..f8e20645e6d6e 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -210,6 +210,30 @@ def test_label_encoder():
     assert_raises(ValueError, le.transform, [0, 6])
 
 
+def test_label_encoder_new_label():
+    """Test LabelEncoder's transform on new labels"""
+    le = LabelEncoder(new_labels="map")
+    le.fit(["a", "b", "b", "c"])
+    assert_array_equal(le.classes_, ["a", "b", "c"])
+    assert_array_equal(le.transform(["a", "a", "c"]),
+                       [0, 0, 2])
+    assert_array_equal(le.inverse_transform([2, 1, 0]),
+                       ["c", "b", "a"])
+    le.transform(["b", "c", "d"])
+
+
+def test_label_encoder_new_label_arg():
+    """Test LabelEncoder's  new_labels argument handling"""
+    le = LabelEncoder(new_labels="xyz")
+    le.fit(["a", "b", "b", "c"])
+    assert_array_equal(le.classes_, ["a", "b", "c"])
+    assert_array_equal(le.transform(["a", "a", "c"]),
+                       [0, 0, 2])
+    assert_array_equal(le.inverse_transform([2, 1, 0]),
+                       ["c", "b", "a"])
+    assert_raises(ValueError, le.transform, ["c", "d"])
+
+
 def test_label_encoder_fit_transform():
     """Test fit_transform"""
     le = LabelEncoder()

From a69840bcfcaf5e67796d741a930136bdea9ed820 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 08:01:33 -0400
Subject: [PATCH 33/55] Changing classes_ update strategy

---
 sklearn/preprocessing/label.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 774aa0ccf4cd3..b360b5b9f44c9 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -161,8 +161,7 @@ def transform(self, y):
             if self.new_labels == "map":
                 # Get new ID and append to class list
                 missing_id = len(self.classes_)
-                self.classes_.resize(len(self.classes_)+1)
-                self.classes_[-1] = missing_id
+                self.classes_ = np.append(self.classes_, missing_id)
 
                 # Reset the value in y_copy
                 missing_mask = np.in1d(y, diff)

From fce9fb541c25973e6ff9dc1e578d7c84c77df396 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 08:31:33 -0400
Subject: [PATCH 34/55] Adding nan behavior, renaming "map" to "update"

---
 sklearn/preprocessing/label.py | 38 ++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index b360b5b9f44c9..36c872468f6f8 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -57,10 +57,13 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     new_labels : string, optional (default: "raise")
-        Determines how to handle newly seen labels, i.e., data
-        not seen in the fit domain.  If "raise", then raise ValueError;
-        if "map", then re-map the new labels to class N, where seen
-        classes are in {0, ..., N-1}.
+        Determines how to handle new labels, i.e., data
+        not seen in the training domain.
+        - If "raise", then raise ValueError.
+        - If "update", then re-map the new labels to classes
+          `[N, ..., N+m-1]`, where `m` is the number of new labels.
+        - If "nan", then re-map the new labels to numpy.nan.
+
 
     Attributes
     ----------
@@ -158,21 +161,30 @@ def transform(self, y):
             diff = np.setdiff1d(classes, self.classes_)
 
             # If we are mapping new labels, get "new" ID and change in copy.
-            if self.new_labels == "map":
-                # Get new ID and append to class list
-                missing_id = len(self.classes_)
-                self.classes_ = np.append(self.classes_, missing_id)
+            if self.new_labels == "update":
+                # Update the class list with new labels
+                self.classes_ = np.append(self.classes_, np.sort(diff))
+
+                # Return mapped encoding
+                return np.searchsorted(self.classes_, y)
+            elif self.new_labels == "nan":
+                # Create copy of array and return
+                y_array = np.array(y)
+                z = np.zeros(y_array.shape)
 
-                # Reset the value in y_copy
+                # Find entries with new labels
                 missing_mask = np.in1d(y, diff)
-                y_copy = np.array(y)
-                y_copy[missing_mask] = missing_id
 
-                # Return mapped encoding
-                return np.searchsorted(self.classes_, y_copy)
+                # Populate return array properly and return
+                z[-missing_mask] = np.searchsorted(self.classes_,
+                                                   y_array[-missing_mask])
+                z[missing_mask] = np.nan
+                return z
             elif self.new_labels == "raise":
+                # Return ValueError, original behavior.
                 raise ValueError("y contains new labels: %s" % str(diff))
             else:
+                # Raise on invalid argument.
                 raise ValueError("Value of argument `new_labels`={0} "
                                  "is unknown.".format(self.new_labels))
 

From 76921e54931813c171d5612d541fbddd0539e128 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 08:32:10 -0400
Subject: [PATCH 35/55] Updating tests to include nan case and update name

---
 sklearn/preprocessing/tests/test_label.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index f8e20645e6d6e..14eb7c3e1fe2b 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -210,16 +210,30 @@ def test_label_encoder():
     assert_raises(ValueError, le.transform, [0, 6])
 
 
-def test_label_encoder_new_label():
+def test_label_encoder_new_label_update():
     """Test LabelEncoder's transform on new labels"""
-    le = LabelEncoder(new_labels="map")
+    le = LabelEncoder(new_labels="update")
     le.fit(["a", "b", "b", "c"])
     assert_array_equal(le.classes_, ["a", "b", "c"])
     assert_array_equal(le.transform(["a", "a", "c"]),
                        [0, 0, 2])
     assert_array_equal(le.inverse_transform([2, 1, 0]),
                        ["c", "b", "a"])
-    le.transform(["b", "c", "d"])
+    assert_array_equal(le.transform(["b", "c", "d"]),
+                       [1, 2, 3])
+
+
+def test_label_encoder_new_label_nan():
+    """Test LabelEncoder's transform on new labels"""
+    le = LabelEncoder(new_labels="nan")
+    le.fit(["a", "b", "b", "c"])
+    assert_array_equal(le.classes_, ["a", "b", "c"])
+    assert_array_equal(le.transform(["a", "a", "c"]),
+                       [0, 0, 2])
+    assert_array_equal(le.inverse_transform([2, 1, 0]),
+                       ["c", "b", "a"])
+    assert_array_equal(le.transform(["b", "c", "d"]),
+                       [1, 2, np.nan])
 
 
 def test_label_encoder_new_label_arg():

From 0e39a2ae5eb136aa7376410ea219c5fda3d54c83 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 11:40:50 -0400
Subject: [PATCH 36/55] Fixing docstring for test-doc pass

---
 sklearn/preprocessing/label.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 36c872468f6f8..e0f3a8598241c 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -75,7 +75,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     `LabelEncoder` can be used to normalize labels.
 
     >>> from sklearn import preprocessing
-    >>> le = preprocessing.LabelEncoder()
+    >>> le = preprocessing.LabelEncoder(new_values='raise')
     >>> le.fit([1, 2, 2, 6])
     LabelEncoder()
     >>> le.classes_
@@ -88,7 +88,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     It can also be used to transform non-numerical labels (as long as they are
     hashable and comparable) to numerical labels.
 
-    >>> le = preprocessing.LabelEncoder()
+    >>> le = preprocessing.LabelEncoder(new_values='raise')
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
     LabelEncoder()
     >>> list(le.classes_)

From 1da288087f7b3c121ca68eaac18040842b34cb36 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 11:42:01 -0400
Subject: [PATCH 37/55] Fixing docstring for test-doc pass (for real)

---
 sklearn/preprocessing/label.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index e0f3a8598241c..294d9f81ddc4e 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -75,7 +75,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     `LabelEncoder` can be used to normalize labels.
 
     >>> from sklearn import preprocessing
-    >>> le = preprocessing.LabelEncoder(new_values='raise')
+    >>> le = preprocessing.LabelEncoder(new_labels='raise')
     >>> le.fit([1, 2, 2, 6])
     LabelEncoder()
     >>> le.classes_
@@ -88,7 +88,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     It can also be used to transform non-numerical labels (as long as they are
     hashable and comparable) to numerical labels.
 
-    >>> le = preprocessing.LabelEncoder(new_values='raise')
+    >>> le = preprocessing.LabelEncoder(new_labels='raise')
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
     LabelEncoder()
     >>> list(le.classes_)

From 926b1666678390899e93917ecd188b02f47cd79c Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 13:07:34 -0400
Subject: [PATCH 38/55] Updating doctests

---
 doc/modules/preprocessing.rst  | 4 ++--
 sklearn/preprocessing/label.py | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 4d3b04ade3c7b..fc928df3833c2 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -397,7 +397,7 @@ follows::
     >>> from sklearn import preprocessing
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit([1, 2, 2, 6])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> le.classes_
     array([1, 2, 6])
     >>> le.transform([1, 1, 2, 6])
@@ -410,7 +410,7 @@ hashable and comparable) to numerical labels::
 
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> list(le.classes_)
     ['amsterdam', 'paris', 'tokyo']
     >>> le.transform(["tokyo", "tokyo", "paris"])
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 294d9f81ddc4e..5534f3b1cbd9f 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -75,9 +75,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     `LabelEncoder` can be used to normalize labels.
 
     >>> from sklearn import preprocessing
-    >>> le = preprocessing.LabelEncoder(new_labels='raise')
+    >>> le = preprocessing.LabelEncoder()
     >>> le.fit([1, 2, 2, 6])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> le.classes_
     array([1, 2, 6])
     >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
@@ -88,9 +88,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     It can also be used to transform non-numerical labels (as long as they are
     hashable and comparable) to numerical labels.
 
-    >>> le = preprocessing.LabelEncoder(new_labels='raise')
+    >>> le = preprocessing.LabelEncoder()
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> list(le.classes_)
     ['amsterdam', 'paris', 'tokyo']
     >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS

From 5ef9b85119ef10b6335e1f9708562c6f704c0da4 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Wed, 4 Jun 2014 13:37:32 -0400
Subject: [PATCH 39/55] Updating constructor documentation

---
 sklearn/preprocessing/label.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 5534f3b1cbd9f..364d2baed0096 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -59,10 +59,11 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     new_labels : string, optional (default: "raise")
         Determines how to handle new labels, i.e., data
         not seen in the training domain.
-        - If "raise", then raise ValueError.
-        - If "update", then re-map the new labels to classes
-          `[N, ..., N+m-1]`, where `m` is the number of new labels.
-        - If "nan", then re-map the new labels to numpy.nan.
+
+        - If ``"raise"``, then raise ValueError.
+        - If ``"update"``, then re-map the new labels to
+          classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
+        - If ``"nan"``, then re-map the new labels to ``numpy.nan``.
 
 
     Attributes

From 4dfb4cb5888a3985b38d4c89e28a7cc23e77b0d9 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Thu, 5 Jun 2014 11:09:23 -0400
Subject: [PATCH 40/55] Adding specific "label" option to new_labels

---
 sklearn/preprocessing/label.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 364d2baed0096..7c64a6a2f22fd 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -100,9 +100,10 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     ['tokyo', 'tokyo', 'paris']
 
     """
-    def __init__(self, new_labels="raise"):
+    def __init__(self, new_labels="raise", new_label_class=-1):
         """Constructor"""
         self.new_labels = new_labels
+        self.new_label_class = new_label_class
 
     def _check_fitted(self):
         if not hasattr(self, "classes_"):
@@ -181,6 +182,19 @@ def transform(self, y):
                                                    y_array[-missing_mask])
                 z[missing_mask] = np.nan
                 return z
+            elif self.new_labels == "label":
+                # Create copy of array and return
+                y_array = np.array(y)
+                z = np.zeros(y_array.shape)
+
+                # Find entries with new labels
+                missing_mask = np.in1d(y, diff)
+
+                # Populate return array properly and return
+                z[-missing_mask] = np.searchsorted(self.classes_,
+                                                   y_array[-missing_mask])
+                z[missing_mask] = self.new_label_class
+                return z
             elif self.new_labels == "raise":
                 # Return ValueError, original behavior.
                 raise ValueError("y contains new labels: %s" % str(diff))

From 392e54b03f71c1ac7308c856e874a409ecf2f4bc Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Thu, 5 Jun 2014 11:09:41 -0400
Subject: [PATCH 41/55] Adding test for "label" option to ``new_labels``

---
 sklearn/preprocessing/tests/test_label.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 14eb7c3e1fe2b..7347f73c8cdd7 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -236,6 +236,19 @@ def test_label_encoder_new_label_nan():
                        [1, 2, np.nan])
 
 
+def test_label_encoder_new_label_replace():
+    """Test LabelEncoder's transform on new labels"""
+    le = LabelEncoder(new_labels="label", new_label_class=-2)
+    le.fit(["a", "b", "b", "c"])
+    assert_array_equal(le.classes_, ["a", "b", "c"])
+    assert_array_equal(le.transform(["a", "a", "c"]),
+                       [0, 0, 2])
+    assert_array_equal(le.inverse_transform([2, 1, 0]),
+                       ["c", "b", "a"])
+    assert_array_equal(le.transform(["b", "c", "d"]),
+                       [1, 2, -2])
+
+
 def test_label_encoder_new_label_arg():
     """Test LabelEncoder's  new_labels argument handling"""
     le = LabelEncoder(new_labels="xyz")

From e05363553bb869d8eaef29acb796450899276dc0 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Thu, 5 Jun 2014 11:11:17 -0400
Subject: [PATCH 42/55] Updating docstring for ``new_labels="label"``

---
 sklearn/preprocessing/label.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 7c64a6a2f22fd..55a289c492dfd 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -64,6 +64,11 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
         - If ``"update"``, then re-map the new labels to
           classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
         - If ``"nan"``, then re-map the new labels to ``numpy.nan``.
+        - If ``"label"``, then use the value of ``new_label_class``.
+    
+    new_label_class : integer, optional (default: -1)
+        If ``new_labels="label"``, then this value will be assigned to
+        as the class for any new labels that are encountered.
 
 
     Attributes

From 122a98fe6ce0f7cf5f9f87172b73129d5d6412d4 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Thu, 5 Jun 2014 11:11:45 -0400
Subject: [PATCH 43/55] pep8

---
 sklearn/preprocessing/label.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 55a289c492dfd..e6afbac5535bb 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -65,7 +65,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
           classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
         - If ``"nan"``, then re-map the new labels to ``numpy.nan``.
         - If ``"label"``, then use the value of ``new_label_class``.
-    
+
     new_label_class : integer, optional (default: -1)
         If ``new_labels="label"``, then this value will be assigned to
         as the class for any new labels that are encountered.

From de183728162070a8bcd9d0edbd6209dfb9c40665 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Thu, 5 Jun 2014 11:34:23 -0400
Subject: [PATCH 44/55] Autodoc fix

---
 sklearn/preprocessing/label.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index e6afbac5535bb..c1d8403945919 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -83,7 +83,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     >>> from sklearn import preprocessing
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit([1, 2, 2, 6])
-    LabelEncoder(new_labels='raise')
+    LabelEncoder(new_label_class=-1, new_labels='raise')
     >>> le.classes_
     array([1, 2, 6])
     >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
@@ -96,7 +96,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
 
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
-    LabelEncoder(new_labels='raise')
+    LabelEncoder(new_label_class=-1, new_labels='raise')
     >>> list(le.classes_)
     ['amsterdam', 'paris', 'tokyo']
     >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS

From d735ca27df70bb213e6a346a379259fe32bdba55 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Sun, 8 Jun 2014 09:09:16 -0400
Subject: [PATCH 45/55] Fixing rst docs

---
 doc/modules/preprocessing.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index fc928df3833c2..7d4650cba9871 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -397,7 +397,7 @@ follows::
     >>> from sklearn import preprocessing
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit([1, 2, 2, 6])
-    LabelEncoder(new_labels='raise')
+    LabelEncoder(new_label_class=-1, new_labels='raise')
     >>> le.classes_
     array([1, 2, 6])
     >>> le.transform([1, 1, 2, 6])
@@ -410,7 +410,7 @@ hashable and comparable) to numerical labels::
 
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
-    LabelEncoder(new_labels='raise')
+    LabelEncoder(new_label_class=-1, new_labels='raise')
     >>> list(le.classes_)
     ['amsterdam', 'paris', 'tokyo']
     >>> le.transform(["tokyo", "tokyo", "paris"])

From d276565787c343ffc4bd46b6a6d8879a8cf32582 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Sun, 8 Jun 2014 14:04:07 -0400
Subject: [PATCH 46/55] Changing dtypes for new_labels

---
 sklearn/preprocessing/label.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index c1d8403945919..ec50567c84634 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -177,7 +177,7 @@ def transform(self, y):
             elif self.new_labels == "nan":
                 # Create copy of array and return
                 y_array = np.array(y)
-                z = np.zeros(y_array.shape)
+                z = np.zeros(y_array.shape, dtype=float)
 
                 # Find entries with new labels
                 missing_mask = np.in1d(y, diff)
@@ -190,7 +190,7 @@ def transform(self, y):
             elif self.new_labels == "label":
                 # Create copy of array and return
                 y_array = np.array(y)
-                z = np.zeros(y_array.shape)
+                z = np.zeros(y_array.shape, dtype=int)
 
                 # Find entries with new labels
                 missing_mask = np.in1d(y, diff)

From a01f8b0f1f5c353c1f0cea664a0887fee9265d49 Mon Sep 17 00:00:00 2001
From: mjbommar <michael@bommaritollc.com>
Date: Sun, 8 Jun 2014 15:52:36 -0400
Subject: [PATCH 47/55] Adding example for new_labels argument

---
 doc/modules/preprocessing.rst | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 7d4650cba9871..2915eb9c45dd1 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -418,6 +418,20 @@ hashable and comparable) to numerical labels::
     >>> list(le.inverse_transform([2, 2, 1]))
     ['tokyo', 'tokyo', 'paris']
 
+By default, ``LabelEncoder`` will throw a ``ValueError`` in the event that
+labels are passed in ``transform`` that were not seen in ``fit``.  This
+behavior can be handled with the ``new_labels`` parameter, which supports
+``"raise"``, ``"nan"``, ``"update"``, and ``"label"`` strategies for
+handling new labels.  For example, the ``"label"`` strategy will assign
+the unseen values a label of ``-1``.
+
+    >>> le = preprocessing.LabelEncoder(new_labels="label")
+    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
+    LabelEncoder(new_label_class=-1, new_labels='label')
+    >>> list(le.classes_)
+    ['amsterdam', 'paris', 'tokyo']
+    >>> le.transform(["tokyo", "tokyo", "paris", "rome"])
+    array([ 2,  2,  1, -1])
 
 Imputation of missing values
 ============================

From 495347c210f985879773bff981752ed3740ea7eb Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:22:30 -0400
Subject: [PATCH 48/55] Adding new_labels handling to fit/fit_transform

---
 sklearn/preprocessing/label.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index ec50567c84634..24e44158f4681 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -126,6 +126,12 @@ def fit(self, y):
         -------
         self : returns an instance of self.
         """
+        # Check new_labels parameter
+        if self.new_labels not in ["update", "nan", "raise", "label"]:
+            # Raise on invalid argument.
+                raise ValueError("Value of argument `new_labels`={0} "
+                                 "is unknown.".format(self.new_labels))
+
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
         self.classes_ = np.unique(y)
@@ -143,6 +149,12 @@ def fit_transform(self, y):
         -------
         y : array-like of shape [n_samples]
         """
+        # Check new_labels parameter
+        if self.new_labels not in ["update", "nan", "raise", "label"]:
+            # Raise on invalid argument.
+                raise ValueError("Value of argument `new_labels`={0} "
+                                 "is unknown.".format(self.new_labels))
+
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
         self.classes_, y = np.unique(y, return_inverse=True)

From dee4ae0cb73d884b098735fe8a1036221ef8f1f2 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:23:06 -0400
Subject: [PATCH 49/55] Improving difficulty of test cases with non-increasing
 unseen labels

---
 sklearn/preprocessing/tests/test_label.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 7347f73c8cdd7..0798a13291c0f 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -219,8 +219,8 @@ def test_label_encoder_new_label_update():
                        [0, 0, 2])
     assert_array_equal(le.inverse_transform([2, 1, 0]),
                        ["c", "b", "a"])
-    assert_array_equal(le.transform(["b", "c", "d"]),
-                       [1, 2, 3])
+    assert_array_equal(le.transform(["_", "b", "c", "d"]),
+                       [3, 1, 2, 4])
 
 
 def test_label_encoder_new_label_nan():
@@ -232,8 +232,8 @@ def test_label_encoder_new_label_nan():
                        [0, 0, 2])
     assert_array_equal(le.inverse_transform([2, 1, 0]),
                        ["c", "b", "a"])
-    assert_array_equal(le.transform(["b", "c", "d"]),
-                       [1, 2, np.nan])
+    assert_array_equal(le.transform(["_", "b", "c", "d"]),
+                       [np.nan, 1, 2, np.nan])
 
 
 def test_label_encoder_new_label_replace():

From c29701784f4a1dd27a27dd341d6d22cb4aa20d06 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:27:21 -0400
Subject: [PATCH 50/55] Moving ValueError check to fit

---
 sklearn/preprocessing/tests/test_label.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 0798a13291c0f..8c1dfa868dd10 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -252,13 +252,7 @@ def test_label_encoder_new_label_replace():
 def test_label_encoder_new_label_arg():
     """Test LabelEncoder's  new_labels argument handling"""
     le = LabelEncoder(new_labels="xyz")
-    le.fit(["a", "b", "b", "c"])
-    assert_array_equal(le.classes_, ["a", "b", "c"])
-    assert_array_equal(le.transform(["a", "a", "c"]),
-                       [0, 0, 2])
-    assert_array_equal(le.inverse_transform([2, 1, 0]),
-                       ["c", "b", "a"])
-    assert_raises(ValueError, le.transform, ["c", "d"])
+    assert_raises(ValueError, le.fit, ["a", "b", "b", "c"])
 
 
 def test_label_encoder_fit_transform():

From f29800b07b4772dceca7fdbec597532f41291e9d Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:50:14 -0400
Subject: [PATCH 51/55] Improving difficulty of new_labels='update' test to
 include multiple transform calls with new labels

---
 sklearn/preprocessing/tests/test_label.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 8c1dfa868dd10..e76b49827dfc2 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -219,8 +219,14 @@ def test_label_encoder_new_label_update():
                        [0, 0, 2])
     assert_array_equal(le.inverse_transform([2, 1, 0]),
                        ["c", "b", "a"])
-    assert_array_equal(le.transform(["_", "b", "c", "d"]),
-                       [3, 1, 2, 4])
+    assert_array_equal(le.transform(["b", "c", "_"]),
+                       [1, 2, 3])
+    assert_array_equal(le.classes_, ["a", "b", "c", "_"])
+    print(le.classes_)
+    assert_array_equal(le.transform(["_", "z", "a"]),
+                       [3, 4, 0])
+    assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"])
+
 
 
 def test_label_encoder_new_label_nan():

From 74b75896110364183e56f8d7840137044d357f71 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:51:24 -0400
Subject: [PATCH 52/55] Fixing negative indexing, renamed z->out, failing
 approach for new_labels=update w/ searchsorted

---
 sklearn/preprocessing/label.py | 45 ++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 24e44158f4681..26c5f1ef8fb01 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -178,40 +178,53 @@ def transform(self, y):
         _check_numpy_unicode_bug(classes)
         if len(np.intersect1d(classes, self.classes_)) < len(classes):
             diff = np.setdiff1d(classes, self.classes_)
+            # Create copy of array and return
+            y = np.array(y)
 
             # If we are mapping new labels, get "new" ID and change in copy.
             if self.new_labels == "update":
+                # Setup out
+                out = np.zeros(y.shape, dtype=int)
+
+                #  Find entries with new labels
+                missing_mask = np.in1d(y, diff)
+                new_class_values = np.sort(diff)
+
+                # Populate return array properly and return
+                out[~missing_mask] = np.searchsorted(self.classes_,
+                                                   y[~missing_mask])
+                out[missing_mask] = np.searchsorted(new_class_values,
+                                                   y[missing_mask]) + len(self.classes_)
+
                 # Update the class list with new labels
-                self.classes_ = np.append(self.classes_, np.sort(diff))
+                self.classes_ = np.append(self.classes_, new_class_values)
 
                 # Return mapped encoding
-                return np.searchsorted(self.classes_, y)
+                return out
             elif self.new_labels == "nan":
-                # Create copy of array and return
-                y_array = np.array(y)
-                z = np.zeros(y_array.shape, dtype=float)
+                # Setup out
+                out = np.zeros(y.shape, dtype=float)
 
                 # Find entries with new labels
                 missing_mask = np.in1d(y, diff)
 
                 # Populate return array properly and return
-                z[-missing_mask] = np.searchsorted(self.classes_,
-                                                   y_array[-missing_mask])
-                z[missing_mask] = np.nan
-                return z
+                out[~missing_mask] = np.searchsorted(self.classes_,
+                                                   y[~missing_mask])
+                out[missing_mask] = np.nan
+                return out
             elif self.new_labels == "label":
-                # Create copy of array and return
-                y_array = np.array(y)
-                z = np.zeros(y_array.shape, dtype=int)
+                # Setup out
+                out = np.zeros(y.shape, dtype=int)
 
                 # Find entries with new labels
                 missing_mask = np.in1d(y, diff)
 
                 # Populate return array properly and return
-                z[-missing_mask] = np.searchsorted(self.classes_,
-                                                   y_array[-missing_mask])
-                z[missing_mask] = self.new_label_class
-                return z
+                out[~missing_mask] = np.searchsorted(self.classes_,
+                                                   y[~missing_mask])
+                out[missing_mask] = self.new_label_class
+                return out
             elif self.new_labels == "raise":
                 # Return ValueError, original behavior.
                 raise ValueError("y contains new labels: %s" % str(diff))

From 3e1be5dc3318d16d21e7148a9c4594ecb9a7a7d8 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Mon, 14 Jul 2014 22:56:03 -0400
Subject: [PATCH 53/55] PEP8

---
 sklearn/preprocessing/label.py            | 22 ++++++++++++----------
 sklearn/preprocessing/tests/test_label.py |  1 -
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 26c5f1ef8fb01..ebd900890f7bd 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -1,6 +1,6 @@
 # Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
-#          Mathieu Blondel <mathieu@mblondel.org>
-#          Olivier Grisel <olivier.grisel@ensta.org>
+# Mathieu Blondel <mathieu@mblondel.org>
+# Olivier Grisel <olivier.grisel@ensta.org>
 #          Andreas Mueller <amueller@ais.uni-bonn.de>
 #          Joel Nothman <joel.nothman@gmail.com>
 #          Hamzeh Alsalhi <ha258@cornell.edu>
@@ -105,6 +105,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     ['tokyo', 'tokyo', 'paris']
 
     """
+
     def __init__(self, new_labels="raise", new_label_class=-1):
         """Constructor"""
         self.new_labels = new_labels
@@ -129,8 +130,8 @@ def fit(self, y):
         # Check new_labels parameter
         if self.new_labels not in ["update", "nan", "raise", "label"]:
             # Raise on invalid argument.
-                raise ValueError("Value of argument `new_labels`={0} "
-                                 "is unknown.".format(self.new_labels))
+            raise ValueError("Value of argument `new_labels`={0} "
+                             "is unknown.".format(self.new_labels))
 
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
@@ -152,8 +153,8 @@ def fit_transform(self, y):
         # Check new_labels parameter
         if self.new_labels not in ["update", "nan", "raise", "label"]:
             # Raise on invalid argument.
-                raise ValueError("Value of argument `new_labels`={0} "
-                                 "is unknown.".format(self.new_labels))
+            raise ValueError("Value of argument `new_labels`={0} "
+                             "is unknown.".format(self.new_labels))
 
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
@@ -192,9 +193,10 @@ def transform(self, y):
 
                 # Populate return array properly and return
                 out[~missing_mask] = np.searchsorted(self.classes_,
-                                                   y[~missing_mask])
+                                                     y[~missing_mask])
                 out[missing_mask] = np.searchsorted(new_class_values,
-                                                   y[missing_mask]) + len(self.classes_)
+                                                    y[missing_mask]) + \
+                    len(self.classes_)
 
                 # Update the class list with new labels
                 self.classes_ = np.append(self.classes_, new_class_values)
@@ -210,7 +212,7 @@ def transform(self, y):
 
                 # Populate return array properly and return
                 out[~missing_mask] = np.searchsorted(self.classes_,
-                                                   y[~missing_mask])
+                                                     y[~missing_mask])
                 out[missing_mask] = np.nan
                 return out
             elif self.new_labels == "label":
@@ -222,7 +224,7 @@ def transform(self, y):
 
                 # Populate return array properly and return
                 out[~missing_mask] = np.searchsorted(self.classes_,
-                                                   y[~missing_mask])
+                                                     y[~missing_mask])
                 out[missing_mask] = self.new_label_class
                 return out
             elif self.new_labels == "raise":
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index e76b49827dfc2..30e9262b1968d 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -228,7 +228,6 @@ def test_label_encoder_new_label_update():
     assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"])
 
 
-
 def test_label_encoder_new_label_nan():
     """Test LabelEncoder's transform on new labels"""
     le = LabelEncoder(new_labels="nan")

From abf01cc93caf381770cdfcaf19936c3871d53a46 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Sat, 19 Jul 2014 09:02:49 -0400
Subject: [PATCH 54/55] Removing nan option and corresponding test

---
 sklearn/preprocessing/label.py            | 17 ++---------------
 sklearn/preprocessing/tests/test_label.py | 13 -------------
 2 files changed, 2 insertions(+), 28 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index ebd900890f7bd..4a4caa9cda17f 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -63,7 +63,6 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
         - If ``"raise"``, then raise ValueError.
         - If ``"update"``, then re-map the new labels to
           classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
-        - If ``"nan"``, then re-map the new labels to ``numpy.nan``.
         - If ``"label"``, then use the value of ``new_label_class``.
 
     new_label_class : integer, optional (default: -1)
@@ -128,7 +127,7 @@ def fit(self, y):
         self : returns an instance of self.
         """
         # Check new_labels parameter
-        if self.new_labels not in ["update", "nan", "raise", "label"]:
+        if self.new_labels not in ["update", "raise", "label"]:
             # Raise on invalid argument.
             raise ValueError("Value of argument `new_labels`={0} "
                              "is unknown.".format(self.new_labels))
@@ -151,7 +150,7 @@ def fit_transform(self, y):
         y : array-like of shape [n_samples]
         """
         # Check new_labels parameter
-        if self.new_labels not in ["update", "nan", "raise", "label"]:
+        if self.new_labels not in ["update", "raise", "label"]:
             # Raise on invalid argument.
             raise ValueError("Value of argument `new_labels`={0} "
                              "is unknown.".format(self.new_labels))
@@ -203,18 +202,6 @@ def transform(self, y):
 
                 # Return mapped encoding
                 return out
-            elif self.new_labels == "nan":
-                # Setup out
-                out = np.zeros(y.shape, dtype=float)
-
-                # Find entries with new labels
-                missing_mask = np.in1d(y, diff)
-
-                # Populate return array properly and return
-                out[~missing_mask] = np.searchsorted(self.classes_,
-                                                     y[~missing_mask])
-                out[missing_mask] = np.nan
-                return out
             elif self.new_labels == "label":
                 # Setup out
                 out = np.zeros(y.shape, dtype=int)
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 30e9262b1968d..69d6c6d04ae88 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -228,19 +228,6 @@ def test_label_encoder_new_label_update():
     assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"])
 
 
-def test_label_encoder_new_label_nan():
-    """Test LabelEncoder's transform on new labels"""
-    le = LabelEncoder(new_labels="nan")
-    le.fit(["a", "b", "b", "c"])
-    assert_array_equal(le.classes_, ["a", "b", "c"])
-    assert_array_equal(le.transform(["a", "a", "c"]),
-                       [0, 0, 2])
-    assert_array_equal(le.inverse_transform([2, 1, 0]),
-                       ["c", "b", "a"])
-    assert_array_equal(le.transform(["_", "b", "c", "d"]),
-                       [np.nan, 1, 2, np.nan])
-
-
 def test_label_encoder_new_label_replace():
     """Test LabelEncoder's transform on new labels"""
     le = LabelEncoder(new_labels="label", new_label_class=-2)

From f26a9022f5c5aaa9c9c8b6a108c4b5f95f068c31 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael@bommaritollc.com>
Date: Sat, 19 Jul 2014 11:10:12 -0400
Subject: [PATCH 55/55] Handling repeated transform calls with
 new_class_mapping_, refactoring, cleaning after removing np.nan.

---
 sklearn/preprocessing/label.py            | 100 +++++++++++++---------
 sklearn/preprocessing/tests/test_label.py |  19 ++--
 2 files changed, 74 insertions(+), 45 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 4a4caa9cda17f..6112b8cf77925 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -10,6 +10,7 @@
 import itertools
 import array
 import warnings
+import operator
 
 import numpy as np
 import scipy.sparse as sp
@@ -63,18 +64,19 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
         - If ``"raise"``, then raise ValueError.
         - If ``"update"``, then re-map the new labels to
           classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
-        - If ``"label"``, then use the value of ``new_label_class``.
-
-    new_label_class : integer, optional (default: -1)
-        If ``new_labels="label"``, then this value will be assigned to
-        as the class for any new labels that are encountered.
-
+        - If an integer value is passed, then re-label with this value.
+          N.B. that default values are in [0, 1, ...], so caution should be
+          taken if a non-negative value is passed to not accidentally
+          intersect.
 
     Attributes
     ----------
     `classes_` : array of shape (n_class,)
         Holds the label for each class.
 
+    `new_label_mapping_` : dictionary
+        Stores the mapping for classes not seen during original ``fit``.
+
     Examples
     --------
     `LabelEncoder` can be used to normalize labels.
@@ -105,15 +107,34 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
 
     """
 
-    def __init__(self, new_labels="raise", new_label_class=-1):
+    def __init__(self, new_labels="raise"):
         """Constructor"""
         self.new_labels = new_labels
-        self.new_label_class = new_label_class
+        self.new_label_mapping_ = {}
 
     def _check_fitted(self):
         if not hasattr(self, "classes_"):
             raise ValueError("LabelEncoder was not fitted yet.")
 
+    def get_classes(self):
+        """Get classes that have been observed by the encoder.  Note that this
+        method returns classes seen both at original ``fit`` time (i.e.,
+        ``self.classes_``) and classes seen after ``fit`` (i.e.,
+        ``self.new_label_mapping_.keys()``) for applicable values of
+        ``new_labels``.
+
+        Returns
+        -------
+        classes : array-like of shape [n_classes]
+        """
+        # If we've seen updates, include them in the order they were added.
+        if len(self.new_label_mapping_) > 0:
+            sorted_new, _ = zip(*sorted(self.new_label_mapping_.iteritems(),
+                                        key=operator.itemgetter(1)))
+            return np.append(self.classes_, sorted_new)
+        else:
+            return self.classes_
+
     def fit(self, y):
         """Fit label encoder
 
@@ -127,10 +148,12 @@ def fit(self, y):
         self : returns an instance of self.
         """
         # Check new_labels parameter
-        if self.new_labels not in ["update", "raise", "label"]:
+        if self.new_labels not in ["update", "raise"] and \
+                type(self.new_labels) not in [int]:
             # Raise on invalid argument.
             raise ValueError("Value of argument `new_labels`={0} "
-                             "is unknown.".format(self.new_labels))
+                             "is unknown and not integer."
+                             .format(self.new_labels))
 
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
@@ -150,10 +173,12 @@ def fit_transform(self, y):
         y : array-like of shape [n_samples]
         """
         # Check new_labels parameter
-        if self.new_labels not in ["update", "raise", "label"]:
+        if self.new_labels not in ["update", "raise"] and \
+                type(self.new_labels) not in [int]:
             # Raise on invalid argument.
             raise ValueError("Value of argument `new_labels`={0} "
-                             "is unknown.".format(self.new_labels))
+                             "is unknown and not integer."
+                             .format(self.new_labels))
 
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
@@ -176,47 +201,42 @@ def transform(self, y):
 
         classes = np.unique(y)
         _check_numpy_unicode_bug(classes)
-        if len(np.intersect1d(classes, self.classes_)) < len(classes):
-            diff = np.setdiff1d(classes, self.classes_)
+        if len(np.intersect1d(classes, self.get_classes())) < len(classes):
+            # Get the new classes
+            diff_fit = np.setdiff1d(classes, self.classes_)
+            diff_new = np.setdiff1d(classes, self.get_classes())
+
             # Create copy of array and return
             y = np.array(y)
 
             # If we are mapping new labels, get "new" ID and change in copy.
             if self.new_labels == "update":
-                # Setup out
-                out = np.zeros(y.shape, dtype=int)
-
-                #  Find entries with new labels
-                missing_mask = np.in1d(y, diff)
-                new_class_values = np.sort(diff)
+                # Update the new label mapping
+                next_label = len(self.get_classes())
+                self.new_label_mapping_.update(dict(zip(diff_new,
+                                                        range(next_label,
+                                                              next_label +
+                                                              len(diff_new)))))
 
-                # Populate return array properly and return
-                out[~missing_mask] = np.searchsorted(self.classes_,
-                                                     y[~missing_mask])
-                out[missing_mask] = np.searchsorted(new_class_values,
-                                                    y[missing_mask]) + \
-                    len(self.classes_)
-
-                # Update the class list with new labels
-                self.classes_ = np.append(self.classes_, new_class_values)
+                # Find entries with new labels
+                missing_mask = np.in1d(y, diff_fit)
 
-                # Return mapped encoding
+                # Populate return array properly by mask and return
+                out = np.searchsorted(self.classes_, y)
+                out[missing_mask] = [self.new_label_mapping_[value]
+                                     for value in y[missing_mask]]
                 return out
-            elif self.new_labels == "label":
-                # Setup out
-                out = np.zeros(y.shape, dtype=int)
-
+            elif type(self.new_labels) in [int]:
                 # Find entries with new labels
-                missing_mask = np.in1d(y, diff)
+                missing_mask = np.in1d(y, diff_fit)
 
-                # Populate return array properly and return
-                out[~missing_mask] = np.searchsorted(self.classes_,
-                                                     y[~missing_mask])
-                out[missing_mask] = self.new_label_class
+                # Populate return array properly by mask and return
+                out = np.searchsorted(self.classes_, y)
+                out[missing_mask] = self.new_labels
                 return out
             elif self.new_labels == "raise":
                 # Return ValueError, original behavior.
-                raise ValueError("y contains new labels: %s" % str(diff))
+                raise ValueError("y contains new labels: %s" % str(diff_fit))
             else:
                 # Raise on invalid argument.
                 raise ValueError("Value of argument `new_labels`={0} "
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 69d6c6d04ae88..70a47fcffd498 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -210,6 +210,17 @@ def test_label_encoder():
     assert_raises(ValueError, le.transform, [0, 6])
 
 
+def test_label_encoder_get_classes():
+    """Test LabelEncoder's get_classes method."""
+    le = LabelEncoder(new_labels="update")
+    le.fit([1, 1, 4, 5, -1, 0])
+    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
+    assert_array_equal(le.classes_, le.get_classes())
+    le.transform([10])
+    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
+    assert_array_equal(le.get_classes(), [-1, 0, 1, 4, 5, 10])
+
+
 def test_label_encoder_new_label_update():
     """Test LabelEncoder's transform on new labels"""
     le = LabelEncoder(new_labels="update")
@@ -221,16 +232,14 @@ def test_label_encoder_new_label_update():
                        ["c", "b", "a"])
     assert_array_equal(le.transform(["b", "c", "_"]),
                        [1, 2, 3])
-    assert_array_equal(le.classes_, ["a", "b", "c", "_"])
-    print(le.classes_)
+    assert_array_equal(le.get_classes(), ["a", "b", "c", "_"])
     assert_array_equal(le.transform(["_", "z", "a"]),
                        [3, 4, 0])
-    assert_array_equal(le.classes_, ["a", "b", "c", "_", "z"])
 
 
 def test_label_encoder_new_label_replace():
     """Test LabelEncoder's transform on new labels"""
-    le = LabelEncoder(new_labels="label", new_label_class=-2)
+    le = LabelEncoder(new_labels=-99)
     le.fit(["a", "b", "b", "c"])
     assert_array_equal(le.classes_, ["a", "b", "c"])
     assert_array_equal(le.transform(["a", "a", "c"]),
@@ -238,7 +247,7 @@ def test_label_encoder_new_label_replace():
     assert_array_equal(le.inverse_transform([2, 1, 0]),
                        ["c", "b", "a"])
     assert_array_equal(le.transform(["b", "c", "d"]),
-                       [1, 2, -2])
+                       [1, 2, -99])
 
 
 def test_label_encoder_new_label_arg():