-
-
Notifications
You must be signed in to change notification settings - Fork 26.9k
MRG Minibatch reassignment fixes #2638
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
92365c2
7565168
9202aea
f092c83
9b2b115
59fbdfe
3dff264
b93a49e
24d3b1e
9862baa
9e8e56b
21ee627
05d1e0b
9e05470
07800ce
2ca752b
21f2709
f76a4ed
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,7 @@ | |
| from ..utils import atleast2d_or_csr | ||
| from ..utils import as_float_array | ||
| from ..utils import gen_batches | ||
| from ..utils.random import choice | ||
| from ..externals.joblib import Parallel | ||
| from ..externals.joblib import delayed | ||
|
|
||
|
|
@@ -396,28 +397,59 @@ def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300, | |
| return best_labels, best_inertia, best_centers | ||
|
|
||
|
|
||
| def _labels_inertia_precompute_dense(X, x_squared_norms, centers): | ||
| def _labels_inertia_precompute_dense(X, x_squared_norms, centers, distances): | ||
| """Compute labels and inertia using a full distance matrix. | ||
|
|
||
| This will overwrite the 'distances' array in-place. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| X : numpy array, shape (n_sample, n_features) | ||
| Input data. | ||
|
|
||
| x_squared_norms : numpy array, shape (n_samples,) | ||
| Precomputed squared norms of X. | ||
|
|
||
| centers : numpy array, shape (n_clusters, n_features) | ||
| Cluster centers which data is assigned to. | ||
|
|
||
| distances : numpy array, shape (n_samples,) | ||
| Pre-allocated array in which distances are stored. | ||
|
|
||
| Returns | ||
| ------- | ||
| labels : numpy array, dtype=np.int, shape (n_samples,) | ||
| Indices of clusters that samples are assigned to. | ||
|
|
||
| inertia : float | ||
| Sum of distances of samples to their closest cluster center. | ||
|
|
||
| """ | ||
| n_samples = X.shape[0] | ||
| k = centers.shape[0] | ||
| distances = euclidean_distances(centers, X, x_squared_norms, | ||
| squared=True) | ||
| all_distances = euclidean_distances(centers, X, x_squared_norms, | ||
| squared=True) | ||
| labels = np.empty(n_samples, dtype=np.int32) | ||
| labels.fill(-1) | ||
| mindist = np.empty(n_samples) | ||
| mindist.fill(np.infty) | ||
| for center_id in range(k): | ||
| dist = distances[center_id] | ||
| dist = all_distances[center_id] | ||
| labels[dist < mindist] = center_id | ||
| mindist = np.minimum(dist, mindist) | ||
| if n_samples == distances.shape[0]: | ||
| # distances will be changed in-place | ||
| distances[:] = mindist | ||
| inertia = mindist.sum() | ||
| return labels, inertia | ||
|
|
||
|
|
||
| def _labels_inertia(X, x_squared_norms, centers, | ||
| precompute_distances=True, distances=None): | ||
| """E step of the K-means EM algorithm | ||
| """E step of the K-means EM algorithm. | ||
|
|
||
| Compute the labels and the inertia of the given samples and centers | ||
| Compute the labels and the inertia of the given samples and centers. | ||
| This will compute the distances in-place. | ||
|
|
||
| Parameters | ||
| ---------- | ||
|
|
@@ -431,6 +463,9 @@ def _labels_inertia(X, x_squared_norms, centers, | |
| centers: float64 array, shape (k, n_features) | ||
| The cluster centers. | ||
|
|
||
| precompute_distances : boolean, default: True | ||
| Precompute distances (faster but takes more memory). | ||
|
|
||
| distances: float64 array, shape (n_samples,) | ||
| Pre-allocated array to be filled in with each sample's distance | ||
| to the closest center. | ||
|
|
@@ -440,22 +475,23 @@ def _labels_inertia(X, x_squared_norms, centers, | |
| labels: int array of shape(n) | ||
| The resulting assignment | ||
|
|
||
| inertia: float | ||
| The value of the inertia criterion with the assignment | ||
| inertia : float | ||
| Sum of distances of samples to their closest cluster center. | ||
| """ | ||
| n_samples = X.shape[0] | ||
| # set the default value of centers to -1 to be able to detect any anomaly | ||
| # easily | ||
| labels = - np.ones(n_samples, np.int32) | ||
| labels = -np.ones(n_samples, np.int32) | ||
| if distances is None: | ||
| distances = np.zeros(shape=(0,), dtype=np.float64) | ||
| # distances will be changed in-place | ||
| if sp.issparse(X): | ||
| inertia = _k_means._assign_labels_csr( | ||
| X, x_squared_norms, centers, labels, distances=distances) | ||
| else: | ||
| if precompute_distances: | ||
| return _labels_inertia_precompute_dense(X, x_squared_norms, | ||
| centers) | ||
| centers, distances) | ||
| inertia = _k_means._assign_labels_array( | ||
| X, x_squared_norms, centers, labels, distances=distances) | ||
| return labels, inertia | ||
|
|
@@ -550,11 +586,11 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): | |
| The number of clusters to form as well as the number of | ||
| centroids to generate. | ||
|
|
||
| max_iter : int | ||
| max_iter : int, default: 300 | ||
| Maximum number of iterations of the k-means algorithm for a | ||
| single run. | ||
|
|
||
| n_init : int, optional, default: 10 | ||
| n_init : int, default: 10 | ||
| Number of times the k-means algorithm will be run with different | ||
| centroid seeds. The final results will be the best output of | ||
| n_init consecutive runs in terms of inertia. | ||
|
|
@@ -572,13 +608,13 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): | |
| If an ndarray is passed, it should be of shape (n_clusters, n_features) | ||
| and gives the initial centers. | ||
|
|
||
| precompute_distances : boolean | ||
| precompute_distances : boolean, default: True | ||
| Precompute distances (faster but takes more memory). | ||
|
|
||
| tol : float, optional default: 1e-4 | ||
| tol : float, default: 1e-4 | ||
| Relative tolerance w.r.t. inertia to declare convergence | ||
|
|
||
| n_jobs : int | ||
| n_jobs : int, default: 1 | ||
| The number of jobs to use for the computation. This works by breaking | ||
| down the pairwise matrix into n_jobs even slices and computing them in | ||
| parallel. | ||
|
|
@@ -602,8 +638,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): | |
| Labels of each point | ||
|
|
||
| `inertia_` : float | ||
| The value of the inertia criterion associated with the chosen | ||
| partition. | ||
| Sum of distances of samples to their closest cluster center. | ||
|
|
||
| Notes | ||
| ------ | ||
|
|
@@ -717,7 +752,7 @@ def fit_transform(self, X, y=None): | |
| return self.fit(X)._transform(X) | ||
|
|
||
| def transform(self, X, y=None): | ||
| """Transform X to a cluster-distance space | ||
| """Transform X to a cluster-distance space. | ||
|
|
||
| In the new space, each dimension is the distance to the cluster | ||
| centers. Note that even if X is sparse, the array returned by | ||
|
|
@@ -787,47 +822,55 @@ def _mini_batch_step(X, x_squared_norms, centers, counts, | |
| distances, random_reassign=False, | ||
| random_state=None, reassignment_ratio=.01, | ||
| verbose=False): | ||
| """Incremental update of the centers for the Minibatch K-Means algorithm | ||
| """Incremental update of the centers for the Minibatch K-Means algorithm. | ||
|
|
||
| Parameters | ||
| ---------- | ||
|
|
||
| X: array, shape (n_samples, n_features) | ||
| X : array, shape (n_samples, n_features) | ||
| The original data array. | ||
|
|
||
| x_squared_norms: array, shape (n_samples,) | ||
| x_squared_norms : array, shape (n_samples,) | ||
| Squared euclidean norm of each data point. | ||
|
|
||
| centers: array, shape (k, n_features) | ||
| centers : array, shape (k, n_features) | ||
| The cluster centers. This array is MODIFIED IN PLACE | ||
|
|
||
| counts: array, shape (k,) | ||
| counts : array, shape (k,) | ||
| The vector in which we keep track of the numbers of elements in a | ||
| cluster. This array is MODIFIED IN PLACE | ||
|
|
||
| distances: array, dtype float64, shape (n_samples), optional | ||
| distances : array, dtype float64, shape (n_samples), optional | ||
| If not None, should be a pre-allocated array that will be used to store | ||
| the distances of each sample to its closest center. | ||
| May not be None when random_reassign is True. | ||
|
|
||
| random_state: integer or numpy.RandomState, optional | ||
| random_state : integer or numpy.RandomState, optional | ||
| The generator used to initialize the centers. If an integer is | ||
| given, it fixes the seed. Defaults to the global numpy random | ||
| number generator. | ||
|
|
||
| random_reassign: boolean, optional | ||
| random_reassign : boolean, optional | ||
| If True, centers with very low counts are | ||
| randomly-reassigned to observations in dense areas. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not what the new strategy is implementing. The new strategy picks new clusters in locations that are badly represented by the current clusters. |
||
|
|
||
| reassignment_ratio: float, optional | ||
| reassignment_ratio : float, optional | ||
| Control the fraction of the maximum number of counts for a | ||
| center to be reassigned. A higher value means that low count | ||
| centers are more easily reassigned, which means that the | ||
| model will take longer to converge, but should converge in a | ||
| better clustering. | ||
|
|
||
| verbose: bool, optional | ||
| Controls the verbosity | ||
| verbose : bool, optional | ||
| Controls the verbosity. | ||
|
|
||
| Returns | ||
| ------- | ||
| inertia : float | ||
| Sum of distances of samples to their closest cluster center. | ||
|
|
||
| squared_diff : numpy array, shape (n_clusters,) | ||
| Squared distances between previous and updated cluster centers. | ||
|
|
||
| """ | ||
| # Perform label assignment to nearest centers | ||
|
|
@@ -837,19 +880,21 @@ def _mini_batch_step(X, x_squared_norms, centers, counts, | |
| if random_reassign and reassignment_ratio > 0: | ||
| random_state = check_random_state(random_state) | ||
| # Reassign clusters that have very low counts | ||
| to_reassign = np.logical_or( | ||
| (counts <= 1), counts <= reassignment_ratio * counts.max()) | ||
| n_reassigns = min(to_reassign.sum(), X.shape[0]) | ||
| to_reassign = counts < reassignment_ratio * counts.max() | ||
| # maximally assign reassignment_ratio*batch_size samples as clusters | ||
| if to_reassign.sum() > .5 * X.shape[0]: | ||
| indices_dont_reassign = np.argsort(counts)[.5 * X.shape[0]:] | ||
| to_reassign[indices_dont_reassign] = False | ||
| n_reassigns = to_reassign.sum() | ||
| if n_reassigns: | ||
| # Pick new clusters amongst observations with probability | ||
| # proportional to their closeness to their center. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is now the opposite: the new clusters are picked from observations with a higher probability if they are far away from existing centers. |
||
| # Flip the ordering of the distances. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This line should be removed: the ordering is now preserved. |
||
| distances -= distances.max() | ||
| distances *= -1 | ||
| rand_vals = random_state.rand(n_reassigns) | ||
| rand_vals *= distances.sum() | ||
| new_centers = np.searchsorted(distances.cumsum(), | ||
| rand_vals) | ||
| distances += 1e-10 | ||
| distances /= distances.sum() | ||
|
|
||
| new_centers = choice(X.shape[0], replace=False, p=distances, | ||
| size=n_reassigns) | ||
| if verbose: | ||
| print("[MiniBatchKMeans] Reassigning %i cluster centers." | ||
| % n_reassigns) | ||
|
|
@@ -859,6 +904,10 @@ def _mini_batch_step(X, x_squared_norms, centers, counts, | |
| centers) | ||
| else: | ||
| centers[to_reassign] = X[new_centers] | ||
| # reset counts of reassigned centers, but don't reset them too small | ||
| # to avoid instant reassignment. This is a pretty dirty hack as it | ||
| # also modifies the learning rates. | ||
| counts[to_reassign] = np.min(counts[~to_reassign]) | ||
|
|
||
| # implementation for the sparse CSR representation completely written in | ||
| # cython | ||
|
|
@@ -979,14 +1028,14 @@ class MiniBatchKMeans(KMeans): | |
| Maximum number of iterations over the complete dataset before | ||
| stopping independently of any early stopping criterion heuristics. | ||
|
|
||
| max_no_improvement : int, optional | ||
| max_no_improvement : int, default: 10 | ||
| Control early stopping based on the consecutive number of mini | ||
| batches that does not yield an improvement on the smoothed inertia. | ||
|
|
||
| To disable convergence detection based on inertia, set | ||
| max_no_improvement to None. | ||
|
|
||
| tol : float, optional | ||
| tol : float, default: 0.0 | ||
| Control early stopping based on the relative center changes as | ||
| measured by a smoothed, variance-normalized of the mean center | ||
| squared position changes. This early stopping heuristics is | ||
|
|
@@ -1006,7 +1055,7 @@ class MiniBatchKMeans(KMeans): | |
| only algorithm is initialized by running a batch KMeans on a | ||
| random subset of the data. This needs to be larger than k. | ||
|
|
||
| init : {'k-means++', 'random' or an ndarray} | ||
| init : {'k-means++', 'random' or an ndarray}, default: 'k-means++' | ||
| Method for initialization, defaults to 'k-means++': | ||
|
|
||
| 'k-means++' : selects initial cluster centers for k-mean | ||
|
|
@@ -1016,7 +1065,6 @@ class MiniBatchKMeans(KMeans): | |
| 'random': choose k observations (rows) at random from data for | ||
| the initial centroids. | ||
|
|
||
|
|
||
| If an ndarray is passed, it should be of shape (n_clusters, n_features) | ||
| and gives the initial centers. | ||
|
|
||
|
|
@@ -1025,7 +1073,7 @@ class MiniBatchKMeans(KMeans): | |
| In contrast to KMeans, the algorithm is only run once, using the | ||
| best of the ``n_init`` initializations as measured by inertia. | ||
|
|
||
| compute_labels : boolean | ||
| compute_labels : boolean, default=True | ||
| Compute label assignment and inertia for the complete dataset | ||
| once the minibatch optimization has converged in fit. | ||
|
|
||
|
|
@@ -1034,7 +1082,7 @@ class MiniBatchKMeans(KMeans): | |
| given, it fixes the seed. Defaults to the global numpy random | ||
| number generator. | ||
|
|
||
| reassignment_ratio : float, optional | ||
| reassignment_ratio : float, default: 0.01 | ||
| Control the fraction of the maximum number of counts for a | ||
| center to be reassigned. A higher value means that low count | ||
| centers are more easily reassigned, which means that the | ||
|
|
@@ -1150,7 +1198,7 @@ def fit(self, X, y=None): | |
| batch_inertia, centers_squared_diff = _mini_batch_step( | ||
| X_valid, x_squared_norms[validation_indices], | ||
| cluster_centers, counts, old_center_buffer, False, | ||
| distances=distances, verbose=self.verbose) | ||
| distances=None, verbose=self.verbose) | ||
|
|
||
| # Keep only the best cluster centers across independent inits on | ||
| # the common validation set | ||
|
|
@@ -1248,7 +1296,8 @@ def partial_fit(self, X, y=None): | |
| return self | ||
|
|
||
| x_squared_norms = row_norms(X, squared=True) | ||
| self.random_state_ = check_random_state(self.random_state) | ||
| self.random_state_ = getattr(self, "random_state_", | ||
| check_random_state(self.random_state)) | ||
| if (not hasattr(self, 'counts_') | ||
| or not hasattr(self, 'cluster_centers_')): | ||
| # this is the first call partial_fit on this object: | ||
|
|
@@ -1267,7 +1316,7 @@ def partial_fit(self, X, y=None): | |
| # reassignment too often, to allow for building up counts | ||
| random_reassign = self.random_state_.randint( | ||
| 10 * (1 + self.counts_.min())) == 0 | ||
| distances = np.zeros(self.n_clusters, dtype=np.float64) | ||
| distances = np.zeros(X.shape[0], dtype=np.float64) | ||
|
|
||
| _mini_batch_step(X, x_squared_norms, self.cluster_centers_, | ||
| self.counts_, np.zeros(0, np.double), 0, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Usually, when an input parameter is modified inplace, I like to point it out in the docstring, or in a comment at the top of the function, as it is such an important aspect of the codebase for someone reading the code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, can do that. None of the functions is documented btw ;)