-
-
Notifications
You must be signed in to change notification settings - Fork 26.9k
MemoryError in KNNImputer with california housing #15604
Copy link
Copy link
Closed
Labels
Description
I was running a simple example with the California housing dataset and the KNNImputer blew up in my face:
# Reproduction script: build a StandardScaler -> KNNImputer -> LinearRegression
# pipeline on the California housing data after randomly replacing a fraction
# of the feature values with NaN.
import numpy as np
import pandas as pd  # required for pd.DataFrame / pd.Series below
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

calhousing = fetch_california_housing()
X = pd.DataFrame(calhousing.data, columns=calhousing.feature_names)
y = pd.Series(calhousing.target, name='house_value')

rng = np.random.RandomState(42)
# randint(density) draws integers uniformly from [0, density), so comparing
# with 0 marks one in `density` values as missing.
density = 4  # one in 4 values will be NaN
mask = rng.randint(density, size=X.shape) == 0
X_na = X.copy()
# NOTE(review): this relies on `.values` being a writable view of the frame;
# confirm that holds for the pandas version in use.
X_na.values[mask] = np.nan
X_na.head()

# Keep only the rows whose target is below 4.9 and hold out 1000 samples.
X_train_na, X_test_na, y_train_na, y_test_na = train_test_split(
    X_na[y < 4.9], y[y < 4.9], test_size=1000, random_state=0)

model = make_pipeline(
    StandardScaler(),
    KNNImputer(add_indicator=True),
    LinearRegression()
)
model.fit(X_train_na, y_train_na).score(X_test_na, y_test_na)
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-71-ad8b65bc77f2> in <module>
4 LinearRegression()
5 )
----> 6 model.fit(X_train_na, y_train_na).score(X_test_na, y_test_na)
~/Documents/packages/scikit-learn/sklearn/pipeline.py in fit(self, X, y, **fit_params)
346 This estimator
347 """
--> 348 Xt, fit_params = self._fit(X, y, **fit_params)
349 with _print_elapsed_time('Pipeline',
350 self._log_message(len(self.steps) - 1)):
~/Documents/packages/scikit-learn/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
311 message_clsname='Pipeline',
312 message=self._log_message(step_idx),
--> 313 **fit_params_steps[name])
314 # Replace the transformer of the step with the fitted
315 # transformer. This is necessary when loading the transformer
~/miniconda3/envs/dev/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
~/Documents/packages/scikit-learn/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
724 with _print_elapsed_time(message_clsname, message):
725 if hasattr(transformer, 'fit_transform'):
--> 726 res = transformer.fit_transform(X, y, **fit_params)
727 else:
728 res = transformer.fit(X, y, **fit_params).transform(X)
~/Documents/packages/scikit-learn/sklearn/base.py in fit_transform(self, X, y, **fit_params)
566 else:
567 # fit method of arity 2 (supervised transformation)
--> 568 return self.fit(X, y, **fit_params).transform(X)
569
570
~/Documents/packages/scikit-learn/sklearn/impute/_knn.py in transform(self, X)
230 metric=self.metric,
231 missing_values=self.missing_values,
--> 232 force_all_finite=force_all_finite)
233
234 # Maps from indices from X to indices in dist matrix
~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in pairwise_distances(X, Y, metric, n_jobs, force_all_finite, **kwds)
1742 func = partial(distance.cdist, metric=metric, **kwds)
1743
-> 1744 return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1745
1746
~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1341
1342 if effective_n_jobs(n_jobs) == 1:
-> 1343 return func(X, Y, **kwds)
1344
1345 # enforce a threading backend to prevent data communication overhead
~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in nan_euclidean_distances(X, Y, squared, missing_values, copy)
409 present_coords_cnt = np.dot(1 - missing_X, 1 - missing_Y.T)
410 present_mask = (present_coords_cnt != 0)
--> 411 distances[present_mask] *= (X.shape[1] / present_coords_cnt[present_mask])
412
413 if X is Y:
MemoryError: Unable to allocate array with shape (311408488,) and data type float64
Reactions are currently unavailable