Describe the bug
TerminatedWorkerError when working with n_jobs=-1 or even n_jobs=4 in GridSearchCV.
I just migrated to a MacBook Pro M3 from an Ubuntu laptop. This is probably relevant, since the problem seems to be related to how the OS terminates a process. I'm able to run this on my older Ubuntu laptop with an older version of Python (v3.8) and sklearn (v1.0.2). On the Ubuntu laptop, I never encountered this issue.
I noticed that this issue happens randomly on my MacBook: if the TerminatedWorkerError is raised, I can re-run the fitting code and it might finish without issue. Sometimes it happens early in the fitting, sometimes a bit later.
My code is very similar to the code attached below. However, I couldn't reproduce the same error with that code in a fresh Jupyter notebook!
It would be great if someone could advise how to debug this further, or whether there are any settings I could change to reduce the likelihood of a TerminatedWorkerError.
Steps/Code to Reproduce
# Note: I couldn't reproduce this error with the code below, could be differences in data?
# But this is very similar to what my actual code looks like
import numpy as np
from imblearn.pipeline import Pipeline
from sklearn import tree, datasets
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
class CustomGridSearchCV(GridSearchCV):
    """Grid search that forwards the best estimator's feature importances.

    Adds a read-only ``feature_importances_`` attribute on top of
    ``GridSearchCV``. Presumably this is what lets the search object be
    wrapped by ``SelectFromModel`` further down in this script, which needs
    importances from the estimator it is given -- confirm against the
    scikit-learn documentation.
    """

    @property
    def feature_importances_(self):
        """Return ``feature_importances_`` of ``self.best_estimator_``."""
        winner = self.best_estimator_
        return winner.feature_importances_
# Seed reused by the CV splitter, the oversampler and the nested decision tree.
rs=42
breast_cancer = datasets.load_breast_cancer()
# Outer cross-validation: 5 stratified folds repeated 100 times, i.e. 500
# train/test splits for every candidate of the outer grid search.
grid_search_cv = RepeatedStratifiedKFold(random_state=rs, n_repeats=100, n_splits=5)
# Grid for the nested search that runs inside the feature-selection step:
# 2 criteria x 4 depths (2..5) x 2 leaf sizes (3, 4) = 16 candidates.
tree_params = {'criterion': ['gini', 'entropy'],
               'max_depth': range(2, 6),
               'min_samples_leaf': range(3, 5, 1)}
# Pipeline under test: scale -> oversample -> (feature selection) -> tree.
tree_pipe = Pipeline([
    ("scaling", StandardScaler()),
    ("oversampling", RandomOverSampler(random_state=rs)),
    # Placeholder; the outer grid below substitutes the real selector.
    ("feature_selection", "passthrough"),
    # NOTE: unlike the nested tree below, this classifier gets no random_state.
    ("clf", tree.DecisionTreeClassifier()),
])
# Outer grid: 1 selector x 4 shrinkage values x 2 criteria x 8 depths (2..9)
# x 7 leaf sizes (3..9) = 448 candidates.
parameters = {
    'feature_selection': [
        # Feature selector driven by a nested 5-fold grid search over
        # `tree_params`, so every outer fit also runs this inner search.
        SelectFromModel(
            CustomGridSearchCV(
                param_grid=tree_params,
                estimator=DecisionTreeClassifier(random_state=rs),
                cv=5,
                scoring='balanced_accuracy',
            )
        ),
    ],
    # Four values starting at 0, in steps of 0.3, stopping before 1.
    'oversampling__shrinkage': np.arange(0, 1, 0.3),
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': range(2, 10),
    'clf__min_samples_leaf': range(3, 10, 1)
}
# 448 candidates x 500 splits = 224,000 outer fits, dispatched to parallel
# workers via n_jobs=-1 (the configuration under which the reported
# TerminatedWorkerError occurs). error_score='raise' is meant to surface fit
# failures as exceptions rather than recording a fallback score.
clf = GridSearchCV(tree_pipe, parameters, n_jobs=-1, cv=grid_search_cv, verbose=1, error_score='raise')
clf.fit(breast_cancer.data, breast_cancer.target)
Expected Results
No error is thrown.
Actual Results
---------------------------------------------------------------------------
TerminatedWorkerError Traceback (most recent call last)
File <timed exec>:158
File ~/miniconda3/lib/python3.12/site-packages/sklearn/base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1466 estimator._validate_params()
1468 with config_context(
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
File ~/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_search.py:1018, in BaseSearchCV.fit(self, X, y, **params)
1012 results = self._format_results(
1013 all_candidate_params, n_splits, all_out, all_more_results
1014 )
1016 return results
-> 1018 self._run_search(evaluate_candidates)
1020 # multimetric is determined here because in the case of a callable
1021 # self.scoring the return type is only known after calling
1022 first_test_score = all_out[0]["test_scores"]
File ~/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_search.py:1572, in GridSearchCV._run_search(self, evaluate_candidates)
1570 def _run_search(self, evaluate_candidates):
1571 """Search all candidates in param_grid"""
-> 1572 evaluate_candidates(ParameterGrid(self.param_grid))
File ~/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_search.py:964, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
956 if self.verbose > 0:
957 print(
958 "Fitting {0} folds for each of {1} candidates,"
959 " totalling {2} fits".format(
960 n_splits, n_candidates, n_candidates * n_splits
961 )
962 )
--> 964 out = parallel(
965 delayed(_fit_and_score)(
966 clone(base_estimator),
967 X,
968 y,
969 train=train,
970 test=test,
971 parameters=parameters,
972 split_progress=(split_idx, n_splits),
973 candidate_progress=(cand_idx, n_candidates),
974 **fit_and_score_kwargs,
975 )
976 for (cand_idx, parameters), (split_idx, (train, test)) in product(
977 enumerate(candidate_params),
978 enumerate(cv.split(X, y, **routed_params.splitter.split)),
979 )
980 )
982 if len(out) < 1:
983 raise ValueError(
984 "No fits were performed. "
985 "Was the CV iterator empty? "
986 "Were there no candidates?"
987 )
File ~/miniconda3/lib/python3.12/site-packages/sklearn/utils/parallel.py:74, in Parallel.__call__(self, iterable)
69 config = get_config()
70 iterable_with_config = (
71 (_with_config(delayed_func, config), args, kwargs)
72 for delayed_func, args, kwargs in iterable
73 )
---> 74 return super().__call__(iterable_with_config)
File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:2007, in Parallel.__call__(self, iterable)
2001 # The first item from the output is blank, but it makes the interpreter
2002 # progress until it enters the Try/Except block of the generator and
2003 # reaches the first `yield` statement. This starts the asynchronous
2004 # dispatch of the tasks to the workers.
2005 next(output)
-> 2007 return output if self.return_generator else list(output)
File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:1650, in Parallel._get_outputs(self, iterator, pre_dispatch)
1647 yield
1649 with self._backend.retrieval_context():
-> 1650 yield from self._retrieve()
1652 except GeneratorExit:
1653 # The generator has been garbage collected before being fully
1654 # consumed. This aborts the remaining tasks if possible and warn
1655 # the user if necessary.
1656 self._exception = True
File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:1754, in Parallel._retrieve(self)
1747 while self._wait_retrieval():
1748
1749 # If the callback thread of a worker has signaled that its task
1750 # triggered an exception, or if the retrieval loop has raised an
1751 # exception (e.g. `GeneratorExit`), exit the loop and surface the
1752 # worker traceback.
1753 if self._aborting:
-> 1754 self._raise_error_fast()
1755 break
1757 # If the next job is not ready for retrieval yet, we just wait for
1758 # async callbacks to progress.
File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:1789, in Parallel._raise_error_fast(self)
1785 # If this error job exists, immediately raise the error by
1786 # calling get_result. This job might not exists if abort has been
1787 # called directly or if the generator is gc'ed.
1788 if error_job is not None:
-> 1789 error_job.get_result(self.timeout)
File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:745, in BatchCompletionCallBack.get_result(self, timeout)
739 backend = self.parallel._backend
741 if backend.supports_retrieve_callback:
742 # We assume that the result has already been retrieved by the
743 # callback thread, and is stored internally. It's just waiting to
744 # be returned.
--> 745 return self._return_or_raise()
747 # For other backends, the main thread needs to run the retrieval step.
748 try:
File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:763, in BatchCompletionCallBack._return_or_raise(self)
761 try:
762 if self.status == TASK_ERROR:
--> 763 raise self._result
764 return self._result
765 finally:
TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.
The exit codes of the workers are {SIGBUS(-10)}
Versions
System:
python: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 10:07:17) [Clang 14.0.6 ]
executable: /Users/darren/miniconda3/bin/python
machine: macOS-14.5-arm64-arm-64bit
Python dependencies:
sklearn: 1.5.1
pip: 24.1.2
setuptools: 69.5.1
numpy: 2.0.1
scipy: 1.14.0
Cython: None
pandas: 2.2.2
matplotlib: 3.9.1
joblib: 1.4.2
threadpoolctl: 3.5.0
Built with OpenMP: True
threadpoolctl info:
user_api: openmp
internal_api: openmp
num_threads: 12
prefix: libomp
filepath: /Users/darren/miniconda3/lib/python3.12/site-packages/sklearn/.dylibs/libomp.dylib
version: None
Describe the bug
`TerminatedWorkerError` when working with `n_jobs=-1` or even `n_jobs=4` in `GridSearchCV`.
I just migrated to a MacBook Pro M3 from an Ubuntu laptop. This is probably relevant, since the problem seems to be related to how the OS terminates a process. I'm able to run this on my older Ubuntu laptop with an older version of Python (v3.8) and sklearn (v1.0.2). On the Ubuntu laptop, I never encountered this issue.
I noticed that this issue happens randomly on my MacBook: if the `TerminatedWorkerError` is raised, I can re-run the fitting code and it might finish without issue. Sometimes it happens early in the fitting, sometimes a bit later.
My code is very similar to the code attached below. However, I couldn't reproduce the same error with that code in a fresh Jupyter notebook!
It would be great if someone could advise how to debug this further, or whether there are any settings I could change to reduce the likelihood of a `TerminatedWorkerError`.
Steps/Code to Reproduce
Expected Results
No error is thrown.
Actual Results
--------------------------------------------------------------------------- TerminatedWorkerError Traceback (most recent call last) File <timed exec>:158 File ~/miniconda3/lib/python3.12/site-packages/sklearn/base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs) 1466 estimator._validate_params() 1468 with config_context( 1469 skip_parameter_validation=( 1470 prefer_skip_nested_validation or global_skip_validation 1471 ) 1472 ): -> 1473 return fit_method(estimator, *args, **kwargs) File ~/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_search.py:1018, in BaseSearchCV.fit(self, X, y, **params) 1012 results = self._format_results( 1013 all_candidate_params, n_splits, all_out, all_more_results 1014 ) 1016 return results -> 1018 self._run_search(evaluate_candidates) 1020 # multimetric is determined here because in the case of a callable 1021 # self.scoring the return type is only known after calling 1022 first_test_score = all_out[0]["test_scores"] File ~/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_search.py:1572, in GridSearchCV._run_search(self, evaluate_candidates) 1570 def _run_search(self, evaluate_candidates): 1571 """Search all candidates in param_grid""" -> 1572 evaluate_candidates(ParameterGrid(self.param_grid)) File ~/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_search.py:964, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results) 956 if self.verbose > 0: 957 print( 958 "Fitting {0} folds for each of {1} candidates," 959 " totalling {2} fits".format( 960 n_splits, n_candidates, n_candidates * n_splits 961 ) 962 ) --> 964 out = parallel( 965 delayed(_fit_and_score)( 966 clone(base_estimator), 967 X, 968 y, 969 train=train, 970 test=test, 971 parameters=parameters, 972 split_progress=(split_idx, n_splits), 973 candidate_progress=(cand_idx, n_candidates), 974 **fit_and_score_kwargs, 975 ) 976 for (cand_idx, parameters), (split_idx, 
(train, test)) in product( 977 enumerate(candidate_params), 978 enumerate(cv.split(X, y, **routed_params.splitter.split)), 979 ) 980 ) 982 if len(out) < 1: 983 raise ValueError( 984 "No fits were performed. " 985 "Was the CV iterator empty? " 986 "Were there no candidates?" 987 ) File ~/miniconda3/lib/python3.12/site-packages/sklearn/utils/parallel.py:74, in Parallel.__call__(self, iterable) 69 config = get_config() 70 iterable_with_config = ( 71 (_with_config(delayed_func, config), args, kwargs) 72 for delayed_func, args, kwargs in iterable 73 ) ---> 74 return super().__call__(iterable_with_config) File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:2007, in Parallel.__call__(self, iterable) 2001 # The first item from the output is blank, but it makes the interpreter 2002 # progress until it enters the Try/Except block of the generator and 2003 # reaches the first `yield` statement. This starts the asynchronous 2004 # dispatch of the tasks to the workers. 2005 next(output) -> 2007 return output if self.return_generator else list(output) File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:1650, in Parallel._get_outputs(self, iterator, pre_dispatch) 1647 yield 1649 with self._backend.retrieval_context(): -> 1650 yield from self._retrieve() 1652 except GeneratorExit: 1653 # The generator has been garbage collected before being fully 1654 # consumed. This aborts the remaining tasks if possible and warn 1655 # the user if necessary. 1656 self._exception = True File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:1754, in Parallel._retrieve(self) 1747 while self._wait_retrieval(): 1748 1749 # If the callback thread of a worker has signaled that its task 1750 # triggered an exception, or if the retrieval loop has raised an 1751 # exception (e.g. `GeneratorExit`), exit the loop and surface the 1752 # worker traceback. 
1753 if self._aborting: -> 1754 self._raise_error_fast() 1755 break 1757 # If the next job is not ready for retrieval yet, we just wait for 1758 # async callbacks to progress. File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:1789, in Parallel._raise_error_fast(self) 1785 # If this error job exists, immediately raise the error by 1786 # calling get_result. This job might not exists if abort has been 1787 # called directly or if the generator is gc'ed. 1788 if error_job is not None: -> 1789 error_job.get_result(self.timeout) File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:745, in BatchCompletionCallBack.get_result(self, timeout) 739 backend = self.parallel._backend 741 if backend.supports_retrieve_callback: 742 # We assume that the result has already been retrieved by the 743 # callback thread, and is stored internally. It's just waiting to 744 # be returned. --> 745 return self._return_or_raise() 747 # For other backends, the main thread needs to run the retrieval step. 748 try: File ~/miniconda3/lib/python3.12/site-packages/joblib/parallel.py:763, in BatchCompletionCallBack._return_or_raise(self) 761 try: 762 if self.status == TASK_ERROR: --> 763 raise self._result 764 return self._result 765 finally: TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {SIGBUS(-10)}Versions
System: python: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 10:07:17) [Clang 14.0.6 ] executable: /Users/darren/miniconda3/bin/python machine: macOS-14.5-arm64-arm-64bit Python dependencies: sklearn: 1.5.1 pip: 24.1.2 setuptools: 69.5.1 numpy: 2.0.1 scipy: 1.14.0 Cython: None pandas: 2.2.2 matplotlib: 3.9.1 joblib: 1.4.2 threadpoolctl: 3.5.0 Built with OpenMP: True threadpoolctl info: user_api: openmp internal_api: openmp num_threads: 12 prefix: libomp filepath: /Users/darren/miniconda3/lib/python3.12/site-packages/sklearn/.dylibs/libomp.dylib version: None