Skip to content

TEST: CI: dask test_notebooks fails with RuntimeError: Cluster failed to start: No module named 'bokeh' #5166

@mvashishtha

Description

@mvashishtha

example failure: https://github.com/modin-project/modin/actions/runs/3322991945/jobs/5551188570

This workflow run passed before the commit was merged, but failed when I re-ran it just now. I think `bokeh` is a dependency of Dask that is not installed in the CI environment.

stack trace
RuntimeError                              Traceback (most recent call last)
Cell In [2], line 13
     10     url_path = "https://modin-test.s3.us-west-1.amazonaws.com/yellow_tripdata_2015-01.csv"
     11     urllib.request.urlretrieve(url_path, "taxi.csv")
---> 13 modin_df = pd.read_csv(s3_path,parse_dates=["tpep_pickup_datetime","tpep_dropoff_datetime"],quoting=3,nrows=1000)

File ~/work/modin/modin/modin/logging/logger_decorator.py:128, in enable_logging.<locals>.decorator.<locals>.run_and_log(*args, **kwargs)
    113 """
    114 Compute function with logging if Modin logging is enabled.
    115 
   (...)
    125 Any
    126 """
    127 if LogMode.get() == "disable":
--> 128     return obj(*args,**kwargs)
    130 logger = get_logger()
    131 logger_level = getattr(logger, log_level)

File ~/work/modin/modin/modin/_compat/pandas_api/latest/io.py:156, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    154 f_locals.pop("mangle_dupe_cols", None)
    155 kwargs = {k: v for k, v in f_locals.items() if k in _pd_read_csv_signature}
--> 156 return _read(**kwargs)

File ~/work/modin/modin/modin/_compat/pandas_api/common/io.py:35, in _read(**kwargs)
     22 def _read(**kwargs):
     23     """
     24     Read csv file from local disk.
     25 
   (...)
     33     modin.pandas.DataFrame
     34     """
---> 35     Engine.subscribe(_update_engine)
     36     from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
     38     squeeze = kwargs.pop("squeeze", False)

File ~/work/modin/modin/modin/config/pubsub.py:217, in Parameter.subscribe(cls, callback)
    208 """
    209 Add `callback` to the `_subs` list and then execute it.
    210 
   (...)
    214     Callable to execute.
    215 """
    216 cls._subs.append(callback)
--> 217 callback(cls)

File ~/work/modin/modin/modin/pandas/__init__.py:161, in _update_engine(publisher)
    158     if _is_first_update.get("Dask", True):
    159         from modin.core.execution.dask.common import initialize_dask
--> 161         initialize_dask()
    162 elif publisher.get() == "Cloudray":
    163     from modin.experimental.cloud import get_connection

File ~/work/modin/modin/modin/core/execution/dask/common/utils.py:47, in initialize_dask()
     45     memory_limit = Memory.get()
     46     worker_memory_limit = memory_limit // num_cpus if memory_limit else "auto"
---> 47     client = Client(n_workers=num_cpus,memory_limit=worker_memory_limit)
     49 num_cpus = len(client.ncores())
     50 NPartitions._put(num_cpus)

File /opt/hostedtoolcache/Python/3.8.14/x64/lib/python3.8/site-packages/distributed/client.py:982, in Client.__init__(self, address, loop, timeout, set_as_default, scheduler_file, security, asynchronous, name, heartbeat_interval, serializers, deserializers, extensions, direct_to_workers, connection_limit, **kwargs)
    979 preload_argv = dask.config.get("distributed.client.preload-argv")
    980 self.preloads = preloading.process_preloads(self, preload, preload_argv)
--> 982 self.start(timeout=timeout)
    983 Client._instances.add(self)
    985 from distributed.recreate_tasks import ReplayTaskClient

File /opt/hostedtoolcache/Python/3.8.14/x64/lib/python3.8/site-packages/distributed/client.py:1172, in Client.start(self, **kwargs)
   1170     self._started = asyncio.ensure_future(self._start(**kwargs))
   1171 else:
-> 1172     sync(self.loop,self._start,**kwargs)

File /opt/hostedtoolcache/Python/3.8.14/x64/lib/python3.8/site-packages/distributed/utils.py:406, in sync(loop, func, callback_timeout, *args, **kwargs)
    404 if error:
    405     typ, exc, tb = error
--> 406     raise exc.with_traceback(tb)
    407 else:
    408     return result

File /opt/hostedtoolcache/Python/3.8.14/x64/lib/python3.8/site-packages/distributed/utils.py:379, in sync.<locals>.f()
    377         future = asyncio.wait_for(future, callback_timeout)
    378     future = asyncio.ensure_future(future)
--> 379     result = yield future
    380 except Exception:
    381     error = sys.exc_info()

File /opt/hostedtoolcache/Python/3.8.14/x64/lib/python3.8/site-packages/tornado/gen.py:762, in Runner.run(self)
    759 exc_info = None
    761 try:
--> 762     value = future.result()
    763 except Exception:
    764     exc_info = sys.exc_info()

File /opt/hostedtoolcache/Python/3.8.14/x64/lib/python3.8/site-packages/distributed/client.py:1238, in Client._start(self, timeout, **kwargs)
   1235 elif self._start_arg is None:
   1236     from distributed.deploy import LocalCluster
-> 1238     self.cluster = await LocalCluster(
   1239         loop=self.loop,
   1240         asynchronous=self._asynchronous,
   1241         **self._startup_kwargs,
   1242     )
   1243     address = self.cluster.scheduler_address
   1245 self._gather_semaphore = asyncio.Semaphore(5)

File /opt/hostedtoolcache/Python/3.8.14/x64/lib/python3.8/site-packages/distributed/deploy/spec.py:398, in SpecCluster.__await__.<locals>._()
    396 async def _():
    397     if self.status == Status.created:
--> 398         await self._start()
    399     await self.scheduler
    400     await self._correct_state()

File /opt/hostedtoolcache/Python/3.8.14/x64/lib/python3.8/site-packages/distributed/deploy/spec.py:319, in SpecCluster._start(self)
    317 self.status = Status.failed
    318 await self._close()
--> 319 raise RuntimeError(f"Cluster failed to start: {e}") from e

RuntimeError: Cluster failed to start: No module named 'bokeh'
RuntimeError: Cluster failed to start: No module named 'bokeh'
============================== 4 failed in 58.87s ==============================

Metadata

Metadata

Assignees

No one assigned

    Labels

    CI · Dask ⚡ (Issues related to the Dask engine) · P0 (Highest priority tasks requiring immediate fix) · Testing 📈 (Issues related to testing)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions