Skip to content

flaky distributed/tests/test_failed_workers.py::test_gather_then_submit_after_failed_workers #6530

@hendrikmakait

Description

@hendrikmakait

https://github.com/dask/distributed/runs/6796365931?check_suite_focus=true#step:11:1301

_________________ test_gather_then_submit_after_failed_workers _________________
args = (), kwds = {}
@wraps(func)
def inner(*args, **kwds):
with self._recreate_cm():
>           return func(*args, **kwds)
../../../miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:79: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
distributed/utils_test.py:1212: in test_func
return _run_and_close_tornado(async_fn_outer)
distributed/utils_test.py:473: in _run_and_close_tornado
return asyncio.run(inner_fn())
../../../miniconda3/envs/dask-distributed/lib/python3.10/asyncio/runners.py:44: in run
return loop.run_until_complete(main)
../../../miniconda3/envs/dask-distributed/lib/python3.10/asyncio/base_events.py:646: in run_until_complete
return future.result()
distributed/utils_test.py:470: in inner_fn
return await async_fn(*args, **kwargs)
distributed/utils_test.py:1209: in async_fn_outer
return await asyncio.wait_for(async_fn(), timeout=timeout * 2)
../../../miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py:445: in wait_for
return fut.result()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
async def async_fn():
        result = None
with tempfile.TemporaryDirectory() as tmpdir:
            config2 = merge({"temporary-directory": tmpdir}, config)
with dask.config.set(config2):
                workers = []
                s = False
for _ in range(60):
try:
                        s, ws = await start_cluster(
                            nthreads,
                            scheduler,
                            security=security,
                            Worker=Worker,
                            scheduler_kwargs=scheduler_kwargs,
                            worker_kwargs=worker_kwargs,
                        )
except Exception as e:
                        logger.error(
"Failed to start gen_cluster: "
f"{e.__class__.__name__}: {e}; retrying",
                            exc_info=True,
                        )
await asyncio.sleep(1)
else:
                        workers[:] = ws
                        args = [s] + workers
break
if s is False:
raise Exception("Could not start cluster")
if client:
                    c = await Client(
                        s.address,
                        security=security,
                        asynchronous=True,
                        **client_kwargs,
                    )
                    args = [c] + args
try:
                    coro = func(*args, *outer_args, **kwargs)
                    task = asyncio.create_task(coro)
                    coro2 = asyncio.wait_for(asyncio.shield(task), timeout)
                    result = await coro2
                    validate_state(s, *workers)
except asyncio.TimeoutError:
assert task
                    buffer = io.StringIO()
# This stack indicates where the coro/test is suspended
                    task.print_stack(file=buffer)
if cluster_dump_directory:
await dump_cluster_state(
                            s,
                            ws,
                            output_dir=cluster_dump_directory,
                            func_name=func.__name__,
                        )
                    task.cancel()
while not task.cancelled():
await asyncio.sleep(0.01)
# Hopefully, the hang has been caused by inconsistent
# state, which should be much more meaningful than the
# timeout
                    validate_state(s, *workers)
# Remove as much of the traceback as possible; it's
# uninteresting boilerplate from utils_test and asyncio
# and not from the code being tested.
raise asyncio.TimeoutError(
f"Test timeout after {timeout}s.\n"
"========== Test stack trace starts here ==========\n"
f"{buffer.getvalue()}"
                    ) from None
except pytest.xfail.Exception:
raise
except Exception:
if cluster_dump_directory and not has_pytestmark(
                        test_func, "xfail"
                    ):
await dump_cluster_state(
                            s,
                            ws,
                            output_dir=cluster_dump_directory,
                            func_name=func.__name__,
                        )
raise
finally:
if client and c.status not in ("closing", "closed"):
await c._close(fast=s.status == Status.closed)
await end_cluster(s, workers)
await asyncio.wait_for(cleanup_global_workers(), 1)
try:
                    c = await default_client()
except ValueError:
pass
else:
await c._close(fast=True)
def get_unclosed():
return [c for c in Comm._instances if not c.closed()] + [
                        c
for c in _global_clients.values()
if c.status != "closed"
                    ]
try:
                    start = time()
while time() < start + 60:
                        gc.collect()
if not get_unclosed():
break
await asyncio.sleep(0.05)
else:
if allow_unclosed:
print(f"Unclosed Comms: {get_unclosed()}")
else:
>                           raise RuntimeError("Unclosed Comms", get_unclosed())
E                           RuntimeError: ('Unclosed Comms', [<TCP (closed)  local=tcp://127.0.0.1:56326 remote=tcp://127.0.0.1:56354>, <TCP (closed) Scheduler connection to worker local=tcp://127.0.0.1:56326 remote=tcp://127.0.0.1:56349>, <TCP (closed) ConnectionPool local=tcp://127.0.0.1:56331 remote=tcp://127.0.0.1:56326>, <TCP (closed)  local=tcp://127.0.0.1:56326 remote=tcp://127.0.0.1:56331>])
distributed/utils_test.py:1190: RuntimeError
----------------------------- Captured stderr call -----------------------------
2022-06-08 15:30:36,596 - distributed.scheduler - INFO - State start
2022-06-08 15:30:36,600 - distributed.scheduler - INFO - Clear task state
2022-06-08 15:30:36,601 - distributed.scheduler - INFO -   Scheduler at:     tcp://127.0.0.1:56326
2022-06-08 15:30:36,601 - distributed.scheduler - INFO -   dashboard at:           127.0.0.1:56325
2022-06-08 15:30:36,632 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:56327'
2022-06-08 15:30:36,633 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:56328'
2022-06-08 15:30:36,633 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:56329'
2022-06-08 15:30:36,718 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:56330'
/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: UserWarning: Creating scratch directories is taking a surprisingly long time. This is often due to running workers on a network file system. Consider specifying a local-directory to point workers to write scratch data to a local disk.
  next(self.gen)
/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: UserWarning: Creating scratch directories is taking a surprisingly long time. This is often due to running workers on a network file system. Consider specifying a local-directory to point workers to write scratch data to a local disk.
  next(self.gen)
/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: UserWarning: Creating scratch directories is taking a surprisingly long time. This is often due to running workers on a network file system. Consider specifying a local-directory to point workers to write scratch data to a local disk.
  next(self.gen)
/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: UserWarning: Creating scratch directories is taking a surprisingly long time. This is often due to running workers on a network file system. Consider specifying a local-directory to point workers to write scratch data to a local disk.
  next(self.gen)
2022-06-08 15:30:48,925 - distributed.worker - INFO -       Start worker at:      tcp://127.0.0.1:56342
2022-06-08 15:30:48,925 - distributed.worker - INFO -          Listening to:      tcp://127.0.0.1:56342
2022-06-08 15:30:48,925 - distributed.worker - INFO -          dashboard at:            127.0.0.1:56346
2022-06-08 15:30:48,926 - distributed.worker - INFO - Waiting to connect to:      tcp://127.0.0.1:56326
2022-06-08 15:30:48,926 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:48,926 - distributed.worker - INFO -               Threads:                          1
2022-06-08 15:30:48,926 - distributed.worker - INFO -                Memory:                  14.00 GiB
2022-06-08 15:30:48,926 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmpkxjf1p50/dask-worker-space/worker-cqaesjyc
2022-06-08 15:30:48,926 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:48,927 - distributed.worker - INFO -       Start worker at:      tcp://127.0.0.1:56343
2022-06-08 15:30:48,927 - distributed.worker - INFO -          Listening to:      tcp://127.0.0.1:56343
2022-06-08 15:30:48,928 - distributed.worker - INFO -          dashboard at:            127.0.0.1:56347
2022-06-08 15:30:48,928 - distributed.worker - INFO - Waiting to connect to:      tcp://127.0.0.1:56326
2022-06-08 15:30:48,928 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:48,928 - distributed.worker - INFO -               Threads:                          1
2022-06-08 15:30:48,928 - distributed.worker - INFO -                Memory:                  14.00 GiB
2022-06-08 15:30:48,928 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmpkxjf1p50/dask-worker-space/worker-w7_0em4r
2022-06-08 15:30:48,928 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:48,931 - distributed.worker - INFO -       Start worker at:      tcp://127.0.0.1:56344
2022-06-08 15:30:48,931 - distributed.worker - INFO -          Listening to:      tcp://127.0.0.1:56344
2022-06-08 15:30:48,932 - distributed.worker - INFO -          dashboard at:            127.0.0.1:56348
2022-06-08 15:30:48,932 - distributed.worker - INFO - Waiting to connect to:      tcp://127.0.0.1:56326
2022-06-08 15:30:48,932 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:48,932 - distributed.worker - INFO -               Threads:                          1
2022-06-08 15:30:48,932 - distributed.worker - INFO -                Memory:                  14.00 GiB
2022-06-08 15:30:48,932 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmpkxjf1p50/dask-worker-space/worker-m_k3z9bl
2022-06-08 15:30:48,932 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:48,934 - distributed.worker - INFO -       Start worker at:      tcp://127.0.0.1:56345
2022-06-08 15:30:48,934 - distributed.worker - INFO -          Listening to:      tcp://127.0.0.1:56345
2022-06-08 15:30:48,935 - distributed.worker - INFO -          dashboard at:            127.0.0.1:56351
2022-06-08 15:30:48,935 - distributed.worker - INFO - Waiting to connect to:      tcp://127.0.0.1:56326
2022-06-08 15:30:48,935 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:48,935 - distributed.worker - INFO -               Threads:                          1
2022-06-08 15:30:48,935 - distributed.worker - INFO -                Memory:                  14.00 GiB
2022-06-08 15:30:48,935 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmpkxjf1p50/dask-worker-space/worker-ho22i89o
2022-06-08 15:30:48,935 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:50,923 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:56342', name: 0, status: init, memory: 0, processing: 0>
2022-06-08 15:30:51,836 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:56342
2022-06-08 15:30:51,837 - distributed.core - INFO - Starting established connection
2022-06-08 15:30:51,837 - distributed.worker - INFO -         Registered to:      tcp://127.0.0.1:56326
2022-06-08 15:30:51,838 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:51,839 - distributed.core - INFO - Starting established connection
2022-06-08 15:30:51,841 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:56344', name: 3, status: init, memory: 0, processing: 0>
2022-06-08 15:30:51,842 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:56344
2022-06-08 15:30:51,843 - distributed.core - INFO - Starting established connection
2022-06-08 15:30:51,843 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:56343', name: 1, status: init, memory: 0, processing: 0>
2022-06-08 15:30:51,843 - distributed.worker - INFO -         Registered to:      tcp://127.0.0.1:56326
2022-06-08 15:30:51,844 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:51,844 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:56343
2022-06-08 15:30:51,844 - distributed.core - INFO - Starting established connection
2022-06-08 15:30:51,845 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:56345', name: 2, status: init, memory: 0, processing: 0>
2022-06-08 15:30:51,845 - distributed.core - INFO - Starting established connection
2022-06-08 15:30:51,846 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:56345
2022-06-08 15:30:51,846 - distributed.core - INFO - Starting established connection
2022-06-08 15:30:51,846 - distributed.worker - INFO -         Registered to:      tcp://127.0.0.1:56326
2022-06-08 15:30:51,847 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:51,847 - distributed.worker - INFO -         Registered to:      tcp://127.0.0.1:56326
2022-06-08 15:30:51,848 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:56328'.
2022-06-08 15:30:51,849 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:51,849 - distributed.nanny - INFO - Nanny asking worker to close
2022-06-08 15:30:51,848 - distributed.core - INFO - Starting established connection
2022-06-08 15:30:51,850 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:56329'.
2022-06-08 15:30:51,850 - distributed.nanny - INFO - Nanny asking worker to close
2022-06-08 15:30:51,851 - distributed.core - INFO - Starting established connection
2022-06-08 15:30:51,852 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:56330'.
2022-06-08 15:30:51,852 - distributed.nanny - INFO - Nanny asking worker to close
2022-06-08 15:30:51,852 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:56343
2022-06-08 15:30:51,855 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:56345
2022-06-08 15:30:51,856 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:56344
2022-06-08 15:30:51,859 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-41f4db71-63d9-4b71-9c76-d7a48239fcfb Address tcp://127.0.0.1:56343 Status: Status.closing
2022-06-08 15:30:51,859 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:56343', name: 1, status: closing, memory: 0, processing: 0>
2022-06-08 15:30:51,859 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:56343
2022-06-08 15:30:51,860 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-4d3fbcdb-6ff5-49f9-9c77-3666b596c0ec Address tcp://127.0.0.1:56345 Status: Status.closing
2022-06-08 15:30:51,861 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:56344', name: 3, status: closing, memory: 0, processing: 0>
2022-06-08 15:30:51,861 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:56344
2022-06-08 15:30:51,861 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-d74101b3-46ee-4022-a10d-28f0634c6b9d Address tcp://127.0.0.1:56344 Status: Status.closing
2022-06-08 15:30:51,862 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:56345', name: 2, status: closing, memory: 0, processing: 0>
2022-06-08 15:30:51,862 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:56345
2022-06-08 15:30:52,214 - distributed.utils_test - ERROR - Failed to start gen_cluster: TimeoutError: Nanny start timed out after 15s.; retrying
Traceback (most recent call last):
  File "/Users/runner/work/distributed/distributed/distributed/nanny.py", line 357, in start_unsafe
    response = await self.instantiate()
  File "/Users/runner/work/distributed/distributed/distributed/nanny.py", line 416, in instantiate
    result = await asyncio.wait_for(
  File "/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 432, in wait_for
    await waiter
asyncio.exceptions.CancelledError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 456, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/Users/runner/work/distributed/distributed/distributed/core.py", line 327, in start
    await asyncio.wait_for(self.start_unsafe(), timeout=timeout)
  File "/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 458, in wait_for
    raise exceptions.TimeoutError() from exc
asyncio.exceptions.TimeoutError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/Users/runner/work/distributed/distributed/distributed/utils_test.py", line 1075, in async_fn
    s, ws = await start_cluster(
  File "/Users/runner/work/distributed/distributed/distributed/utils_test.py", line 919, in start_cluster
    await asyncio.gather(*workers)
  File "/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 650, in _wrap_awaitable
    return (yield from awaitable.__await__())
  File "/Users/runner/work/distributed/distributed/distributed/core.py", line 330, in start
    raise asyncio.TimeoutError(
asyncio.exceptions.TimeoutError: Nanny start timed out after 15s.
2022-06-08 15:30:53,224 - distributed.scheduler - INFO - State start
2022-06-08 15:30:53,541 - distributed.scheduler - INFO - Clear task state
2022-06-08 15:30:53,543 - distributed.scheduler - INFO -   Scheduler at:     tcp://127.0.0.1:56356
2022-06-08 15:30:53,543 - distributed.scheduler - INFO -   dashboard at:           127.0.0.1:56355
2022-06-08 15:30:53,582 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:56357'
2022-06-08 15:30:53,582 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:56358'
2022-06-08 15:30:53,583 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:56359'
2022-06-08 15:30:53,646 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:56360'
2022-06-08 15:30:59,761 - distributed.worker - INFO -       Start worker at:      tcp://127.0.0.1:56369
2022-06-08 15:30:59,761 - distributed.worker - INFO -          Listening to:      tcp://127.0.0.1:56369
2022-06-08 15:30:59,761 - distributed.worker - INFO -          dashboard at:            127.0.0.1:56371
2022-06-08 15:30:59,761 - distributed.worker - INFO - Waiting to connect to:      tcp://127.0.0.1:56356
2022-06-08 15:30:59,762 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:59,762 - distributed.worker - INFO -               Threads:                          1
2022-06-08 15:30:59,762 - distributed.worker - INFO -                Memory:                  14.00 GiB
2022-06-08 15:30:59,762 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmpkxjf1p50/dask-worker-space/worker-59vtgvhl
2022-06-08 15:30:59,762 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:59,771 - distributed.worker - INFO -       Start worker at:      tcp://127.0.0.1:56370
2022-06-08 15:30:59,771 - distributed.worker - INFO -          Listening to:      tcp://127.0.0.1:56370
2022-06-08 15:30:59,771 - distributed.worker - INFO -          dashboard at:            127.0.0.1:56373
2022-06-08 15:30:59,771 - distributed.worker - INFO - Waiting to connect to:      tcp://127.0.0.1:56356
2022-06-08 15:30:59,772 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:59,772 - distributed.worker - INFO -               Threads:                          1
2022-06-08 15:30:59,772 - distributed.worker - INFO -                Memory:                  14.00 GiB
2022-06-08 15:30:59,772 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmpkxjf1p50/dask-worker-space/worker-cs6boirh
2022-06-08 15:30:59,772 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:59,802 - distributed.worker - INFO -       Start worker at:      tcp://127.0.0.1:56375
2022-06-08 15:30:59,803 - distributed.worker - INFO -          Listening to:      tcp://127.0.0.1:56375
2022-06-08 15:30:59,803 - distributed.worker - INFO -          dashboard at:            127.0.0.1:56377
2022-06-08 15:30:59,803 - distributed.worker - INFO - Waiting to connect to:      tcp://127.0.0.1:56356
2022-06-08 15:30:59,803 - distributed.worker - INFO -       Start worker at:      tcp://127.0.0.1:56376
2022-06-08 15:30:59,803 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:59,803 - distributed.worker - INFO -          Listening to:      tcp://127.0.0.1:56376
2022-06-08 15:30:59,803 - distributed.worker - INFO -               Threads:                          1
2022-06-08 15:30:59,804 - distributed.worker - INFO -                Memory:                  14.00 GiB
2022-06-08 15:30:59,804 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmpkxjf1p50/dask-worker-space/worker-ntjdomga
2022-06-08 15:30:59,803 - distributed.worker - INFO -          dashboard at:            127.0.0.1:56378
2022-06-08 15:30:59,804 - distributed.worker - INFO - Waiting to connect to:      tcp://127.0.0.1:56356
2022-06-08 15:30:59,804 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:59,804 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:30:59,804 - distributed.worker - INFO -               Threads:                          1
2022-06-08 15:30:59,804 - distributed.worker - INFO -                Memory:                  14.00 GiB
2022-06-08 15:30:59,804 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmpkxjf1p50/dask-worker-space/worker-ju7fqx6y
2022-06-08 15:30:59,804 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:31:02,115 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:56375', name: 1, status: init, memory: 0, processing: 0>
2022-06-08 15:31:02,117 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:56375
2022-06-08 15:31:02,117 - distributed.core - INFO - Starting established connection
2022-06-08 15:31:02,118 - distributed.worker - INFO -         Registered to:      tcp://127.0.0.1:56356
2022-06-08 15:31:02,118 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:31:02,121 - distributed.core - INFO - Starting established connection
2022-06-08 15:31:02,194 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:56370', name: 2, status: init, memory: 0, processing: 0>
2022-06-08 15:31:02,195 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:56370
2022-06-08 15:31:02,195 - distributed.core - INFO - Starting established connection
2022-06-08 15:31:02,197 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:56369', name: 0, status: init, memory: 0, processing: 0>
2022-06-08 15:31:02,198 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:56369
2022-06-08 15:31:02,198 - distributed.core - INFO - Starting established connection
2022-06-08 15:31:02,228 - distributed.worker - INFO -         Registered to:      tcp://127.0.0.1:56356
2022-06-08 15:31:02,229 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:31:02,230 - distributed.worker - INFO -         Registered to:      tcp://127.0.0.1:56356
2022-06-08 15:31:02,230 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:31:02,235 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:56376', name: 3, status: init, memory: 0, processing: 0>
2022-06-08 15:31:02,236 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:56376
2022-06-08 15:31:02,236 - distributed.core - INFO - Starting established connection
2022-06-08 15:31:02,238 - distributed.core - INFO - Starting established connection
2022-06-08 15:31:02,239 - distributed.core - INFO - Starting established connection
2022-06-08 15:31:02,239 - distributed.worker - INFO -         Registered to:      tcp://127.0.0.1:56356
2022-06-08 15:31:02,239 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:31:02,241 - distributed.core - INFO - Starting established connection
2022-06-08 15:31:02,267 - distributed.scheduler - INFO - Receive client connection: Client-01022396-e740-11ec-b081-00505695c07f
2022-06-08 15:31:02,268 - distributed.core - INFO - Starting established connection
2022-06-08 15:31:03,777 - distributed.nanny - INFO - Worker process 45128 was killed by signal 15
2022-06-08 15:31:03,779 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:56369', name: 0, status: running, memory: 5, processing: 0>
2022-06-08 15:31:03,779 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:56369
2022-06-08 15:31:03,787 - distributed.nanny - WARNING - Restarting worker
2022-06-08 15:31:06,602 - distributed.worker - INFO -       Start worker at:      tcp://127.0.0.1:56408
2022-06-08 15:31:06,602 - distributed.worker - INFO -          Listening to:      tcp://127.0.0.1:56408
2022-06-08 15:31:06,602 - distributed.worker - INFO -          dashboard at:            127.0.0.1:56409
2022-06-08 15:31:06,602 - distributed.worker - INFO - Waiting to connect to:      tcp://127.0.0.1:56356
2022-06-08 15:31:06,603 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:31:06,603 - distributed.worker - INFO -               Threads:                          1
2022-06-08 15:31:06,603 - distributed.worker - INFO -                Memory:                  14.00 GiB
2022-06-08 15:31:06,603 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmpkxjf1p50/dask-worker-space/worker-vulg6gm5
2022-06-08 15:31:06,603 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:31:08,440 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:56408', name: 0, status: init, memory: 0, processing: 0>
2022-06-08 15:31:08,442 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:56408
2022-06-08 15:31:08,442 - distributed.core - INFO - Starting established connection
2022-06-08 15:31:08,454 - distributed.worker - INFO -         Registered to:      tcp://127.0.0.1:56356
2022-06-08 15:31:08,455 - distributed.worker - INFO - -------------------------------------------------
2022-06-08 15:31:08,488 - distributed.core - INFO - Starting established connection
2022-06-08 15:31:08,781 - distributed.worker - ERROR - Worker stream died during communication: tcp://127.0.0.1:56369
Traceback (most recent call last):
  File "/Users/runner/work/distributed/distributed/distributed/comm/tcp.py", line 439, in connect
    stream = await self.client.connect(
  File "/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/tcpclient.py", line 265, in connect
    addrinfo = await self.resolver.resolve(host, port, af)
  File "/Users/runner/work/distributed/distributed/distributed/comm/tcp.py", line 424, in resolve
    for fam, _, _, _, address in await asyncio.get_running_loop().getaddrinfo(
  File "/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/base_events.py", line 860, in getaddrinfo
    return await self.run_in_executor(
asyncio.exceptions.CancelledError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 456, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/Users/runner/work/distributed/distributed/distributed/comm/core.py", line 289, in connect
    comm = await asyncio.wait_for(
  File "/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 458, in wait_for
    raise exceptions.TimeoutError() from exc
asyncio.exceptions.TimeoutError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/Users/runner/work/distributed/distributed/distributed/worker.py", line 3290, in gather_dep
    response = await get_data_from_worker(
  File "/Users/runner/work/distributed/distributed/distributed/worker.py", line 4616, in get_data_from_worker
    return await retry_operation(_get_data, operation="get_data_from_worker")
  File "/Users/runner/work/distributed/distributed/distributed/utils_comm.py", line 381, in retry_operation
    return await retry(
  File "/Users/runner/work/distributed/distributed/distributed/utils_comm.py", line 366, in retry
    return await coro()
  File "/Users/runner/work/distributed/distributed/distributed/worker.py", line 4593, in _get_data
    comm = await rpc.connect(worker)
  File "/Users/runner/work/distributed/distributed/distributed/core.py", line 1202, in connect
    return await connect_attempt
  File "/Users/runner/work/distributed/distributed/distributed/core.py", line 1138, in _connect
    comm = await connect(
  File "/Users/runner/work/distributed/distributed/distributed/comm/core.py", line 315, in connect
    raise OSError(
OSError: Timed out trying to connect to tcp://127.0.0.1:56369 after 5 s
2022-06-08 15:31:08,823 - distributed.nanny - INFO - Worker process 45132 was killed by signal 15
2022-06-08 15:31:08,825 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:56375', name: 1, status: running, memory: 21, processing: 0>
2022-06-08 15:31:08,825 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:56375
2022-06-08 15:31:08,831 - distributed.scheduler - ERROR - Couldn't gather keys {'sum-e85115fa770066da8e0a1836b8c95026': ['tcp://127.0.0.1:56375']} state: ['waiting'] workers: ['tcp://127.0.0.1:56375']
NoneType: None
2022-06-08 15:31:08,834 - distributed.scheduler - ERROR - Shut down workers that don't have promised key: ['tcp://127.0.0.1:56375'], sum-e85115fa770066da8e0a1836b8c95026
NoneType: None
2022-06-08 15:31:08,837 - distributed.nanny - WARNING - Restarting worker
2022-06-08 15:31:08,842 - distributed.client - WARNING - Couldn't gather 1 keys, rescheduling {'sum-e85115fa770066da8e0a1836b8c95026': ('tcp://127.0.0.1:56375',)}
2022-06-08 15:31:09,724 - distributed.nanny - INFO - Worker process 45139 was killed by signal 15
2022-06-08 15:31:09,726 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:56370', name: 2, status: running, memory: 21, processing: 0>
2022-06-08 15:31:09,726 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:56370
2022-06-08 15:31:09,729 - distributed.scheduler - ERROR - Couldn't gather keys {'sum-e85115fa770066da8e0a1836b8c95026': ['tcp://127.0.0.1:56370']} state: ['waiting'] workers: ['tcp://127.0.0.1:56370']
NoneType: None
2022-06-08 15:31:09,732 - distributed.scheduler - ERROR - Shut down workers that don't have promised key: ['tcp://127.0.0.1:56370'], sum-e85115fa770066da8e0a1836b8c95026
NoneType: None
2022-06-08 15:31:09,735 - distributed.client - WARNING - Couldn't gather 1 keys, rescheduling {'sum-e85115fa770066da8e0a1836b8c95026': ('tcp://127.0.0.1:56370',)}
2022-06-08 15:31:09,738 - distributed.nanny - WARNING - Restarting worker
2022-06-08 15:31:09,819 - distributed.nanny - INFO - Worker process 45145 was killed by signal 15
2022-06-08 15:31:09,822 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:56376', name: 3, status: running, memory: 21, processing: 0>
2022-06-08 15:31:09,822 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:56376
2022-06-08 15:31:09,826 - distributed.scheduler - ERROR - Couldn't gather keys {'sum-e85115fa770066da8e0a1836b8c95026': ['tcp://127.0.0.1:56376']} state: ['waiting'] workers: ['tcp://127.0.0.1:56376']
NoneType: None
2022-06-08 15:31:09,828 - distributed.scheduler - ERROR - Shut down workers that don't have promised key: ['tcp://127.0.0.1:56376'], sum-e85115fa770066da8e0a1836b8c95026
NoneType: None
2022-06-08 15:31:09,830 - distributed.client - WARNING - Couldn't gather 1 keys, rescheduling {'sum-e85115fa770066da8e0a1836b8c95026': ('tcp://127.0.0.1:56376',)}
2022-06-08 15:31:09,832 - distributed.nanny - WARNING - Restarting worker
2022-06-08 15:31:09,913 - distributed.scheduler - INFO - Remove client Client-01022396-e740-11ec-b081-00505695c07f
2022-06-08 15:31:09,915 - distributed.scheduler - INFO - Remove client Client-01022396-e740-11ec-b081-00505695c07f
2022-06-08 15:31:09,916 - distributed.scheduler - INFO - Close client connection: Client-01022396-e740-11ec-b081-00505695c07f
2022-06-08 15:31:09,919 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:56357'.
2022-06-08 15:31:09,919 - distributed.nanny - INFO - Nanny asking worker to close
2022-06-08 15:31:09,920 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:56358'.
2022-06-08 15:31:09,921 - distributed.nanny - INFO - Nanny asking worker to close
2022-06-08 15:31:09,921 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:56408
2022-06-08 15:31:09,924 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:56359'.
2022-06-08 15:31:09,925 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-234fea19-b5af-4e46-bfe0-ae9c510772c3 Address tcp://127.0.0.1:56408 Status: Status.closing
2022-06-08 15:31:09,925 - distributed.nanny - INFO - Nanny asking worker to close
2022-06-08 15:31:09,926 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:56360'.
2022-06-08 15:31:09,926 - distributed.nanny - INFO - Nanny asking worker to close
2022-06-08 15:31:09,931 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:56408', name: 0, status: closing, memory: 0, processing: 0>
2022-06-08 15:31:09,931 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:56408
2022-06-08 15:31:09,932 - distributed.scheduler - INFO - Lost all workers
2022-06-08 15:31:13,490 - distributed.worker - INFO - Stopping worker
2022-06-08 15:31:13,491 - distributed.worker - INFO - Closed worker has not yet started: Status.init
2022-06-08 15:31:13,951 - distributed.scheduler - INFO - Scheduler closing...
2022-06-08 15:31:13,952 - distributed.scheduler - INFO - Scheduler closing all comms
2022-06-08 15:31:14,171 - distributed.process - INFO - reaping stray process <SpawnProcess name='Dask Worker process (from Nanny)' pid=45835 parent=12417 started daemon>
2022-06-08 15:31:14,173 - distributed.process - INFO - reaping stray process <SpawnProcess name='Dask Worker process (from Nanny)' pid=45836 parent=12417 started daemon>
2022-06-08 15:31:14,180 - distributed.utils_perf - WARNING - full garbage collections took 31% CPU time recently (threshold: 10%)
2022-06-08 15:31:14,468 - distributed.utils_perf - WARNING - full garbage collections took 31% CPU time recently (threshold: 10%)
2022-06-08 15:31:14,731 - distributed.utils_perf - WARNING - full garbage collections took 32% CPU time recently (threshold: 10%)
2022-06-08 15:31:15,003 - distributed.utils_perf - WARNING - full garbage collections took 32% CPU time recently (threshold: 10%)
2022-06-08 15:31:15,207 - distributed.utils_perf - WARNING - full garbage collections took 33% CPU time recently (threshold: 10%)
2022-06-08 15:31:15,408 - distributed.utils_perf - WARNING - full garbage collections took 34% CPU time recently (threshold: 10%)
2022-06-08 15:31:15,606 - distributed.utils_perf - WARNING - full garbage collections took 34% CPU time recently (threshold: 10%)
2022-06-08 15:31:15,976 - distributed.utils_perf - WARNING - full garbage collections took 34% CPU time recently (threshold: 10%)
2022-06-08 15:31:16,215 - distributed.utils_perf - WARNING - full garbage collections took 34% CPU time recently (threshold: 10%)
2022-06-08 15:31:22,038 - distributed.core - INFO - Event loop was unresponsive in Scheduler for 6.42s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
2022-06-08 15:31:22,039 - distributed.core - INFO - Event loop was unresponsive in Nanny for 6.06s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
2022-06-08 15:31:22,422 - distributed.utils_perf - WARNING - full garbage collections took 35% CPU time recently (threshold: 10%)
2022-06-08 15:31:23,467 - distributed.utils_perf - WARNING - full garbage collections took 35% CPU time recently (threshold: 10%)
2022-06-08 15:31:23,681 - distributed.utils_perf - WARNING - full garbage collections took 35% CPU time recently (threshold: 10%)
2022-06-08 15:31:24,005 - distributed.utils_perf - WARNING - full garbage collections took 36% CPU time recently (threshold: 10%)
2022-06-08 15:31:24,367 - distributed.utils_perf - WARNING - full garbage collections took 36% CPU time recently (threshold: 10%)
2022-06-08 15:31:24,638 - distributed.utils_perf - WARNING - full garbage collections took 37% CPU time recently (threshold: 10%)
2022-06-08 15:31:24,903 - distributed.utils_perf - WARNING - full garbage collections took 37% CPU time recently (threshold: 10%)
2022-06-08 15:31:25,182 - distributed.utils_perf - WARNING - full garbage collections took 38% CPU time recently (threshold: 10%)
2022-06-08 15:31:25,474 - distributed.utils_perf - WARNING - full garbage collections took 42% CPU time recently (threshold: 10%)
2022-06-08 15:31:25,762 - distributed.utils_perf - WARNING - full garbage collections took 43% CPU time recently (threshold: 10%)
2022-06-08 15:31:26,021 - distributed.utils_perf - WARNING - full garbage collections took 45% CPU time recently (threshold: 10%)
2022-06-08 15:31:26,386 - distributed.utils_perf - WARNING - full garbage collections took 47% CPU time recently (threshold: 10%)
2022-06-08 15:31:26,631 - distributed.utils_perf - WARNING - full garbage collections took 47% CPU time recently (threshold: 10%)
2022-06-08 15:31:26,844 - distributed.utils_perf - WARNING - full garbage collections took 48% CPU time recently (threshold: 10%)
2022-06-08 15:31:27,055 - distributed.utils_perf - WARNING - full garbage collections took 58% CPU time recently (threshold: 10%)
2022-06-08 15:31:27,256 - distributed.utils_perf - WARNING - full garbage collections took 63% CPU time recently (threshold: 10%)
2022-06-08 15:31:27,456 - distributed.utils_perf - WARNING - full garbage collections took 63% CPU time recently (threshold: 10%)
2022-06-08 15:31:27,731 - distributed.utils_perf - WARNING - full garbage collections took 69% CPU time recently (threshold: 10%)
2022-06-08 15:31:27,991 - distributed.utils_perf - WARNING - full garbage collections took 69% CPU time recently (threshold: 10%)
2022-06-08 15:31:28,252 - distributed.utils_perf - WARNING - full garbage collections took 72% CPU time recently (threshold: 10%)
2022-06-08 15:31:28,487 - distributed.utils_perf - WARNING - full garbage collections took 74% CPU time recently (threshold: 10%)
2022-06-08 15:31:28,702 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:31:28,923 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:31:29,150 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:31:29,375 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:31:29,594 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:31:29,801 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:31:30,001 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:31:30,267 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:31:30,527 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:31:30,852 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)

...
2022-06-08 15:32:09,244 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:09,469 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:09,796 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:10,128 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:10,330 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:10,527 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:10,721 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:11,045 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:11,244 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:11,441 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:11,745 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:12,027 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:12,309 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:12,532 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:12,729 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:12,989 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:13,311 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:13,628 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:13,904 - distributed.utils_perf - WARNING - full garbage collections took 97% CPU time recently (threshold: 10%)
2022-06-08 15:32:13,959 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:56342', name: 0, status: running, memory: 0, processing: 0>
2022-06-08 15:32:13,959 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:56342
2022-06-08 15:32:13,959 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-c98c8245-dba5-49ae-b23a-0f518fece5f0 Address tcp://127.0.0.1:56342 Status: Status.running
2022-06-08 15:32:13,959 - distributed.scheduler - INFO - Lost all workers
2022-06-08 15:46:52,352 - distributed.utils_perf - WARNING - full garbage collections took 28% CPU time recently (threshold: 10%)
2022-06-08 15:46:53,358 - distributed.utils_perf - WARNING - full garbage collections took 27% CPU time recently (threshold: 10%)
2022-06-08 15:46:54,328 - distributed.utils_perf - WARNING - full garbage collections took 26% CPU time recently (threshold: 10%)
2022-06-08 15:32:13,959 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:56342
------------------------------ Captured log call -------------------------------
ERROR    asyncio.events:utils.py:787 
Traceback (most recent call last):
  File "/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/base_events.py", line 646, in run_until_complete
    return future.result()
  File "/Users/runner/work/distributed/distributed/distributed/utils_test.py", line 470, in inner_fn
    return await async_fn(*args, **kwargs)
  File "/Users/runner/work/distributed/distributed/distributed/utils_test.py", line 1209, in async_fn_outer
    return await asyncio.wait_for(async_fn(), timeout=timeout * 2)
  File "/Users/runner/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 445, in wait_for
    return fut.result()
  File "/Users/runner/work/distributed/distributed/distributed/utils_test.py", line 1190, in async_fn
    raise RuntimeError("Unclosed Comms", get_unclosed())
RuntimeError: ('Unclosed Comms', [<TCP  local=tcp://127.0.0.1:56326 remote=tcp://127.0.0.1:56354>, <TCP (closed) Scheduler connection to worker local=tcp://127.0.0.1:56326 remote=tcp://127.0.0.1:56349>, <TCP ConnectionPool local=tcp://127.0.0.1:56331 remote=tcp://127.0.0.1:56326>, <TCP  local=tcp://127.0.0.1:56326 remote=tcp://127.0.0.1:56331>])
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/Users/runner/work/distributed/distributed/distributed/utils.py", line 761, in wrapper
    return await func(*args, **kwargs)
  File "/Users/runner/work/distributed/distributed/distributed/scheduler.py", line 3699, in add_worker
    await self.handle_worker(comm=comm, worker=address, stimulus_id=stimulus_id)
  File "/Users/runner/work/distributed/distributed/distributed/scheduler.py", line 4814, in handle_worker
    await self.handle_stream(comm=comm, extra={"worker": worker})
  File "/Users/runner/work/distributed/distributed/distributed/core.py", line 649, in handle_stream
    msgs = await comm.read()
  File "/Users/runner/work/distributed/distributed/distributed/comm/tcp.py", line 226, in read
    frames_nbytes = await stream.read_bytes(fmt_size)
asyncio.exceptions.CancelledError
- generated xml file: /Users/runner/work/distributed/distributed/reports/pytest.xml -

Metadata

Metadata

Assignees

No one assigned

    Labels

    flaky test — Intermittent failures on CI.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions