Open
Labels: flaky test (Intermittent failures on CI.)
Description
https://github.com/dask/distributed/runs/6606348836?check_suite_focus=true#step:11:1696
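A hypothetical way to reproduce this locally, assuming a development install of distributed with pytest available, is to rerun the test in a loop until the race is hit:

```python
# Sketch only: rerun the flaky test until it fails.
# The test path matches the traceback below; the iteration count is arbitrary.
import subprocess

for i in range(50):
    proc = subprocess.run(
        ["pytest", "-x", "distributed/tests/test_actor.py::test_one_thread_deadlock_timeout"]
    )
    if proc.returncode != 0:
        print(f"Failed on iteration {i}")
        break
```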
=================================== FAILURES ===================================
_______________________ test_one_thread_deadlock_timeout _______________________
    def test_one_thread_deadlock_timeout():
        class UsesCounter:
            # An actor whose method argument is another actor
            def do_inc(self, ac):
                # ac.increment() returns an EagerActorFuture and so the timeout
                # cannot expire
                return ac.increment().result(timeout=0.001)

>       with cluster(nworkers=1) as (cl, _):
distributed/tests/test_actor.py:655:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: in __exit__
    next(self.gen)
distributed/utils_test.py:705: in cluster
    with check_process_leak(check=True), check_instances(), _reconfigure():
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: in __exit__
    next(self.gen)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
    @contextmanager
    def check_instances():
        Client._instances.clear()
        Worker._instances.clear()
        Scheduler._instances.clear()
        SpecCluster._instances.clear()
        Worker._initialized_clients.clear()
        # assert all(n.status == "closed" for n in Nanny._instances), {
        #     n: n.status for n in Nanny._instances
        # }
        Nanny._instances.clear()
        _global_clients.clear()
        Comm._instances.clear()

        yield

        start = time()
        while set(_global_clients):
            sleep(0.1)
            assert time() < start + 10
        _global_clients.clear()

        for w in Worker._instances:
            with suppress(RuntimeError):  # closed IOLoop
                w.loop.add_callback(w.close, executor_wait=False)
                if w.status in WORKER_ANY_RUNNING:
                    w.loop.add_callback(w.close)
        Worker._instances.clear()

        start = time()
        while any(c.status != "closed" for c in Worker._initialized_clients):
            sleep(0.1)
            assert time() < start + 10
        Worker._initialized_clients.clear()

        for i in range(5):
            if all(c.closed() for c in Comm._instances):
                break
            else:
                sleep(0.1)
        else:
            L = [c for c in Comm._instances if not c.closed()]
            Comm._instances.clear()
>           raise ValueError("Unclosed Comms", L)
E           ValueError: ('Unclosed Comms', [<TCP local=tcp://127.0.0.1:39154 remote=tcp://127.0.0.1:33275>])
distributed/utils_test.py:1864: ValueError
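For context on the comment inside the test: `ac.increment()` can return an `EagerActorFuture`, i.e. a future that is already resolved by the time `.result(timeout=0.001)` is called, so the timeout can never expire. The same behaviour can be illustrated with a plain `concurrent.futures.Future` (an analogy, not distributed's actual class):

```python
from concurrent.futures import Future

# An already-resolved future returns from result(timeout=...) immediately,
# so even a 1 ms timeout cannot expire, analogous to EagerActorFuture.
f = Future()
f.set_result(42)
assert f.result(timeout=0.001) == 42  # no TimeoutError
```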
----------------------------- Captured stderr call -----------------------------
2022-05-26 08:29:08,406 - distributed.http.proxy - INFO - To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
2022-05-26 08:29:08,414 - distributed.scheduler - INFO - State start
2022-05-26 08:29:08,422 - distributed.scheduler - INFO - Clear task state
2022-05-26 08:29:08,422 - distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:33275
2022-05-26 08:29:08,422 - distributed.scheduler - INFO - dashboard at: 127.0.0.1:8787
2022-05-26 08:29:08,477 - distributed.worker - INFO - Start worker at: tcp://127.0.0.1:41889
2022-05-26 08:29:08,478 - distributed.worker - INFO - Listening to: tcp://127.0.0.1:41889
2022-05-26 08:29:08,478 - distributed.worker - INFO - dashboard at: 127.0.0.1:42083
2022-05-26 08:29:08,478 - distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:33275
2022-05-26 08:29:08,478 - distributed.worker - INFO - -------------------------------------------------
2022-05-26 08:29:08,478 - distributed.worker - INFO - Threads: 1
2022-05-26 08:29:08,478 - distributed.worker - INFO - Memory: 6.78 GiB
2022-05-26 08:29:08,478 - distributed.worker - INFO - Local Directory: /tmp/_dask_test_workerkwumbx6j/dask-worker-space/worker-3gkcfow7
2022-05-26 08:29:08,478 - distributed.worker - INFO - -------------------------------------------------
2022-05-26 08:29:08,884 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:41889', status: init, memory: 0, processing: 0>
2022-05-26 08:29:09,238 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:41889
2022-05-26 08:29:09,238 - distributed.core - INFO - Starting established connection
2022-05-26 08:29:09,238 - distributed.worker - INFO - Registered to: tcp://127.0.0.1:33275
2022-05-26 08:29:09,239 - distributed.worker - INFO - -------------------------------------------------
2022-05-26 08:29:09,240 - distributed.core - INFO - Starting established connection
2022-05-26 08:29:09,252 - distributed.scheduler - INFO - Receive client connection: Client-e9e9ce4f-dccd-11ec-8a5d-000d3a1d4f00
2022-05-26 08:29:09,253 - distributed.core - INFO - Starting established connection
2022-05-26 08:29:09,315 - distributed.scheduler - INFO - Scheduler closing...
2022-05-26 08:29:09,315 - distributed.scheduler - INFO - Scheduler closing all comms
2022-05-26 08:29:09,316 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:41889', status: running, memory: 2, processing: 0>
2022-05-26 08:29:09,317 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:41889
2022-05-26 08:29:09,317 - distributed.scheduler - INFO - Lost all workers
2022-05-26 08:29:09,318 - distributed.batched - INFO - Batched Comm Closed <TCP (closed) Scheduler->Client local=tcp://127.0.0.1:33275 remote=tcp://127.0.0.1:39146>
Traceback (most recent call last):
  File "/home/runner/work/distributed/distributed/distributed/batched.py", line 94, in _background_send
    nbytes = yield self.comm.write(
  File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py", line 762, in run
    value = future.result()
  File "/home/runner/work/distributed/distributed/distributed/comm/tcp.py", line 269, in write
    raise CommClosedError()
distributed.comm.core.CommClosedError
2022-05-26 08:29:09,436 - distributed.client - ERROR -
Traceback (most recent call last):
  File "/home/runner/work/distributed/distributed/distributed/utils.py", line 761, in wrapper
    return await func(*args, **kwargs)
  File "/home/runner/work/distributed/distributed/distributed/client.py", line 1225, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/runner/work/distributed/distributed/distributed/client.py", line 1255, in _ensure_connected
    comm = await connect(
  File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 326, in connect
    handshake = await asyncio.wait_for(comm.read(), time_left())
  File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 432, in wait_for
    await waiter
asyncio.exceptions.CancelledError
------------------------------ Captured log call -------------------------------
ERROR asyncio.events:utils.py:787
Traceback (most recent call last):
  File "/home/runner/work/distributed/distributed/distributed/utils.py", line 761, in wrapper
    return await func(*args, **kwargs)
  File "/home/runner/work/distributed/distributed/distributed/client.py", line 1414, in _handle_report
    await self._reconnect()
  File "/home/runner/work/distributed/distributed/distributed/utils.py", line 761, in wrapper
    return await func(*args, **kwargs)
  File "/home/runner/work/distributed/distributed/distributed/client.py", line 1225, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/runner/work/distributed/distributed/distributed/client.py", line 1255, in _ensure_connected
    comm = await connect(
  File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 326, in connect
    handshake = await asyncio.wait_for(comm.read(), time_left())
  File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 432, in wait_for
    await waiter
asyncio.exceptions.CancelledError
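The leaked comm (`local=tcp://127.0.0.1:39154 remote=tcp://127.0.0.1:33275`) points at the scheduler's address, and both captured tracebacks show the client's `_reconnect` task being cancelled inside `connect()` while waiting for the handshake. That suggests the connection was abandoned half-open during cluster teardown, and the roughly 0.5 s grace period in `check_instances` then flagged it. If that reading is right, one possible shape of a fix, sketched here under the assumption that the comm object exposes `read()`/`abort()` as in distributed's Comm interface (this is not a tested patch), would be to tear the transport down when the handshake wait is cancelled:

```python
import asyncio
from contextlib import suppress

async def read_handshake(comm, deadline):
    # Sketch: if the handshake read is cancelled or times out, abort the
    # underlying transport so no half-open comm is leaked.
    # `deadline` plays the role of time_left() in comm/core.py's connect().
    try:
        return await asyncio.wait_for(comm.read(), deadline)
    except BaseException:
        with suppress(Exception):
            comm.abort()
        raise
```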