
flaky test tests/test_actor.py::test_one_thread_deadlock_timeout #6453

Description

@graingert

https://github.com/dask/distributed/runs/6606348836?check_suite_focus=true#step:11:1696
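
The test itself completed; the failure comes from the check_instances() teardown in distributed/utils_test.py, which found a TCP comm to the scheduler (remote tcp://127.0.0.1:33275) still open after the cluster shut down. Full traceback below.

For context on the comment inside the test: ac.increment() returns an EagerActorFuture, a future that already holds its result, so the 0.001 s timeout passed to .result() can never expire. A minimal illustration of that behaviour with a plain concurrent.futures.Future (an analogy only, not the distributed implementation):

    # Illustration only: an already-resolved future returns its value
    # immediately, so an arbitrarily small timeout cannot expire.
    import concurrent.futures

    f = concurrent.futures.Future()
    f.set_result(1)  # the result is set before .result() is called
    assert f.result(timeout=0.001) == 1  # returns immediately; no waiting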

=================================== FAILURES ===================================
_______________________ test_one_thread_deadlock_timeout _______________________

    def test_one_thread_deadlock_timeout():
        class UsesCounter:
            # An actor whose method argument is another actor
    
            def do_inc(self, ac):
                # ac.increment() returns an EagerActorFuture and so the timeout
                # cannot expire
                return ac.increment().result(timeout=0.001)
    
>       with cluster(nworkers=1) as (cl, _):

distributed/tests/test_actor.py:655: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: in __exit__
    next(self.gen)
distributed/utils_test.py:705: in cluster
    with check_process_leak(check=True), check_instances(), _reconfigure():
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: in __exit__
    next(self.gen)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    @contextmanager
    def check_instances():
        Client._instances.clear()
        Worker._instances.clear()
        Scheduler._instances.clear()
        SpecCluster._instances.clear()
        Worker._initialized_clients.clear()
        # assert all(n.status == "closed" for n in Nanny._instances), {
        #     n: n.status for n in Nanny._instances
        # }
        Nanny._instances.clear()
        _global_clients.clear()
        Comm._instances.clear()
    
        yield
    
        start = time()
        while set(_global_clients):
            sleep(0.1)
            assert time() < start + 10
    
        _global_clients.clear()
    
        for w in Worker._instances:
            with suppress(RuntimeError):  # closed IOLoop
                w.loop.add_callback(w.close, executor_wait=False)
                if w.status in WORKER_ANY_RUNNING:
                    w.loop.add_callback(w.close)
        Worker._instances.clear()
    
        start = time()
        while any(c.status != "closed" for c in Worker._initialized_clients):
            sleep(0.1)
            assert time() < start + 10
        Worker._initialized_clients.clear()
    
        for i in range(5):
            if all(c.closed() for c in Comm._instances):
                break
            else:
                sleep(0.1)
        else:
            L = [c for c in Comm._instances if not c.closed()]
            Comm._instances.clear()
>           raise ValueError("Unclosed Comms", L)
E           ValueError: ('Unclosed Comms', [<TCP  local=tcp://127.0.0.1:39154 remote=tcp://127.0.0.1:33275>])

distributed/utils_test.py:1864: ValueError
----------------------------- Captured stderr call -----------------------------
2022-05-26 08:29:08,406 - distributed.http.proxy - INFO - To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
2022-05-26 08:29:08,414 - distributed.scheduler - INFO - State start
2022-05-26 08:29:08,422 - distributed.scheduler - INFO - Clear task state
2022-05-26 08:29:08,422 - distributed.scheduler - INFO -   Scheduler at:     tcp://127.0.0.1:33275
2022-05-26 08:29:08,422 - distributed.scheduler - INFO -   dashboard at:            127.0.0.1:8787
2022-05-26 08:29:08,477 - distributed.worker - INFO -       Start worker at:      tcp://127.0.0.1:41889
2022-05-26 08:29:08,478 - distributed.worker - INFO -          Listening to:      tcp://127.0.0.1:41889
2022-05-26 08:29:08,478 - distributed.worker - INFO -          dashboard at:            127.0.0.1:42083
2022-05-26 08:29:08,478 - distributed.worker - INFO - Waiting to connect to:      tcp://127.0.0.1:33275
2022-05-26 08:29:08,478 - distributed.worker - INFO - -------------------------------------------------
2022-05-26 08:29:08,478 - distributed.worker - INFO -               Threads:                          1
2022-05-26 08:29:08,478 - distributed.worker - INFO -                Memory:                   6.78 GiB
2022-05-26 08:29:08,478 - distributed.worker - INFO -       Local Directory: /tmp/_dask_test_workerkwumbx6j/dask-worker-space/worker-3gkcfow7
2022-05-26 08:29:08,478 - distributed.worker - INFO - -------------------------------------------------
2022-05-26 08:29:08,884 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:41889', status: init, memory: 0, processing: 0>
2022-05-26 08:29:09,238 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:41889
2022-05-26 08:29:09,238 - distributed.core - INFO - Starting established connection
2022-05-26 08:29:09,238 - distributed.worker - INFO -         Registered to:      tcp://127.0.0.1:33275
2022-05-26 08:29:09,239 - distributed.worker - INFO - -------------------------------------------------
2022-05-26 08:29:09,240 - distributed.core - INFO - Starting established connection
2022-05-26 08:29:09,252 - distributed.scheduler - INFO - Receive client connection: Client-e9e9ce4f-dccd-11ec-8a5d-000d3a1d4f00
2022-05-26 08:29:09,253 - distributed.core - INFO - Starting established connection
2022-05-26 08:29:09,315 - distributed.scheduler - INFO - Scheduler closing...
2022-05-26 08:29:09,315 - distributed.scheduler - INFO - Scheduler closing all comms
2022-05-26 08:29:09,316 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:41889', status: running, memory: 2, processing: 0>
2022-05-26 08:29:09,317 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:41889
2022-05-26 08:29:09,317 - distributed.scheduler - INFO - Lost all workers
2022-05-26 08:29:09,318 - distributed.batched - INFO - Batched Comm Closed <TCP (closed) Scheduler->Client local=tcp://127.0.0.1:33275 remote=tcp://127.0.0.1:39146>
Traceback (most recent call last):
  File "/home/runner/work/distributed/distributed/distributed/batched.py", line 94, in _background_send
    nbytes = yield self.comm.write(
  File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py", line 762, in run
    value = future.result()
  File "/home/runner/work/distributed/distributed/distributed/comm/tcp.py", line 269, in write
    raise CommClosedError()
distributed.comm.core.CommClosedError
2022-05-26 08:29:09,436 - distributed.client - ERROR - 
Traceback (most recent call last):
  File "/home/runner/work/distributed/distributed/distributed/utils.py", line 761, in wrapper
    return await func(*args, **kwargs)
  File "/home/runner/work/distributed/distributed/distributed/client.py", line 1225, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/runner/work/distributed/distributed/distributed/client.py", line 1255, in _ensure_connected
    comm = await connect(
  File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 326, in connect
    handshake = await asyncio.wait_for(comm.read(), time_left())
  File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 432, in wait_for
    await waiter
asyncio.exceptions.CancelledError
------------------------------ Captured log call -------------------------------
ERROR    asyncio.events:utils.py:787 
Traceback (most recent call last):
  File "/home/runner/work/distributed/distributed/distributed/utils.py", line 761, in wrapper
    return await func(*args, **kwargs)
  File "/home/runner/work/distributed/distributed/distributed/client.py", line 1414, in _handle_report
    await self._reconnect()
  File "/home/runner/work/distributed/distributed/distributed/utils.py", line 761, in wrapper
    return await func(*args, **kwargs)
  File "/home/runner/work/distributed/distributed/distributed/client.py", line 1225, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/runner/work/distributed/distributed/distributed/client.py", line 1255, in _ensure_connected
    comm = await connect(
  File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 326, in connect
    handshake = await asyncio.wait_for(comm.read(), time_left())
  File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 432, in wait_for
    await waiter
asyncio.exceptions.CancelledError
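
The flaky part is the teardown, not the test body: the loop at the end of check_instances() polls Comm._instances only five times with 0.1 s sleeps, so a comm that takes more than about half a second to finish closing on a slow CI runner trips the ValueError above. The CancelledError captured in the logs appears to be fallout from the same shutdown race, with the client still trying to reconnect while the scheduler was closing. One possible mitigation would be a deadline-based wait; the sketch below uses hypothetical names and is not an actual patch:

    from time import sleep, time

    def wait_for_comms_closed(instances, deadline=5.0):
        # Poll until every comm reports closed, or give up after `deadline`
        # seconds and raise with the comms that are still open.
        start = time()
        while not all(c.closed() for c in instances):
            if time() - start > deadline:
                raise ValueError(
                    "Unclosed Comms", [c for c in instances if not c.closed()]
                )
            sleep(0.1)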

Metadata

    Labels

    flaky test (Intermittent failures on CI)
