-
-
Notifications
You must be signed in to change notification settings - Fork 757
Open
Description
The below stress test reliably crashes after ~35s with StreamClosedError on a 32 core, 64 GiB RAM host. AMM is off. Neither the RAM nor the CPU are being saturated (peak CPU 80%, peak RAM 35 GiB). This happens both on the main branch (post WSMR) as well as 2021.09.01 (pre WSMR).
May or may not duplicate #5370
CC @fjetter
from distributed.utils_test import gen_cluster
from distributed import Nanny
import dask.array as da
@gen_cluster(client=True, nthreads=[("", 1)] * 16, Worker=Nanny)
async def test_stress(c, s, *nannies):
    rng = da.random.RandomState(0)
    a = rng.random((20_000, 20_000), chunks=(1_000, 1_000))  # 7.6 MiB chunks
    b = (a @ a.T).sum()
    await c.compute(b)

Output:
FAILED [100%]distributed.http.proxy - INFO - To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:43335
distributed.scheduler - INFO - dashboard at: 127.0.0.1:42343
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:38299'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:36325'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:35061'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:37267'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:34401'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:41879'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:37015'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:40451'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:36915'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:46021'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:34787'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:35003'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:35495'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:45721'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:34825'
distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:35985'
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:42219
distributed.worker - INFO - Listening to: tcp://127.0.0.1:42219
distributed.worker - INFO - dashboard at: 127.0.0.1:39845
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-an1_udxn
distributed.worker - INFO - -------------------------------------------------
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:42219', name: 2, memory: 0, processing: 0>
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:41215
distributed.worker - INFO - Listening to: tcp://127.0.0.1:41215
distributed.worker - INFO - dashboard at: 127.0.0.1:36139
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-yjy0_3_h
distributed.worker - INFO - -------------------------------------------------
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:42219
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:41215', name: 0, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:41215
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:41665
distributed.worker - INFO - Listening to: tcp://127.0.0.1:41665
distributed.worker - INFO - dashboard at: 127.0.0.1:38307
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-p9420cxc
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:41507
distributed.worker - INFO - Listening to: tcp://127.0.0.1:41507
distributed.worker - INFO - dashboard at: 127.0.0.1:33103
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-3vdrs1p1
distributed.worker - INFO - -------------------------------------------------
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:41665', name: 3, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:41665
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:41507', name: 6, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:41507
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:46489
distributed.worker - INFO - Listening to: tcp://127.0.0.1:46489
distributed.worker - INFO - dashboard at: 127.0.0.1:35857
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-mbuivaux
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:35605
distributed.worker - INFO - Listening to: tcp://127.0.0.1:35605
distributed.worker - INFO - dashboard at: 127.0.0.1:34609
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-i1oe2n60
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:44849
distributed.worker - INFO - Listening to: tcp://127.0.0.1:44849
distributed.worker - INFO - dashboard at: 127.0.0.1:35215
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-s6v7nle6
distributed.worker - INFO - -------------------------------------------------
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:46489', name: 7, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:46489
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:35605', name: 4, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:35605
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:44849', name: 5, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:44849
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:43025
distributed.worker - INFO - Listening to: tcp://127.0.0.1:43025
distributed.worker - INFO - dashboard at: 127.0.0.1:44143
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-m480ko5u
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:40381
distributed.worker - INFO - Listening to: tcp://127.0.0.1:40381
distributed.worker - INFO - dashboard at: 127.0.0.1:43667
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-bnonvxhx
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:33031
distributed.worker - INFO - Listening to: tcp://127.0.0.1:33031
distributed.worker - INFO - dashboard at: 127.0.0.1:46829
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-n_i88ztb
distributed.worker - INFO - -------------------------------------------------
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:43025', name: 9, memory: 0, processing: 0>
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:42021
distributed.worker - INFO - Listening to: tcp://127.0.0.1:42021
distributed.worker - INFO - dashboard at: 127.0.0.1:45097
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:43025
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-nam_4xmz
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:40381', name: 8, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:40381
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:33031', name: 10, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:33031
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:42021', name: 11, memory: 0, processing: 0>
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:42021
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:36621
distributed.worker - INFO - Listening to: tcp://127.0.0.1:36621
distributed.worker - INFO - dashboard at: 127.0.0.1:34179
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-4zdsbrf0
distributed.worker - INFO - -------------------------------------------------
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:36621', name: 14, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:36621
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:37217
distributed.worker - INFO - Listening to: tcp://127.0.0.1:37217
distributed.worker - INFO - dashboard at: 127.0.0.1:39209
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-uurum51c
distributed.worker - INFO - -------------------------------------------------
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:37217', name: 1, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:37217
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:44253
distributed.worker - INFO - Listening to: tcp://127.0.0.1:44253
distributed.worker - INFO - dashboard at: 127.0.0.1:36551
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-nnz8hnrp
distributed.worker - INFO - -------------------------------------------------
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:44253', name: 13, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:44253
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:45265
distributed.worker - INFO - Listening to: tcp://127.0.0.1:45265
distributed.worker - INFO - dashboard at: 127.0.0.1:42235
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-gn9fx7mh
distributed.worker - INFO - -------------------------------------------------
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:45265', name: 15, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:45265
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Start worker at: tcp://127.0.0.1:35323
distributed.worker - INFO - Listening to: tcp://127.0.0.1:35323
distributed.worker - INFO - dashboard at: 127.0.0.1:41021
distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 1
distributed.worker - INFO - Memory: 62.76 GiB
distributed.worker - INFO - Local Directory: /home/crusaderky/github/distributed/distributed/dask-worker-space/worker-e6is398n
distributed.worker - INFO - -------------------------------------------------
distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:35323', name: 12, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:35323
distributed.core - INFO - Starting established connection
distributed.worker - INFO - Registered to: tcp://127.0.0.1:43335
distributed.worker - INFO - -------------------------------------------------
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-f1ec2065-2143-11ec-ab9b-b42e99c1ab7d
distributed.core - INFO - Starting established connection
distributed.utils_perf - INFO - full garbage collection released 30.21 MiB from 508 reference cycles (threshold: 9.54 MiB)
distributed.scheduler - INFO - Remove client Client-f1ec2065-2143-11ec-ab9b-b42e99c1ab7d
distributed.scheduler - INFO - Remove client Client-f1ec2065-2143-11ec-ab9b-b42e99c1ab7d
distributed.scheduler - INFO - Close client connection: Client-f1ec2065-2143-11ec-ab9b-b42e99c1ab7d
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:38299'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:36325'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:35061'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:37267'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:34401'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:41879'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:37015'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:40451'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:36915'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:46021'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:34787'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:35003'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:35495'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:45721'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:34825'
distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:35985'
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:33031
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:33031', name: 10, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:33031
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:41215
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:36621
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:42021
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:44253
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:35323
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:42219
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:36621', name: 14, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:36621
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:44253', name: 13, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:44253
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:35323', name: 12, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:35323
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:42219', name: 2, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:42219
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:46489
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:44849
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:40381
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:40381', name: 8, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:40381
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:37217
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:37217', name: 1, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:37217
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:41215', name: 0, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:41215
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:35605
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:45265
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:43025
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:44849', name: 5, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:44849
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:42021', name: 11, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:42021
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:35605', name: 4, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:35605
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:46489', name: 7, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:46489
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:45265', name: 15, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:45265
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:43025', name: 9, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:43025
distributed.worker - WARNING - Heartbeat to scheduler failed
Traceback (most recent call last):
File "/home/crusaderky/github/distributed/distributed/comm/tcp.py", line 198, in read
frames_nbytes = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/crusaderky/github/distributed/distributed/worker.py", line 1063, in heartbeat
response = await retry_operation(
File "/home/crusaderky/github/distributed/distributed/utils_comm.py", line 385, in retry_operation
return await retry(
File "/home/crusaderky/github/distributed/distributed/utils_comm.py", line 370, in retry
return await coro()
File "/home/crusaderky/github/distributed/distributed/core.py", line 860, in send_recv_from_rpc
result = await send_recv(comm=comm, op=key, **kwargs)
File "/home/crusaderky/github/distributed/distributed/core.py", line 637, in send_recv
response = await comm.read(deserializers=deserializers)
File "/home/crusaderky/github/distributed/distributed/comm/tcp.py", line 214, in read
convert_stream_closed_error(self, e)
File "/home/crusaderky/github/distributed/distributed/comm/tcp.py", line 128, in convert_stream_closed_error
raise CommClosedError(f"in {obj}: {exc}") from exc
distributed.comm.core.CommClosedError: in <TCP (closed) ConnectionPool.heartbeat_worker local=tcp://127.0.0.1:48266 remote=tcp://127.0.0.1:43335>: Stream is closed
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:41665
distributed.worker - WARNING - Heartbeat to scheduler failed
Traceback (most recent call last):
File "/home/crusaderky/github/distributed/distributed/comm/tcp.py", line 198, in read
frames_nbytes = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/crusaderky/github/distributed/distributed/worker.py", line 1063, in heartbeat
response = await retry_operation(
File "/home/crusaderky/github/distributed/distributed/utils_comm.py", line 385, in retry_operation
return await retry(
File "/home/crusaderky/github/distributed/distributed/utils_comm.py", line 370, in retry
return await coro()
File "/home/crusaderky/github/distributed/distributed/core.py", line 860, in send_recv_from_rpc
result = await send_recv(comm=comm, op=key, **kwargs)
File "/home/crusaderky/github/distributed/distributed/core.py", line 637, in send_recv
response = await comm.read(deserializers=deserializers)
File "/home/crusaderky/github/distributed/distributed/comm/tcp.py", line 214, in read
convert_stream_closed_error(self, e)
File "/home/crusaderky/github/distributed/distributed/comm/tcp.py", line 128, in convert_stream_closed_error
raise CommClosedError(f"in {obj}: {exc}") from exc
distributed.comm.core.CommClosedError: in <TCP (closed) ConnectionPool.heartbeat_worker local=tcp://127.0.0.1:47762 remote=tcp://127.0.0.1:43335>: Stream is closed
distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:41507
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:41507', name: 6, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:41507
distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:41665', name: 3, memory: 0, processing: 0>
distributed.core - INFO - Removing comms to tcp://127.0.0.1:41665
distributed.scheduler - INFO - Lost all workers
distributed.worker - WARNING - Heartbeat to scheduler failed
Traceback (most recent call last):
File "/home/crusaderky/github/distributed/distributed/comm/tcp.py", line 198, in read
frames_nbytes = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/crusaderky/github/distributed/distributed/worker.py", line 1063, in heartbeat
response = await retry_operation(
File "/home/crusaderky/github/distributed/distributed/utils_comm.py", line 385, in retry_operation
return await retry(
File "/home/crusaderky/github/distributed/distributed/utils_comm.py", line 370, in retry
return await coro()
File "/home/crusaderky/github/distributed/distributed/core.py", line 860, in send_recv_from_rpc
result = await send_recv(comm=comm, op=key, **kwargs)
File "/home/crusaderky/github/distributed/distributed/core.py", line 637, in send_recv
response = await comm.read(deserializers=deserializers)
File "/home/crusaderky/github/distributed/distributed/comm/tcp.py", line 214, in read
convert_stream_closed_error(self, e)
File "/home/crusaderky/github/distributed/distributed/comm/tcp.py", line 128, in convert_stream_closed_error
raise CommClosedError(f"in {obj}: {exc}") from exc
distributed.comm.core.CommClosedError: in <TCP (closed) ConnectionPool.heartbeat_worker local=tcp://127.0.0.1:48272 remote=tcp://127.0.0.1:43335>: Stream is closed
distributed.scheduler - INFO - Scheduler closing...
distributed.scheduler - INFO - Scheduler closing all comms
distributed/test1.py:5 (test_deadlock)
c = <Client: 'tcp://127.0.0.1:43335' processes=16 threads=16, memory=0.98 TiB>
s = <Scheduler: "tcp://127.0.0.1:43335" workers: 0 cores: 0, tasks: 0>
nannies = (<Nanny: None, threads: 1>, <Nanny: None, threads: 1>, <Nanny: None, threads: 1>, <Nanny: None, threads: 1>, <Nanny: None, threads: 1>, <Nanny: None, threads: 1>, ...)
rng = <dask.array.random.RandomState object at 0x7f79b04f0370>
a = dask.array<random_sample, shape=(20000, 20000), dtype=float64, chunksize=(1000, 1000), chunktype=numpy.ndarray>
b = dask.array<sum-aggregate, shape=(), dtype=float64, chunksize=(), chunktype=numpy.ndarray>
@gen_cluster(client=True, nthreads=[("", 1)] * 16, Worker=Nanny)
async def test_deadlock(c, s, *nannies):
rng = da.random.RandomState(0)
a = rng.random((20000, 20000), chunks=(1000, 1000))
b = (a @ a.T).sum()
> await c.compute(b)
test1.py:11:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <Future: cancelled, key: finalize-16518575106c5aff3e520a6947d8383a>
raiseit = True
async def _result(self, raiseit=True):
> await self._state.wait()
client.py:235:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <FutureState: cancelled>, timeout = None
async def wait(self, timeout=None):
> await asyncio.wait_for(self._get_event().wait(), timeout)
client.py:474:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
fut = <coroutine object Event.wait at 0x7f7954651f40>, timeout = None
async def wait_for(fut, timeout, *, loop=None):
"""Wait for the single Future or coroutine to complete, with timeout.
Coroutine will be wrapped in Task.
Returns result of the Future or coroutine. When a timeout occurs,
it cancels the task and raises TimeoutError. To avoid the task
cancellation, wrap it in shield().
If the wait is cancelled, the task is also cancelled.
This function is a coroutine.
"""
if loop is None:
loop = events.get_running_loop()
else:
warnings.warn("The loop argument is deprecated since Python 3.8, "
"and scheduled for removal in Python 3.10.",
DeprecationWarning, stacklevel=2)
if timeout is None:
> return await fut
../../../miniconda3/envs/distributed39/lib/python3.9/asyncio/tasks.py:442:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <asyncio.locks.Event object at 0x7f7915cc0130 [set]>
async def wait(self):
"""Block until the internal flag is true.
If the internal flag is true on entry, return True
immediately. Otherwise, block until another coroutine calls
set() to set the flag to true, then return True.
"""
if self._value:
return True
fut = self._loop.create_future()
self._waiters.append(fut)
try:
> await fut
E asyncio.exceptions.CancelledError
../../../miniconda3/envs/distributed39/lib/python3.9/asyncio/locks.py:226: CancelledError
During handling of the above exception, another exception occurred:
fut = <Task cancelled name='Task-262' coro=<test_deadlock() done, defined at /home/crusaderky/github/distributed/distributed/test1.py:6>>
timeout = 30
async def wait_for(fut, timeout, *, loop=None):
"""Wait for the single Future or coroutine to complete, with timeout.
Coroutine will be wrapped in Task.
Returns result of the Future or coroutine. When a timeout occurs,
it cancels the task and raises TimeoutError. To avoid the task
cancellation, wrap it in shield().
If the wait is cancelled, the task is also cancelled.
This function is a coroutine.
"""
if loop is None:
loop = events.get_running_loop()
else:
warnings.warn("The loop argument is deprecated since Python 3.8, "
"and scheduled for removal in Python 3.10.",
DeprecationWarning, stacklevel=2)
if timeout is None:
return await fut
if timeout <= 0:
fut = ensure_future(fut, loop=loop)
if fut.done():
return fut.result()
await _cancel_and_wait(fut, loop=loop)
try:
fut.result()
except exceptions.CancelledError as exc:
raise exceptions.TimeoutError() from exc
else:
raise exceptions.TimeoutError()
waiter = loop.create_future()
timeout_handle = loop.call_later(timeout, _release_waiter, waiter)
cb = functools.partial(_release_waiter, waiter)
fut = ensure_future(fut, loop=loop)
fut.add_done_callback(cb)
try:
# wait until the future completes or the timeout
try:
await waiter
except exceptions.CancelledError:
if fut.done():
return fut.result()
else:
fut.remove_done_callback(cb)
# We must ensure that the task is not running
# after wait_for() returns.
# See https://bugs.python.org/issue32751
await _cancel_and_wait(fut, loop=loop)
raise
if fut.done():
return fut.result()
else:
fut.remove_done_callback(cb)
# We must ensure that the task is not running
# after wait_for() returns.
# See https://bugs.python.org/issue32751
await _cancel_and_wait(fut, loop=loop)
# In case task cancellation failed with some
# exception, we should re-raise it
# See https://bugs.python.org/issue40607
try:
> fut.result()
E asyncio.exceptions.CancelledError
../../../miniconda3/envs/distributed39/lib/python3.9/asyncio/tasks.py:492: CancelledError
The above exception was the direct cause of the following exception:
outer_args = (), kwargs = {}, result = None
coro = <function gen_cluster.<locals>._.<locals>.test_func.<locals>.coro at 0x7f79b332a550>
@functools.wraps(func)
def test_func(*outer_args, **kwargs):
result = None
workers = []
with clean(timeout=active_rpc_timeout, **clean_kwargs) as loop:
async def coro():
with dask.config.set(config):
s = False
for _ in range(60):
try:
s, ws = await start_cluster(
nthreads,
scheduler,
loop,
security=security,
Worker=Worker,
scheduler_kwargs=scheduler_kwargs,
worker_kwargs=worker_kwargs,
)
except Exception as e:
logger.error(
"Failed to start gen_cluster: "
f"{e.__class__.__name__}: {e}; retrying",
exc_info=True,
)
await asyncio.sleep(1)
else:
workers[:] = ws
args = [s] + workers
break
if s is False:
raise Exception("Could not start cluster")
if client:
c = await Client(
s.address,
loop=loop,
security=security,
asynchronous=True,
**client_kwargs,
)
args = [c] + args
try:
future = func(*args, *outer_args, **kwargs)
future = asyncio.wait_for(future, timeout)
result = await future
if s.validate:
s.validate_state()
finally:
if client and c.status not in ("closing", "closed"):
await c._close(fast=s.status == Status.closed)
await end_cluster(s, workers)
await asyncio.wait_for(cleanup_global_workers(), 1)
try:
c = await default_client()
except ValueError:
pass
else:
await c._close(fast=True)
def get_unclosed():
return [c for c in Comm._instances if not c.closed()] + [
c
for c in _global_clients.values()
if c.status != "closed"
]
try:
start = time()
while time() < start + 60:
gc.collect()
if not get_unclosed():
break
await asyncio.sleep(0.05)
else:
if allow_unclosed:
print(f"Unclosed Comms: {get_unclosed()}")
else:
raise RuntimeError("Unclosed Comms", get_unclosed())
finally:
Comm._instances.clear()
_global_clients.clear()
return result
> result = loop.run_sync(
coro, timeout=timeout * 2 if timeout else timeout
)
utils_test.py:994:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../../miniconda3/envs/distributed39/lib/python3.9/site-packages/tornado/ioloop.py:530: in run_sync
return future_cell[0].result()
utils_test.py:953: in coro
result = await future
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
fut = <Task cancelled name='Task-262' coro=<test_deadlock() done, defined at /home/crusaderky/github/distributed/distributed/test1.py:6>>
timeout = 30
async def wait_for(fut, timeout, *, loop=None):
"""Wait for the single Future or coroutine to complete, with timeout.
Coroutine will be wrapped in Task.
Returns result of the Future or coroutine. When a timeout occurs,
it cancels the task and raises TimeoutError. To avoid the task
cancellation, wrap it in shield().
If the wait is cancelled, the task is also cancelled.
This function is a coroutine.
"""
if loop is None:
loop = events.get_running_loop()
else:
warnings.warn("The loop argument is deprecated since Python 3.8, "
"and scheduled for removal in Python 3.10.",
DeprecationWarning, stacklevel=2)
if timeout is None:
return await fut
if timeout <= 0:
fut = ensure_future(fut, loop=loop)
if fut.done():
return fut.result()
await _cancel_and_wait(fut, loop=loop)
try:
fut.result()
except exceptions.CancelledError as exc:
raise exceptions.TimeoutError() from exc
else:
raise exceptions.TimeoutError()
waiter = loop.create_future()
timeout_handle = loop.call_later(timeout, _release_waiter, waiter)
cb = functools.partial(_release_waiter, waiter)
fut = ensure_future(fut, loop=loop)
fut.add_done_callback(cb)
try:
# wait until the future completes or the timeout
try:
await waiter
except exceptions.CancelledError:
if fut.done():
return fut.result()
else:
fut.remove_done_callback(cb)
# We must ensure that the task is not running
# after wait_for() returns.
# See https://bugs.python.org/issue32751
await _cancel_and_wait(fut, loop=loop)
raise
if fut.done():
return fut.result()
else:
fut.remove_done_callback(cb)
# We must ensure that the task is not running
# after wait_for() returns.
# See https://bugs.python.org/issue32751
await _cancel_and_wait(fut, loop=loop)
# In case task cancellation failed with some
# exception, we should re-raise it
# See https://bugs.python.org/issue40607
try:
fut.result()
except exceptions.CancelledError as exc:
> raise exceptions.TimeoutError() from exc
E asyncio.exceptions.TimeoutError
../../../miniconda3/envs/distributed39/lib/python3.9/asyncio/tasks.py:494: TimeoutError
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels