Skip to content

Task state validation failure for fetch with who_has #6147

@mrocklin

Description

@mrocklin
Traceback (most recent call last):
  File "/home/runner/work/distributed/distributed/distributed/worker.py", line 3975, in validate_task
    self.validate_task_fetch(ts)
  File "/home/runner/work/distributed/distributed/distributed/worker.py", line 3917, in validate_task_fetch
    assert ts.who_has
AssertionError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/ioloop.py", line 741, in _run_callback
    ret = callback()
  File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/ioloop.py", line 765, in _discard_future_result
    future.result()
  File "/home/runner/work/distributed/distributed/distributed/worker.py", line 1191, in handle_scheduler
    await self.handle_stream(comm, every_cycle=[self.ensure_communicating])
  File "/home/runner/work/distributed/distributed/distributed/core.py", line 625, in handle_stream
    handler(**merge(extra, msg))
  File "/home/runner/work/distributed/distributed/distributed/worker.py", line 1916, in handle_compute_task
    self.transitions(recommendations, stimulus_id=stimulus_id)
  File "/home/runner/work/distributed/distributed/distributed/worker.py", line 2621, in transitions
    self.validate_task(ts)
  File "/home/runner/work/distributed/distributed/distributed/worker.py", line 3985, in validate_task
    raise AssertionError(
AssertionError: Invalid TaskState encountered for <TaskState "('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)" fetch>.
Story:
[("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)", 'compute-task', 'compute-task-1650128700.1081934', 1650128700.1481702), ("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)", 'released', 'waiting', 'waiting', {"('rechunk-split-77b7d884d5a5f48f375d62bb7d136665', 1333)": 'fetch'}, 'compute-task-1650128700.1081934', 1650128700.1482384), ("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)", 'waiting', 'ready', 'ready', {}, 'ensure-communicating-1650128700.1599193', 1650128700.2175784), ("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)", 'ready', 'executing', 'executing', {}, 'compute-task-1650128696.340246', 1650128700.218642), ("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)", 'put-in-memory', 'compute-task-1650128696.340246', 1650128700.268273), ("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)", 'executing', 'memory', 'memory', {"('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 251)": 'executing'}, 'compute-task-1650128696.340246', 1650128700.268321), ('free-keys', ("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)",), 'processing-released-1650128700.5942154', 1650128701.1206636), ("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)", 'release-key', 'processing-released-1650128700.5942154', 1650128701.1206746), ("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)", 'memory', 'released', 'released', {"('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)": 'forgotten'}, 'processing-released-1650128700.5942154', 1650128701.1207392), ("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)", 'released', 'forgotten', 'forgotten', {}, 'processing-released-1650128700.5942154', 1650128701.1207535), 
("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)", 'ensure-task-exists', 'released', 'compute-task-1650128701.1707418', 1650128701.1864583), ("('rechunk-merge-77b7d884d5a5f48f375d62bb7d136665', 0, 333)", 'released', 'fetch', 'fetch', {}, 'compute-task-1650128701.1707418', 1650128701.1865091)]

https://github.com/dask/distributed/runs/6048720092?check_suite_focus=true

This was found by the new test_chaos_rechunk test.

cc @fjetter @gjoseph92

Metadata

Metadata

Assignees

No one assigned

    Labels

    deadlock: The cluster appears to not make any progress

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions