# distributed.yaml
distributed:
version: 2
# logging:
# distributed: info
# distributed.client: warning
# distributed.gc: warning
# bokeh: error
# # http://stackoverflow.com/questions/21234772/python-tornado-disable-logging-to-stderr
# tornado: critical
# tornado.application: error
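  # Note: any value in this file can be overridden without editing it, e.g. from a
  # user config file (typically ~/.config/dask/distributed.yaml) or with a
  # DASK_-prefixed environment variable. A minimal sketch, using an illustrative
  # (non-default) value:
  #
  # distributed:
  #   scheduler:
  #     work-stealing: False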
scheduler:
allowed-failures: 3 # number of retries before a task is considered bad
bandwidth: 100000000 # 100 MB/s estimated worker-worker bandwidth
blocked-handlers: []
contact-address: null
default-data-size: 1kiB
# Whether to reuse the same Scheduler to Worker comm for repeated broadcasts.
reuse-broadcast-comm: True
# Number of seconds to wait until workers or clients are removed from the events log
# after they have been removed from the scheduler
events-cleanup-delay: 1h
idle-timeout: null # Shut down after this duration, like "1h" or "30 minutes"
no-workers-timeout: null # If a task remains unrunnable for longer than this, it fails.
work-stealing: True # workers should steal tasks from each other
work-stealing-interval: 1s # Callback time for work stealing
worker-saturation: 1.1 # Send this fraction of nthreads root tasks to workers
    rootish-taskgroup: 5 # maximum number of dependencies of a task group for it to be considered root-ish
    rootish-taskgroup-dependencies: 5 # maximum number of dependencies of those dependencies for the task group to be considered root-ish
worker-ttl: "5 minutes" # like '60s'. Time to live for workers. They must heartbeat faster than this
preload: [] # Run custom modules with Scheduler
preload-argv: [] # See https://docs.dask.org/en/latest/how-to/customize-initialization.html
unknown-task-duration: 500ms # Default duration for all tasks with unknown durations ("15m", "2h")
default-task-durations: # How long we expect function names to run ("1h", "1s") (helps for long tasks)
rechunk-split: 1us
split-shuffle: 1us
split-taskshuffle: 1us
split-stage: 1us
validate: False # Check scheduler state at every step for debugging
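    # Example override (commented out; illustrative values, not defaults): shut
    # the scheduler down after an hour with no activity, and fail tasks that stay
    # unrunnable for ten minutes.
    #
    # idle-timeout: "1h"
    # no-workers-timeout: "10 minutes"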
dashboard:
status:
task-stream-length: 1000
tasks:
task-stream-length: 100000
tls:
ca-file: null
key: null
cert: null
bokeh-application: # keywords to pass to BokehTornado application
allow_websocket_origin: ["*"]
keep_alive_milliseconds: 500
check_unused_sessions_milliseconds: 500
locks:
      lease-validation-interval: 10s # The interval at which the scheduler checks all acquired leases for staleness. Must always be smaller than the lease-timeout itself.
lease-timeout: 30s # Maximum interval to wait for a Client refresh before a lease is invalidated and released.
http:
routes:
- distributed.http.scheduler.prometheus
- distributed.http.scheduler.info
- distributed.http.scheduler.json
- distributed.http.health
- distributed.http.proxy
- distributed.http.statics
allowed-imports:
- dask
- distributed
active-memory-manager:
# Set to true to auto-start the Active Memory Manager on Scheduler start; if false
# you'll have to either manually start it with client.amm.start() or run it once
# with client.amm.run_once().
start: true
# Once started, run the AMM cycle every <interval>
interval: 2s
# Memory measure to use. Must be one of the attributes of
# distributed.scheduler.MemoryState.
measure: optimistic
# Policies that should be executed at every cycle. Any additional keys in each
# object are passed as keyword arguments to the policy constructor.
policies:
- class: distributed.active_memory_manager.ReduceReplicas
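      # Example (commented out): disable auto-start and add a hypothetical custom
      # policy; any extra keys are passed to the policy constructor, and the AMM
      # can then be driven manually with client.amm.start() or client.amm.run_once().
      #
      # start: false
      # policies:
      #   - class: distributed.active_memory_manager.ReduceReplicas
      #   - class: mypackage.MyPolicy   # hypothetical policy class
      #     my_keyword: 123             # passed as MyPolicy(my_keyword=123)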
worker:
blocked-handlers: []
multiprocessing-method: spawn
use-file-locking: True
transfer:
message-bytes-limit: 50MB
connections: # Maximum concurrent connections for data
outgoing: 50 # This helps to control network saturation
incoming: 10
preload: [] # Run custom modules with Worker
preload-argv: [] # See https://docs.dask.org/en/latest/how-to/customize-initialization.html
daemon: True
validate: False # Check worker state at every step for debugging
resources: {} # Key: value pairs specifying worker resources.
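      # Example (commented out; illustrative): declare abstract resources on this
      # worker, which tasks can then request via resource annotations.
      #
      # GPU: 2
      # MEMORY: 100e9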
lifetime:
duration: null # Time after which to gracefully shutdown the worker
stagger: 0 seconds # Random amount by which to stagger lifetimes
restart: False # Do we resurrect the worker after the lifetime deadline?
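      # Example (commented out; illustrative): gracefully restart each worker
      # roughly every 4 hours, staggered so they don't all restart at once.
      #
      # duration: "4 hours"
      # stagger: "5 minutes"
      # restart: true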
profile:
enabled: True # Whether or not to enable profiling
interval: 10ms # Time between statistical profiling queries
cycle: 1000ms # Time between starting new profile
low-level: False # Whether or not to include low-level functions
# Requires https://github.com/numba/stacktrace
memory:
# When there is an increase in process memory (as observed by the operating
# system) that is not accounted for by the dask keys stored on the worker, ignore
# it for this long before considering it in non-critical memory measures.
# This should be set to be longer than the duration of most dask tasks.
recent-to-old-time: 30s
rebalance:
# Memory measure to rebalance upon. Possible choices are:
# process
# Total process memory, as measured by the OS.
# optimistic
        #   Managed by dask (instantaneous) + unmanaged memory, excluding any
        #   increases observed within the last <distributed.worker.memory.recent-to-old-time>.
# Recommended for use on CPython with large (2MiB+) numpy-based data chunks.
# managed
# Only consider the data allocated by dask in RAM. Recommended if RAM is not
# released in a timely fashion back to the OS after the Python objects are
# dereferenced, but remains available for reuse by PyMalloc.
#
# If this is your problem on Linux, you should alternatively consider
# setting the MALLOC_TRIM_THRESHOLD_ environment variable (note the final
# underscore) to a low value; refer to the mallopt man page and to the
# comments about M_TRIM_THRESHOLD on
# https://sourceware.org/git/?p=glibc.git;a=blob;f=malloc/malloc.c
# managed_total
# Only consider data allocated by dask, including that spilled to disk.
# Recommended if disk occupation of the spill file is an issue.
measure: optimistic
# Fraction of worker process memory at which we start potentially sending
# data to other workers. Ignored when max_memory is not set.
sender-min: 0.30
# Fraction of worker process memory at which we stop potentially accepting
# data from other workers. Ignored when max_memory is not set.
recipient-max: 0.60
# Fraction of worker process memory, around the cluster mean, where a worker is
# neither a sender nor a recipient of data during a rebalance operation. E.g. if
# the mean cluster occupation is 50%, sender-recipient-gap=0.10 means that only
# nodes above 55% will donate data and only nodes below 45% will receive them.
# This helps avoid data from bouncing around the cluster repeatedly.
# Ignored when max_memory is not set.
sender-recipient-gap: 0.10
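        # Example (commented out; illustrative): rebalance on managed memory only
        # and widen the dead zone around the cluster mean. These settings take
        # effect when a rebalance is requested, e.g. via Client.rebalance().
        #
        # measure: managed
        # sender-recipient-gap: 0.20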
# Fractions of worker process memory at which we take action to avoid memory
# blowup. Set any of the values to False to turn off the behavior entirely.
# All fractions are relative to each worker's memory_limit.
      transfer: 0.10 # fraction taken up by incoming data transfers at which we start
                     # throttling further incoming transfers
target: 0.60 # fraction of managed memory where we start spilling to disk
spill: 0.70 # fraction of process memory where we start spilling to disk
pause: 0.80 # fraction of process memory at which we pause worker threads
terminate: 0.95 # fraction of process memory at which we terminate the worker
# Max size of the spill file on disk (e.g. "10 GB")
# Set to false for no maximum.
max-spill: false
spill-compression: auto # See also: distributed.comm.compression
# Interval between checks for the spill, pause, and terminate thresholds.
# The target threshold is checked every time new data is inserted.
monitor-interval: 100ms
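      # Example (commented out): disable spilling and pausing entirely and rely
      # only on the terminate threshold, per "Set any of the values to False"
      # above.
      #
      # target: false
      # spill: false
      # pause: false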
http:
routes:
- distributed.http.worker.prometheus
- distributed.http.health
- distributed.http.statics
nanny:
preload: [] # Run custom modules with Nanny
preload-argv: [] # See https://docs.dask.org/en/latest/how-to/customize-initialization.html
# Override environment variables after spawning the Worker process.
# Use whenever you are sure that nothing will read them before the end of the worker
# initialization.
environ: {}
    # Override environment variables *before* spawning the Worker process.
    # Use for variables that are read during or before worker initialization.
# Note that this leaks variables into the nanny process.
# Read important caveats at
# https://distributed.dask.org/en/stable/worker.html#nanny.
pre-spawn-environ:
# See https://distributed.dask.org/en/stable/worker-memory.html#automatically-trim-memory
MALLOC_TRIM_THRESHOLD_: 65536
# Numpy configuration
OMP_NUM_THREADS: 1
MKL_NUM_THREADS: 1
OPENBLAS_NUM_THREADS: 1
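      # Example (commented out; illustrative): other variables that must be visible
      # before worker code is imported can be added in the same way, e.g.
      #
      # NUMEXPR_NUM_THREADS: 1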
client:
direct-to-workers: null # Whether to connect directly to workers for gather / scatter
heartbeat: 5s # Interval between client heartbeats
scheduler-info-interval: 2s # Interval between scheduler-info updates
    security-loader: null # A callable to load security credentials if none are provided explicitly
preload: [] # Run custom modules with Client
preload-argv: [] # See https://docs.dask.org/en/latest/how-to/customize-initialization.html
deploy:
lost-worker-timeout: 15s # Interval after which to hard-close a lost worker job
cluster-repr-interval: 500ms # Interval between calls to update cluster-repr for the widget
adaptive:
interval: 1s # Interval between scaling evaluations
target-duration: 5s # Time an entire graph calculation is desired to take ("1m", "30m")
minimum: 0 # Minimum number of workers
maximum: .inf # Maximum number of workers
wait-count: 3 # Number of times a worker should be suggested for removal before removing it
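    # Example (commented out; illustrative): keep between 2 and 20 workers and be
    # slower to scale down.
    #
    # minimum: 2
    # maximum: 20
    # wait-count: 10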
comm:
    retry: # some operations (such as gathering data) are retried with the parameters below
      count: 0 # the maximum number of retry attempts; 0 disables retrying
      delay:
        min: 1s # the first non-zero delay between retries
        max: 20s # the maximum delay between retries
compression: false # See also: distributed.worker.memory.spill-compression
shard: 64MiB
offload: 10MiB # Size after which we choose to offload serialization to another thread
default-scheme: tcp
socket-backlog: 2048
zstd:
level: 3 # Compression level, between 1 and 22.
threads: 0 # Threads to use. 0 for single-threaded, -1 to infer from cpu count.
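    # Example (commented out): enable compression of network messages, e.g. with
    # lz4 or zstd (the chosen library must be installed).
    #
    # compression: lz4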
timeouts:
connect: 30s # time before connecting fails
tcp: 30s # time before calling an unresponsive connection dead
require-encryption: null # Whether to require encryption on non-local comms
tls:
ciphers: null # Allowed ciphers, specified as an OpenSSL cipher string.
min-version: 1.2 # The minimum TLS version supported.
max-version: null # The maximum TLS version supported.
ca-file: null # Path to a CA file, in pem format, optional
scheduler:
cert: null # Path to certificate file for scheduler.
key: null # Path to key file for scheduler. Alternatively, the key
# can be appended to the cert file above, and this field
# left blank.
worker:
key: null
cert: null
client:
key: null
cert: null
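    # Example (commented out; illustrative paths): a fully encrypted cluster
    # typically sets require-encryption together with a CA file and per-role
    # certificates.
    #
    # require-encryption: true
    # tls:
    #   ca-file: /path/to/ca.pem
    #   scheduler:
    #     cert: /path/to/scheduler.pem
    #   worker:
    #     cert: /path/to/worker.pem
    #   client:
    #     cert: /path/to/client.pem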
websockets:
shard: 8MiB
diagnostics:
nvml: True
cudf: False
computations:
max-history: 100
nframes: 0
ignore-modules:
- asyncio
- functools
- threading
- datashader
- dask
- debugpy
- distributed
- ipykernel
- coiled
- cudf
- cuml
- matplotlib
- pluggy # part of pytest
- prefect
- rechunker
- xarray
- xgboost
- xdist
- __channelexec__ # more xdist
- execnet # more xdist
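        # Example (commented out): add your own orchestration library here so its
        # frames are not reported as the user's "computation" code, e.g.
        #
        # - my_pipeline_package   # hypothetical module name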
ignore-files:
# `python -m pytest` (or other module)
# runpy.py on Python <=3.13; <frozen runpy> on >=3.14
- runpy
# Many variations of pytest:
# pytest, py.test, pytest-script (on Windows),
# _pytest (implementation), vscode_pytest
- .*py\.?test.*
- pycharm # Run pytest from PyCharm GUI
- get_output_via_markers
erred-tasks:
max-history: 100
p2p:
comm:
buffer: 1 GiB
concurrency: 10
message-bytes-limit: 2 MiB
retry:
count: 10
delay:
          min: 1s # the first non-zero delay between retries
          max: 30s # the maximum delay between retries
storage:
buffer: 100 MiB
disk: True
threads: null
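    # Example (commented out): keep P2P shuffle buffers purely in memory, e.g. on
    # diskless workers (a sketch; shuffles may then need more RAM).
    #
    # storage:
    #   disk: False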
###################
# Bokeh dashboard #
###################
dashboard:
link: "{scheme}://{host}:{port}/status"
export-tool: False
graph-max-items: 5000 # maximum number of tasks to try to plot in graph view
prometheus:
namespace: "dask"
##################
# Administrative #
##################
admin:
large-graph-warning-threshold: 10MB # Threshold for warning on large graph
tick:
interval: 20ms # time between event loop health checks
limit: 3s # time allowed before triggering a warning
cycle: 1s # time between checking event loop speed
    max-error-length: 10000 # Maximum length of a traceback to return after an error
log-length: 10000 # Maximum length of worker/scheduler logs to keep in memory
log-format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
low-level-log-length: 1000 # Maximum length of various logs for developers
pdb-on-err: False # enter debug mode on scheduling error
system-monitor:
interval: 500ms
log-length: 7200 # Maximum number of samples to keep in memory
disk: true # Monitor host-wide disk I/O
host-cpu: false # Monitor host-wide CPU usage, with very granular breakdown
gil:
enabled: true # Monitor GIL contention
interval: "1ms" # Frequency to poll GIL
event-loop: tornado
rmm:
pool-size: null
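    # Example (commented out; illustrative): pre-allocate an RMM memory pool on
    # GPU workers, e.g.
    #
    # pool-size: 4GB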