Skip to content

Commit bc3f69e

Browse files
committed
Use population-based weighted sampling for Asia builds
This replaces the Asia/China/India split with population-based weighted sampling (possible in Augur version 25.3.0). This requires changing the geographical grouping resolution from division to country, but I assume it was only grouped by division in an attempt to have varying group sizes per country, and that population-based weighting is an acceptable replacement.
1 parent 7c5d7bb commit bc3f69e

3 files changed

Lines changed: 68 additions & 224 deletions

File tree

nextstrain_profiles/nextstrain-gisaid/builds.yaml

Lines changed: 32 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -273,30 +273,18 @@ subsampling:
273273

274274
# Custom subsampling logic for region Asia over 1m
275275
# Build key keeps the historical "grouped_by_division" name; focal samples are grouped by country
276-
# Separating three buckets for China, India and elsewhere
276+
# Grouping by country weighted by population size
277277
# 4375 total
278278
# 4:1 ratio of recent to early
279279
# 4:1 ratio of focal to context
280-
# 3:2:2 proportions of Asia, China, India
281280
nextstrain_region_asia_grouped_by_division_1m:
282281
# Early focal samples for Asia
283282
asia_early:
284-
group_by: "division year month"
285-
max_sequences: 300
286-
max_date: "--max-date 1M"
287-
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
288-
# Early focal samples for China
289-
china_early:
290-
group_by: "division year month"
291-
max_sequences: 200
292-
max_date: "--max-date 1M"
293-
exclude: "--exclude-where 'country!=China'"
294-
# Early focal samples for India
295-
india_early:
296-
group_by: "division year month"
297-
max_sequences: 200
283+
group_by: "country year month"
284+
group_by_weights: "defaults/population_weights.tsv"
285+
max_sequences: 700
298286
max_date: "--max-date 1M"
299-
exclude: "--exclude-where 'country!=India'"
287+
exclude: "--exclude-where 'region!=Asia'"
300288
# Early contextual samples from the rest of the world
301289
context_early:
302290
group_by: "country year month"
@@ -305,22 +293,11 @@ subsampling:
305293
exclude: "--exclude-where 'region=Asia'"
306294
# Recent focal samples for Asia
307295
asia_recent:
308-
group_by: "division week"
309-
max_sequences: 1200
296+
group_by: "country year month"
297+
group_by_weights: "defaults/population_weights.tsv"
298+
max_sequences: 2800
310299
min_date: "--min-date 1M"
311-
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
312-
# Recent focal samples for China
313-
china_recent:
314-
group_by: "division week"
315-
max_sequences: 800
316-
max_date: "--min-date 1M"
317-
exclude: "--exclude-where 'country!=China'"
318-
# Recent focal samples for India
319-
india_recent:
320-
group_by: "division week"
321-
max_sequences: 800
322-
max_date: "--min-date 1M"
323-
exclude: "--exclude-where 'country!=India'"
300+
exclude: "--exclude-where 'region!=Asia'"
324301
# Recent contextual samples from the rest of the world
325302
context_recent:
326303
group_by: "country week"
@@ -330,30 +307,18 @@ subsampling:
330307

331308
# Custom subsampling logic for region Asia over 2m
332309
# Build key keeps the historical "grouped_by_division" name; focal samples are grouped by country
333-
# Separating three buckets for China, India and elsewhere
310+
# Grouping by country weighted by population size
334311
# 4375 total
335312
# 4:1 ratio of recent to early
336313
# 4:1 ratio of focal to context
337-
# 3:2:2 proportions of Asia, China, India
338314
nextstrain_region_asia_grouped_by_division_2m:
339315
# Early focal samples for Asia
340316
asia_early:
341-
group_by: "division year month"
342-
max_sequences: 300
343-
max_date: "--max-date 2M"
344-
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
345-
# Early focal samples for China
346-
china_early:
347-
group_by: "division year month"
348-
max_sequences: 200
349-
max_date: "--max-date 2M"
350-
exclude: "--exclude-where 'country!=China'"
351-
# Early focal samples for India
352-
india_early:
353-
group_by: "division year month"
354-
max_sequences: 200
317+
group_by: "country year month"
318+
group_by_weights: "defaults/population_weights.tsv"
319+
max_sequences: 700
355320
max_date: "--max-date 2M"
356-
exclude: "--exclude-where 'country!=India'"
321+
exclude: "--exclude-where 'region!=Asia'"
357322
# Early contextual samples from the rest of the world
358323
context_early:
359324
group_by: "country year month"
@@ -362,22 +327,11 @@ subsampling:
362327
exclude: "--exclude-where 'region=Asia'"
363328
# Recent focal samples for Asia
364329
asia_recent:
365-
group_by: "division week"
366-
max_sequences: 1200
330+
group_by: "country year month"
331+
group_by_weights: "defaults/population_weights.tsv"
332+
max_sequences: 2800
367333
min_date: "--min-date 2M"
368-
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
369-
# Recent focal samples for China
370-
china_recent:
371-
group_by: "division week"
372-
max_sequences: 800
373-
max_date: "--min-date 2M"
374-
exclude: "--exclude-where 'country!=China'"
375-
# Recent focal samples for India
376-
india_recent:
377-
group_by: "division week"
378-
max_sequences: 800
379-
max_date: "--min-date 2M"
380-
exclude: "--exclude-where 'country!=India'"
334+
exclude: "--exclude-where 'region!=Asia'"
381335
# Recent contextual samples from the rest of the world
382336
context_recent:
383337
group_by: "country week"
@@ -387,30 +341,18 @@ subsampling:
387341

388342
# Custom subsampling logic for region Asia over 6m
389343
# Build key keeps the historical "grouped_by_division" name; focal samples are grouped by country
390-
# Separating three buckets for China, India and elsewhere
344+
# Grouping by country weighted by population size
391345
# 4375 total
392346
# 4:1 ratio of recent to early
393347
# 4:1 ratio of focal to context
394-
# 3:2:2 proportions of Asia, China, India
395348
nextstrain_region_asia_grouped_by_division_6m:
396349
# Early focal samples for Asia
397350
asia_early:
398-
group_by: "division year month"
399-
max_sequences: 300
400-
max_date: "--max-date 6M"
401-
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
402-
# Early focal samples for China
403-
china_early:
404-
group_by: "division year month"
405-
max_sequences: 200
406-
max_date: "--max-date 6M"
407-
exclude: "--exclude-where 'country!=China'"
408-
# Early focal samples for India
409-
india_early:
410-
group_by: "division year month"
411-
max_sequences: 200
351+
group_by: "country year month"
352+
group_by_weights: "defaults/population_weights.tsv"
353+
max_sequences: 700
412354
max_date: "--max-date 6M"
413-
exclude: "--exclude-where 'country!=India'"
355+
exclude: "--exclude-where 'region!=Asia'"
414356
# Early contextual samples from the rest of the world
415357
context_early:
416358
group_by: "country year month"
@@ -419,22 +361,11 @@ subsampling:
419361
exclude: "--exclude-where 'region=Asia'"
420362
# Recent focal samples for Asia
421363
asia_recent:
422-
group_by: "division year month"
423-
max_sequences: 1200
364+
group_by: "country year month"
365+
group_by_weights: "defaults/population_weights.tsv"
366+
max_sequences: 2800
424367
min_date: "--min-date 6M"
425-
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
426-
# Recent focal samples for China
427-
china_recent:
428-
group_by: "division year month"
429-
max_sequences: 800
430-
max_date: "--min-date 6M"
431-
exclude: "--exclude-where 'country!=China'"
432-
# Recent focal samples for India
433-
india_recent:
434-
group_by: "division year month"
435-
max_sequences: 800
436-
max_date: "--min-date 6M"
437-
exclude: "--exclude-where 'country!=India'"
368+
exclude: "--exclude-where 'region!=Asia'"
438369
# Recent contextual samples from the rest of the world
439370
context_recent:
440371
group_by: "country year month"
@@ -443,27 +374,16 @@ subsampling:
443374
exclude: "--exclude-where 'region=Asia'"
444375

445376
# Custom subsampling logic for region Asia over all-time
446-
# Grouping by division
447-
# Separating three buckets for China, India and elsewhere
377+
# Grouping by country weighted by population size
448378
# 4375 total
449379
# 4:1 ratio of focal to context
450-
# 3:2:2 proportions of Asia, China, India
451380
nextstrain_region_asia_grouped_by_division_all_time:
452381
# Focal samples for Asia
453382
asia:
454-
group_by: "division year month"
455-
max_sequences: 1500
456-
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
457-
# Focal samples for China
458-
china:
459-
group_by: "division year month"
460-
max_sequences: 1000
461-
exclude: "--exclude-where 'country!=China'"
462-
# Focal samples for India
463-
india:
464-
group_by: "division year month"
465-
max_sequences: 1000
466-
exclude: "--exclude-where 'country!=India'"
383+
group_by: "country year month"
384+
group_by_weights: "defaults/population_weights.tsv"
385+
max_sequences: 3500
386+
exclude: "--exclude-where 'region!=Asia'"
467387
# Contextual samples from the rest of the world
468388
context:
469389
group_by: "country year month"

0 commit comments

Comments
 (0)