Skip to content

Replace LAMA with Dask to parallelise cf.Data #182

@sadielbartholomew

Description

@sadielbartholomew

A master issue to register steps and progress towards migrating from the current approach to domain-distributed array representation, Large Amounts of Massive Arrays (LAMA), to an approach based on the efficient task parallelism provided by the Dask library.

The core aims, to be implemented in the separate branch dask for merging into master at the end of the work ready for a v4.0.0 major release, are to:

  • replace all LAMA logic with Dask-based logic,
  • in doing so, parallelise all existing cf.Data operations,
  • reproduce exactly the current API and functionality, so that there are no user-facing differences other than better performance (though we agreed that, once the work is done, we might want to make API amendments).

This comment is intended as a working document to flesh out with detail and new steps as they arise during the progress of the work. Please update and amend it as appropriate, or comment in the thread.

Step-by-step plan

Relevant PRs will be linked here

[To be fleshed out shortly]

Open questions, points of discussion, etc.

See the comment thread for this issue, or the external locations linked from the thread, for meatier discussion points.


Progress breakdown in terms of methods and attributes: OLD

❗ NOTE: superseded by #295, which has a new, easier-to-parse table with notes, groupings, etc. ❗

The ~300 functional/user-facing cf.Data methods and attributes are outlined below in a pretty-printed dict, where each value is a bool indicating whether that method or attribute has been "Daskified" on the dask branch so far. Copy and filter using Python if desired:

Click to expand [updated by @davidhassell 2021-02-22, @sadielbartholomew 19.08.21 (upon work restart)]...
# Progress checklist for the LAMA -> Dask migration of cf.Data.
#
# Each key is the name of a functional/user-facing cf.Data method or
# attribute. The value is True if that entry has been "Daskified" on the
# `dask` branch so far, and False if it is still outstanding. Where a True
# entry was dealt with by removing or deprecating it rather than porting it,
# the trailing comment says so ("removed" / "deprecated").
#
# Entries set to True carry an extra level of indentation (apparently so
# that they stand out when scanning the list); this has no effect on the
# resulting dict.
#
# NOTE: this listing is marked as superseded by the table in #295.
complete = {
    'HDF_chunks': False,
        'Units': True,  # by DH
    '_HDF_chunks': False,
    '_Units': False,
    '_YMDhms': False,
    '__abs__': False,
    '__add__': False,
    '__and__': False,
    '__array__': False,
    '__bool__': False,
        '__contains__': True,  # by DH
    '__data__': False,
    '__deepcopy__': False,
    '__div__': False,
    '__doc_template__': False,
    '__docstring_package_depth__': False,
    '__docstring_substitutions__': False,
    '__eq__': False,
    '__float__': False,
    '__floordiv__': False,
    '__ge__': False,
        '__getitem__': True,  # by DH
    '__gt__': False,
    '__hash__': False,
    '__iadd__': False,
    '__iand__': False,
    '__idiv__': False,
    '__ifloordiv__': False,
    '__ilshift__': False,
    '__imod__': False,
    '__imul__': False,
    '__int__': False,
    '__invert__': False,
    '__ior__': False,
    '__ipow__': False,
    '__irshift__': False,
    '__isub__': False,
    '__iter__': False,
    '__itruediv__': False,
    '__ixor__': False,
    '__le__': False,
    '__len__': False,
    '__lshift__': False,
    '__lt__': False,
    '__mod__': False,
    '__module__': False,
    '__mul__': False,
    '__ne__': False,
    '__neg__': False,
    '__new__': False,
    '__or__': False,
    '__pos__': False,
    '__pow__': False,
    '__query_set__': False,
    '__query_wi__': False,
    '__query_wo__': False,
    '__radd__': False,
    '__rand__': False,
    '__rdiv__': False,
    '__reduce__': False,
    '__reduce_ex__': False,
    '__rfloordiv__': False,
    '__rlshift__': False,
    '__rmod__': False,
    '__rmul__': False,
    '__ror__': False,
    '__round__': False,
    '__rpow__': False,
    '__rrshift__': False,
    '__rshift__': False,
    '__rsub__': False,
    '__rtruediv__': False,
    '__rxor__': False,
        '__setitem__': True,  # by DH
    '__sub__': False,
    '__truediv__': False,
    '__xor__': False,
    '_all_axes': False,
    '_all_axis_names': False,
    '_asdatetime': False,
    '_asreftime': False,
    '_atol': False,
        '_auxiliary_mask': True,  # by DH (removed)
        '_auxiliary_mask_add_component': True,  # by DH (removed)
        '_auxiliary_mask_from_1d_indices': True,  # by DH (removed)
        '_auxiliary_mask_return': True,  # by DH (removed)
        '_auxiliary_mask_subspace': True,  # by DH (removed)
        '_auxiliary_mask_tidy': True,  # by DH (removed)
        '_axes': True,  # by DH
    '_binary_operation': False,
        '_change_axis_names': True,  # by DH (removed)
    '_chunk_add_partitions': False,
    '_collapse': False,
    '_collapse_create_weights': False,
    '_collapse_finalise': False,
    '_collapse_mask': False,
    '_collapse_optimize_weights': False,
    '_collapse_subspace': False,
    '_combined_units': False,
    '_create_auxiliary_mask_component': False,
    '_custom': False,
    '_cyclic': False,
    '_default': False,
    '_del_Array': False,
    '_del_component': False,
        '_dtype': True,  # by DH (removed)
    '_equals': False,
    '_equals_preprocess': False,
        '_flag_partitions_for_processing': True,  # by DH (removed)
        '_flip': True,  # by DH (removed)
    '_get_Array': False,
    '_get_component': False,
    '_has_component': False,
    '_initialise_netcdf': False,
    '_is_abstract_Array_subclass': False,
    '_isdatetime': False,
    '_item': False,
    '_move_flip_to_partitions': False,
        '_ndim': True,  # by DH (removed)
    '_new_axis_identifier': False,
    '_package': False,
    '_parse_axes': False,
    '_parse_indices': False,
        '_pmaxes': True,  # by DH (removed)
        '_pmndim': True,  # by DH (removed)
        '_pmshape': True,  # by DH (removed)
        '_pmsize': True,  # by DH (removed)
    '_rtol': False,
    '_set_Array': False,
    '_set_CompressedArray': False,
    '_set_component': False,
        '_set_partition_matrix': True, # by DH (removed)
    '_set_subspace': False,
        '_shape': True,  # by DH (removed)
        '_share_lock_files': True,  # by DH (removed)
        '_share_partitions': True,  # by DH (removed)
        '_size': True,  # by DH (removed)
        '_unary_operation': True,  # by DH (SLB note: removed? 2x in mixin/propertiesdata* but they are probably separate?)
        'add_partitions': True,  # by DH (deprecated)
    'all': False,
    'allclose': False,
    'any': False,
    'apply_masking': False,
    'arccos': False,
    'arccosh': False,
    'arcsin': False,
    'arcsinh': False,
    'arctan': False,
    'arctanh': False,
    'argmax': False,
        'array': True,  # by DH
    'asdata': False,
    'binary_mask': False,
    'ceil': False,
    'change_calendar': False,
    'chunk': False,
    'clip': False,
    'close': False,
    'compressed': False,
    'compressed_array': False,
    'concatenate': False,
    'concatenate_data': False,
    'convolution_filter': False,
    'copy': False,
    'cos': False,
    'cosh': False,
    'count': False,
    'count_masked': False,
    'creation_commands': False,
    'cumsum': False,
    'cyclic': False,
    'data': False,
        'datetime_array': True,  # by DH
    'datetime_as_string': False,
    'datum': False,
    'day': False,
    'del_calendar': False,
    'del_fill_value': False,
    'del_units': False,
    'diff': False,
    'digitize': False,
    'dtarray': False,
        'dtype': True,  # by DH
    'dump': False,
    'dumpd': False,
    'dumps': False,
    'empty': False,
    'equals': False,
    'exp': False,
    'expand_dims': False,
    'files': False,
    'fill_value': False,
    'filled': False,
    'first_element': False,
    'fits_in_memory': False,
    'fits_in_one_chunk_in_memory': False,
    'flat': False,
    'flatten': False,
        'flip': True,  # by DH
    'floor': False,
    'full': False,
    'func': False,
    'get_calendar': False,
    'get_compressed_axes': False,
    'get_compressed_dimension': False,
    'get_compression_type': False,
    'get_count': False,
    'get_data': False,
    'get_filenames': False,
    'get_fill_value': False,
    'get_index': False,
    'get_list': False,
    'get_units': False,
    'halo': False,
        'hardmask': True,  # by DH
    'has_calendar': False,
    'has_fill_value': False,
    'has_units': False,
    'hour': False,
    'in_memory': False,
        'insert_dimension': True,  # by DH
    'inspect': False,
    'integral': False,
    'isclose': False,
        'is_masked': True,  # by DH
    'ispartitioned': False,
    'isscalar': False,
    'last_element': False,
    'loadd': False,
    'loads': False,
    'log': False,
    'mask': False,
    'mask_fpe': False,
    'mask_invalid': False,
    'masked_all': False,
    'max': False,
    'maximum': False,
    'maximum_absolute_value': False,
    'mean': False,
    'mean_absolute_value': False,
    'mean_of_upper_decile': False,
    'median': False,
    'mid_range': False,
    'min': False,
    'minimum': False,
    'minimum_absolute_value': False,
    'minute': False,
    'month': False,
        'nbytes': True,  # by DH
    'nc_clear_hdf5_chunksizes': False,
    'nc_hdf5_chunksizes': False,
    'nc_set_hdf5_chunksizes': False,
        'ndim': True,  # by DH
    'ndindex': False,
    'ones': False,
    'outerproduct': False,
    'override_calendar': False,
    'override_units': False,
    'partition_boundaries': False,
    'partition_configuration': False,
    'partitions': False,
    'percentile': False,
    'range': False,
    'reconstruct_sectioned_data': False,
    'rint': False,
        'roll': True,  # by DH
    'root_mean_square': False,
    'round': False,
    'sample_size': False,
    'save_to_disk': False,
    'sd': False,
    'second': False,
    'second_element': False,
    'section': False,
    'set_calendar': False,
    'set_fill_value': False,
    'set_units': False,
    'seterr': False,
        'shape': True,  # by DH
    'sin': False,
    'sinh': False,
        'size': True,  # by DH
    'source': False,
        'squeeze': True,  # by DH
    'standard_deviation': False,
    'stats': False,
    'sum': False,
    'sum_of_squares': False,
    'sum_of_weights': False,
    'sum_of_weights2': False,
    'swapaxes': False,
    'tan': False,
    'tanh': False,
    'to_disk': False,
    'to_memory': False,
    'tolist': False,
    'transpose': False,
    'trunc': False,
    'uncompress': False,
    'unique': False,
    'var': False,
    'variance': False,
    'varray': False,
    'where': False,
    'year': False,
    'zeros': False
}

Keywords to deprecate

TODO (see #254 (comment) for context).

Metadata

Metadata

Labels

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions