-
Notifications
You must be signed in to change notification settings - Fork 53
Expand file tree
/
Copy pathstack.py
More file actions
326 lines (282 loc) · 15.4 KB
/
stack.py
File metadata and controls
326 lines (282 loc) · 15.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
from __future__ import annotations
from typing import AbstractSet, List, Literal, Optional, Sequence, Tuple, Type, Union
import numpy as np
import xarray as xr
import dask
from rasterio import RasterioIOError
from rasterio.enums import Resampling
from .prepare import prepare_items, to_attrs, to_coords
from .raster_spec import Bbox, IntFloat, Resolutions
from .reader_protocol import Reader
from .rio_env import LayeredEnv
from .rio_reader import AutoParallelRioReader
from .stac_types import (
ItemCollectionIsh,
ItemIsh,
PystacItem,
SatstacItem,
items_to_plain,
)
from .to_dask import items_to_dask, ChunksParam
def stack(
    items: Union[
        ItemCollectionIsh, ItemIsh, Sequence[PystacItem], Sequence[SatstacItem]
    ],
    assets: Optional[Union[List[str], AbstractSet[str]]] = frozenset(
        ["image/tiff", "image/x.geotiff", "image/vnd.stac.geotiff", "image/jp2"]
    ),
    epsg: Optional[int] = None,
    resolution: Optional[Union[IntFloat, Resolutions]] = None,
    bounds: Optional[Bbox] = None,
    bounds_latlon: Optional[Bbox] = None,
    snap_bounds: bool = True,
    resampling: Resampling = Resampling.nearest,
    chunksize: ChunksParam = 1024,
    dtype: np.dtype = np.dtype("float64"),
    fill_value: Union[int, float] = np.nan,
    rescale: bool = True,
    sortby_date: Literal["asc", "desc", False] = "asc",
    xy_coords: Literal["center", "topleft", False] = "topleft",
    properties: Union[bool, str, Sequence[str]] = True,
    band_coords: bool = True,
    gdal_env: Optional[LayeredEnv] = None,
    errors_as_nodata: Tuple[Exception, ...] = (
        RasterioIOError("HTTP response code: 404"),
    ),
    reader: Type[Reader] = AutoParallelRioReader,
) -> xr.DataArray:
    """
    Create an `xarray.DataArray` of all the STAC items, reprojected to the same grid and stacked by time.

    The DataArray's dimensions will be ``("time", "band", "y", "x")``. It's backed by
    a lazy `Dask array <dask.array.Array>`, so you can manipulate it without touching any data.

    We'll try to choose the output coordinate reference system, resolution, and bounds
    based on the metadata in the STAC items. However, if not all items have the necessary
    metadata, or aren't in the same coordinate reference system, you'll have to specify these
    yourself---``epsg`` and ``resolution`` are the two parameters you'll set most often.

    Examples
    --------
    >>> import stackstac
    >>> import satsearch
    >>> items = satsearch.Search(...).items()
    >>> # Use default CRS, resolution, bounding box, etc.
    >>> xr_stack = stackstac.stack(items)
    >>>
    >>> # Reproject to 100-meter resolution in web mercator
    >>> xr_stack = stackstac.stack(items, epsg=3857, resolution=100)
    >>>
    >>> # Only use specific asset IDs
    >>> xr_stack = stackstac.stack(items, assets=["B01", "B03", "B02"])
    >>>
    >>> # Clip to a custom bounding box
    >>> xr_stack = stackstac.stack(items, bounds_latlon=[-106.2, 35.6, -105.6, 36])
    >>>
    >>> # Turn off all metadata if you don't need it
    >>> xr_stack = stackstac.stack(
    ...     items, properties=False, band_coords=False, xy_coords=False, sortby_date=False
    ... )
    >>>
    >>> # Custom dtype and fill_value
    >>> xr_stack = stackstac.stack(items, rescale=False, fill_value=0, dtype="uint16")

    Note
    ----
    Don't be scared of all the parameters!

    Though there are lots of options, you can leave nearly all of them as their defaults.

    Parameters
    ----------
    items:
        The STAC items to stack. Can be a plain Python list of dicts
        following the STAC JSON specification, or objects from
        the `satstac <https://github.com/sat-utils/sat-stac>`_ or
        `pystac <https://github.com/stac-utils/pystac>`_
        libraries.
    assets:
        Which asset IDs to use. Any Items missing a particular Asset will return an array
        of ``fill_value`` for that Asset. By default, returns all assets with a GeoTIFF
        or JPEG2000 ``type``.

        If None, all assets are used.

        If a list of strings, those asset IDs are used.

        If a set, only assets compatible with those mimetypes are used (according to the
        ``type`` field on each asset). Note that if you give ``assets={"image/tiff"}``,
        and the asset ``B1`` has ``type="image/tiff"`` on some items but ``type="image/png"``
        on others, then ``B1`` will not be included. Mimetypes structure is respected, so
        ``image/tiff`` will also match ``image/tiff; application=geotiff``; ``image`` will match
        ``image/tiff`` and ``image/jp2``, etc. See the `STAC common media types <MT>`_ for ideas.
        Assets which don't have ``type`` specified on any item will be dropped in this case.

        Note: each asset's data must contain exactly one band. Multi-band assets (like an RGB GeoTIFF)
        are not yet supported.

        .. _MT: https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#common-media-types-in-stac
    epsg:
        Reproject into this coordinate reference system, as given by an `EPSG code <http://epsg.io>`_.
        If None (default), uses whatever CRS is set on all the items. In this case, all Items/Assets
        must have the ``proj:epsg`` field, and it must be the same value for all of them.
    resolution:
        Output resolution. Careful: this must be given in the output CRS's units!
        For example, with ``epsg=4326`` (meaning lat-lon), the units are degrees of
        latitude/longitude, not meters. Giving ``resolution=20`` in that case would mean
        each pixel is 20ºx20º (probably not what you wanted). You can also give a pair of
        ``(x_resolution, y_resolution)``.

        If None (default), we try to calculate each Asset's resolution based on whatever metadata is available,
        and pick the minimum of all the resolutions---meaning by default, all data will be upscaled to
        match the "finest" or "highest-resolution" Asset.

        To estimate resolution, these combinations of fields must be set on each Asset or Item
        (in order of preference):

        * The ``proj:transform`` and ``proj:epsg`` fields
        * The ``proj:shape`` and one of ``proj:bbox`` or ``bbox`` fields
    bounds:
        Output spatial bounding-box, as a tuple of ``(min_x, min_y, max_x, max_y)``.
        This defines the ``(west, south, east, north)`` rectangle the output array will cover.
        Values must be in the same coordinate reference system as ``epsg``.

        If None (default), the bounding box of all the input items is automatically used.
        (This only requires the ``bbox`` field to be set on each Item, which is a required
        field in the STAC specification, so only in rare cases will auto-calculating the bounds
        fail.) So in most cases, you can leave ``bounds`` as None. You'd only need to set it
        when you want to use a custom bounding box.

        When ``bounds`` is given, any assets that don't overlap those bounds are dropped.
    bounds_latlon:
        Same as ``bounds``, but given in degrees (latitude and longitude) for convenience.
        Only one of ``bounds`` and ``bounds_latlon`` can be used.
    snap_bounds:
        Whether to snap the bounds to whole-number intervals of ``resolution`` to prevent
        fraction-of-a-pixel offsets. Default: True.

        This is equivalent to the ``-tap`` or
        `target-align pixels <https://gis.stackexchange.com/questions/165402/how-to-use-tap-in-gdal-rasterize>`_
        argument in GDAL.
    resampling:
        The rasterio resampling method to use when reprojecting or rescaling data to a different CRS or resolution.
        Default: ``rasterio.enums.Resampling.nearest``.
    chunksize:
        The chunksize to use for the Dask array. Default: 1024. Picking a good chunksize will
        have significant effects on performance!

        Can be given in any format :ref:`Dask understands <dask:array.chunks>`,
        such as ``1024``, ``(1024, 2048)``, ``(10, "auto", 512, 512)``, ``15 MB``, etc.

        If only 1 or 2 sizes are given, like ``2048`` or ``(512, 1024)``, this is used to chunk
        just the spatial dimensions (last two). The time and band dimensions will have a chunksize of 1,
        meaning that every STAC Asset will be its own chunk. (This is the default.)

        If you'll be filtering items somewhat randomly (like ``stack[stack["eo:cloud_cover"] < 20]``),
        you want the chunksize to be like ``(1, X, X, X)``. Otherwise, if you had a chunksize like
        ``(3, 1, X, X)``, Dask would always load three items per chunk, even if two of them would be
        immediately thrown away because they didn't match the cloud-cover filter.

        However, when your graph starts getting too large for Dask to handle, using a larger chunksize
        for the time or band dimensions can help a lot. For example, ``chunksize=(10, 1, 512, 512)`` would
        process in 512x512 pixel tiles, loading 10 assets at a time per tile. ``chunksize=(-1, 1, 512, 512)``
        would load *every* item within the 512x512 tile into the chunk.
        If you'll be immediately compositing the data (like ``.median("time")``), doing this is
        often a good idea because you'll be flattening the assets together anyway.
    dtype:
        The NumPy data type of the output array. Default: ``float64``. Must be a data type
        that's compatible with ``fill_value``.
    fill_value:
        Value to fill nodata/masked pixels with. Default: ``np.nan``.

        Using NaN is generally the best approach, since many functions already know how to
        handle/propagate NaNs, or have NaN-equivalents (``dask.array.nanmean`` vs ``dask.array.mean``,
        for example). However, NaN requires a floating-point ``dtype``. If you know the data can
        be represented in a smaller data type (like ``uint16``), using a different ``fill_value``
        (like 0) and managing it yourself could save a lot of memory.
    rescale:
        Whether to rescale pixel values by the scale and offset present in the ``raster:bands`` metadata
        for each asset.

        Default: True. Note that this could produce floating-point data when the
        original values are ints, so set ``dtype`` accordingly. Raises `ValueError` if the
        ``dtype`` specified can't hold the data after rescaling: for example, if loading
        data with ``dtype=int, rescale=True`` where the scaling factor is 1.5, the rescaled
        data would be floating-point, and couldn't be stored as an integer.
    sortby_date:
        Whether to pre-sort the items by date (from the ``properties["datetime"]`` field).
        One of ``"asc"``, ``"desc"``, or False to disable sorting. Default: ``"asc"``.

        Note that if you set ``sortby_date=False``, selecting date ranges from the DataArray
        (like ``da.loc["2020-01":"2020-04"]``) may behave strangely, because xarray/pandas
        needs indexes to be sorted.
    xy_coords:
        Whether to add geospatial coordinate labels for the ``x`` and ``y`` dimensions of the DataArray,
        allowing for spatial indexing. The coordinates will be in the coordinate reference system given
        by ``epsg``.

        If ``"topleft"`` (default), the coordinates are for each pixel's upper-left corner,
        following raster conventions.

        If ``"center"``, the coordinates are for each pixel's centroid, following xarray conventions.

        If False, ``x`` and ``y`` will just be indexed by row/column numbers, saving a small amount of time
        and local memory.
    properties:
        Which fields from each STAC Item's ``properties`` to add as coordinates to the DataArray, indexing the "time"
        dimension.

        If None (default), all properties will be used. If a string or sequence of strings, only those fields
        will be used. For each Item missing a particular field, its value for that Item will be None.

        If False, no properties will be added.
    band_coords:
        Whether to include Asset-level metadata as coordinates for the ``bands`` dimension.

        If True (default), for each asset ID, the field(s) that have the same value across all Items
        will be added as coordinates.

        The ``eo:bands`` field is also unpacked if present, and ``sar:polarizations`` is renamed to
        ``polarization`` for convenience.
    gdal_env:
        Advanced use: a `~.LayeredEnv` of GDAL configuration options to use while opening
        and reading datasets. If None (default), `~.DEFAULT_GDAL_ENV` is used.
        See ``rio_reader.py`` for notes on why these default options were chosen.
    errors_as_nodata:
        Exception patterns to ignore when opening datasets or reading data.
        Exceptions matching the pattern will be logged as warnings, and just
        produce nodata (``fill_value``).

        The exception patterns should be instances of an Exception type to catch,
        where ``str(exception_pattern)`` is a regex pattern to match against
        ``str(raised_exception)``. For example, ``RasterioIOError("HTTP response code: 404")``
        (the default). Or ``IOError(r"HTTP response code: 4\\d\\d")``, to catch any 4xx HTTP error.
        Or ``Exception(".*")`` to catch absolutely anything (that one's probably a bad idea).
    reader:
        Advanced use: the `~.Reader` type to use. Currently there is only one real reader type:
        `~.AutoParallelRioReader`. However, there's also `~.FakeReader` (which doesn't read data at all,
        just returns random numbers), which can be helpful for isolating whether performance issues are
        due to IO and GDAL, or inherent to dask.

    Returns
    -------
    xarray.DataArray:
        xarray DataArray, backed by a Dask array. No IO will happen until calling ``.compute()``,
        or accessing ``.values``. The dimensions will be ``("time", "band", "y", "x")``.

        ``time`` will be equal in length to the number of items you pass in, and indexed by STAC Item datetime.
        Note that this means multiple entries could have the same index. Note also datetime strings are cast to
        'UTC' but passed to xarray without timezone information (dtype='datetime64[ns]').

        ``band`` will be equal in length to the number of asset IDs used (see the ``assets`` parameter for more).

        The size of ``y`` and ``x`` will be determined by ``resolution`` and ``bounds``, which in many cases are
        automatically computed from the items you pass in.
    """
    # Normalize pystac/satstac objects into plain STAC-JSON dicts.
    plain_items = items_to_plain(items)

    if sortby_date is not False:
        # Sort by the "datetime" property; items missing it (or with a None
        # value) sort first via the "" fallback.
        plain_items = sorted(
            plain_items,
            key=lambda item: item["properties"].get("datetime", "") or "",
            reverse=sortby_date == "desc",
        )

    # Resolve the output grid (CRS, resolution, bounds) and build the table
    # of asset URLs; items/assets that can't be used are dropped here.
    asset_table, spec, asset_ids, plain_items = prepare_items(
        plain_items,
        assets=assets,
        epsg=epsg,
        resolution=resolution,
        bounds=bounds,
        bounds_latlon=bounds_latlon,
        snap_bounds=snap_bounds,
        rescale=rescale,
        dtype=dtype,
    )

    # Build the lazy Dask array; no IO happens until it's computed.
    arr = items_to_dask(
        asset_table,
        spec,
        chunksize=chunksize,
        dtype=dtype,
        resampling=resampling,
        fill_value=fill_value,
        rescale=rescale,
        reader=reader,
        gdal_env=gdal_env,
        errors_as_nodata=errors_as_nodata,
    )

    return xr.DataArray(
        arr,
        *to_coords(
            plain_items,
            asset_ids,
            spec,
            xy_coords=xy_coords,
            properties=properties,
            band_coords=band_coords,
        ),
        attrs=to_attrs(spec),
        # Deterministic name keyed on the graph contents, so equal stacks
        # share a Dask tokenized identity.
        name="stackstac-" + dask.base.tokenize(arr),
    )