Skip to content

Commit c1a2047

Browse files
authored
sources/azure: ensure retries on IMDS request failure (#1271)
There are two issues with IMDS retries: 1. IMDS_VER_WANT will never be attempted if retries=0, such as when fetching network metadata with infinite=True. 2. get_imds_data_with_api_fallback() will attempt one request with IMDS_VER_WANT. If the connection fails due to a timeout, connection issue, or error code other than 400, an empty dictionary will be returned without attempting the requested number of retries. This PR: - Updates get_imds_data_with_api_fallback() to invoke get_metadata_from_imds() with the specified retries and infinite parameters. - Updates retry_on_url_exc to take a configurable set of HTTP error codes and exception types to retry on. - Add IMDS_RETRY_CODES set to retry with when fetching data from IMDS: - 404 not found (yet) - 410 gone / unavailable (yet) - 429 rate-limited/throttled - 500 server error - Replace default callback with imds_readurl_exception_callback, which configures retry_on_url_exc() with these error codes and instances. - Add new pytests for IMDS to eventually replace the unittest equivalents and improve existing coverage. Signed-off-by: Chris Patterson <cpatterson@microsoft.com>
1 parent f4404af commit c1a2047

File tree

3 files changed

+283
-56
lines changed

3 files changed

+283
-56
lines changed

cloudinit/sources/DataSourceAzure.py

Lines changed: 55 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import base64
88
import crypt
99
import datetime
10+
import functools
1011
import os
1112
import os.path
1213
import re
@@ -67,6 +68,17 @@
6768
IMDS_VER_MIN = "2019-06-01"
6869
IMDS_VER_WANT = "2021-08-01"
6970
IMDS_EXTENDED_VER_MIN = "2021-03-01"
71+
IMDS_RETRY_CODES = (
72+
404, # not found (yet)
73+
410, # gone / unavailable (yet)
74+
429, # rate-limited/throttled
75+
500, # server error
76+
)
77+
imds_readurl_exception_callback = functools.partial(
78+
retry_on_url_exc,
79+
retry_codes=IMDS_RETRY_CODES,
80+
retry_instances=(requests.Timeout,),
81+
)
7082

7183

7284
class MetadataType(Enum):
@@ -725,44 +737,49 @@ def _get_data(self):
725737
def get_imds_data_with_api_fallback(
726738
self,
727739
*,
728-
retries,
729-
md_type=MetadataType.ALL,
730-
exc_cb=retry_on_url_exc,
731-
infinite=False,
732-
):
733-
"""
734-
Wrapper for get_metadata_from_imds so that we can have flexibility
735-
in which IMDS api-version we use. If a particular instance of IMDS
736-
does not have the api version that is desired, we want to make
737-
this fault tolerant and fall back to a good known minimum api
738-
version.
739-
"""
740-
for _ in range(retries):
741-
try:
742-
LOG.info("Attempting IMDS api-version: %s", IMDS_VER_WANT)
743-
return get_metadata_from_imds(
744-
retries=0,
745-
md_type=md_type,
746-
api_version=IMDS_VER_WANT,
747-
exc_cb=exc_cb,
748-
)
749-
except UrlError as err:
750-
LOG.info("UrlError with IMDS api-version: %s", IMDS_VER_WANT)
751-
if err.code == 400:
752-
log_msg = "Fall back to IMDS api-version: {}".format(
753-
IMDS_VER_MIN
754-
)
755-
report_diagnostic_event(log_msg, logger_func=LOG.info)
756-
break
740+
retries: int,
741+
md_type: MetadataType = MetadataType.ALL,
742+
exc_cb=imds_readurl_exception_callback,
743+
infinite: bool = False,
744+
) -> dict:
745+
"""Fetch metadata from IMDS using IMDS_VER_WANT API version.
757746
758-
LOG.info("Using IMDS api-version: %s", IMDS_VER_MIN)
759-
return get_metadata_from_imds(
760-
retries=retries,
761-
md_type=md_type,
762-
api_version=IMDS_VER_MIN,
763-
exc_cb=exc_cb,
764-
infinite=infinite,
765-
)
747+
Falls back to IMDS_VER_MIN version if IMDS returns a 400 error code,
748+
indicating that IMDS_VER_WANT is unsupported.
749+
750+
:return: Parsed metadata dictionary or empty dict on error.
751+
"""
752+
LOG.info("Attempting IMDS api-version: %s", IMDS_VER_WANT)
753+
try:
754+
return get_metadata_from_imds(
755+
retries=retries,
756+
md_type=md_type,
757+
api_version=IMDS_VER_WANT,
758+
exc_cb=exc_cb,
759+
infinite=infinite,
760+
)
761+
except UrlError as error:
762+
LOG.info("UrlError with IMDS api-version: %s", IMDS_VER_WANT)
763+
# Fall back if HTTP code is 400, otherwise return empty dict.
764+
if error.code != 400:
765+
return {}
766+
767+
log_msg = "Fall back to IMDS api-version: {}".format(IMDS_VER_MIN)
768+
report_diagnostic_event(log_msg, logger_func=LOG.info)
769+
try:
770+
return get_metadata_from_imds(
771+
retries=retries,
772+
md_type=md_type,
773+
api_version=IMDS_VER_MIN,
774+
exc_cb=exc_cb,
775+
infinite=infinite,
776+
)
777+
except UrlError as error:
778+
report_diagnostic_event(
779+
"Failed to fetch IMDS metadata: %s" % error,
780+
logger_func=LOG.error,
781+
)
782+
return {}
766783

767784
def device_name_to_device(self, name):
768785
return self.ds_cfg["disk_aliases"].get(name)
@@ -2271,7 +2288,7 @@ def get_metadata_from_imds(
22712288
retries,
22722289
md_type=MetadataType.ALL,
22732290
api_version=IMDS_VER_MIN,
2274-
exc_cb=retry_on_url_exc,
2291+
exc_cb=imds_readurl_exception_callback,
22752292
infinite=False,
22762293
):
22772294
"""Query Azure's instance metadata service, returning a dictionary.

cloudinit/url_helper.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -639,16 +639,22 @@ def oauth_headers(
639639
return signed_headers
640640

641641

642-
def retry_on_url_exc(msg, exc):
643-
"""readurl exception_cb that will retry on NOT_FOUND and Timeout.
642+
def retry_on_url_exc(
643+
msg, exc, *, retry_codes=(NOT_FOUND,), retry_instances=(requests.Timeout,)
644+
):
645+
"""Configurable retry exception callback for readurl().
646+
647+
:param retry_codes: Codes to retry on. Defaults to 404.
648+
:param retry_instances: Exception types to retry on. Defaults to
649+
requests.Timeout.
644650
645-
Returns False to raise the exception from readurl, True to retry.
651+
:returns: False to raise the exception from readurl(), True to retry.
646652
"""
647653
if not isinstance(exc, UrlError):
648654
return False
649-
if exc.code == NOT_FOUND:
655+
if exc.code in retry_codes:
650656
return True
651-
if exc.cause and isinstance(exc.cause, requests.Timeout):
657+
if exc.cause and isinstance(exc.cause, retry_instances):
652658
return True
653659
return False
654660

0 commit comments

Comments
 (0)