Skip to content

Commit c45a6c0

Browse files
Simplified retry logic to DNS cache
PiperOrigin-RevId: 540061371
1 parent 408f9c9 commit c45a6c0

2 files changed

Lines changed: 97 additions & 16 deletions

File tree

tensorflow/tsl/platform/cloud/BUILD

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,11 @@ cc_library(
8282
deps = [
8383
":http_request",
8484
"//tensorflow/tsl/platform:env",
85+
"//tensorflow/tsl/platform:errors",
86+
"//tensorflow/tsl/platform:retrying_utils",
87+
"//tensorflow/tsl/platform:status",
88+
"@com_google_absl//absl/status",
89+
"@com_google_absl//absl/strings",
8590
],
8691
)
8792

tensorflow/tsl/platform/cloud/gcs_dns_cache.cc

Lines changed: 92 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ limitations under the License.
1414
==============================================================================*/
1515

1616
#include "tensorflow/tsl/platform/cloud/gcs_dns_cache.h"
17+
18+
#include <cstring>
19+
20+
#include "absl/status/status.h"
21+
#include "absl/strings/str_cat.h"
22+
#include "tensorflow/tsl/platform/errors.h"
23+
#include "tensorflow/tsl/platform/retrying_utils.h"
24+
#include "tensorflow/tsl/platform/status.h"
1725
#ifndef _WIN32
1826
#include <arpa/inet.h>
1927
#include <netdb.h>
@@ -33,19 +41,9 @@ namespace {
3341
const std::vector<string>& kCachedDomainNames =
3442
*new std::vector<string>{"www.googleapis.com", "storage.googleapis.com"};
3543

36-
inline void print_getaddrinfo_error(const string& name, int error_code) {
37-
#ifndef _WIN32
38-
if (error_code == EAI_SYSTEM) {
39-
LOG(ERROR) << "Error resolving " << name
40-
<< " (EAI_SYSTEM): " << strerror(errno);
41-
} else {
42-
LOG(ERROR) << "Error resolving " << name << ": "
43-
<< gai_strerror(error_code);
44-
}
45-
#else
46-
// TODO:WSAGetLastError is better than gai_strerror
47-
LOG(ERROR) << "Error resolving " << name << ": " << gai_strerror(error_code);
48-
#endif
44+
inline void print_getaddrinfo_error(const string& name, Status return_status) {
45+
// The failure arrives as a Status rather than a raw EAI_* code (the two do not map cleanly), so log the Status as-is.
46+
LOG(ERROR) << "Error resolving " << name << ": " << return_status;
4947
}
5048

5149
// Selects one item at random from a vector of items, using a uniform
@@ -101,10 +99,88 @@ void GcsDnsCache::AnnotateRequest(HttpRequest* request) {
10199
hints.ai_family = AF_INET; // Only use IPv4 for now.
102100
hints.ai_socktype = SOCK_STREAM;
103101
addrinfo* result = nullptr;
104-
int return_code = getaddrinfo(name.c_str(), nullptr, &hints, &result);
102+
RetryConfig retryConfig(
103+
/* init_delay_time_us = */ 5000,
104+
/* max_delay_time_us = */ 50 * 1000 * 5000,
105+
/* max_retries = */ 5);
106+
107+
const Status getaddrinfo_status = RetryingUtils::CallWithRetries(
108+
[&name, &hints, &result]() {
109+
int return_code = getaddrinfo(name.c_str(), nullptr, &hints, &result);
110+
absl::Status return_status;
111+
switch (return_code) {
112+
case 0:
113+
return_status = OkStatus();
114+
break;
115+
#ifndef _WIN32
116+
case EAI_ADDRFAMILY:
117+
case EAI_SERVICE:
118+
case EAI_SOCKTYPE:
119+
case EAI_NONAME:
120+
return_status = absl::FailedPreconditionError(
121+
absl::StrCat("System in invalid state for getaddrinfo call: ",
122+
gai_strerror(return_code)));
123+
break;
124+
case EAI_AGAIN:
125+
case EAI_NODATA: // lump nodata in here - the domains being resolved
126+
// should always have data
127+
return_status = absl::UnavailableError(absl::StrCat(
128+
"Resolving ", name, " is temporarily unavailable"));
129+
break;
130+
case EAI_BADFLAGS:
131+
case EAI_FAMILY:
132+
return_status = absl::InvalidArgumentError(absl::StrCat(
133+
"Bad arguments for getaddrinfo: ", gai_strerror(return_code)));
134+
break;
135+
case EAI_FAIL:
136+
return_status = absl::NotFoundError(
137+
absl::StrCat("Permanent failure resolving ", name, ": ",
138+
gai_strerror(return_code)));
139+
break;
140+
case EAI_MEMORY:
141+
return_status = absl::ResourceExhaustedError("Out of memory");
142+
break;
143+
case EAI_SYSTEM:
144+
default:
145+
return_status = absl::UnknownError(strerror(return_code));
146+
#else
147+
// mapping from
148+
// https://learn.microsoft.com/en-us/windows/win32/api/ws2tcpip/nf-ws2tcpip-getaddrinfo#return-value
149+
case WSATYPE_NOT_FOUND:
150+
case WSAESOCKTNOSUPPORT:
151+
case WSAHOST_NOT_FOUND:
152+
return_status = absl::FailedPreconditionError(
153+
absl::StrCat("System in invalid state for getaddrinfo call: ",
154+
gai_strerror(return_code)));
155+
break;
156+
case WSATRY_AGAIN:
157+
return_status = absl::UnavailableError(absl::StrCat(
158+
"Resolving ", name, " is temporarily unavailable"));
159+
break;
160+
case WSAEINVAL:
161+
case WSAEAFNOSUPPORT:
162+
return_status = absl::InvalidArgumentError(absl::StrCat(
163+
"Bad arguments for getaddrinfo: ", gai_strerror(return_code)));
164+
break;
165+
case WSANO_RECOVERY:
166+
return_status = absl::NotFoundError(
167+
absl::StrCat("Permanent failure resolving ", name, ": ",
168+
gai_strerror(return_code)));
169+
break;
170+
case WSA_NOT_ENOUGH_MEMORY:
171+
return_status = absl::ResourceExhaustedError("Out of memory");
172+
break;
173+
default:
174+
return_status = absl::UnknownError(strerror(return_code));
175+
#endif
176+
}
177+
178+
return Status(return_status);
179+
},
180+
retryConfig);
105181

106182
std::vector<string> output;
107-
if (return_code == 0) {
183+
if (getaddrinfo_status.ok()) {
108184
for (const addrinfo* i = result; i != nullptr; i = i->ai_next) {
109185
if (i->ai_family != AF_INET || i->ai_addr->sa_family != AF_INET) {
110186
LOG(WARNING) << "Non-IPv4 address returned. ai_family: " << i->ai_family
@@ -125,7 +201,7 @@ void GcsDnsCache::AnnotateRequest(HttpRequest* request) {
125201
}
126202
}
127203
} else {
128-
print_getaddrinfo_error(name, return_code);
204+
print_getaddrinfo_error(name, getaddrinfo_status);
129205
}
130206
if (result != nullptr) {
131207
freeaddrinfo(result);

0 commit comments

Comments
 (0)