Skip to content

Commit 7b4f2d2

Browse files
authored
[8.5] [HealthAPI] Diagnosis: report typed affected resources (elastic#90653) (elastic#90678)
The health API reports the affected resources in case of an unhealthy deployment. Until now, all indicators reported one type of resource per diagnosis (index, ILM policy, snapshot repository). With the introduction of the disk indicator, we now have an indicator that reports multiple types of resources under the same diagnosis (i.e. nodes and indices). This changes the structure of the `affected_resources` field to accommodate multiple types of resources:
```
"affected_resources": {
  "nodes": [
    { "id": "e1af6F5rTcmgpExkdOMzCg", "name": "hot" },
    { "id": "u_wBVl4ZRne4uZq_ziLsuw", "name": "warm" }
  ],
  "indices": [ ".geoip_databases", "test_index" ]
}
```
1 parent 6a0e308 commit 7b4f2d2

16 files changed

Lines changed: 315 additions & 96 deletions

File tree

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/health/40_diagnosis.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
---
22
"Diagnosis":
33
- skip:
4-
version: "- 8.3.99"
5-
reason: "diagnosis was only added in 8.4.0"
4+
version: "- 8.4.99"
5+
reason: "diagnosis was redefined in 8.5.0"
66

77
- do:
88
indices.create:
@@ -24,4 +24,4 @@
2424
- length: { indicators.shards_availability.diagnosis: 1 }
2525
- is_true: indicators.shards_availability.diagnosis.0.affected_resources
2626
- length: { indicators.shards_availability.diagnosis.0.affected_resources: 1 }
27-
- match: { indicators.shards_availability.diagnosis.0.affected_resources.0: "red_index" }
27+
- match: { indicators.shards_availability.diagnosis.0.affected_resources.indices.0: "red_index" }

server/src/main/java/org/elasticsearch/cluster/node/DiscoveryNode.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
import java.io.IOException;
2525
import java.util.Collections;
26+
import java.util.Comparator;
2627
import java.util.Map;
2728
import java.util.Objects;
2829
import java.util.Optional;
@@ -39,6 +40,8 @@ public class DiscoveryNode implements Writeable, ToXContentFragment {
3940

4041
static final String COORDINATING_ONLY = "coordinating_only";
4142
public static final Version EXTERNAL_ID_VERSION = Version.V_8_3_0;
43+
public static final Comparator<DiscoveryNode> DISCOVERY_NODE_COMPARATOR = Comparator.comparing(DiscoveryNode::getName)
44+
.thenComparing(DiscoveryNode::getId);
4245

4346
public static boolean hasRole(final Settings settings, final DiscoveryNodeRole role) {
4447
// this method can be called before the o.e.n.NodeRoleSettings.NODE_ROLES_SETTING is initialized

server/src/main/java/org/elasticsearch/cluster/routing/allocation/ShardsAvailabilityHealthIndicatorService.java

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
import static org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider.INDEX_ROUTING_ALLOCATION_ENABLE_SETTING;
6969
import static org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider.CLUSTER_TOTAL_SHARDS_PER_NODE_SETTING;
7070
import static org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE_SETTING;
71+
import static org.elasticsearch.health.Diagnosis.Resource.Type.INDEX;
7172
import static org.elasticsearch.health.HealthStatus.GREEN;
7273
import static org.elasticsearch.health.HealthStatus.RED;
7374
import static org.elasticsearch.health.HealthStatus.YELLOW;
@@ -126,7 +127,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
126127
status.getSummary(),
127128
status.getDetails(explain),
128129
status.getImpacts(),
129-
status.getUserActions(explain)
130+
status.getDiagnosis(explain)
130131
);
131132
}
132133

@@ -890,11 +891,11 @@ public List<HealthIndicatorImpact> getImpacts() {
890891
}
891892

892893
/**
893-
* Summarizes the user actions that are needed to solve unassigned primary and replica shards.
894+
* Returns the diagnosis for unassigned primary and replica shards.
894895
* @param explain true if the diagnoses should be generated, false if they should be omitted.
895896
* @return The list of diagnoses. Alternatively, an empty list if none were found or explain is false.
896897
*/
897-
public List<Diagnosis> getUserActions(boolean explain) {
898+
public List<Diagnosis> getDiagnosis(boolean explain) {
898899
if (explain) {
899900
Map<Diagnosis.Definition, Set<String>> actionsToAffectedIndices = new HashMap<>(primaries.userActions);
900901
replicas.userActions.forEach((actionDefinition, indicesWithReplicasUnassigned) -> {
@@ -913,10 +914,15 @@ public List<Diagnosis> getUserActions(boolean explain) {
913914
.map(
914915
e -> new Diagnosis(
915916
e.getKey(),
916-
e.getValue()
917-
.stream()
918-
.sorted(indicesComparatorByPriorityAndName(clusterMetadata))
919-
.collect(Collectors.toList())
917+
List.of(
918+
new Diagnosis.Resource(
919+
INDEX,
920+
e.getValue()
921+
.stream()
922+
.sorted(indicesComparatorByPriorityAndName(clusterMetadata))
923+
.collect(Collectors.toList())
924+
)
925+
)
920926
)
921927
)
922928
.collect(Collectors.toList());

server/src/main/java/org/elasticsearch/health/Diagnosis.java

Lines changed: 105 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,16 @@
88

99
package org.elasticsearch.health;
1010

11+
import org.elasticsearch.cluster.node.DiscoveryNode;
1112
import org.elasticsearch.core.Nullable;
13+
import org.elasticsearch.xcontent.ToXContentFragment;
1214
import org.elasticsearch.xcontent.ToXContentObject;
1315
import org.elasticsearch.xcontent.XContentBuilder;
1416

1517
import java.io.IOException;
18+
import java.util.Collection;
1619
import java.util.List;
20+
import java.util.Objects;
1721

1822
import static org.elasticsearch.health.HealthService.HEALTH_API_ID_PREFIX;
1923

@@ -23,7 +27,102 @@
2327
* @param definition The definition of the diagnosis (e.g. message, helpURL)
2428
* @param affectedResources Optional list of typed resources that are affected by this condition (e.g. indices, nodes, SLM policies, or snapshot repositories).
2529
*/
26-
public record Diagnosis(Definition definition, @Nullable List<String> affectedResources) implements ToXContentObject {
30+
public record Diagnosis(Definition definition, @Nullable List<Resource> affectedResources) implements ToXContentObject {
31+
32+
/**
33+
* Represents a type of affected resource, together with the resources/abstractions that
34+
* are affected.
35+
*/
36+
public static class Resource implements ToXContentFragment {
37+
38+
public static final String ID_FIELD = "id";
39+
public static final String NAME_FIELD = "name";
40+
41+
public enum Type {
42+
INDEX("indices"),
43+
NODE("nodes"),
44+
SLM_POLICY("slm_policy"),
45+
SNAPSHOT_REPOSITORY("snapshot_repository");
46+
47+
private final String displayValue;
48+
49+
Type(String displayValue) {
50+
this.displayValue = displayValue;
51+
}
52+
}
53+
54+
private final Type type;
55+
56+
@Nullable
57+
private Collection<String> values;
58+
@Nullable
59+
private Collection<DiscoveryNode> nodes;
60+
61+
public Resource(Type type, Collection<String> values) {
62+
if (type == Type.NODE) {
63+
throw new IllegalArgumentException("Nodes should be modelled using the dedicated constructor");
64+
}
65+
66+
this.type = type;
67+
this.values = values;
68+
}
69+
70+
public Resource(Collection<DiscoveryNode> nodes) {
71+
this.type = Type.NODE;
72+
this.nodes = nodes;
73+
}
74+
75+
@Override
76+
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
77+
if (nodes != null) {
78+
// we report both node ids and names so we need a bit of structure
79+
builder.startArray(type.displayValue);
80+
for (DiscoveryNode node : nodes) {
81+
builder.startObject();
82+
builder.field(ID_FIELD, node.getId());
83+
if (node.getName() != null) {
84+
builder.field(NAME_FIELD, node.getName());
85+
}
86+
builder.endObject();
87+
}
88+
builder.endArray();
89+
} else {
90+
builder.field(type.displayValue, values);
91+
}
92+
return builder;
93+
}
94+
95+
@Override
96+
public boolean equals(Object o) {
97+
if (this == o) {
98+
return true;
99+
}
100+
if (o == null || getClass() != o.getClass()) {
101+
return false;
102+
}
103+
Resource resource = (Resource) o;
104+
return type == resource.type && Objects.equals(values, resource.values) && Objects.equals(nodes, resource.nodes);
105+
}
106+
107+
@Override
108+
public int hashCode() {
109+
return Objects.hash(type, values, nodes);
110+
}
111+
112+
public Type getType() {
113+
return type;
114+
}
115+
116+
@Nullable
117+
public Collection<String> getValues() {
118+
return values;
119+
}
120+
121+
@Nullable
122+
public Collection<DiscoveryNode> getNodes() {
123+
return nodes;
124+
}
125+
}
27126

28127
/**
29128
* Details a diagnosis - cause and a potential action that a user could take to clear an issue identified by a {@link HealthService}.
@@ -44,7 +143,11 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
44143
builder.field("action", definition.action);
45144

46145
if (affectedResources != null && affectedResources.size() > 0) {
47-
builder.field("affected_resources", affectedResources);
146+
builder.startObject("affected_resources");
147+
for (Resource affectedResource : affectedResources) {
148+
affectedResource.toXContent(builder, params);
149+
}
150+
builder.endObject();
48151
}
49152

50153
builder.field("help_url", definition.helpURL);

server/src/main/java/org/elasticsearch/health/node/DiskHealthIndicatorService.java

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,12 @@
4040
import java.util.stream.Collectors;
4141
import java.util.stream.Stream;
4242

43+
import static org.elasticsearch.cluster.node.DiscoveryNode.DISCOVERY_NODE_COMPARATOR;
4344
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.are;
4445
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.getSortedUniqueValuesString;
4546
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.getTruncatedIndices;
4647
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.indices;
48+
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.indicesComparatorByPriorityAndName;
4749
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.regularNoun;
4850
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.regularVerb;
4951
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.these;
@@ -134,11 +136,11 @@ static class DiskHealthAnalyzer {
134136

135137
private final ClusterState clusterState;
136138
private final Set<String> blockedIndices;
137-
private final Set<DiscoveryNode> dataNodes = new HashSet<>();
139+
private final List<DiscoveryNode> dataNodes = new ArrayList<>();
138140
// In this context, a master node is a master node that cannot contain data.
139-
private final Map<HealthStatus, Set<DiscoveryNode>> masterNodes = new HashMap<>();
141+
private final Map<HealthStatus, List<DiscoveryNode>> masterNodes = new HashMap<>();
140142
// In this context "other" nodes are nodes that cannot contain data and are not masters.
141-
private final Map<HealthStatus, Set<DiscoveryNode>> otherNodes = new HashMap<>();
143+
private final Map<HealthStatus, List<DiscoveryNode>> otherNodes = new HashMap<>();
142144
private final Set<DiscoveryNodeRole> affectedRoles = new HashSet<>();
143145
private final Set<String> indicesAtRisk;
144146
private final HealthStatus healthStatus;
@@ -168,11 +170,18 @@ static class DiskHealthAnalyzer {
168170
if (node.canContainData()) {
169171
dataNodes.add(node);
170172
} else if (node.isMasterNode()) {
171-
masterNodes.computeIfAbsent(healthStatus, ignored -> new HashSet<>()).add(node);
173+
masterNodes.computeIfAbsent(healthStatus, ignored -> new ArrayList<>()).add(node);
172174
} else {
173-
otherNodes.computeIfAbsent(healthStatus, ignored -> new HashSet<>()).add(node);
175+
otherNodes.computeIfAbsent(healthStatus, ignored -> new ArrayList<>()).add(node);
174176
}
175177
}
178+
dataNodes.sort(DISCOVERY_NODE_COMPARATOR);
179+
for (List<DiscoveryNode> masterNodes : masterNodes.values()) {
180+
masterNodes.sort(DISCOVERY_NODE_COMPARATOR);
181+
}
182+
for (List<DiscoveryNode> nodes : otherNodes.values()) {
183+
nodes.sort(DISCOVERY_NODE_COMPARATOR);
184+
}
176185
indicesAtRisk = getIndicesForNodes(dataNodes, clusterState);
177186
healthStatus = mostSevereStatusSoFar;
178187
details = createDetails(diskHealthByNode, blockedIndices);
@@ -317,6 +326,20 @@ private List<Diagnosis> getDiagnoses() {
317326
List<Diagnosis> diagnosisList = new ArrayList<>();
318327
if (hasBlockedIndices() || hasUnhealthyDataNodes()) {
319328
Set<String> affectedIndices = Sets.union(blockedIndices, indicesAtRisk);
329+
List<Diagnosis.Resource> affectedResources = new ArrayList<>();
330+
if (dataNodes.size() > 0) {
331+
Diagnosis.Resource nodeResources = new Diagnosis.Resource(dataNodes);
332+
affectedResources.add(nodeResources);
333+
}
334+
if (affectedIndices.size() > 0) {
335+
Diagnosis.Resource indexResources = new Diagnosis.Resource(
336+
Diagnosis.Resource.Type.INDEX,
337+
affectedIndices.stream()
338+
.sorted(indicesComparatorByPriorityAndName(clusterState.metadata()))
339+
.collect(Collectors.toList())
340+
);
341+
affectedResources.add(indexResources);
342+
}
320343
diagnosisList.add(
321344
new Diagnosis(
322345
new Diagnosis.Definition(
@@ -336,7 +359,7 @@ private List<Diagnosis> getDiagnoses() {
336359
+ "this. If you have already taken action please wait for the rebalancing to complete.",
337360
"https://ela.st/fix-data-disk"
338361
),
339-
dataNodes.stream().map(DiscoveryNode::getId).sorted().toList()
362+
affectedResources
340363
)
341364
);
342365
}
@@ -397,7 +420,7 @@ private boolean hasBlockedIndices() {
397420
}
398421

399422
// Non-private for unit testing
400-
static Set<String> getIndicesForNodes(Set<DiscoveryNode> nodes, ClusterState clusterState) {
423+
static Set<String> getIndicesForNodes(List<DiscoveryNode> nodes, ClusterState clusterState) {
401424
RoutingNodes routingNodes = clusterState.getRoutingNodes();
402425
return nodes.stream()
403426
.map(node -> routingNodes.node(node.getId()))
@@ -416,11 +439,11 @@ private Diagnosis createNonDataNodeDiagnosis(HealthStatus healthStatus, Collecti
416439
"Please add capacity to the current nodes, or replace them with ones with higher capacity.",
417440
isMaster ? "https://ela.st/fix-master-disk" : "https://ela.st/fix-disk-space"
418441
),
419-
nodes.stream().map(DiscoveryNode::getId).sorted().toList()
442+
List.of(new Diagnosis.Resource(nodes))
420443
);
421444
}
422445

423-
private int getUnhealthyNodeSize(Map<HealthStatus, Set<DiscoveryNode>> nodes) {
446+
private int getUnhealthyNodeSize(Map<HealthStatus, List<DiscoveryNode>> nodes) {
424447
return (nodes.containsKey(HealthStatus.RED) ? nodes.get(HealthStatus.RED).size() : 0) + (nodes.containsKey(HealthStatus.YELLOW)
425448
? nodes.get(HealthStatus.YELLOW).size()
426449
: 0);

server/src/main/java/org/elasticsearch/snapshots/RepositoryIntegrityHealthIndicatorService.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import org.elasticsearch.cluster.metadata.RepositoryMetadata;
1313
import org.elasticsearch.cluster.service.ClusterService;
1414
import org.elasticsearch.health.Diagnosis;
15+
import org.elasticsearch.health.Diagnosis.Resource.Type;
1516
import org.elasticsearch.health.HealthIndicatorDetails;
1617
import org.elasticsearch.health.HealthIndicatorImpact;
1718
import org.elasticsearch.health.HealthIndicatorResult;
@@ -132,7 +133,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
132133
)
133134
: HealthIndicatorDetails.EMPTY,
134135
impacts,
135-
List.of(new Diagnosis(CORRUPTED_REPOSITORY, corrupted))
136+
List.of(new Diagnosis(CORRUPTED_REPOSITORY, List.of(new Diagnosis.Resource(Type.SNAPSHOT_REPOSITORY, corrupted))))
136137
);
137138
}
138139

0 commit comments

Comments
 (0)