Skip to content

Commit 8ff1a87

Browse files
committed
Remove cgroup stats leniency
This commit removes some leniency in the handling of I/O exceptions during the reading of cgroup stats.
1 parent 5e2b71f commit 8ff1a87

2 files changed

Lines changed: 119 additions & 169 deletions

File tree

core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java

Lines changed: 116 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,10 @@
3232
import java.lang.reflect.Method;
3333
import java.nio.file.Files;
3434
import java.nio.file.Path;
35-
import java.util.Collections;
3635
import java.util.HashMap;
3736
import java.util.List;
3837
import java.util.Locale;
3938
import java.util.Map;
40-
import java.util.function.Supplier;
4139
import java.util.regex.Matcher;
4240
import java.util.regex.Pattern;
4341

@@ -189,65 +187,16 @@ public short getSystemCpuPercent() {
189187
}
190188

191189
/**
192-
* Reads all lines of a file and logs any {@link IOException}.
190+
* Reads a file containing a single line.
193191
*
194192
* @param path path to the file to read
195-
* @return all the line or an empty list if an {@link IOException}
196-
* occurred
193+
* @return the single line
194+
* @throws IOException if an I/O exception occurs reading the file
197195
*/
198-
private List<String> readAllLines(final Path path) {
199-
try {
200-
return Files.readAllLines(path);
201-
} catch (final IOException e) {
202-
if (logger.isDebugEnabled()) {
203-
logger.debug("error reading " + path, e);
204-
}
205-
return Collections.emptyList();
206-
}
207-
}
208-
209-
/**
210-
* Reads a file containing a single line and logs any
211-
* {@link IOException}.
212-
*
213-
* @param path path to the file to read
214-
* @return the single line or {@code null} if an
215-
* {@link IOException} occurred
216-
*/
217-
private String readSingleLine(final Path path) {
218-
try {
219-
final List<String> lines = Files.readAllLines(path);
220-
assert lines != null && lines.size() == 1;
221-
return lines.get(0);
222-
} catch (final IOException e) {
223-
if (logger.isDebugEnabled()) {
224-
logger.debug("error reading " + path, e);
225-
}
226-
return null;
227-
}
228-
}
229-
230-
/**
231-
* Parses the input to a long and logs any
232-
* {@link NumberFormatException} that occurs during parsing.
233-
*
234-
* @param line the line to parse
235-
* @param message a debug message to log if parsing fails
236-
* @return the parsed value
237-
*/
238-
private long parseSingleLineAsLong(final String line, final Supplier<String> message) {
239-
if (line != null) {
240-
try {
241-
return Long.parseLong(line);
242-
} catch (final NumberFormatException e) {
243-
if (logger.isDebugEnabled()) {
244-
logger.debug(message.get(), e);
245-
}
246-
// fallback
247-
}
248-
}
249-
250-
return -1;
196+
private String readSingleLine(final Path path) throws IOException {
197+
final List<String> lines = Files.readAllLines(path);
198+
assert lines != null && lines.size() == 1;
199+
return lines.get(0);
251200
}
252201

253202
// pattern for lines in /proc/self/cgroup
@@ -261,29 +210,27 @@ private long parseSingleLineAsLong(final String line, final Supplier<String> mes
261210
*
262211
* @return a map from subsystems to the control group for the
263212
* Elasticsearch process.
213+
* @throws IOException if an I/O exception occurs reading
214+
* {@code /proc/self/cgroup}
264215
*/
265-
private Map<String, String> getControlGroups() {
216+
private Map<String, String> getControlGroups() throws IOException {
266217
final List<String> lines = readProcSelfCgroup();
267-
if (!lines.isEmpty()) {
268-
final Map<String, String> controllerMap = new HashMap<>();
269-
for (final String line : lines) {
270-
final Matcher matcher = CONTROL_GROUP_PATTERN.matcher(line);
271-
// note that Matcher#matches must be invoked as
272-
// matching is lazy; this can not happen in an assert
273-
// as assertions might not be enabled
274-
final boolean matches = matcher.matches();
275-
assert matches : line;
276-
// at this point we have captured the subsystems and the
277-
// control group
278-
final String[] controllers = matcher.group(1).split(",");
279-
for (final String controller : controllers) {
280-
controllerMap.put(controller, matcher.group(2));
281-
}
218+
final Map<String, String> controllerMap = new HashMap<>();
219+
for (final String line : lines) {
220+
final Matcher matcher = CONTROL_GROUP_PATTERN.matcher(line);
221+
// note that Matcher#matches must be invoked as
222+
// matching is lazy; this can not happen in an assert
223+
// as assertions might not be enabled
224+
final boolean matches = matcher.matches();
225+
assert matches : line;
226+
// at this point we have captured the subsystems and the
227+
// control group
228+
final String[] controllers = matcher.group(1).split(",");
229+
for (final String controller : controllers) {
230+
controllerMap.put(controller, matcher.group(2));
282231
}
283-
return controllerMap;
284232
}
285-
286-
return Collections.emptyMap();
233+
return controllerMap;
287234
}
288235

289236
/**
@@ -299,12 +246,15 @@ private Map<String, String> getControlGroups() {
299246
* bound to the hierarchy, and the last field representing the
300247
* control group.
301248
*
302-
* @return the lines from {@code /proc/self/cgroup} or an empty
303-
* list.
249+
* @return the lines from {@code /proc/self/cgroup}
250+
* @throws IOException if an I/O exception occurs reading
251+
* {@code /proc/self/cgroup}
304252
*/
305253
@SuppressForbidden(reason = "access /proc/self/cgroup")
306-
List<String> readProcSelfCgroup() {
307-
return readAllLines(PathUtils.get("/proc/self/cgroup"));
254+
List<String> readProcSelfCgroup() throws IOException {
255+
final List<String> lines = Files.readAllLines(PathUtils.get("/proc/self/cgroup"));
256+
assert lines != null && !lines.isEmpty();
257+
return lines;
308258
}
309259

310260
/**
@@ -314,13 +264,12 @@ List<String> readProcSelfCgroup() {
314264
*
315265
* @param controlGroup the control group for the Elasticsearch
316266
* process for the {@code cpuacct} subsystem
317-
* @return the total CPU time in nanoseconds, or -1
267+
* @return the total CPU time in nanoseconds
268+
* @throws IOException if an I/O exception occurs reading
269+
* {@code cpuacct.usage} for the control group
318270
*/
319-
private long getCgroupCpuAcctUsageNanos(final String controlGroup) {
320-
final String line = readSysFsCgroupCpuAcctCpuAcctUsage(controlGroup);
321-
return parseSingleLineAsLong(
322-
line,
323-
() -> String.format(Locale.ROOT, "error parsing cpuacct.usage [%s] for control group [%s]", line, controlGroup));
271+
private long getCgroupCpuAcctUsageNanos(final String controlGroup) throws IOException {
272+
return Long.parseLong(readSysFsCgroupCpuAcctCpuAcctUsage(controlGroup));
324273
}
325274

326275
/**
@@ -333,10 +282,12 @@ private long getCgroupCpuAcctUsageNanos(final String controlGroup) {
333282
* @param controlGroup the control group to which the Elasticsearch
334283
* process belongs for the {@code cpuacct}
335284
* subsystem
336-
* @return the line from {@code cpuacct.usage} or {@code null}
285+
* @return the line from {@code cpuacct.usage}
286+
* @throws IOException if an I/O exception occurs reading
287+
* {@code cpuacct.usage} for the control group
337288
*/
338289
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpuacct")
339-
String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) {
290+
String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) throws IOException {
340291
return readSingleLine(PathUtils.get("/sys/fs/cgroup/cpuacct", controlGroup, "cpuacct.usage"));
341292
}
342293

@@ -347,13 +298,12 @@ String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) {
347298
*
348299
* @param controlGroup the control group for the Elasticsearch
349300
* process for the {@code cpuacct} subsystem
350-
* @return the CFS quota period in microseconds, or -1
301+
* @return the CFS quota period in microseconds
302+
* @throws IOException if an I/O exception occurs reading
303+
* {@code cpu.cfs_period_us} for the control group
351304
*/
352-
private long getCgroupCpuAcctCpuCfsPeriodMicros(final String controlGroup) {
353-
final String line = readSysFsCgroupCpuAcctCpuCfsPeriod(controlGroup);
354-
return parseSingleLineAsLong(
355-
line,
356-
() -> String.format(Locale.ROOT, "error parsing cpu.cfs_period_us [%s] for control group [%s]", line, controlGroup));
305+
private long getCgroupCpuAcctCpuCfsPeriodMicros(final String controlGroup) throws IOException {
306+
return Long.parseLong(readSysFsCgroupCpuAcctCpuCfsPeriod(controlGroup));
357307
}
358308

359309
/**
@@ -366,10 +316,12 @@ private long getCgroupCpuAcctCpuCfsPeriodMicros(final String controlGroup) {
366316
* @param controlGroup the control group to which the Elasticsearch
367317
* process belongs for the {@code cpu}
368318
* subsystem
369-
* @return the line from {@code cpu.cfs_period_us} or {@code null}
319+
* @return the line from {@code cpu.cfs_period_us}
320+
* @throws IOException if an I/O exception occurs reading
321+
* {@code cpu.cfs_period_us} for the control group
370322
*/
371323
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpu")
372-
String readSysFsCgroupCpuAcctCpuCfsPeriod(final String controlGroup) {
324+
String readSysFsCgroupCpuAcctCpuCfsPeriod(final String controlGroup) throws IOException {
373325
return readSingleLine(PathUtils.get("/sys/fs/cgroup/cpu", controlGroup, "cpu.cfs_period_us"));
374326
}
375327

@@ -380,13 +332,12 @@ String readSysFsCgroupCpuAcctCpuCfsPeriod(final String controlGroup) {
380332
*
381333
* @param controlGroup the control group for the Elasticsearch
382334
* process for the {@code cpuacct} subsystem
383-
* @return the CFS quota in microseconds, or -1
335+
* @return the CFS quota in microseconds
336+
* @throws IOException if an I/O exception occurs reading
337+
* {@code cpu.cfs_quota_us} for the control group
384338
*/
385-
private long getCGroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) {
386-
final String line = readSysFsCgroupCpuAcctCpuAcctCfsQuota(controlGroup);
387-
return parseSingleLineAsLong(
388-
line,
389-
() -> String.format(Locale.ROOT, "error parsing cpu.cfs_quota_us [%s] for control group [%s]", line, controlGroup));
339+
private long getCGroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) throws IOException {
340+
return Long.parseLong(readSysFsCgroupCpuAcctCpuAcctCfsQuota(controlGroup));
390341
}
391342

392343
/**
@@ -399,10 +350,12 @@ private long getCGroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) {
399350
* @param controlGroup the control group to which the Elasticsearch
400351
* process belongs for the {@code cpu}
401352
* subsystem
402-
* @return the line from {@code cpu.cfs_quota_us} or {@code null}
353+
* @return the line from {@code cpu.cfs_quota_us}
354+
* @throws IOException if an I/O exception occurs reading
355+
* {@code cpu.cfs_quota_us} for the control group
403356
*/
404357
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpu")
405-
String readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String controlGroup) {
358+
String readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String controlGroup) throws IOException {
406359
return readSingleLine(PathUtils.get("/sys/fs/cgroup/cpu", controlGroup, "cpu.cfs_quota_us"));
407360
}
408361

@@ -412,39 +365,33 @@ String readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String controlGroup) {
412365
*
413366
* @param controlGroup the control group for the Elasticsearch
414367
* process for the {@code cpuacct} subsystem
415-
* @return the CPU time statistics or {@code null}
368+
* @return the CPU time statistics
369+
* @throws IOException if an I/O exception occurs reading
370+
* {@code cpu.stat} for the control group
416371
*/
417-
private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup) {
372+
private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup) throws IOException {
418373
final List<String> lines = readSysFsCgroupCpuAcctCpuStat(controlGroup);
419-
if (!lines.isEmpty()) {
420-
assert lines.size() == 3;
421-
long numberOfPeriods = -1;
422-
long numberOfTimesThrottled = -1;
423-
long timeThrottledNanos = -1;
424-
for (final String line : lines) {
425-
final String[] fields = line.split("\\s+");
426-
switch (fields[0]) {
427-
case "nr_periods":
428-
numberOfPeriods =
429-
parseSingleLineAsLong(fields[1], () -> String.format(Locale.ROOT, "error parsing nr_periods [%s]", line));
430-
break;
431-
case "nr_throttled":
432-
numberOfTimesThrottled =
433-
parseSingleLineAsLong(fields[1], () -> String.format(Locale.ROOT, "error parsing nr_throttled [%s]", line));
434-
break;
435-
case "throttled_time":
436-
timeThrottledNanos =
437-
parseSingleLineAsLong(fields[1], () -> String.format(Locale.ROOT, "error parsing throttled_time [%s]", line));
438-
break;
439-
}
374+
long numberOfPeriods = -1;
375+
long numberOfTimesThrottled = -1;
376+
long timeThrottledNanos = -1;
377+
for (final String line : lines) {
378+
final String[] fields = line.split("\\s+");
379+
switch (fields[0]) {
380+
case "nr_periods":
381+
numberOfPeriods = Long.parseLong(fields[1]);
382+
break;
383+
case "nr_throttled":
384+
numberOfTimesThrottled = Long.parseLong(fields[1]);
385+
break;
386+
case "throttled_time":
387+
timeThrottledNanos = Long.parseLong(fields[1]);
388+
break;
440389
}
441-
assert numberOfPeriods != -1;
442-
assert numberOfTimesThrottled != -1;
443-
assert timeThrottledNanos != -1;
444-
return new OsStats.Cgroup.CpuStat(numberOfPeriods, numberOfTimesThrottled, timeThrottledNanos);
445390
}
446-
447-
return null;
391+
assert numberOfPeriods != -1;
392+
assert numberOfTimesThrottled != -1;
393+
assert timeThrottledNanos != -1;
394+
return new OsStats.Cgroup.CpuStat(numberOfPeriods, numberOfTimesThrottled, timeThrottledNanos);
448395
}
449396

450397
/**
@@ -468,11 +415,41 @@ private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup
468415
* process belongs for the {@code cpu}
469416
* subsystem
470417
*
471-
* @return the line from {@code cpu.cfs_quota_us} or {@code null}
418+
* @return the lines from {@code cpu.stat}
419+
* @throws IOException if an I/O exception occurs reading
420+
* {@code cpu.stat} for the control group
472421
*/
473422
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpu")
474-
List<String> readSysFsCgroupCpuAcctCpuStat(final String controlGroup) {
475-
return readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", controlGroup, "cpu.stat"));
423+
List<String> readSysFsCgroupCpuAcctCpuStat(final String controlGroup) throws IOException {
424+
final List<String> lines = Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", controlGroup, "cpu.stat"));
425+
assert lines != null && lines.size() == 3;
426+
return lines;
427+
}
428+
429+
/**
430+
* Basic cgroup stats.
431+
*
432+
* @return basic cgroup stats, or {@code null} if an I/O exception
433+
* occurred reading the cgroup stats
434+
*/
435+
private OsStats.Cgroup getCgroup() {
436+
try {
437+
final Map<String, String> controllerMap = getControlGroups();
438+
final String cpuControlGroup = controllerMap.get("cpu");
439+
final String cpuAcctControlGroup = controllerMap.get("cpuacct");
440+
return new OsStats.Cgroup(
441+
cpuAcctControlGroup,
442+
getCgroupCpuAcctUsageNanos(cpuAcctControlGroup),
443+
cpuControlGroup,
444+
getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup),
445+
getCGroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup),
446+
getCgroupCpuAcctCpuStat(cpuControlGroup));
447+
} catch (final IOException e) {
448+
if (logger.isDebugEnabled()) {
449+
logger.debug("error reading control group stats", e);
450+
}
451+
return null;
452+
}
476453
}
477454

478455
private static class OsProbeHolder {
@@ -484,6 +461,7 @@ public static OsProbe getInstance() {
484461
}
485462

486463
OsProbe() {
464+
487465
}
488466

489467
private final Logger logger = ESLoggerFactory.getLogger(getClass());
@@ -497,26 +475,7 @@ public OsStats osStats() {
497475
final OsStats.Cpu cpu = new OsStats.Cpu(getSystemCpuPercent(), getSystemLoadAverage());
498476
final OsStats.Mem mem = new OsStats.Mem(getTotalPhysicalMemorySize(), getFreePhysicalMemorySize());
499477
final OsStats.Swap swap = new OsStats.Swap(getTotalSwapSpaceSize(), getFreeSwapSpaceSize());
500-
final OsStats.Cgroup cgroup;
501-
if (Constants.LINUX) {
502-
final Map<String, String> controllerMap = getControlGroups();
503-
if (controllerMap.containsKey("cpuacct") && controllerMap.containsKey("cpu")) {
504-
final String cpuControlGroup = controllerMap.get("cpu");
505-
final String cpuAcctControlGroup = controllerMap.get("cpuacct");
506-
cgroup =
507-
new OsStats.Cgroup(
508-
cpuAcctControlGroup,
509-
getCgroupCpuAcctUsageNanos(cpuAcctControlGroup),
510-
cpuControlGroup,
511-
getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup),
512-
getCGroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup),
513-
getCgroupCpuAcctCpuStat(cpuControlGroup));
514-
} else {
515-
cgroup = null;
516-
}
517-
} else {
518-
cgroup = null;
519-
}
478+
final OsStats.Cgroup cgroup = Constants.LINUX ? getCgroup() : null;
520479
return new OsStats(System.currentTimeMillis(), cpu, mem, swap, cgroup);
521480
}
522481

0 commit comments

Comments
 (0)