Skip to content

Commit 54d4737

Browse files
authored
Reapply "Handle cgroups v2 in OsProbe (#76883)" (#77106)
Re-apply #76883. Somehow a line was missed from security.policy.
1 parent 46dd252 commit 54d4737

4 files changed

Lines changed: 293 additions & 103 deletions

File tree

qa/os/src/test/java/org/elasticsearch/packaging/test/DockerTests.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,6 @@ public void test131InitProcessHasCorrectPID() {
849849
/**
850850
* Check that Elasticsearch reports per-node cgroup information.
851851
*/
852-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/76812")
853852
public void test140CgroupOsStatsAreAvailable() throws Exception {
854853
waitForElasticsearch(installation, USERNAME, PASSWORD);
855854

server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java

Lines changed: 208 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -30,25 +30,31 @@
3030
import java.util.regex.Matcher;
3131
import java.util.regex.Pattern;
3232
import java.util.stream.Collectors;
33+
import java.util.stream.Stream;
3334

3435
/**
3536
* The {@link OsProbe} class retrieves information about the physical and swap size of the machine
3637
* memory, as well as the system load average and cpu load.
3738
*
38-
* In some exceptional cases, it's possible the underlying native methods used by
39+
* <p>In some exceptional cases, it's possible the underlying native methods used by
3940
* {@link #getFreePhysicalMemorySize()}, {@link #getTotalPhysicalMemorySize()},
4041
* {@link #getFreeSwapSpaceSize()}, and {@link #getTotalSwapSpaceSize()} can return a
4142
* negative value. Because of this, we prevent those methods from returning negative values,
4243
* returning 0 instead.
4344
*
44-
* The OS can report a negative number in a number of cases:
45-
* - Non-supported OSes (HP-UX, or AIX)
46-
* - A failure of macOS to initialize host statistics
47-
* - An OS that does not support the {@code _SC_PHYS_PAGES} or {@code _SC_PAGE_SIZE} flags for the {@code sysconf()} linux kernel call
48-
* - An overflow of the product of {@code _SC_PHYS_PAGES} and {@code _SC_PAGE_SIZE}
49-
* - An error case retrieving these values from a linux kernel
50-
* - A non-standard libc implementation not implementing the required values
51-
* For a more exhaustive explanation, see https://github.com/elastic/elasticsearch/pull/42725
45+
* <p>The OS can report a negative number in a number of cases:
46+
*
47+
* <ul>
48+
* <li>Non-supported OSes (HP-UX, or AIX)
49+
* <li>A failure of macOS to initialize host statistics
50+
* <li>An OS that does not support the {@code _SC_PHYS_PAGES} or {@code _SC_PAGE_SIZE} flags for the {@code sysconf()} linux kernel call
51+
* <li>An overflow of the product of {@code _SC_PHYS_PAGES} and {@code _SC_PAGE_SIZE}
52+
* <li>An error case retrieving these values from a linux kernel
53+
* <li>A non-standard libc implementation not implementing the required values
54+
* </ul>
55+
*
56+
* <p>For a more exhaustive explanation, see <a href="https://github.com/elastic/elasticsearch/pull/42725"
57+
* >https://github.com/elastic/elasticsearch/pull/42725</a>
5258
*/
5359
public class OsProbe {
5460

@@ -178,7 +184,7 @@ final double[] getSystemLoadAverage() {
178184
final String procLoadAvg = readProcLoadavg();
179185
assert procLoadAvg.matches("(\\d+\\.\\d+\\s+){3}\\d+/\\d+\\s+\\d+");
180186
final String[] fields = procLoadAvg.split("\\s+");
181-
return new double[]{Double.parseDouble(fields[0]), Double.parseDouble(fields[1]), Double.parseDouble(fields[2])};
187+
return new double[] { Double.parseDouble(fields[0]), Double.parseDouble(fields[1]), Double.parseDouble(fields[2]) };
182188
} catch (final IOException e) {
183189
if (logger.isDebugEnabled()) {
184190
logger.debug("error reading /proc/loadavg", e);
@@ -192,7 +198,7 @@ final double[] getSystemLoadAverage() {
192198
}
193199
try {
194200
final double oneMinuteLoadAverage = (double) getSystemLoadAverage.invoke(osMxBean);
195-
return new double[]{oneMinuteLoadAverage >= 0 ? oneMinuteLoadAverage : -1, -1, -1};
201+
return new double[] { oneMinuteLoadAverage >= 0 ? oneMinuteLoadAverage : -1, -1, -1 };
196202
} catch (IllegalAccessException | InvocationTargetException e) {
197203
if (logger.isDebugEnabled()) {
198204
logger.debug("error reading one minute load average from operating system", e);
@@ -318,6 +324,23 @@ String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) throws IOEx
318324
return readSingleLine(PathUtils.get("/sys/fs/cgroup/cpuacct", controlGroup, "cpuacct.usage"));
319325
}
320326

327+
private long[] getCgroupV2CpuLimit(String controlGroup) throws IOException {
328+
String entry = readCgroupV2CpuLimit(controlGroup);
329+
String[] parts = entry.split("\\s+");
330+
assert parts.length == 2 : "Expected 2 fields in [cpu.max]";
331+
332+
long[] values = new long[2];
333+
334+
values[0] = "max".equals(parts[0]) ? -1L : Long.parseLong(parts[0]);
335+
values[1] = Long.parseLong(parts[1]);
336+
return values;
337+
}
338+
339+
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpu.max")
340+
String readCgroupV2CpuLimit(String controlGroup) throws IOException {
341+
return readSingleLine(PathUtils.get("/sys/fs/cgroup/", controlGroup, "cpu.max"));
342+
}
343+
321344
/**
322345
* The total period of time in microseconds for how frequently the Elasticsearch control group's access to CPU resources will be
323346
* reallocated.
@@ -454,6 +477,35 @@ String readSysFsCgroupMemoryLimitInBytes(final String controlGroup) throws IOExc
454477
return readSingleLine(PathUtils.get("/sys/fs/cgroup/memory", controlGroup, "memory.limit_in_bytes"));
455478
}
456479

480+
/**
481+
* The maximum amount of user memory (including file cache).
482+
* If there is no limit then some Linux versions return the maximum value that can be stored in an
483+
* unsigned 64 bit number, and this will overflow a long, hence the result type is <code>String</code>.
484+
* (The alternative would have been <code>BigInteger</code> but then it would not be possible to index
485+
* the OS stats document into Elasticsearch without losing information, as <code>BigInteger</code> is
486+
* not a supported Elasticsearch type.)
487+
*
488+
* @param controlGroup the control group for the Elasticsearch process for the {@code memory} subsystem
489+
* @return the maximum amount of user memory (including file cache)
490+
* @throws IOException if an I/O exception occurs reading {@code memory.limit_in_bytes} for the control group
491+
*/
492+
private String getCgroupV2MemoryLimitInBytes(final String controlGroup) throws IOException {
493+
return readSysFsCgroupV2MemoryLimitInBytes(controlGroup);
494+
}
495+
496+
/**
497+
* Returns the line from {@code memory.max} for the control group to which the Elasticsearch process belongs for the
498+
* {@code memory} subsystem. This line represents the maximum amount of user memory (including file cache).
499+
*
500+
* @param controlGroup the control group to which the Elasticsearch process belongs for the {@code memory} subsystem
501+
* @return the line from {@code memory.max}
502+
* @throws IOException if an I/O exception occurs reading {@code memory.max} for the control group
503+
*/
504+
@SuppressForbidden(reason = "access /sys/fs/cgroup/memory.max")
505+
String readSysFsCgroupV2MemoryLimitInBytes(final String controlGroup) throws IOException {
506+
return readSingleLine(PathUtils.get("/sys/fs/cgroup/", controlGroup, "memory.max"));
507+
}
508+
457509
/**
458510
* The total current memory usage by processes in the cgroup (in bytes).
459511
* If there is no limit then some Linux versions return the maximum value that can be stored in an
@@ -483,27 +535,87 @@ String readSysFsCgroupMemoryUsageInBytes(final String controlGroup) throws IOExc
483535
return readSingleLine(PathUtils.get("/sys/fs/cgroup/memory", controlGroup, "memory.usage_in_bytes"));
484536
}
485537

538+
/**
539+
* The total current memory usage by processes in the cgroup (in bytes).
540+
* If there is no limit then some Linux versions return the maximum value that can be stored in an
541+
* unsigned 64 bit number, and this will overflow a long, hence the result type is <code>String</code>.
542+
* (The alternative would have been <code>BigInteger</code> but then it would not be possible to index
543+
* the OS stats document into Elasticsearch without losing information, as <code>BigInteger</code> is
544+
* not a supported Elasticsearch type.)
545+
*
546+
* @param controlGroup the control group for the Elasticsearch process for the {@code memory} subsystem
547+
* @return the total current memory usage by processes in the cgroup (in bytes)
548+
* @throws IOException if an I/O exception occurs reading {@code memory.current} for the control group
549+
*/
550+
private String getCgroupV2MemoryUsageInBytes(final String controlGroup) throws IOException {
551+
return readSysFsCgroupV2MemoryUsageInBytes(controlGroup);
552+
}
553+
554+
/**
555+
* Returns the line from {@code memory.current} for the control group to which the Elasticsearch process belongs for the
556+
* {@code memory} subsystem. This line represents the total current memory usage by processes in the cgroup (in bytes).
557+
*
558+
* @param controlGroup the control group to which the Elasticsearch process belongs for the {@code memory} subsystem
559+
* @return the line from {@code memory.current}
560+
* @throws IOException if an I/O exception occurs reading {@code memory.current} for the control group
561+
*/
562+
@SuppressForbidden(reason = "access /sys/fs/cgroup/memory.current")
563+
String readSysFsCgroupV2MemoryUsageInBytes(final String controlGroup) throws IOException {
564+
return readSingleLine(PathUtils.get("/sys/fs/cgroup/", controlGroup, "memory.current"));
565+
}
566+
486567
/**
487568
* Checks if cgroup stats are available by checking for the existence of {@code /proc/self/cgroup}, {@code /sys/fs/cgroup/cpu},
488569
* {@code /sys/fs/cgroup/cpuacct} and {@code /sys/fs/cgroup/memory}.
489570
*
490571
* @return {@code true} if the stats are available, otherwise {@code false}
491572
*/
492573
@SuppressForbidden(reason = "access /proc/self/cgroup, /sys/fs/cgroup/cpu, /sys/fs/cgroup/cpuacct and /sys/fs/cgroup/memory")
493-
boolean areCgroupStatsAvailable() {
574+
boolean areCgroupStatsAvailable() throws IOException {
494575
if (Files.exists(PathUtils.get("/proc/self/cgroup")) == false) {
495576
return false;
496577
}
497-
if (Files.exists(PathUtils.get("/sys/fs/cgroup/cpu")) == false) {
498-
return false;
499-
}
500-
if (Files.exists(PathUtils.get("/sys/fs/cgroup/cpuacct")) == false) {
501-
return false;
578+
579+
List<String> lines = readProcSelfCgroup();
580+
581+
// cgroup v2
582+
if (lines.size() == 1 && lines.get(0).startsWith("0::")) {
583+
return Stream.of("/sys/fs/cgroup/cpu.stat", "/sys/fs/cgroup/memory.stat").allMatch(path -> Files.exists(PathUtils.get(path)));
502584
}
503-
if (Files.exists(PathUtils.get("/sys/fs/cgroup/memory")) == false) {
504-
return false;
585+
586+
return Stream.of("/sys/fs/cgroup/cpu", "/sys/fs/cgroup/cpuacct", "/sys/fs/cgroup/memory")
587+
.allMatch(path -> Files.exists(PathUtils.get(path)));
588+
}
589+
590+
/**
591+
* The CPU statistics for all tasks in the Elasticsearch control group.
592+
*
593+
* @param controlGroup the control group to which the Elasticsearch process belongs for the {@code memory} subsystem
594+
* @return the CPU statistics
595+
* @throws IOException if an I/O exception occurs reading {@code cpu.stat} for the control group
596+
*/
597+
private Map<String, Long> getCgroupV2CpuStats(String controlGroup) throws IOException {
598+
final List<String> lines = readCgroupV2CpuStats(controlGroup);
599+
final Map<String, Long> stats = new HashMap<>();
600+
601+
for (String line : lines) {
602+
String[] parts = line.split("\\s+");
603+
assert parts.length == 2 : "Corrupt cpu.stat line: [" + line + "]";
604+
stats.put(parts[0], Long.parseLong(parts[1]));
505605
}
506-
return true;
606+
607+
final List<String> expectedKeys = List.of("nr_periods", "nr_throttled", "system_usec", "throttled_usec", "usage_usec", "user_usec");
608+
expectedKeys.forEach(key -> {
609+
assert stats.containsKey(key) : key;
610+
assert stats.get(key) != -1 : stats.get(key);
611+
});
612+
613+
return stats;
614+
}
615+
616+
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpu.stat")
617+
List<String> readCgroupV2CpuStats(final String controlGroup) throws IOException {
618+
return Files.readAllLines(PathUtils.get("/sys/fs/cgroup", controlGroup, "cpu.stat"));
507619
}
508620

509621
/**
@@ -515,45 +627,79 @@ private OsStats.Cgroup getCgroup() {
515627
try {
516628
if (areCgroupStatsAvailable() == false) {
517629
return null;
518-
} else {
519-
final Map<String, String> controllerMap = getControlGroups();
520-
assert controllerMap.isEmpty() == false;
630+
}
631+
632+
final Map<String, String> controllerMap = getControlGroups();
633+
assert controllerMap.isEmpty() == false;
521634

522-
final String cpuAcctControlGroup = controllerMap.get("cpuacct");
635+
final String cpuAcctControlGroup;
636+
final long cgroupCpuAcctUsageNanos;
637+
final long cgroupCpuAcctCpuCfsPeriodMicros;
638+
final long cgroupCpuAcctCpuCfsQuotaMicros;
639+
final String cpuControlGroup;
640+
final OsStats.Cgroup.CpuStat cpuStat;
641+
final String memoryControlGroup;
642+
final String cgroupMemoryLimitInBytes;
643+
final String cgroupMemoryUsageInBytes;
644+
645+
if (controllerMap.size() == 1 && controllerMap.containsKey("")) {
646+
// There's a single hierarchy for all controllers
647+
cpuControlGroup = cpuAcctControlGroup = memoryControlGroup = controllerMap.get("");
648+
649+
// `cpuacct` was merged with `cpu` in v2
650+
final Map<String, Long> cpuStatsMap = getCgroupV2CpuStats(cpuControlGroup);
651+
652+
cgroupCpuAcctUsageNanos = cpuStatsMap.get("usage_usec");
653+
654+
long[] cpuLimits = getCgroupV2CpuLimit(cpuControlGroup);
655+
cgroupCpuAcctCpuCfsQuotaMicros = cpuLimits[0];
656+
cgroupCpuAcctCpuCfsPeriodMicros = cpuLimits[1];
657+
658+
cpuStat = new OsStats.Cgroup.CpuStat(
659+
cpuStatsMap.get("nr_periods"),
660+
cpuStatsMap.get("nr_throttled"),
661+
cpuStatsMap.get("throttled_usec")
662+
);
663+
664+
cgroupMemoryLimitInBytes = getCgroupV2MemoryLimitInBytes(memoryControlGroup);
665+
cgroupMemoryUsageInBytes = getCgroupV2MemoryUsageInBytes(memoryControlGroup);
666+
} else {
667+
cpuAcctControlGroup = controllerMap.get("cpuacct");
523668
if (cpuAcctControlGroup == null) {
524669
logger.debug("no [cpuacct] data found in cgroup stats");
525670
return null;
526671
}
527-
final long cgroupCpuAcctUsageNanos = getCgroupCpuAcctUsageNanos(cpuAcctControlGroup);
672+
cgroupCpuAcctUsageNanos = getCgroupCpuAcctUsageNanos(cpuAcctControlGroup);
528673

529-
final String cpuControlGroup = controllerMap.get("cpu");
674+
cpuControlGroup = controllerMap.get("cpu");
530675
if (cpuControlGroup == null) {
531676
logger.debug("no [cpu] data found in cgroup stats");
532677
return null;
533678
}
534-
final long cgroupCpuAcctCpuCfsPeriodMicros = getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup);
535-
final long cgroupCpuAcctCpuCfsQuotaMicros = getCgroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup);
536-
final OsStats.Cgroup.CpuStat cpuStat = getCgroupCpuAcctCpuStat(cpuControlGroup);
679+
cgroupCpuAcctCpuCfsPeriodMicros = getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup);
680+
cgroupCpuAcctCpuCfsQuotaMicros = getCgroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup);
681+
cpuStat = getCgroupCpuAcctCpuStat(cpuControlGroup);
537682

538-
final String memoryControlGroup = controllerMap.get("memory");
683+
memoryControlGroup = controllerMap.get("memory");
539684
if (memoryControlGroup == null) {
540685
logger.debug("no [memory] data found in cgroup stats");
541686
return null;
542687
}
543-
final String cgroupMemoryLimitInBytes = getCgroupMemoryLimitInBytes(memoryControlGroup);
544-
final String cgroupMemoryUsageInBytes = getCgroupMemoryUsageInBytes(memoryControlGroup);
545-
546-
return new OsStats.Cgroup(
547-
cpuAcctControlGroup,
548-
cgroupCpuAcctUsageNanos,
549-
cpuControlGroup,
550-
cgroupCpuAcctCpuCfsPeriodMicros,
551-
cgroupCpuAcctCpuCfsQuotaMicros,
552-
cpuStat,
553-
memoryControlGroup,
554-
cgroupMemoryLimitInBytes,
555-
cgroupMemoryUsageInBytes);
688+
cgroupMemoryLimitInBytes = getCgroupMemoryLimitInBytes(memoryControlGroup);
689+
cgroupMemoryUsageInBytes = getCgroupMemoryUsageInBytes(memoryControlGroup);
556690
}
691+
692+
return new OsStats.Cgroup(
693+
cpuAcctControlGroup,
694+
cgroupCpuAcctUsageNanos,
695+
cpuControlGroup,
696+
cgroupCpuAcctCpuCfsPeriodMicros,
697+
cgroupCpuAcctCpuCfsQuotaMicros,
698+
cpuStat,
699+
memoryControlGroup,
700+
cgroupMemoryLimitInBytes,
701+
cgroupMemoryUsageInBytes
702+
);
557703
} catch (final IOException e) {
558704
logger.debug("error reading control group stats", e);
559705
return null;
@@ -576,13 +722,14 @@ public static OsProbe getInstance() {
576722

577723
OsInfo osInfo(long refreshInterval, int allocatedProcessors) throws IOException {
578724
return new OsInfo(
579-
refreshInterval,
580-
Runtime.getRuntime().availableProcessors(),
581-
allocatedProcessors,
582-
Constants.OS_NAME,
583-
getPrettyName(),
584-
Constants.OS_ARCH,
585-
Constants.OS_VERSION);
725+
refreshInterval,
726+
Runtime.getRuntime().availableProcessors(),
727+
allocatedProcessors,
728+
Constants.OS_NAME,
729+
getPrettyName(),
730+
Constants.OS_ARCH,
731+
Constants.OS_VERSION
732+
);
586733
}
587734

588735
private String getPrettyName() throws IOException {
@@ -594,11 +741,13 @@ private String getPrettyName() throws IOException {
594741
* wrapped in single- or double-quotes.
595742
*/
596743
final List<String> etcOsReleaseLines = readOsRelease();
597-
final List<String> prettyNameLines =
598-
etcOsReleaseLines.stream().filter(line -> line.startsWith("PRETTY_NAME")).collect(Collectors.toList());
744+
final List<String> prettyNameLines = etcOsReleaseLines.stream()
745+
.filter(line -> line.startsWith("PRETTY_NAME"))
746+
.collect(Collectors.toList());
599747
assert prettyNameLines.size() <= 1 : prettyNameLines;
600-
final Optional<String> maybePrettyNameLine =
601-
prettyNameLines.size() == 1 ? Optional.of(prettyNameLines.get(0)) : Optional.empty();
748+
final Optional<String> maybePrettyNameLine = prettyNameLines.size() == 1
749+
? Optional.of(prettyNameLines.get(0))
750+
: Optional.empty();
602751
if (maybePrettyNameLine.isPresent()) {
603752
// we trim since some OS contain trailing space, for example, Oracle Linux Server 6.9 has a trailing space after the quote
604753
final String trimmedPrettyNameLine = maybePrettyNameLine.get().trim();
@@ -695,11 +844,15 @@ boolean isDebian8() throws IOException {
695844
return Constants.LINUX && getPrettyName().equals("Debian GNU/Linux 8 (jessie)");
696845
}
697846

847+
OsStats.Cgroup getCgroup(boolean isLinux) {
848+
return isLinux ? getCgroup() : null;
849+
}
850+
698851
public OsStats osStats() {
699852
final OsStats.Cpu cpu = new OsStats.Cpu(getSystemCpuPercent(), getSystemLoadAverage());
700853
final OsStats.Mem mem = new OsStats.Mem(getTotalPhysicalMemorySize(), getFreePhysicalMemorySize());
701854
final OsStats.Swap swap = new OsStats.Swap(getTotalSwapSpaceSize(), getFreeSwapSpaceSize());
702-
final OsStats.Cgroup cgroup = Constants.LINUX ? getCgroup() : null;
855+
final OsStats.Cgroup cgroup = getCgroup(Constants.LINUX);
703856
return new OsStats(System.currentTimeMillis(), cpu, mem, swap, cgroup);
704857
}
705858

0 commit comments

Comments
 (0)