From 64feba56e427ada0729982a8bdc1f599ee66fd32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= Date: Sat, 25 Apr 2020 23:27:15 +0200 Subject: [PATCH] Fix earlyoom killing processes too early when ZFS is in use. The ZFS ARC cache is memory-reclaimable, like the Linux buffer cache. However, in contrast to the buffer cache, it currently does not count to `MemAvailable` (see https://github.com/openzfs/zfs/issues/10255), leading earlyoom to believe we are out of memory when we still have a lot of memory available (in practice, many GBs). Thus, until now, earlyoom tended to kill processes on ZFS systems even though there was no memory pressure. This commit fixes it by adding the `size` field of `/proc/spl/kstat/zfs/arcstats` to `MemAvailable`. The effect can be checked easily on ZFS systems: Before this commit, dropping the ARC via (command from [1]) echo 3 | sudo tee /proc/sys/vm/drop_caches would result in an increase of free memory in earlyoom's output; with this fix, it stays equal. [1]: https://serverfault.com/a/857386/128321 --- MANPAGE.md | 6 ++++++ meminfo.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/MANPAGE.md b/MANPAGE.md index e7ef8c1..a55d156 100644 --- a/MANPAGE.md +++ b/MANPAGE.md @@ -140,6 +140,12 @@ this help text 105: Could not convert number when parsing the contents of /proc/meminfo +106: Could not open /proc/spl/kstat/zfs/arcstats despite it existing + +107: Could not read /proc/spl/kstat/zfs/arcstats + +108: Could not parse /proc/spl/kstat/zfs/arcstats contents + # Why not trigger the kernel oom killer? Earlyoom does not use `echo f > /proc/sysrq-trigger` because the Chrome people diff --git a/meminfo.c b/meminfo.c index 8dfe0ae..cde4afe 100644 --- a/meminfo.c +++ b/meminfo.c @@ -57,19 +57,25 @@ static long long available_guesstimate(const char* buf) return MemFree + Cached + Buffers - Shmem; } -/* Parse /proc/meminfo. 
+/* Parse /proc/meminfo and other related files: + * * ZFS's /proc/spl/kstat/zfs/arcstats, if it exists * This function either returns valid data or kills the process * with a fatal error. */ meminfo_t parse_meminfo() { static FILE* fd; + static FILE* arcstats_fd; // Note that we do not need to close static FDs that we ensure to // `fopen()` maximally once. static int guesstimate_warned = 0; // On Linux 5.3, "wc -c /proc/meminfo" counts 1391 bytes. // 8192 should be enough for the foreseeable future. char buf[8192] = { 0 }; + // On Linux 4.19 with ZoL 0.8.3, "wc -c /proc/spl/kstat/zfs/arcstats" + // counts 4354 bytes. + // 8192 should be enough for the foreseeable future. + char arcstats_buf[8192] = { 0 }; meminfo_t m = { 0 }; if (fd == NULL) @@ -110,6 +116,58 @@ meminfo_t parse_meminfo() } } + // If ZFS is in use, compute size of its cache, which is memory-reclaimable. + // It currently does not count to `MemAvailable`, like the Linux buffer cache + // of other file systems does, see: + // * https://github.com/openzfs/zfs/issues/10255 + long long zfs_arcstats_bytes = 0; + if (arcstats_fd == NULL) + arcstats_fd = fopen("/proc/spl/kstat/zfs/arcstats", "r"); + if (arcstats_fd == NULL && errno != ENOENT) { + fatal(106, "could not open /proc/spl/kstat/zfs/arcstats: %s\n", strerror(errno)); + } + if (arcstats_fd != NULL) { + rewind(arcstats_fd); + + // Read the file; loop to handle short reads. 
+ size_t buf_offset = 0; + while (1) { + size_t len = fread(arcstats_buf + buf_offset, 1, sizeof(arcstats_buf) - 1 - buf_offset, arcstats_fd); + if (ferror(arcstats_fd)) { + perror("parse_meminfo: fread() on /proc/spl/kstat/zfs/arcstats failed"); + fatal(107, "could not read /proc/spl/kstat/zfs/arcstats\n"); + } + buf_offset += len; + if (feof(arcstats_fd)) { + break; + } + if (len == 0) { + fatal(108, "could not read /proc/spl/kstat/zfs/arcstats: 0 bytes returned\n"); + } + } + + // The format of the `arcstats` file is like this: + // name type data + // hits 4 373259415 + // ... + // size 4 1721339808 + // We scan for the "size" line and parse the second number. + + const char search_term[] = "\nsize "; + char* hit = strstr(arcstats_buf, search_term); + if (hit == NULL) { + warn("parse_meminfo: arcstats does not contain size field\n"); + } else { + // ` ` skips spaces, `%*u` ignores the `type` field. + int matches = sscanf(hit + strlen(search_term), " %*u %lld", &zfs_arcstats_bytes); + if (matches < 1) { + warn("parse_meminfo: unexpected /proc/spl/kstat/zfs/arcstats contents in size line\n"); + } + } + } + + MemAvailable += zfs_arcstats_bytes / 1024; // MemAvailable is in kB + // Calculate percentages m.MemAvailablePercent = (double)MemAvailable * 100 / (double)m.MemTotalKiB; if (m.SwapTotalKiB > 0) {