Skip to content

Commit dc43652

Browse files
committed
Set temporary single CPU affinity before cgroup cpuset transition.
This handles a corner case when joining a container whose processes all run exclusively on isolated CPU cores: it forces the kernel to schedule the runc process on the first CPU core within the cgroup's cpuset. The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 has affected this deterministic scheduling behavior by distributing tasks across CPU cores within the cgroup's cpuset. Some intensive real-time applications rely on this deterministic behavior and use the first CPU core to run a slow thread while the other CPU cores are fully used by real-time threads with the SCHED_FIFO policy. Such applications prevent the runc process from joining a container when the runc process is randomly scheduled on a CPU core owned by a real-time thread. Signed-off-by: Cédric Clerget <[email protected]>
1 parent caa6e52 commit dc43652

File tree

4 files changed

+201
-0
lines changed

4 files changed

+201
-0
lines changed

libcontainer/container_linux.go

+83
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"google.golang.org/protobuf/proto"
2929

3030
"github.com/opencontainers/runc/libcontainer/cgroups"
31+
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
3132
"github.com/opencontainers/runc/libcontainer/configs"
3233
"github.com/opencontainers/runc/libcontainer/intelrdt"
3334
"github.com/opencontainers/runc/libcontainer/system"
@@ -2246,6 +2247,34 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
22462247
})
22472248
}
22482249

2250+
// set CPU affinity
2251+
if it == initSetns && len(c.cgroupManager.GetPaths()) > 0 {
2252+
// get the target container cgroup
2253+
if cg, err := c.cgroupManager.GetCgroups(); err != nil {
2254+
return nil, fmt.Errorf("getting container cgroups: %w", err)
2255+
} else if cg.CpusetCpus != "" {
2256+
// get the isolated CPU list
2257+
d, err := os.ReadFile("/sys/devices/system/cpu/isolated")
2258+
// The above file may not be available in some environment
2259+
// due to /sys not being mounted, if not present we don't
2260+
// try to set CPU affinity and ignore the error.
2261+
// When an empty set is returned, the data length is equal
2262+
// to 1 (newline char), when set its length is greater than 1
2263+
// which means we may need to adjust CPU affinity shortly.
2264+
if err == nil && len(d) > 1 {
2265+
cpu, eligible, err := getEligibleCPU(cg.CpusetCpus, string(bytes.TrimSpace(d)))
2266+
if err != nil {
2267+
return nil, fmt.Errorf("getting eligible cpu: %w", err)
2268+
} else if eligible {
2269+
r.AddData(&Int32msg{
2270+
Type: CPUAffinityAttr,
2271+
Value: uint32(cpu),
2272+
})
2273+
}
2274+
}
2275+
}
2276+
}
2277+
22492278
return bytes.NewReader(r.Serialize()), nil
22502279
}
22512280

@@ -2280,3 +2309,57 @@ func requiresRootOrMappingTool(c *configs.Config) bool {
22802309
}
22812310
return !reflect.DeepEqual(c.GidMappings, gidMap)
22822311
}
2312+
2313+
// getEligibleCPU returns the first eligible CPU for CPU affinity before
2314+
// entering in a cgroup cpuset.
2315+
// - when there is not cpuset cores: no eligible CPU
2316+
// - when there is not isolated cores: no eligible CPU
2317+
// - when cpuset cores are all isolated cores: return first CPU of the cpuset
2318+
// - when cpuset cores are mixed between housekeeping/isolated cores: no eligible CPU.
2319+
func getEligibleCPU(cpusetList, isolatedList string) (int, bool, error) {
2320+
if isolatedList == "" || cpusetList == "" {
2321+
return 0, false, nil
2322+
}
2323+
2324+
// The target container has a cgroup cpuset, get the bit range.
2325+
cpusetBits, err := systemd.RangeToBits(cpusetList)
2326+
if err != nil {
2327+
return 0, false, fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err)
2328+
}
2329+
2330+
isolatedBits, err := systemd.RangeToBits(isolatedList)
2331+
if err != nil {
2332+
return 0, false, fmt.Errorf("parsing isolated cpus list %s: %w", isolatedList, err)
2333+
}
2334+
2335+
affinityCore := 0
2336+
isolatedCores := 0
2337+
cpusetCores := 0
2338+
2339+
// start from cpu core #0
2340+
currentCore := 0
2341+
// CPU core start from the first slice element and bits are read
2342+
// from the least to the most significant bit.
2343+
for byteRange := 0; byteRange < len(cpusetBits); byteRange++ {
2344+
for bit := 0; bit < 8; bit++ {
2345+
if cpusetBits[byteRange]&(1<<bit) != 0 {
2346+
// add the first core of the cgroup cpuset to the affinity set
2347+
if cpusetCores == 0 {
2348+
affinityCore = currentCore
2349+
}
2350+
// cpuset cores count
2351+
cpusetCores++
2352+
// isolated cores count
2353+
if byteRange < len(isolatedBits) {
2354+
if isolatedBits[byteRange]&(1<<bit) != 0 {
2355+
isolatedCores++
2356+
}
2357+
}
2358+
}
2359+
currentCore++
2360+
}
2361+
}
2362+
2363+
// we have a cpuset with only isolated cores
2364+
return affinityCore, cpusetCores > 0 && isolatedCores == cpusetCores, nil
2365+
}

libcontainer/container_linux_test.go

+84
Original file line numberDiff line numberDiff line change
@@ -286,3 +286,87 @@ func TestGetContainerStateAfterUpdate(t *testing.T) {
286286
t.Fatalf("expected Memory to be 2048 but received %q", state.Config.Cgroups.Memory)
287287
}
288288
}
289+
290+
func TestGetEligibleCPU(t *testing.T) {
291+
tests := []struct {
292+
name string
293+
cpuset string
294+
isolset string
295+
expectedErr bool
296+
expectedAffinityCore int
297+
expectedEligible bool
298+
}{
299+
{
300+
name: "no cpuset",
301+
isolset: "2-15,18-31,34-47",
302+
expectedEligible: false,
303+
},
304+
{
305+
name: "no isolated set",
306+
cpuset: "0-15",
307+
expectedEligible: false,
308+
},
309+
{
310+
name: "bad cpuset format",
311+
cpuset: "core0 to core15",
312+
isolset: "2-15,18-31,34-47",
313+
expectedErr: true,
314+
},
315+
{
316+
name: "bad isolated set format",
317+
cpuset: "0-15",
318+
isolset: "core0 to core15",
319+
expectedErr: true,
320+
},
321+
{
322+
name: "no eligible core",
323+
cpuset: "0-1,16-17,32-33",
324+
isolset: "2-15,18-31,34-47",
325+
expectedEligible: false,
326+
},
327+
{
328+
name: "no eligible core mixed",
329+
cpuset: "0-31",
330+
isolset: "2-15,18-31,34-47",
331+
expectedEligible: false,
332+
},
333+
{
334+
name: "eligible core #4",
335+
cpuset: "4-7",
336+
isolset: "2-15,18-31,34-47",
337+
expectedEligible: true,
338+
expectedAffinityCore: 4,
339+
},
340+
{
341+
name: "eligible core #40",
342+
cpuset: "40-47",
343+
isolset: "2-15,18-31,34-47",
344+
expectedEligible: true,
345+
expectedAffinityCore: 40,
346+
},
347+
{
348+
name: "eligible core #24",
349+
cpuset: "24-31",
350+
isolset: "2-15,18-31,34-47",
351+
expectedEligible: true,
352+
expectedAffinityCore: 24,
353+
},
354+
}
355+
356+
for _, tt := range tests {
357+
t.Run(tt.name, func(t *testing.T) {
358+
affinityCore, eligible, err := getEligibleCPU(tt.cpuset, tt.isolset)
359+
if err != nil && !tt.expectedErr {
360+
t.Fatalf("unexpected error: %s", err)
361+
} else if err == nil && tt.expectedErr {
362+
t.Fatalf("unexpected success")
363+
} else if tt.expectedEligible && !eligible {
364+
t.Fatalf("was expecting eligible core but no eligible core returned")
365+
} else if !tt.expectedEligible && eligible {
366+
t.Fatalf("was not expecting eligible core but got eligible core")
367+
} else if tt.expectedEligible && tt.expectedAffinityCore != affinityCore {
368+
t.Fatalf("expected affinity core %d: got %d instead", tt.expectedAffinityCore, affinityCore)
369+
}
370+
})
371+
}
372+
}

libcontainer/message_linux.go

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ const (
2222
UidmapPathAttr uint16 = 27288
2323
GidmapPathAttr uint16 = 27289
2424
MountSourcesAttr uint16 = 27290
25+
CPUAffinityAttr uint16 = 27291
2526
)
2627

2728
type Int32msg struct {

libcontainer/nsenter/nsexec.c

+33
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,9 @@ struct nlconfig_t {
9595
/* Mount sources opened outside the container userns. */
9696
char *mountsources;
9797
size_t mountsources_len;
98+
99+
/* Temporary CPU affinity before cgroup cpuset transition. */
100+
uint32_t cpu_affinity;
98101
};
99102

100103
/*
@@ -112,6 +115,7 @@ struct nlconfig_t {
112115
#define UIDMAPPATH_ATTR 27288
113116
#define GIDMAPPATH_ATTR 27289
114117
#define MOUNT_SOURCES_ATTR 27290
118+
#define CPU_AFFINITY_ATTR 27291
115119

116120
/*
117121
* Use the raw syscall for versions of glibc which don't include a function for
@@ -383,6 +387,9 @@ static void nl_parse(int fd, struct nlconfig_t *config)
383387
if (len != size)
384388
bail("failed to read netlink payload, %zu != %zu", len, size);
385389

390+
/* No cpu affinity by default: int32(-1) */
391+
config->cpu_affinity = ~0;
392+
386393
/* Parse the netlink payload. */
387394
config->data = data;
388395
while (current < data + size) {
@@ -431,6 +438,9 @@ static void nl_parse(int fd, struct nlconfig_t *config)
431438
config->mountsources = current;
432439
config->mountsources_len = payload_len;
433440
break;
441+
case CPU_AFFINITY_ATTR:
442+
config->cpu_affinity = readint32(current);
443+
break;
434444
default:
435445
bail("unknown netlink message type %d", nlattr->nla_type);
436446
}
@@ -1053,6 +1063,29 @@ void nsexec(void)
10531063
bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
10541064
}
10551065

1066+
/*
1067+
* Set temporary single CPU affinity before cgroup cpuset transition,
1068+
* this handles a corner case when joining a container having all
1069+
* the processes running exclusively on isolated CPU cores to force
1070+
* the kernel to schedule runc process on the first CPU core within the
1071+
* cgroups cpuset. The introduction of the kernel commit
1072+
* 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 has affected this deterministic
1073+
* scheduling behavior by distributing tasks across CPU cores within the
1074+
* cgroups cpuset. Some intensive real-time applications rely on this
1075+
* deterministic behavior and use the first CPU core to run a slow thread
1076+
* while other CPU cores are fully used by real-time threads with SCHED_FIFO
1077+
* policy. Such applications prevent runc process from joining a container
1078+
* when the runc process is randomly scheduled on a CPU core owned by a
1079+
* real-time thread.
1080+
*/
1081+
if ((int32_t) config.cpu_affinity >= 0) {
1082+
cpu_set_t set;
1083+
CPU_ZERO(&set);
1084+
CPU_SET(config.cpu_affinity, &set);
1085+
if (sched_setaffinity(0, sizeof(set), &set) == -1)
1086+
bail("sched_setaffinity failed");
1087+
}
1088+
10561089
/*
10571090
* TODO: What about non-namespace clone flags that we're dropping here?
10581091
*

0 commit comments

Comments
 (0)