Skip to content

Commit c2aa2df

Browse files
sargunkees
authored andcommitted
seccomp: Add wait_killable semantic to seccomp user notifier
This introduces a per-filter flag (SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) that makes it so that when notifications are received by the supervisor the notifying process will transition to wait killable semantics. Although wait killable isn't a set of semantics formally exposed to userspace, the concept is searchable. If the notifying process is signaled prior to the notification being received by the userspace agent, it will be handled as normal. One quirk about how this is handled is that the notifying process only switches to TASK_KILLABLE if it receives a wakeup from either an addfd or a signal. This is to avoid an unnecessary wakeup of the notifying task. The reasons behind switching into wait_killable only after userspace receives the notification are: * Avoiding unncessary work - Often, workloads will perform work that they may abort (request racing comes to mind). This allows for syscalls to be aborted safely prior to the notification being received by the supervisor. In this, the supervisor doesn't end up doing work that the workload does not want to complete anyways. * Avoiding side effects - We don't want the syscall to be interruptible once the supervisor starts doing work because it may not be trivial to reverse the operation. For example, unmounting a file system may take a long time, and it's hard to rollback, or treat that as reentrant. * Avoid breaking runtimes - Various runtimes do not GC when they are during a syscall (or while running native code that subsequently calls a syscall). If many notifications are blocked, and not picked up by the supervisor, this can get the application into a bad state. Signed-off-by: Sargun Dhillon <[email protected]> Signed-off-by: Kees Cook <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent 662340e commit c2aa2df

File tree

4 files changed

+54
-3
lines changed

4 files changed

+54
-3
lines changed

Documentation/userspace-api/seccomp_filter.rst

+10
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,16 @@ notifying process it will be replaced. The supervisor can also add an FD, and
271271
respond atomically by using the ``SECCOMP_ADDFD_FLAG_SEND`` flag and the return
272272
value will be the injected file descriptor number.
273273

274+
The notifying process can be preempted, resulting in the notification being
275+
aborted. This can be problematic when trying to take actions on behalf of the
276+
notifying process that are long-running and typically retryable (mounting a
277+
filesytem). Alternatively, at filter installation time, the
278+
``SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV`` flag can be set. This flag makes it
279+
such that when a user notification is received by the supervisor, the notifying
280+
process will ignore non-fatal signals until the response is sent. Signals that
281+
are sent prior to the notification being received by userspace are handled
282+
normally.
283+
274284
It is worth noting that ``struct seccomp_data`` contains the values of register
275285
arguments to the syscall, but does not contain pointers to memory. The task's
276286
memory is accessible to suitably privileged traces via ``ptrace()`` or

include/linux/seccomp.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
SECCOMP_FILTER_FLAG_LOG | \
99
SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
1010
SECCOMP_FILTER_FLAG_NEW_LISTENER | \
11-
SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
11+
SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \
12+
SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
1213

1314
/* sizeof() the first published struct seccomp_notif_addfd */
1415
#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24

include/uapi/linux/seccomp.h

+2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
2424
#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
2525
#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
26+
/* Received notifications wait in killable state (only respond to fatal signals) */
27+
#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
2628

2729
/*
2830
* All BPF programs must return a 32-bit value.

kernel/seccomp.c

+40-2
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
200200
* the filter can be freed.
201201
* @cache: cache of arch/syscall mappings to actions
202202
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
203+
* @wait_killable_recv: Put notifying process in killable state once the
204+
* notification is received by the userspace listener.
203205
* @prev: points to a previously installed, or inherited, filter
204206
* @prog: the BPF program to evaluate
205207
* @notif: the struct that holds all notification related information
@@ -220,6 +222,7 @@ struct seccomp_filter {
220222
refcount_t refs;
221223
refcount_t users;
222224
bool log;
225+
bool wait_killable_recv;
223226
struct action_cache cache;
224227
struct seccomp_filter *prev;
225228
struct bpf_prog *prog;
@@ -893,6 +896,10 @@ static long seccomp_attach_filter(unsigned int flags,
893896
if (flags & SECCOMP_FILTER_FLAG_LOG)
894897
filter->log = true;
895898

899+
/* Set wait killable flag, if present. */
900+
if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
901+
filter->wait_killable_recv = true;
902+
896903
/*
897904
* If there is an existing filter, make it the prev and don't drop its
898905
* task reference.
@@ -1080,6 +1087,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
10801087
complete(&addfd->completion);
10811088
}
10821089

1090+
static bool should_sleep_killable(struct seccomp_filter *match,
1091+
struct seccomp_knotif *n)
1092+
{
1093+
return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
1094+
}
1095+
10831096
static int seccomp_do_user_notification(int this_syscall,
10841097
struct seccomp_filter *match,
10851098
const struct seccomp_data *sd)
@@ -1110,11 +1123,25 @@ static int seccomp_do_user_notification(int this_syscall,
11101123
* This is where we wait for a reply from userspace.
11111124
*/
11121125
do {
1126+
bool wait_killable = should_sleep_killable(match, &n);
1127+
11131128
mutex_unlock(&match->notify_lock);
1114-
err = wait_for_completion_interruptible(&n.ready);
1129+
if (wait_killable)
1130+
err = wait_for_completion_killable(&n.ready);
1131+
else
1132+
err = wait_for_completion_interruptible(&n.ready);
11151133
mutex_lock(&match->notify_lock);
1116-
if (err != 0)
1134+
1135+
if (err != 0) {
1136+
/*
1137+
* Check to see if the notifcation got picked up and
1138+
* whether we should switch to wait killable.
1139+
*/
1140+
if (!wait_killable && should_sleep_killable(match, &n))
1141+
continue;
1142+
11171143
goto interrupted;
1144+
}
11181145

11191146
addfd = list_first_entry_or_null(&n.addfd,
11201147
struct seccomp_kaddfd, list);
@@ -1484,6 +1511,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
14841511
mutex_lock(&filter->notify_lock);
14851512
knotif = find_notification(filter, unotif.id);
14861513
if (knotif) {
1514+
/* Reset the process to make sure it's not stuck */
1515+
if (should_sleep_killable(filter, knotif))
1516+
complete(&knotif->ready);
14871517
knotif->state = SECCOMP_NOTIFY_INIT;
14881518
up(&filter->notif->request);
14891519
}
@@ -1829,6 +1859,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
18291859
((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
18301860
return -EINVAL;
18311861

1862+
/*
1863+
* The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense
1864+
* without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
1865+
*/
1866+
if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
1867+
((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
1868+
return -EINVAL;
1869+
18321870
/* Prepare the new filter before holding any locks. */
18331871
prepared = seccomp_prepare_user_filter(filter);
18341872
if (IS_ERR(prepared))

0 commit comments

Comments
 (0)