// These policies are heavily influenced by Docker's default profile. Further
// customization done on top:
//
// - Avoid syscalls that need root anyway. The policies here are mostly meant to
// be used by unprivileged users (not containers with root inside). The
// syscalls wouldn't be harmful, but would result in larger BPF programs that
// in turn incur more overhead.
// - Avoid rarely used syscalls that can be abused for yet more fingerprinting
// on desktop applications. This category mostly contains syscalls useful for
// profiling (e.g. mincore, cachestat).
// - Split them into categories inspired by systemd's seccomp filter sets and
// OpenBSD's pledge promises.
// Kernel asynchronous I/O contexts (the io_* family). io_uring is kept in its
// own policy.
POLICY Aio {
    ALLOW {
        io_setup, io_destroy,
        io_submit, io_cancel,
        io_getevents, io_pgetevents
    }
}
// Plain reads and writes on descriptors the process already holds.
POLICY BasicIo {
    ALLOW {
        read, readv,
        write, writev,
        tee, vmsplice,

        // ioctl() is definitely not generic/stream/basic I/O: it is really a
        // syscall in disguise that device drivers can use for anything.
        // However, any program doing file I/O, socket I/O or TTY I/O is
        // expected to eventually stumble on glibc using ioctl() for some
        // operation, so it lives in the basic I/O set, forcing the other I/O
        // categories to include it too.
        ioctl
    }
}
// Read clocks, clock resolution and process times. Nothing in this set can
// change a clock.
POLICY Clock {
    ALLOW {
        clock_getres,
        clock_gettime,
        gettimeofday,
        time,
        times
    }
}
// Compat quirks. This family of policies is a good candidate to be maintained
// in a different repo.
// Quirks needed by the x86 family.
POLICY CompatX86 {
    ALLOW {
        // Important for old ABI emulation. Only a fixed whitelist of persona
        // values is accepted.
        personality(persona) {
            persona == /*PER_LINUX=*/0 ||
            persona == /*PER_LINUX32=*/8 ||
            persona == /*UNAME26=*/0x0020000 ||
            persona == /*PER_LINUX32|UNAME26=*/0x20008 ||
            persona == /*query current persona=*/0xffffffff
        },

        // Important for the x86 family's ABI. It lives here rather than in
        // c-runtime because other archs don't need it. Ideally Kafel would
        // let us write arch_prctl@amd64 in c-runtime so that the rule is only
        // emitted when building for the amd64 arch.
        arch_prctl
    }
}
// NOTE(review): the policy name suggests this exists for 32-bit database
// engines that depend on non-linear file mappings -- confirm what "DB32"
// refers to.
POLICY CompatDB32 {
    ALLOW {
        remap_file_pages
    }
}
// Compatibility with systemd-based code.
POLICY CompatSystemd {
    ALLOW {
        // systemd uses this to get the mount-id
        name_to_handle_at
    }
}
// Compatibility with Wine (going by the policy name): permits editing the
// process' local descriptor table.
POLICY CompatWine {
    ALLOW {
        modify_ldt
    }
}
// Read-only queries of the caller's own user/group identities.
POLICY Credentials {
    ALLOW {
        getuid, geteuid, getresuid,
        getgid, getegid, getresgid,
        getgroups
    }
}
// Credential queries that can also target other processes.
POLICY CredentialsExtra {
    ALLOW {
        // systemd lists this syscall in its 'process' set, reasoning that it
        // can query arbitrary processes and is therefore a process
        // relationship syscall. Following the same reasoning, it is kept out
        // of the 'credentials' policy, as no other syscall in that category
        // allows querying arbitrary processes. It is also kept out of the
        // 'process' policy: most users of that policy won't need capget at
        // all, and it would only make the resulting BPF bigger.
        capget
    }
}
// Change the caller's user/group identities or capability sets.
POLICY CredentialsMutation {
    ALLOW {
        setuid, setreuid, setresuid, setfsuid,
        setgid, setregid, setresgid, setfsgid,
        setgroups,
        capset
    }
}
// Memory allocation, threading, syscall interaction (or libc support) and
// functions that should always be available (e.g. exit_group to bail out as a
// program's last resort).
//
// Do notice that actually opening a libc-based program requires access to many
// more syscalls as the loader is going to scrape the filesystem for the
// required libraries and do many operations to stitch the program image
// together. The idea here is to apply a filter that will allow the C runtime to
// keep running after we already have the program image in RAM.
POLICY CRuntime {
    ALLOW {
        // memory management
        brk, mmap, mremap, munmap, mprotect, madvise, map_shadow_stack,

        // futex-based synchronization
        futex, futex_requeue, futex_wait, futex_waitv, futex_wake,

        // threading support
        gettid, set_tid_address,
        get_robust_list, set_robust_list,
        get_thread_area, set_thread_area,
        membarrier, rseq, sched_yield,

        // termination and syscall restart
        exit, exit_group, restart_syscall,

        // glibc's malloc() has references to getrandom(), so it's included
        // here
        getrandom
    }
}
// These syscalls are already gated by YAMA's ptrace_scope or capabilities
// (e.g. CAP_PERFMON). The usual reasoning would be that it's safe to permit
// them, but:
//
// - They are really only useful for process inspection/debugging.
// - For IPC usage, better mechanisms exist (e.g. one can memfd+seal+mmap to
// have zero copy I/O between cooperating processes).
// - They appeared in a few CVEs in the past.
POLICY Debug {
    ALLOW {
        ptrace,
        perf_event_open,
        kcmp,
        pidfd_getfd,
        process_madvise, process_mrelease,
        process_vm_readv, process_vm_writev
    }
}
// Close, duplicate and tweak descriptors that are already open.
POLICY FileDescriptors {
    ALLOW {
        close, close_range,
        dup, dup2, dup3,
        fcntl
    }
}
// This policy is split off from filesystem so a process could still perform
// file IO on:
//
// - Already open files.
// - Files received from UNIX sockets.
// - Memfds.
POLICY FileIo {
    ALLOW {
        // positioned reads and writes
        lseek,
        pread64, preadv, preadv2,
        pwrite64, pwritev, pwritev2,

        // descriptor-to-descriptor transfers
        copy_file_range, sendfile, splice,

        // size, space and access hints
        fadvise64, fallocate, ftruncate, readahead,

        // advisory locking
        flock
    }
}
// OpenBSD's pledge further breaks down this promise into rpath, wpath, cpath
// and dpath, but Landlock would be more appropriate to mirror the intention of
// such granular designs
POLICY Filesystem {
    ALLOW {
        // access checks
        access, faccessat, faccessat2,

        // working directory
        chdir, fchdir, getcwd,

        // opening and creating
        creat, open, openat, openat2,
        mkdir, mkdirat, mknod, mknodat,

        // stat family
        stat, lstat, fstat, newfstatat, statx, statfs, fstatfs,

        // directory listing
        getdents, getdents64,

        // extended attributes (read-only)
        getxattr, lgetxattr, fgetxattr,
        listxattr, llistxattr, flistxattr,

        // inotify
        inotify_init, inotify_init1, inotify_add_watch, inotify_rm_watch,

        // hard and symbolic links
        link, linkat, symlink, symlinkat, readlink, readlinkat,

        // renaming and removing
        rename, renameat, renameat2, rmdir, unlink, unlinkat,

        // miscellaneous
        truncate, umask
    }
}
// Allowed to make explicit changes to a file's metadata: mode, ownership,
// timestamps and extended attributes.
POLICY FilesystemAttr {
    ALLOW {
        // mode
        chmod, fchmod, fchmodat, fchmodat2,

        // ownership
        chown, fchown, fchownat, lchown,

        // timestamps
        futimesat, utime, utimensat, utimes,

        // extended attributes
        setxattr, lsetxattr, fsetxattr,
        removexattr, lremovexattr, fremovexattr
    }
}
// System calls that event loops are built on.
POLICY IoEvent {
    ALLOW {
        // epoll
        epoll_create, epoll_create1,
        epoll_ctl, epoll_ctl_old,
        epoll_wait, epoll_wait_old, epoll_pwait, epoll_pwait2,

        // eventfd
        eventfd, eventfd2,

        // poll/select
        poll, ppoll, select, pselect6
    }
}
// Nowadays io_uring is considered unsafe for general usage:
// http://security.googleblog.com/2023/06/learnings-from-kctf-vrps-42-linux.html
POLICY IoUring {
    ALLOW {
        io_uring_setup,
        io_uring_register,
        io_uring_enter
    }
}
// SysV IPC, POSIX message queues and other IPC primitives.
POLICY Ipc {
    ALLOW {
        // pipes and memfds
        pipe, pipe2, memfd_create,

        // POSIX message queues
        mq_open, mq_unlink, mq_getsetattr, mq_notify,
        mq_timedreceive, mq_timedsend,

        // SysV message queues
        msgget, msgctl, msgrcv, msgsnd,

        // SysV semaphores
        semget, semctl, semop, semtimedop,

        // SysV shared memory
        shmget, shmctl, shmat, shmdt
    }
}
// Control over memory locking.
POLICY Memlock {
    ALLOW {
        mlock, mlock2, mlockall,
        munlock, munlockall,
        memfd_secret
    }
}
// Operations on sockets that already exist. Creating a socket is covered by
// the NetworkSocket* policies.
POLICY NetworkIo {
    ALLOW {
        connect, shutdown,
        getpeername, getsockname,
        getsockopt, setsockopt,
        recvfrom, recvmsg, recvmmsg,
        sendto, sendmsg, sendmmsg
    }
}
// The passive side of a socket: bind an address, listen and accept peers.
POLICY NetworkServer {
    ALLOW {
        bind,
        listen,
        accept, accept4
    }
}
// Creation of TCP sockets over IPv4 or IPv6.
POLICY NetworkSocketTcp {
    ALLOW {
        socket(domain, type, protocol) {
            // Only the low bits of type are compared, so flag bits OR'ed in
            // above the 0x7ff mask (e.g. SOCK_NONBLOCK/SOCK_CLOEXEC on x86)
            // are ignored.
            (type & 0x7ff) == /*SOCK_STREAM=*/1 &&
            // Both 0 and IPPROTO_TCP select TCP for a stream socket in these
            // domains. The explicit value is what callers forwarding
            // getaddrinfo()'s ai_protocol pass.
            (protocol == 0 || protocol == /*IPPROTO_TCP=*/6) &&
            (domain == /*AF_INET=*/2 || domain == /*AF_INET6=*/10)
        }
    }
}
// Creation of UDP sockets over IPv4 or IPv6.
POLICY NetworkSocketUdp {
    ALLOW {
        socket(domain, type, protocol) {
            // Only the low bits of type are compared, so flag bits OR'ed in
            // above the 0x7ff mask (e.g. SOCK_NONBLOCK/SOCK_CLOEXEC on x86)
            // are ignored.
            (type & 0x7ff) == /*SOCK_DGRAM=*/2 &&
            // Both 0 and IPPROTO_UDP select UDP for a datagram socket in
            // these domains. The explicit value is what callers forwarding
            // getaddrinfo()'s ai_protocol pass.
            (protocol == 0 || protocol == /*IPPROTO_UDP=*/17) &&
            (domain == /*AF_INET=*/2 || domain == /*AF_INET6=*/10)
        }
    }
}
// Creation of UNIX domain sockets and socket pairs. The socket type is not
// restricted, only the domain and protocol are.
POLICY NetworkSocketUnix {
    ALLOW {
        socket(domain, type, protocol) {
            protocol == 0 && domain == /*AF_UNIX=*/1
        },

        socketpair(domain, type, protocol) {
            protocol == 0 && domain == /*AF_UNIX=*/1
        }
    }
}
// Memory protection keys.
POLICY Pkey {
    ALLOW {
        pkey_alloc,
        pkey_free,
        pkey_mprotect
    }
}
// Process control, execution, namespacing, relationship operations.
//
// Most likely you'll ALWAYS need access to this set to sandbox other binaries:
// <https://lore.kernel.org/all/202010281500.855B950FE@keescook/T/>. It's only
// really practical to exclude this set from the seccomp filter if you're
// sandboxing yourself (i.e. cooperatively dropping further privileges before
// doing dangerous stuff). It's a shame that Linux doesn't offer this type of
// transition-on-exec mechanism for seccomp nor cgroups. Folks from SELinux
// already know just how important it is to support this kind of mechanism for
// properly dropping privileges, and it'd be good for more kernel hackers to
// learn this lesson as well.
POLICY Process {
    ALLOW {
        // Creation and execution.
        //
        // Where's clone2? ia64 is the only architecture that has clone2, but
        // ia64 doesn't implement seccomp. c.f.
        // acce2f71779c54086962fefce3833d886c655f62 in the kernel.
        clone, clone3, fork, vfork, execve, execveat,

        // identity, process groups and sessions
        getpid, getppid, getpgid, getpgrp, getsid, setpgid, setsid,

        // signalling other processes/threads
        kill, tgkill, tkill, rt_sigqueueinfo, rt_tgsigqueueinfo,
        pidfd_open, pidfd_send_signal,

        // waiting, accounting and process control
        wait4, waitid, getrusage, prctl
    }
}
// Query resource settings (limits, priorities, scheduling). Nothing in this
// set modifies them.
POLICY Resources {
    ALLOW {
        getcpu,
        getrlimit,
        getpriority, ioprio_get,
        sched_getaffinity, sched_getattr, sched_getparam, sched_getscheduler,
        sched_get_priority_max, sched_get_priority_min,
        sched_rr_get_interval
    }
}
// Modify resource settings (limits, priorities, scheduling).
POLICY ResourcesMutation {
    ALLOW {
        prlimit64, setrlimit,
        setpriority, ioprio_set,
        sched_setaffinity, sched_setattr, sched_setparam, sched_setscheduler
    }
}
// Sandboxing primitives: Landlock and seccomp.
POLICY Sandbox {
    ALLOW {
        landlock_create_ruleset,
        landlock_add_rule,
        landlock_restrict_self,
        seccomp
    }
}
// Signal handling within the process.
POLICY Signal {
    ALLOW {
        rt_sigaction, rt_sigprocmask, rt_sigpending, rt_sigreturn,
        rt_sigsuspend, rt_sigtimedwait,
        sigaltstack,
        signalfd, signalfd4,
        pause
    }
}
// Flush files and memory mappings to storage.
POLICY Sync {
    ALLOW {
        fsync, fdatasync,
        sync, syncfs, sync_file_range,
        msync
    }
}
// Schedule operations by time: sleeps, interval timers, POSIX timers and
// timerfds.
POLICY Timer {
    ALLOW {
        // sleeping
        nanosleep, clock_nanosleep,

        // alarm and interval timers
        alarm, getitimer, setitimer,

        // POSIX per-process timers
        timer_create, timer_delete, timer_getoverrun,
        timer_gettime, timer_settime,

        // timerfd
        timerfd_create, timerfd_gettime, timerfd_settime
    }
}