Software sandboxing: The basics

8 min read Original article ↗
// These policies are heavily influenced by Docker's default profile. Further
// customization done on top:
//
// - Avoid syscalls that need root anyway. The policies here are mostly meant to
//   be used by unprivileged users (not containers with root inside). The
//   syscalls wouldn't be harmful, but would result in larger BPF programs that
//   in turn incur more overhead.
// - Avoid rarely used syscalls that can be abused for yet more fingerprinting
//   on desktop applications. This category mostly contains syscalls useful for
//   profiling (e.g. mincore, cachestat).
// - Split them into categories inspired by systemd's seccomp filter sets and
//   OpenBSD's pledge promises.

// Linux native asynchronous I/O (the context-based io_* interface).
POLICY Aio {
    ALLOW {
        io_setup,
        io_destroy,
        io_submit,
        io_cancel,
        io_getevents,
        io_pgetevents
    }
}

// Generic read/write syscalls operating on file descriptors.
POLICY BasicIo {
    ALLOW {
        read,
        readv,
        write,
        writev,
        tee,
        vmsplice,

        // ioctl() is definitely not generic/stream/basic I/O: it is really a
        // syscall in disguise that device drivers can use for anything.
        // However, any program doing file, socket or TTY I/O is expected to
        // eventually stumble on glibc using ioctl() for some operation, so it
        // lives in the basic I/O set, which forces the other I/O categories
        // to include it too.
        ioctl
    }
}

// Read-only clock and time queries.
POLICY Clock {
    ALLOW {
        time,
        times,
        gettimeofday,
        clock_getres,
        clock_gettime
    }
}

// Compat quirks. This family of policies is a good candidate to be maintained
// in a different repo.
POLICY CompatX86 {
    ALLOW {
        // Needed by the x86 family's ABI. It lives here rather than in the
        // C runtime policy because other architectures don't need it. Ideally
        // Kafel would let us write arch_prctl@amd64 in the C runtime policy
        // and only emit the rule when building for amd64.
        arch_prctl,

        // Important for old ABI emulation. Only the exact persona values
        // listed below are accepted; anything else is not allowed by this
        // rule.
        personality(persona) {
            persona == /*PER_LINUX=*/0 ||
            persona == /*PER_LINUX32=*/8 ||
            persona == /*UNAME26=*/0x0020000 ||
            persona == /*PER_LINUX32|UNAME26=*/0x20008 ||
            persona == /*query current persona=*/0xffffffff
        }
    }
}

POLICY CompatDB32 {
    // NOTE(review): going by the policy name, this is presumably for 32-bit
    // database engines that rely on remap_file_pages -- confirm.
    ALLOW { remap_file_pages }
}

POLICY CompatSystemd {
    // systemd calls name_to_handle_at to obtain the mount ID.
    ALLOW { name_to_handle_at }
}

POLICY CompatWine {
    // Compat quirk: modify_ldt is allowed only for consumers that opt into
    // this policy (Wine, going by the policy name).
    ALLOW { modify_ldt }
}

// Read-only queries of the caller's own user and group identity.
POLICY Credentials {
    ALLOW {
        // user IDs
        getuid, geteuid, getresuid,

        // group IDs
        getgid, getegid, getresgid, getgroups
    }
}

POLICY CredentialsExtra {
    // systemd files capget under its 'process' set, arguing that a syscall
    // able to query arbitrary processes is a process-relationship syscall.
    // By the same reasoning it is kept out of the 'credentials' policy here,
    // since nothing else in that category can query arbitrary processes. It
    // is also kept out of the 'process' policy: most users of that policy
    // never need capget, and including it would only make the resulting BPF
    // bigger.
    ALLOW { capget }
}

// Syscalls that change the caller's user/group IDs or capability sets.
POLICY CredentialsMutation {
    ALLOW {
        // user IDs
        setuid, setreuid, setresuid, setfsuid,

        // group IDs
        setgid, setregid, setresgid, setfsgid, setgroups,

        // capabilities
        capset
    }
}

// Memory allocation, threading, syscall interaction (or libc support) and
// functions that should always be available (e.g. exit_group to bail out as a
// program's last resort).
//
// Do notice that actually opening a libc-based program requires access to many
// more syscalls as the loader is going to scrape the filesystem for the
// required libraries and do many operations to stitch the program image
// together. The idea here is to apply a filter that will allow the C runtime to
// keep running after we already have the program image in RAM.
POLICY CRuntime {
    ALLOW {
        // memory management
        brk, madvise, map_shadow_stack, mmap, mprotect, mremap, munmap,

        // threading and synchronization
        futex, futex_requeue, futex_wait, futex_waitv, futex_wake,
        get_robust_list, get_thread_area, gettid, membarrier, rseq,
        sched_yield, set_robust_list, set_thread_area, set_tid_address,

        // termination and syscall restart
        exit, exit_group, restart_syscall,

        // glibc's malloc() references getrandom(), hence its presence here
        getrandom
    }
}

// These syscalls are already gated by YAMA's ptrace_scope or capabilities
// (e.g. CAP_PERFMON). The usual reasoning would be that it's safe to permit
// them, but:
//
// - They are really only useful for process inspection/debugging.
// - For IPC usage, better mechanisms exist (e.g. one can memfd+seal+mmap to
//   have zero copy I/O between cooperating processes).
// - They appeared in a few CVEs in the past.
POLICY Debug {
    ALLOW {
        ptrace,
        perf_event_open,
        kcmp,
        pidfd_getfd,
        process_vm_readv,
        process_vm_writev,
        process_madvise,
        process_mrelease
    }
}

// File descriptor management: closing, duplicating and fcntl().
POLICY FileDescriptors {
    ALLOW {
        close,
        close_range,
        dup,
        dup2,
        dup3,
        fcntl
    }
}

// This policy is split off from filesystem so a process could still perform
// file IO on:
//
// - Already open files.
// - Files received from UNIX sockets.
// - Memfds.
POLICY FileIo {
    ALLOW {
        // positioned reads and writes
        lseek, pread64, preadv, preadv2, pwrite64, pwritev, pwritev2,

        // descriptor-to-descriptor transfers
        copy_file_range, sendfile, splice,

        // size, space and access hints
        fadvise64, fallocate, ftruncate, readahead,

        // advisory locking
        flock
    }
}

// OpenBSD's pledge further breaks down this promise into rpath, wpath, cpath
// and dpath, but Landlock would be more appropriate to mirror the intention of
// such granular designs
POLICY Filesystem {
    ALLOW {
        // working directory and file creation mask
        chdir, fchdir, getcwd, umask,

        // opening and creating
        creat, open, openat, openat2, mkdir, mkdirat, mknod, mknodat,

        // linking, renaming, removing, truncating by path
        link, linkat, symlink, symlinkat, readlink, readlinkat,
        rename, renameat, renameat2, rmdir, unlink, unlinkat, truncate,

        // metadata and accessibility queries
        access, faccessat, faccessat2, fstat, fstatfs, lstat, newfstatat,
        stat, statfs, statx,

        // reading extended attributes
        fgetxattr, flistxattr, getxattr, lgetxattr, listxattr, llistxattr,

        // directory listing
        getdents, getdents64,

        // change notification
        inotify_add_watch, inotify_init, inotify_init1, inotify_rm_watch
    }
}

// Explicit changes to a file's metadata: mode, ownership, timestamps and
// extended attributes.
POLICY FilesystemAttr {
    ALLOW {
        // mode
        chmod, fchmod, fchmodat, fchmodat2,

        // ownership
        chown, fchown, fchownat, lchown,

        // timestamps
        futimesat, utime, utimensat, utimes,

        // writing and removing extended attributes
        fremovexattr, fsetxattr, lremovexattr, lsetxattr, removexattr,
        setxattr
    }
}

// Syscalls used to build event loops: readiness polling and eventfd.
POLICY IoEvent {
    ALLOW {
        // epoll
        epoll_create, epoll_create1, epoll_ctl, epoll_ctl_old,
        epoll_pwait, epoll_pwait2, epoll_wait, epoll_wait_old,

        // eventfd
        eventfd, eventfd2,

        // poll and select
        poll, ppoll, pselect6, select
    }
}

// io_uring nowadays is considered unsafe for general usage:
// http://security.googleblog.com/2023/06/learnings-from-kctf-vrps-42-linux.html
POLICY IoUring {
    ALLOW {
        io_uring_setup,
        io_uring_register,
        io_uring_enter
    }
}

// Inter-process communication: SysV IPC, POSIX message queues, pipes and
// memfds.
POLICY Ipc {
    ALLOW {
        // memfd and pipes
        memfd_create, pipe, pipe2,

        // POSIX message queues
        mq_getsetattr, mq_notify, mq_open, mq_timedreceive, mq_timedsend,
        mq_unlink,

        // SysV message queues
        msgctl, msgget, msgrcv, msgsnd,

        // SysV semaphores
        semctl, semget, semop, semtimedop,

        // SysV shared memory
        shmat, shmctl, shmdt, shmget
    }
}

// Control over locking memory, plus memfd_secret.
POLICY Memlock {
    ALLOW {
        mlock,
        mlock2,
        mlockall,
        munlock,
        munlockall,
        memfd_secret
    }
}

// Operations on existing sockets. Creating sockets and accepting
// connections are covered by separate policies.
POLICY NetworkIo {
    ALLOW {
        // connection setup and teardown
        connect, shutdown,

        // addresses and options
        getpeername, getsockname, getsockopt, setsockopt,

        // receiving
        recvfrom, recvmsg, recvmmsg,

        // sending
        sendto, sendmsg, sendmmsg
    }
}

// Passive (listening) side of a socket.
POLICY NetworkServer {
    ALLOW {
        bind,
        listen,
        accept,
        accept4
    }
}

// Creation of TCP sockets over IPv4 or IPv6.
//
// Only the bits of `type` under the 0x7ff mask are compared, so higher flag
// bits OR'ed into `type` do not affect the match.
// NOTE(review): SOCK_NONBLOCK/SOCK_CLOEXEC sit above this mask on common
// architectures such as x86 -- confirm for other targets.
//
// Both the default protocol (0) and an explicit IPPROTO_TCP are accepted:
// they select the same protocol for SOCK_STREAM on AF_INET/AF_INET6, and
// callers that forward getaddrinfo()'s ai_protocol pass the explicit value.
POLICY NetworkSocketTcp {
    ALLOW {
        socket(domain, type, protocol) {
            (type & 0x7ff) == /*SOCK_STREAM=*/1 &&
            (protocol == 0 || protocol == /*IPPROTO_TCP=*/6) &&
            (domain == /*AF_INET=*/2 || domain == /*AF_INET6=*/10)
        }
    }
}

// Creation of UDP sockets over IPv4 or IPv6.
//
// Only the bits of `type` under the 0x7ff mask are compared, so higher flag
// bits OR'ed into `type` do not affect the match.
// NOTE(review): SOCK_NONBLOCK/SOCK_CLOEXEC sit above this mask on common
// architectures such as x86 -- confirm for other targets.
//
// Both the default protocol (0) and an explicit IPPROTO_UDP are accepted:
// they select the same protocol for SOCK_DGRAM on AF_INET/AF_INET6, and
// callers that forward getaddrinfo()'s ai_protocol pass the explicit value.
// Other datagram protocols remain outside this rule.
POLICY NetworkSocketUdp {
    ALLOW {
        socket(domain, type, protocol) {
            (type & 0x7ff) == /*SOCK_DGRAM=*/2 &&
            (protocol == 0 || protocol == /*IPPROTO_UDP=*/17) &&
            (domain == /*AF_INET=*/2 || domain == /*AF_INET6=*/10)
        }
    }
}

// Creation of UNIX-domain sockets and socket pairs. The socket type is not
// inspected, so any type is accepted as long as the domain is AF_UNIX and
// the protocol is 0.
POLICY NetworkSocketUnix {
    ALLOW {
        socketpair(domain, type, protocol) {
            protocol == 0 && domain == /*AF_UNIX=*/1
        },
        socket(domain, type, protocol) {
            protocol == 0 && domain == /*AF_UNIX=*/1
        }
    }
}

// Memory protection keys.
POLICY Pkey {
    ALLOW {
        pkey_alloc,
        pkey_free,
        pkey_mprotect
    }
}

// Process control, execution, namespacing, relationship operations.
//
// Most likely you'll ALWAYS need access to this set to sandbox other binaries:
// <https://lore.kernel.org/all/202010281500.855B950FE@keescook/T/>. It's only
// really practical to exclude this set from the seccomp filter if you're
// sandboxing yourself (i.e. cooperatively dropping further privileges before
// doing dangerous stuff). It's a shame that Linux doesn't offer a
// transition-on-exec mechanism for seccomp or cgroups. Folks from SELinux
// already know just how important it is to support this kind of mechanism for
// properly dropping privileges, and it'd be good for more kernel hackers to
// learn this lesson as well.
POLICY Process {
    ALLOW {
        // Creation and execution.
        //
        // Where's clone2? ia64 is the only architecture that has clone2, but
        // ia64 doesn't implement seccomp. c.f.
        // acce2f71779c54086962fefce3833d886c655f62 in the kernel.
        clone, clone3, execve, execveat, fork, vfork,

        // waiting on children
        wait4, waitid,

        // process, group and session identity
        getpgid, getpgrp, getpid, getppid, getsid, setpgid, setsid,

        // sending signals
        kill, pidfd_send_signal, rt_sigqueueinfo, rt_tgsigqueueinfo, tgkill,
        tkill,

        // miscellaneous process control
        getrusage, pidfd_open, prctl
    }
}

// Read-only queries of scheduling, priority and resource-limit settings.
POLICY Resources {
    ALLOW {
        // priorities and limits
        getpriority, getrlimit, ioprio_get,

        // CPU placement
        getcpu, sched_getaffinity,

        // scheduler parameters
        sched_getattr, sched_getparam, sched_getscheduler,
        sched_get_priority_max, sched_get_priority_min,
        sched_rr_get_interval
    }
}

// Syscalls that change scheduling, priority and resource-limit settings.
POLICY ResourcesMutation {
    ALLOW {
        // priorities and limits
        setpriority, setrlimit, prlimit64, ioprio_set,

        // CPU placement
        sched_setaffinity,

        // scheduler parameters
        sched_setattr, sched_setparam, sched_setscheduler
    }
}

// Lets the process apply further sandboxing to itself via seccomp and
// Landlock.
POLICY Sandbox {
    ALLOW {
        seccomp,
        landlock_create_ruleset,
        landlock_add_rule,
        landlock_restrict_self
    }
}

// Handling of signals delivered to the process.
POLICY Signal {
    ALLOW {
        // handlers, masks and alternate stack
        rt_sigaction, rt_sigprocmask, rt_sigreturn, sigaltstack,

        // waiting for and inspecting signals
        pause, rt_sigpending, rt_sigsuspend, rt_sigtimedwait,

        // descriptor-based delivery
        signalfd, signalfd4
    }
}

// Flushing of files and memory mappings to storage.
POLICY Sync {
    ALLOW {
        sync,
        syncfs,
        fsync,
        fdatasync,
        sync_file_range,
        msync
    }
}

// Sleeping and scheduling work by time.
POLICY Timer {
    ALLOW {
        // sleeping
        nanosleep, clock_nanosleep,

        // alarm and interval timers
        alarm, getitimer, setitimer,

        // POSIX per-process timers
        timer_create, timer_delete, timer_getoverrun, timer_gettime,
        timer_settime,

        // descriptor-based timers
        timerfd_create, timerfd_gettime, timerfd_settime
    }
}