diff -aurN -x '*.orig' src-clean/contrib/openbsm/libbsm/bsm_errno.c src/contrib/openbsm/libbsm/bsm_errno.c --- src-clean/contrib/openbsm/libbsm/bsm_errno.c 2010-08-25 10:09:23.000000000 +0200 +++ src/contrib/openbsm/libbsm/bsm_errno.c 2010-08-25 10:24:35.000000000 +0200 @@ -684,6 +684,13 @@ ERRNO_NO_LOCAL_MAPPING, #endif ES("Key was rejected by service") }, + { BSM_ERRNO_ENOTCAPABLE, +#ifdef ENOTCAPABLE + ENOTCAPABLE, +#else + ERRNO_NO_LOCAL_MAPPING, +#endif + ES("Capabilities insufficient") }, }; static const int bsm_errnos_count = sizeof(bsm_errnos) / sizeof(bsm_errnos[0]); diff -aurN -x '*.orig' src-clean/contrib/openbsm/sys/bsm/audit_errno.h src/contrib/openbsm/sys/bsm/audit_errno.h --- src-clean/contrib/openbsm/sys/bsm/audit_errno.h 2010-08-25 10:09:23.000000000 +0200 +++ src/contrib/openbsm/sys/bsm/audit_errno.h 2010-08-25 10:24:35.000000000 +0200 @@ -204,6 +204,7 @@ #define BSM_ERRNO_EKEYEXPIRED 220 /* Linux-specific. */ #define BSM_ERRNO_EKEYREVOKED 221 /* Linux-specific. */ #define BSM_ERRNO_EKEYREJECTED 222 /* Linux-specific. */ +#define BSM_ERRNO_ENOTCAPABLE 223 /* FreeBSD-specific. 
*/ /* * In the event that OpenBSM doesn't have a file representation of a local diff -aurN -x '*.orig' src-clean/contrib/tcpdump/tcpdump.c src/contrib/tcpdump/tcpdump.c --- src-clean/contrib/tcpdump/tcpdump.c 2010-08-25 10:09:28.000000000 +0200 +++ src/contrib/tcpdump/tcpdump.c 2010-08-25 10:24:35.000000000 +0200 @@ -76,6 +76,8 @@ #include #endif /* WIN32 */ +#include +#include #include "netdissect.h" #include "interface.h" @@ -1197,6 +1199,14 @@ (void)fflush(stderr); } #endif /* WIN32 */ + if (lc_limitfd(STDIN_FILENO, CAP_FSTAT) < 0) + error("lc_limitfd: unable to limit STDIN_FILENO"); + if (lc_limitfd(STDOUT_FILENO, CAP_FSTAT | CAP_SEEK | CAP_WRITE) < 0) + error("lc_limitfd: unable to limit STDOUT_FILENO"); + if (lc_limitfd(STDERR_FILENO, CAP_FSTAT | CAP_SEEK | CAP_WRITE) < 0) + error("lc_limitfd: unable to limit STDERR_FILENO"); + if (cap_enter() < 0) + error("cap_enter: %s", pcap_strerror(errno)); status = pcap_loop(pd, cnt, callback, pcap_userdata); if (WFileName == NULL) { /* diff -aurN -x '*.orig' src-clean/crypto/openssh/sshd.c src/crypto/openssh/sshd.c --- src-clean/crypto/openssh/sshd.c 2010-08-25 10:10:19.000000000 +0200 +++ src/crypto/openssh/sshd.c 2010-08-25 10:24:35.000000000 +0200 @@ -46,6 +46,7 @@ __RCSID("$FreeBSD: src/crypto/openssh/sshd.c,v 1.48.2.4.2.1 2010/06/14 02:09:06 kensmith Exp $"); #include +#include #include #include #include @@ -629,6 +630,8 @@ fatal("setgroups: %.100s", strerror(errno)); permanently_set_uid(privsep_pw); #endif + if (cap_enter() != 0 && errno != ENOSYS) + fatal("cap_enter: %.100s", strerror(errno)); } static int diff -aurN -x '*.orig' src-clean/gnu/usr.bin/groff/tmac/mdoc.local src/gnu/usr.bin/groff/tmac/mdoc.local --- src-clean/gnu/usr.bin/groff/tmac/mdoc.local 2010-08-25 10:10:25.000000000 +0200 +++ src/gnu/usr.bin/groff/tmac/mdoc.local 2010-08-25 10:24:35.000000000 +0200 @@ -38,6 +38,7 @@ .ds doc-str-Lb-libc_r Reentrant C\~Library (libc_r, \-lc_r) .ds doc-str-Lb-libcalendar Calendar Arithmetic Library (libcalendar, 
\-lcalendar) .ds doc-str-Lb-libcam Common Access Method User Library (libcam, \-lcam) +.ds doc-str-Lb-libcapsicum Capability Services Library (libcapsicum, \-lcapsicum) .ds doc-str-Lb-libcipher FreeSec Crypt Library (libcipher, \-lcipher) .ds doc-str-Lb-libdevinfo Device and Resource Information Utility Library (libdevinfo, \-ldevinfo) .ds doc-str-Lb-libdevstat Device Statistics Library (libdevstat, \-ldevstat) diff -aurN -x '*.orig' src-clean/include/libgen.h src/include/libgen.h --- src-clean/include/libgen.h 2010-08-25 10:10:27.000000000 +0200 +++ src/include/libgen.h 2010-08-25 10:24:35.000000000 +0200 @@ -36,6 +36,7 @@ __BEGIN_DECLS char *basename(const char *); +char *basename_r(const char *, char *); char *dirname(const char *); #if 0 char *regcmp(const char *, ...); diff -aurN -x '*.orig' src-clean/lib/Makefile src/lib/Makefile --- src-clean/lib/Makefile 2010-08-25 10:10:35.000000000 +0200 +++ src/lib/Makefile 2010-08-25 10:24:35.000000000 +0200 @@ -57,6 +57,7 @@ ${_libbsnmp} \ libbz2 \ libcalendar \ + libcapsicum \ libcam \ libcompat \ libdevinfo \ diff -aurN -x '*.orig' src-clean/lib/csu/amd64/crt1.c src/lib/csu/amd64/crt1.c --- src-clean/lib/csu/amd64/crt1.c 2010-08-25 10:10:29.000000000 +0200 +++ src/lib/csu/amd64/crt1.c 2010-08-25 10:24:35.000000000 +0200 @@ -31,6 +31,7 @@ #endif /* lint */ #include +#include #include "libc_private.h" #include "crtbrand.c" diff -aurN -x '*.orig' src-clean/lib/csu/common/crtbrand.c src/lib/csu/common/crtbrand.c --- src-clean/lib/csu/common/crtbrand.c 2010-08-25 10:10:29.000000000 +0200 +++ src/lib/csu/common/crtbrand.c 2010-08-25 10:24:35.000000000 +0200 @@ -27,6 +27,7 @@ __FBSDID("$FreeBSD: src/lib/csu/common/crtbrand.c,v 1.6.2.1.4.1 2010/06/14 02:09:06 kensmith Exp $"); #include +#include #define ABI_VENDOR "FreeBSD" #define ABI_SECTION ".note.ABI-tag" @@ -50,3 +51,4 @@ ABI_VENDOR, __FreeBSD_version }; + diff -aurN -x '*.orig' src-clean/lib/csu/i386-elf/crt1_c.c src/lib/csu/i386-elf/crt1_c.c --- 
src-clean/lib/csu/i386-elf/crt1_c.c 2010-08-25 10:10:29.000000000 +0200 +++ src/lib/csu/i386-elf/crt1_c.c 2010-08-25 10:24:35.000000000 +0200 @@ -93,3 +93,4 @@ } __asm(".hidden _start1"); + diff -aurN -x '*.orig' src-clean/lib/libc/gen/Makefile.inc src/lib/libc/gen/Makefile.inc --- src-clean/lib/libc/gen/Makefile.inc 2010-08-25 10:10:30.000000000 +0200 +++ src/lib/libc/gen/Makefile.inc 2010-08-25 10:24:35.000000000 +0200 @@ -20,6 +20,7 @@ getpeereid.c getprogname.c getpwent.c getttyent.c \ getusershell.c getvfsbyname.c glob.c \ initgroups.c isatty.c isinf.c isnan.c jrand48.c lcong48.c \ + ld_libdirs.c ld_sandbox.c \ lockf.c lrand48.c mrand48.c nftw.c nice.c \ nlist.c nrand48.c opendir.c \ pause.c pmadvise.c popen.c posix_spawn.c \ @@ -77,6 +78,7 @@ MLINKS+=arc4random.3 arc4random_addrandom.3 arc4random.3 arc4random_stir.3 \ arc4random.3 arc4random_buf.3 arc4random.3 arc4random_uniform.3 +MLINKS+=basename.3 basename_r.3 MLINKS+=ctermid.3 ctermid_r.3 MLINKS+=devname.3 devname_r.3 MLINKS+=devname.3 fdevname.3 diff -aurN -x '*.orig' src-clean/lib/libc/gen/Symbol.map src/lib/libc/gen/Symbol.map --- src-clean/lib/libc/gen/Symbol.map 2010-08-25 10:10:30.000000000 +0200 +++ src/lib/libc/gen/Symbol.map 2010-08-25 10:24:35.000000000 +0200 @@ -327,6 +327,7 @@ FBSD_1.1 { arc4random_buf; arc4random_uniform; + basename_r; fdevname; fdevname_r; fdopendir; @@ -339,6 +340,7 @@ fts_read; fts_set; fts_set_clientptr; + ld_insandbox; posix_spawn; posix_spawn_file_actions_addclose; posix_spawn_file_actions_adddup2; @@ -367,6 +369,7 @@ FBSD_1.2 { getpagesizes; + ld_libdirs; }; FBSDprivate_1.0 { diff -aurN -x '*.orig' src-clean/lib/libc/gen/basename.3 src/lib/libc/gen/basename.3 --- src-clean/lib/libc/gen/basename.3 2010-08-25 10:10:30.000000000 +0200 +++ src/lib/libc/gen/basename.3 2010-08-25 10:24:35.000000000 +0200 @@ -27,7 +27,7 @@ .\" $OpenBSD: basename.3,v 1.12 2000/04/18 03:01:25 aaron Exp $ .\" $FreeBSD: src/lib/libc/gen/basename.3,v 1.8.10.1.4.1 2010/06/14 02:09:06 kensmith Exp $ 
.\" -.Dd October 12, 2006 +.Dd October 6, 2009 .Dt BASENAME 3 .Os .Sh NAME @@ -37,6 +37,8 @@ .In libgen.h .Ft char * .Fn basename "const char *path" +.Ft char * +.Fn basename_r "const char *path" "char *bname" .Sh DESCRIPTION The .Fn basename @@ -58,6 +60,12 @@ is a null pointer or the empty string, a pointer to the string .Qq \&. is returned. +.Pp +The +.Fn basename_r +variation accepts a buffer of at least +.Dv MAXPATHLEN +bytes in which to store the resulting component. .Sh IMPLEMENTATION NOTES The .Fn basename @@ -65,15 +73,17 @@ returns a pointer to internal storage space allocated on the first call that will be overwritten by subsequent calls. +.Fn basename_r +is therefore preferred for threaded applications. .Sh RETURN VALUES On successful completion, .Fn basename -returns a pointer to the last component of +and +.Fn basename_r +return pointers to the last component of .Fa path . .Pp -If -.Fn basename -fails, a null pointer is returned and the global variable +If they fail, a null pointer is returned and the global variable .Va errno is set to indicate the error. .Sh ERRORS diff -aurN -x '*.orig' src-clean/lib/libc/gen/basename.c src/lib/libc/gen/basename.c --- src-clean/lib/libc/gen/basename.c 2010-08-25 10:10:30.000000000 +0200 +++ src/lib/libc/gen/basename.c 2010-08-25 10:24:35.000000000 +0200 @@ -40,18 +40,12 @@ #include char * -basename(path) +basename_r(path, bname) const char *path; + char *bname; { - static char *bname = NULL; const char *endp, *startp; - if (bname == NULL) { - bname = (char *)malloc(MAXPATHLEN); - if (bname == NULL) - return(NULL); - } - /* Empty or NULL string gets treated as "." 
*/ if (path == NULL || *path == '\0') { (void)strcpy(bname, "."); @@ -82,3 +76,17 @@ bname[endp - startp + 1] = '\0'; return(bname); } + +char * +basename(path) + const char *path; +{ + static char *bname = NULL; + + if (bname == NULL) { + bname = (char *)malloc(MAXPATHLEN); + if (bname == NULL) + return (NULL); + } + return (basename_r(path, bname)); +} diff -aurN -x '*.orig' src-clean/lib/libc/gen/ld_libdirs.c src/lib/libc/gen/ld_libdirs.c --- src-clean/lib/libc/gen/ld_libdirs.c 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libc/gen/ld_libdirs.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2010 Jonathan Anderson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <errno.h> + +#pragma weak ld_libdirs +int +ld_libdirs(int *fds, int *fdlen) +{ + + errno = EOPNOTSUPP; + return (-1); +} + diff -aurN -x '*.orig' src-clean/lib/libc/gen/ld_sandbox.c src/lib/libc/gen/ld_sandbox.c --- src-clean/lib/libc/gen/ld_sandbox.c 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libc/gen/ld_sandbox.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2008-2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#pragma weak ld_insandbox +int +ld_insandbox(void) +{ + + return (0); +} diff -aurN -x '*.orig' src-clean/lib/libc/sys/Makefile.inc src/lib/libc/sys/Makefile.inc --- src-clean/lib/libc/sys/Makefile.inc 2010-08-25 10:10:32.000000000 +0200 +++ src/lib/libc/sys/Makefile.inc 2010-08-25 10:24:35.000000000 +0200 @@ -64,7 +64,7 @@ MAN+= abort2.2 accept.2 access.2 acct.2 adjtime.2 \ aio_cancel.2 aio_error.2 aio_read.2 aio_return.2 \ aio_suspend.2 aio_waitcomplete.2 aio_write.2 \ - bind.2 brk.2 chdir.2 chflags.2 \ + bind.2 brk.2 cap_enter.2 cap_new.2 chdir.2 chflags.2 \ chmod.2 chown.2 chroot.2 clock_gettime.2 close.2 closefrom.2 \ connect.2 cpuset.2 cpuset_getaffinity.2 dup.2 execve.2 _exit.2 \ extattr_get_file.2 fcntl.2 fhopen.2 flock.2 fork.2 fsync.2 \ @@ -83,7 +83,7 @@ mq_setattr.2 \ msgctl.2 msgget.2 msgrcv.2 msgsnd.2 \ msync.2 munmap.2 nanosleep.2 nfssvc.2 ntp_adjtime.2 open.2 \ - pathconf.2 pipe.2 poll.2 posix_openpt.2 profil.2 \ + pathconf.2 pdfork.2 pipe.2 poll.2 posix_openpt.2 profil.2 \ pselect.2 ptrace.2 quotactl.2 \ read.2 readlink.2 reboot.2 recv.2 rename.2 revoke.2 rfork.2 rmdir.2 \ rtprio.2 @@ -105,6 +105,8 @@ MLINKS+=access.2 eaccess.2 access.2 faccessat.2 MLINKS+=brk.2 sbrk.2 
+MLINKS+=cap_enter.2 cap_getmode.2 +MLINKS+=cap_new.2 cap_getrights.2 MLINKS+=chdir.2 fchdir.2 MLINKS+=chflags.2 fchflags.2 chflags.2 lchflags.2 MLINKS+=chmod.2 fchmod.2 chmod.2 fchmodat.2 chmod.2 lchmod.2 @@ -162,6 +164,9 @@ MLINKS+=open.2 openat.2 MLINKS+=pathconf.2 fpathconf.2 MLINKS+=pathconf.2 lpathconf.2 +MLINKS+=pdfork.2 pdgetpid.2 \ + pdfork.2 pdkill.2 \ + pdfork.2 pdwait4.2 MLINKS+=read.2 pread.2 read.2 preadv.2 read.2 readv.2 MLINKS+=readlink.2 readlinkat.2 MLINKS+=recv.2 recvfrom.2 recv.2 recvmsg.2 diff -aurN -x '*.orig' src-clean/lib/libc/sys/Symbol.map src/lib/libc/sys/Symbol.map --- src-clean/lib/libc/sys/Symbol.map 2010-08-25 10:10:32.000000000 +0200 +++ src/lib/libc/sys/Symbol.map 2010-08-25 10:24:35.000000000 +0200 @@ -330,6 +330,10 @@ FBSD_1.1 { __semctl; + cap_enter; + cap_getmode; + cap_getrights; + cap_new; closefrom; cpuset; cpuset_getid; @@ -352,6 +356,10 @@ mknodat; msgctl; openat; + pdfork; + pdgetpid; + pdkill; + pdwait4; readlinkat; renameat; setfib; diff -aurN -x '*.orig' src-clean/lib/libc/sys/cap_enter.2 src/lib/libc/sys/cap_enter.2 --- src-clean/lib/libc/sys/cap_enter.2 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libc/sys/cap_enter.2 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,112 @@ +.\" +.\" Copyright (c) 2008-2009 Robert N. M. Watson +.\" All rights reserved. +.\" +.\" WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED +.\" ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND +.\" UNEXPECTED WAYS. +.\" +.\" This software was developed at the University of Cambridge Computer +.\" Laboratory with support from a grant from Google, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd June 11, 2009 +.Dt CAP_ENTER 2 +.Os +.Sh NAME +.Nm cap_enter , +.Nm cap_getmode +.Nd Capability mode system calls +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/capability.h +.Ft int +.Fn cap_enter "void" +.Ft int +.Fn cap_getmode "u_int *modep" +.Sh DESCRIPTION +.Fn cap_enter +places the current process into capability mode, a mode of execution in which +processes may only issue system calls operating on file descriptors or +reading limited global system state. +Access to global name spaces, such as file system or IPC name spaces, is +prevented. +If the process is already in a capability mode sandbox, the system call is a +no-op. +Future process descendants created with +.Xr fork 2 +or +.Xr pdfork 2 +will be placed in capability mode from inception. 
+.Pp +When combined with capabilities created with +.Xr cap_new 2 , +.Fn cap_enter +may be used to create kernel-enforced sandboxes in which +appropriately-crafted applications or application components may be run. +Most sandboxes will be created and managed using the +.Xr libcapsicum 3 +library, rather than using system calls directly. +.Pp +.Fn cap_getmode +returns a flag indicating whether or not the process is in a capability mode +sandbox. +.Sh CAVEAT +Creating effective process sandboxes is a tricky process that involves +identifying the least possible rights required by the process and then +passing those rights into the process in a safe manner. +See the CAVEAT +section of +.Xr cap_new 2 +for why this is particularly tricky with UNIX file descriptors as the +canonical representation of a right. +Consumers of +.Fn cap_enter +should also be aware of other inherited rights, such as access to VM +resources, memory contents, and other process properties that should be +considered. +It is advisable to use +.Xr fexecve 2 +to create a runtime environment inside the sandbox that has as few implicitly +acquired rights as possible. +.Sh RETURN VALUES +.Rv -std cap_enter cap_getmode +.Sh SEE ALSO +.Xr cap_new 2 , +.Xr fexecve 2 , +.Xr libcapsicum 3 +.Sh HISTORY +Support for capabilities and capabilities mode was developed as part of the +.Tn TrustedBSD +Project. +.Sh BUGS +WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED ON IN +PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND UNEXPECTED WAYS. +.Sh AUTHORS +These functions and the capability facility were created by +.An "Robert N. M. Watson" +at the University of Cambridge Computer Laboratory with support from a grant +from Google, Inc. 
diff -aurN -x '*.orig' src-clean/lib/libc/sys/cap_new.2 src/lib/libc/sys/cap_new.2 --- src-clean/lib/libc/sys/cap_new.2 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libc/sys/cap_new.2 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,481 @@ +.\" +.\" Copyright (c) 2008-2010 Robert N. M. Watson +.\" All rights reserved. +.\" +.\" WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED +.\" ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND +.\" UNEXPECTED WAYS. +.\" +.\" This software was developed at the University of Cambridge Computer +.\" Laboratory with support from a grant from Google, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD$ +.\" +.Dd June 11, 2009 +.Dt CAP_NEW 2 +.Os +.Sh NAME +.Nm cap_new , +.Nm cap_getrights +.Nd System calls to manipulate capabilities +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/capability.h +.Ft int +.Fn cap_new "int fd" "cap_rights_t rights" +.Ft int +.Fn cap_getrights "int fd" "cap_rights_t *rightsp" +.Sh DESCRIPTION +Capabilities are special file descriptors derived from an existing file +descriptor, such as one returned by +.Xr fhopen 2 , +.Xr kqueue 2 , +.Xr mq_open 2 , +.Xr open 2 , +.Xr pipe 2 , +.Xr shm_open 2 , +.Xr socket 2 , +or +.Xr socketpair 2 , +but with a restricted set of permitted operations determined by a rights +mask set when the capability is created. +These restricted rights cannot be changed after the capability is created, +although further capabilities with yet more restricted rights may be created +from an existing capability. +In every other sense, a capability behaves in the same way as the file +descriptor it was created from. +.Pp +.Fn cap_new +creates a new capability for the existing file descriptor +.Fa fd , +and returns a file descriptor for it. +Operations on the capability will be limited to those permitted by +.Fa rights , +which is static for the lifetime of the capability. +If +.Fa fd +refers to an existing capability, then +.Fa rights +must be equal to or a subset of the rights on that capability. +As with +.Xr dup 2 +and +.Xr dup2 2 , +many properties are shared between the new capability and the existing file +descriptor, including open file flags, blocking disposition, and file offset. +Many applications will prefer to use the +.Xr cap_limitfd 3 +library call, part of +.Xr libcapsicum 3 , +as it offers a more convenient interface. +.Pp +.Fn cap_getrights +queries the rights associated with the capability referred to by file +descriptor +.Fa fd . +.Pp +These system calls, when combined with +.Xr cap_enter 2 , +may be used to construct process sandboxes with highly granular rights +assignment. 
+.Sh RIGHTS +The following rights may be specified in a new capability rights mask: +.Bl -tag -width CAP_EXTATTR_DELETE +.It Dv CAP_ACCEPT +Permit +.Xr accept 2 . +.It Dv CAP_ACL_CHECK +Permit checking of an ACL on a file descriptor; there is no cross-reference +for this system call. +.It Dv CAP_ACL_DELETE +Permit +.Xr acl_delete_fd_np 2 . +.It Dv CAP_ACL_GET +Permit +.Xr acl_get_fd 2 +and +.Xr acl_get_fd_np 2 . +.It Dv CAP_ACL_SET +Permit +.Xr acl_set_fd 2 +and +.Xr acl_set_fd_np 2 . +.It Dv CAP_BIND +Permit +.Xr bind 2 . +Note that sockets can also become bound implicitly as a result of +.Xr connect 2 +or +.Xr send 2 , +and that socket options set with +.Xr setsockopt 2 +may also affect binding behavior. +.It Dv CAP_CONNECT +Permit +.Xr connect 2 ; +also required for +.Xr sendto 2 +with a non-NULL destination address. +.It Dv CAP_EVENT +Permit +.Xr select 2 , +.Xr poll 2 , +and +.Xr kevent 2 +to be used in monitoring the file descriptor for events. +.It Dv CAP_FEXECVE +Permit +.Xr fexecve 2 ; +.Dv CAP_READ +will also be required. +.It Dv CAP_EXTATTR_DELETE +Permit +.Xr extattr_delete_fd 2 . +.It Dv CAP_EXTATTR_GET +Permit +.Xr extattr_get_fd 2 . +.It Dv CAP_EXTATTR_LIST +Permit +.Xr extattr_list_fd 2 . +.It Dv CAP_EXTATTR_SET +Permit +.Xr extattr_set_fd 2 . +.It Dv CAP_FCHDIR +Permit +.Xr fchdir 2 . +.It Dv CAP_FCHFLAGS +Permit +.Xr fchflags 2 . +.It Dv CAP_FCHMOD +Permit +.Xr fchmod 2 . +.It Dv CAP_FCHOWN +Permit +.Xr fchown 2 . +.It Dv CAP_FCNTL +Permit +.Xr fcntl 2 ; +be aware that this call provides indirect access to other operations, such as +.Xr flock 2 . +.It Dv CAP_FLOCK +Permit +.Xr flock 2 +and related calls. +.It Dv CAP_FPATHCONF +Permit +.Xr fpathconf 2 . +.It Dv CAP_FSCK +Permit UFS background-fsck operations on the descriptor. +.It Dv CAP_FSTAT +Permit +.Xr fstat 2 . +.It Dv CAP_FSTATFS +Permit +.Xr fstatfs 2 . +.It Dv CAP_FSYNC +Permit +.Xr aio_fsync 2 +and +.Xr fsync 2 . +.Pp +.It Dv CAP_FTRUNCATE +Permit +.Xr ftruncate 2 . 
+.It Dv CAP_FUTIMES +Permit +.Xr futimes 2 . +.It Dv CAP_GETPEERNAME +Permit +.Xr getpeername 2 . +.It Dv CAP_GETSOCKNAME +Permit +.Xr getsockname 2 . +.It Dv CAP_GETSOCKOPT +Permit +.Xr getsockopt 2 . +.It Dv CAP_IOCTL +Permit +.Xr ioctl 2 . +Be aware that this system call has enormous scope, including potentially +global scope for some objects. +.It Dv CAP_KEVENT +Permit +.Xr kevent 2 ; +.Dv CAP_EVENT +is also required on file descriptors that will be monitored using +.Xr kevent 2 . +.It Dv CAP_LISTEN +Permit +.Xr listen 2 ; +not much use (generally) without +.Dv CAP_BIND . +.It Dv CAP_LOOKUP +Permit the file descriptor to be used as a starting directory for calls such +as +.Xr linkat 2 , +.Xr openat 2 , +and +.Xr unlinkat 2 . +Note that these calls are not available in capability mode as they manipulate +a global name space; see +.Xr cap_enter 2 +for details. +.It Dv CAP_MAC_GET +Permit +.Xr mac_get_fd 2 . +.It Dv CAP_MAC_SET +Permit +.Xr mac_set_fd 2 . +.It Dv CAP_MMAP +Permit +.Xr mmap 2 ; +specific invocations may also require +.Dv CAP_READ +or +.Dv CAP_WRITE . +.Pp +.It Dv CAP_PDGETPID +Permit +.Xr pdgetpid 2 . +.It Dv CAP_PDKILL +Permit +.Xr pdkill 2 . +.It Dv CAP_PDWAIT +Permit +.Xr pdwait 2 . +.It Dv CAP_PEELOFF +Permit +.Xr sctp_peeloff 2 . +.It Dv CAP_READ +Allow +.Xr aio_read 2 , +.Xr pread 2 , +.Xr read 2 , +.Xr recv 2 , +.Xr recvfrom 2 , +.Xr recvmsg 2 , +and related system calls. +.Pp +For files and other seekable objects, +.Dv CAP_SEEK +may also be required. +.It Dv CAP_REVOKE +Permit +.Xr frevoke 2 +in certain ABI compatibility modes that support this system call. +.It Dv CAP_SEEK +Permit operations that seek on the file descriptor, such as +.Xr lseek 2 , +but also required for I/O system calls that modify the file offset, such as +.Xr read 2 +and +.Xr write 2 . +.It Dv CAP_SEM_GETVALUE +Permit +.Xr sem_getvalue 3 . +.It Dv CAP_SEM_POST +Permit +.Xr sem_post 3 . +.It Dv CAP_SEM_WAIT +Permit +.Xr sem_wait 3 +and +.Xr sem_trywait 3 . 
+.It Dv CAP_SETSOCKOPT +Permit +.Xr setsockopt 2 ; +this controls various aspects of socket behavior and may affect binding, +connecting, and other behaviors with global scope. +.It Dv CAP_SHUTDOWN +Permit explicit +.Xr shutdown 2 ; +closing the socket will also generally shut down any connections on it. +.It Dv CAP_TTYHOOK +Allow configuration of TTY hooks, such as +.Xr snp 4 , +on the file descriptor. +.It Dv CAP_WRITE +Allow +.Xr aio_write 2 , +.Xr pwrite 2 , +.Xr send 2 , +.Xr sendmsg 2 , +.Xr sendto 2 , +.Xr write 2 , +and related system calls. +.Pp +For files and other seekable objects, +.Dv CAP_SEEK +may also be required. +.Pp +For +.Xr sendto 2 +with a non-NULL connection address, +.Dv CAP_CONNECT +is also required. +.El +.Sh CAVEAT +The +.Fn cap_new +system call and the capabilities it creates may be used to assign +fine-grained rights to sandboxed processes running in capability mode. +However, the semantics of objects accessed via file descriptors are complex, +so caution should be exercised in passing object capabilities into sandboxes. +.Sh RETURN VALUES +If successful, +.Fn cap_new +returns a non-negative integer, termed a file descriptor. +It returns -1 on failure, and sets +.Va errno +to indicate the error. +.Pp +.Rv -std cap_getrights +.Sh ERRORS +.Fn cap_new +may return the following errors: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa fd +argument is not a valid active descriptor. +.It Bq Er EINVAL +An invalid right has been requested in +.Fa rights . +.It Bq Er EMFILE +The process has already reached its limit for open file descriptors. +.It Bq Er ENFILE +The system file table is full. +.It Bq Er EPERM +.Fa rights +contains requested rights not present in the current rights mask associated +with the capability referenced by +.Fa fd , +if any. +.El +.Pp +.Fn cap_getrights +may return the following errors: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa fd +argument is not a valid active descriptor. 
+.It Bq Er EINVAL +The +.Fa fd +argument is not a capability. +.El +.Sh SEE ALSO +.Xr accept 2 , +.Xr acl_delete_fd_np 2 , +.Xr acl_get_fd 2 , +.Xr acl_get_fd_np 2 , +.Xr acl_set_fd_np 2 , +.Xr aio_read 2 , +.Xr aio_fsync 2 , +.Xr aio_write 2 , +.Xr bind 2 , +.Xr cap_enter 2 , +.Xr connect 2 , +.Xr dup 2 , +.Xr dup2 2 , +.Xr extattr_delete_fd 2 , +.Xr extattr_get_fd 2 , +.Xr extattr_list_fd 2 , +.Xr extattr_set_fd 2 , +.Xr fchflags 2 , +.Xr fchown 2 , +.Xr fcntl 2 , +.Xr fexecve 2 , +.Xr fhopen 2 , +.Xr flock 2 , +.Xr fpathconf 2 , +.Xr fstat 2 , +.Xr fstatfs 2 , +.Xr fsync 2 , +.Xr ftruncate 2 , +.Xr futimes 2 , +.Xr getpeername 2 , +.Xr getsockname 2 , +.Xr getsockopt 2 , +.Xr ioctl 2 , +.Xr kevent 2 , +.Xr kqueue 2 , +.Xr linkat 2 , +.Xr listen 2 , +.Xr mac_get_fd 2 , +.Xr mac_set_fd 2 , +.Xr mmap 2 , +.Xr mq_open 2 , +.Xr open 2 , +.Xr openat 2 , +.Xr pdgetpid 2 , +.Xr pdkill 2 , +.Xr pdwait 2 , +.Xr pipe 2 , +.Xr poll 2 , +.Xr pread 2 , +.Xr pwrite 2 , +.Xr read 2 , +.Xr recv 2 , +.Xr recvfrom 2 , +.Xr recvmsg 2 , +.Xr sctp_peeloff 2 , +.Xr select 2 , +.Xr send 2 , +.Xr sendmsg 2 , +.Xr sendto 2 , +.Xr setsockopt 2 , +.Xr shm_open 2 , +.Xr shutdown 2 , +.Xr socket 2 , +.Xr socketpair 2 , +.Xr unlinkat 2 , +.Xr write 2 , +.Xr cap_limitfd 3 , +.Xr libcapsicum 3 , +.Xr sem_getvalue 3 , +.Xr sem_post 3 , +.Xr sem_trywait 3 , +.Xr sem_wait 3 , +.Xr snp 4 +.Sh HISTORY +Support for capabilities and capabilities mode was developed as part of the +.Tn TrustedBSD +Project. +.Sh BUGS +This man page should list the set of permitted system calls more specifically +for each capability right. +.Pp +Capability rights sometimes have unclear indirect impacts, which should be +documented, or at least hinted at. +.Pp +WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED ON IN +PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND UNEXPECTED WAYS. +.Sh AUTHORS +These functions and the capability facility were created by +.An "Robert N. M. 
Watson" +at the University of Cambridge Computer Laboratory with support from a grant +from Google, Inc. diff -aurN -x '*.orig' src-clean/lib/libc/sys/fork.2 src/lib/libc/sys/fork.2 --- src-clean/lib/libc/sys/fork.2 2010-08-25 10:10:32.000000000 +0200 +++ src/lib/libc/sys/fork.2 2010-08-25 10:24:35.000000000 +0200 @@ -122,6 +122,7 @@ .El .Sh SEE ALSO .Xr execve 2 , +.Xr pdfork 2 , .Xr rfork 2 , .Xr setitimer 2 , .Xr setrlimit 2 , diff -aurN -x '*.orig' src-clean/lib/libc/sys/pdfork.2 src/lib/libc/sys/pdfork.2 --- src-clean/lib/libc/sys/pdfork.2 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libc/sys/pdfork.2 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,123 @@ +.\" +.\" Copyright (c) 2009 Robert N. M. Watson +.\" All rights reserved. +.\" +.\" WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED +.\" ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND +.\" UNEXPECTED WAYS. +.\" +.\" This software was developed at the University of Cambridge Computer +.\" Laboratory with support from a grant from Google, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd January 18, 2009 +.Dt PDFORK 2 +.Os +.Sh NAME +.Nm pdfork , +.Nm pdgetpid , +.Nm pdkill , +.Nm pdwait +.Nd System calls to manage process descriptors +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/procdesc.h +.Ft int +.Fn pdfork "int *fdp" +.Ft int +.Fn pdgetpid "int fd" "pid_t *pidp" +.Ft int +.Fn pdkill "int fd" "int signum" +.Ft int +.Fn pdwait "int fd" "int *status" "int options" "struct rusage *rusage" +.Sh DESCRIPTION +Process descriptors are special file descriptors that represent processes, +and are created using +.Fn pdfork , +a variant of +.Xr fork 2 , +which, if successful, returns a process descriptor in the integer pointed to +by +.Fa fdp . +.Pp +.Fn pdgetpid +queries the process ID (PID) of the process descriptor +.Fa fd . +.Pp +.Fn pdkill +is functionally identical to +.Xr kill 2 , +except that it accepts a process descriptor, +.Fa fd , +rather than a PID. +.Pp +.Fn pdwait +is currently unimplemented, but in the future will be functionally identical +to +.Xr wait4 2 , +except that it accepts a process descriptor rather than a PID. +.Pp +The following system calls also have effects specific to process descriptors: +.Pp +.Xr fstat 2 +queries status of a process descriptor; currently only the +.Fa st_mode +field is defined; if the owner read, write, and execute bits are set then the +process represented by the process descriptor is still alive.
+.Pp +.Xr poll 2 +allows waiting for process state transitions; currently only +.Dv POLLHUP +is defined, and will be raised when the process dies. +.Pp +.Xr close 2 +will close the process descriptor, and if the process is still alive, +terminate it with the signal +.Dv SIGKILL . +.Sh RETURN VALUES +.Sh ERRORS +.Sh SEE ALSO +.Xr close 2 , +.Xr fork 2 , +.Xr fstat 2 , +.Xr kill 2 , +.Xr poll 2 , +.Xr wait4 2 +.Sh HISTORY +Support for process descriptors mode was developed as part of the +.Tn TrustedBSD +Project. +.Sh BUGS +.Fn pdwait +is not yet implemented. +.Pp +WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED ON IN +PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND UNEXPECTED WAYS. +.Sh AUTHORS +These functions and the capability facility were created by +.An "Robert N. M. Watson" +at the University of Cambridge Computer Laboratory with support from a grant +from Google, Inc. diff -aurN -x '*.orig' src-clean/lib/libcapsicum/Makefile src/lib/libcapsicum/Makefile --- src-clean/lib/libcapsicum/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/Makefile 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,59 @@ +# $FreeBSD$ + +LIB= capsicum +DPADD= ${LIBSBUF} +LDADD= -lsbuf + +SRCS= \ + libcapsicum.c \ + libcapsicum_sandbox.c \ + libcapsicum_sandbox_io.c \ + libcapsicum_host.c \ + libcapsicum_host_io.c \ + libcapsicum_fdlist.c + +INCS= libcapsicum.h + +CFLAGS+= -I. 
+ +SHLIB_MAJOR= 1 + +WARNS?= 6 + +MAN= libcapsicum.3 +MAN+= libcapsicum_fdlist.3 +MAN+= libcapsicum_host.3 +MAN+= libcapsicum_sandbox.3 +MLINKS= libcapsicum.3 lc_limitfd.3 \ + libcapsicum_fdlist.3 lc_fdlist_new.3 \ + libcapsicum_fdlist.3 lc_fdlist_global.3 \ + libcapsicum_fdlist.3 lc_fdlist_dup.3 \ + libcapsicum_fdlist.3 lc_fdlist_free.3 \ + libcapsicum_fdlist.3 lc_fdlist_add.3 \ + libcapsicum_fdlist.3 lc_fdlist_addcap.3 \ + libcapsicum_fdlist.3 lc_fdlist_lookup.3 \ + libcapsicum_host.3 lch_autosandbox_isenabled.3 \ + libcapsicum_host.3 lch_start.3 \ + libcapsicum_host.3 lch_startfd.3 \ + libcapsicum_host.3 lch_stop.3 \ + libcapsicum_host.3 lch_getsock.3 \ + libcapsicum_host.3 lch_getpid.3 \ + libcapsicum_host.3 lch_getprocdesc.3 \ + libcapsicum_host.3 lch_recv.3 \ + libcapsicum_host.3 lch_recv_rights.3 \ + libcapsicum_host.3 lch_rpc.3 \ + libcapsicum_host.3 lch_rpc_rights.3 \ + libcapsicum_host.3 lch_send.3 \ + libcapsicum_host.3 lch_send_rights.3 \ + libcapsicum_sandbox.3 lcs_get.3 \ + libcapsicum_sandbox.3 lcs_getsock.3 \ + libcapsicum_sandbox.3 lcs_recv.3 \ + libcapsicum_sandbox.3 lcs_recv_rights.3 \ + libcapsicum_sandbox.3 lcs_recvrpc.3 \ + libcapsicum_sandbox.3 lcs_recvrpc_rights.3 \ + libcapsicum_sandbox.3 lcs_send.3 \ + libcapsicum_sandbox.3 lcs_send_rights.3 \ + libcapsicum_sandbox.3 lcs_sendrpc.3 \ + libcapsicum_sandbox.3 lcs_sendrpc_rights.3 + +.include diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum.3 src/lib/libcapsicum/libcapsicum.3 --- src-clean/lib/libcapsicum/libcapsicum.3 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum.3 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,127 @@ +.\" +.\" Copyright (c) 2009-2010 Robert N. M. Watson +.\" All rights reserved. +.\" +.\" WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED +.\" ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND +.\" UNEXPECTED WAYS. 
+.\" +.\" This software was developed at the University of Cambridge Computer +.\" Laboratory with support from a grant from Google, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd June 11, 2009 +.Os +.Dt LIBCAPSICUM 3 +.Sh NAME +.Nm libcapsicum +.Nd "library interface to capability-mode services" +.Sh LIBRARY +.Lb libcapsicum +.Sh SYNOPSIS +.In sys/types.h +.In sys/capability.h +.In libcapsicum.h +.Ft int +.Fn lc_limitfd "int fd" "cap_rights_t rights" +.Sh DESCRIPTION +.Nm +implements APIs that allow applications to create, manage, and interact with +sandboxed software services running in capability mode, described in +.Xr cap_enter 2 . 
+Applications linked against +.Nm +will use one or both of "host" and "sandbox" APIs, depending on whether they +consume or produce sandboxed services. +.Nm +will start sandboxed components using a sandbox-specific run-time linker, +.Xr rtld-elf-cap 1 , +rather than the standard +.Xr rtld-elf 1 . +.Pp +Host processes use the +.Nm +host API, +described in +.Xr libcapsicum_host 3 , +to launch compartmentalized components in sandboxes. +They may also use +.Nm +to communicate with the sandboxed service based on socket I/O or remote +procedure call (RPC). +.Pp +Sandbox processes run in capability mode, and are only able to use resources +either assigned to the sandbox during creation, or later explicitly passed to +the process. +Sandbox processes use the +.Nm +sandbox API, +described in +.Xr libcapsicum_sandbox 3 . +Sandboxed processes themselves may launch software components in further +sandboxes, so a single program may use both host and sandbox APIs. +.Pp +In addition, the +.Nm +file descriptor list API, described in +.Xr libcapsicum_fdlist 3 , +may be used to manage the delegation of file descriptors/capabilities to +sandboxes using a namespace. +.Sh CAPSICUM API +.Fn lc_limitfd +is a wrapper around +.Xr cap_new 2 , +.Xr dup2 2 , +and +.Xr close 2 , +which takes an existing file descriptor and replaces it with a capability +with the requested rights mask. +.Sh SEE ALSO +.Xr rpcgen 1 , +.Xr rtld-elf 1 , +.Xr rtld-elf-cap 1 , +.Xr cap_enter 2 , +.Xr cap_new 2 , +.Xr close 2 , +.Xr dup2 2 , +.Xr libcapsicum_fdlist 3 , +.Xr libcapsicum_host 3 , +.Xr libcapsicum_sandbox 3 , +.Xr unix 4 +.Sh HISTORY +Support for capabilities and capabilities mode was developed as part of the +.Tn TrustedBSD +Project. +.Sh BUGS +WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED ON IN +PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND UNEXPECTED WAYS. +.Sh AUTHORS +These functions and the capability facility were created by +.An -nosplit +.An "Robert N. M.
Watson" +and +.An "Jonathan Anderson" +at the University of Cambridge Computer Laboratory with support from a grant +from Google, Inc. diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum.c src/lib/libcapsicum/libcapsicum.c --- src-clean/lib/libcapsicum/libcapsicum.c 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,268 @@ +/*- + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $P4: //depot/projects/trustedbsd/capabilities/src/lib/libcapsicum/libcapsicum.c#4 $ + */ + +#include +#include +#include + +#include +#include +#include + +#include "libcapsicum.h" +#include "libcapsicum_internal.h" +#include "libcapsicum_sandbox_api.h" + +int +lc_limitfd(int fd, cap_rights_t rights) +{ + int fd_cap; + int error; + + fd_cap = cap_new(fd, rights); + if (fd_cap < 0) + return (-1); + if (dup2(fd_cap, fd) < 0) { + error = errno; + close(fd_cap); + errno = error; + return (-1); + } + close(fd_cap); + return (0); +} + +void +_lc_dispose_rights(int *fdp, int fdcount) +{ + int i; + + for (i = 0; i < fdcount; i++) + close(fdp[i]); +} + +/* + * Given a 'struct msghdr' returned by a successful call to recvmsg(), + * extract up to the desired number of file descriptors (or clean up the + * mess if something goes wrong). + */ +int +_lc_receive_rights(struct msghdr *msg, int *fdp, int *fdcountp) +{ + int *cmsg_fdp, fdcount, i, scmrightscount; + struct cmsghdr *cmsg; + + /* + * Walk the complete control message chain to count received control + * messages and rights. If there is more than one rights message or + * there are too many file descriptors, re-walk and close them all + * and return an error. 
+ */ + fdcount = 0; + scmrightscount = 0; + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) + continue; + fdcount += (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + scmrightscount++; + } + if (scmrightscount > 1 || fdcount > *fdcountp) { + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) + continue; + cmsg_fdp = (int *)CMSG_DATA(cmsg); + fdcount = (cmsg->cmsg_len - CMSG_LEN(0)) / + sizeof(int); + _lc_dispose_rights(cmsg_fdp, fdcount); + } + errno = EBADMSG; + return (-1); + } + + /* + * Re-walk the control messages and copy out the file descriptor + * numbers, return success. No need to recalculate fdcount. + */ + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) + continue; + cmsg_fdp = (int *)CMSG_DATA(cmsg); + for (i = 0; i < fdcount; i++) + fdp[i] = cmsg_fdp[i]; + } + *fdcountp = fdcount; + return (0); +} + +ssize_t +_lc_send(int fd, const void *msg, size_t len, int flags, int lc_flags) +{ + ssize_t retlen; + + if (fd == -1 || fd == 0) { + errno = ECHILD; + return (-1); + } + if (lc_flags & LC_IGNOREEINTR) { + do { + retlen = send(fd, msg, len, flags); + } while (retlen < 0 && errno == EINTR); + } else + retlen = send(fd, msg, len, flags); + return (retlen); +} + +ssize_t +_lc_send_rights(int fd, const void *msg, size_t len, int flags, int lc_flags, + int *fdp, int fdcount) +{ + char cmsgbuf[CMSG_SPACE(LIBCAPSICUM_SANDBOX_API_MAXRIGHTS * + sizeof(int))]; + struct cmsghdr *cmsg; + struct msghdr msghdr; + struct iovec iov; + ssize_t retlen; + int i; + + if (fdcount == 0) + return (_lc_send(fd, msg, len, flags, lc_flags)); + + if (fd == -1 || fd == 0) { + errno = ECHILD; + return (-1); + } + + if (fdcount > LIBCAPSICUM_SANDBOX_API_MAXRIGHTS) { + errno = 
EMSGSIZE; + return (-1); + } + + bzero(&iov, sizeof(iov)); + iov.iov_base = __DECONST(void *, msg); + iov.iov_len = len; + + bzero(&cmsgbuf, sizeof(cmsgbuf)); + cmsg = (struct cmsghdr *)cmsgbuf; + cmsg->cmsg_len = CMSG_SPACE(fdcount * sizeof(int)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + for (i = 0; i < fdcount; i++) + ((int *)CMSG_DATA(cmsg))[i] = fdp[i]; + + bzero(&msghdr, sizeof(msghdr)); + msghdr.msg_iov = &iov; + msghdr.msg_iovlen = 1; + msghdr.msg_control = cmsg; + msghdr.msg_controllen = cmsg->cmsg_len; + + if (lc_flags & LC_IGNOREEINTR) { + do { + retlen = sendmsg(fd, &msghdr, flags); + } while (retlen < 0 && errno == EINTR); + } else + retlen = sendmsg(fd, &msghdr, flags); + return (retlen); +} + +ssize_t +_lc_recv(int fd, void *buf, size_t len, int flags, int lc_flags) +{ + ssize_t retlen; + + if (fd == -1 || fd == 0) { + errno = ESRCH; + return (-1); + } + if (lc_flags & LC_IGNOREEINTR) { + do { + retlen = recv(fd, buf, len, flags); + } while (retlen < 0 && errno == EINTR); + return (retlen); + } else + return (recv(fd, buf, len, flags)); +} + +ssize_t +_lc_recv_rights(int fd, void *buf, size_t len, int flags, int lc_flags, + int *fdp, int *fdcountp) +{ + char cmsgbuf[CMSG_SPACE(LIBCAPSICUM_SANDBOX_API_MAXRIGHTS * + sizeof(int))]; + struct msghdr msghdr; + struct iovec iov; + ssize_t retlen; + + if (*fdcountp == 0) + return (_lc_recv(fd, buf, len, flags, lc_flags)); + + if (fd == -1 || fd == 0) { + errno = ECHILD; + return (-1); + } + + if (*fdcountp > LIBCAPSICUM_SANDBOX_API_MAXRIGHTS) { + errno = EMSGSIZE; + return (-1); + } + + bzero(&iov, sizeof(iov)); + iov.iov_base = buf; + iov.iov_len = len; + + bzero(cmsgbuf, sizeof(cmsgbuf)); + bzero(&msghdr, sizeof(msghdr)); + msghdr.msg_iov = &iov; + msghdr.msg_iovlen = 1; + msghdr.msg_control = cmsgbuf; + msghdr.msg_controllen = sizeof(cmsgbuf); + + if (lc_flags & LC_IGNOREEINTR) { + do { + retlen = recvmsg(fd, &msghdr, flags); + } while (retlen < 0 && errno == EINTR); + } else + 
retlen = recvmsg(fd, &msghdr, flags); + if (retlen < 0) + return (-1); + if (_lc_receive_rights(&msghdr, fdp, fdcountp) < 0) + return (-1); + return (retlen); +} diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum.h src/lib/libcapsicum/libcapsicum.h --- src-clean/lib/libcapsicum/libcapsicum.h 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum.h 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,239 @@ +/*- + * Copyright (c) 2009-2010 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $P4: //depot/projects/trustedbsd/capabilities/src/lib/libcapsicum/libcapsicum.h#15 $ + */ + +#ifndef _LIBCAPSICUM_H_ +#define _LIBCAPSICUM_H_ + +#include +#include + +__BEGIN_DECLS + +struct lc_sandbox; +struct lc_host; + +/* + * A list of file descriptors, which can be passed around in shared memory. + */ +struct lc_fdlist; +struct lc_fdlist *lc_fdlist_new(void); +struct lc_fdlist *lc_fdlist_global(void); +struct lc_fdlist *lc_fdlist_dup(struct lc_fdlist *lfp_orig); +void lc_fdlist_free(struct lc_fdlist *lfp); +void lc_fdlist_print(struct lc_fdlist *lfp, int outFD); + +/* + * Size of an FD list in bytes, including all associated string data. + * + * XXX: This will probably become library-private soon. + */ +u_int lc_fdlist_size(struct lc_fdlist *lfp); + +/* + * Add a file descriptor to the list. + * + * lfp the list to add to + * subsystem a software component name, e.g. "org.freebsd.rtld-elf-cap" + * classname a class name, e.g. "libdir" or "library" + * name an instance name, e.g. "system library dir" or "libc.so.6" + * fd the file descriptor + */ +int lc_fdlist_add(struct lc_fdlist *lfp, const char *subsystem, + const char *classname, const char *name, int fd); + +/* + * Append the contents of one list to another. + */ +int lc_fdlist_append(struct lc_fdlist *to, struct lc_fdlist *from); + + +/* + * Like lc_fdlist_add(), but allows capability rights to be specified. 
The + * file descriptor will be wrapped in a capability with the given rights (so + * if the descriptor *is* a capability, its rights will be constrained + * according to this rights mask.) + */ +int lc_fdlist_addcap(struct lc_fdlist *l, const char *subsystem, + const char *classname, const char *name, int fd, + cap_rights_t rights); + +/* + * Open a stored file descriptor. + * + * Given a filename '/foo/bar/fubar', this function will attempt to find the file + * in the FD list. If that fails, it will attempt to find a parent directory in the + * FD list and supply a filename relative to that FD (which will be a pointer to a + * location within the supplied filename - do NOT free it!). + */ +int +lc_fdlist_find(struct lc_fdlist *lfp, const char *subsystem, + const char *classname, const char *filename, + const char **relative_name); + +/* + * Look up a file descriptor. + * + * Multiple entries with the same classname are allowed, so iterating through + * all instances of a class is done by supplying an integer 'pos' which is + * used internally to skip entries which have already been seen. If 'pos' is + * 0 or NULL, the first matching entry will be returned. + */ +int lc_fdlist_lookup(struct lc_fdlist *lfp, const char *subsystem, + const char *classname, char **name, int *fdp, int *pos); + +/* + * Look up a file descriptor without a name. Repeated calls to this function + * will iterate through all descriptors in the list. + */ +int lc_fdlist_getentry(struct lc_fdlist *lfp, char **subsystem, + char **classname, char **name, int *fdp, int *pos); + +/* + * Reorder FD list (WARNING: this could be dangerous!). + * + * This call takes all of the file descriptors in the FD list, and moves them + * into a continuous array, starting at the FD given by 'start'. Any file + * descriptors above 'start' which are not in the FD list are closed. + */ +int lc_fdlist_reorder(struct lc_fdlist *lfp); + +/* + * Capability interfaces. 
+ */ +int lc_limitfd(int fd, cap_rights_t rights); + +/* + * Global policy interface to ask whether we should, in fact, sandbox a + * particular optionally sandboxed service, by name. + */ +int lch_autosandbox_isenabled(const char *servicename); + +/* + * Interfaces to start and stop capability mode sandboxs. + */ +int lch_start(const char *sandbox, char *const argv[], u_int flags, + struct lc_fdlist *fds, struct lc_sandbox **lcspp); +int lch_startfd(int fd_sandbox, const char *binname, char *const argv[], + u_int flags, struct lc_fdlist *fds, struct lc_sandbox **lcspp); +void lch_stop(struct lc_sandbox *lcsp); + +/* + * Flags to lch_start_flags: + */ +#define LCH_PERMIT_STDERR 0x00000001 +#define LCH_PERMIT_STDOUT 0x00000002 + +/* + * Interfaces to query state about capability mode sandboxs. + */ +int lch_getsock(struct lc_sandbox *lcsp, int *fdp); +int lch_getpid(struct lc_sandbox *lcsp, pid_t *pidp); +int lch_getprocdesc(struct lc_sandbox *lcsp, int *fdp); + +/* + * Message-passing APIs for the host environment. + */ +struct iovec; +ssize_t lch_recv(struct lc_sandbox *lcsp, void *buf, size_t len, int flags); +ssize_t lch_recv_rights(struct lc_sandbox *lcsp, void *buf, size_t len, + int flags, int *fdp, int *fdcountp); +ssize_t lch_send(struct lc_sandbox *lcsp, const void *msg, size_t len, + int flags); +ssize_t lch_send_rights(struct lc_sandbox *lcsp, const void *msg, size_t len, + int flags, int *fdp, int fdcount); + +/* + * RPC APIs for the host environment. + */ +int lch_rpc(struct lc_sandbox *lcsp, u_int32_t opno, struct iovec *req, + int reqcount, struct iovec *rep, int repcount, size_t *replenp); +int lch_rpc_rights(struct lc_sandbox *lcsp, u_int32_t opno, + struct iovec *req, int reqcount, int *req_fdp, int req_fdcount, + struct iovec *rep, int repcount, size_t *replenp, int *rep_fdp, + int *rep_fdcountp); + +/* + * Interfaces to query state from within capability mode sandboxes. 
+ */ +int lcs_get(struct lc_host **lchpp); +int lcs_getsock(struct lc_host *lchp, int *fdp); + +/* + * Message-passing APIs for the sandbox environment. + */ +ssize_t lcs_recv(struct lc_host *lchp, void *buf, size_t len, int flags); +ssize_t lcs_recv_rights(struct lc_host *lchp, void *buf, size_t len, + int flags, int *fdp, int *fdcountp); +ssize_t lcs_send(struct lc_host *lchp, const void *msg, size_t len, + int flags); +ssize_t lcs_send_rights(struct lc_host *lchp, const void *msg, size_t len, + int flags, int *fdp, int fdcount); + +/* + * RPC APIs for the sandbox environment. + */ +int lcs_recvrpc(struct lc_host *lchp, u_int32_t *opnop, + u_int32_t *seqnop, u_char **bufferp, size_t *lenp); +int lcs_recvrpc_rights(struct lc_host *lchp, u_int32_t *opnop, + u_int32_t *seqnop, u_char **bufferp, size_t *lenp, int *fdp, + int *fdcountp); +int lcs_sendrpc(struct lc_host *lchp, u_int32_t opno, u_int32_t seqno, + struct iovec *rep, int repcount); +int lcs_sendrpc_rights(struct lc_host *lchp, u_int32_t opno, + u_int32_t seqno, struct iovec *rep, int repcount, int *fdp, + int fdcount); + +/* + * Actually an rtld-elf-cap symbol, but declared here so it is available to + * applications. + */ +int ld_libcache_lookup(const char *libname, int *fdp); +int ld_insandbox(void); +/* + * If this call fails because the buffer 'fds' is too small, 'fdlen' will contain + * the size of the array which is actually required. + */ +int ld_libdirs(int *fds, int *fdlen); + +/* + * Applications may declare an alternative entry point to the default ELF + * entry point for their binary, which will be used in preference to 'main' + * in the sandbox environment. 
+ */ +int cap_main(int argc, char *argv[]); + +__END_DECLS + +#endif /* !_LIBCAPSICUM_H_ */ diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum_fdlist.3 src/lib/libcapsicum/libcapsicum_fdlist.3 --- src-clean/lib/libcapsicum/libcapsicum_fdlist.3 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum_fdlist.3 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,188 @@ +.\" +.\" Copyright (c) 2010 Robert N. M. Watson +.\" All rights reserved. +.\" +.\" WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED +.\" ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND +.\" UNEXPECTED WAYS. +.\" +.\" This software was developed at the University of Cambridge Computer +.\" Laboratory with support from a grant from Google, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd January 31, 2010 +.Os +.Dt LIBCAPSICUM_FDLIST 3 +.Sh NAME +.Nm libcapsicum +.Nd "library interface to file descriptor lists" +.Sh LIBRARY +.Lb libcapsicum +.Sh SYNOPSIS +.In sys/types.h +.In sys/capability.h +.In libcapsicum.h +.Ft struct lc_fdlist * +.Fn lc_fdlist_new "void" +.Ft struct lc_fdlist * +.Fn lc_fdlist_global "void" +.Ft struct lc_fdlist * +.Fn lc_fdlist_dup "struct lc_fdlist *lfp" +.Ft void +.Fn lc_fdlist_free "struct lc_fdlist *lfp" +.Ft int +.Fn lc_fdlist_add "struct lc_fdlist *lfp" "const char *subsystem" "const char *classname" "const char *name" "int fd" +.Ft int +.Fn lc_fdlist_addcap "struct lc_fdlist *lfp" "const char *subsystem" "const char *classname" "const char *name" "int fd" "cap_rights_t rights" +.Ft int +.Fn lc_fdlist_append "struct lc_fdlist *to" "struct lc_fdlist *from" +.Ft int +.Fn lc_fdlist_getentry "struct lc_fdlist *lfp" "char **subsystem" "char **classname" "char **name" "int *fdp" "int *pos" +.Ft int +.Fn lc_fdlist_lookup "struct lc_fdlist *lfp" "const char *subsystem" "const char *classname" "const char **name" "int *fdp" "int *pos" +.Sh DESCRIPTION +These +.Nm +library routines create, manage, and destroy file descriptor lists. 
+File descriptor lists are used by +.Nm +to describe sets of rights that should be delegated to newly created +sandboxes, as well as binding them to names so that sandboxed code can look +up file descriptors provided by code in the host without using hard-coded +file descriptor numbers. +This is necessary because file descriptors may not be the same in the host +and sandbox environments. +.Nm +will arrange for all necessary name and descriptor information to be +available in the sandbox, and file descriptor numbers returned in the sandbox +are with respect to the sandbox's file descriptor assignments. +.Pp +Note that the file descriptor list code is not aware of any changes in file +descriptor status that may happen as a result of application behavior, such +as calls to +.Xr open 2 , +.Xr dup 2 , +or +.Xr close 2 . +As such, applications must update any file descriptor lists referring to +manipulated descriptors if the descriptor list will later be queried for +them, or used in creating a new sandbox. +.Ss File descriptor list creation and destruction +These functions create, duplicate, and free file descriptor lists: +.Pp +.Fn lc_fdlist_new +allocates a new file descriptor list containing no file descriptor +registrations. +Sandboxed code may also use +.Fn lc_fdlist_global +to query the global file descriptor list passed in when the sandbox was +created. +.Pp +.Fn lc_fdlist_dup +duplicates an existing file descriptor list, creating a new list with +identical entries. +Once duplicated, the lists may diverge; this allows the creation of a +template list for a class of sandbox, followed by duplication and +customization for a specific sandbox instance. +.Pp +.Fn lc_fdlist_free +frees an existing file descriptor list; note that this does not close or +otherwise modify file descriptors described by the list.
+.Ss File descriptor list entries +Each file descriptor list entry is described by a three-part character string +namespace: +.Bl -tag -width "subsystem" +.It Fa subsystem +Application or library name, globally unique in order to prevent collisions +between software components in the same host/sandbox pair. +.It Fa classname +An application-specific or library-specific name, intended to reflect a +specific software component within that application or library. +.It Fa name +A per-subsystem, per-class namespace, which might contain file names or other +specific object instance description. +.El +.Pp +These functions insert and look up file descriptor list entries: +.Pp +.Fn lc_fdlist_add +adds a file descriptor, +.Fa fd , +with the three-part name +.Fa subsystem , +.Fa classname , +and +.Fa name +to the file descriptor list +.Fa lfp . +.Fn lc_fdlist_addcap +is identical except that it further registers a capability mask to apply to +the descriptor during sandbox creation, avoiding the need for separate calls to +.Xr cap_new 2 +in application code. +.Pp +.Fn lc_fdlist_lookup +looks up a file descriptor using the three-part name +.Fa subsystem , +.Fa classname , +and +.Fa name +from the file descriptor list +.Fa lfp . +.Fn lc_fdlist_getentry +may be used to iterate through all descriptors in the list. +.Sh RETURN VALUES +The +.Fn lc_fdlist_new , +.Fn lc_fdlist_global , +and +.Fn lc_fdlist_dup +functions return a pointer to the desired file descriptor list if successful; +otherwise the value +.Dv NULL +is returned and the global variable +.Va errno +is set to indicate the error. +.Pp +.Rv -std lc_fdlist_add lc_fdlist_addcap lc_fdlist_lookup +.Sh SEE ALSO +.Xr cap_new 2 , +.Xr close 2 , +.Xr dup 2 , +.Xr open 2 , +.Xr libcapsicum 3 , +.Xr libcapsicum_host 3 , +.Xr libcapsicum_sandbox 3 +.Sh HISTORY +Support for capabilities and capabilities mode was developed as part of the +.Tn TrustedBSD +Project.
+.Sh BUGS +WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED ON IN +PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND UNEXPECTED WAYS. +.Sh AUTHORS +These functions were created by +.An "Jonathan Anderson" +at the University of Cambridge Computer Laboratory. diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum_fdlist.c src/lib/libcapsicum/libcapsicum_fdlist.c --- src-clean/lib/libcapsicum/libcapsicum_fdlist.c 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum_fdlist.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,629 @@ +/*- + * Copyright (c) 2009 Jonathan Anderson + * Copyright (c) 2010 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $P4: //depot/projects/trustedbsd/capabilities/src/lib/libcapsicum/libcapsicum_fdlist.c#15 $ + */ + +#include +#include + +#define _WITH_DPRINTF +#include +#include +#include +#include +#include +#include +#include + +#include "libcapsicum_internal.h" +#include "libcapsicum_sandbox_api.h" + +struct lc_fdlist_entry { + u_int sysoff; /* offset of e.g. "org.freebsd.rtld-elf-cap" */ + u_int syslen; /* length of above */ + u_int classoff; /* offset of variable ID e.g. "libs" */ + u_int classnamelen; /* length of above */ + u_int nameoff; /* offset of entry name (e.g. 
"libc.so.7") */ + u_int namelen; /* length of above */ + int fd; /* the file descriptor */ +}; + +struct lc_fdlist_storage { + u_int count; /* number of entries */ + u_int capacity; /* entries that we can hold */ + u_int namelen; /* bytes of name data */ + u_int namecapacity; /* bytes of name data we can hold */ + struct lc_fdlist_entry entries[]; /* entries in the descriptor list */ + + /* followed by bytes of name data */ +}; + +struct lc_fdlist { + pthread_mutex_t lf_lock; /* for thread safety */ + struct lc_fdlist_storage *lf_storage; +}; + +#define LOCK(lfp) pthread_mutex_lock(&((lfp)->lf_lock)); +#define UNLOCK(lfp) pthread_mutex_unlock(&((lfp)->lf_lock)); + +/* Where an FD list's name byte array starts */ +static char *lc_fdlist_storage_names(struct lc_fdlist_storage *lfsp); +static u_int lc_fdlist_storage_size(struct lc_fdlist_storage *lfsp); + +static struct lc_fdlist global_fdlist = { + .lf_lock = PTHREAD_MUTEX_INITIALIZER, +}; + +struct lc_fdlist * +lc_fdlist_global(void) +{ + char *env; + + /* + * global_fdlist.lf_storage is set to a non-NULL value after the + * first call, and will never change; global_fdlist is only valid + * once it has non-NULL storage. + */ + LOCK(&global_fdlist); + if (global_fdlist.lf_storage != NULL) { + UNLOCK(&global_fdlist); + return (&global_fdlist); + } + env = getenv(LIBCAPSICUM_SANDBOX_FDLIST); + if ((env != NULL) && (strnlen(env, 8) < 7)) { + struct lc_fdlist_storage *lfsp; + struct stat sb; + int fd = -1; + + /* XXX: Should use strtol(3). */ + for (int i = 0; (i < 7) && env[i]; i++) { + if ((env[i] < '0') || (env[i] > '9')) + goto fail; + } + if (sscanf(env, "%d", &fd) != 1) + goto fail; + if (fd < 0) + goto fail; + if (fstat(fd, &sb) < 0) + goto fail; + lfsp = mmap(NULL, sb.st_size, PROT_READ | PROT_WRITE, + MAP_NOSYNC | MAP_SHARED, fd, 0); + if (lfsp == MAP_FAILED) + goto fail; + + /* + * XXX: Should perform additional validation of shared memory + * to make sure sizes/etc are internally consistent. 
+ */ + global_fdlist.lf_storage = lfsp; + /* Drop the lock on the success path too, not only on failure. */ + UNLOCK(&global_fdlist); + return (&global_fdlist); + } + +fail: + /* XXX: We don't always set errno before returning. */ + UNLOCK(&global_fdlist); + return (NULL); +} + +#define INITIAL_ENTRIES 16 +#define INITIAL_NAMEBYTES (64 * INITIAL_ENTRIES) + +/* + * Allocate an empty descriptor list with room for INITIAL_ENTRIES entries + * and INITIAL_NAMEBYTES bytes of name data. Returns NULL if either + * allocation or the mutex initialisation fails. + */ +struct lc_fdlist * +lc_fdlist_new(void) +{ + struct lc_fdlist_storage *lfsp; + struct lc_fdlist *lfp; + u_int bytes; + + lfp = malloc(sizeof(*lfp)); + if (lfp == NULL) + return (NULL); + bytes = sizeof(*lfsp) + + INITIAL_ENTRIES * sizeof(struct lc_fdlist_entry) + + INITIAL_NAMEBYTES; + lfsp = lfp->lf_storage = malloc(bytes); + if (lfsp == NULL) { + free(lfp); + return (NULL); + } + lfsp->count = 0; + lfsp->capacity = INITIAL_ENTRIES; + lfsp->namelen = 0; + lfsp->namecapacity = INITIAL_NAMEBYTES; + if (pthread_mutex_init(&lfp->lf_lock, NULL) != 0) { + free(lfp->lf_storage); + free(lfp); + return (NULL); + } + return (lfp); +} + +/* + * Create an independent copy of lfp_orig: a new lock plus a byte-for-byte + * copy of its storage, taken while lfp_orig is locked. Returns NULL on + * allocation or mutex initialisation failure. + */ +struct lc_fdlist * +lc_fdlist_dup(struct lc_fdlist *lfp_orig) +{ + struct lc_fdlist *lfp_new; + u_int size; + + lfp_new = malloc(sizeof(*lfp_new)); + if (lfp_new == NULL) + return (NULL); + if (pthread_mutex_init(&lfp_new->lf_lock, NULL) != 0) { + free(lfp_new); + return (NULL); + } + LOCK(lfp_orig); + size = lc_fdlist_storage_size(lfp_orig->lf_storage); + lfp_new->lf_storage = malloc(size); + if (lfp_new->lf_storage == NULL) { + UNLOCK(lfp_orig); + pthread_mutex_destroy(&lfp_new->lf_lock); + free(lfp_new); + return (NULL); + } + memcpy(lfp_new->lf_storage, lfp_orig->lf_storage, size); + UNLOCK(lfp_orig); + return (lfp_new); +} + +/* + * Release the list's storage, lock and handle. The descriptors named by + * the list are not closed. + */ +void +lc_fdlist_free(struct lc_fdlist *lfp) +{ + + free(lfp->lf_storage); + pthread_mutex_destroy(&lfp->lf_lock); + free(lfp); +} + +/* + * Write a human-readable dump of every entry in lfp to descriptor outFD. + */ +void +lc_fdlist_print(struct lc_fdlist *lfp, int outFD) +{ + dprintf(outFD, "FD List:\n"); + for(int i = 0; ; ) + { + char *subsystem, *classname, *name; + int fd; + + if (lc_fdlist_getentry(lfp, &subsystem, &classname, &name, &fd, &i) + < 0) + break; + + dprintf(outFD, "% 3d:\t'%s'.'%s': '%s'\n", + fd, subsystem, classname, name); + + /* + * lc_fdlist_getentry() hands back all three strings in a single + * allocation whose base is 'subsystem'. + */ + free(subsystem); + } +} + +int
+lc_fdlist_add(struct lc_fdlist *lfp, const char *subsystem, + const char *classname, const char *name, int fd) +{ + struct lc_fdlist_storage *lfsp; + + LOCK(lfp); + lfsp = lfp->lf_storage; + + /* Do we need more entry space? */ + if (lfsp->count == lfsp->capacity) { + u_int namebytes_per_entry, newnamebytes, newsize; + struct lc_fdlist_storage *lfsp_copy; + char *tmp = NULL; + + /* + * The name bytes live directly after the entry array, so growing + * the array moves them: copy name data out of the way first. + */ + if (lfsp->namelen > 0) { + tmp = malloc(lfsp->namelen); + if (tmp == NULL) { + UNLOCK(lfp); + return (-1); + } + memcpy(tmp, lc_fdlist_storage_names(lfsp), + lfsp->namelen); + } + + /* Double the number of available entries. */ + namebytes_per_entry = lfsp->namecapacity / lfsp->capacity; + newnamebytes = lfsp->capacity * namebytes_per_entry; + newsize = lc_fdlist_storage_size(lfsp) + newnamebytes + + lfsp->capacity * sizeof(struct lc_fdlist_entry); + lfsp_copy = realloc(lfsp, newsize); + if (lfsp_copy == NULL) { + free(tmp); + UNLOCK(lfp); + return (-1); + } + + lfsp_copy->capacity *= 2; + lfsp_copy->namecapacity += newnamebytes; + + /* Copy name bytes back. */ + if (lfsp_copy->namelen > 0) + memcpy(lc_fdlist_storage_names(lfsp_copy), tmp, + lfsp_copy->namelen); + + lfsp = lfp->lf_storage = lfsp_copy; + free(tmp); + } + + /* + * Do we need more name space? Each of the three strings is stored + * together with its terminating NUL, hence the "+ 3". + */ + u_int subsyslen = strlen(subsystem); + u_int classnamelen = strlen(classname); + u_int namelen = strlen(name); + u_int needed = lfsp->namelen + subsyslen + classnamelen + namelen + 3; + + if (needed > lfsp->namecapacity) { + + /* Keep doubling the name capacity until the new names fit. */ + struct lc_fdlist_storage *lfsp_enlarged; + u_int newcapacity = lfsp->namecapacity; + + do { + newcapacity = (newcapacity > 0) ? newcapacity * 2 : needed; + } while (newcapacity < needed); + + lfsp_enlarged = realloc(lfsp, lc_fdlist_storage_size(lfsp) - + lfsp->namecapacity + newcapacity); + if (lfsp_enlarged == NULL) { + UNLOCK(lfp); + return (-1); + } + + lfsp_enlarged->namecapacity = newcapacity; + lfsp = lfp->lf_storage = lfsp_enlarged; + } + + /* Create the new entry. 
*/ + struct lc_fdlist_entry *entry = lfsp->entries + lfsp->count; + + entry->fd = fd; + + char *names = lc_fdlist_storage_names(lfsp); + char *head = names + lfsp->namelen; + + strncpy(head, subsystem, subsyslen + 1); + entry->sysoff = (head - names); + entry->syslen = subsyslen; + head += subsyslen + 1; + + strncpy(head, classname, classnamelen + 1); + entry->classoff = (head - names); + entry->classnamelen = classnamelen; + head += classnamelen + 1; + + strncpy(head, name, namelen + 1); + entry->nameoff = (head - names); + entry->namelen = namelen + 1; + head += namelen + 1; + + lfsp->count++; + lfsp->namelen = (head - names); + + UNLOCK(lfp); + return (0); +} + +/* + * Add a copy of every entry of 'from' to the end of 'to'. A NULL 'from' + * is treated as an empty list; a NULL 'to' fails with EINVAL. Returns 0 + * on success and -1 on failure. + */ +int +lc_fdlist_append(struct lc_fdlist *to, struct lc_fdlist *from) +{ + int pos = 0; + if (to == NULL) { + errno = EINVAL; + return (-1); + } + + if (from == NULL) + return (0); + + /* Use address to order lc_fdlist locks. */ + if ((uintptr_t)to < (uintptr_t)from) { + LOCK(to); + LOCK(from); + } else { + LOCK(from); + LOCK(to); + } + + for (u_int i = 0; i < from->lf_storage->count; i++) { + char *subsystem; + char *classname; + char *name; + int fd; + + /* + * XXXRW: This recurses the from lock. + */ + if (lc_fdlist_getentry(from, &subsystem, &classname, &name, + &fd, &pos) < 0) + goto fail; + + /* + * XXXRW: This recurses the to lock. + */ + if (lc_fdlist_add(to, subsystem, classname, name, fd) < 0) { + free(subsystem); + goto fail; + } + free(subsystem); + } + + /* + * Release both locks on the success path as well; previously they + * were only dropped on failure. + */ + UNLOCK(from); + UNLOCK(to); + return (0); + +fail: + UNLOCK(from); + UNLOCK(to); + return (-1); +} + +int +lc_fdlist_addcap(struct lc_fdlist *fdlist, const char *subsystem, + const char *classname, const char *name, int fd, cap_rights_t rights) +{ + int capfd; + + /* + * XXXRW: This API isn't particularly caller-friendly, in that it + * allocates a descriptor that the caller is responsible for freeing, + * but doesn't tell the caller what fd that is. Not yet clear what + * the preferred API is. 
+ */ + capfd = cap_new(fd, rights); + if (capfd < 0) + return (-1); + if (lc_fdlist_add(fdlist, subsystem, classname, name, capfd) < 0) { + /* The capability never made it into the list; don't leak it. */ + int error = errno; + + close(capfd); + errno = error; + return (-1); + } + return (0); +} + +/* + * Look for 'filename' among the entries matching subsystem/classname. + * An entry whose name equals 'filename' is preferred; otherwise the first + * entry whose name is a prefix of 'filename' is used. Returns the entry's + * descriptor, or -1 if nothing matches. If relative_name is non-NULL it + * receives the part of 'filename' that follows the matched name (the empty + * tail for an exact match), or NULL when no entry matched. + */ +int +lc_fdlist_find(struct lc_fdlist *lfp, const char *subsystem, + const char *classname, const char *filename, + const char **relative_name) +{ + int pos = 0; + int fd = -1; + + /* try to find the file itself in the FD list */ + size_t len = strlen(filename); + if (relative_name) + *relative_name = filename + len; + + while (fd == -1) + { + char *dirname; + + if (lc_fdlist_lookup(lfp, subsystem, classname, + &dirname, &fd, &pos) == -1) + break; + + if (strncmp(dirname, filename, len + 1)) fd = -1; + free(dirname); + } + + if (fd >= 0) return fd; + + + /* now try to find a parent directory and a relative filename */ + if (relative_name) + *relative_name = NULL; + pos = 0; + + while (fd == -1) + { + char *dirname; + + if (lc_fdlist_lookup(lfp, subsystem, classname, + &dirname, &fd, &pos) == -1) + return (-1); + + len = strlen(dirname); + + /* if there is no filename, we can't do relative naming */ + if (len == 0) + { + fd = -1; + free(dirname); + continue; + } + + if (strncmp(dirname, filename, len)) fd = -1; + else if (relative_name) + { + /* skip the matched prefix and one separating slash */ + *relative_name = filename + len; + if (**relative_name == '/') (*relative_name)++; + } + + free(dirname); + } + + return fd; +} + + +int +lc_fdlist_lookup(struct lc_fdlist *lfp, const char *subsystem, + const char *classname, char **name, int *fdp, int *pos) +{ + struct lc_fdlist_storage *lfsp; + + LOCK(lfp); + lfsp = lfp->lf_storage; + if ((pos != NULL) && (*pos >= (int)lfsp->count)) { + UNLOCK(lfp); + errno = EINVAL; + return (-1); + } + + int successful = 0; + const char *names = lc_fdlist_storage_names(lfsp); + + for (u_int i = (pos ? *pos : 0); i < lfsp->count; i++) { + struct lc_fdlist_entry *entry = lfsp->entries + i; + + if ((!subsystem + || !strncmp(subsystem, names + entry->sysoff, + entry->syslen + 1)) + && (!classname + || !strncmp(classname, names + entry->classoff, + entry->classnamelen + 1))) + { + /* found a matching entry! 
*/ + successful = 1; + *fdp = entry->fd; + + if (name) { + *name = malloc(entry->namelen + 1); + if (*name == NULL) { + /* Out of memory: fail rather than copy into NULL. */ + UNLOCK(lfp); + return (-1); + } + strncpy(*name, names + entry->nameoff, + entry->namelen + 1); + } + if (pos) *pos = i + 1; + break; + } + } + UNLOCK(lfp); + if (successful) + return (0); + + errno = ENOENT; + return (-1); +} + +/* + * Copy out entry number *pos of lfp (entry 0 when pos is NULL) and, when + * pos is given, advance it by one. The three strings are returned in a + * single allocation whose base is *subsystem; the caller releases all of + * them with free(*subsystem). Returns 0 on success, or -1 with errno set + * to EINVAL for a NULL output pointer or an index outside the list; -1 is + * also returned when the buffer cannot be allocated. + */ +int +lc_fdlist_getentry(struct lc_fdlist *lfp, char **subsystem, char **classname, + char **name, int *fdp, int *pos) +{ + struct lc_fdlist_storage *lfsp; + + if ((subsystem == NULL) || (classname == NULL) || (name == NULL) || + (fdp == NULL)) { + errno = EINVAL; + return (-1); + } + + LOCK(lfp); + lfsp = lfp->lf_storage; + + /* Reject an index outside the list; this also covers an empty list. */ + int idx = (pos != NULL) ? *pos : 0; + if ((idx < 0) || (idx >= (int) lfsp->count)) { + UNLOCK(lfp); + errno = EINVAL; + return (-1); + } + + struct lc_fdlist_entry *entry = lfsp->entries + idx; + char *names = lc_fdlist_storage_names(lfsp); + + /* One buffer holds all three strings, each followed by its NUL. */ + size_t size = (size_t)entry->syslen + entry->classnamelen + + entry->namelen + 3; + char *head = malloc(size); + if (head == NULL) { + UNLOCK(lfp); + return (-1); + } + + strncpy(head, names + entry->sysoff, entry->syslen + 1); + *subsystem = head; + head += entry->syslen + 1; + + strncpy(head, names + entry->classoff, entry->classnamelen + 1); + *classname = head; + head += entry->classnamelen + 1; + + strncpy(head, names + entry->nameoff, entry->namelen + 1); + *name = head; + + *fdp = entry->fd; + UNLOCK(lfp); + if (pos) + (*pos)++; + return (0); +} + +int +lc_fdlist_reorder(struct lc_fdlist *lfp) +{ + struct lc_fdlist_storage *lfsp; + + LOCK(lfp); + lfsp = lfp->lf_storage; + + /* + * Identify the highest source file descriptor we care about so that + * when we play the dup2() rearranging game, we don't overwrite any + * we care about. + */ + int highestfd = -1; + for (u_int i = 0; i < lfsp->count; i++) { + if (lfsp->entries[i].fd > highestfd) + highestfd = lfsp->entries[i].fd; + } + highestfd++; /* Don't tread on the highest */ + + /* + * First, move all our descriptors up the range. + */ + for (u_int i = 0; i < lfsp->count; i++) { + if (dup2(lfsp->entries[i].fd, highestfd + i) < 0) { + UNLOCK(lfp); + return (-1); + } + } + + /* + * Now put them back. 
+ */ + for (u_int i = 0; i < lfsp->count; i++) { + if (dup2(highestfd + i, i) < 0) { + UNLOCK(lfp); + return (-1); + } + + lfsp->entries[i].fd = i; + } + + /* + * Close the descriptors that we moved, as well as any others that + * were left open by the caller. + */ + closefrom(lfsp->count); + UNLOCK(lfp); + return (0); +} + +static u_int +lc_fdlist_storage_size(struct lc_fdlist_storage *lfsp) +{ + + return (sizeof(*lfsp) + + lfsp->capacity * sizeof(struct lc_fdlist_entry) + + lfsp->namecapacity); +} + +u_int +lc_fdlist_size(struct lc_fdlist *lfp) +{ + u_int size; + + LOCK(lfp); + size = lc_fdlist_storage_size(lfp->lf_storage); + UNLOCK(lfp); + return (size); +} + +static char * +lc_fdlist_storage_names(struct lc_fdlist_storage *lfsp) +{ + + return (((char *) lfsp) + lc_fdlist_storage_size(lfsp) - + lfsp->namecapacity); +} + +void* +_lc_fdlist_getstorage(struct lc_fdlist* lfp) +{ + + return (lfp->lf_storage); +} diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum_host.3 src/lib/libcapsicum/libcapsicum_host.3 --- src-clean/lib/libcapsicum/libcapsicum_host.3 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum_host.3 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,236 @@ +.\" +.\" Copyright (c) 2009 Robert N. M. Watson +.\" All rights reserved. +.\" +.\" WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED +.\" ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND +.\" UNEXPECTED WAYS. +.\" +.\" This software was developed at the University of Cambridge Computer +.\" Laboratory with support from a grant from Google, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD$ +.\" +.Dd June 11, 2009 +.Os +.Dt LIBCAPSICUM_HOST 3 +.Sh NAME +.Nm libcapsicum +.Nd "library interface to capability-mode services" +.Sh LIBRARY +.Lb libcapsicum +.Sh SYNOPSIS +.In sys/types.h +.In sys/capability.h +.In libcapsicum.h +.Ft int +.Fn lch_start "const char *sandbox" "char *const argv[]" "u_int flags" "struct lc_sandbox **lcsp" +.Ft int +.Fn lch_startfd "int fd_sandbox" "char *const argv[]" "u_int flags" "struct lc_sandbox **lcsp" +.Ft void +.Fn lch_stop "struct lc_sandbox *lcsp" +.Ft int +.Fn lch_autosandbox_isenabled "const char *servicename" +.Ft int +.Fn lch_getsock "struct lc_sandbox *lcsp" "int *fdp" +.Ft int +.Fn lch_getpid "struct lc_sandbox *lcsp" "pid_t *pidp" +.Ft int +.Fn lch_getprocdesc "struct lc_sandbox *lcsp" "int *fdp" +.Ft ssize_t +.Fn lch_recv "struct lc_sandbox *lcsp, void *buf" "size_t len" "int flags" +.Ft ssize_t +.Fn lch_recv_rights "struct lc_sandbox *lcsp" "void *buf" "size_t len" "int flags" "int *fdp" "int *fdcountp" +.Ft int +.Fn lch_rpc "struct lc_sandbox *lcsp" "u_int32_t opno" "struct iovec *req" "int reqcount" "struct iovec *rep" "int repcount" "size_t *replenp" +.Ft int +.Fn lch_rpc_rights "struct lc_sandbox *lcsp" "u_int32_t opno" "struct iovec *req" "int reqcount" "int *req_fdp" "int req_fdcount" "struct iovec *rep" "int repcount" "size_t *replenp" "int *rep_fdp" "int *rep_fdcountp" +.Ft ssize_t +.Fn lch_send "struct lc_sandbox *lcsp" "const void *msg" "size_t len" "int flags" +.Ft ssize_t +.Fn lch_send_rights "struct lc_sandbox *lcsp" "const void *msg" "size_t len" "int flags" "int *fdp" "int fdcount" +.Sh DESCRIPTION +The +.Nm +library routines provide services for processes hosting or running in +capability mode. +Depending on the requirements of the host and sandbox, the API can simply be +used to set up and stop sandboxes, used to manage I/O using a +.Xr unix 4 +domain socket connection to the sandbox, or can provide a basic remote +procedure call (RPC) facility. 
+Applications may also use +.Xr rpcgen 1 +to build event handling and marshaling code. +.Pp +This man page describes the host API. +General information on +.Nm +may be found in +.Xr libcapsicum 3 . +Information on the sandbox API may be found in +.Xr libcapsicum_sandbox 3 . +.Sh HOST API +The +.Nm +host API allows processes to start, stop, and manage sandboxes running in +capability mode. +Host API functions can be identified by their function name prefix, +.Dv lch_ . +.Pp +Each executing sandbox instance is described by an opaque +.Vt "struct lc_sandbox *" , +which is returned by +.Fn lch_start +for successfully started sandboxes, and passed into other APIs to indicate +which sandbox should be acted on. +.Fn lch_start +creates a new executing sandbox, given the name of the sandbox binary via +.Va sandbox , +and command line arguments +.Va argv , +and optional flags +.Va flags +to fine-tune aspects of sandbox operation; the only currently defined flag is +.Dv LCH_PERMIT_STDERR , +which allows the sandbox to write to the current process's +.Dv stderr . +By default, this is not permitted. +.Pp +.Fn lch_startfd +accepts a file descriptor argument, +.Va fd_sandbox , +rather than a path, so is appropriate for use within a sandbox. +.Pp +Executing sandboxes may be stopped (and all state freed) using +.Fn lch_stop . +Following a call to +.Fn lch_stop , +the +.Va lcsp +argument will no longer be valid. +.Pp +Libraries and tools performing self-compartmentalization can use the +interface +.Fn lch_autosandbox_isenabled +along with a unique string identifying their service to determine whether +a global policy affecting the service requires sandboxing to be enabled +or not. +.Pp +Properties of the sandbox, such as the socket used to communicate with it, +the process descriptor for the sandbox process, and the pid, may be queried +using +.Fn lch_getsock , +.Fn lch_getprocdesc , +and +.Fn lch_getpid .
+.Pp +.Nm +implements a number of I/O functions as part of the host API, which are +documented in +.Xr libcapsicum_host 3 . +.Fn lch_recv +and +.Fn lch_send +provide simple wrappers around +.Xr recv 2 +and +.Xr send 2 +to save sandbox consumers from having to query sandbox socket file +descriptors before use. +.Pp +.Fn lch_recv_rights +and +.Fn lch_send_rights +are similar, but allow file descriptors to be attached to the messages +received and sent. +Both accept a pointer to a file descriptor array, +.Va fdp . +Callers to +.Fn lch_recv_rights +will pass in the length of the array via +.Va fdcountp , +whose value will be changed to the actual number of file descriptors +received. +Callers to +.Fn lch_send_rights +will pass in the number of file descriptors in the array via +.Va fdcount . +.Pp +.Fn lch_rpc +provides a simple synchronous RPC facility, and is intended to be used in +coordination with the +.Fn lcs_recvrpc +and +.Fn lcs_sendrpc +sandbox APIs. +The host provides an operation number meaningful to the sandbox, +.Va opno , +RPC arguments represented by +.Va req +and +.Va reqcount +using an +.Vt iovec +in the style of +.Xr writev 2 , +and similar receive buffers passed via +.Va rep +and +.Va repcount . +If the RPC fails, -1 will be returned; otherwise 0 is returned and the size of any reply is +returned by reference using +.Va replenp . +.Fn lch_rpc_rights +allows the sending and receiving of file descriptors as part of the RPC +operation. +.Sh SEE ALSO +.Xr rpcgen 1 , +.Xr recv 2 , +.Xr send 2 , +.Xr writev 2 , +.Xr free 3 , +.Xr libcapsicum 3 , +.Xr libcapsicum_sandbox 3 , +.Xr malloc 3 , +.Xr unix 4 +.Sh HISTORY +Support for capabilities and capabilities mode was developed as part of the +.Tn TrustedBSD +Project. +.Sh BUGS +WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED ON IN +PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND UNEXPECTED WAYS. +.Pp +All sequence numbers will always have the value 0.
+This is fine from a retransmission perspective, as generally no +retransmission should be required, but consumers should serialize use of the +RPC service when consuming it from concurrent callers (such as multiple +threads or multiple processes) to prevent I/O interlacing from corrupting the +RPC stream. +.Sh AUTHORS +These functions and the capability facility were created by +.An "Robert N. M. Watson" +at the University of Cambridge Computer Laboratory with support from a grant +from Google, Inc. diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum_host.c src/lib/libcapsicum/libcapsicum_host.c --- src-clean/lib/libcapsicum/libcapsicum_host.c 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum_host.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,424 @@ +/*- + * Copyright (c) 2009-2010 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $P4: //depot/projects/trustedbsd/capabilities/src/lib/libcapsicum/libcapsicum_host.c#19 $ + */ + +#include +#include +#include +#include +#include +#include +#include + +#define _WITH_DPRINTF +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libcapsicum.h" +#include "libcapsicum_internal.h" +#include "libcapsicum_sandbox_api.h" + +#define LIBCAPSICUM_CAPMASK_SOCK (CAP_EVENT | CAP_READ | CAP_WRITE) +#define LIBCAPSICUM_CAPMASK_BIN (CAP_READ | CAP_EVENT | CAP_FSTAT | \ + CAP_FSTATFS | \ + CAP_FEXECVE | CAP_MMAP | \ + CAP_MAPEXEC) +#define LIBCAPSICUM_CAPMASK_SANDBOX LIBCAPSICUM_CAPMASK_BIN +#define LIBCAPSICUM_CAPMASK_LDSO LIBCAPSICUM_CAPMASK_BIN +#define LIBCAPSICUM_CAPMASK_LIBDIR LIBCAPSICUM_CAPMASK_BIN \ + | CAP_LOOKUP | CAP_ATBASE +#define LIBCAPSICUM_CAPMASK_FDLIST CAP_READ | CAP_WRITE | CAP_FTRUNCATE \ + | CAP_FSTAT | CAP_MMAP + +#define LIBCAPSICUM_CAPMASK_STDOUT CAP_WRITE | CAP_SEEK | CAP_FSTAT + +extern char **environ; + +#define LD_ELF_CAP_SO "ld-elf-cap.so.1" +#define PATH_LD_ELF_CAP_SO "/libexec" + +int +lch_autosandbox_isenabled(__unused const char *servicename) +{ + + if (getenv("LIBCAPSICUM_NOAUTOSANDBOX") != NULL) + return (0); + return (1); +} + +/* + * Once in the child process, create the new sandbox. + * + * XXX: A number of things happen here that are not safe after fork(), + * especially calls to err(). 
+ */ +static void +lch_sandbox(int fd_sock, int fd_binary, int fd_rtld, u_int flags, + const char *binname, char *const argv[], struct lc_fdlist *userfds) +{ + struct sbuf *sbufp; + int shmfd = -1; + size_t fdlistsize; + struct lc_fdlist *fds; + void *shm; + + /* + * Inform the run-time linked of the binary's name. + */ + if (setenv("LD_BINNAME", binname, 1) == -1) + err(-1, "Error in setenv(LD_BINNAME)"); + + /* + * Create an anonymous shared memory segment for the FD list. + */ + shmfd = shm_open(SHM_ANON, O_RDWR | O_CREAT | O_TRUNC, 0600); + if (shmfd < 0) + err(-1, "Error creating shared memory segment"); + + /* + * Create and fill up the FD list. + */ + fds = lc_fdlist_new(); + if (fds == NULL) + err(-1, "Error in lc_fdlist_new()"); + + if (lc_fdlist_addcap(fds, LIBCAPSICUM_FQNAME, "stdin", "", + STDIN_FILENO, 0) < 0) + err(-1, "Error in lc_fdlist_addcap(stdin)"); + + if (lc_fdlist_addcap(fds, LIBCAPSICUM_FQNAME, "stdout", "", + STDOUT_FILENO, + (flags & LCH_PERMIT_STDOUT) ? LIBCAPSICUM_CAPMASK_STDOUT : 0) < 0) + err(-1, "Error in lc_fdlist_addcap(stdout)"); + + if (lc_fdlist_addcap(fds, LIBCAPSICUM_FQNAME, "stderr", "", + STDERR_FILENO, + (flags & LCH_PERMIT_STDERR) ? LIBCAPSICUM_CAPMASK_STDOUT : 0) < 0) + err(-1, "Error in lc_fdlist_addcap(stderr)"); + + if (lc_fdlist_addcap(fds, LIBCAPSICUM_FQNAME, "socket", "", + fd_sock, LIBCAPSICUM_CAPMASK_SOCK) < 0) + err(-1, "Error in lc_fdlist_addcap(fd_sock)"); + + if (lc_fdlist_addcap(fds, LIBCAPSICUM_FQNAME, "fdlist", "", + shmfd, LIBCAPSICUM_CAPMASK_FDLIST) < 0) + err(-1, "Error in lc_fdlist_addcap(shmfd)"); + + if (lc_fdlist_addcap(fds, RTLD_CAP_FQNAME, "rtld", "", + fd_rtld, LIBCAPSICUM_CAPMASK_LDSO) < 0) + err(-1, "Error in lc_fdlist_addcap(fd_rtld)"); + + if (lc_fdlist_addcap(fds, RTLD_CAP_FQNAME, "Executable", binname, + fd_binary, LIBCAPSICUM_CAPMASK_SANDBOX) < 0) + err(-1, "Error in lc_fdlist_addcap(fd_binary)"); + + /* + * Ask RTLD for library path descriptors. 
+ * + * NOTE: This is FreeBSD-specific; porting to other operating systems + * will require dynamic linkers capable of answering similar queries. + */ + int size = 16; + int *libdirs; + + while (1) { + libdirs = malloc(size * sizeof(int)); + if (ld_libdirs(libdirs, &size) < 0) { + free(libdirs); + if (size > 0) + continue; + err(-1, "Error in ld_libdirs()"); + } else + break; + } + + for (int j = 0; j < size; j++) { + if (lc_fdlist_addcap(fds, RTLD_CAP_FQNAME, "LibraryDirectory", "", + libdirs[j], LIBCAPSICUM_CAPMASK_LIBDIR) < 0) + err(-1, "Error in lc_fdlist_addcap(libdirs[%d]: %d)", + j, libdirs[j]); + } + + /* Append user FD list and reorder the descriptors */ + if (lc_fdlist_append(fds, userfds) < 0) + err(-1, "Error in lc_fdlist_append()"); + + if (lc_fdlist_reorder(fds) < 0) + err(-1, "Error in lc_fdlist_reorder()"); + + + + + /* + * Find the fdlist shared memory segment. + */ + int pos = 0; + if (lc_fdlist_lookup(fds, LIBCAPSICUM_FQNAME, "fdlist", NULL, &shmfd, + &pos) < 0) + err(-1, "Error in lc_fdlist_lookup(fdlist)"); + + char tmp[8]; + sprintf(tmp, "%d", shmfd); + if (setenv(LIBCAPSICUM_SANDBOX_FDLIST, tmp, 1) == -1) + err(-1, "Error in setenv(LIBCAPSICUM_SANDBOX_FDLIST)"); + + /* + * Map it and copy the list. + */ + fdlistsize = lc_fdlist_size(fds); + if (ftruncate(shmfd, fdlistsize) < 0) + err(-1, "Error in ftruncate(shmfd)"); + + shm = mmap(NULL, fdlistsize, PROT_READ | PROT_WRITE, + MAP_NOSYNC | MAP_SHARED, shmfd, 0); + if (shm == MAP_FAILED) + err(-1, "Error mapping fdlist SHM"); + + memcpy(shm, _lc_fdlist_getstorage(fds), fdlistsize); + if (munmap(shm, fdlistsize)) + err(-1, "Error in munmap(shm, fdlistsize)"); + + + /* + * Find RTLD. + */ + if (lc_fdlist_lookup(fds, RTLD_CAP_FQNAME, "rtld", NULL, &fd_rtld, + NULL) < 0) + err(-1, "Error in lc_fdlist_lookup(RTLD)"); + + /* + * Find the binary for RTLD. 
+ */ + if (lc_fdlist_lookup(fds, RTLD_CAP_FQNAME, "Executable", NULL, + &fd_binary, NULL) < 0) + err(-1, "Error in lc_fdlist_lookup(Executable)"); + + sprintf(tmp, "%d", fd_binary); + if (setenv("LD_BINARY", tmp, 1) != 0) + err(-1, "Error in setenv(LD_BINARY)"); + + /* + * Build LD_LIBRARY_DIRS and LD_PRELOAD for RTLD. + * + * NOTE: This is FreeBSD-specific; porting to other operating systems + * will require dynamic linkers capable of operating on file + * descriptors. + */ + sbufp = sbuf_new_auto(); + if (sbufp == NULL) + err(-1, "Error in sbuf_new_auto()"); + + { + int fd; + pos = 0; + while (lc_fdlist_lookup(fds, RTLD_CAP_FQNAME, "LibraryDirectory", + NULL, &fd, &pos) >= 0) + sbuf_printf(sbufp, "%d:", fd); + } + + sbuf_finish(sbufp); + if (sbuf_overflowed(sbufp)) + err(-1, "sbuf_overflowed()"); + if (setenv("LD_LIBRARY_DIRS", sbuf_data(sbufp), 1) == -1) + err(-1, "Error in setenv(LD_LIBRARY_DIRS)"); + sbuf_delete(sbufp); + + sbufp = sbuf_new_auto(); + if (sbufp == NULL) + err(-1, "Error in sbuf_new_auto()"); + + { + int fd; + pos = 0; + while (lc_fdlist_lookup(fds, RTLD_CAP_FQNAME, "PreloadLibrary", + NULL, &fd, &pos) >= 0) + sbuf_printf(sbufp, "%d:", fd); + } + + sbuf_finish(sbufp); + if (sbuf_overflowed(sbufp)) + err(-1, "sbuf_overflowed()"); + if (setenv("LD_PRELOAD", sbuf_data(sbufp), 1) == -1) + err(-1, "Error in setenv(LD_PRELOAD)"); + sbuf_delete(sbufp); + + if (cap_enter() < 0) + err(-1, "cap_enter() failed"); + + (void)fexecve(fd_rtld, argv, environ); + dprintf(2, "ERROR: fexecve() failed; errno = %d\n", errno); +} + +int +lch_startfd(int fd_binary, const char *binname, char *const argv[], + u_int flags, struct lc_fdlist *fds, struct lc_sandbox **lcspp) +{ + struct lc_sandbox *lcsp; + int fd_rtld; + int fd_procdesc, fd_sockpair[2]; + int error, val; + pid_t pid; + + fd_rtld = fd_procdesc = fd_sockpair[0] = fd_sockpair[1] = -1; + + lcsp = malloc(sizeof(*lcsp)); + if (lcsp == NULL) + return (-1); + bzero(lcsp, sizeof(*lcsp)); + + if (ld_insandbox()) { + 
struct lc_fdlist *globals; + int pos = 0; + + globals = lc_fdlist_global(); + if (globals == NULL) + goto out_error; + if (lc_fdlist_lookup(globals, RTLD_CAP_FQNAME, "rtld", NULL, + &fd_rtld, &pos) < 0) + goto out_error; + } else { + fd_rtld = open(PATH_LD_ELF_CAP_SO "/" LD_ELF_CAP_SO, + O_RDONLY); + if (fd_rtld < 0) + goto out_error; + } + + if (socketpair(PF_LOCAL, SOCK_STREAM, 0, fd_sockpair) < 0) + goto out_error; + + val = 1; + if (setsockopt(fd_sockpair[0], SOL_SOCKET, SO_NOSIGPIPE, &val, + sizeof(val)) < 0) { + /* out_error closes both socketpair descriptors. */ + goto out_error; + } + + pid = pdfork(&fd_procdesc); + if (pid < 0) { + fd_procdesc = -1; + goto out_error; + } + if (pid == 0) { + lch_sandbox(fd_sockpair[1], fd_binary, fd_rtld, flags, + binname, argv, fds); + exit(-1); + } + if (fd_rtld != -1) + close(fd_rtld); + close(fd_sockpair[1]); + + lcsp->lcs_fd_procdesc = fd_procdesc; + lcsp->lcs_fd_sock = fd_sockpair[0]; + lcsp->lcs_pid = pid; + *lcspp = lcsp; + + return (0); + +out_error: + error = errno; + if (fd_sockpair[0] != -1) + close(fd_sockpair[0]); + if (fd_sockpair[1] != -1) + close(fd_sockpair[1]); + if (fd_rtld != -1) + close(fd_rtld); + if (lcsp != NULL) + free(lcsp); + errno = error; + return (-1); +} + +int +lch_start(const char *sandbox, char *const argv[], u_int flags, + struct lc_fdlist *fds, struct lc_sandbox **lcspp) +{ + char binname[MAXPATHLEN]; + int error, fd_binary, ret; + + if (basename_r(sandbox, binname) == NULL) + return (-1); + + fd_binary = open(sandbox, O_RDONLY); + if (fd_binary < 0) + return (-1); + + ret = lch_startfd(fd_binary, binname, argv, flags, fds, lcspp); + error = errno; + close(fd_binary); + errno = error; + return (ret); +} + +void +lch_stop(struct lc_sandbox *lcsp) +{ + + close(lcsp->lcs_fd_sock); + close(lcsp->lcs_fd_procdesc); + lcsp->lcs_fd_sock = -1; + lcsp->lcs_fd_procdesc = -1; + lcsp->lcs_pid = -1; + free(lcsp); +} + +int +lch_getsock(struct lc_sandbox *lcsp, int *fdp) +{ + + *fdp = lcsp->lcs_fd_sock; + return 
(0); +} + +int +lch_getpid(struct lc_sandbox *lcsp, pid_t *pidp) +{ + + *pidp = lcsp->lcs_pid; + return (0); +} + +int +lch_getprocdesc(struct lc_sandbox *lcsp, int *fdp) +{ + + *fdp = lcsp->lcs_fd_procdesc; + return (0); +} diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum_host_io.c src/lib/libcapsicum/libcapsicum_host_io.c --- src-clean/lib/libcapsicum/libcapsicum_host_io.c 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum_host_io.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,232 @@ +/*- + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $P4: //depot/projects/trustedbsd/capabilities/src/lib/libcapsicum/libcapsicum_host_io.c#3 $ + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libcapsicum.h" +#include "libcapsicum_internal.h" +#include "libcapsicum_sandbox_api.h" + +/* + * Simple I/O wrappers for capability sockets. Possibly more keeping an eye + * on the worker should take place here. + */ +ssize_t +lch_send(struct lc_sandbox *lcsp, const void *msg, size_t len, int flags) +{ + + return (_lc_send(lcsp->lcs_fd_sock, msg, len, flags, 0)); +} + +ssize_t +lch_send_rights(struct lc_sandbox *lcsp, const void *msg, size_t len, + int flags, int *fdp, int fdcount) +{ + + return (_lc_send_rights(lcsp->lcs_fd_sock, msg, len, flags, 0, fdp, + fdcount)); +} + +ssize_t +lch_recv(struct lc_sandbox *lcsp, void *buf, size_t len, int flags) +{ + + return (_lc_recv(lcsp->lcs_fd_sock, buf, len, flags, 0)); +} + +ssize_t +lch_recv_rights(struct lc_sandbox *lcsp, void *buf, size_t len, int flags, + int *fdp, int *fdcountp) +{ + + return (_lc_recv_rights(lcsp->lcs_fd_sock, buf, len, flags, 0, fdp, + fdcountp)); +} + +/* + * Simple libcapsicum RPC facility (lcrpc): send a request, get back a + * reply (up to the size bound of the buffers passed in). The caller is + * responsible for retransmitting if the sandbox fails. 
+ * + * Right now sequence numbers are unimplemented -- that's fine because we + * don't need retransmission, and are synchronous. However, it might not be + * a bad idea to use them anyway. + */ +static int +lch_rpc_internal(struct lc_sandbox *lcsp, u_int32_t opno, struct iovec *req, + int reqcount, int *req_fdp, int req_fdcount, struct iovec *rep, + int repcount, size_t *replenp, int *rep_fdp, int *rep_fdcountp) +{ + struct lcrpc_request_hdr req_hdr; + struct lcrpc_reply_hdr rep_hdr; + size_t left, off, space, totlen, want; + ssize_t len; + int i; + + bzero(&req_hdr, sizeof(req_hdr)); + req_hdr.lcrpc_reqhdr_magic = LCRPC_REQUEST_HDR_MAGIC; + req_hdr.lcrpc_reqhdr_seqno = 0; + req_hdr.lcrpc_reqhdr_opno = opno; + for (i = 0; i < reqcount; i++) + req_hdr.lcrpc_reqhdr_datalen += req[i].iov_len; + for (i = 0; i < repcount; i++) + req_hdr.lcrpc_reqhdr_maxrepdatalen += rep[i].iov_len; + + /* + * Send our header. + */ + if (req_fdp != NULL) + len = _lc_send_rights(lcsp->lcs_fd_sock, &req_hdr, + sizeof(req_hdr), 0, LC_IGNOREEINTR, req_fdp, + req_fdcount); + else + len = _lc_send(lcsp->lcs_fd_sock, &req_hdr, sizeof(req_hdr), + 0, LC_IGNOREEINTR); + if (len < 0) + return (-1); + if (len != sizeof(req_hdr)) { + errno = ECHILD; + return (-1); + } + + /* + * Send the user request. + */ + for (i = 0; i < reqcount; i++) { + len = _lc_send(lcsp->lcs_fd_sock, req[i].iov_base, + req[i].iov_len, 0, LC_IGNOREEINTR); + if (len < 0) + return (-1); + if ((size_t)len != req[i].iov_len) { + errno = ECHILD; + return (-1); + } + } + + /* + * Receive our header and validate. 
+ */ + if (rep_fdp != NULL) + len = _lc_recv_rights(lcsp->lcs_fd_sock, &rep_hdr, + sizeof(rep_hdr), MSG_WAITALL, LC_IGNOREEINTR, rep_fdp, + rep_fdcountp); + else + len = _lc_recv(lcsp->lcs_fd_sock, &rep_hdr, sizeof(rep_hdr), + MSG_WAITALL, LC_IGNOREEINTR); + if (len < 0) + return (-1); + if (len != sizeof(rep_hdr)) { + if (rep_fdp != NULL) + _lc_dispose_rights(rep_fdp, *rep_fdcountp); + errno = ECHILD; + return (-1); + } + + if (rep_hdr.lcrpc_rephdr_magic != LCRPC_REPLY_HDR_MAGIC || + rep_hdr.lcrpc_rephdr_seqno != 0 || + rep_hdr.lcrpc_rephdr_opno != opno || + rep_hdr.lcrpc_rephdr_datalen > req_hdr.lcrpc_reqhdr_maxrepdatalen) { + if (rep_fdp != NULL) + _lc_dispose_rights(rep_fdp, *rep_fdcountp); + errno = EBADRPC; + return (-1); + } + + /* + * Receive the user data, at most min(space, left) bytes per call. We + * can partially overwrite the user buffer but still receive an error. + */ + totlen = 0; + for (i = 0; i < repcount; i++) { + off = 0; + while (totlen < rep_hdr.lcrpc_rephdr_datalen) { + space = rep[i].iov_len - off; + left = rep_hdr.lcrpc_rephdr_datalen - totlen; + want = (space < left) ? 
space : left; + len = _lc_recv(lcsp->lcs_fd_sock, + (u_char *)((uintptr_t)rep[i].iov_base + off), + want, MSG_WAITALL, LC_IGNOREEINTR); + if (len < 0) + return (-1); + if ((size_t)len != want) { + if (rep_fdp != NULL) + _lc_dispose_rights(rep_fdp, + *rep_fdcountp); + errno = ECHILD; + return (-1); + } + off += len; + totlen += len; + if (rep[i].iov_len == off) + break; + } + if (totlen == rep_hdr.lcrpc_rephdr_datalen) + break; + } + *replenp = totlen; + return (0); +} + +int +lch_rpc(struct lc_sandbox *lcsp, u_int32_t opno, struct iovec *req, + int reqcount, struct iovec *rep, int repcount, size_t *replenp) +{ + + return (lch_rpc_internal(lcsp, opno, req, reqcount, NULL, 0, + rep, repcount, replenp, NULL, NULL)); +} + +int +lch_rpc_rights(struct lc_sandbox *lcsp, u_int32_t opno, struct iovec *req, + int reqcount, int *req_fdp, int req_fdcount, struct iovec *rep, + int repcount, size_t *replenp, int *rep_fdp, int *rep_fdcountp) +{ + + return (lch_rpc_internal(lcsp, opno, req, reqcount, req_fdp, + req_fdcount, rep, repcount, replenp, rep_fdp, rep_fdcountp)); +} diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum_internal.h src/lib/libcapsicum/libcapsicum_internal.h --- src-clean/lib/libcapsicum/libcapsicum_internal.h 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum_internal.h 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,71 @@ +/*- + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $P4: //depot/projects/trustedbsd/capabilities/src/lib/libcapsicum/libcapsicum_internal.h#6 $ + */ + +#ifndef _LIBCAPSICUM_INTERNAL_H_ +#define _LIBCAPSICUM_INTERNAL_H_ + +#define LIBCAPSICUM_FQNAME "org.freebsd.libcapsicum" +#define RTLD_CAP_FQNAME "org.freebsd.rtld-elf-cap" + +struct lc_host { + int lch_fd_sock; +}; + +struct lc_sandbox { + int lcs_fd_sock; + int lcs_fd_procdesc; + pid_t lcs_pid; +}; + +void* _lc_fdlist_getstorage(struct lc_fdlist*); + +/* + * Communications flags for recv/send calls (lc_flags). 
+ */ +#define LC_IGNOREEINTR 0x00000001 + +struct msghdr; +void _lc_dispose_rights(int *fdp, int fdcount); +int _lc_receive_rights(struct msghdr *msg, int *fdp, int *fdcountp); + +ssize_t _lc_recv(int fd, void *buf, size_t len, int flags, int lc_flags); +ssize_t _lc_recv_rights(int fd, void *buf, size_t len, int flags, + int lc_flags, int *fdp, int *fdcountp); +ssize_t _lc_send(int fd, const void *msg, size_t len, int flags, + int lc_flags); +ssize_t _lc_send_rights(int fd, const void *msg, size_t len, int flags, + int lc_flags, int *fdp, int fdcount); + +#endif /* !_LIBCAPSICUM_INTERNAL_H_ */ diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum_sandbox.3 src/lib/libcapsicum/libcapsicum_sandbox.3 --- src-clean/lib/libcapsicum/libcapsicum_sandbox.3 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum_sandbox.3 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,174 @@ +.\" +.\" Copyright (c) 2009 Robert N. M. Watson +.\" All rights reserved. +.\" +.\" WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED +.\" ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND +.\" UNEXPECTED WAYS. +.\" +.\" This software was developed at the University of Cambridge Computer +.\" Laboratory with support from a grant from Google, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd June 11, 2009 +.Os +.Dt LIBCAPSICUM_SANDBOX 3 +.Sh NAME +.Nm libcapsicum +.Nd "library interface to capability-mode services" +.Sh LIBRARY +.Lb libcapsicum +.Sh SYNOPSIS +.In sys/types.h +.In sys/capability.h +.In libcapsicum.h +.Ft int +.Fn lcs_get "struct lc_host **lchpp" +.Ft int +.Fn lcs_getsock "struct lc_host *lchp" "int *fdp" +.Ft ssize_t +.Fn lcs_recv "struct lc_host *lchp" "void *buf" "size_t len" "int flags" +.Ft ssize_t +.Fn lcs_recv_rights "struct lc_host *lchp" "void *buf" "size_t len" "int flags" "int *fdp" "int *fdcountp" +.Ft int +.Fn lcs_recvrpc "struct lc_host *lchp" "u_int32_t *opnop" "u_int32_t *seqnop" "u_char **bufferp" "size_t *lenp" +.Ft int +.Fn lcs_recvrpc_rights "struct lc_host *lchp" "u_int32_t *opnop" "u_int32_t *seqnop" "u_char **bufferp" "size_t *lenp" "int *fdp" "int *fdcountp" +.Ft ssize_t +.Fn lcs_send "struct lc_host *lchp" "const void *msg" "size_t len" "int flags" +.Ft ssize_t +.Fn lcs_send_rights "struct lc_host *lchp" "const void *msg" "size_t len" "int flags" "int *fdp" "int fdcount" +.Ft int +.Fn lcs_sendrpc "struct lc_host *lchp" "u_int32_t opno" "u_int32_t seqno" "struct iovec *rep" "int repcount" +.Ft int 
+.Fn lcs_sendrpc_rights "struct lc_host *lchp" "u_int32_t opno" "u_int32_t seqno" "struct iovec *rep" "int repcount" "int *fdp" "int fdcount" +.Sh DESCRIPTION +The +.Nm +library routines provide services for processes hosting or running in +capability mode. +Depending on the requirements of the host and sandbox, the API can simply be +used to set up and stop sandboxes, used to manage I/O using a +.Xr unix 4 +domain socket connection to the sandbox, or can provide a basic remote +procedure call (RPC) facility. +Applications may also use RPC generators such as +.Xr rpcgen 1 +to build event handling and marshaling code. +.Pp +This man page describes the sandbox API. +General information on +.Nm +may be found in +.Xr libcapsicum 3 . +.Sh SANDBOX API +The +.Nm +sandbox API allows sandbox processes to interact with their host process. +Sandbox API functions can be identified by their function name prefix, +.Dv lcs_ . +.Pp +Each executing sandbox will have a single corresponding host instance, +described by an opaque +.Vt "struct lc_host" , +which is returned by +.Fn lcs_get . +.Pp +The socket for the host may be queried using +.Fn lcs_getsock . +.Pp +.Nm +implements a number of I/O functions as part of the sandbox API, which are +documented in +.Xr libcapsicum_sandbox 3 . +.Fn lcs_recv +and +.Fn lcs_send +provide simple wrappers around +.Xr recv 2 +and +.Xr send 2 +to avoid sandboxes having to query host socket file descriptors before use. +.Pp +.Fn lcs_recv_rights +and +.Fn lcs_send_rights +similarly allow receiving and sending file descriptors with messages. +.Pp +.Fn lcs_recvrpc +and +.Fn lcs_sendrpc +may be used to implement a simple RPC system, in coordination with a host +using +.Fn lch_rpc . +.Fn lcs_recvrpc +blocks awaiting the receipt of an RPC request, which will be returned in a +buffer allocated using +.Xr malloc 3 , +.Va bufferp , +and with a data size returned via +.Va lenp . 
+The caller will also receive an operation number and a sequence number via +.Va opnop +and +.Va seqnop . +.Pp +When an RPC is complete, it should be returned to the host via +.Fn lcs_sendrpc , +which accepts the same operation and sequence number as arguments, as well as +reply data via the +.Vt iovec +.Va rep +and +.Va repcount . +When the sandbox is done with the request data, it should free the memory +using +.Xr free 3 . +.Nm lcs_recvrpc_rights +and +.Nm lcs_sendrpc_rights +allow the receiving and sending of file descriptors along with the RPC. +.Sh SEE ALSO +.Xr rpcgen 1 , +.Xr recv 2 , +.Xr send 2 , +.Xr writev 2 , +.Xr free 3 , +.Xr libcapsicum 3 , +.Xr libcapsicum_host 3 , +.Xr malloc 3 , +.Xr unix 4 +.Sh HISTORY +Support for capabilities and capabilities mode was developed as part of the +.Tn TrustedBSD +Project. +.Sh BUGS +WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED ON IN +PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND UNEXPECTED WAYS. +.Sh AUTHORS +These functions and the capability facility were created by +.An "Robert N. M. Watson" +at the University of Cambridge Computer Laboratory with support from a grant +from Google, Inc. diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum_sandbox.c src/lib/libcapsicum/libcapsicum_sandbox.c --- src-clean/lib/libcapsicum/libcapsicum_sandbox.c 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum_sandbox.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,89 @@ +/*- + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include /* TODO: temporary */ +#include +#include + +#include "libcapsicum.h" +#include "libcapsicum_internal.h" +#include "libcapsicum_sandbox_api.h" + +static int lch_initialized; +static struct lc_host lch_global; + +int +lcs_get(struct lc_host **lchpp) +{ + int fd_sock; + + if (lch_initialized) { + *lchpp = &lch_global; + return (0); + } + + if (!ld_insandbox()) { + errno = EINVAL; + return (-1); + } + + struct lc_fdlist *fds = lc_fdlist_global(); + if (lc_fdlist_lookup(fds, LIBCAPSICUM_FQNAME, "socket", NULL, + &fd_sock, NULL) < 0) + return (-1); + if (fd_sock == -1) + return (-1); + + lch_global.lch_fd_sock = fd_sock; + lch_initialized = 1; + *lchpp = &lch_global; + return (0); +} + +int +lcs_getsock(struct lc_host *lchp, int *fdp) +{ + + *fdp = lchp->lch_fd_sock; + return (0); +} diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum_sandbox_api.h src/lib/libcapsicum/libcapsicum_sandbox_api.h --- src-clean/lib/libcapsicum/libcapsicum_sandbox_api.h 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum_sandbox_api.h 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $P4: //depot/projects/trustedbsd/capabilities/src/lib/libcapsicum/libcapsicum_sandbox_api.h#4 $ + */ + +#ifndef _LIBCAPSICUM_SANDBOX_API_H_ +#define _LIBCAPSICUM_SANDBOX_API_H_ + +/* + * This include file captures the assumptions libcapsicum sandboxs will + * make about the runtime environment set up by libcapsicum hosts. + */ +#define LIBCAPSICUM_SANDBOX_API_ENV "LIBCAPSICUM_SANDBOX" +#define LIBCAPSICUM_SANDBOX_FDLIST "LIBCAPSICUM_FDLIST" +#define LIBCAPSICUM_SANDBOX_API_SOCK "sock" + +/* + * Maximum number of file descriptor rights we will ever send as part of an + * RPC. + */ +#define LIBCAPSICUM_SANDBOX_API_MAXRIGHTS 16 + +/* + * Simple libcapsicum RPC facility (lcrpc) definitions. 
+ */ +#define LCRPC_REQUEST_HDR_MAGIC 0x29ee2d7eb9143d98 +struct lcrpc_request_hdr { + u_int64_t lcrpc_reqhdr_magic; + u_int32_t lcrpc_reqhdr_seqno; + u_int32_t lcrpc_reqhdr_opno; + u_int64_t lcrpc_reqhdr_datalen; + u_int64_t lcrpc_reqhdr_maxrepdatalen; + u_int64_t _lcrpc_reqhdr_spare3; + u_int64_t _lcrpc_reqhdr_spare2; + u_int64_t _lcrpc_reqhdr_spare1; + u_int64_t _lcrpc_reqhdr_spare0; +} __packed; + +#define LCRPC_REPLY_HDR_MAGIC 0x37cc2e29f5cce29b +struct lcrpc_reply_hdr { + u_int64_t lcrpc_rephdr_magic; + u_int32_t lcrpc_rephdr_seqno; + u_int32_t lcrpc_rephdr_opno; + u_int64_t lcrpc_rephdr_datalen; + u_int64_t _lcrpc_rephdr_spare4; + u_int64_t _lcrpc_rephdr_spare3; + u_int64_t _lcrpc_rephdr_spare2; + u_int64_t _lcrpc_rephdr_spare1; + u_int64_t _lcrpc_rephdr_spare0; +} __packed; + +#endif /* !_LIBCAPSICUM_H_ */ diff -aurN -x '*.orig' src-clean/lib/libcapsicum/libcapsicum_sandbox_io.c src/lib/libcapsicum/libcapsicum_sandbox_io.c --- src-clean/lib/libcapsicum/libcapsicum_sandbox_io.c 1970-01-01 01:00:00.000000000 +0100 +++ src/lib/libcapsicum/libcapsicum_sandbox_io.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,259 @@ +/*- + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "libcapsicum.h" +#include "libcapsicum_internal.h" +#include "libcapsicum_sandbox_api.h" + +ssize_t +lcs_recv(struct lc_host *lchp, void *buf, size_t len, int flags) +{ + + return (_lc_recv(lchp->lch_fd_sock, buf, len, flags, 0)); +} + +ssize_t +lcs_recv_rights(struct lc_host *lchp, void *buf, size_t len, int flags, + int *fdp, int *fdcountp) +{ + + return (_lc_recv_rights(lchp->lch_fd_sock, buf, len, flags, 0, fdp, + fdcountp)); +} + +ssize_t +lcs_send(struct lc_host *lchp, const void *msg, size_t len, int flags) +{ + + return (_lc_send(lchp->lch_fd_sock, msg, len, flags, 0)); +} + +ssize_t +lcs_send_rights(struct lc_host *lchp, const void *msg, size_t len, + int flags, int *fdp, int fdcount) +{ + + return (_lc_send_rights(lchp->lch_fd_sock, msg, len, flags, 0, fdp, + fdcount)); +} + +/* + * libcapsicum RPC facility (lcrpc) sandbox routines. 
Since arguments are + * variable size, space is allocated by the RPC code rather than the caller, + * who is expected to free it with free(3) if desired. + */ +static int +lcs_recvrpc_internal(struct lc_host *lchp, u_int32_t *opnop, + u_int32_t *seqnop, u_char **bufferp, size_t *lenp, int *fdp, + int *fdcountp) +{ + struct lcrpc_request_hdr req_hdr; + size_t totlen; + ssize_t len; + u_char *buffer; + int error; + + if (fdp != NULL) + len = _lc_recv_rights(lchp->lch_fd_sock, &req_hdr, + sizeof(req_hdr), MSG_WAITALL, LC_IGNOREEINTR, fdp, + fdcountp); + else + len = _lc_recv(lchp->lch_fd_sock, &req_hdr, sizeof(req_hdr), + MSG_WAITALL, LC_IGNOREEINTR); + if (len < 0) + return (-1); + if (len == 0) { + if (fdp != NULL) + _lc_dispose_rights(fdp, *fdcountp); + errno = EPIPE; + return (-1); + } + if (len != sizeof(req_hdr)) { + if (fdp != NULL) + _lc_dispose_rights(fdp, *fdcountp); + errno = EBADMSG; + return (-1); + } + + if (req_hdr.lcrpc_reqhdr_magic != LCRPC_REQUEST_HDR_MAGIC) { + if (fdp != NULL) + _lc_dispose_rights(fdp, *fdcountp); + errno = EBADMSG; + return (-1); + } + + /* + * XXXRW: Should we check that the receive data fits in the address + * space of the sandbox? + * + * XXXRW: If malloc() fails, we should drain the right amount of data + * from the socket so that the next RPC will succeed. Possibly we + * should also reply with an error from this layer to the sender? + * What about if there are other socket errors, such as EINTR? + */ + buffer = malloc(req_hdr.lcrpc_reqhdr_datalen); + if (buffer == NULL) { + error = errno; + if (fdp != NULL) + _lc_dispose_rights(fdp, *fdcountp); + errno = error; + return (-1); + } + + /* + * XXXRW: Likewise, how to handle failure at this stage? 
+ */ + totlen = 0; + while (totlen < req_hdr.lcrpc_reqhdr_datalen) { + len = _lc_recv(lchp->lch_fd_sock, buffer + totlen, + req_hdr.lcrpc_reqhdr_datalen - totlen, MSG_WAITALL, + LC_IGNOREEINTR); + if (len <= 0) { + /* + * Error, or EOF (reported as EPIPE). Save the + * errno to report before cleanup, since the cleanup + * calls may clobber it, and restore it just before + * returning. + */ + error = (len < 0) ? errno : EPIPE; + if (fdp != NULL) + _lc_dispose_rights(fdp, *fdcountp); + free(buffer); + errno = error; + return (-1); + } + totlen += len; + } + *bufferp = buffer; + *lenp = totlen; + *opnop = req_hdr.lcrpc_reqhdr_opno; + *seqnop = req_hdr.lcrpc_reqhdr_seqno; + return (0); +} + +int +lcs_recvrpc(struct lc_host *lchp, u_int32_t *opnop, u_int32_t *seqnop, + u_char **bufferp, size_t *lenp) +{ + + return (lcs_recvrpc_internal(lchp, opnop, seqnop, bufferp, lenp, + NULL, NULL)); +} + +int +lcs_recvrpc_rights(struct lc_host *lchp, u_int32_t *opnop, u_int32_t *seqnop, + u_char **bufferp, size_t *lenp, int *fdp, int *fdcountp) +{ + + return (lcs_recvrpc_internal(lchp, opnop, seqnop, bufferp, lenp, + fdp, fdcountp)); +} + +static int +lcs_sendrpc_internal(struct lc_host *lchp, u_int32_t opno, u_int32_t seqno, + struct iovec *rep, int repcount, int *fdp, int fdcount) +{ + struct lcrpc_reply_hdr rep_hdr; + ssize_t len; + int i; + + bzero(&rep_hdr, sizeof(rep_hdr)); + rep_hdr.lcrpc_rephdr_magic = LCRPC_REPLY_HDR_MAGIC; + rep_hdr.lcrpc_rephdr_seqno = seqno; + rep_hdr.lcrpc_rephdr_opno = opno; + rep_hdr.lcrpc_rephdr_datalen = 0; + for (i = 0; i < repcount; i++) + rep_hdr.lcrpc_rephdr_datalen += rep[i].iov_len; + + /* + * Send our header. + */ + if (fdp != NULL) + len = _lc_send_rights(lchp->lch_fd_sock, &rep_hdr, + sizeof(rep_hdr), 0, LC_IGNOREEINTR, fdp, fdcount); + else + len = _lc_send(lchp->lch_fd_sock, &rep_hdr, sizeof(rep_hdr), + 0, LC_IGNOREEINTR); + if (len < 0) + return (-1); + if (len != sizeof(rep_hdr)) { + errno = EPIPE; + return (-1); + } + + /* + * Send user data. 
+ */ + for (i = 0; i < repcount; i++) { + len = _lc_send(lchp->lch_fd_sock, rep[i].iov_base, + rep[i].iov_len, 0, LC_IGNOREEINTR); + if (len < 0) + return (-1); + if ((size_t)len != rep[i].iov_len) { + errno = EPIPE; + return (-1); + } + } + return (0); +} + +int +lcs_sendrpc(struct lc_host *lchp, u_int32_t opno, u_int32_t seqno, + struct iovec *rep, int repcount) +{ + + return (lcs_sendrpc_internal(lchp, opno, seqno, rep, repcount, NULL, + 0)); +} + +int +lcs_sendrpc_rights(struct lc_host *lchp, u_int32_t opno, u_int32_t seqno, + struct iovec *rep, int repcount, int *fdp, int fdcount) +{ + + return (lcs_sendrpc_internal(lchp, opno, seqno, rep, repcount, fdp, + fdcount)); +} diff -aurN -x '*.orig' src-clean/lib/libkvm/kvm_proc.c src/lib/libkvm/kvm_proc.c --- src-clean/lib/libkvm/kvm_proc.c 2010-08-25 10:10:33.000000000 +0200 +++ src/lib/libkvm/kvm_proc.c 2010-08-25 10:24:35.000000000 +0200 @@ -154,6 +154,7 @@ kvm_read(kd, (u_long)ucred.cr_groups, kp->ki_groups, kp->ki_ngroups * sizeof(gid_t)); kp->ki_uid = ucred.cr_uid; + kp->ki_cr_flags = ucred.cr_flags; if (ucred.cr_prison != NULL) { if (KREAD(kd, (u_long)ucred.cr_prison, &pr)) { _kvm_err(kd, kd->program, diff -aurN -x '*.orig' src-clean/libexec/Makefile src/libexec/Makefile --- src-clean/libexec/Makefile 2010-08-25 10:10:36.000000000 +0200 +++ src/libexec/Makefile 2010-08-25 10:24:35.000000000 +0200 @@ -22,6 +22,7 @@ rpc.sprayd \ ${_rshd} \ ${_rtld-elf} \ + ${_rtld-elf-cap} \ save-entropy \ ${_smrsh} \ talkd \ @@ -54,6 +55,7 @@ .if !defined(NO_PIC) && !defined(NO_RTLD) _rtld-elf= rtld-elf +_rtld-elf-cap= rtld-elf-cap .endif .if ${MK_RCMDS} != "no" diff -aurN -x '*.orig' src-clean/libexec/rtld-elf/Makefile src/libexec/rtld-elf/Makefile --- src-clean/libexec/rtld-elf/Makefile 2010-08-25 10:10:36.000000000 +0200 +++ src/libexec/rtld-elf/Makefile 2010-08-25 10:24:35.000000000 +0200 @@ -11,7 +11,7 @@ CSTD?= gnu99 CFLAGS+= -Wall -DFREEBSD_ELF -DIN_RTLD CFLAGS+= -I${.CURDIR}/${MACHINE_ARCH} -I${.CURDIR} -LDFLAGS+= 
-nostdlib -e .rtld_start +LDFLAGS+= -nostdlib -Wl,-e,.rtld_start INSTALLFLAGS= -C -b PRECIOUSPROG= BINDIR= /libexec diff -aurN -x '*.orig' src-clean/libexec/rtld-elf/Symbol.map src/libexec/rtld-elf/Symbol.map --- src-clean/libexec/rtld-elf/Symbol.map 2010-08-25 10:10:36.000000000 +0200 +++ src/libexec/rtld-elf/Symbol.map 2010-08-25 10:24:35.000000000 +0200 @@ -18,6 +18,10 @@ __tls_get_addr; }; +FBSD_1.2 { + ld_libdirs; +}; + FBSDprivate_1.0 { _rtld_thread_init; _rtld_allocate_tls; diff -aurN -x '*.orig' src-clean/libexec/rtld-elf/map_object.c src/libexec/rtld-elf/map_object.c --- src-clean/libexec/rtld-elf/map_object.c 2010-08-25 10:10:36.000000000 +0200 +++ src/libexec/rtld-elf/map_object.c 2010-08-25 10:24:35.000000000 +0200 @@ -44,7 +44,8 @@ /* * Map a shared object into memory. The "fd" argument is a file descriptor, - * which must be open on the object and positioned at its beginning. + * which must be open on the object. + * * The "path" argument is a pathname that is used only for error messages. * * The return value is a pointer to a newly-allocated Obj_Entry structure @@ -83,6 +84,7 @@ Elf_Addr bss_vaddr; Elf_Addr bss_vlimit; caddr_t bss_addr; + int mmap_flags; hdr = get_elf_header(fd, path); if (hdr == NULL) @@ -152,8 +154,10 @@ mapsize = base_vlimit - base_vaddr; base_addr = hdr->e_type == ET_EXEC ? 
(caddr_t) base_vaddr : NULL; - mapbase = mmap(base_addr, mapsize, PROT_NONE, MAP_ANON | MAP_PRIVATE | - MAP_NOCORE, -1, 0); + mmap_flags = MAP_ANON | MAP_PRIVATE | MAP_NOCORE; + if (base_addr != NULL) + mmap_flags |= MAP_FIXED; + mapbase = mmap(base_addr, mapsize, PROT_NONE, mmap_flags, -1, 0); if (mapbase == (caddr_t) -1) { _rtld_error("%s: mmap of entire address space failed: %s", path, strerror(errno)); diff -aurN -x '*.orig' src-clean/libexec/rtld-elf/rtld.c src/libexec/rtld-elf/rtld.c --- src-clean/libexec/rtld-elf/rtld.c 2010-08-25 10:10:36.000000000 +0200 +++ src/libexec/rtld-elf/rtld.c 2010-08-25 10:24:35.000000000 +0200 @@ -59,11 +59,25 @@ #include "libmap.h" #include "rtld_tls.h" +#ifdef IN_RTLD_CAP +#include "rtld_sandbox.h" +#endif + #ifndef COMPAT_32BIT +#ifdef IN_RTLD_CAP +#define PATH_RTLD "/libexec/ld-elf-cap.so.1" +#else #define PATH_RTLD "/libexec/ld-elf.so.1" +#endif +#else +#ifdef IN_RTLD_CAP +#define PATH_RTLD "/libexec/ld-elf32-cap.so.1" #else #define PATH_RTLD "/libexec/ld-elf32.so.1" #endif +#endif + +#define INITIAL_FDLEN 16 /* Types. */ typedef void (*func_ptr_type)(); @@ -79,6 +93,13 @@ unsigned int num_used; /* Number of array slots used */ } DoneList; +typedef struct Struct_FDArray { + rtld_lock_t lock; /* Mutual exclusion */ + int *content; /* The file descriptors */ + int count; /* Number of descriptors in array */ + int capacity; /* Space available for descriptors */ +} FDArray; + /* * Function declarations. 
*/ @@ -93,13 +114,18 @@ static void errmsg_restore(char *); static char *errmsg_save(void); static void *fill_search_info(const char *, size_t, void *); +#ifdef IN_RTLD_CAP +static int find_library_fd(const char *name); +#else static char *find_library(const char *, const Obj_Entry *); static const char *gethints(void); +#endif static void init_dag(Obj_Entry *); static void init_dag1(Obj_Entry *, Obj_Entry *, DoneList *); static void init_rtld(caddr_t); static void initlist_add_neededs(Needed_Entry *, Objlist *); static void initlist_add_objects(Obj_Entry *, Obj_Entry **, Objlist *); +static void init_libdirs(void); static bool is_exported(const Elf_Sym *); static void linkmap_add(Obj_Entry *); static void linkmap_delete(Obj_Entry *); @@ -120,7 +146,9 @@ static int rtld_dirname(const char *, char *); static int rtld_dirname_abs(const char *, char *); static void rtld_exit(void); +#ifndef IN_RTLD_CAP static char *search_library_path(const char *, const char *); +#endif static const void **get_program_var_addr(const char *); static void set_program_var(const char *, const void *); static const Elf_Sym *symlook_default(const char *, unsigned long, @@ -144,6 +172,8 @@ static int object_match_name(const Obj_Entry *, const char *); static void ld_utrace_log(int, void *, void *, size_t, int, const char *); +int ld_libdirs(int *, int *); + void r_debug_state(struct r_debug *, struct link_map *); /* @@ -158,6 +188,9 @@ used to affect the libraries loaded */ static char *ld_bind_now; /* Environment variable for immediate binding */ static char *ld_debug; /* Environment variable for debugging */ +static FDArray library_dirs; /* File descriptors of library path */ +static FDArray preload_fds; /* File descriptors of preloaded libraries */ +static bool locks_initialized; /* FDArray locks have been initialized */ static char *ld_library_path; /* Environment variable for search path */ static char *ld_preload; /* Environment variable for libraries to load first */ @@ -214,6 
+247,10 @@ (func_ptr_type) &dl_iterate_phdr, (func_ptr_type) &_rtld_atfork_pre, (func_ptr_type) &_rtld_atfork_post, +#ifdef IN_RTLD_CAP + (func_ptr_type) &ld_insandbox, +#endif + (func_ptr_type) &ld_libdirs, NULL }; @@ -310,6 +347,10 @@ func_ptr_type _rtld(Elf_Addr *sp, func_ptr_type *exit_proc, Obj_Entry **objp) { +#ifdef IN_RTLD_CAP + struct stat sb; + Elf_Auxinfo aux_execfd; +#endif Elf_Auxinfo *aux_info[AT_COUNT]; int i; int argc; @@ -352,10 +393,46 @@ assert(aux_info[AT_BASE] != NULL); init_rtld((caddr_t) aux_info[AT_BASE]->a_un.a_ptr); - __progname = obj_rtld.path; + /* XXXRW: Need to do something about program names in capability mode. */ + __progname = obj_rtld.path; /* TODO: binary name */ argv0 = argv[0] != NULL ? argv[0] : "(null)"; environ = env; +#ifdef IN_RTLD_CAP + /* + * In capability mode, the kernel has executed ld-elf-cap.so directly, + * and the parent has passed the executable it wants us to run as a file + * descriptor. The kernel doesn't know this, so rewrite our auxiliary + * arguments so the remainder of rtld thinks the kernel passed the file + * descriptor using AT_EXECFD. 
+ */ + if (aux_info[AT_EXECFD] == NULL) { + const char *ld_binary; + char *endp; + long ld_binary_fd; + + ld_binary = getenv(LD_ "BINARY"); + if (ld_binary == NULL) { + _rtld_error("LD_BINARY unset; aborting"); + die(); + } + ld_binary_fd = strtol(ld_binary, &endp, 10); + if (ld_binary_fd < 0 || ld_binary_fd > __INT_MAX || *endp != 0) { + _rtld_error("LD_BINARY invalid"); + die(); + } + if (fstat(ld_binary_fd, &sb) < 0) { + __progname = "ld-elf-cap.so"; + _rtld_error("executable file descriptor unusable"); + die(); + } + bzero(&aux_execfd, sizeof(aux_execfd)); + aux_execfd.a_type = AT_EXECFD; + aux_execfd.a_un.a_val = ld_binary_fd; + aux_info[AT_EXECFD] = &aux_execfd; + } +#endif + trust = !issetugid(); ld_bind_now = getenv(LD_ "BIND_NOW"); @@ -372,6 +449,9 @@ _rtld_error("environment corrupt; aborting"); die(); } +#ifdef IN_RTLD_CAP + unsetenv(LD_ "LIBCACHE"); +#endif } ld_debug = getenv(LD_ "DEBUG"); libmap_disable = getenv(LD_ "LIBMAP_DISABLE") != NULL; @@ -379,7 +459,8 @@ ld_library_path = getenv(LD_ "LIBRARY_PATH"); ld_preload = getenv(LD_ "PRELOAD"); ld_elf_hints_path = getenv(LD_ "ELF_HINTS_PATH"); - dangerous_ld_env = libmap_disable || (libmap_override != NULL) || + dangerous_ld_env = + libmap_disable || (libmap_override != NULL) || (ld_library_path != NULL) || (ld_preload != NULL) || (ld_elf_hints_path != NULL); ld_tracing = getenv(LD_ "TRACE_LOADED_OBJECTS"); @@ -403,7 +484,9 @@ int fd = aux_info[AT_EXECFD]->a_un.a_val; dbg("loading main program"); obj_main = map_object(fd, argv0, NULL); +#ifndef IN_RTLD_CAP close(fd); +#endif if (obj_main == NULL) die(); } else { /* Main program already loaded. 
*/ @@ -476,8 +559,15 @@ sym_zero.st_shndx = SHN_UNDEF; sym_zero.st_value = -(uintptr_t)obj_main->relocbase; +#ifndef IN_RTLD_CAP if (!libmap_disable) libmap_disable = (bool)lm_init(libmap_override); +#endif + + /* Initialize FD arrays */ + library_dirs.content = NULL; + preload_fds.content = NULL; + dbg("loading LD_PRELOAD libraries"); if (load_preload_objects() == -1) @@ -538,6 +628,9 @@ dbg("initializing thread locks"); lockdflt_init(); + locks_initialized = true; + library_dirs.lock = rtld_dirs_lock; + preload_fds.lock = rtld_preloads_lock; /* Make a list of init functions to call. */ objlist_init(&initlist); @@ -555,6 +648,7 @@ /* Return the exit procedure and the program entry point. */ *exit_proc = rtld_exit; *objp = obj_main; + return (func_ptr_type) obj_main->entry; } @@ -722,6 +816,7 @@ return (res4); } + static void die(void) { @@ -1104,6 +1199,112 @@ return h; } + + +bool fdarray_init(struct Struct_FDArray *fds) { + if (fds == NULL) return false; + + int lockstate = fdarray_lock(fds); + + fds->count = 0; + fds->capacity = 8; + fds->content = xmalloc(fds->capacity * sizeof(int)); + + fdarray_unlock(fds, lockstate); + + return true; +} + +int fdarray_lock(struct Struct_FDArray *fds) { + if (!locks_initialized) return 0; + else return wlock_acquire(fds->lock); +} + +int fdarray_rlock(struct Struct_FDArray *fds) { + if (!locks_initialized) return 0; + else return rlock_acquire(fds->lock); +} + +void fdarray_unlock(struct Struct_FDArray *fds, int state) { + if (!locks_initialized) return; + else wlock_release(fds->lock, state); +} + +void fdarray_runlock(struct Struct_FDArray *fds, int state) { + if (!locks_initialized) return; + else rlock_release(fds->lock, state); +} +bool fdarray_append(struct Struct_FDArray *fds, int fd) +{ + + int lockstate = fdarray_lock(fds); + + /* Do we need to grow? 
*/ + if (fds->count == fds->capacity) { + fds->capacity *= 2; + fds->content = realloc(fds->content, fds->capacity * sizeof(int)); + if (fds->content == NULL) { + _rtld_error("add_libdir_fd: realloc failed"); + fdarray_unlock(fds, lockstate); + return false; + } + } + + /* Add the new fd to the end. */ + fds->content[fds->count++] = fd; + + fdarray_unlock(fds, lockstate); + return true; +} + +int* fdarray_get(struct Struct_FDArray *fds) +{ + int bytes, *copy; + + int lockstate = fdarray_rlock(fds); + + bytes = fds->count * sizeof(int); + + copy = xmalloc(bytes); + if (copy != NULL) + bcopy(fds->content, copy, bytes); + + fdarray_runlock(fds, lockstate); + + return copy; +} + + + +#ifdef IN_RTLD_CAP +/* + * Find the library with the given name, and return an open file descriptor + * to it. + */ +static int +find_library_fd(const char *name) +{ + int fd, i; + + int lockstate = fdarray_lock(&library_dirs); + if (library_dirs.content == NULL) init_libdirs(); + fdarray_unlock(&library_dirs, lockstate); + + + lockstate = fdarray_rlock(&library_dirs); + for (i = 0; i < library_dirs.count; i++) { + fd = openat(library_dirs.content[i], name, O_RDONLY); + if (fd >= 0) { + fdarray_runlock(&library_dirs, lockstate); + return (fd); + } + } + + fdarray_runlock(&library_dirs, lockstate); + return (-1); +} + +#else /* * Find the library with the given name, and return its full pathname. * The returned string is dynamically allocated. Generates an error @@ -1157,6 +1358,7 @@ } return NULL; } +#endif /* * Given a symbol number in a referencing object, find the corresponding @@ -1234,6 +1436,7 @@ return def; } +#ifndef IN_RTLD_CAP /* * Return the search path from the ldconfig hints file, reading it if * necessary. Returns NULL if there are problems with the hints file, @@ -1272,6 +1475,7 @@ } return hints[0] != '\0' ? 
hints : NULL; } +#endif static void init_dag(Obj_Entry *root) @@ -1325,10 +1529,16 @@ objtmp.dynamic = rtld_dynamic(&objtmp); digest_dynamic(&objtmp, 1); assert(objtmp.needed == NULL); +#if 0 + /* + * XXXRW: For reasons as yet undetermined, this assertion fires in + * capability mode. + */ #if !defined(__mips__) /* MIPS and SH{3,5} have a bogus DT_TEXTREL. */ assert(!objtmp.textrel); #endif +#endif /* * Temporarily put the dynamic linker entry into the object list, so @@ -1495,6 +1705,30 @@ if (object_match_name(obj, name)) return obj; +#ifdef IN_RTLD_CAP + path = xstrdup(name); + + /* is the name actually a file descriptor? */ + long long long_fd = strtonum(path, 0, __INT_MAX, NULL); + if ((long_fd >= 0) && (fstat((int) long_fd, &sb) == 0)) + fd = (int) long_fd; + + /* if not, search the library path */ + else { + dbg("preload by name: %s", name); + if (strchr(name, '/') != NULL) { + _rtld_error("Absolute paths (e.g. \"%s\") not supported", path); + free(path); + return NULL; + } + + if ((fd = find_library_fd(path)) < 0) { + _rtld_error("Unable to find \"%s\" in LD_LIBRARY_DIRS", path); + free(path); + return NULL; + } + } +#else path = find_library(name, refobj); if (path == NULL) return NULL; @@ -1512,22 +1746,29 @@ free(path); return NULL; } +#endif if (fstat(fd, &sb) == -1) { _rtld_error("Cannot fstat \"%s\"", path); +#ifndef IN_RTLD_CAP close(fd); +#endif free(path); return NULL; } for (obj = obj_list->next; obj != NULL; obj = obj->next) { if (obj->ino == sb.st_ino && obj->dev == sb.st_dev) { +#ifndef IN_RTLD_CAP close(fd); +#endif break; } } if (obj != NULL) { object_add_name(obj, name); free(path); +#ifndef IN_RTLD_CAP close(fd); +#endif return obj; } if (flags & RTLD_LO_NOLOAD) @@ -1537,7 +1778,9 @@ obj = do_load_object(fd, name, path, &sb, flags); if (obj == NULL) free(path); +#ifndef IN_RTLD_CAP close(fd); +#endif return obj; } @@ -1556,7 +1799,7 @@ if (dangerous_ld_env) { if (fstatfs(fd, &fs) != 0) { _rtld_error("Cannot fstatfs \"%s\"", path); - return 
NULL; + return NULL; } if (fs.f_flags & MNT_NOEXEC) { _rtld_error("Cannot execute objects on %s\n", fs.f_mntonname); @@ -1569,6 +1812,7 @@ return NULL; object_add_name(obj, name); + obj->fd = fd; obj->path = path; digest_dynamic(obj, 0); if (obj->z_noopen && (flags & (RTLD_LO_DLOPEN | RTLD_LO_TRACE)) == @@ -1831,8 +2075,10 @@ dbg("rtld_exit()"); objlist_call_fini(&list_fini, true, &lockstate); /* No need to remove the items from the list, since we are exiting. */ +#ifndef IN_RTLD_CAP if (!libmap_disable) lm_fini(); +#endif wlock_release(rtld_bind_lock, lockstate); } @@ -1903,6 +2149,7 @@ return (NULL); } +#ifndef IN_RTLD_CAP static char * search_library_path(const char *name, const char *path) { @@ -1923,6 +2170,110 @@ return (p); } +#endif + +/* + * Add file descriptors for a path list (e.g. '/lib:/usr/lib') to + * ld_library_dirs. + */ +static bool +add_libdir_paths(const char *path) +{ + + if (path == NULL) + return false; + + char *pathcopy, *dirname, *tokcontext; + int pathlen = strnlen(path, PATH_MAX); + + pathcopy = malloc(pathlen + 1); + strncpy(pathcopy, path, pathlen + 1); + + bool success = true; + for (dirname = strtok_r(pathcopy, ":", &tokcontext); dirname; + dirname = strtok_r(NULL, ":", &tokcontext)) { + struct try_library_args arg; + int fd; + + arg.name = ""; + arg.namelen = 0; + arg.buffer = xmalloc(PATH_MAX); + arg.buflen = PATH_MAX; + + if (try_library_path(dirname, strnlen(dirname, PATH_MAX), &arg)) { + fd = open(dirname, O_RDONLY); + } else { + /* 'dirname' is not a directory path; perhaps it's a descriptor? */ + fd = (int) strtol(dirname, NULL, 0); + if ((fd == 0) && (errno == 0)) + continue; + } + + if (fd >= 0) + if (!fdarray_append(&library_dirs, fd)) { + success = false; + break; + } + } + + free(pathcopy); + return success; +} + +/* + * Build the list of library file descriptors. 
+ */ +static void +init_libdirs(void) +{ + fdarray_init(&library_dirs); + +#ifdef IN_RTLD_CAP + char *envvar = getenv(LD_ "LIBRARY_DIRS"); + + if (envvar == NULL) + err(-1, "No %s set in capability mode", LD_ "LIBRARY_DIRS"); + + add_libdir_paths(envvar); +#else /* !IN_RTLD_CAP */ + /* Look for directories a la find_library (TODO: refactor!). */ + add_libdir_paths(ld_library_path); + add_libdir_paths(gethints()); + add_libdir_paths(STANDARD_LIBRARY_PATH); +#endif +} + +/* + * Return an array of file descriptors for the library search paths. + */ +int +ld_libdirs(int *fds, int *fdcount) +{ + if (fdcount == NULL) + return (-1); + else if (fds == NULL) { + *fdcount = -1; + return (-1); + } + + int lockstate = rlock_acquire(library_dirs.lock); + + if (library_dirs.content == NULL) + init_libdirs(); + + if (*fdcount < library_dirs.count) { + *fdcount = library_dirs.count; + rlock_release(library_dirs.lock, lockstate); + return (-1); + } + + *fdcount = library_dirs.count; + memcpy(fds, library_dirs.content, *fdcount * sizeof(int)); + + rlock_release(library_dirs.lock, lockstate); + + return (0); +} int dlclose(void *handle) @@ -2407,6 +2758,7 @@ _info.dls_cnt = 0; path_enumerate(ld_library_path, fill_search_info, &args); +#ifndef IN_RTLD_CAP path_enumerate(obj->rpath, fill_search_info, &args); path_enumerate(gethints(), fill_search_info, &args); path_enumerate(STANDARD_LIBRARY_PATH, fill_search_info, &args); @@ -2417,12 +2769,14 @@ info->dls_cnt = _info.dls_cnt; return (0); } +#endif if (info->dls_cnt != _info.dls_cnt || info->dls_size != _info.dls_size) { _rtld_error("Uninitialized Dl_serinfo struct passed to dlinfo()"); return (-1); } +#ifndef IN_RTLD_CAP args.request = RTLD_DI_SERINFO; args.serinfo = info; args.serpath = &info->dls_serpath[0]; @@ -2443,6 +2797,7 @@ args.flags = LA_SER_DEFAULT; if (path_enumerate(STANDARD_LIBRARY_PATH, fill_search_info, &args) != NULL) return (-1); +#endif return (0); } diff -aurN -x '*.orig' src-clean/libexec/rtld-elf/rtld.h 
src/libexec/rtld-elf/rtld.h --- src-clean/libexec/rtld-elf/rtld.h 2010-08-25 10:10:36.000000000 +0200 +++ src/libexec/rtld-elf/rtld.h 2010-08-25 10:24:35.000000000 +0200 @@ -226,6 +226,7 @@ struct link_map linkmap; /* For GDB and dlinfo() */ Objlist dldags; /* Object belongs to these dlopened DAGs (%) */ Objlist dagmembers; /* DAG has these members (%) */ + int fd; /* Object's file descriptor */ dev_t dev; /* Object's filesystem's device */ ino_t ino; /* Object's inode number */ void *priv; /* Platform-dependant */ @@ -255,6 +256,11 @@ const Obj_Entry *obj; /* Shared object which defines it */ } SymCache; +/* + * Storage for arrays of file descriptors. + */ +struct Struct_FDArray; + extern void _rtld_error(const char *, ...) __printflike(1, 2); extern Obj_Entry *map_object(int, const char *, const struct stat *); extern void *xcalloc(size_t); @@ -287,6 +293,11 @@ bool allocate_tls_offset(Obj_Entry *obj); void free_tls_offset(Obj_Entry *obj); const Ver_Entry *fetch_ventry(const Obj_Entry *obj, unsigned long); +bool fdarray_init(struct Struct_FDArray*); +bool fdarray_append(struct Struct_FDArray*, int); +int* fdarray_get(struct Struct_FDArray*); +int fdarray_lock(struct Struct_FDArray*); +void fdarray_unlock(struct Struct_FDArray*, int); /* * MD function declarations. 
diff -aurN -x '*.orig' src-clean/libexec/rtld-elf/rtld_lock.c src/libexec/rtld-elf/rtld_lock.c --- src-clean/libexec/rtld-elf/rtld_lock.c 2010-08-25 10:10:36.000000000 +0200 +++ src/libexec/rtld-elf/rtld_lock.c 2010-08-25 10:24:35.000000000 +0200 @@ -173,7 +173,7 @@ lockinfo.thread_clr_flag(mask); } -#define RTLD_LOCK_CNT 3 +#define RTLD_LOCK_CNT 5 struct rtld_lock { void *handle; int mask; @@ -182,6 +182,8 @@ rtld_lock_t rtld_bind_lock = &rtld_locks[0]; rtld_lock_t rtld_libc_lock = &rtld_locks[1]; rtld_lock_t rtld_phdr_lock = &rtld_locks[2]; +rtld_lock_t rtld_dirs_lock = &rtld_locks[3]; +rtld_lock_t rtld_preloads_lock = &rtld_locks[4]; int rlock_acquire(rtld_lock_t lock) diff -aurN -x '*.orig' src-clean/libexec/rtld-elf/rtld_lock.h src/libexec/rtld-elf/rtld_lock.h --- src-clean/libexec/rtld-elf/rtld_lock.h 2010-08-25 10:10:36.000000000 +0200 +++ src/libexec/rtld-elf/rtld_lock.h 2010-08-25 10:24:35.000000000 +0200 @@ -56,6 +56,8 @@ extern rtld_lock_t rtld_bind_lock; extern rtld_lock_t rtld_libc_lock; extern rtld_lock_t rtld_phdr_lock; +extern rtld_lock_t rtld_dirs_lock; +extern rtld_lock_t rtld_preloads_lock; int rlock_acquire(rtld_lock_t); int wlock_acquire(rtld_lock_t); diff -aurN -x '*.orig' src-clean/libexec/rtld-elf-cap/Makefile src/libexec/rtld-elf-cap/Makefile --- src-clean/libexec/rtld-elf-cap/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ src/libexec/rtld-elf-cap/Makefile 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,55 @@ +# $FreeBSD$ +# $P4: //depot/projects/trustedbsd/capabilities/src/libexec/rtld-elf-cap/Makefile#21 $ + +.include +MK_SSP= no + +PROG?= ld-elf-cap.so.1 +SRCS= rtld_start.S \ + reloc.c rtld.c rtld_lock.c map_object.c \ + malloc.c xmalloc.c debug.c \ + crtbrand.c rtld_sandbox.c +MAN= rtld-elf-cap.1 +CSTD?= gnu99 +CFLAGS+= -Wall -DFREEBSD_ELF -DIN_RTLD -DIN_RTLD_CAP -g +CFLAGS+= -I${.CURDIR} -I${.CURDIR}/../rtld-elf/${MACHINE_ARCH} -I${.CURDIR}/../rtld-elf +LDFLAGS+= -nostdlib -Wl,-e,.rtld_start +WARNS?= 2 +INSTALLFLAGS= -C -b 
+PRECIOUSPROG= +BINDIR= /libexec +MLINKS= rtld-elf-cap.1 ld-elf-cap.so.1.1 + +CFLAGS+= -fpic -DPIC +LDFLAGS+= -shared -Wl,-Bsymbolic +DPADD= ${LIBC_PIC} +LDADD= -lc_pic -lssp_nonshared + +.if ${MACHINE_ARCH} != "ia64" +.if ${MK_SYMVER} == "yes" +LIBCDIR= ${.CURDIR}/../../lib/libc +VERSION_DEF= ${LIBCDIR}/Versions.def +SYMBOL_MAPS= ${.CURDIR}/../rtld-elf/Symbol.map +VERSION_MAP= Version.map +LDFLAGS+= -Wl,--version-script=${VERSION_MAP} + +${PROG}: ${VERSION_MAP} + +.if exists(${.CURDIR}/../rtld-elf/${MACHINE_ARCH}/Symbol.map) +SYMBOL_MAPS+= ${.CURDIR}/../rtld-elf/${MACHINE_ARCH}/Symbol.map +.endif +.endif +.endif + +SYMBOL_MAPS+= ${.CURDIR}/Symbol.map + +.if exists(${.CURDIR}/../rtld-elf/${MACHINE_ARCH}/Makefile.inc) +.include "${.CURDIR}/../rtld-elf/${MACHINE_ARCH}/Makefile.inc" +.endif + +.PATH: ${.CURDIR}/../rtld-elf +.PATH: ${.CURDIR}/../rtld-elf/${MACHINE_ARCH} +.PATH: ${.CURDIR}/../../lib/csu/common + +.include +.include diff -aurN -x '*.orig' src-clean/libexec/rtld-elf-cap/Symbol.map src/libexec/rtld-elf-cap/Symbol.map --- src-clean/libexec/rtld-elf-cap/Symbol.map 1970-01-01 01:00:00.000000000 +0100 +++ src/libexec/rtld-elf-cap/Symbol.map 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,9 @@ +/* + * $FreeBSD$ + */ + +FBSD_1.1 { + ld_libcache_add; + ld_libcache_lookup; + ld_insandbox; +}; diff -aurN -x '*.orig' src-clean/libexec/rtld-elf-cap/rtld-elf-cap.1 src/libexec/rtld-elf-cap/rtld-elf-cap.1 --- src-clean/libexec/rtld-elf-cap/rtld-elf-cap.1 1970-01-01 01:00:00.000000000 +0100 +++ src/libexec/rtld-elf-cap/rtld-elf-cap.1 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,118 @@ +.\" +.\" Copyright (c) 2009 Robert N. M. Watson +.\" All rights reserved. +.\" +.\" WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED +.\" ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND +.\" UNEXPECTED WAYS. +.\" +.\" This software was developed at the University of Cambridge Computer +.\" Laboratory with support from a grant from Google, Inc. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd June 11, 2009 +.Os +.Dt RTLD-ELF-CAP 1 +.Sh NAME +.Nm ld-elf-cap.so.1 , +.Nm rtld-elf-cap +.Nd capability-mode run-time link editor +.Sh DESCRIPTION +The +.Nm +is a version of +.Xr ld-elf.so.1 1 +specific to the sandbox environment created using +.Xr libcapsicum 3 , +which provides certain extended or modified linker services for that +environment: +.Bl -bullet +.It +Will not attempt to use global file system namespaces that are not available +when running under +.Xr cap_enter 2 . +.It +Expects to be directly executed using +.Xr fexecve 2 , +with the desired binary to run passed as file descriptor 3. 
+.It +Recognizes the additional symbol +.Dv cap_main , +which will be used instead of the normal ELF entry point for a binary when in +sandbox mode. +This makes it easy for a single binary to select different behavior when run in +the different environments. +.It +Interprets the +.Dv LD_LIBCACHE +environment variable set by sandbox start routines, and implements +.Fn ld_libcache_lookup , +allowing file descriptors for binaries and libraries passed across +.Xr fexecve 2 +to be used by +.Xr libcapsicum 3 , +as well as applications. +.It +Implements a version of +.Fn ld_insandbox +that returns true, overriding the libc function that returns false. +.El +.Pp +Applications using +.Dv cap_main +will need to export it as a dynamic symbol, perhaps using +.Xr gcc 1 's +.Dv -rdynamic +command line flag. +.Pp +Most capability-mode applications will be started using the APIs defined in +.Xr libcapsicum 3 , +which properly set up the run-time environment for +.Nm . +.Sh SEE ALSO +.Xr gcc 1 , +.Xr ld-elf.so.1 1 , +.Xr cap_enter 2 , +.Xr fexecve 2 , +.Xr libcapsicum 3 +.Sh HISTORY +Support for capabilities and capabilities mode was developed as part of the +.Tn TrustedBSD +Project. +.Sh BUGS +WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED ON IN +PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND UNEXPECTED WAYS. +.Pp +The format of +.Dv LD_LIBCACHE +is not documented, and may change. +.Sh AUTHORS +.Nm +is derived from +.Xr rtld 1 , +the normal run-time linker, and was developed by +.An "Robert N. M. Watson" +at the University of Cambridge Computer Laboratory with support from a grant +from Google, Inc. diff -aurN -x '*.orig' src-clean/libexec/rtld-elf-cap/rtld_sandbox.c src/libexec/rtld-elf-cap/rtld_sandbox.c --- src-clean/libexec/rtld-elf-cap/rtld_sandbox.c 1970-01-01 01:00:00.000000000 +0100 +++ src/libexec/rtld-elf-cap/rtld_sandbox.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2009 Robert N. M. 
Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +int +ld_insandbox(void) +{ + + return (1); +} diff -aurN -x '*.orig' src-clean/libexec/rtld-elf-cap/rtld_sandbox.h src/libexec/rtld-elf-cap/rtld_sandbox.h --- src-clean/libexec/rtld-elf-cap/rtld_sandbox.h 1970-01-01 01:00:00.000000000 +0100 +++ src/libexec/rtld-elf-cap/rtld_sandbox.h 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef RTLD_SANDBOX_H +#define RTLD_SANDBOX_H + +int ld_insandbox(void); + +#endif /* !RTLD_SANDBOX_H */ diff -aurN -x '*.orig' src-clean/sbin/dhclient/dhclient.c src/sbin/dhclient/dhclient.c --- src-clean/sbin/dhclient/dhclient.c 2010-08-25 10:10:11.000000000 +0200 +++ src/sbin/dhclient/dhclient.c 2010-08-25 10:24:35.000000000 +0200 @@ -61,6 +61,8 @@ #include +#include + #ifndef _PATH_VAREMPTY #define _PATH_VAREMPTY "/var/empty" #endif @@ -438,6 +440,9 @@ if (immediate_daemon) go_daemon(); + if (cap_enter() != 0 && errno != ENOSYS) + error("cap_enter"); + ifi->client->state = S_INIT; state_reboot(ifi); diff -aurN -x '*.orig' src-clean/share/man/man4/unix.4 src/share/man/man4/unix.4 --- src-clean/share/man/man4/unix.4 2010-08-25 10:10:09.000000000 +0200 +++ src/share/man/man4/unix.4 2010-08-25 10:24:35.000000000 +0200 @@ -32,7 +32,7 @@ .\" @(#)unix.4 8.1 (Berkeley) 6/9/93 .\" $FreeBSD: src/share/man/man4/unix.4,v 1.13.10.1.4.1 2010/06/14 02:09:06 kensmith Exp $ .\" -.Dd July 15, 2001 +.Dd October 5, 2009 .Dt UNIX 4 .Os .Sh NAME @@ -52,7 +52,8 @@ The .Ux Ns -domain family supports the -.Dv SOCK_STREAM +.Dv SOCK_STREAM , +.Dv SOCK_SEQPACKET , and .Dv SOCK_DGRAM socket types and uses @@ -127,11 +128,14 @@ .Ux Ns -domain protocol family is comprised of simple transport protocols that support the -.Dv SOCK_STREAM +.Dv SOCK_STREAM , +.Dv SOCK_SEQPACKET , and .Dv SOCK_DGRAM abstractions. 
.Dv SOCK_STREAM +and +.Dv SOCK_SEQPACKET sockets also support the communication of .Ux file descriptors through the use of the @@ -206,8 +210,9 @@ .Xr getsockopt 2 : .Bl -tag -width ".Dv LOCAL_CONNWAIT" .It Dv LOCAL_CREDS -This option may be enabled on a -.Dv SOCK_DGRAM +This option may be enabled on +.Dv SOCK_DGRAM , +.Dv SOCK_SEQPACKET , or a .Dv SOCK_STREAM socket. diff -aurN -x '*.orig' src-clean/share/mk/bsd.libnames.mk src/share/mk/bsd.libnames.mk --- src-clean/share/mk/bsd.libnames.mk 2010-08-25 10:10:07.000000000 +0200 +++ src/share/mk/bsd.libnames.mk 2010-08-25 10:24:35.000000000 +0200 @@ -30,6 +30,7 @@ LIBC_PIC?= ${DESTDIR}${LIBDIR}/libc_pic.a LIBCALENDAR?= ${DESTDIR}${LIBDIR}/libcalendar.a LIBCAM?= ${DESTDIR}${LIBDIR}/libcam.a +LIBCAPSICUM?= ${DESTDIR}${LIBDIR}/libcapsicum.a LIBCOM_ERR?= ${DESTDIR}${LIBDIR}/libcom_err.a LIBCOMPAT?= ${DESTDIR}${LIBDIR}/libcompat.a LIBCRYPT?= ${DESTDIR}${LIBDIR}/libcrypt.a diff -aurN -x '*.orig' src-clean/sys/amd64/amd64/sys_machdep.c src/sys/amd64/amd64/sys_machdep.c --- src-clean/sys/amd64/amd64/sys_machdep.c 2010-08-25 10:09:42.000000000 +0200 +++ src/sys/amd64/amd64/sys_machdep.c 2010-08-25 10:24:35.000000000 +0200 @@ -167,6 +167,10 @@ uint64_t a64base; struct i386_ioperm_args iargs; + /* + * XXXRW: As new operations are added here, check that they are safe + * in capability mode. 
+ */ if (uap->op == I386_GET_LDT || uap->op == I386_SET_LDT) return (sysarch_ldt(td, uap, UIO_USERSPACE)); /* diff -aurN -x '*.orig' src-clean/sys/amd64/amd64/trap.c src/sys/amd64/amd64/trap.c --- src-clean/sys/amd64/amd64/trap.c 2010-08-25 10:09:42.000000000 +0200 +++ src/sys/amd64/amd64/trap.c 2010-08-25 10:24:35.000000000 +0200 @@ -44,6 +44,7 @@ * AMD64 Trap and System call handling */ +#include "opt_capabilities.h" #include "opt_clock.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" @@ -68,6 +69,7 @@ #include #include #include +#include #include #include #ifdef KTRACE @@ -913,6 +915,18 @@ CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, td->td_proc->p_pid, td->td_name, sa.code); +#ifdef CAPABILITIES + /* + * In capabilities mode, we only allow access to system calls flagged + * SYF_CAPENABLED. + */ + if (error == 0) { + if (!(sa.callp->sy_flags & SYF_CAPENABLED) && + (td->td_ucred->cr_flags & CRED_FLAG_CAPMODE)) + error = ENOSYS; + } +#endif + if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; diff -aurN -x '*.orig' src-clean/sys/amd64/conf/CAPABILITIES src/sys/amd64/conf/CAPABILITIES --- src-clean/sys/amd64/conf/CAPABILITIES 1970-01-01 01:00:00.000000000 +0100 +++ src/sys/amd64/conf/CAPABILITIES 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,11 @@ +include GENERIC +ident CAPABILITIES + +options CAPABILITIES +options PROCDESC +options KDTRACE_HOOKS +options INVARIANT_SUPPORT +options INVARIANTS +options WITNESS +options KDB +options DDB diff -aurN -x '*.orig' src-clean/sys/amd64/linux32/linux32_machdep.c src/sys/amd64/linux32/linux32_machdep.c --- src-clean/sys/amd64/linux32/linux32_machdep.c 2010-08-25 10:09:42.000000000 +0200 +++ src/sys/amd64/linux32/linux32_machdep.c 2010-08-25 10:24:35.000000000 +0200 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -861,7 +862,7 @@ * protection options specified. 
*/ - if ((error = fget(td, bsd_args.fd, &fp)) != 0) + if ((error = fget(td, bsd_args.fd, CAP_MMAP, &fp)) != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); diff -aurN -x '*.orig' src-clean/sys/amd64/linux32/linux32_proto.h src/sys/amd64/linux32/linux32_proto.h --- src-clean/sys/amd64/linux32/linux32_proto.h 2010-08-25 10:09:42.000000000 +0200 +++ src/sys/amd64/linux32/linux32_proto.h 2010-08-25 10:24:35.000000000 +0200 @@ -1257,6 +1257,13 @@ #endif /* COMPAT_FREEBSD6 */ + +#ifdef COMPAT_FREEBSD7 + +#define nosys linux_nosys + +#endif /* COMPAT_FREEBSD7 */ + #define LINUX_SYS_AUE_linux_fork AUE_FORK #define LINUX_SYS_AUE_linux_open AUE_OPEN_RWTC #define LINUX_SYS_AUE_linux_waitpid AUE_WAIT4 diff -aurN -x '*.orig' src-clean/sys/arm/arm/sys_machdep.c src/sys/arm/arm/sys_machdep.c --- src-clean/sys/arm/arm/sys_machdep.c 2010-08-25 10:09:42.000000000 +0200 +++ src/sys/arm/arm/sys_machdep.c 2010-08-25 10:24:35.000000000 +0200 @@ -104,6 +104,10 @@ { int error; + /* + * XXXRW: As new operations are added here, check that they are safe + * in capability mode. + */ switch (uap->op) { case ARM_SYNC_ICACHE : error = arm32_sync_icache(td, uap->parms); diff -aurN -x '*.orig' src-clean/sys/bsm/audit_errno.h src/sys/bsm/audit_errno.h --- src-clean/sys/bsm/audit_errno.h 2010-08-25 10:09:43.000000000 +0200 +++ src/sys/bsm/audit_errno.h 2010-08-25 10:24:35.000000000 +0200 @@ -205,6 +205,7 @@ #define BSM_ERRNO_EKEYEXPIRED 220 /* Linux-specific. */ #define BSM_ERRNO_EKEYREVOKED 221 /* Linux-specific. */ #define BSM_ERRNO_EKEYREJECTED 222 /* Linux-specific. */ +#define BSM_ERRNO_ENOTCAPABLE 223 /* FreeBSD-specific. 
*/ /* * In the event that OpenBSM doesn't have a file representation of a local diff -aurN -x '*.orig' src-clean/sys/cddl/compat/opensolaris/sys/file.h src/sys/cddl/compat/opensolaris/sys/file.h --- src-clean/sys/cddl/compat/opensolaris/sys/file.h 2010-08-25 10:09:43.000000000 +0200 +++ src/sys/cddl/compat/opensolaris/sys/file.h 2010-08-25 10:24:35.000000000 +0200 @@ -34,14 +34,18 @@ #ifdef _KERNEL typedef struct file file_t; +#include + static __inline file_t * getf(int fd, int write) { struct file *fp; - if (write && fget_write(curthread, fd, &fp) == 0) + if (write && fget_write(curthread, fd, CAP_WRITE | CAP_SEEK, &fp) == + 0) return (fp); - else if (!write && fget_read(curthread, fd, &fp) == 0) + else if (!write && fget_read(curthread, fd, CAP_READ | CAP_SEEK, + &fp) == 0) return (fp); return (NULL); } diff -aurN -x '*.orig' src-clean/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c --- src-clean/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c 2010-08-25 10:24:35.000000000 +0200 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff -aurN -x '*.orig' src-clean/sys/compat/freebsd32/freebsd32_ioctl.c src/sys/compat/freebsd32/freebsd32_ioctl.c --- src-clean/sys/compat/freebsd32/freebsd32_ioctl.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/freebsd32/freebsd32_ioctl.c 2010-08-25 10:24:35.000000000 +0200 @@ -33,6 +33,7 @@ #include "opt_compat.h" #include +#include #include #include #include @@ -207,7 +208,7 @@ struct file *fp; int error; - if ((error = fget(td, uap->fd, &fp)) != 0) + if ((error = fget(td, uap->fd, CAP_IOCTL, &fp)) != 0) return (error); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { fdrop(fp, td); diff -aurN -x '*.orig' src-clean/sys/compat/freebsd32/freebsd32_syscall.h src/sys/compat/freebsd32/freebsd32_syscall.h --- 
src-clean/sys/compat/freebsd32/freebsd32_syscall.h 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/freebsd32/freebsd32_syscall.h 2010-08-25 10:24:35.000000000 +0200 @@ -409,5 +409,13 @@ #define FREEBSD32_SYS_freebsd32_msgctl 511 #define FREEBSD32_SYS_freebsd32_shmctl 512 #define FREEBSD32_SYS_lpathconf 513 +#define FREEBSD32_SYS_cap_new 514 +#define FREEBSD32_SYS_cap_getrights 515 +#define FREEBSD32_SYS_cap_enter 516 +#define FREEBSD32_SYS_cap_getmode 517 +#define FREEBSD32_SYS_pdfork 518 +#define FREEBSD32_SYS_pdkill 519 +#define FREEBSD32_SYS_pdgetpid 520 +#define FREEBSD32_SYS_pdwait 521 #define FREEBSD32_SYS_freebsd32_pselect 522 #define FREEBSD32_SYS_MAXSYSCALL 523 diff -aurN -x '*.orig' src-clean/sys/compat/freebsd32/freebsd32_syscalls.c src/sys/compat/freebsd32/freebsd32_syscalls.c --- src-clean/sys/compat/freebsd32/freebsd32_syscalls.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/freebsd32/freebsd32_syscalls.c 2010-08-25 10:24:35.000000000 +0200 @@ -537,13 +537,13 @@ "freebsd32_msgctl", /* 511 = freebsd32_msgctl */ "freebsd32_shmctl", /* 512 = freebsd32_shmctl */ "lpathconf", /* 513 = lpathconf */ - "#514", /* 514 = cap_new */ - "#515", /* 515 = cap_getrights */ - "#516", /* 516 = cap_enter */ - "#517", /* 517 = cap_getmode */ - "#518", /* 518 = pdfork */ - "#519", /* 519 = pdkill */ - "#520", /* 520 = pdgetpid */ - "#521", /* 521 = pdwait */ + "cap_new", /* 514 = cap_new */ + "cap_getrights", /* 515 = cap_getrights */ + "cap_enter", /* 516 = cap_enter */ + "cap_getmode", /* 517 = cap_getmode */ + "pdfork", /* 518 = pdfork */ + "pdkill", /* 519 = pdkill */ + "pdgetpid", /* 520 = pdgetpid */ + "pdwait", /* 521 = pdwait */ "freebsd32_pselect", /* 522 = freebsd32_pselect */ }; diff -aurN -x '*.orig' src-clean/sys/compat/freebsd32/freebsd32_sysent.c src/sys/compat/freebsd32/freebsd32_sysent.c --- src-clean/sys/compat/freebsd32/freebsd32_sysent.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/freebsd32/freebsd32_sysent.c 2010-08-25 
10:24:35.000000000 +0200 @@ -574,13 +574,13 @@ { AS(freebsd32_msgctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 511 = freebsd32_msgctl */ { AS(freebsd32_shmctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 512 = freebsd32_shmctl */ { AS(lpathconf_args), (sy_call_t *)lpathconf, AUE_LPATHCONF, NULL, 0, 0, 0 }, /* 513 = lpathconf */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 514 = cap_new */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 515 = cap_getrights */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 516 = cap_enter */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 517 = cap_getmode */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 518 = pdfork */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 519 = pdkill */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 520 = pdgetpid */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 521 = pdwait */ + { AS(cap_new_args), (sy_call_t *)cap_new, AUE_CAP_NEW, NULL, 0, 0, 0 }, /* 514 = cap_new */ + { AS(cap_getrights_args), (sy_call_t *)cap_getrights, AUE_CAP_GETRIGHTS, NULL, 0, 0, 0 }, /* 515 = cap_getrights */ + { 0, (sy_call_t *)cap_enter, AUE_CAP_ENTER, NULL, 0, 0, 0 }, /* 516 = cap_enter */ + { AS(cap_getmode_args), (sy_call_t *)cap_getmode, AUE_CAP_GETMODE, NULL, 0, 0, 0 }, /* 517 = cap_getmode */ + { AS(pdfork_args), (sy_call_t *)pdfork, AUE_NULL, NULL, 0, 0, 0 }, /* 518 = pdfork */ + { AS(pdkill_args), (sy_call_t *)pdkill, AUE_NULL, NULL, 0, 0, 0 }, /* 519 = pdkill */ + { AS(pdgetpid_args), (sy_call_t *)pdgetpid, AUE_NULL, NULL, 0, 0, 0 }, /* 520 = pdgetpid */ + { AS(pdwait_args), (sy_call_t *)pdwait, AUE_NULL, NULL, 0, 0, 0 }, /* 521 = pdwait */ { AS(freebsd32_pselect_args), (sy_call_t *)freebsd32_pselect, AUE_SELECT, NULL, 0, 0, 0 }, /* 522 = freebsd32_pselect */ }; diff -aurN -x '*.orig' src-clean/sys/compat/freebsd32/syscalls.master src/sys/compat/freebsd32/syscalls.master --- 
src-clean/sys/compat/freebsd32/syscalls.master 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/freebsd32/syscalls.master 2010-08-25 10:24:35.000000000 +0200 @@ -951,14 +951,16 @@ 512 AUE_SHMCTL NOSTD { int freebsd32_shmctl(int shmid, int cmd, \ struct shmid_ds32 *buf); } 513 AUE_LPATHCONF NOPROTO { int lpathconf(char *path, int name); } -514 AUE_CAP_NEW UNIMPL cap_new -515 AUE_CAP_GETRIGHTS UNIMPL cap_getrights -516 AUE_CAP_ENTER UNIMPL cap_enter -517 AUE_CAP_GETMODE UNIMPL cap_getmode -518 AUE_PDFORK UNIMPL pdfork -519 AUE_PDKILL UNIMPL pdkill -520 AUE_PDGETPID UNIMPL pdgetpid -521 AUE_PDWAIT UNIMPL pdwait +514 AUE_CAP_NEW NOPROTO { int cap_new(int fd, u_int64_t rights); } +515 AUE_CAP_GETRIGHTS NOPROTO { int cap_getrights(int fd, \ + u_int64_t *rightsp); } +516 AUE_CAP_ENTER NOPROTO { int cap_enter(void); } +517 AUE_CAP_GETMODE NOPROTO { int cap_getmode(u_int *modep); } +518 AUE_NULL NOPROTO { int pdfork(int *fdp); } +519 AUE_NULL NOPROTO { int pdkill(int fd, int signum); } +520 AUE_NULL NOPROTO { int pdgetpid(int fd, pid_t *pidp); } +521 AUE_NULL NOPROTO { int pdwait(int fd, int *status, \ + int options, struct rusage *rusage); } 522 AUE_SELECT STD { int freebsd32_pselect(int nd, fd_set *in, \ fd_set *ou, fd_set *ex, \ const struct timespec32 *ts, \ diff -aurN -x '*.orig' src-clean/sys/compat/linux/linux_file.c src/sys/compat/linux/linux_file.c --- src-clean/sys/compat/linux/linux_file.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/linux/linux_file.c 2010-08-25 10:24:35.000000000 +0200 @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -139,7 +140,7 @@ * having the same filedesc could use that fd without * checking below. */ - error = fget(td, fd, &fp); + error = fget(td, fd, CAP_IOCTL, &fp); if (!error) { sx_slock(&proctree_lock); PROC_LOCK(p); @@ -1043,8 +1044,9 @@ error = pread(td, &bsd); if (error == 0) { + /* XXXRW: No capability rights should be OK. 
*/ /* This seems to violate POSIX but linux does it */ - if ((error = fgetvp(td, uap->fd, &vp)) != 0) + if ((error = fgetvp(td, uap->fd, 0, &vp)) != 0) return (error); if (vp->v_type == VDIR) { vrele(vp); @@ -1392,8 +1394,12 @@ * XXX some Linux applications depend on F_SETOWN having no * significant effect for pipes (SIGIO is not delivered for * pipes under Linux-2.2.35 at least). + * + * Don't really need to check CAP_FCNTL here since real work + * will depend on kern_fcntl(), but it will give the right + * error in the EINVAL case. */ - error = fget(td, args->fd, &fp); + error = fget(td, args->fd, CAP_FCNTL, &fp); if (error) return (error); if (fp->f_type == DTYPE_PIPE) { diff -aurN -x '*.orig' src-clean/sys/compat/linux/linux_ioctl.c src/sys/compat/linux/linux_ioctl.c --- src-clean/sys/compat/linux/linux_ioctl.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/linux/linux_ioctl.c 2010-08-25 10:24:35.000000000 +0200 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -166,7 +167,7 @@ u_int sectorsize, fwcylinders, fwheads, fwsectors; off_t mediasize, bytespercyl; - if ((error = fget(td, args->fd, &fp)) != 0) + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_HDIO_GET_GEO: @@ -247,7 +248,7 @@ u_int sectorsize; off_t mediasize; - if ((error = fget(td, args->fd, &fp)) != 0) + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_BLKGETSIZE: @@ -673,7 +674,7 @@ struct file *fp; int error; - if ((error = fget(td, args->fd, &fp)) != 0) + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) return (error); switch (args->cmd & 0xffff) { @@ -1413,7 +1414,7 @@ struct file *fp; int error; - if ((error = fget(td, args->fd, &fp)) != 0) + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) return (error); switch (args->cmd & 0xffff) { @@ -1934,7 +1935,7 @@ struct file *fp; int error; - if ((error = fget(td, 
args->fd, &fp)) != 0) + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) return (error); switch (args->cmd & 0xffff) { @@ -2325,7 +2326,7 @@ ifp = NULL; error = 0; - if ((error = fget(td, args->fd, &fp)) != 0) + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) return (error); type = fp->f_type; fdrop(fp, td); @@ -2551,7 +2552,7 @@ struct file *fp; int error, type; - if ((error = fget(td, args->fd, &fp)) != 0) + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) return (error); type = fp->f_type; fdrop(fp, td); @@ -2577,7 +2578,7 @@ u_long cmd; int error; - if ((error = fget(td, args->fd, &fp)) != 0) { + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) { printf("sg_linux_ioctl: fget returned %d\n", error); return (error); } @@ -2633,7 +2634,7 @@ (unsigned long)args->cmd); #endif - if ((error = fget(td, args->fd, &fp)) != 0) + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) return (error); if ((fp->f_flag & (FREAD|FWRITE)) == 0) { fdrop(fp, td); diff -aurN -x '*.orig' src-clean/sys/compat/linux/linux_socket.c src/sys/compat/linux/linux_socket.c --- src-clean/sys/compat/linux/linux_socket.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/linux/linux_socket.c 2010-08-25 10:24:35.000000000 +0200 @@ -720,8 +720,10 @@ * XXXRW: Instead of using fgetsock(), check that it is a * socket and use the file descriptor reference instead of * creating a new one. + * + * XXXRW: No capability rights required here? */ - error = fgetsock(td, args->s, &so, &fflag); + error = fgetsock(td, args->s, 0, &so, &fflag); if (error == 0) { error = EISCONN; if (fflag & FNONBLOCK) { diff -aurN -x '*.orig' src-clean/sys/compat/linux/linux_stats.c src/sys/compat/linux/linux_stats.c --- src-clean/sys/compat/linux/linux_stats.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/linux/linux_stats.c 2010-08-25 10:24:35.000000000 +0200 @@ -141,8 +141,11 @@ struct vnode *vp; int major, minor; + /* + * XXXRW: No capability rights required here. 
+ */ if ((!S_ISCHR(buf->st_mode) && !S_ISBLK(buf->st_mode)) || - fget(td, fd, &fp) != 0) + fget(td, fd, 0, &fp) != 0) return; vp = fp->f_vnode; if (vp != NULL && vp->v_rdev != NULL && diff -aurN -x '*.orig' src-clean/sys/compat/svr4/svr4_fcntl.c src/sys/compat/svr4/svr4_fcntl.c --- src-clean/sys/compat/svr4/svr4_fcntl.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/svr4/svr4_fcntl.c 2010-08-25 10:24:35.000000000 +0200 @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD: src/sys/compat/svr4/svr4_fcntl.c,v 1.48.2.1.4.1 2010/06/14 02:09:06 kensmith Exp $"); #include +#include #include #include #include @@ -261,7 +262,7 @@ int error, *retval; retval = td->td_retval; - if ((error = fgetvp(td, fd, &vp)) != 0) + if ((error = fgetvp(td, fd, CAP_REVOKE, &vp)) != 0) return (error); if (vp->v_type != VCHR && vp->v_type != VBLK) { @@ -313,7 +314,7 @@ /* * We only support truncating the file. */ - if ((error = fget(td, fd, &fp)) != 0) + if ((error = fget(td, fd, CAP_FTRUNCATE, &fp)) != 0) return (error); vp = fp->f_vnode; @@ -392,7 +393,7 @@ #if defined(NOTYET) struct file *fp; - error = fget(td, retval, &fp); + error = fget(td, retval, CAP_IOCTL, &fp); PROC_UNLOCK(p); /* * we may have lost a race the above open() and diff -aurN -x '*.orig' src-clean/sys/compat/svr4/svr4_filio.c src/sys/compat/svr4/svr4_filio.c --- src-clean/sys/compat/svr4/svr4_filio.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/svr4/svr4_filio.c 2010-08-25 10:24:35.000000000 +0200 @@ -120,7 +120,7 @@ ra.buf = uap->buf; ra.nbyte = uap->nbyte; - if (fget(td, uap->fd, &fp) != 0) { + if (fget(td, uap->fd, CAP_READ, &fp) != 0) { DPRINTF(("Something fishy with the user-supplied file descriptor...\n")); return EBADF; } diff -aurN -x '*.orig' src-clean/sys/compat/svr4/svr4_ioctl.c src/sys/compat/svr4/svr4_ioctl.c --- src-clean/sys/compat/svr4/svr4_ioctl.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/svr4/svr4_ioctl.c 2010-08-25 10:24:35.000000000 +0200 @@ -30,6 +30,7 @@ __FBSDID("$FreeBSD: 
src/sys/compat/svr4/svr4_ioctl.c,v 1.26.2.1.4.1 2010/06/14 02:09:06 kensmith Exp $"); #include +#include #include #include #include @@ -102,7 +103,7 @@ retval = td->td_retval; cmd = uap->com; - if ((error = fget(td, uap->fd, &fp)) != 0) + if ((error = fget(td, uap->fd, CAP_IOCTL, &fp)) != 0) return (error); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { diff -aurN -x '*.orig' src-clean/sys/compat/svr4/svr4_proto.h src/sys/compat/svr4/svr4_proto.h --- src-clean/sys/compat/svr4/svr4_proto.h 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/svr4/svr4_proto.h 2010-08-25 10:24:35.000000000 +0200 @@ -503,6 +503,12 @@ #endif /* COMPAT_FREEBSD6 */ + +#ifdef COMPAT_FREEBSD7 + + +#endif /* COMPAT_FREEBSD7 */ + #define SVR4_SYS_AUE_svr4_sys_open AUE_NULL #define SVR4_SYS_AUE_svr4_sys_wait AUE_NULL #define SVR4_SYS_AUE_svr4_sys_creat AUE_NULL diff -aurN -x '*.orig' src-clean/sys/compat/svr4/svr4_stream.c src/sys/compat/svr4/svr4_stream.c --- src-clean/sys/compat/svr4/svr4_stream.c 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/compat/svr4/svr4_stream.c 2010-08-25 10:24:35.000000000 +0200 @@ -42,6 +42,7 @@ #include "opt_ktrace.h" #include +#include #include #include #include @@ -1448,7 +1449,7 @@ struct file *fp; int error; - if ((error = fget(td, uap->fd, &fp)) != 0) { + if ((error = fget(td, uap->fd, CAP_WRITE, &fp)) != 0) { #ifdef DEBUG_SVR4 uprintf("putmsg: bad fp\n"); #endif @@ -1620,7 +1621,7 @@ struct file *fp; int error; - if ((error = fget(td, uap->fd, &fp)) != 0) { + if ((error = fget(td, uap->fd, CAP_READ, &fp)) != 0) { #ifdef DEBUG_SVR4 uprintf("getmsg: bad fp\n"); #endif diff -aurN -x '*.orig' src-clean/sys/conf/NOTES src/sys/conf/NOTES --- src-clean/sys/conf/NOTES 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/conf/NOTES 2010-08-25 10:24:35.000000000 +0200 @@ -1105,6 +1105,9 @@ options NTFS_ICONV options UDF_ICONV +# Support for process descriptors +options PROCDESC + ##################################################################### # POSIX P1003.1B @@ 
-1126,6 +1129,9 @@ # Support for BSM audit options AUDIT +# Support for kernel capabilities +options CAPABILITIES + # Support for Mandatory Access Control (MAC): options MAC options MAC_BIBA diff -aurN -x '*.orig' src-clean/sys/conf/files src/sys/conf/files --- src-clean/sys/conf/files 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/conf/files 2010-08-25 10:24:35.000000000 +0200 @@ -2149,8 +2149,10 @@ kern/subr_turnstile.c standard kern/subr_unit.c standard kern/subr_witness.c optional witness +kern/sys_capability.c standard kern/sys_generic.c standard kern/sys_pipe.c standard +kern/sys_procdesc.c standard kern/sys_process.c standard kern/sys_socket.c standard kern/syscalls.c optional witness | invariants | kdtrace_hooks diff -aurN -x '*.orig' src-clean/sys/conf/options src/sys/conf/options --- src-clean/sys/conf/options 2010-08-25 10:09:44.000000000 +0200 +++ src/sys/conf/options 2010-08-25 10:24:35.000000000 +0200 @@ -63,6 +63,7 @@ ADAPTIVE_LOCKMGRS ALQ AUDIT opt_global.h +CAPABILITIES opt_capabilities.h CODA_COMPAT_5 opt_coda.h COMPAT_43 opt_compat.h COMPAT_43TTY opt_compat.h @@ -144,6 +145,7 @@ PPC_PROBE_CHIPSET opt_ppc.h PPS_SYNC opt_ntp.h PREEMPTION opt_sched.h +PROCDESC opt_procdesc.h QUOTA SCHED_4BSD opt_sched.h SCHED_STATS opt_sched.h diff -aurN -x '*.orig' src-clean/sys/dev/aac/aac_linux.c src/sys/dev/aac/aac_linux.c --- src-clean/sys/dev/aac/aac_linux.c 2010-08-25 10:09:52.000000000 +0200 +++ src/sys/dev/aac/aac_linux.c 2010-08-25 10:24:35.000000000 +0200 @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -78,7 +79,7 @@ u_long cmd; int error; - if ((error = fget(td, args->fd, &fp)) != 0) + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) return (error); cmd = args->cmd; diff -aurN -x '*.orig' src-clean/sys/dev/amr/amr_linux.c src/sys/dev/amr/amr_linux.c --- src-clean/sys/dev/amr/amr_linux.c 2010-08-25 10:09:47.000000000 +0200 +++ src/sys/dev/amr/amr_linux.c 2010-08-25 10:24:35.000000000 +0200 @@ -30,6 +30,7 @@ #include 
#include +#include #include #include #include @@ -74,7 +75,7 @@ struct file *fp; int error; - if ((error = fget(p, args->fd, &fp)) != 0) + if ((error = fget(p, args->fd, CAP_IOCTL, &fp)) != 0) return (error); error = fo_ioctl(fp, args->cmd, (caddr_t)args->arg, p->td_ucred, p); fdrop(fp, p); diff -aurN -x '*.orig' src-clean/sys/dev/hwpmc/hwpmc_logging.c src/sys/dev/hwpmc/hwpmc_logging.c --- src-clean/sys/dev/hwpmc/hwpmc_logging.c 2010-08-25 10:09:51.000000000 +0200 +++ src/sys/dev/hwpmc/hwpmc_logging.c 2010-08-25 10:24:35.000000000 +0200 @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD: src/sys/dev/hwpmc/hwpmc_logging.c,v 1.15.2.4.2.1 2010/06/14 02:09:06 kensmith Exp $"); #include +#include #include #include #include @@ -589,7 +590,7 @@ po->po_file)); /* get a reference to the file state */ - error = fget_write(curthread, logfd, &po->po_file); + error = fget_write(curthread, logfd, CAP_WRITE, &po->po_file); if (error) goto error; diff -aurN -x '*.orig' src-clean/sys/dev/ipmi/ipmi_linux.c src/sys/dev/ipmi/ipmi_linux.c --- src-clean/sys/dev/ipmi/ipmi_linux.c 2010-08-25 10:09:51.000000000 +0200 +++ src/sys/dev/ipmi/ipmi_linux.c 2010-08-25 10:24:35.000000000 +0200 @@ -32,6 +32,7 @@ */ #include +#include #include #include #include @@ -92,7 +93,7 @@ u_long cmd; int error; - if ((error = fget(td, args->fd, &fp)) != 0) + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) return (error); cmd = args->cmd; diff -aurN -x '*.orig' src-clean/sys/dev/iscsi/initiator/iscsi.c src/sys/dev/iscsi/initiator/iscsi.c --- src-clean/sys/dev/iscsi/initiator/iscsi.c 2010-08-25 10:09:51.000000000 +0200 +++ src/sys/dev/iscsi/initiator/iscsi.c 2010-08-25 10:24:35.000000000 +0200 @@ -35,6 +35,7 @@ #include "opt_iscsi_initiator.h" #include +#include #include #include #include @@ -399,11 +400,15 @@ if(sp->soc != NULL) isc_stop_receiver(sp); - error = fget(td, fd, &sp->fp); + /* + * XXXRW: Possibly should be CAP_SOCK_ALL? 
+ */ + error = fget(td, fd, CAP_READ | CAP_WRITE | CAP_SHUTDOWN, &sp->fp); if(error) return error; - if((error = fgetsock(td, fd, &sp->soc, 0)) == 0) { + if((error = fgetsock(td, fd, CAP_READ | CAP_WRITE | CAP_SHUTDOWN, + &sp->soc, 0)) == 0) { sp->td = td; isc_start_receiver(sp); } diff -aurN -x '*.orig' src-clean/sys/dev/mfi/mfi_linux.c src/sys/dev/mfi/mfi_linux.c --- src-clean/sys/dev/mfi/mfi_linux.c 2010-08-25 10:09:52.000000000 +0200 +++ src/sys/dev/mfi/mfi_linux.c 2010-08-25 10:24:35.000000000 +0200 @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -95,7 +96,7 @@ break; } - if ((error = fget(p, args->fd, &fp)) != 0) + if ((error = fget(p, args->fd, CAP_IOCTL, &fp)) != 0) return (error); error = fo_ioctl(fp, cmd, (caddr_t)args->arg, p->td_ucred, p); fdrop(fp, p); diff -aurN -x '*.orig' src-clean/sys/dev/snp/snp.c src/sys/dev/snp/snp.c --- src-clean/sys/dev/snp/snp.c 2010-08-25 10:09:54.000000000 +0200 +++ src/sys/dev/snp/snp.c 2010-08-25 10:24:35.000000000 +0200 @@ -28,6 +28,7 @@ __FBSDID("$FreeBSD: src/sys/dev/snp/snp.c,v 1.115.2.1.4.1 2010/06/14 02:09:06 kensmith Exp $"); #include +#include #include #include #include @@ -257,6 +258,9 @@ SNP_UNLOCK(); return (EBUSY); } + /* + * XXXRW: no capability check here. 
+ */ error = ttyhook_register(&ss->snp_tty, td->td_proc, *(int *)data, &snp_hook, ss); SNP_UNLOCK(); diff -aurN -x '*.orig' src-clean/sys/dev/tdfx/tdfx_linux.c src/sys/dev/tdfx/tdfx_linux.c --- src-clean/sys/dev/tdfx/tdfx_linux.c 2010-08-25 10:09:54.000000000 +0200 +++ src/sys/dev/tdfx/tdfx_linux.c 2010-08-25 10:24:35.000000000 +0200 @@ -28,6 +28,7 @@ __FBSDID("$FreeBSD: src/sys/dev/tdfx/tdfx_linux.c,v 1.1.12.1.4.1 2010/06/14 02:09:06 kensmith Exp $"); #include +#include #include #include #include @@ -53,7 +54,7 @@ struct file *fp; - if ((error = fget(td, args->fd, &fp)) != 0) + if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) return (error); /* We simply copy the data and send it right to ioctl */ copyin((caddr_t)args->arg, &d_pio, sizeof(d_pio)); diff -aurN -x '*.orig' src-clean/sys/fs/fdescfs/fdesc_vnops.c src/sys/fs/fdescfs/fdesc_vnops.c --- src-clean/sys/fs/fdescfs/fdesc_vnops.c 2010-08-25 10:09:55.000000000 +0200 +++ src/sys/fs/fdescfs/fdesc_vnops.c 2010-08-25 10:24:35.000000000 +0200 @@ -40,6 +40,7 @@ #include #include +#include #include #include #include @@ -305,7 +306,10 @@ fd = fd1; } - if ((error = fget(td, fd, &fp)) != 0) + /* + * XXXRW: 'fp' isn't actually used so no rights to check? + */ + if ((error = fget(td, fd, 0, &fp)) != 0) goto bad; /* Check if we're looking up ourselves. 
*/ @@ -387,6 +391,7 @@ struct ucred *a_cred; } */ *ap; { + struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; @@ -436,6 +441,7 @@ struct ucred *a_cred; } */ *ap; { +#if 0 struct vattr *vap = ap->a_vap; struct vnode *vp; struct mount *mp; @@ -479,6 +485,9 @@ } fdrop(fp, td); return (error); +#else + return (EOPNOTSUPP); +#endif } #define UIO_MX 16 diff -aurN -x '*.orig' src-clean/sys/fs/nfs/nfsport.h src/sys/fs/nfs/nfsport.h --- src-clean/sys/fs/nfs/nfsport.h 2010-08-25 10:09:55.000000000 +0200 +++ src/sys/fs/nfs/nfsport.h 2010-08-25 10:24:35.000000000 +0200 @@ -42,6 +42,7 @@ #ifdef _KERNEL #include #include +#include #include #include #include diff -aurN -x '*.orig' src-clean/sys/fs/nfsclient/nfs_clport.c src/sys/fs/nfsclient/nfs_clport.c --- src-clean/sys/fs/nfsclient/nfs_clport.c 2010-08-25 10:09:55.000000000 +0200 +++ src/sys/fs/nfsclient/nfs_clport.c 2010-08-25 10:24:35.000000000 +0200 @@ -1184,7 +1184,8 @@ error = copyin(uap->argp, (caddr_t)&nfscbdarg, sizeof(nfscbdarg)); if (error) return (error); - if ((error = fget(td, nfscbdarg.sock, &fp)) != 0) { + if ((error = fget(td, nfscbdarg.sock, CAP_SOCK_ALL, &fp)) + != 0) { return (error); } if (fp->f_type != DTYPE_SOCKET) { diff -aurN -x '*.orig' src-clean/sys/fs/nfsserver/nfs_nfsdport.c src/sys/fs/nfsserver/nfs_nfsdport.c --- src-clean/sys/fs/nfsserver/nfs_nfsdport.c 2010-08-25 10:09:55.000000000 +0200 +++ src/sys/fs/nfsserver/nfs_nfsdport.c 2010-08-25 10:24:35.000000000 +0200 @@ -2925,7 +2925,8 @@ error = copyin(uap->argp, (caddr_t)&sockarg, sizeof (sockarg)); if (error) return (error); - if ((error = fget(td, sockarg.sock, &fp)) != 0) { + if ((error = fget(td, sockarg.sock, CAP_SOCK_ALL, &fp)) + != 0) { return (error); } if (fp->f_type != DTYPE_SOCKET) { diff -aurN -x '*.orig' src-clean/sys/fs/portalfs/portal_vfsops.c src/sys/fs/portalfs/portal_vfsops.c --- src-clean/sys/fs/portalfs/portal_vfsops.c 2010-08-25 10:09:57.000000000 +0200 +++ src/sys/fs/portalfs/portal_vfsops.c 2010-08-25 
10:24:35.000000000 +0200 @@ -40,6 +40,7 @@ #include #include +#include #include #include #include @@ -112,7 +113,10 @@ if (error) return (error); - if ((error = fget(td, v, &fp)) != 0) + /* + * XXXRW: I suppose we want CAP_SOCK_ALL here? + */ + if ((error = fget(td, v, CAP_READ | CAP_WRITE, &fp)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); diff -aurN -x '*.orig' src-clean/sys/fs/portalfs/portal_vnops.c src/sys/fs/portalfs/portal_vnops.c --- src-clean/sys/fs/portalfs/portal_vnops.c 2010-08-25 10:09:57.000000000 +0200 +++ src/sys/fs/portalfs/portal_vnops.c 2010-08-25 10:24:35.000000000 +0200 @@ -407,8 +407,12 @@ /* * Check that the mode the file is being opened for is a subset * of the mode of the existing descriptor. + * + * XXXRW: It is stunningly non-obvious how to handle this with + * respect to capabilities. Does that mean this is simply a bad + * idea? */ - if ((error = fget(td, fd, &fp)) != 0) + if ((error = fget(td, fd, 0, &fp)) != 0) goto bad; if (((ap->a_mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdrop(fp, td); diff -aurN -x '*.orig' src-clean/sys/gnu/fs/xfs/xfs_dfrag.c src/sys/gnu/fs/xfs/xfs_dfrag.c --- src-clean/sys/gnu/fs/xfs/xfs_dfrag.c 2010-08-25 10:09:58.000000000 +0200 +++ src/sys/gnu/fs/xfs/xfs_dfrag.c 2010-08-25 10:24:35.000000000 +0200 @@ -46,6 +46,7 @@ #include "xfs_mac.h" #include "xfs_rw.h" +#include #include /* @@ -79,7 +80,8 @@ } /* Pull information for the target fd */ - if (fgetvp(td, (int)sxp->sx_fdtarget, &bvp) != 0) { + if (fgetvp(td, (int)sxp->sx_fdtarget, CAP_READ | CAP_WRITE, &bvp) + != 0) { error = XFS_ERROR(EINVAL); goto error0; } @@ -91,7 +93,8 @@ goto error0; } - if (fgetvp(td, (int)sxp->sx_fdtmp, &btvp) != 0) { + if (fgetvp(td, (int)sxp->sx_fdtmp, CAP_READ | CAP_WRITE, &btvp) != + 0) { error = XFS_ERROR(EINVAL); goto error0; } diff -aurN -x '*.orig' src-clean/sys/i386/conf/CAPABILITIES src/sys/i386/conf/CAPABILITIES --- src-clean/sys/i386/conf/CAPABILITIES 1970-01-01 01:00:00.000000000 +0100 
+++ src/sys/i386/conf/CAPABILITIES 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,11 @@ +include GENERIC +ident CAPABILITIES + +options CAPABILITIES +options PROCDESC +options KDTRACE_HOOKS +options INVARIANT_SUPPORT +options INVARIANTS +options WITNESS +options KDB +options DDB diff -aurN -x '*.orig' src-clean/sys/i386/i386/sys_machdep.c src/sys/i386/i386/sys_machdep.c --- src-clean/sys/i386/i386/sys_machdep.c 2010-08-25 10:09:58.000000000 +0200 +++ src/sys/i386/i386/sys_machdep.c 2010-08-25 10:24:35.000000000 +0200 @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD: src/sys/i386/i386/sys_machdep.c,v 1.120.2.1.4.1 2010/06/14 02:09:06 kensmith Exp $"); #include "opt_kstack_pages.h" +#include "opt_capabilities.h" #include #include @@ -127,6 +128,10 @@ break; } + /* + * XXXRW: As new operations are added here, check that they are safe + * in capability mode. + */ switch(uap->op) { case I386_GET_LDT: error = i386_get_ldt(td, &kargs.largs); @@ -159,6 +164,10 @@ error = i386_set_ioperm(td, &kargs.iargs); break; case I386_VM86: +#ifdef CAPABILITIES + if (td->td_ucred->cr_flags & CRED_FLAG_CAPMODE) + return (EPERM); +#endif error = vm86_sysarch(td, uap->parms); break; case I386_GET_FSBASE: @@ -316,6 +325,10 @@ int i, error; char *iomap; +#ifdef CAPABILITIES + if (td->td_ucred->cr_flags & CRED_FLAG_CAPMODE) + return (EPERM); +#endif if ((error = priv_check(td, PRIV_IO)) != 0) return (error); if ((error = securelevel_gt(td->td_ucred, 0)) != 0) diff -aurN -x '*.orig' src-clean/sys/i386/i386/trap.c src/sys/i386/i386/trap.c --- src-clean/sys/i386/i386/trap.c 2010-08-25 10:09:58.000000000 +0200 +++ src/sys/i386/i386/trap.c 2010-08-26 14:24:13.000000000 +0200 @@ -44,6 +44,7 @@ * 386 Trap and System call handling */ +#include "opt_capabilities.h" #include "opt_clock.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" @@ -70,6 +71,7 @@ #include #include #include +#include #include #include #ifdef KTRACE @@ -1079,6 +1081,18 @@ CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, 
td->td_proc->p_pid, td->td_name, sa.code); +#ifdef CAPABILITIES + /* + * In capabilities mode, we only allow access to system calls flagged + * SYF_CAPENABLED. + */ + if (error == 0) { + if (!(sa.callp->sy_flags & SYF_CAPENABLED) && + (td->td_ucred->cr_flags & CRED_FLAG_CAPMODE)) + error = ENOSYS; + } +#endif + if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; diff -aurN -x '*.orig' src-clean/sys/i386/ibcs2/ibcs2_fcntl.c src/sys/i386/ibcs2/ibcs2_fcntl.c --- src-clean/sys/i386/ibcs2/ibcs2_fcntl.c 2010-08-25 10:09:58.000000000 +0200 +++ src/sys/i386/ibcs2/ibcs2_fcntl.c 2010-08-25 10:24:35.000000000 +0200 @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -203,7 +204,10 @@ struct file *fp; int error; - error = fget(td, td->td_retval[0], &fp); + /* + * XXXRW: Think more about the capability right to use here. + */ + error = fget(td, td->td_retval[0], CAP_IOCTL, &fp); PROC_UNLOCK(p); if (error) return (EBADF); diff -aurN -x '*.orig' src-clean/sys/i386/ibcs2/ibcs2_ioctl.c src/sys/i386/ibcs2/ibcs2_ioctl.c --- src-clean/sys/i386/ibcs2/ibcs2_ioctl.c 2010-08-25 10:09:58.000000000 +0200 +++ src/sys/i386/ibcs2/ibcs2_ioctl.c 2010-08-25 10:24:35.000000000 +0200 @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -333,7 +334,11 @@ struct file *fp; int error; - if ((error = fget(td, uap->fd, &fp)) != 0) { + /* + * XXXRW: Possibly we should switch (cmd) to generate a rights mask + * to use here, see IBCS2_SIOCSOCKSYS for example. 
+ */ + if ((error = fget(td, uap->fd, CAP_IOCTL, &fp)) != 0) { DPRINTF(("ibcs2_ioctl(%d): bad fd %d ", p->p_pid, uap->fd)); return EBADF; diff -aurN -x '*.orig' src-clean/sys/i386/ibcs2/ibcs2_proto.h src/sys/i386/ibcs2/ibcs2_proto.h --- src-clean/sys/i386/ibcs2/ibcs2_proto.h 2010-08-25 10:09:58.000000000 +0200 +++ src/sys/i386/ibcs2/ibcs2_proto.h 2010-08-25 10:24:35.000000000 +0200 @@ -345,6 +345,12 @@ #endif /* COMPAT_FREEBSD6 */ + +#ifdef COMPAT_FREEBSD7 + + +#endif /* COMPAT_FREEBSD7 */ + #define IBCS2_SYS_AUE_ibcs2_read AUE_NULL #define IBCS2_SYS_AUE_ibcs2_open AUE_OPEN_RWTC #define IBCS2_SYS_AUE_ibcs2_wait AUE_WAIT4 diff -aurN -x '*.orig' src-clean/sys/i386/linux/linux_machdep.c src/sys/i386/linux/linux_machdep.c --- src-clean/sys/i386/linux/linux_machdep.c 2010-08-25 10:09:58.000000000 +0200 +++ src/sys/i386/linux/linux_machdep.c 2010-08-25 10:24:35.000000000 +0200 @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -691,9 +692,12 @@ * The file descriptor fildes is opened with * read permission, regardless of the * protection options specified. + * + * XXXRW: The real work is done in the FreeBSD mmap(), so + * just checking CAP_MMAP here is fine. 
*/ - if ((error = fget(td, bsd_args.fd, &fp)) != 0) + if ((error = fget(td, bsd_args.fd, CAP_MMAP, &fp)) != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); diff -aurN -x '*.orig' src-clean/sys/i386/linux/linux_proto.h src/sys/i386/linux/linux_proto.h --- src-clean/sys/i386/linux/linux_proto.h 2010-08-25 10:09:58.000000000 +0200 +++ src/sys/i386/linux/linux_proto.h 2010-08-25 10:24:35.000000000 +0200 @@ -1277,6 +1277,13 @@ #endif /* COMPAT_FREEBSD6 */ + +#ifdef COMPAT_FREEBSD7 + +#define nosys linux_nosys + +#endif /* COMPAT_FREEBSD7 */ + #define LINUX_SYS_AUE_linux_fork AUE_FORK #define LINUX_SYS_AUE_linux_open AUE_OPEN_RWTC #define LINUX_SYS_AUE_linux_waitpid AUE_WAIT4 diff -aurN -x '*.orig' src-clean/sys/kern/Makefile src/sys/kern/Makefile --- src-clean/sys/kern/Makefile 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/Makefile 2010-08-25 10:24:35.000000000 +0200 @@ -12,7 +12,8 @@ ../sys/sysproto.h init_sysent.c syscalls.c systrace_args.c ../sys/syscall.h \ -../sys/syscall.mk ../sys/sysproto.h: makesyscalls.sh syscalls.master +../sys/syscall.mk ../sys/sysproto.h: makesyscalls.sh syscalls.master \ +capabilities.conf -mv -f init_sysent.c init_sysent.c.bak -mv -f syscalls.c syscalls.c.bak -mv -f systrace_args.c systrace_args.c.bak diff -aurN -x '*.orig' src-clean/sys/kern/capabilities.conf src/sys/kern/capabilities.conf --- src-clean/sys/kern/capabilities.conf 1970-01-01 01:00:00.000000000 +0100 +++ src/sys/kern/capabilities.conf 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,759 @@ +## +## Copyright (c) 2008 Robert N. M. Watson +## All rights reserved. +## +## WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED +## ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND +## UNEXPECTED WAYS. +## +## This software was developed at the University of Cambridge Computer +## Laboratory with support from a grant from Google, Inc. 
+## +## Redistribution and use in source and binary forms, with or without +## modification, are permitted provided that the following conditions +## are met: +## 1. Redistributions of source code must retain the above copyright +## notice, this list of conditions and the following disclaimer. +## 2. Redistributions in binary form must reproduce the above copyright +## notice, this list of conditions and the following disclaimer in the +## documentation and/or other materials provided with the distribution. +## +## THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +## ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +## ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +## FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +## DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +## OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +## HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +## LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +## OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +## SUCH DAMAGE. +## +## $FreeBSD$ +## +## List of system calls enabled in capability mode, one name per line. +## +## Notes: +## - sys_exit(2), abort2(2) and close(2) are very important. +## - Sorted alphabetically, please keep it that way. +## +## $P4: //depot/projects/trustedbsd/capabilities/src/sys/kern/capabilities.conf#27 $ +## + +## +## Allow ACL and MAC label operations by file descriptor, subject to +## capability rights. Allow MAC label operations on the current process but +## we will need to scope __mac_get_pid(2). 
+## +__acl_aclcheck_fd +__acl_delete_fd +__acl_get_fd +__acl_set_fd +__mac_get_fd +#__mac_get_pid +__mac_get_proc +__mac_set_fd +__mac_set_proc + +## +## Allow sysctl(2) as we scope internal to the call; this is a global +## namespace, but there are several critical sysctls required for almost +## anything to run, such as hw.pagesize. For now that policy lives in the +## kernel for performance and simplicity, but perhaps it could move to a +## proxying daemon in userspace. +## +__sysctl + +## +## Allow umtx operations as these are scoped by address space. +## +## XXXRW: Need to check this very carefully. +## +_umtx_lock +_umtx_op +_umtx_unlock + +## +## Allow process termination using abort2(2). +## +abort2 + +## +## Allow accept(2) since it doesn't manipulate namespaces directly, rather +## relies on existing bindings on a socket, subject to capability rights. +## +accept + +## +## Allow AIO operations by file descriptor, subject to capability rights. +## +aio_cancel +aio_error +aio_fsync +aio_read +aio_return +aio_suspend +aio_waitcomplete +aio_write + +## +## audit(2) is a global operation, submitting to the global trail, but it is +## controlled by privilege, and it might be useful to be able to submit +## records from sandboxes. For now, disallow, but we may want to think about +## providing some sort of proxy service for this. +## +#audit + +## +## Disallow bind(2) for now, even though we support CAP_BIND. +## +## XXXRW: Revisit this. +## +#bind + +## +## Allow capability mode and capability system calls. +## +cap_enter +cap_getmode +cap_getrights +cap_new + +## +## Allow read-only clock operations. +## +clock_gettime +clock_getres + +## +## Always allow file descriptor close(2). +## +close +closefrom + +## +## Disallow connect(2) for now, despite CAP_CONNECT. +## +## XXXRW: Revisit this. +## +#connect + +## +## cpuset(2) and related calls require scoping by process, but should +## eventually be allowed, at least in the current process case. 
+## +#cpuset +#cpuset_getaffinity +#cpuset_getid +#cpuset_setaffinity +#cpuset_setid + +## +## Always allow dup(2) and dup2(2) manipulation of the file descriptor table. +## +dup +dup2 + +## +## Allow extended attribute operations by file descriptor, subject to +## capability rights. +## +extattr_delete_fd +extattr_get_fd +extattr_list_fd +extattr_set_fd + +## +## Allow changing file flags, mode, and owner by file descriptor, subject to +## capability rights. +## +fchflags +fchmod +fchown + +## +## For now, allow fcntl(2), subject to capability rights, but this probably +## needs additional scoping. +## +fcntl + +## +## Allow fexecve(2), subject to capability rights. We perform some scoping, +## such as disallowing privilege escalation. +## +fexecve + +## +## Allow flock(2), subject to capability rights. +## +flock + +## +## Allow fork(2), even though it returns pids -- some applications seem to +## prefer this interface. +## +fork + +## +## Allow fpathconf(2), subject to capability rights. +## +fpathconf + +## +## Allow various file descriptor-based I/O operations, subject to capability +## rights. mmap(2) requires further attention. +## +freebsd6_ftruncate +freebsd6_lseek +freebsd6_mmap +freebsd6_pread +freebsd6_pwrite + +## +## Allow querying file and file system state with fstat(2) and fstatfs(2), +## subject to capability rights. +## +fstat +fstatfs + +## +## Allow further file descriptor-based I/O operations, subject to capability +## rights. +## +fsync +ftruncate + +## +## Allow futimes(2), subject to capability rights. +## +futimes + +## +## Allow querying process audit state, subject to normal access control. +## +getaudit +getaudit_addr +getauid + +## +## Allow thread context management with getcontext(2). +## +getcontext + +## +## Allow directory I/O on a file descriptor, subject to capability rights. 
+## Originally we had separate capabilities for directory-specific read +## operations, but on BSD we allow reading the raw directory data, so we just +## rely on CAP_READ (etc) now. +## +## XXXRW: Possibly these should also use CAP_SEEK. +## +getdents +getdirentries + +## +## Allow querying certain trivial global state. +## +getdomainname + +## +## Allow querying current process credential state. +## +getegid +geteuid + +## +## Allow querying certain trivial global state. +## +gethostid +gethostname + +## +## Allow querying per-process timer. +## +getitimer + +## +## Allow querying current process credential state. +## +getgid +getgroups +getlogin + +## +## Allow querying certain trivial global state. +## +getpagesize +getpeername + +## +## Allow querying certain per-process scheduling, resource limit, and +## credential state. +## +## XXXRW: getpgid(2) needs scoping. It's not clear if it's worth scoping +## getppid(2). getpriority(2) needs scoping. getrusage(2) needs scoping. +## getsid(2) needs scoping. +## +getpgid +getpgrp +getpid +getppid +getpriority +getresgid +getresuid +getrlimit +getrusage +getsid + +## +## Allow querying socket state, subject to capability rights. +## +## XXXRW: getsockopt(2) may need more attention. +## +getsockname +getsockopt + +## +## Allow querying the global clock. +## +gettimeofday + +## +## Allow querying current process credential state. +## +getuid + +## +## Disallow ioctl(2) for now, as frequently ioctl(2) operations have global +## scope, but this is a tricky one as it is also required for tty control. +## We do have a capability right for this operation. +## +## XXXRW: This needs to be revisited. +## +#ioctl + +## +## Allow querying current process credential state. +## +issetugid + +## +## Allow kevent(2), as we will authorize based on capability rights on the +## target descriptor. +## +## XXXRW: Do we do this? +## +kevent + +## +## Allow message queue operations on file descriptors, subject to capability +## rights. 
+## +kmq_notify +kmq_setattr +kmq_timedreceive +kmq_timedsend + +## +## Allow kqueue(2), we will control use. +## +kqueue + +## +## Allow managing per-process timers. +## +ktimer_create +ktimer_delete +ktimer_getoverrun +ktimer_gettime +ktimer_settime + +## +## We can't allow ktrace(2) because it relies on a global namespace, but we +## might want to introduce an fktrace(2) of some sort. +## +#ktrace + +## +## Allow AIO operations by file descriptor, subject to capability rights. +## +lio_listio + +## +## Allow listen(2), subject to capability rights. +## +## XXXRW: One might argue this manipulates a global namespace. +## +listen + +## +## Allow I/O-related file descriptors, subject to capability rights. +## +lseek + +## +## Allow MAC label operations by file descriptor, subject to capability +## rights. +## +mac_get_fd +mac_set_fd + +## +## Allow simple VM operations on the current process. +## +madvise +mincore +minherit +mlock +mlockall + +## +## Allow memory mapping a file descriptor, and updating protections, subject +## to capability rights. +## +## XXXRW: We currently don't properly mask VM protections using capability +## rights. +## +mmap +mprotect + +## +## Allow simple VM operations on the current process. +## +msync +munlock +munlockall +munmap + +## +## Allow the current process to sleep. +## +nanosleep + +## +## Allow querying the global clock. +## +ntp_gettime + +## +## Allow AIO operations by file descriptor, subject to capability rights. +## +oaio_read +oaio_write + +## +## Allow simple VM operations on the current process. +## +obreak + +## +## Allow AIO operations by file descriptor, subject to capability rights. +## +olio_listio + +## +## Allow some of the *at(2) calls, which we have constrained to prevent accessing +## files which are not "under" the directory FD given to the syscall. +## +faccessat +fstatat +fchmodat +futimesat +mkdirat +rmdirat +mkfifoat +mknodat +openat +renameat + +## +## Allow entry into open(2). 
This system call will fail, since access to the global +## file namespace has been disallowed, but allowing entry into the syscall means +## that an audit trail will be generated (which is also very useful for debugging). +## +open + +## +## Allow poll(2), which will be scoped by capability rights. +## +## XXXRW: Perhaps we don't need the OpenBSD version? +## XXXRW: We don't yet do that scoping. +## +openbsd_poll + +## +## Process descriptor-related system calls are allowed. +## +pdfork +pdgetpid +pdkill +pdwait4 + +## +## Allow pipe(2). +## +pipe + +## +## Allow poll(2), which will be scoped by capability rights. +## XXXRW: We don't yet do that scoping. +## +poll + +## +## Allow I/O-related file descriptors, subject to capability rights. +## +pread +preadv + +## +## Allow access to profiling state on the current process. +## +profil + +## +## Disallow ptrace(2) for now, but we do need debugging facilities in +## capability mode, so we will want to revisit this, possibly by scoping its +## operation. +## +#ptrace + +## +## Allow I/O-related file descriptors, subject to capability rights. +## +pwrite +pwritev +read +readv +recv +recvfrom +recvmsg + +## +## Allow real-time scheduling primitives to be used. +## +## XXXRW: These require scoping. +## +rtprio +rtprio_thread + +## +## Allow simple VM operations on the current process. +## +sbrk + +## +## Allow querying trivial global scheduler state. +## +sched_get_priority_max +sched_get_priority_min + +## +## Allow various thread/process scheduler operations. +## +## XXXRW: Some of these require further scoping. +## +sched_getparam +sched_getscheduler +sched_rr_getinterval +sched_setparam +sched_setscheduler +sched_yield + +## +## Allow I/O-related file descriptors, subject to capability rights. +## +sctp_generic_recvmsg +sctp_generic_sendmsg +sctp_generic_sendmsg_iov +sctp_peeloff + +## +## Allow select(2), which will be scoped by capability rights. +## +## XXXRW: But is it? 
+## +select + +## +## Allow I/O-related file descriptors, subject to capability rights. Use of +## explicit addresses here is restricted by the system calls themselves. +## +send +sendfile +sendmsg +sendto + +## +## Allow setting per-process audit state, which is controlled separately by +## privileges. +## +setaudit +setaudit_addr +setauid + +## +## Allow setting thread context. +## +setcontext + +## +## Allow setting current process credential state, which is controlled +## separately by privilege. +## +setegid +seteuid +setgid + +## +## Allow use of the process interval timer. +## +setitimer + +## +## Allow setpriority(2). +## +## XXXRW: Requires scoping. +## +setpriority + +## +## Allow setting current process credential state, which is controlled +## separately by privilege. +## +setregid +setresgid +setresuid +setreuid + +## +## Allow setting process resource limits with setrlimit(2). +## +setrlimit + +## +## Allow creating a new session with setsid(2). +## +setsid + +## +## Allow setting socket options with setsockopt(2), subject to capability +## rights. +## +## XXXRW: Might require scoping. +## +setsockopt + +## +## Allow setting current process credential state, which is controlled +## separately by privilege. +## +setuid + +## +## Allow shm_open(2), which is scoped so as to allow only access to new +## anonymous objects. +## +shm_open + +## +## Allow I/O-related file descriptors, subject to capability rights. +## +shutdown + +## +## Allow signal control on current process. +## +sigaction +sigaltstack +sigblock +sigpending +sigprocmask +sigqueue +sigreturn +sigsetmask +sigstack +sigsuspend +sigtimedwait +sigvec +sigwaitinfo + +## +## Allow creating new socket pairs with socket(2) and socketpair(2). +## +socket +socketpair + +## +## Allow simple VM operations on the current process. +## +## XXXRW: Kernel doesn't implement this, so drop? +## +sstk + +## +## Do allow sync(2) for now, but possibly shouldn't. 
+## +sync + +## +## Always allow process termination with sys_exit(2). +## +sys_exit + +## +## sysarch(2) does rather diverse things, but is required on at least i386 +## in order to configure per-thread data. As such, it's scoped on each +## architecture. +## +sysarch + +## +## Allow thread operations operating only on current process. +## +thr_create +thr_exit +thr_kill + +## +## Disallow thr_kill2(2), as it may operate beyond the current process. +## +## XXXRW: Requires scoping. +## +#thr_kill2 + +## +## Allow thread operations operating only on current process. +## +thr_new +thr_self +thr_set_name +thr_suspend +thr_wake + +## +## Allow manipulation of the current process umask with umask(2). +## +umask + +## +## Allow submitting of process trace entries with utrace(2). +## +utrace + +## +## Allow generating UUIDs with uuidgen(2). +## +uuidgen + +## +## Allow I/O-related file descriptors, subject to capability rights. +## +write +writev + +## +## Allow processes to yield(2). +## +yield diff -aurN -x '*.orig' src-clean/sys/kern/imgact_elf.c src/sys/kern/imgact_elf.c --- src-clean/sys/kern/imgact_elf.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/imgact_elf.c 2010-08-25 10:24:35.000000000 +0200 @@ -573,9 +573,14 @@ imgp->object = NULL; imgp->execlabel = NULL; + vfslocked = 0; + if (curthread->td_ucred->cr_flags & CRED_FLAG_CAPMODE) { + nd->ni_vp = NULL; + error = EPERM; + goto fail; + } NDINIT(nd, LOOKUP, MPSAFE|LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread); - vfslocked = 0; if ((error = namei(nd)) != 0) { nd->ni_vp = NULL; goto fail; diff -aurN -x '*.orig' src-clean/sys/kern/init_sysent.c src/sys/kern/init_sysent.c --- src-clean/sys/kern/init_sysent.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/init_sysent.c 2010-08-25 10:24:35.000000000 +0200 @@ -35,12 +35,12 @@ /* The casts are bogus but will do for now. 
*/ struct sysent sysent[] = { { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 0 = syscall */ - { AS(sys_exit_args), (sy_call_t *)sys_exit, AUE_EXIT, NULL, 0, 0, 0 }, /* 1 = exit */ - { 0, (sy_call_t *)fork, AUE_FORK, NULL, 0, 0, 0 }, /* 2 = fork */ - { AS(read_args), (sy_call_t *)read, AUE_NULL, NULL, 0, 0, 0 }, /* 3 = read */ - { AS(write_args), (sy_call_t *)write, AUE_NULL, NULL, 0, 0, 0 }, /* 4 = write */ - { AS(open_args), (sy_call_t *)open, AUE_OPEN_RWTC, NULL, 0, 0, 0 }, /* 5 = open */ - { AS(close_args), (sy_call_t *)close, AUE_CLOSE, NULL, 0, 0, 0 }, /* 6 = close */ + { AS(sys_exit_args), (sy_call_t *)sys_exit, AUE_EXIT, NULL, 0, 0, SYF_CAPENABLED }, /* 1 = exit */ + { 0, (sy_call_t *)fork, AUE_FORK, NULL, 0, 0, SYF_CAPENABLED }, /* 2 = fork */ + { AS(read_args), (sy_call_t *)read, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 3 = read */ + { AS(write_args), (sy_call_t *)write, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 4 = write */ + { AS(open_args), (sy_call_t *)open, AUE_OPEN_RWTC, NULL, 0, 0, SYF_CAPENABLED }, /* 5 = open */ + { AS(close_args), (sy_call_t *)close, AUE_CLOSE, NULL, 0, 0, SYF_CAPENABLED }, /* 6 = close */ { AS(wait_args), (sy_call_t *)wait4, AUE_WAIT4, NULL, 0, 0, 0 }, /* 7 = wait4 */ { compat(AS(ocreat_args),creat), AUE_CREAT, NULL, 0, 0, 0 }, /* 8 = old creat */ { AS(link_args), (sy_call_t *)link, AUE_LINK, NULL, 0, 0, 0 }, /* 9 = link */ @@ -51,199 +51,199 @@ { AS(mknod_args), (sy_call_t *)mknod, AUE_MKNOD, NULL, 0, 0, 0 }, /* 14 = mknod */ { AS(chmod_args), (sy_call_t *)chmod, AUE_CHMOD, NULL, 0, 0, 0 }, /* 15 = chmod */ { AS(chown_args), (sy_call_t *)chown, AUE_CHOWN, NULL, 0, 0, 0 }, /* 16 = chown */ - { AS(obreak_args), (sy_call_t *)obreak, AUE_NULL, NULL, 0, 0, 0 }, /* 17 = break */ + { AS(obreak_args), (sy_call_t *)obreak, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 17 = break */ { compat4(AS(freebsd4_getfsstat_args),getfsstat), AUE_GETFSSTAT, NULL, 0, 0, 0 }, /* 18 = freebsd4 getfsstat */ - { compat(AS(olseek_args),lseek), 
AUE_LSEEK, NULL, 0, 0, 0 }, /* 19 = old lseek */ - { 0, (sy_call_t *)getpid, AUE_GETPID, NULL, 0, 0, 0 }, /* 20 = getpid */ + { compat(AS(olseek_args),lseek), AUE_LSEEK, NULL, 0, 0, SYF_CAPENABLED }, /* 19 = old lseek */ + { 0, (sy_call_t *)getpid, AUE_GETPID, NULL, 0, 0, SYF_CAPENABLED }, /* 20 = getpid */ { AS(mount_args), (sy_call_t *)mount, AUE_MOUNT, NULL, 0, 0, 0 }, /* 21 = mount */ { AS(unmount_args), (sy_call_t *)unmount, AUE_UMOUNT, NULL, 0, 0, 0 }, /* 22 = unmount */ - { AS(setuid_args), (sy_call_t *)setuid, AUE_SETUID, NULL, 0, 0, 0 }, /* 23 = setuid */ - { 0, (sy_call_t *)getuid, AUE_GETUID, NULL, 0, 0, 0 }, /* 24 = getuid */ - { 0, (sy_call_t *)geteuid, AUE_GETEUID, NULL, 0, 0, 0 }, /* 25 = geteuid */ + { AS(setuid_args), (sy_call_t *)setuid, AUE_SETUID, NULL, 0, 0, SYF_CAPENABLED }, /* 23 = setuid */ + { 0, (sy_call_t *)getuid, AUE_GETUID, NULL, 0, 0, SYF_CAPENABLED }, /* 24 = getuid */ + { 0, (sy_call_t *)geteuid, AUE_GETEUID, NULL, 0, 0, SYF_CAPENABLED }, /* 25 = geteuid */ { AS(ptrace_args), (sy_call_t *)ptrace, AUE_PTRACE, NULL, 0, 0, 0 }, /* 26 = ptrace */ - { AS(recvmsg_args), (sy_call_t *)recvmsg, AUE_RECVMSG, NULL, 0, 0, 0 }, /* 27 = recvmsg */ - { AS(sendmsg_args), (sy_call_t *)sendmsg, AUE_SENDMSG, NULL, 0, 0, 0 }, /* 28 = sendmsg */ - { AS(recvfrom_args), (sy_call_t *)recvfrom, AUE_RECVFROM, NULL, 0, 0, 0 }, /* 29 = recvfrom */ - { AS(accept_args), (sy_call_t *)accept, AUE_ACCEPT, NULL, 0, 0, 0 }, /* 30 = accept */ - { AS(getpeername_args), (sy_call_t *)getpeername, AUE_GETPEERNAME, NULL, 0, 0, 0 }, /* 31 = getpeername */ - { AS(getsockname_args), (sy_call_t *)getsockname, AUE_GETSOCKNAME, NULL, 0, 0, 0 }, /* 32 = getsockname */ + { AS(recvmsg_args), (sy_call_t *)recvmsg, AUE_RECVMSG, NULL, 0, 0, SYF_CAPENABLED }, /* 27 = recvmsg */ + { AS(sendmsg_args), (sy_call_t *)sendmsg, AUE_SENDMSG, NULL, 0, 0, SYF_CAPENABLED }, /* 28 = sendmsg */ + { AS(recvfrom_args), (sy_call_t *)recvfrom, AUE_RECVFROM, NULL, 0, 0, SYF_CAPENABLED }, /* 29 = 
recvfrom */ + { AS(accept_args), (sy_call_t *)accept, AUE_ACCEPT, NULL, 0, 0, SYF_CAPENABLED }, /* 30 = accept */ + { AS(getpeername_args), (sy_call_t *)getpeername, AUE_GETPEERNAME, NULL, 0, 0, SYF_CAPENABLED }, /* 31 = getpeername */ + { AS(getsockname_args), (sy_call_t *)getsockname, AUE_GETSOCKNAME, NULL, 0, 0, SYF_CAPENABLED }, /* 32 = getsockname */ { AS(access_args), (sy_call_t *)access, AUE_ACCESS, NULL, 0, 0, 0 }, /* 33 = access */ { AS(chflags_args), (sy_call_t *)chflags, AUE_CHFLAGS, NULL, 0, 0, 0 }, /* 34 = chflags */ - { AS(fchflags_args), (sy_call_t *)fchflags, AUE_FCHFLAGS, NULL, 0, 0, 0 }, /* 35 = fchflags */ - { 0, (sy_call_t *)sync, AUE_SYNC, NULL, 0, 0, 0 }, /* 36 = sync */ + { AS(fchflags_args), (sy_call_t *)fchflags, AUE_FCHFLAGS, NULL, 0, 0, SYF_CAPENABLED }, /* 35 = fchflags */ + { 0, (sy_call_t *)sync, AUE_SYNC, NULL, 0, 0, SYF_CAPENABLED }, /* 36 = sync */ { AS(kill_args), (sy_call_t *)kill, AUE_KILL, NULL, 0, 0, 0 }, /* 37 = kill */ { compat(AS(ostat_args),stat), AUE_STAT, NULL, 0, 0, 0 }, /* 38 = old stat */ - { 0, (sy_call_t *)getppid, AUE_GETPPID, NULL, 0, 0, 0 }, /* 39 = getppid */ + { 0, (sy_call_t *)getppid, AUE_GETPPID, NULL, 0, 0, SYF_CAPENABLED }, /* 39 = getppid */ { compat(AS(olstat_args),lstat), AUE_LSTAT, NULL, 0, 0, 0 }, /* 40 = old lstat */ - { AS(dup_args), (sy_call_t *)dup, AUE_DUP, NULL, 0, 0, 0 }, /* 41 = dup */ - { 0, (sy_call_t *)pipe, AUE_PIPE, NULL, 0, 0, 0 }, /* 42 = pipe */ - { 0, (sy_call_t *)getegid, AUE_GETEGID, NULL, 0, 0, 0 }, /* 43 = getegid */ - { AS(profil_args), (sy_call_t *)profil, AUE_PROFILE, NULL, 0, 0, 0 }, /* 44 = profil */ + { AS(dup_args), (sy_call_t *)dup, AUE_DUP, NULL, 0, 0, SYF_CAPENABLED }, /* 41 = dup */ + { 0, (sy_call_t *)pipe, AUE_PIPE, NULL, 0, 0, SYF_CAPENABLED }, /* 42 = pipe */ + { 0, (sy_call_t *)getegid, AUE_GETEGID, NULL, 0, 0, SYF_CAPENABLED }, /* 43 = getegid */ + { AS(profil_args), (sy_call_t *)profil, AUE_PROFILE, NULL, 0, 0, SYF_CAPENABLED }, /* 44 = profil */ { 
AS(ktrace_args), (sy_call_t *)ktrace, AUE_KTRACE, NULL, 0, 0, 0 }, /* 45 = ktrace */ - { compat(AS(osigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0, 0 }, /* 46 = old sigaction */ - { 0, (sy_call_t *)getgid, AUE_GETGID, NULL, 0, 0, 0 }, /* 47 = getgid */ - { compat(AS(osigprocmask_args),sigprocmask), AUE_SIGPROCMASK, NULL, 0, 0, 0 }, /* 48 = old sigprocmask */ - { AS(getlogin_args), (sy_call_t *)getlogin, AUE_GETLOGIN, NULL, 0, 0, 0 }, /* 49 = getlogin */ + { compat(AS(osigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0, SYF_CAPENABLED }, /* 46 = old sigaction */ + { 0, (sy_call_t *)getgid, AUE_GETGID, NULL, 0, 0, SYF_CAPENABLED }, /* 47 = getgid */ + { compat(AS(osigprocmask_args),sigprocmask), AUE_SIGPROCMASK, NULL, 0, 0, SYF_CAPENABLED }, /* 48 = old sigprocmask */ + { AS(getlogin_args), (sy_call_t *)getlogin, AUE_GETLOGIN, NULL, 0, 0, SYF_CAPENABLED }, /* 49 = getlogin */ { AS(setlogin_args), (sy_call_t *)setlogin, AUE_SETLOGIN, NULL, 0, 0, 0 }, /* 50 = setlogin */ { AS(acct_args), (sy_call_t *)acct, AUE_ACCT, NULL, 0, 0, 0 }, /* 51 = acct */ - { compat(0,sigpending), AUE_SIGPENDING, NULL, 0, 0, 0 }, /* 52 = old sigpending */ - { AS(sigaltstack_args), (sy_call_t *)sigaltstack, AUE_SIGALTSTACK, NULL, 0, 0, 0 }, /* 53 = sigaltstack */ + { compat(0,sigpending), AUE_SIGPENDING, NULL, 0, 0, SYF_CAPENABLED }, /* 52 = old sigpending */ + { AS(sigaltstack_args), (sy_call_t *)sigaltstack, AUE_SIGALTSTACK, NULL, 0, 0, SYF_CAPENABLED }, /* 53 = sigaltstack */ { AS(ioctl_args), (sy_call_t *)ioctl, AUE_IOCTL, NULL, 0, 0, 0 }, /* 54 = ioctl */ { AS(reboot_args), (sy_call_t *)reboot, AUE_REBOOT, NULL, 0, 0, 0 }, /* 55 = reboot */ { AS(revoke_args), (sy_call_t *)revoke, AUE_REVOKE, NULL, 0, 0, 0 }, /* 56 = revoke */ { AS(symlink_args), (sy_call_t *)symlink, AUE_SYMLINK, NULL, 0, 0, 0 }, /* 57 = symlink */ { AS(readlink_args), (sy_call_t *)readlink, AUE_READLINK, NULL, 0, 0, 0 }, /* 58 = readlink */ { AS(execve_args), (sy_call_t *)execve, AUE_EXECVE, NULL, 0, 0, 0 }, /* 
59 = execve */ - { AS(umask_args), (sy_call_t *)umask, AUE_UMASK, NULL, 0, 0, 0 }, /* 60 = umask */ + { AS(umask_args), (sy_call_t *)umask, AUE_UMASK, NULL, 0, 0, SYF_CAPENABLED }, /* 60 = umask */ { AS(chroot_args), (sy_call_t *)chroot, AUE_CHROOT, NULL, 0, 0, 0 }, /* 61 = chroot */ - { compat(AS(ofstat_args),fstat), AUE_FSTAT, NULL, 0, 0, 0 }, /* 62 = old fstat */ + { compat(AS(ofstat_args),fstat), AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED }, /* 62 = old fstat */ { compat(AS(getkerninfo_args),getkerninfo), AUE_NULL, NULL, 0, 0, 0 }, /* 63 = old getkerninfo */ - { compat(0,getpagesize), AUE_NULL, NULL, 0, 0, 0 }, /* 64 = old getpagesize */ - { AS(msync_args), (sy_call_t *)msync, AUE_MSYNC, NULL, 0, 0, 0 }, /* 65 = msync */ + { compat(0,getpagesize), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 64 = old getpagesize */ + { AS(msync_args), (sy_call_t *)msync, AUE_MSYNC, NULL, 0, 0, SYF_CAPENABLED }, /* 65 = msync */ { 0, (sy_call_t *)vfork, AUE_VFORK, NULL, 0, 0, 0 }, /* 66 = vfork */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 67 = obsolete vread */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 68 = obsolete vwrite */ - { AS(sbrk_args), (sy_call_t *)sbrk, AUE_SBRK, NULL, 0, 0, 0 }, /* 69 = sbrk */ - { AS(sstk_args), (sy_call_t *)sstk, AUE_SSTK, NULL, 0, 0, 0 }, /* 70 = sstk */ - { compat(AS(ommap_args),mmap), AUE_MMAP, NULL, 0, 0, 0 }, /* 71 = old mmap */ + { AS(sbrk_args), (sy_call_t *)sbrk, AUE_SBRK, NULL, 0, 0, SYF_CAPENABLED }, /* 69 = sbrk */ + { AS(sstk_args), (sy_call_t *)sstk, AUE_SSTK, NULL, 0, 0, SYF_CAPENABLED }, /* 70 = sstk */ + { compat(AS(ommap_args),mmap), AUE_MMAP, NULL, 0, 0, SYF_CAPENABLED }, /* 71 = old mmap */ { AS(ovadvise_args), (sy_call_t *)ovadvise, AUE_O_VADVISE, NULL, 0, 0, 0 }, /* 72 = vadvise */ - { AS(munmap_args), (sy_call_t *)munmap, AUE_MUNMAP, NULL, 0, 0, 0 }, /* 73 = munmap */ - { AS(mprotect_args), (sy_call_t *)mprotect, AUE_MPROTECT, NULL, 0, 0, 0 }, /* 74 = mprotect */ - { AS(madvise_args), (sy_call_t *)madvise, 
AUE_MADVISE, NULL, 0, 0, 0 }, /* 75 = madvise */ + { AS(munmap_args), (sy_call_t *)munmap, AUE_MUNMAP, NULL, 0, 0, SYF_CAPENABLED }, /* 73 = munmap */ + { AS(mprotect_args), (sy_call_t *)mprotect, AUE_MPROTECT, NULL, 0, 0, SYF_CAPENABLED }, /* 74 = mprotect */ + { AS(madvise_args), (sy_call_t *)madvise, AUE_MADVISE, NULL, 0, 0, SYF_CAPENABLED }, /* 75 = madvise */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 76 = obsolete vhangup */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 77 = obsolete vlimit */ - { AS(mincore_args), (sy_call_t *)mincore, AUE_MINCORE, NULL, 0, 0, 0 }, /* 78 = mincore */ - { AS(getgroups_args), (sy_call_t *)getgroups, AUE_GETGROUPS, NULL, 0, 0, 0 }, /* 79 = getgroups */ + { AS(mincore_args), (sy_call_t *)mincore, AUE_MINCORE, NULL, 0, 0, SYF_CAPENABLED }, /* 78 = mincore */ + { AS(getgroups_args), (sy_call_t *)getgroups, AUE_GETGROUPS, NULL, 0, 0, SYF_CAPENABLED }, /* 79 = getgroups */ { AS(setgroups_args), (sy_call_t *)setgroups, AUE_SETGROUPS, NULL, 0, 0, 0 }, /* 80 = setgroups */ - { 0, (sy_call_t *)getpgrp, AUE_GETPGRP, NULL, 0, 0, 0 }, /* 81 = getpgrp */ + { 0, (sy_call_t *)getpgrp, AUE_GETPGRP, NULL, 0, 0, SYF_CAPENABLED }, /* 81 = getpgrp */ { AS(setpgid_args), (sy_call_t *)setpgid, AUE_SETPGRP, NULL, 0, 0, 0 }, /* 82 = setpgid */ - { AS(setitimer_args), (sy_call_t *)setitimer, AUE_SETITIMER, NULL, 0, 0, 0 }, /* 83 = setitimer */ + { AS(setitimer_args), (sy_call_t *)setitimer, AUE_SETITIMER, NULL, 0, 0, SYF_CAPENABLED }, /* 83 = setitimer */ { compat(0,wait), AUE_WAIT4, NULL, 0, 0, 0 }, /* 84 = old wait */ { AS(swapon_args), (sy_call_t *)swapon, AUE_SWAPON, NULL, 0, 0, 0 }, /* 85 = swapon */ - { AS(getitimer_args), (sy_call_t *)getitimer, AUE_GETITIMER, NULL, 0, 0, 0 }, /* 86 = getitimer */ - { compat(AS(gethostname_args),gethostname), AUE_SYSCTL, NULL, 0, 0, 0 }, /* 87 = old gethostname */ + { AS(getitimer_args), (sy_call_t *)getitimer, AUE_GETITIMER, NULL, 0, 0, SYF_CAPENABLED }, /* 86 = getitimer */ + { 
compat(AS(gethostname_args),gethostname), AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED }, /* 87 = old gethostname */ { compat(AS(sethostname_args),sethostname), AUE_SYSCTL, NULL, 0, 0, 0 }, /* 88 = old sethostname */ { 0, (sy_call_t *)getdtablesize, AUE_GETDTABLESIZE, NULL, 0, 0, 0 }, /* 89 = getdtablesize */ - { AS(dup2_args), (sy_call_t *)dup2, AUE_DUP2, NULL, 0, 0, 0 }, /* 90 = dup2 */ + { AS(dup2_args), (sy_call_t *)dup2, AUE_DUP2, NULL, 0, 0, SYF_CAPENABLED }, /* 90 = dup2 */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 91 = getdopt */ - { AS(fcntl_args), (sy_call_t *)fcntl, AUE_FCNTL, NULL, 0, 0, 0 }, /* 92 = fcntl */ - { AS(select_args), (sy_call_t *)select, AUE_SELECT, NULL, 0, 0, 0 }, /* 93 = select */ + { AS(fcntl_args), (sy_call_t *)fcntl, AUE_FCNTL, NULL, 0, 0, SYF_CAPENABLED }, /* 92 = fcntl */ + { AS(select_args), (sy_call_t *)select, AUE_SELECT, NULL, 0, 0, SYF_CAPENABLED }, /* 93 = select */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 94 = setdopt */ - { AS(fsync_args), (sy_call_t *)fsync, AUE_FSYNC, NULL, 0, 0, 0 }, /* 95 = fsync */ - { AS(setpriority_args), (sy_call_t *)setpriority, AUE_SETPRIORITY, NULL, 0, 0, 0 }, /* 96 = setpriority */ - { AS(socket_args), (sy_call_t *)socket, AUE_SOCKET, NULL, 0, 0, 0 }, /* 97 = socket */ + { AS(fsync_args), (sy_call_t *)fsync, AUE_FSYNC, NULL, 0, 0, SYF_CAPENABLED }, /* 95 = fsync */ + { AS(setpriority_args), (sy_call_t *)setpriority, AUE_SETPRIORITY, NULL, 0, 0, SYF_CAPENABLED }, /* 96 = setpriority */ + { AS(socket_args), (sy_call_t *)socket, AUE_SOCKET, NULL, 0, 0, SYF_CAPENABLED }, /* 97 = socket */ { AS(connect_args), (sy_call_t *)connect, AUE_CONNECT, NULL, 0, 0, 0 }, /* 98 = connect */ - { compat(AS(accept_args),accept), AUE_ACCEPT, NULL, 0, 0, 0 }, /* 99 = old accept */ - { AS(getpriority_args), (sy_call_t *)getpriority, AUE_GETPRIORITY, NULL, 0, 0, 0 }, /* 100 = getpriority */ - { compat(AS(osend_args),send), AUE_SEND, NULL, 0, 0, 0 }, /* 101 = old send */ - { 
compat(AS(orecv_args),recv), AUE_RECV, NULL, 0, 0, 0 }, /* 102 = old recv */ - { compat(AS(osigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0, 0 }, /* 103 = old sigreturn */ + { compat(AS(accept_args),accept), AUE_ACCEPT, NULL, 0, 0, SYF_CAPENABLED }, /* 99 = old accept */ + { AS(getpriority_args), (sy_call_t *)getpriority, AUE_GETPRIORITY, NULL, 0, 0, SYF_CAPENABLED }, /* 100 = getpriority */ + { compat(AS(osend_args),send), AUE_SEND, NULL, 0, 0, SYF_CAPENABLED }, /* 101 = old send */ + { compat(AS(orecv_args),recv), AUE_RECV, NULL, 0, 0, SYF_CAPENABLED }, /* 102 = old recv */ + { compat(AS(osigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0, SYF_CAPENABLED }, /* 103 = old sigreturn */ { AS(bind_args), (sy_call_t *)bind, AUE_BIND, NULL, 0, 0, 0 }, /* 104 = bind */ - { AS(setsockopt_args), (sy_call_t *)setsockopt, AUE_SETSOCKOPT, NULL, 0, 0, 0 }, /* 105 = setsockopt */ - { AS(listen_args), (sy_call_t *)listen, AUE_LISTEN, NULL, 0, 0, 0 }, /* 106 = listen */ + { AS(setsockopt_args), (sy_call_t *)setsockopt, AUE_SETSOCKOPT, NULL, 0, 0, SYF_CAPENABLED }, /* 105 = setsockopt */ + { AS(listen_args), (sy_call_t *)listen, AUE_LISTEN, NULL, 0, 0, SYF_CAPENABLED }, /* 106 = listen */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 107 = obsolete vtimes */ - { compat(AS(osigvec_args),sigvec), AUE_NULL, NULL, 0, 0, 0 }, /* 108 = old sigvec */ - { compat(AS(osigblock_args),sigblock), AUE_NULL, NULL, 0, 0, 0 }, /* 109 = old sigblock */ - { compat(AS(osigsetmask_args),sigsetmask), AUE_NULL, NULL, 0, 0, 0 }, /* 110 = old sigsetmask */ - { compat(AS(osigsuspend_args),sigsuspend), AUE_NULL, NULL, 0, 0, 0 }, /* 111 = old sigsuspend */ - { compat(AS(osigstack_args),sigstack), AUE_NULL, NULL, 0, 0, 0 }, /* 112 = old sigstack */ - { compat(AS(orecvmsg_args),recvmsg), AUE_RECVMSG, NULL, 0, 0, 0 }, /* 113 = old recvmsg */ - { compat(AS(osendmsg_args),sendmsg), AUE_SENDMSG, NULL, 0, 0, 0 }, /* 114 = old sendmsg */ + { compat(AS(osigvec_args),sigvec), AUE_NULL, NULL, 0, 0, 
SYF_CAPENABLED }, /* 108 = old sigvec */ + { compat(AS(osigblock_args),sigblock), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 109 = old sigblock */ + { compat(AS(osigsetmask_args),sigsetmask), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 110 = old sigsetmask */ + { compat(AS(osigsuspend_args),sigsuspend), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 111 = old sigsuspend */ + { compat(AS(osigstack_args),sigstack), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 112 = old sigstack */ + { compat(AS(orecvmsg_args),recvmsg), AUE_RECVMSG, NULL, 0, 0, SYF_CAPENABLED }, /* 113 = old recvmsg */ + { compat(AS(osendmsg_args),sendmsg), AUE_SENDMSG, NULL, 0, 0, SYF_CAPENABLED }, /* 114 = old sendmsg */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 115 = obsolete vtrace */ - { AS(gettimeofday_args), (sy_call_t *)gettimeofday, AUE_GETTIMEOFDAY, NULL, 0, 0, 0 }, /* 116 = gettimeofday */ - { AS(getrusage_args), (sy_call_t *)getrusage, AUE_GETRUSAGE, NULL, 0, 0, 0 }, /* 117 = getrusage */ - { AS(getsockopt_args), (sy_call_t *)getsockopt, AUE_GETSOCKOPT, NULL, 0, 0, 0 }, /* 118 = getsockopt */ + { AS(gettimeofday_args), (sy_call_t *)gettimeofday, AUE_GETTIMEOFDAY, NULL, 0, 0, SYF_CAPENABLED }, /* 116 = gettimeofday */ + { AS(getrusage_args), (sy_call_t *)getrusage, AUE_GETRUSAGE, NULL, 0, 0, SYF_CAPENABLED }, /* 117 = getrusage */ + { AS(getsockopt_args), (sy_call_t *)getsockopt, AUE_GETSOCKOPT, NULL, 0, 0, SYF_CAPENABLED }, /* 118 = getsockopt */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 119 = resuba */ - { AS(readv_args), (sy_call_t *)readv, AUE_READV, NULL, 0, 0, 0 }, /* 120 = readv */ - { AS(writev_args), (sy_call_t *)writev, AUE_WRITEV, NULL, 0, 0, 0 }, /* 121 = writev */ + { AS(readv_args), (sy_call_t *)readv, AUE_READV, NULL, 0, 0, SYF_CAPENABLED }, /* 120 = readv */ + { AS(writev_args), (sy_call_t *)writev, AUE_WRITEV, NULL, 0, 0, SYF_CAPENABLED }, /* 121 = writev */ { AS(settimeofday_args), (sy_call_t *)settimeofday, AUE_SETTIMEOFDAY, NULL, 0, 0, 0 }, /* 122 = 
settimeofday */ - { AS(fchown_args), (sy_call_t *)fchown, AUE_FCHOWN, NULL, 0, 0, 0 }, /* 123 = fchown */ - { AS(fchmod_args), (sy_call_t *)fchmod, AUE_FCHMOD, NULL, 0, 0, 0 }, /* 124 = fchmod */ - { compat(AS(recvfrom_args),recvfrom), AUE_RECVFROM, NULL, 0, 0, 0 }, /* 125 = old recvfrom */ - { AS(setreuid_args), (sy_call_t *)setreuid, AUE_SETREUID, NULL, 0, 0, 0 }, /* 126 = setreuid */ - { AS(setregid_args), (sy_call_t *)setregid, AUE_SETREGID, NULL, 0, 0, 0 }, /* 127 = setregid */ + { AS(fchown_args), (sy_call_t *)fchown, AUE_FCHOWN, NULL, 0, 0, SYF_CAPENABLED }, /* 123 = fchown */ + { AS(fchmod_args), (sy_call_t *)fchmod, AUE_FCHMOD, NULL, 0, 0, SYF_CAPENABLED }, /* 124 = fchmod */ + { compat(AS(recvfrom_args),recvfrom), AUE_RECVFROM, NULL, 0, 0, SYF_CAPENABLED }, /* 125 = old recvfrom */ + { AS(setreuid_args), (sy_call_t *)setreuid, AUE_SETREUID, NULL, 0, 0, SYF_CAPENABLED }, /* 126 = setreuid */ + { AS(setregid_args), (sy_call_t *)setregid, AUE_SETREGID, NULL, 0, 0, SYF_CAPENABLED }, /* 127 = setregid */ { AS(rename_args), (sy_call_t *)rename, AUE_RENAME, NULL, 0, 0, 0 }, /* 128 = rename */ { compat(AS(otruncate_args),truncate), AUE_TRUNCATE, NULL, 0, 0, 0 }, /* 129 = old truncate */ - { compat(AS(oftruncate_args),ftruncate), AUE_FTRUNCATE, NULL, 0, 0, 0 }, /* 130 = old ftruncate */ - { AS(flock_args), (sy_call_t *)flock, AUE_FLOCK, NULL, 0, 0, 0 }, /* 131 = flock */ + { compat(AS(oftruncate_args),ftruncate), AUE_FTRUNCATE, NULL, 0, 0, SYF_CAPENABLED }, /* 130 = old ftruncate */ + { AS(flock_args), (sy_call_t *)flock, AUE_FLOCK, NULL, 0, 0, SYF_CAPENABLED }, /* 131 = flock */ { AS(mkfifo_args), (sy_call_t *)mkfifo, AUE_MKFIFO, NULL, 0, 0, 0 }, /* 132 = mkfifo */ - { AS(sendto_args), (sy_call_t *)sendto, AUE_SENDTO, NULL, 0, 0, 0 }, /* 133 = sendto */ - { AS(shutdown_args), (sy_call_t *)shutdown, AUE_SHUTDOWN, NULL, 0, 0, 0 }, /* 134 = shutdown */ - { AS(socketpair_args), (sy_call_t *)socketpair, AUE_SOCKETPAIR, NULL, 0, 0, 0 }, /* 135 = socketpair */ + { 
AS(sendto_args), (sy_call_t *)sendto, AUE_SENDTO, NULL, 0, 0, SYF_CAPENABLED }, /* 133 = sendto */ + { AS(shutdown_args), (sy_call_t *)shutdown, AUE_SHUTDOWN, NULL, 0, 0, SYF_CAPENABLED }, /* 134 = shutdown */ + { AS(socketpair_args), (sy_call_t *)socketpair, AUE_SOCKETPAIR, NULL, 0, 0, SYF_CAPENABLED }, /* 135 = socketpair */ { AS(mkdir_args), (sy_call_t *)mkdir, AUE_MKDIR, NULL, 0, 0, 0 }, /* 136 = mkdir */ { AS(rmdir_args), (sy_call_t *)rmdir, AUE_RMDIR, NULL, 0, 0, 0 }, /* 137 = rmdir */ { AS(utimes_args), (sy_call_t *)utimes, AUE_UTIMES, NULL, 0, 0, 0 }, /* 138 = utimes */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 139 = obsolete 4.2 sigreturn */ { AS(adjtime_args), (sy_call_t *)adjtime, AUE_ADJTIME, NULL, 0, 0, 0 }, /* 140 = adjtime */ - { compat(AS(ogetpeername_args),getpeername), AUE_GETPEERNAME, NULL, 0, 0, 0 }, /* 141 = old getpeername */ - { compat(0,gethostid), AUE_SYSCTL, NULL, 0, 0, 0 }, /* 142 = old gethostid */ + { compat(AS(ogetpeername_args),getpeername), AUE_GETPEERNAME, NULL, 0, 0, SYF_CAPENABLED }, /* 141 = old getpeername */ + { compat(0,gethostid), AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED }, /* 142 = old gethostid */ { compat(AS(osethostid_args),sethostid), AUE_SYSCTL, NULL, 0, 0, 0 }, /* 143 = old sethostid */ - { compat(AS(ogetrlimit_args),getrlimit), AUE_GETRLIMIT, NULL, 0, 0, 0 }, /* 144 = old getrlimit */ - { compat(AS(osetrlimit_args),setrlimit), AUE_SETRLIMIT, NULL, 0, 0, 0 }, /* 145 = old setrlimit */ + { compat(AS(ogetrlimit_args),getrlimit), AUE_GETRLIMIT, NULL, 0, 0, SYF_CAPENABLED }, /* 144 = old getrlimit */ + { compat(AS(osetrlimit_args),setrlimit), AUE_SETRLIMIT, NULL, 0, 0, SYF_CAPENABLED }, /* 145 = old setrlimit */ { compat(AS(okillpg_args),killpg), AUE_KILLPG, NULL, 0, 0, 0 }, /* 146 = old killpg */ - { 0, (sy_call_t *)setsid, AUE_SETSID, NULL, 0, 0, 0 }, /* 147 = setsid */ + { 0, (sy_call_t *)setsid, AUE_SETSID, NULL, 0, 0, SYF_CAPENABLED }, /* 147 = setsid */ { AS(quotactl_args), (sy_call_t *)quotactl, 
AUE_QUOTACTL, NULL, 0, 0, 0 }, /* 148 = quotactl */ { compat(0,quota), AUE_O_QUOTA, NULL, 0, 0, 0 }, /* 149 = old quota */ - { compat(AS(getsockname_args),getsockname), AUE_GETSOCKNAME, NULL, 0, 0, 0 }, /* 150 = old getsockname */ + { compat(AS(getsockname_args),getsockname), AUE_GETSOCKNAME, NULL, 0, 0, SYF_CAPENABLED }, /* 150 = old getsockname */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 151 = sem_lock */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 152 = sem_wakeup */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 153 = asyncdaemon */ { AS(nlm_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 154 = nlm_syscall */ { AS(nfssvc_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 155 = nfssvc */ - { compat(AS(ogetdirentries_args),getdirentries), AUE_GETDIRENTRIES, NULL, 0, 0, 0 }, /* 156 = old getdirentries */ + { compat(AS(ogetdirentries_args),getdirentries), AUE_GETDIRENTRIES, NULL, 0, 0, SYF_CAPENABLED }, /* 156 = old getdirentries */ { compat4(AS(freebsd4_statfs_args),statfs), AUE_STATFS, NULL, 0, 0, 0 }, /* 157 = freebsd4 statfs */ - { compat4(AS(freebsd4_fstatfs_args),fstatfs), AUE_FSTATFS, NULL, 0, 0, 0 }, /* 158 = freebsd4 fstatfs */ + { compat4(AS(freebsd4_fstatfs_args),fstatfs), AUE_FSTATFS, NULL, 0, 0, SYF_CAPENABLED }, /* 158 = freebsd4 fstatfs */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 159 = nosys */ { AS(lgetfh_args), (sy_call_t *)lgetfh, AUE_LGETFH, NULL, 0, 0, 0 }, /* 160 = lgetfh */ { AS(getfh_args), (sy_call_t *)getfh, AUE_NFS_GETFH, NULL, 0, 0, 0 }, /* 161 = getfh */ - { compat4(AS(freebsd4_getdomainname_args),getdomainname), AUE_SYSCTL, NULL, 0, 0, 0 }, /* 162 = freebsd4 getdomainname */ + { compat4(AS(freebsd4_getdomainname_args),getdomainname), AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED }, /* 162 = freebsd4 getdomainname */ { compat4(AS(freebsd4_setdomainname_args),setdomainname), AUE_SYSCTL, NULL, 0, 0, 0 }, /* 163 = freebsd4 setdomainname */ { 
compat4(AS(freebsd4_uname_args),uname), AUE_NULL, NULL, 0, 0, 0 }, /* 164 = freebsd4 uname */ - { AS(sysarch_args), (sy_call_t *)sysarch, AUE_SYSARCH, NULL, 0, 0, 0 }, /* 165 = sysarch */ - { AS(rtprio_args), (sy_call_t *)rtprio, AUE_RTPRIO, NULL, 0, 0, 0 }, /* 166 = rtprio */ + { AS(sysarch_args), (sy_call_t *)sysarch, AUE_SYSARCH, NULL, 0, 0, SYF_CAPENABLED }, /* 165 = sysarch */ + { AS(rtprio_args), (sy_call_t *)rtprio, AUE_RTPRIO, NULL, 0, 0, SYF_CAPENABLED }, /* 166 = rtprio */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 167 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 168 = nosys */ { AS(semsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 169 = semsys */ { AS(msgsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 170 = msgsys */ { AS(shmsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 171 = shmsys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 172 = nosys */ - { AS(freebsd6_pread_args), (sy_call_t *)freebsd6_pread, AUE_PREAD, NULL, 0, 0, 0 }, /* 173 = freebsd6_pread */ - { AS(freebsd6_pwrite_args), (sy_call_t *)freebsd6_pwrite, AUE_PWRITE, NULL, 0, 0, 0 }, /* 174 = freebsd6_pwrite */ + { AS(freebsd6_pread_args), (sy_call_t *)freebsd6_pread, AUE_PREAD, NULL, 0, 0, SYF_CAPENABLED }, /* 173 = freebsd6_pread */ + { AS(freebsd6_pwrite_args), (sy_call_t *)freebsd6_pwrite, AUE_PWRITE, NULL, 0, 0, SYF_CAPENABLED }, /* 174 = freebsd6_pwrite */ { AS(setfib_args), (sy_call_t *)setfib, AUE_NULL, NULL, 0, 0, 0 }, /* 175 = setfib */ { AS(ntp_adjtime_args), (sy_call_t *)ntp_adjtime, AUE_NTP_ADJTIME, NULL, 0, 0, 0 }, /* 176 = ntp_adjtime */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 177 = sfork */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 178 = getdescriptor */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 179 = setdescriptor */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 180 = nosys */ - { AS(setgid_args), (sy_call_t *)setgid, AUE_SETGID, 
NULL, 0, 0, 0 }, /* 181 = setgid */ - { AS(setegid_args), (sy_call_t *)setegid, AUE_SETEGID, NULL, 0, 0, 0 }, /* 182 = setegid */ - { AS(seteuid_args), (sy_call_t *)seteuid, AUE_SETEUID, NULL, 0, 0, 0 }, /* 183 = seteuid */ + { AS(setgid_args), (sy_call_t *)setgid, AUE_SETGID, NULL, 0, 0, SYF_CAPENABLED }, /* 181 = setgid */ + { AS(setegid_args), (sy_call_t *)setegid, AUE_SETEGID, NULL, 0, 0, SYF_CAPENABLED }, /* 182 = setegid */ + { AS(seteuid_args), (sy_call_t *)seteuid, AUE_SETEUID, NULL, 0, 0, SYF_CAPENABLED }, /* 183 = seteuid */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 184 = lfs_bmapv */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 185 = lfs_markv */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 186 = lfs_segclean */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 187 = lfs_segwait */ { AS(stat_args), (sy_call_t *)stat, AUE_STAT, NULL, 0, 0, 0 }, /* 188 = stat */ - { AS(fstat_args), (sy_call_t *)fstat, AUE_FSTAT, NULL, 0, 0, 0 }, /* 189 = fstat */ + { AS(fstat_args), (sy_call_t *)fstat, AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED }, /* 189 = fstat */ { AS(lstat_args), (sy_call_t *)lstat, AUE_LSTAT, NULL, 0, 0, 0 }, /* 190 = lstat */ { AS(pathconf_args), (sy_call_t *)pathconf, AUE_PATHCONF, NULL, 0, 0, 0 }, /* 191 = pathconf */ - { AS(fpathconf_args), (sy_call_t *)fpathconf, AUE_FPATHCONF, NULL, 0, 0, 0 }, /* 192 = fpathconf */ + { AS(fpathconf_args), (sy_call_t *)fpathconf, AUE_FPATHCONF, NULL, 0, 0, SYF_CAPENABLED }, /* 192 = fpathconf */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 193 = nosys */ - { AS(__getrlimit_args), (sy_call_t *)getrlimit, AUE_GETRLIMIT, NULL, 0, 0, 0 }, /* 194 = getrlimit */ - { AS(__setrlimit_args), (sy_call_t *)setrlimit, AUE_SETRLIMIT, NULL, 0, 0, 0 }, /* 195 = setrlimit */ - { AS(getdirentries_args), (sy_call_t *)getdirentries, AUE_GETDIRENTRIES, NULL, 0, 0, 0 }, /* 196 = getdirentries */ - { AS(freebsd6_mmap_args), (sy_call_t *)freebsd6_mmap, AUE_MMAP, NULL, 0, 0, 0 }, /* 197 = 
freebsd6_mmap */ + { AS(__getrlimit_args), (sy_call_t *)getrlimit, AUE_GETRLIMIT, NULL, 0, 0, SYF_CAPENABLED }, /* 194 = getrlimit */ + { AS(__setrlimit_args), (sy_call_t *)setrlimit, AUE_SETRLIMIT, NULL, 0, 0, SYF_CAPENABLED }, /* 195 = setrlimit */ + { AS(getdirentries_args), (sy_call_t *)getdirentries, AUE_GETDIRENTRIES, NULL, 0, 0, SYF_CAPENABLED }, /* 196 = getdirentries */ + { AS(freebsd6_mmap_args), (sy_call_t *)freebsd6_mmap, AUE_MMAP, NULL, 0, 0, SYF_CAPENABLED }, /* 197 = freebsd6_mmap */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 198 = __syscall */ - { AS(freebsd6_lseek_args), (sy_call_t *)freebsd6_lseek, AUE_LSEEK, NULL, 0, 0, 0 }, /* 199 = freebsd6_lseek */ + { AS(freebsd6_lseek_args), (sy_call_t *)freebsd6_lseek, AUE_LSEEK, NULL, 0, 0, SYF_CAPENABLED }, /* 199 = freebsd6_lseek */ { AS(freebsd6_truncate_args), (sy_call_t *)freebsd6_truncate, AUE_TRUNCATE, NULL, 0, 0, 0 }, /* 200 = freebsd6_truncate */ - { AS(freebsd6_ftruncate_args), (sy_call_t *)freebsd6_ftruncate, AUE_FTRUNCATE, NULL, 0, 0, 0 }, /* 201 = freebsd6_ftruncate */ - { AS(sysctl_args), (sy_call_t *)__sysctl, AUE_SYSCTL, NULL, 0, 0, 0 }, /* 202 = __sysctl */ - { AS(mlock_args), (sy_call_t *)mlock, AUE_MLOCK, NULL, 0, 0, 0 }, /* 203 = mlock */ - { AS(munlock_args), (sy_call_t *)munlock, AUE_MUNLOCK, NULL, 0, 0, 0 }, /* 204 = munlock */ + { AS(freebsd6_ftruncate_args), (sy_call_t *)freebsd6_ftruncate, AUE_FTRUNCATE, NULL, 0, 0, SYF_CAPENABLED }, /* 201 = freebsd6_ftruncate */ + { AS(sysctl_args), (sy_call_t *)__sysctl, AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED }, /* 202 = __sysctl */ + { AS(mlock_args), (sy_call_t *)mlock, AUE_MLOCK, NULL, 0, 0, SYF_CAPENABLED }, /* 203 = mlock */ + { AS(munlock_args), (sy_call_t *)munlock, AUE_MUNLOCK, NULL, 0, 0, SYF_CAPENABLED }, /* 204 = munlock */ { AS(undelete_args), (sy_call_t *)undelete, AUE_UNDELETE, NULL, 0, 0, 0 }, /* 205 = undelete */ - { AS(futimes_args), (sy_call_t *)futimes, AUE_FUTIMES, NULL, 0, 0, 0 }, /* 206 = futimes */ - { 
AS(getpgid_args), (sy_call_t *)getpgid, AUE_GETPGID, NULL, 0, 0, 0 }, /* 207 = getpgid */ + { AS(futimes_args), (sy_call_t *)futimes, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED }, /* 206 = futimes */ + { AS(getpgid_args), (sy_call_t *)getpgid, AUE_GETPGID, NULL, 0, 0, SYF_CAPENABLED }, /* 207 = getpgid */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 208 = newreboot */ - { AS(poll_args), (sy_call_t *)poll, AUE_POLL, NULL, 0, 0, 0 }, /* 209 = poll */ + { AS(poll_args), (sy_call_t *)poll, AUE_POLL, NULL, 0, 0, SYF_CAPENABLED }, /* 209 = poll */ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0 }, /* 210 = lkmnosys */ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0 }, /* 211 = lkmnosys */ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0 }, /* 212 = lkmnosys */ @@ -266,15 +266,15 @@ { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 229 = freebsd7 shmctl */ { AS(shmdt_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 230 = shmdt */ { AS(shmget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 231 = shmget */ - { AS(clock_gettime_args), (sy_call_t *)clock_gettime, AUE_NULL, NULL, 0, 0, 0 }, /* 232 = clock_gettime */ + { AS(clock_gettime_args), (sy_call_t *)clock_gettime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 232 = clock_gettime */ { AS(clock_settime_args), (sy_call_t *)clock_settime, AUE_CLOCK_SETTIME, NULL, 0, 0, 0 }, /* 233 = clock_settime */ - { AS(clock_getres_args), (sy_call_t *)clock_getres, AUE_NULL, NULL, 0, 0, 0 }, /* 234 = clock_getres */ - { AS(ktimer_create_args), (sy_call_t *)ktimer_create, AUE_NULL, NULL, 0, 0, 0 }, /* 235 = ktimer_create */ - { AS(ktimer_delete_args), (sy_call_t *)ktimer_delete, AUE_NULL, NULL, 0, 0, 0 }, /* 236 = ktimer_delete */ - { AS(ktimer_settime_args), (sy_call_t *)ktimer_settime, AUE_NULL, NULL, 0, 0, 0 }, /* 237 = ktimer_settime */ - { AS(ktimer_gettime_args), (sy_call_t *)ktimer_gettime, AUE_NULL, NULL, 0, 0, 0 }, /* 238 = ktimer_gettime */ 
- { AS(ktimer_getoverrun_args), (sy_call_t *)ktimer_getoverrun, AUE_NULL, NULL, 0, 0, 0 }, /* 239 = ktimer_getoverrun */ - { AS(nanosleep_args), (sy_call_t *)nanosleep, AUE_NULL, NULL, 0, 0, 0 }, /* 240 = nanosleep */ + { AS(clock_getres_args), (sy_call_t *)clock_getres, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 234 = clock_getres */ + { AS(ktimer_create_args), (sy_call_t *)ktimer_create, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 235 = ktimer_create */ + { AS(ktimer_delete_args), (sy_call_t *)ktimer_delete, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 236 = ktimer_delete */ + { AS(ktimer_settime_args), (sy_call_t *)ktimer_settime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 237 = ktimer_settime */ + { AS(ktimer_gettime_args), (sy_call_t *)ktimer_gettime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 238 = ktimer_gettime */ + { AS(ktimer_getoverrun_args), (sy_call_t *)ktimer_getoverrun, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 239 = ktimer_getoverrun */ + { AS(nanosleep_args), (sy_call_t *)nanosleep, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 240 = nanosleep */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 241 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 242 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 243 = nosys */ @@ -282,12 +282,12 @@ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 245 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 246 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 247 = nosys */ - { AS(ntp_gettime_args), (sy_call_t *)ntp_gettime, AUE_NULL, NULL, 0, 0, 0 }, /* 248 = ntp_gettime */ + { AS(ntp_gettime_args), (sy_call_t *)ntp_gettime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 248 = ntp_gettime */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 249 = nosys */ - { AS(minherit_args), (sy_call_t *)minherit, AUE_MINHERIT, NULL, 0, 0, 0 }, /* 250 = minherit */ + { AS(minherit_args), (sy_call_t *)minherit, AUE_MINHERIT, NULL, 0, 0, SYF_CAPENABLED }, /* 
250 = minherit */ { AS(rfork_args), (sy_call_t *)rfork, AUE_RFORK, NULL, 0, 0, 0 }, /* 251 = rfork */ - { AS(openbsd_poll_args), (sy_call_t *)openbsd_poll, AUE_POLL, NULL, 0, 0, 0 }, /* 252 = openbsd_poll */ - { 0, (sy_call_t *)issetugid, AUE_ISSETUGID, NULL, 0, 0, 0 }, /* 253 = issetugid */ + { AS(openbsd_poll_args), (sy_call_t *)openbsd_poll, AUE_POLL, NULL, 0, 0, SYF_CAPENABLED }, /* 252 = openbsd_poll */ + { 0, (sy_call_t *)issetugid, AUE_ISSETUGID, NULL, 0, 0, SYF_CAPENABLED }, /* 253 = issetugid */ { AS(lchown_args), (sy_call_t *)lchown, AUE_LCHOWN, NULL, 0, 0, 0 }, /* 254 = lchown */ { AS(aio_read_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 255 = aio_read */ { AS(aio_write_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 256 = aio_write */ @@ -306,12 +306,12 @@ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 269 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 270 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 271 = nosys */ - { AS(getdents_args), (sy_call_t *)getdents, AUE_O_GETDENTS, NULL, 0, 0, 0 }, /* 272 = getdents */ + { AS(getdents_args), (sy_call_t *)getdents, AUE_O_GETDENTS, NULL, 0, 0, SYF_CAPENABLED }, /* 272 = getdents */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 273 = nosys */ { AS(lchmod_args), (sy_call_t *)lchmod, AUE_LCHMOD, NULL, 0, 0, 0 }, /* 274 = lchmod */ { AS(lchown_args), (sy_call_t *)lchown, AUE_LCHOWN, NULL, 0, 0, 0 }, /* 275 = netbsd_lchown */ { AS(lutimes_args), (sy_call_t *)lutimes, AUE_LUTIMES, NULL, 0, 0, 0 }, /* 276 = lutimes */ - { AS(msync_args), (sy_call_t *)msync, AUE_MSYNC, NULL, 0, 0, 0 }, /* 277 = netbsd_msync */ + { AS(msync_args), (sy_call_t *)msync, AUE_MSYNC, NULL, 0, 0, SYF_CAPENABLED }, /* 277 = netbsd_msync */ { AS(nstat_args), (sy_call_t *)nstat, AUE_STAT, NULL, 0, 0, 0 }, /* 278 = nstat */ { AS(nfstat_args), (sy_call_t *)nfstat, AUE_FSTAT, NULL, 0, 0, 0 }, /* 279 = nfstat */ { AS(nlstat_args), (sy_call_t *)nlstat, AUE_LSTAT, 
NULL, 0, 0, 0 }, /* 280 = nlstat */ @@ -323,8 +323,8 @@ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 286 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 287 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 288 = nosys */ - { AS(preadv_args), (sy_call_t *)preadv, AUE_PREADV, NULL, 0, 0, 0 }, /* 289 = preadv */ - { AS(pwritev_args), (sy_call_t *)pwritev, AUE_PWRITEV, NULL, 0, 0, 0 }, /* 290 = pwritev */ + { AS(preadv_args), (sy_call_t *)preadv, AUE_PREADV, NULL, 0, 0, SYF_CAPENABLED }, /* 289 = preadv */ + { AS(pwritev_args), (sy_call_t *)pwritev, AUE_PWRITEV, NULL, 0, 0, SYF_CAPENABLED }, /* 290 = pwritev */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 291 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 292 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 293 = nosys */ @@ -344,9 +344,9 @@ { AS(kldnext_args), (sy_call_t *)kldnext, AUE_NULL, NULL, 0, 0, 0 }, /* 307 = kldnext */ { AS(kldstat_args), (sy_call_t *)kldstat, AUE_NULL, NULL, 0, 0, 0 }, /* 308 = kldstat */ { AS(kldfirstmod_args), (sy_call_t *)kldfirstmod, AUE_NULL, NULL, 0, 0, 0 }, /* 309 = kldfirstmod */ - { AS(getsid_args), (sy_call_t *)getsid, AUE_GETSID, NULL, 0, 0, 0 }, /* 310 = getsid */ - { AS(setresuid_args), (sy_call_t *)setresuid, AUE_SETRESUID, NULL, 0, 0, 0 }, /* 311 = setresuid */ - { AS(setresgid_args), (sy_call_t *)setresgid, AUE_SETRESGID, NULL, 0, 0, 0 }, /* 312 = setresgid */ + { AS(getsid_args), (sy_call_t *)getsid, AUE_GETSID, NULL, 0, 0, SYF_CAPENABLED }, /* 310 = getsid */ + { AS(setresuid_args), (sy_call_t *)setresuid, AUE_SETRESUID, NULL, 0, 0, SYF_CAPENABLED }, /* 311 = setresuid */ + { AS(setresgid_args), (sy_call_t *)setresgid, AUE_SETRESGID, NULL, 0, 0, SYF_CAPENABLED }, /* 312 = setresgid */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 313 = obsolete signanosleep */ { AS(aio_return_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 314 = aio_return */ { 
AS(aio_suspend_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 315 = aio_suspend */ @@ -355,49 +355,49 @@ { AS(oaio_read_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 318 = oaio_read */ { AS(oaio_write_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 319 = oaio_write */ { AS(olio_listio_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 320 = olio_listio */ - { 0, (sy_call_t *)yield, AUE_NULL, NULL, 0, 0, 0 }, /* 321 = yield */ + { 0, (sy_call_t *)yield, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 321 = yield */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 322 = obsolete thr_sleep */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 323 = obsolete thr_wakeup */ - { AS(mlockall_args), (sy_call_t *)mlockall, AUE_MLOCKALL, NULL, 0, 0, 0 }, /* 324 = mlockall */ - { 0, (sy_call_t *)munlockall, AUE_MUNLOCKALL, NULL, 0, 0, 0 }, /* 325 = munlockall */ + { AS(mlockall_args), (sy_call_t *)mlockall, AUE_MLOCKALL, NULL, 0, 0, SYF_CAPENABLED }, /* 324 = mlockall */ + { 0, (sy_call_t *)munlockall, AUE_MUNLOCKALL, NULL, 0, 0, SYF_CAPENABLED }, /* 325 = munlockall */ { AS(__getcwd_args), (sy_call_t *)__getcwd, AUE_GETCWD, NULL, 0, 0, 0 }, /* 326 = __getcwd */ - { AS(sched_setparam_args), (sy_call_t *)sched_setparam, AUE_NULL, NULL, 0, 0, 0 }, /* 327 = sched_setparam */ - { AS(sched_getparam_args), (sy_call_t *)sched_getparam, AUE_NULL, NULL, 0, 0, 0 }, /* 328 = sched_getparam */ - { AS(sched_setscheduler_args), (sy_call_t *)sched_setscheduler, AUE_NULL, NULL, 0, 0, 0 }, /* 329 = sched_setscheduler */ - { AS(sched_getscheduler_args), (sy_call_t *)sched_getscheduler, AUE_NULL, NULL, 0, 0, 0 }, /* 330 = sched_getscheduler */ - { 0, (sy_call_t *)sched_yield, AUE_NULL, NULL, 0, 0, 0 }, /* 331 = sched_yield */ - { AS(sched_get_priority_max_args), (sy_call_t *)sched_get_priority_max, AUE_NULL, NULL, 0, 0, 0 }, /* 332 = sched_get_priority_max */ - { AS(sched_get_priority_min_args), (sy_call_t *)sched_get_priority_min, 
AUE_NULL, NULL, 0, 0, 0 }, /* 333 = sched_get_priority_min */ + { AS(sched_setparam_args), (sy_call_t *)sched_setparam, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 327 = sched_setparam */ + { AS(sched_getparam_args), (sy_call_t *)sched_getparam, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 328 = sched_getparam */ + { AS(sched_setscheduler_args), (sy_call_t *)sched_setscheduler, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 329 = sched_setscheduler */ + { AS(sched_getscheduler_args), (sy_call_t *)sched_getscheduler, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 330 = sched_getscheduler */ + { 0, (sy_call_t *)sched_yield, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 331 = sched_yield */ + { AS(sched_get_priority_max_args), (sy_call_t *)sched_get_priority_max, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 332 = sched_get_priority_max */ + { AS(sched_get_priority_min_args), (sy_call_t *)sched_get_priority_min, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 333 = sched_get_priority_min */ { AS(sched_rr_get_interval_args), (sy_call_t *)sched_rr_get_interval, AUE_NULL, NULL, 0, 0, 0 }, /* 334 = sched_rr_get_interval */ - { AS(utrace_args), (sy_call_t *)utrace, AUE_NULL, NULL, 0, 0, 0 }, /* 335 = utrace */ - { compat4(AS(freebsd4_sendfile_args),sendfile), AUE_SENDFILE, NULL, 0, 0, 0 }, /* 336 = freebsd4 sendfile */ + { AS(utrace_args), (sy_call_t *)utrace, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 335 = utrace */ + { compat4(AS(freebsd4_sendfile_args),sendfile), AUE_SENDFILE, NULL, 0, 0, SYF_CAPENABLED }, /* 336 = freebsd4 sendfile */ { AS(kldsym_args), (sy_call_t *)kldsym, AUE_NULL, NULL, 0, 0, 0 }, /* 337 = kldsym */ { AS(jail_args), (sy_call_t *)jail, AUE_JAIL, NULL, 0, 0, 0 }, /* 338 = jail */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 339 = pioctl */ - { AS(sigprocmask_args), (sy_call_t *)sigprocmask, AUE_SIGPROCMASK, NULL, 0, 0, 0 }, /* 340 = sigprocmask */ - { AS(sigsuspend_args), (sy_call_t *)sigsuspend, AUE_SIGSUSPEND, NULL, 0, 0, 0 }, /* 341 = sigsuspend */ - { 
compat4(AS(freebsd4_sigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0, 0 }, /* 342 = freebsd4 sigaction */ - { AS(sigpending_args), (sy_call_t *)sigpending, AUE_SIGPENDING, NULL, 0, 0, 0 }, /* 343 = sigpending */ - { compat4(AS(freebsd4_sigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0, 0 }, /* 344 = freebsd4 sigreturn */ - { AS(sigtimedwait_args), (sy_call_t *)sigtimedwait, AUE_SIGWAIT, NULL, 0, 0, 0 }, /* 345 = sigtimedwait */ - { AS(sigwaitinfo_args), (sy_call_t *)sigwaitinfo, AUE_NULL, NULL, 0, 0, 0 }, /* 346 = sigwaitinfo */ + { AS(sigprocmask_args), (sy_call_t *)sigprocmask, AUE_SIGPROCMASK, NULL, 0, 0, SYF_CAPENABLED }, /* 340 = sigprocmask */ + { AS(sigsuspend_args), (sy_call_t *)sigsuspend, AUE_SIGSUSPEND, NULL, 0, 0, SYF_CAPENABLED }, /* 341 = sigsuspend */ + { compat4(AS(freebsd4_sigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0, SYF_CAPENABLED }, /* 342 = freebsd4 sigaction */ + { AS(sigpending_args), (sy_call_t *)sigpending, AUE_SIGPENDING, NULL, 0, 0, SYF_CAPENABLED }, /* 343 = sigpending */ + { compat4(AS(freebsd4_sigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0, SYF_CAPENABLED }, /* 344 = freebsd4 sigreturn */ + { AS(sigtimedwait_args), (sy_call_t *)sigtimedwait, AUE_SIGWAIT, NULL, 0, 0, SYF_CAPENABLED }, /* 345 = sigtimedwait */ + { AS(sigwaitinfo_args), (sy_call_t *)sigwaitinfo, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 346 = sigwaitinfo */ { AS(__acl_get_file_args), (sy_call_t *)__acl_get_file, AUE_NULL, NULL, 0, 0, 0 }, /* 347 = __acl_get_file */ { AS(__acl_set_file_args), (sy_call_t *)__acl_set_file, AUE_NULL, NULL, 0, 0, 0 }, /* 348 = __acl_set_file */ - { AS(__acl_get_fd_args), (sy_call_t *)__acl_get_fd, AUE_NULL, NULL, 0, 0, 0 }, /* 349 = __acl_get_fd */ - { AS(__acl_set_fd_args), (sy_call_t *)__acl_set_fd, AUE_NULL, NULL, 0, 0, 0 }, /* 350 = __acl_set_fd */ + { AS(__acl_get_fd_args), (sy_call_t *)__acl_get_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 349 = __acl_get_fd */ + { AS(__acl_set_fd_args), (sy_call_t 
*)__acl_set_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 350 = __acl_set_fd */ { AS(__acl_delete_file_args), (sy_call_t *)__acl_delete_file, AUE_NULL, NULL, 0, 0, 0 }, /* 351 = __acl_delete_file */ - { AS(__acl_delete_fd_args), (sy_call_t *)__acl_delete_fd, AUE_NULL, NULL, 0, 0, 0 }, /* 352 = __acl_delete_fd */ + { AS(__acl_delete_fd_args), (sy_call_t *)__acl_delete_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 352 = __acl_delete_fd */ { AS(__acl_aclcheck_file_args), (sy_call_t *)__acl_aclcheck_file, AUE_NULL, NULL, 0, 0, 0 }, /* 353 = __acl_aclcheck_file */ - { AS(__acl_aclcheck_fd_args), (sy_call_t *)__acl_aclcheck_fd, AUE_NULL, NULL, 0, 0, 0 }, /* 354 = __acl_aclcheck_fd */ + { AS(__acl_aclcheck_fd_args), (sy_call_t *)__acl_aclcheck_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 354 = __acl_aclcheck_fd */ { AS(extattrctl_args), (sy_call_t *)extattrctl, AUE_EXTATTRCTL, NULL, 0, 0, 0 }, /* 355 = extattrctl */ { AS(extattr_set_file_args), (sy_call_t *)extattr_set_file, AUE_EXTATTR_SET_FILE, NULL, 0, 0, 0 }, /* 356 = extattr_set_file */ { AS(extattr_get_file_args), (sy_call_t *)extattr_get_file, AUE_EXTATTR_GET_FILE, NULL, 0, 0, 0 }, /* 357 = extattr_get_file */ { AS(extattr_delete_file_args), (sy_call_t *)extattr_delete_file, AUE_EXTATTR_DELETE_FILE, NULL, 0, 0, 0 }, /* 358 = extattr_delete_file */ { AS(aio_waitcomplete_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 359 = aio_waitcomplete */ - { AS(getresuid_args), (sy_call_t *)getresuid, AUE_GETRESUID, NULL, 0, 0, 0 }, /* 360 = getresuid */ - { AS(getresgid_args), (sy_call_t *)getresgid, AUE_GETRESGID, NULL, 0, 0, 0 }, /* 361 = getresgid */ - { 0, (sy_call_t *)kqueue, AUE_KQUEUE, NULL, 0, 0, 0 }, /* 362 = kqueue */ - { AS(kevent_args), (sy_call_t *)kevent, AUE_NULL, NULL, 0, 0, 0 }, /* 363 = kevent */ + { AS(getresuid_args), (sy_call_t *)getresuid, AUE_GETRESUID, NULL, 0, 0, SYF_CAPENABLED }, /* 360 = getresuid */ + { AS(getresgid_args), (sy_call_t *)getresgid, AUE_GETRESGID, NULL, 0, 0, 
SYF_CAPENABLED }, /* 361 = getresgid */ + { 0, (sy_call_t *)kqueue, AUE_KQUEUE, NULL, 0, 0, SYF_CAPENABLED }, /* 362 = kqueue */ + { AS(kevent_args), (sy_call_t *)kevent, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 363 = kevent */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 364 = __cap_get_proc */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 365 = __cap_set_proc */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 366 = __cap_get_fd */ @@ -405,9 +405,9 @@ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 368 = __cap_set_fd */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 369 = __cap_set_file */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 370 = nosys */ - { AS(extattr_set_fd_args), (sy_call_t *)extattr_set_fd, AUE_EXTATTR_SET_FD, NULL, 0, 0, 0 }, /* 371 = extattr_set_fd */ - { AS(extattr_get_fd_args), (sy_call_t *)extattr_get_fd, AUE_EXTATTR_GET_FD, NULL, 0, 0, 0 }, /* 372 = extattr_get_fd */ - { AS(extattr_delete_fd_args), (sy_call_t *)extattr_delete_fd, AUE_EXTATTR_DELETE_FD, NULL, 0, 0, 0 }, /* 373 = extattr_delete_fd */ + { AS(extattr_set_fd_args), (sy_call_t *)extattr_set_fd, AUE_EXTATTR_SET_FD, NULL, 0, 0, SYF_CAPENABLED }, /* 371 = extattr_set_fd */ + { AS(extattr_get_fd_args), (sy_call_t *)extattr_get_fd, AUE_EXTATTR_GET_FD, NULL, 0, 0, SYF_CAPENABLED }, /* 372 = extattr_get_fd */ + { AS(extattr_delete_fd_args), (sy_call_t *)extattr_delete_fd, AUE_EXTATTR_DELETE_FD, NULL, 0, 0, SYF_CAPENABLED }, /* 373 = extattr_delete_fd */ { AS(__setugid_args), (sy_call_t *)__setugid, AUE_NULL, NULL, 0, 0, 0 }, /* 374 = __setugid */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 375 = nfsclnt */ { AS(eaccess_args), (sy_call_t *)eaccess, AUE_EACCESS, NULL, 0, 0, 0 }, /* 376 = eaccess */ @@ -418,20 +418,20 @@ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 381 = kse_create */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 382 = kse_thr_interrupt */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 
383 = kse_release */ - { AS(__mac_get_proc_args), (sy_call_t *)__mac_get_proc, AUE_NULL, NULL, 0, 0, 0 }, /* 384 = __mac_get_proc */ - { AS(__mac_set_proc_args), (sy_call_t *)__mac_set_proc, AUE_NULL, NULL, 0, 0, 0 }, /* 385 = __mac_set_proc */ - { AS(__mac_get_fd_args), (sy_call_t *)__mac_get_fd, AUE_NULL, NULL, 0, 0, 0 }, /* 386 = __mac_get_fd */ + { AS(__mac_get_proc_args), (sy_call_t *)__mac_get_proc, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 384 = __mac_get_proc */ + { AS(__mac_set_proc_args), (sy_call_t *)__mac_set_proc, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 385 = __mac_set_proc */ + { AS(__mac_get_fd_args), (sy_call_t *)__mac_get_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 386 = __mac_get_fd */ { AS(__mac_get_file_args), (sy_call_t *)__mac_get_file, AUE_NULL, NULL, 0, 0, 0 }, /* 387 = __mac_get_file */ - { AS(__mac_set_fd_args), (sy_call_t *)__mac_set_fd, AUE_NULL, NULL, 0, 0, 0 }, /* 388 = __mac_set_fd */ + { AS(__mac_set_fd_args), (sy_call_t *)__mac_set_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 388 = __mac_set_fd */ { AS(__mac_set_file_args), (sy_call_t *)__mac_set_file, AUE_NULL, NULL, 0, 0, 0 }, /* 389 = __mac_set_file */ { AS(kenv_args), (sy_call_t *)kenv, AUE_NULL, NULL, 0, 0, 0 }, /* 390 = kenv */ { AS(lchflags_args), (sy_call_t *)lchflags, AUE_LCHFLAGS, NULL, 0, 0, 0 }, /* 391 = lchflags */ - { AS(uuidgen_args), (sy_call_t *)uuidgen, AUE_NULL, NULL, 0, 0, 0 }, /* 392 = uuidgen */ - { AS(sendfile_args), (sy_call_t *)sendfile, AUE_SENDFILE, NULL, 0, 0, 0 }, /* 393 = sendfile */ + { AS(uuidgen_args), (sy_call_t *)uuidgen, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 392 = uuidgen */ + { AS(sendfile_args), (sy_call_t *)sendfile, AUE_SENDFILE, NULL, 0, 0, SYF_CAPENABLED }, /* 393 = sendfile */ { AS(mac_syscall_args), (sy_call_t *)mac_syscall, AUE_NULL, NULL, 0, 0, 0 }, /* 394 = mac_syscall */ { AS(getfsstat_args), (sy_call_t *)getfsstat, AUE_GETFSSTAT, NULL, 0, 0, 0 }, /* 395 = getfsstat */ { AS(statfs_args), (sy_call_t *)statfs, AUE_STATFS, 
NULL, 0, 0, 0 }, /* 396 = statfs */ - { AS(fstatfs_args), (sy_call_t *)fstatfs, AUE_FSTATFS, NULL, 0, 0, 0 }, /* 397 = fstatfs */ + { AS(fstatfs_args), (sy_call_t *)fstatfs, AUE_FSTATFS, NULL, 0, 0, SYF_CAPENABLED }, /* 397 = fstatfs */ { AS(fhstatfs_args), (sy_call_t *)fhstatfs, AUE_FHSTATFS, NULL, 0, 0, 0 }, /* 398 = fhstatfs */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 399 = nosys */ { AS(ksem_close_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 400 = ksem_close */ @@ -450,13 +450,13 @@ { AS(extattr_get_link_args), (sy_call_t *)extattr_get_link, AUE_EXTATTR_GET_LINK, NULL, 0, 0, 0 }, /* 413 = extattr_get_link */ { AS(extattr_delete_link_args), (sy_call_t *)extattr_delete_link, AUE_EXTATTR_DELETE_LINK, NULL, 0, 0, 0 }, /* 414 = extattr_delete_link */ { AS(__mac_execve_args), (sy_call_t *)__mac_execve, AUE_NULL, NULL, 0, 0, 0 }, /* 415 = __mac_execve */ - { AS(sigaction_args), (sy_call_t *)sigaction, AUE_SIGACTION, NULL, 0, 0, 0 }, /* 416 = sigaction */ - { AS(sigreturn_args), (sy_call_t *)sigreturn, AUE_SIGRETURN, NULL, 0, 0, 0 }, /* 417 = sigreturn */ + { AS(sigaction_args), (sy_call_t *)sigaction, AUE_SIGACTION, NULL, 0, 0, SYF_CAPENABLED }, /* 416 = sigaction */ + { AS(sigreturn_args), (sy_call_t *)sigreturn, AUE_SIGRETURN, NULL, 0, 0, SYF_CAPENABLED }, /* 417 = sigreturn */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 418 = __xstat */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 419 = __xfstat */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 420 = __xlstat */ - { AS(getcontext_args), (sy_call_t *)getcontext, AUE_NULL, NULL, 0, 0, 0 }, /* 421 = getcontext */ - { AS(setcontext_args), (sy_call_t *)setcontext, AUE_NULL, NULL, 0, 0, 0 }, /* 422 = setcontext */ + { AS(getcontext_args), (sy_call_t *)getcontext, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 421 = getcontext */ + { AS(setcontext_args), (sy_call_t *)setcontext, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 422 = setcontext */ { AS(swapcontext_args), 
(sy_call_t *)swapcontext, AUE_NULL, NULL, 0, 0, 0 }, /* 423 = swapcontext */ { AS(swapoff_args), (sy_call_t *)swapoff, AUE_SWAPOFF, NULL, 0, 0, 0 }, /* 424 = swapoff */ { AS(__acl_get_link_args), (sy_call_t *)__acl_get_link, AUE_NULL, NULL, 0, 0, 0 }, /* 425 = __acl_get_link */ @@ -464,78 +464,78 @@ { AS(__acl_delete_link_args), (sy_call_t *)__acl_delete_link, AUE_NULL, NULL, 0, 0, 0 }, /* 427 = __acl_delete_link */ { AS(__acl_aclcheck_link_args), (sy_call_t *)__acl_aclcheck_link, AUE_NULL, NULL, 0, 0, 0 }, /* 428 = __acl_aclcheck_link */ { AS(sigwait_args), (sy_call_t *)sigwait, AUE_SIGWAIT, NULL, 0, 0, 0 }, /* 429 = sigwait */ - { AS(thr_create_args), (sy_call_t *)thr_create, AUE_NULL, NULL, 0, 0, 0 }, /* 430 = thr_create */ - { AS(thr_exit_args), (sy_call_t *)thr_exit, AUE_NULL, NULL, 0, 0, 0 }, /* 431 = thr_exit */ - { AS(thr_self_args), (sy_call_t *)thr_self, AUE_NULL, NULL, 0, 0, 0 }, /* 432 = thr_self */ - { AS(thr_kill_args), (sy_call_t *)thr_kill, AUE_NULL, NULL, 0, 0, 0 }, /* 433 = thr_kill */ - { AS(_umtx_lock_args), (sy_call_t *)_umtx_lock, AUE_NULL, NULL, 0, 0, 0 }, /* 434 = _umtx_lock */ - { AS(_umtx_unlock_args), (sy_call_t *)_umtx_unlock, AUE_NULL, NULL, 0, 0, 0 }, /* 435 = _umtx_unlock */ + { AS(thr_create_args), (sy_call_t *)thr_create, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 430 = thr_create */ + { AS(thr_exit_args), (sy_call_t *)thr_exit, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 431 = thr_exit */ + { AS(thr_self_args), (sy_call_t *)thr_self, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 432 = thr_self */ + { AS(thr_kill_args), (sy_call_t *)thr_kill, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 433 = thr_kill */ + { AS(_umtx_lock_args), (sy_call_t *)_umtx_lock, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 434 = _umtx_lock */ + { AS(_umtx_unlock_args), (sy_call_t *)_umtx_unlock, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 435 = _umtx_unlock */ { AS(jail_attach_args), (sy_call_t *)jail_attach, AUE_NULL, NULL, 0, 0, 0 }, /* 436 = jail_attach */ - { 
AS(extattr_list_fd_args), (sy_call_t *)extattr_list_fd, AUE_EXTATTR_LIST_FD, NULL, 0, 0, 0 }, /* 437 = extattr_list_fd */ + { AS(extattr_list_fd_args), (sy_call_t *)extattr_list_fd, AUE_EXTATTR_LIST_FD, NULL, 0, 0, SYF_CAPENABLED }, /* 437 = extattr_list_fd */ { AS(extattr_list_file_args), (sy_call_t *)extattr_list_file, AUE_EXTATTR_LIST_FILE, NULL, 0, 0, 0 }, /* 438 = extattr_list_file */ { AS(extattr_list_link_args), (sy_call_t *)extattr_list_link, AUE_EXTATTR_LIST_LINK, NULL, 0, 0, 0 }, /* 439 = extattr_list_link */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 440 = kse_switchin */ { AS(ksem_timedwait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 441 = ksem_timedwait */ - { AS(thr_suspend_args), (sy_call_t *)thr_suspend, AUE_NULL, NULL, 0, 0, 0 }, /* 442 = thr_suspend */ - { AS(thr_wake_args), (sy_call_t *)thr_wake, AUE_NULL, NULL, 0, 0, 0 }, /* 443 = thr_wake */ + { AS(thr_suspend_args), (sy_call_t *)thr_suspend, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 442 = thr_suspend */ + { AS(thr_wake_args), (sy_call_t *)thr_wake, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 443 = thr_wake */ { AS(kldunloadf_args), (sy_call_t *)kldunloadf, AUE_MODUNLOAD, NULL, 0, 0, 0 }, /* 444 = kldunloadf */ { AS(audit_args), (sy_call_t *)audit, AUE_AUDIT, NULL, 0, 0, 0 }, /* 445 = audit */ { AS(auditon_args), (sy_call_t *)auditon, AUE_AUDITON, NULL, 0, 0, 0 }, /* 446 = auditon */ - { AS(getauid_args), (sy_call_t *)getauid, AUE_GETAUID, NULL, 0, 0, 0 }, /* 447 = getauid */ - { AS(setauid_args), (sy_call_t *)setauid, AUE_SETAUID, NULL, 0, 0, 0 }, /* 448 = setauid */ - { AS(getaudit_args), (sy_call_t *)getaudit, AUE_GETAUDIT, NULL, 0, 0, 0 }, /* 449 = getaudit */ - { AS(setaudit_args), (sy_call_t *)setaudit, AUE_SETAUDIT, NULL, 0, 0, 0 }, /* 450 = setaudit */ - { AS(getaudit_addr_args), (sy_call_t *)getaudit_addr, AUE_GETAUDIT_ADDR, NULL, 0, 0, 0 }, /* 451 = getaudit_addr */ - { AS(setaudit_addr_args), (sy_call_t *)setaudit_addr, AUE_SETAUDIT_ADDR, NULL, 0, 0, 0 }, 
/* 452 = setaudit_addr */ + { AS(getauid_args), (sy_call_t *)getauid, AUE_GETAUID, NULL, 0, 0, SYF_CAPENABLED }, /* 447 = getauid */ + { AS(setauid_args), (sy_call_t *)setauid, AUE_SETAUID, NULL, 0, 0, SYF_CAPENABLED }, /* 448 = setauid */ + { AS(getaudit_args), (sy_call_t *)getaudit, AUE_GETAUDIT, NULL, 0, 0, SYF_CAPENABLED }, /* 449 = getaudit */ + { AS(setaudit_args), (sy_call_t *)setaudit, AUE_SETAUDIT, NULL, 0, 0, SYF_CAPENABLED }, /* 450 = setaudit */ + { AS(getaudit_addr_args), (sy_call_t *)getaudit_addr, AUE_GETAUDIT_ADDR, NULL, 0, 0, SYF_CAPENABLED }, /* 451 = getaudit_addr */ + { AS(setaudit_addr_args), (sy_call_t *)setaudit_addr, AUE_SETAUDIT_ADDR, NULL, 0, 0, SYF_CAPENABLED }, /* 452 = setaudit_addr */ { AS(auditctl_args), (sy_call_t *)auditctl, AUE_AUDITCTL, NULL, 0, 0, 0 }, /* 453 = auditctl */ - { AS(_umtx_op_args), (sy_call_t *)_umtx_op, AUE_NULL, NULL, 0, 0, 0 }, /* 454 = _umtx_op */ - { AS(thr_new_args), (sy_call_t *)thr_new, AUE_NULL, NULL, 0, 0, 0 }, /* 455 = thr_new */ - { AS(sigqueue_args), (sy_call_t *)sigqueue, AUE_NULL, NULL, 0, 0, 0 }, /* 456 = sigqueue */ + { AS(_umtx_op_args), (sy_call_t *)_umtx_op, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 454 = _umtx_op */ + { AS(thr_new_args), (sy_call_t *)thr_new, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 455 = thr_new */ + { AS(sigqueue_args), (sy_call_t *)sigqueue, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 456 = sigqueue */ { AS(kmq_open_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 457 = kmq_open */ { AS(kmq_setattr_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 458 = kmq_setattr */ { AS(kmq_timedreceive_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 459 = kmq_timedreceive */ { AS(kmq_timedsend_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 460 = kmq_timedsend */ { AS(kmq_notify_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 461 = kmq_notify */ { AS(kmq_unlink_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 462 
= kmq_unlink */ - { AS(abort2_args), (sy_call_t *)abort2, AUE_NULL, NULL, 0, 0, 0 }, /* 463 = abort2 */ - { AS(thr_set_name_args), (sy_call_t *)thr_set_name, AUE_NULL, NULL, 0, 0, 0 }, /* 464 = thr_set_name */ + { AS(abort2_args), (sy_call_t *)abort2, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 463 = abort2 */ + { AS(thr_set_name_args), (sy_call_t *)thr_set_name, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 464 = thr_set_name */ { AS(aio_fsync_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 465 = aio_fsync */ - { AS(rtprio_thread_args), (sy_call_t *)rtprio_thread, AUE_RTPRIO, NULL, 0, 0, 0 }, /* 466 = rtprio_thread */ + { AS(rtprio_thread_args), (sy_call_t *)rtprio_thread, AUE_RTPRIO, NULL, 0, 0, SYF_CAPENABLED }, /* 466 = rtprio_thread */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 467 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 468 = nosys */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 469 = __getpath_fromfd */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 470 = __getpath_fromaddr */ - { AS(sctp_peeloff_args), (sy_call_t *)sctp_peeloff, AUE_NULL, NULL, 0, 0, 0 }, /* 471 = sctp_peeloff */ - { AS(sctp_generic_sendmsg_args), (sy_call_t *)sctp_generic_sendmsg, AUE_NULL, NULL, 0, 0, 0 }, /* 472 = sctp_generic_sendmsg */ - { AS(sctp_generic_sendmsg_iov_args), (sy_call_t *)sctp_generic_sendmsg_iov, AUE_NULL, NULL, 0, 0, 0 }, /* 473 = sctp_generic_sendmsg_iov */ - { AS(sctp_generic_recvmsg_args), (sy_call_t *)sctp_generic_recvmsg, AUE_NULL, NULL, 0, 0, 0 }, /* 474 = sctp_generic_recvmsg */ - { AS(pread_args), (sy_call_t *)pread, AUE_PREAD, NULL, 0, 0, 0 }, /* 475 = pread */ - { AS(pwrite_args), (sy_call_t *)pwrite, AUE_PWRITE, NULL, 0, 0, 0 }, /* 476 = pwrite */ - { AS(mmap_args), (sy_call_t *)mmap, AUE_MMAP, NULL, 0, 0, 0 }, /* 477 = mmap */ - { AS(lseek_args), (sy_call_t *)lseek, AUE_LSEEK, NULL, 0, 0, 0 }, /* 478 = lseek */ + { AS(sctp_peeloff_args), (sy_call_t *)sctp_peeloff, AUE_NULL, NULL, 0, 0, 
SYF_CAPENABLED }, /* 471 = sctp_peeloff */ + { AS(sctp_generic_sendmsg_args), (sy_call_t *)sctp_generic_sendmsg, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 472 = sctp_generic_sendmsg */ + { AS(sctp_generic_sendmsg_iov_args), (sy_call_t *)sctp_generic_sendmsg_iov, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 473 = sctp_generic_sendmsg_iov */ + { AS(sctp_generic_recvmsg_args), (sy_call_t *)sctp_generic_recvmsg, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 474 = sctp_generic_recvmsg */ + { AS(pread_args), (sy_call_t *)pread, AUE_PREAD, NULL, 0, 0, SYF_CAPENABLED }, /* 475 = pread */ + { AS(pwrite_args), (sy_call_t *)pwrite, AUE_PWRITE, NULL, 0, 0, SYF_CAPENABLED }, /* 476 = pwrite */ + { AS(mmap_args), (sy_call_t *)mmap, AUE_MMAP, NULL, 0, 0, SYF_CAPENABLED }, /* 477 = mmap */ + { AS(lseek_args), (sy_call_t *)lseek, AUE_LSEEK, NULL, 0, 0, SYF_CAPENABLED }, /* 478 = lseek */ { AS(truncate_args), (sy_call_t *)truncate, AUE_TRUNCATE, NULL, 0, 0, 0 }, /* 479 = truncate */ - { AS(ftruncate_args), (sy_call_t *)ftruncate, AUE_FTRUNCATE, NULL, 0, 0, 0 }, /* 480 = ftruncate */ + { AS(ftruncate_args), (sy_call_t *)ftruncate, AUE_FTRUNCATE, NULL, 0, 0, SYF_CAPENABLED }, /* 480 = ftruncate */ { AS(thr_kill2_args), (sy_call_t *)thr_kill2, AUE_KILL, NULL, 0, 0, 0 }, /* 481 = thr_kill2 */ - { AS(shm_open_args), (sy_call_t *)shm_open, AUE_SHMOPEN, NULL, 0, 0, 0 }, /* 482 = shm_open */ + { AS(shm_open_args), (sy_call_t *)shm_open, AUE_SHMOPEN, NULL, 0, 0, SYF_CAPENABLED }, /* 482 = shm_open */ { AS(shm_unlink_args), (sy_call_t *)shm_unlink, AUE_SHMUNLINK, NULL, 0, 0, 0 }, /* 483 = shm_unlink */ { AS(cpuset_args), (sy_call_t *)cpuset, AUE_NULL, NULL, 0, 0, 0 }, /* 484 = cpuset */ { AS(cpuset_setid_args), (sy_call_t *)cpuset_setid, AUE_NULL, NULL, 0, 0, 0 }, /* 485 = cpuset_setid */ { AS(cpuset_getid_args), (sy_call_t *)cpuset_getid, AUE_NULL, NULL, 0, 0, 0 }, /* 486 = cpuset_getid */ { AS(cpuset_getaffinity_args), (sy_call_t *)cpuset_getaffinity, AUE_NULL, NULL, 0, 0, 0 }, /* 487 = 
cpuset_getaffinity */ { AS(cpuset_setaffinity_args), (sy_call_t *)cpuset_setaffinity, AUE_NULL, NULL, 0, 0, 0 }, /* 488 = cpuset_setaffinity */ - { AS(faccessat_args), (sy_call_t *)faccessat, AUE_FACCESSAT, NULL, 0, 0, 0 }, /* 489 = faccessat */ - { AS(fchmodat_args), (sy_call_t *)fchmodat, AUE_FCHMODAT, NULL, 0, 0, 0 }, /* 490 = fchmodat */ + { AS(faccessat_args), (sy_call_t *)faccessat, AUE_FACCESSAT, NULL, 0, 0, SYF_CAPENABLED }, /* 489 = faccessat */ + { AS(fchmodat_args), (sy_call_t *)fchmodat, AUE_FCHMODAT, NULL, 0, 0, SYF_CAPENABLED }, /* 490 = fchmodat */ { AS(fchownat_args), (sy_call_t *)fchownat, AUE_FCHOWNAT, NULL, 0, 0, 0 }, /* 491 = fchownat */ - { AS(fexecve_args), (sy_call_t *)fexecve, AUE_FEXECVE, NULL, 0, 0, 0 }, /* 492 = fexecve */ - { AS(fstatat_args), (sy_call_t *)fstatat, AUE_FSTATAT, NULL, 0, 0, 0 }, /* 493 = fstatat */ - { AS(futimesat_args), (sy_call_t *)futimesat, AUE_FUTIMESAT, NULL, 0, 0, 0 }, /* 494 = futimesat */ + { AS(fexecve_args), (sy_call_t *)fexecve, AUE_FEXECVE, NULL, 0, 0, SYF_CAPENABLED }, /* 492 = fexecve */ + { AS(fstatat_args), (sy_call_t *)fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED }, /* 493 = fstatat */ + { AS(futimesat_args), (sy_call_t *)futimesat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED }, /* 494 = futimesat */ { AS(linkat_args), (sy_call_t *)linkat, AUE_LINKAT, NULL, 0, 0, 0 }, /* 495 = linkat */ - { AS(mkdirat_args), (sy_call_t *)mkdirat, AUE_MKDIRAT, NULL, 0, 0, 0 }, /* 496 = mkdirat */ - { AS(mkfifoat_args), (sy_call_t *)mkfifoat, AUE_MKFIFOAT, NULL, 0, 0, 0 }, /* 497 = mkfifoat */ - { AS(mknodat_args), (sy_call_t *)mknodat, AUE_MKNODAT, NULL, 0, 0, 0 }, /* 498 = mknodat */ - { AS(openat_args), (sy_call_t *)openat, AUE_OPENAT_RWTC, NULL, 0, 0, 0 }, /* 499 = openat */ + { AS(mkdirat_args), (sy_call_t *)mkdirat, AUE_MKDIRAT, NULL, 0, 0, SYF_CAPENABLED }, /* 496 = mkdirat */ + { AS(mkfifoat_args), (sy_call_t *)mkfifoat, AUE_MKFIFOAT, NULL, 0, 0, SYF_CAPENABLED }, /* 497 = mkfifoat */ + { AS(mknodat_args), 
(sy_call_t *)mknodat, AUE_MKNODAT, NULL, 0, 0, SYF_CAPENABLED }, /* 498 = mknodat */ + { AS(openat_args), (sy_call_t *)openat, AUE_OPENAT_RWTC, NULL, 0, 0, SYF_CAPENABLED }, /* 499 = openat */ { AS(readlinkat_args), (sy_call_t *)readlinkat, AUE_READLINKAT, NULL, 0, 0, 0 }, /* 500 = readlinkat */ - { AS(renameat_args), (sy_call_t *)renameat, AUE_RENAMEAT, NULL, 0, 0, 0 }, /* 501 = renameat */ + { AS(renameat_args), (sy_call_t *)renameat, AUE_RENAMEAT, NULL, 0, 0, SYF_CAPENABLED }, /* 501 = renameat */ { AS(symlinkat_args), (sy_call_t *)symlinkat, AUE_SYMLINKAT, NULL, 0, 0, 0 }, /* 502 = symlinkat */ { AS(unlinkat_args), (sy_call_t *)unlinkat, AUE_UNLINKAT, NULL, 0, 0, 0 }, /* 503 = unlinkat */ { AS(posix_openpt_args), (sy_call_t *)posix_openpt, AUE_POSIX_OPENPT, NULL, 0, 0, 0 }, /* 504 = posix_openpt */ @@ -543,18 +543,18 @@ { AS(jail_get_args), (sy_call_t *)jail_get, AUE_NULL, NULL, 0, 0, 0 }, /* 506 = jail_get */ { AS(jail_set_args), (sy_call_t *)jail_set, AUE_NULL, NULL, 0, 0, 0 }, /* 507 = jail_set */ { AS(jail_remove_args), (sy_call_t *)jail_remove, AUE_NULL, NULL, 0, 0, 0 }, /* 508 = jail_remove */ - { AS(closefrom_args), (sy_call_t *)closefrom, AUE_CLOSEFROM, NULL, 0, 0, 0 }, /* 509 = closefrom */ + { AS(closefrom_args), (sy_call_t *)closefrom, AUE_CLOSEFROM, NULL, 0, 0, SYF_CAPENABLED }, /* 509 = closefrom */ { AS(__semctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 510 = __semctl */ { AS(msgctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 511 = msgctl */ { AS(shmctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0 }, /* 512 = shmctl */ { AS(lpathconf_args), (sy_call_t *)lpathconf, AUE_LPATHCONF, NULL, 0, 0, 0 }, /* 513 = lpathconf */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 514 = cap_new */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 515 = cap_getrights */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 516 = cap_enter */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 517 
= cap_getmode */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 518 = pdfork */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 519 = pdkill */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 520 = pdgetpid */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0 }, /* 521 = pdwait */ + { AS(cap_new_args), (sy_call_t *)cap_new, AUE_CAP_NEW, NULL, 0, 0, SYF_CAPENABLED }, /* 514 = cap_new */ + { AS(cap_getrights_args), (sy_call_t *)cap_getrights, AUE_CAP_GETRIGHTS, NULL, 0, 0, SYF_CAPENABLED }, /* 515 = cap_getrights */ + { 0, (sy_call_t *)cap_enter, AUE_CAP_ENTER, NULL, 0, 0, SYF_CAPENABLED }, /* 516 = cap_enter */ + { AS(cap_getmode_args), (sy_call_t *)cap_getmode, AUE_CAP_GETMODE, NULL, 0, 0, SYF_CAPENABLED }, /* 517 = cap_getmode */ + { AS(pdfork_args), (sy_call_t *)pdfork, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 518 = pdfork */ + { AS(pdkill_args), (sy_call_t *)pdkill, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 519 = pdkill */ + { AS(pdgetpid_args), (sy_call_t *)pdgetpid, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED }, /* 520 = pdgetpid */ + { AS(pdwait_args), (sy_call_t *)pdwait, AUE_NULL, NULL, 0, 0, 0 }, /* 521 = pdwait */ { AS(pselect_args), (sy_call_t *)pselect, AUE_SELECT, NULL, 0, 0, 0 }, /* 522 = pselect */ }; diff -aurN -x '*.orig' src-clean/sys/kern/kern_descrip.c src/sys/kern/kern_descrip.c --- src-clean/sys/kern/kern_descrip.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/kern_descrip.c 2010-08-25 10:24:35.000000000 +0200 @@ -37,13 +37,16 @@ #include __FBSDID("$FreeBSD: src/sys/kern/kern_descrip.c,v 1.357.2.3.2.1 2010/06/14 02:09:06 kensmith Exp $"); +#include "opt_capabilities.h" #include "opt_compat.h" #include "opt_ddb.h" #include "opt_ktrace.h" +#include "opt_procdesc.h" #include #include +#include #include #include #include @@ -61,6 +64,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +85,7 @@ #include #include +#include #include @@ -410,6 +415,23 @@ return (fp); } +static inline struct 
file * +fdtofp_cap(int fd, cap_rights_t rights, struct filedesc *fdp) +{ + struct file *fp; + + FILEDESC_LOCK_ASSERT(fdp); + if ((unsigned)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (NULL); +#ifdef CAPABILITIES + if (fp->f_type == DTYPE_CAPABILITY) + if (cap_fextract(fp, rights, &fp)) + return (NULL); +#endif /* CAPABILITIES */ + return (fp); +} + int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { @@ -468,7 +490,7 @@ case F_GETFL: FILEDESC_SLOCK(fdp); - if ((fp = fdtofp(fd, fdp)) == NULL) { + if ((fp = fdtofp_cap(fd, CAP_FCNTL, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; @@ -479,7 +501,7 @@ case F_SETFL: FILEDESC_SLOCK(fdp); - if ((fp = fdtofp(fd, fdp)) == NULL) { + if ((fp = fdtofp_cap(fd, CAP_FCNTL, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; @@ -511,7 +533,7 @@ case F_GETOWN: FILEDESC_SLOCK(fdp); - if ((fp = fdtofp(fd, fdp)) == NULL) { + if ((fp = fdtofp_cap(fd, CAP_FCNTL, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; @@ -526,7 +548,7 @@ case F_SETOWN: FILEDESC_SLOCK(fdp); - if ((fp = fdtofp(fd, fdp)) == NULL) { + if ((fp = fdtofp_cap(fd, CAP_FCNTL, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; @@ -552,7 +574,7 @@ case F_SETLK: do_setlk: FILEDESC_SLOCK(fdp); - if ((fp = fdtofp(fd, fdp)) == NULL) { + if ((fp = fdtofp_cap(fd, CAP_FLOCK, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; @@ -647,7 +669,7 @@ case F_GETLK: FILEDESC_SLOCK(fdp); - if ((fp = fdtofp(fd, fdp)) == NULL) { + if ((fp = fdtofp_cap(fd, CAP_FLOCK, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; @@ -1120,7 +1142,7 @@ int fd; { struct filedesc *fdp; - struct file *fp; + struct file *fp, *fp_object; int error; int holdleaders; @@ -1155,8 +1177,14 @@ * added, and deleteing a knote for the new fd. 
*/ knote_fdclose(td, fd); - if (fp->f_type == DTYPE_MQUEUE) - mq_fdclose(td, fd, fp); + + /* + * When we're closing an fd with a capability, we need to notify + * mqueue if the underlying object is of type mqueue. + */ + (void)cap_fextract(fp, 0, &fp_object); + if (fp_object->f_type == DTYPE_MQUEUE) + mq_fdclose(td, fd, fp_object); FILEDESC_XUNLOCK(fdp); error = closef(fp, td); @@ -1266,7 +1294,7 @@ AUDIT_ARG_FD(fd); - if ((error = fget(td, fd, &fp)) != 0) + if ((error = fget(td, fd, CAP_FSTAT, &fp)) != 0) return (error); AUDIT_ARG_FILE(td->td_proc, fp); @@ -1322,7 +1350,7 @@ struct vnode *vp; int error; - if ((error = fget(td, uap->fd, &fp)) != 0) + if ((error = fget(td, uap->fd, CAP_FPATHCONF, &fp)) != 0) return (error); /* If asynchronous I/O is available, it works for all descriptors. */ @@ -1516,13 +1544,45 @@ int falloc(struct thread *td, struct file **resultfp, int *resultfd) { - struct proc *p = td->td_proc; struct file *fp; - int error, i; + int error; + + error = falloc_noinstall(td, &fp); + if (error) return (error); /* no reference held on error */ + + error = finstall(td, fp, resultfd); + if (error) { + fdrop(fp, td); /* one reference (fp only) */ + return (error); + } + + if (resultfp) *resultfp = fp; /* copy out result */ + else fdrop(fp, td); /* release local reference */ + + return (0); +} + +/* + * Create a new open file structure without allocating a file descriptor. + */ +int +falloc_noinstall(struct thread *td, struct file **resultfp) +{ + struct file *fp; + int error; int maxuserfiles = maxfiles - (maxfiles / 20); static struct timeval lastfail; static int curfail; + /* + * Cowardly refuse to create a referenceless file; the calling code + * MUST expect a pointer to be returned. 
+ */ + if (!resultfp) + return (error = EINVAL); + + atomic_add_int(&openfiles, 1); + fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); if ((openfiles >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || @@ -1534,34 +1594,42 @@ uma_zfree(file_zone, fp); return (ENFILE); } - atomic_add_int(&openfiles, 1); - /* - * If the process has file descriptor zero open, add the new file - * descriptor to the list of open files at that point, otherwise - * put it at the front of the list of open files. - */ refcount_init(&fp->f_count, 1); - if (resultfp) - fhold(fp); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; fp->f_data = NULL; fp->f_vnode = NULL; - FILEDESC_XLOCK(p->p_fd); - if ((error = fdalloc(td, 0, &i))) { - FILEDESC_XUNLOCK(p->p_fd); - fdrop(fp, td); - if (resultfp) - fdrop(fp, td); + LIST_INIT(&fp->f_caps); + fp->f_capcount = 0; + + *resultfp = fp; + + return (0); +} + + +/* + * Install a file in the file descriptor table. + */ +int +finstall(struct thread *td, struct file *fp, int *fd) +{ + struct filedesc *fdp = td->td_proc->p_fd; + int error; + + FILEDESC_XLOCK(fdp); + + if ((error = fdalloc(td, 0, fd))) { + FILEDESC_XUNLOCK(fdp); return (error); } - p->p_fd->fd_ofiles[i] = fp; - FILEDESC_XUNLOCK(p->p_fd); - if (resultfp) - *resultfp = fp; - if (resultfd) - *resultfd = i; + + fhold(fp); + fdp->fd_ofiles[*fd] = fp; + + FILEDESC_XUNLOCK(fdp); + return (0); } @@ -2052,6 +2120,7 @@ struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; + struct file *fp_object; /* * POSIX record locking dictates that any close releases ALL @@ -2064,11 +2133,15 @@ * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. + * + * If this is a capability, we do lock processing under the + * underlying vnode, not the capability. 
*/ - if (fp->f_type == DTYPE_VNODE && td != NULL) { + (void)cap_fextract(fp, 0, &fp_object); + if (fp_object->f_type == DTYPE_VNODE && td != NULL) { int vfslocked; - vp = fp->f_vnode; + vp = fp_object->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; @@ -2098,7 +2171,7 @@ lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; - vp = fp->f_vnode; + vp = fp_object->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); @@ -2177,16 +2250,30 @@ * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * + * If the file is a capability, 'rights' will be checked against the + * capability rights mask, and the object decapsulated if the check passes. + * The capability itself will never be returned. + * * If an error occured the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ +#define FGET_GETCAP 0x00000001 static __inline int -_fget(struct thread *td, int fd, struct file **fpp, int flags) +_fget(struct thread *td, int fd, struct file **fpp, int flags, + cap_rights_t needrights, cap_rights_t *haverights, + u_char *maxprotp, int fget_flags) { struct filedesc *fdp; struct file *fp; +#ifdef CAPABILITIES + struct file *fp_fromcap; + int error; +#endif + /* + * Validate the file descriptor number and find the struct file. + */ *fpp = NULL; if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) return (EBADF); @@ -2196,39 +2283,109 @@ fdrop(fp, td); return (EBADF); } + +#ifdef CAPABILITIES + /* If this is a capability, what rights does it have? */ + if (haverights) { + if (fp->f_type == DTYPE_CAPABILITY) + *haverights = cap_rights(fp); + else + *haverights = -1; + } + /* - * FREAD and FWRITE failure return EBADF as per POSIX. - * - * Only one flag, or 0, may be specified. + * If a capability has been requested, return the capability + * directly. 
Otherwise, check capability rights, extract the + * underlying object, and check its access flags. */ - if ((flags == FREAD && (fp->f_flag & FREAD) == 0) || - (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) { - fdrop(fp, td); - return (EBADF); + if (fget_flags & FGET_GETCAP) { + if (fp->f_type != DTYPE_CAPABILITY) { + fdrop(fp, td); + return (EINVAL); + } + } else { + /* + * If a capability hasn't been requested, then validate the + * capability and find the underlying object. + */ + if (maxprotp != NULL) + error = cap_fextract_mmap(fp, needrights, maxprotp, + &fp_fromcap); + else + error = cap_fextract(fp, needrights, &fp_fromcap); + if (error) { + fdrop(fp, td); + return (error); + } + + /* + * If cap_fextract() returned a different file descriptor + * than was passed in, drop the original capability and hold + * the new descriptor. fp after this point refers to the + * actual object, not the capability. + */ + if (fp != fp_fromcap) { + fhold(fp_fromcap); + fdrop(fp, td); + fp = fp_fromcap; + } +#else /* !CAPABILITIES */ + KASSERT(fp->f_type != DTYPE_CAPABILITY, + ("_fget: saw capability")); + if (maxprotp != NULL) + *maxprotp = VM_PROT_ALL; +#endif /* CAPABILITIES */ + if ((flags == FREAD && (fp->f_flag & FREAD) == 0) || + (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) { + fdrop(fp, td); + return (EBADF); + } +#ifdef CAPABILITIES } +#endif *fpp = fp; return (0); } int -fget(struct thread *td, int fd, struct file **fpp) +fget(struct thread *td, int fd, cap_rights_t rights, struct file **fpp) { - return(_fget(td, fd, fpp, 0)); + return(_fget(td, fd, fpp, 0, rights, NULL, NULL, 0)); } int -fget_read(struct thread *td, int fd, struct file **fpp) +fget_mmap(struct thread *td, int fd, cap_rights_t rights, u_char *maxprotp, + struct file **fpp) { - return(_fget(td, fd, fpp, FREAD)); + return (_fget(td, fd, fpp, 0, rights, NULL, maxprotp, 0)); } int -fget_write(struct thread *td, int fd, struct file **fpp) +fget_read(struct thread *td, int fd, cap_rights_t rights, 
struct file **fpp) { - return(_fget(td, fd, fpp, FWRITE)); + return(_fget(td, fd, fpp, FREAD, rights, NULL, NULL, 0)); +} + +int +fget_write(struct thread *td, int fd, cap_rights_t rights, struct file **fpp) +{ + + return(_fget(td, fd, fpp, FWRITE, rights, NULL, NULL, 0)); +} + +/* + * Unlike the other fget() calls, which accept and check capability rights + * but never return capabilities, fgetcap() returns the capability but + * doesn't check capability rights. + */ +int +fgetcap(struct thread *td, int fd, struct file **fpp) +{ + + return (_fget(td, fd, fpp, 0, 0, NULL, NULL, FGET_GETCAP)); } /* @@ -2239,13 +2396,15 @@ * XXX: what about the unused flags ? */ static __inline int -_fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) +_fgetvp(struct thread *td, int fd, int flags, + cap_rights_t needrights, cap_rights_t *haverights, struct vnode **vpp) { struct file *fp; int error; *vpp = NULL; - if ((error = _fget(td, fd, &fp, flags)) != 0) + if ((error = _fget(td, fd, &fp, flags, needrights, haverights, NULL, 0)) + != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; @@ -2259,25 +2418,34 @@ } int -fgetvp(struct thread *td, int fd, struct vnode **vpp) +fgetvp(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp) { - return (_fgetvp(td, fd, vpp, 0)); + return (_fgetvp(td, fd, 0, rights, NULL, vpp)); } int -fgetvp_read(struct thread *td, int fd, struct vnode **vpp) +fgetvp_rights(struct thread *td, int fd, cap_rights_t need, cap_rights_t *have, + struct vnode **vpp) { + return (_fgetvp(td, fd, 0, need, have, vpp)); +} - return (_fgetvp(td, fd, vpp, FREAD)); +int +fgetvp_read(struct thread *td, int fd, cap_rights_t rights, + struct vnode **vpp) +{ + + return (_fgetvp(td, fd, FREAD, rights, NULL, vpp)); } #ifdef notyet int -fgetvp_write(struct thread *td, int fd, struct vnode **vpp) +fgetvp_write(struct thread *td, int fd, cap_rights_t rights, + struct vnode **vpp) { - return (_fgetvp(td, fd, vpp, FWRITE)); + return (_fgetvp(td, 
fd, FWRITE, rights, NULL, vpp)); } #endif @@ -2293,7 +2461,8 @@ * during use. */ int -fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) +fgetsock(struct thread *td, int fd, cap_rights_t rights, struct socket **spp, + u_int *fflagp) { struct file *fp; int error; @@ -2301,7 +2470,7 @@ *spp = NULL; if (fflagp != NULL) *fflagp = 0; - if ((error = _fget(td, fd, &fp, 0)) != 0) + if ((error = _fget(td, fd, &fp, 0, rights, NULL, NULL, 0)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { error = ENOTSOCK; @@ -2335,6 +2504,9 @@ /* * Handle the last reference to a file being closed. + * + * No special capability handling here, as the capability's fo_close will run + * instead of the object here, and perform any necessary drop on the object. */ int _fdrop(struct file *fp, struct thread *td) @@ -2354,6 +2526,10 @@ devfs_fpdrop(fp); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); + if (!LIST_EMPTY(&fp->f_caps)) + panic("_fdrop: f_caps not empty"); + if (fp->f_capcount != 0) + panic("_fdrop: f_capcount != 0"); uma_zfree(file_zone, fp); return (error); @@ -2381,7 +2557,7 @@ int vfslocked; int error; - if ((error = fget(td, uap->fd, &fp)) != 0) + if ((error = fget(td, uap->fd, CAP_FLOCK, &fp)) != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); @@ -2768,6 +2944,22 @@ so = NULL; tp = NULL; kif->kf_fd = i; + +#ifdef CAPABILITIES + /* + * When reporting a capability, most fields will be from the + * underlying object, but do mark as a capability. With + * ofiledesc, we don't have a field to export the + * cap_rights_t, but we do with the new filedesc. 
+ */ + if (fp->f_type == DTYPE_CAPABILITY) { + kif->kf_flags |= KF_FLAG_CAPABILITY; + (void)cap_fextract(fp, 0, &fp); + } +#else + KASSERT(fp->f_type != DTYPE_CAPABILITY, + ("sysctl_kern_proc_filedesc: saw capability")); +#endif switch (fp->f_type) { case DTYPE_VNODE: kif->kf_type = KF_TYPE_VNODE; @@ -2813,6 +3005,10 @@ tp = fp->f_data; break; + case DTYPE_PROCDESC: + kif->kf_type = KF_TYPE_PROCDESC; + break; + default: kif->kf_type = KF_TYPE_UNKNOWN; break; @@ -3021,6 +3217,22 @@ so = NULL; tp = NULL; kif->kf_fd = i; + +#ifdef CAPABILITIES + /* + * When reporting a capability, most fields will be from the + * underlying object, but do mark as a capability and export + * the capability rights mask. + */ + if (fp->f_type == DTYPE_CAPABILITY) { + kif->kf_flags |= KF_FLAG_CAPABILITY; + kif->kf_cap_rights = cap_rights(fp); + (void)cap_fextract(fp, 0, &fp); + } +#else + KASSERT(fp->f_type != DTYPE_CAPABILITY, + ("sysctl_kern_proc_filedesc: saw capability")); +#endif switch (fp->f_type) { case DTYPE_VNODE: kif->kf_type = KF_TYPE_VNODE; @@ -3066,6 +3278,13 @@ tp = fp->f_data; break; +#ifdef PROCDESC + case DTYPE_PROCDESC: + kif->kf_type = KF_TYPE_PROCDESC; + kif->kf_pid = procdesc_pid(fp); + break; +#endif + default: kif->kf_type = KF_TYPE_UNKNOWN; break; diff -aurN -x '*.orig' src-clean/sys/kern/kern_event.c src/sys/kern/kern_event.c --- src-clean/sys/kern/kern_event.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/kern_event.c 2010-08-25 10:24:35.000000000 +0200 @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -801,7 +802,7 @@ struct file *fp; int i, n, nerrors, error; - if ((error = fget(td, fd, &fp)) != 0) + if ((error = fget(td, fd, CAP_KEVENT, &fp)) != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto done_norel; @@ -957,7 +958,7 @@ findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); - error = fget(td, kev->ident, &fp); + error = fget(td, kev->ident, CAP_EVENT, &fp); if (error) goto done; @@ -2166,7 
+2167,7 @@ struct file *fp; int error; - if ((error = fget(td, fd, &fp)) != 0) + if ((error = fget(td, fd, CAP_KEVENT, &fp)) != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; diff -aurN -x '*.orig' src-clean/sys/kern/kern_exec.c src/sys/kern/kern_exec.c --- src-clean/sys/kern/kern_exec.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/kern_exec.c 2010-08-25 10:24:35.000000000 +0200 @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -112,7 +113,7 @@ NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ -SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD, +SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD|CTLFLAG_CAPRD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, @@ -336,7 +337,7 @@ struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif - struct vnode *textvp = NULL, *binvp = NULL; + struct vnode *textvp = NULL, *binvp; int credential_changing; int vfslocked; int textset; @@ -410,6 +411,18 @@ interpret: if (args->fname != NULL) { + /* + * While capability mode can't reach this point via direct + * path arguments to execve(), we also don't allow + * interpreters to be used in capability mode (for now). + * Catch indirect lookups and return a permissions error. + * + * XXXRW: Is this the right error? + */ + if (td->td_ucred->cr_flags & CRED_FLAG_CAPMODE) { + error = EPERM; + goto exec_fail; + } error = namei(&nd); if (error) goto exec_fail; @@ -419,7 +432,9 @@ imgp->vp = binvp; } else { AUDIT_ARG_FD(args->fd); - error = fgetvp(td, args->fd, &binvp); + /* XXXRW: Possibly should just be CAP_FEXECVE? */ + error = fgetvp_read(td, args->fd, CAP_READ | CAP_FEXECVE, + &binvp); if (error) goto exec_fail; vfslocked = VFS_LOCK_GIANT(binvp->v_mount); @@ -626,6 +641,13 @@ * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. 
* + * We disable setuid/setgid/etc in capability mode on the basis that + * most setugid applications are not written with that environment in + * mind, and will therefore almost certainly operate incorrectly. In + * principle there's no reason that setugid applications might not be + * useful in capability mode, so we may want to reconsider this + * conservative design choice in the future. + * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ @@ -641,6 +663,7 @@ #endif if (credential_changing && + (oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0 && (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* diff -aurN -x '*.orig' src-clean/sys/kern/kern_exit.c src/sys/kern/kern_exit.c --- src-clean/sys/kern/kern_exit.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/kern_exit.c 2010-08-25 10:24:35.000000000 +0200 @@ -40,6 +40,7 @@ #include "opt_compat.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" +#include "opt_procdesc.h" #include #include @@ -50,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -488,39 +490,55 @@ knlist_clear(&p->p_klist, 1); /* - * Notify parent that we're gone. If parent has the PS_NOCLDWAIT - * flag set, or if the handler is set to SIG_IGN, notify process - * 1 instead (and hope it will handle this situation). - */ - PROC_LOCK(p->p_pptr); - mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); - if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { - struct proc *pp; - - mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); - pp = p->p_pptr; - PROC_UNLOCK(pp); - proc_reparent(p, initproc); - p->p_sigparent = SIGCHLD; - PROC_LOCK(p->p_pptr); - + * If this is a process with a descriptor, we may not need to deliver + * a signal to the parent. proctree_lock is held over + * procdesc_exit() to serialize concurrent calls to close() and + * exit(). 
+ */ +#ifdef PROCDESC + if (p->p_procdesc == NULL || procdesc_exit(p)) { +#endif /* - * Notify parent, so in case he was wait(2)ing or - * executing waitpid(2) with our pid, he will - * continue. + * Notify parent that we're gone, in case it is wait(2)ing or + * executing waitpid(2) with our pid. If parent has the + * PS_NOCLDWAIT flag set, or if the handler is set to + * SIG_IGN, notify process 1 instead (and hope it will handle + * this situation). */ - wakeup(pp); - } else - mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); - - if (p->p_pptr == initproc) - psignal(p->p_pptr, SIGCHLD); - else if (p->p_sigparent != 0) { - if (p->p_sigparent == SIGCHLD) - childproc_exited(p); - else /* LINUX thread */ - psignal(p->p_pptr, p->p_sigparent); + PROC_LOCK(p->p_pptr); + mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); + if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | + PS_CLDSIGIGN)) { + struct proc *pp; + + mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); + pp = p->p_pptr; + PROC_UNLOCK(pp); + proc_reparent(p, initproc); + p->p_sigparent = SIGCHLD; + PROC_LOCK(p->p_pptr); + /* + * If this was the last child of our parent, notify + * parent, so in case he was wait(2)ing, he will + * continue. + */ + wakeup(pp); + } else + mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); + + if (p->p_pptr == initproc) + psignal(p->p_pptr, SIGCHLD); + else if (p->p_sigparent != 0) { + if (p->p_sigparent == SIGCHLD) + childproc_exited(p); + else /* LINUX thread */ + psignal(p->p_pptr, p->p_sigparent); + } +#ifdef PROCDESC + } else { + PROC_LOCK(p->p_pptr); } +#endif sx_xunlock(&proctree_lock); /* @@ -540,6 +558,8 @@ * a lost wakeup. So, we first call wakeup, then we grab the * sched lock, update the state, and release the parent process' * proc lock. + * + * XXXRW: Why do we wake up the parent...? */ wakeup(p->p_pptr); cv_broadcast(&p->p_pwait); @@ -682,12 +702,20 @@ return (error); } +int +pdwait(struct thread *td, struct pdwait_args *uap) +{ + + /* XXXRW: Not yet. 
*/ + return (ENOSYS); +} + /* * Reap the remains of a zombie process and optionally return status and * rusage. Asserts and will release both the proctree_lock and the process * lock as part of its work. */ -static void +void proc_reap(struct thread *td, struct proc *p, int *status, int options, struct rusage *rusage) { @@ -725,6 +753,8 @@ /* * If we got the child via a ptrace 'attach', we need to give it back * to the old parent. + * + * XXXRW: How will ptrace and process descriptors interact here? */ if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) { PROC_LOCK(p); @@ -748,6 +778,10 @@ sx_xunlock(&allproc_lock); LIST_REMOVE(p, p_sibling); leavepgrp(p); +#ifdef PROCDESC + if (p->p_procdesc != NULL) + procdesc_reap(p); +#endif sx_xunlock(&proctree_lock); /* @@ -849,6 +883,17 @@ continue; } + /* + * If a process has a process descriptor, then it won't be + * picked up by wait4(). Unless it's being debugged, in + * which case the debugging process will need to manage it + * with waitpid(). 
+ */ + if (p->p_procdesc != NULL && p->p_oppid == 0) { + PROC_UNLOCK(p); + continue; + } + nfound++; PROC_SLOCK(p); if (p->p_state == PRS_ZOMBIE) { diff -aurN -x '*.orig' src-clean/sys/kern/kern_fork.c src/sys/kern/kern_fork.c --- src-clean/sys/kern/kern_fork.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/kern_fork.c 2010-08-25 10:24:35.000000000 +0200 @@ -40,6 +40,7 @@ #include "opt_kdtrace.h" #include "opt_ktrace.h" #include "opt_kstack_pages.h" +#include "opt_procdesc.h" #include #include @@ -55,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -112,6 +114,39 @@ return (error); } +/* ARGSUSED */ +int +pdfork(td, uap) + struct thread *td; + struct pdfork_args *uap; +{ +#ifdef PROCDESC + int error, fd; + struct proc *p2; + + /* + * XXXRW: For now, we play a slight game here to avoid changing the + * arguments to fork1() - when a process descriptor is requested, we + * will initially return the file descriptor via td_retval[0], then + * in pdfork(), we copy that out and replace the retval with the pid. + * + * It is necessary to return fd by reference as 0 is a valid file + * descriptor number, and the child needs to be able to distinguish + * itself from the parent using the return value. + */ + error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2); + if (error == 0) { + fd = td->td_retval[0]; + td->td_retval[0] = p2->p_pid; + td->td_retval[1] = 0; + error = copyout(&fd, uap->fdp, sizeof(fd)); + } + return (error); +#else + return (ENOSYS); +#endif +} + /* ARGSUSED */ int vfork(td, uap) @@ -215,12 +250,22 @@ struct sigacts *newsigacts; struct vmspace *vm2; vm_ooffset_t mem_charged; +#ifdef PROCDESC + struct file *fp_procdesc = NULL; + int fd_procdesc; +#endif int error; /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); +#ifdef PROCDESC + /* Can't not create a process yet get a process descriptor. 
*/ + if ((flags & RFPROCDESC) && ((flags & RFPROC) == 0)) + return (EINVAL); +#endif + p1 = td->td_proc; /* @@ -342,6 +387,21 @@ goto fail; } +#ifdef PROCDESC + /* + * If required, create a process descriptor in the parent first; we + * will abandon it if something goes wrong. We don't finit() until + * later. + * + * XXXRW: What errno to return? + */ + if (flags & RFPROCDESC) { + error = falloc(td, &fp_procdesc, &fd_procdesc); + if (error) + goto fail; + } +#endif + /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. @@ -494,6 +554,16 @@ } else if (flags & RFFDG) { fd = fdcopy(p1->p_fd); fdtol = NULL; +#ifdef PROCDESC + /* + * If the file descriptor table is copied, we only want the + * process descriptor to appear in the parent, so close in + * the child. + */ + if (flags & RFPROCDESC) + fdclose(fd, fp_procdesc, fd_procdesc, td); +#endif + } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) @@ -731,6 +801,16 @@ p2->p_vmspace->vm_ssize); } +#ifdef PROCDESC + /* + * Associate the process descriptor with the process before anything + * can happen that might cause that process to need the descriptor. + * However, don't do this until after fork(2) can no longer fail. + */ + if (flags & RFPROCDESC) + procdesc_new(p2, fp_procdesc); +#endif + /* * Both processes are set up, now check if any loadable modules want * to adjust anything. @@ -784,6 +864,17 @@ * Return child proc pointer to parent. */ *procp = p2; + + /* + * If we're using process descriptors, then the process descriptor + * number, rather than the child pid, will be returned. 
+ */ +#ifdef PROCDESC + if (flags & RFPROCDESC) { + td->td_retval[0] = fd_procdesc; + fdrop(fp_procdesc, td); + } +#endif return (0); fail: sx_sunlock(&proctree_lock); @@ -798,6 +889,10 @@ if (vm2 != NULL) vmspace_free(vm2); uma_zfree(proc_zone, newproc); +#ifdef PROCDESC + if ((flags & RFPROCDESC) && (fp_procdesc != NULL)) + fdrop(fp_procdesc, td); +#endif pause("fork", hz / 2); return (error); } diff -aurN -x '*.orig' src-clean/sys/kern/kern_mib.c src/sys/kern/kern_mib.c --- src-clean/sys/kern/kern_mib.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/kern_mib.c 2010-08-25 10:24:35.000000000 +0200 @@ -57,7 +57,7 @@ SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, "Sysctl internal magic"); -SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0, +SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW|CTLFLAG_CAPRD, 0, "High kernel, proc, limits &c"); SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0, "Virtual memory"); @@ -90,23 +90,24 @@ SYSCTL_STRING(_kern, OID_AUTO, ident, CTLFLAG_RD|CTLFLAG_MPSAFE, kern_ident, 0, "Kernel identifier"); -SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD|CTLFLAG_MPSAFE, - osrelease, 0, "Operating system release"); +SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD|CTLFLAG_MPSAFE| + CTLFLAG_CAPRD, osrelease, 0, "Operating system release"); -SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, +SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD | CTLFLAG_CAPRD, 0, BSD, "Operating system revision"); SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD|CTLFLAG_MPSAFE, version, 0, "Kernel version"); -SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD|CTLFLAG_MPSAFE, +SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD|CTLFLAG_MPSAFE| + CTLFLAG_CAPRD, ostype, 0, "Operating system type"); /* * NOTICE: The *userland* release date is available in * /usr/include/osreldate.h */ -SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, +SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD | CTLFLAG_CAPRD, &osreldate, 0, "Kernel release date"); 
SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RDTUN, @@ -118,24 +119,24 @@ SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RDTUN, &maxusers, 0, "Hint for kernel tuning"); -SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, +SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD | CTLFLAG_CAPRD, 0, ARG_MAX, "Maximum bytes of argument to execve(2)"); -SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, +SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD | CTLFLAG_CAPRD, 0, _POSIX_VERSION, "Version of POSIX attempting to comply to"); -SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RDTUN, +SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RDTUN | CTLFLAG_CAPRD, &ngroups_max, 0, "Maximum number of supplemental groups a user can belong to"); -SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, +SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD | CTLFLAG_CAPRD, 0, 1, "Whether job control is available"); #ifdef _POSIX_SAVED_IDS -SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD | CTLFLAG_CAPRD, 0, 1, "Whether saved set-group/user ID is available"); #else -SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD | CTLFLAG_CAPRD, 0, 0, "Whether saved set-group/user ID is available"); #endif @@ -144,13 +145,13 @@ SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW, kernelname, sizeof kernelname, "Name of kernel file booted"); -SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, +SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD | CTLFLAG_CAPRD, &mp_ncpus, 0, "Number of active CPUs"); -SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, +SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD | CTLFLAG_CAPRD, 0, BYTE_ORDER, "System byte order"); -SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, +SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD | CTLFLAG_CAPRD, 0, PAGE_SIZE, "System memory page size"); static int @@ -167,7 +168,7 @@ } 
SYSCTL_PROC(_kern, KERN_ARND, arandom, - CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, NULL, 0, sysctl_kern_arnd, "", "arc4rand"); static int @@ -426,6 +427,8 @@ * This is really cheating. These actually live in the libc, something * which I'm not quite sure is a good idea anyway, but in order for * getnext and friends to actually work, we define dummies here. + * + * XXXRW: These probably should be CTLFLAG_CAPRD. */ SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, "PATH that finds all the standard utilities"); diff -aurN -x '*.orig' src-clean/sys/kern/kern_sig.c src/sys/kern/kern_sig.c --- src-clean/sys/kern/kern_sig.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/kern_sig.c 2010-08-25 10:24:35.000000000 +0200 @@ -40,12 +40,14 @@ #include "opt_compat.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" +#include "opt_procdesc.h" #include #include #include #include #include +#include #include #include #include @@ -57,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -1725,6 +1728,34 @@ /* NOTREACHED */ } +int +pdkill(td, uap) + struct thread *td; + struct pdkill_args *uap; +{ +#ifdef PROCDESC + struct proc *p; + int error; + + AUDIT_ARG_SIGNUM(uap->signum); + AUDIT_ARG_FD(uap->fd); + if ((u_int)uap->signum > _SIG_MAXSIG) + return (EINVAL); + + error = procdesc_find(td, uap->fd, CAP_PDKILL, &p); + if (error) + return (error); + AUDIT_ARG_PROCESS(p); + error = p_cansignal(td, p, uap->signum); + if (error == 0 && uap->signum) + psignal(p, uap->signum); + PROC_UNLOCK(p); + return (error); +#else + return (ENOSYS); +#endif +} + #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { diff -aurN -x '*.orig' src-clean/sys/kern/kern_sysctl.c src/sys/kern/kern_sysctl.c --- src-clean/sys/kern/kern_sysctl.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/kern_sysctl.c 2010-08-25 10:24:35.000000000 +0200 @@ -38,10 +38,12 @@ #include 
__FBSDID("$FreeBSD: src/sys/kern/kern_sysctl.c,v 1.201.2.2.4.1 2010/06/14 02:09:06 kensmith Exp $"); +#include "opt_capabilities.h" #include "opt_compat.h" #include "opt_ktrace.h" #include +#include #include #include #include @@ -683,7 +685,12 @@ return (SYSCTL_OUT(req, "", 1)); } -static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, ""); +/* + * XXXRW: Shouldn't return name data for nodes that we don't permit in + * capability mode. + */ +static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_CAPRD, + sysctl_sysctl_name, ""); static int sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, @@ -762,7 +769,12 @@ return (error); } -static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, ""); +/* + * XXXRW: Shouldn't return next data for nodes that we don't permit in + * capability mode. + */ +static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_CAPRD, + sysctl_sysctl_next, ""); static int name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp) @@ -858,8 +870,12 @@ return (error); } -SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MPSAFE, - 0, 0, sysctl_sysctl_name2oid, "I", ""); +/* + * XXXRW: Shouldn't return name2oid data for nodes that we don't permit in + * capability mode. + */ +SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MPSAFE| + CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) @@ -881,7 +897,11 @@ } -static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE, +/* + * XXXRW: Shouldn't return oidfmt data for nodes that we don't permit in + * capability mode. 
+ */ +static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, ""); static int @@ -900,7 +920,12 @@ return (error); } -static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD, sysctl_sysctl_oiddescr, ""); +/* + * XXXRW: Shouldn't return oiddescr data for nodes that we don't permit in + * capability mode. + */ +static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD | CTLFLAG_CAPRD, + sysctl_sysctl_oiddescr, ""); /* * Default "handler" functions. @@ -1371,6 +1396,19 @@ KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); +#ifdef CAPABILITIES + /* + * If the process is in capability mode, then don't permit reading or + * writing unless specifically granted for the node. + */ + if (req->td->td_ucred->cr_flags & CRED_FLAG_CAPMODE) { + if (req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD)) + return (EPERM); + if (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR)) + return (EPERM); + } +#endif + /* Is this sysctl sensitive to securelevels? */ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE; diff -aurN -x '*.orig' src-clean/sys/kern/makesyscalls.sh src/sys/kern/makesyscalls.sh --- src-clean/sys/kern/makesyscalls.sh 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/makesyscalls.sh 2010-08-25 10:24:35.000000000 +0200 @@ -39,6 +39,13 @@ sysprotoend="sysprotoend.$$" systracetmp="systrace.$$" +if [ -r capabilities.conf ]; then + capenabled=`cat capabilities.conf | grep -v '^#' | grep -v "^$"` + capenabled=`echo $capenabled | sed 's/ /,/g'` +else + capenabled="" +fi + trap "rm $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp" 0 touch $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp @@ -97,8 +104,11 @@ switchname = 
\"$switchname\" namesname = \"$namesname\" infile = \"$1\" + capenabled_string = \"$capenabled\" "' + split(capenabled_string, capenabled, ","); + printf "/*\n * System call switch table.\n *\n" > syssw printf " * DO NOT EDIT-- this file is automatically generated.\n" > syssw printf " * $%s$\n", "FreeBSD" > syssw @@ -286,6 +296,20 @@ f++ #function return type funcname=$f + + # + # We now know the func name, so define a flags field for it. + # Do this before any other processing as we may return early + # from it. + # + # XXXRW: Surely we can just look it up in the array in awk? + # + for (cap in capenabled) { + if (funcname == capenabled[cap]) { + flags = "SYF_CAPENABLED"; + } + } + if (funcalias == "") funcalias = funcname if (argalias == "") { @@ -344,7 +368,7 @@ } # - # The currently-empty flags field. + # The flags, if any. # { flags = "0"; diff -aurN -x '*.orig' src-clean/sys/kern/posix4_mib.c src/sys/kern/posix4_mib.c --- src-clean/sys/kern/posix4_mib.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/posix4_mib.c 2010-08-25 10:24:35.000000000 +0200 @@ -57,7 +57,7 @@ SYSCTL_DECL(_p1003_1b); #define P1B_SYSCTL(num, name) \ - SYSCTL_INT(_p1003_1b, num, name, CTLFLAG_RD, facility + num - 1, 0, ""); + SYSCTL_INT(_p1003_1b, num, name, CTLFLAG_RD | CTLFLAG_CAPRD, facility + num - 1, 0, ""); #define P1B_SYSCTL_RW(num, name) \ SYSCTL_PROC(_p1003_1b, num, name, CTLTYPE_INT | CTLFLAG_RW, NULL, num, \ p31b_sysctl_proc, "I", ""); @@ -67,7 +67,7 @@ SYSCTL_DECL(_kern_p1003_1b); #define P1B_SYSCTL(num, name) \ - SYSCTL_INT(_kern_p1003_1b, OID_AUTO, name, CTLFLAG_RD, \ + SYSCTL_INT(_kern_p1003_1b, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_CAPRD, \ facility + num - 1, 0, ""); #define P1B_SYSCTL_RW(num, name) \ SYSCTL_PROC(_p1003_1b, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW, NULL, \ diff -aurN -x '*.orig' src-clean/sys/kern/subr_smp.c src/sys/kern/subr_smp.c --- src-clean/sys/kern/subr_smp.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/subr_smp.c 2010-08-25 
10:24:35.000000000 +0200 @@ -71,12 +71,12 @@ volatile int smp_started; u_int mp_maxid; -SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD, NULL, "Kernel SMP"); +SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL, "Kernel SMP"); -SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD, &mp_maxid, 0, +SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0, "Max CPU ID."); -SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD, &mp_maxcpus, 0, +SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus, 0, "Max number of CPUs that the system was compiled for."); int smp_active = 0; /* are the APs allowed to run? */ @@ -84,12 +84,12 @@ "Number of Auxillary Processors (APs) that were successfully started"); int smp_disabled = 0; /* has smp been disabled? */ -SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN, &smp_disabled, 0, +SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD, &smp_disabled, 0, "SMP has been disabled from the loader"); TUNABLE_INT("kern.smp.disabled", &smp_disabled); int smp_cpus = 1; /* how many cpu's running */ -SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD, &smp_cpus, 0, +SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0, "Number of CPUs online"); int smp_topology = 0; /* Which topology we're using. */ diff -aurN -x '*.orig' src-clean/sys/kern/sys_capability.c src/sys/kern/sys_capability.c --- src-clean/sys/kern/sys_capability.c 1970-01-01 01:00:00.000000000 +0100 +++ src/sys/kern/sys_capability.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,553 @@ +/*- + * Copyright (c) 2008-2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * FreeBSD kernel capability facility. + * + * Each capability is represented as a file descriptor, but unlike + * traditional file descriptors, capabilities include a fine-grained and + * constant mask of rights associated with them. + * + * Capabilities wrap other actual object file descriptors -- system calls + * declare the rights they require when looking up file descriptors. + * + * When one capability is created from another, rather than nesting, we + * directly reference the underlying object but with a new mask, rather than + * referencing the previous capability. New capabilities will have the same + * set or a subset of rights of the capability they are derived from. 
+ */ + +#include "opt_capabilities.h" + +#include +__FBSDID("$P4: //depot/projects/trustedbsd/capabilities/src/sys/kern/sys_capability.c#30 $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#ifdef CAPABILITIES + +/* + * struct capability describes a capability, and is hung off of its struct + * file f_data field. cap_object and cap_rights are static once hooked up, as + * neither the object it references nor the rights it encapsulates are + * permitted to change. cap_filelist may change when other capabilities are + * added or removed from the same file, and is currently protected by the + * pool mutex for the object file descriptor. + */ +struct capability { + struct file *cap_object; /* Underlying object's file. */ + struct file *cap_file; /* Back-pointer to cap's file. */ + cap_rights_t cap_rights; /* Mask of rights on object. */ + LIST_ENTRY(capability) cap_filelist; /* Object's cap list. */ +}; + +/* + * Capabilities have a fileops vector, but in practice none should ever be + * called except for fo_close, as the capability will normally not be + * returned during a file descriptor lookup in the system call code. 
+ */ +static fo_rdwr_t capability_read; +static fo_rdwr_t capability_write; +static fo_truncate_t capability_truncate; +static fo_ioctl_t capability_ioctl; +static fo_poll_t capability_poll; +static fo_kqfilter_t capability_kqfilter; +static fo_stat_t capability_stat; +static fo_close_t capability_close; + +static struct fileops capability_ops = { + .fo_read = capability_read, + .fo_write = capability_write, + .fo_truncate = capability_truncate, + .fo_ioctl = capability_ioctl, + .fo_poll = capability_poll, + .fo_kqfilter = capability_kqfilter, + .fo_stat = capability_stat, + .fo_close = capability_close, + + /* + * Possibly we should have two fileops vectors, one with and one + * without DFLAG_PASSABLE, in order to support wrapping objects who + * don't have DFLAG_PASSABLE. On the other hand, there are no such + * objects, which raises the question as to why the flag exists at + * all. + * + * XXXRW: Actually, this is not true: kqueue's aren't passable, so we + * do need to do this. + * + * No need to set DFLAG_SEEKABLE as any seek operations will fall + * through to the underlying object once the capability is verified. + */ + .fo_flags = DFLAG_PASSABLE, +}; + +static uma_zone_t capability_zone; + +/* + * We don't currently have any MIB entries for sysctls, but we do expose + * security.capabilities so that it's easy to tell if options CAPABILITIES is + * compiled into the kernel. + */ +SYSCTL_NODE(_security, OID_AUTO, capabilities, CTLFLAG_RW, 0, + "TrustedBSD Capabilities controls"); + +static void +capability_init(void *dummy __unused) +{ + + capability_zone = uma_zcreate("capability", + sizeof(struct capability), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, + 0); + if (capability_zone == NULL) + panic("capability_init: capability_zone not initialized"); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, capability_init, NULL); + +/* + * Test whether a capability grants the requested rights. 
+ */ +static int +cap_check(struct capability *c, cap_rights_t rights) +{ + + if ((c->cap_rights | rights) != c->cap_rights) + return (ENOTCAPABLE); + return (0); +} + +/* + * Given a file descriptor, test it against a capability rights mask and then + * return the file descriptor on which to actually perform the requested + * operation. As long as the reference to fp_cap remains valid, the returned + * pointer in *fp will remain valid, so no extra reference management is + * required, and the caller should fdrop() fp_cap as normal when done with + * both. + */ +int +cap_fextract(struct file *fp_cap, cap_rights_t rights, struct file **fpp) +{ + struct capability *c; + int error; + + if (fp_cap->f_type != DTYPE_CAPABILITY) { + *fpp = fp_cap; + return (0); + } + c = fp_cap->f_data; + error = cap_check(c, rights); + if (error) + return (error); + *fpp = c->cap_object; + return (0); +} + +/* + * Slightly different routine for memory mapping file descriptors: unwrap the + * capability and check CAP_MMAP, but also return a bitmask representing the + * maximum mapping rights the capability allows on the object. + */ +int +cap_fextract_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp, + struct file **fpp) +{ + struct capability *c; + u_char maxprot; + int error; + + if (fp_cap->f_type != DTYPE_CAPABILITY) { + *fpp = fp_cap; + *maxprotp = VM_PROT_ALL; + return (0); + } + c = fp_cap->f_data; + error = cap_check(c, rights); + if (error) + return (error); + *fpp = c->cap_object; + maxprot = 0; + if (c->cap_rights & CAP_READ) + maxprot |= VM_PROT_READ; + if (c->cap_rights & CAP_WRITE) + maxprot |= VM_PROT_WRITE; + if (c->cap_rights & CAP_MAPEXEC) + maxprot |= VM_PROT_EXECUTE; + *maxprotp = maxprot; + return (0); +} + +/* + * Extract rights from a capability for monitoring purposes -- not for use in + * any other way, as we want to keep all capability permission evaluation in + * this one file. 
+ */ +cap_rights_t +cap_rights(struct file *fp_cap) +{ + struct capability *c; + + KASSERT(fp_cap->f_type == DTYPE_CAPABILITY, + ("cap_rights: !capability")); + + c = fp_cap->f_data; + return (c->cap_rights); +} + +/* + * System call to enter capability mode for the process. + */ +int +cap_enter(struct thread *td, struct cap_enter_args *uap) +{ + struct ucred *newcred, *oldcred; + struct proc *p; + + if (td->td_ucred->cr_flags & CRED_FLAG_CAPMODE) + return (0); + + newcred = crget(); + p = td->td_proc; + PROC_LOCK(p); + oldcred = p->p_ucred; + crcopy(newcred, oldcred); + newcred->cr_flags |= CRED_FLAG_CAPMODE; + p->p_ucred = newcred; + PROC_UNLOCK(p); + crfree(oldcred); + return (0); +} + +/* + * System call to query whether the process is in capability mode. + */ +int +cap_getmode(struct thread *td, struct cap_getmode_args *uap) +{ + u_int i; + + i = (td->td_ucred->cr_flags & CRED_FLAG_CAPMODE) ? 1 : 0; + return (copyout(&i, uap->modep, sizeof(i))); +} + +/* + * System call to create a new capability reference to either an existing + * file object or an existing capability. + */ +int +cap_new(struct thread *td, struct cap_new_args *uap) +{ + int error, capfd; + int fd = uap->fd; + struct file *fp, *cap; + cap_rights_t rights = uap->rights; + + AUDIT_ARG_FD(fd); + AUDIT_ARG_RIGHTS(rights); + + /* + * We always allow creating a capability referencing an existing + * descriptor or capability, even if it's not of much use to the + * application. + */ + error = fget(td, fd, 0, &fp); + if (error) return (error); + + AUDIT_ARG_FILE(td->td_proc, fp); + + error = kern_capwrap(td, fp, rights, &cap, &capfd); + + /* + * Release our reference to the file (another one has been taken for + * the capability's sake if necessary). + */ + fdrop(fp, td); + + return error; +} + + +/* + * Create a capability to wrap around an existing file.
+ */ +int kern_capwrap(struct thread *td, struct file *fp, cap_rights_t rights, + struct file **cap, int *capfd) +{ + struct capability *c, *c_old; + struct file *fp_object; + int error; + + if ((rights | CAP_MASK_VALID) != CAP_MASK_VALID) + return (EINVAL); + + c = uma_zalloc(capability_zone, M_WAITOK | M_ZERO); + + + /* + * If a new capability is being derived from an existing capability, + * then the new capability rights must be a subset of the existing + * rights. + */ + if (fp->f_type == DTYPE_CAPABILITY) { + c_old = fp->f_data; + if ((c_old->cap_rights | rights) != c_old->cap_rights) { + error = ENOTCAPABLE; + goto fail; + } + } + + /* + * Allocate a new file descriptor to hang the capability off. + */ + error = falloc(td, cap, capfd); + if (error) + goto fail; + + /* + * Rather than nesting capabilities, directly reference the object an + * existing capability references. There's nothing else interesting + * to preserve for future use, as we've incorporated the previous + * rights mask into the new one. This prevents us from having to + * deal with capability chains. + */ + if (fp->f_type == DTYPE_CAPABILITY) + fp_object = ((struct capability *)fp->f_data)->cap_object; + else + fp_object = fp; + fhold(fp_object); + c->cap_rights = rights; + c->cap_object = fp_object; + c->cap_file = *cap; + finit(*cap, fp->f_flag, DTYPE_CAPABILITY, c, &capability_ops); + + /* + * Add this capability to the per-file list of referencing + * capabilities. + */ + mtx_pool_lock(mtxpool_sleep, fp_object); + LIST_INSERT_HEAD(&fp_object->f_caps, c, cap_filelist); + fp_object->f_capcount++; + mtx_pool_unlock(mtxpool_sleep, fp_object); + td->td_retval[0] = *capfd; + + /* + * Release our private reference (the proc filedesc still has one). + */ + fdrop(*cap, td); + + return (0); + +fail: + uma_zfree(capability_zone, c); + return (error); +} + +/* + * System call to query the rights mask associated with a capability. 
+ */ +int +cap_getrights(struct thread *td, struct cap_getrights_args *uap) +{ + struct capability *c; + struct file *fp; + int error; + + AUDIT_ARG_FD(uap->fd); + error = fgetcap(td, uap->fd, &fp); + if (error) + return (error); + c = fp->f_data; + error = copyout(&c->cap_rights, uap->rightsp, sizeof(*uap->rightsp)); + fdrop(fp, td); + return (error); +} + +/* + * When a capability is closed, simply drop the reference on the underlying + * object and free the capability. fdrop() will handle the case where the + * underlying object also needs to close, and the caller will have already + * performed any object-specific lock or mqueue handling. + */ +static int +capability_close(struct file *fp, struct thread *td) +{ + struct capability *c; + struct file *fp_object; + + KASSERT(fp->f_type == DTYPE_CAPABILITY, + ("capability_close: !capability")); + c = fp->f_data; + fp->f_ops = &badfileops; + fp->f_data = NULL; + fp_object = c->cap_object; + mtx_pool_lock(mtxpool_sleep, fp_object); + LIST_REMOVE(c, cap_filelist); + fp_object->f_capcount--; + mtx_pool_unlock(mtxpool_sleep, fp_object); + uma_zfree(capability_zone, c); + return (fdrop(fp_object, td)); +} + +/* + * In general, file descriptor operations should never make it to the + * capability, only the underlying file descriptor operation vector, so panic + * if we do turn up here. 
+ */ +static int +capability_read(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + panic("capability_read"); +} + +static int +capability_write(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + panic("capability_write"); +} + +static int +capability_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) +{ + + panic("capability_truncate"); +} + +static int +capability_ioctl(struct file *fp, u_long com, void *data, + struct ucred *active_cred, struct thread *td) +{ + + panic("capability_ioctl"); +} + +static int +capability_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + + panic("capability_poll"); +} + +static int +capability_kqfilter(struct file *fp, struct knote *kn) +{ + + panic("capability_kqfilter"); +} + +static int +capability_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, + struct thread *td) +{ + + panic("capability_stat"); +} + +#else /* !CAPABILITIES */ + +/* + * Stub Capability functions for when options CAPABILITIES isn't compiled + * into the kernel. 
+ */ +int +cap_fextract(struct file *fp_cap, cap_rights_t rights, struct file **fpp) +{ + + KASSERT(fp_cap->f_type != DTYPE_CAPABILITY, + ("cap_fextract: saw capability")); + + *fpp = fp_cap; + return (0); +} + +int +cap_fextract_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp, + struct file **fpp) +{ + + KASSERT(fp_cap->f_type != DTYPE_CAPABILITY, + ("cap_fextract_mmap: saw capability")); + + *fpp = fp_cap; + *maxprotp = VM_PROT_ALL; + return (0); +} + +int +cap_enter(struct thread *td, struct cap_enter_args *uap) +{ + + return (ENOSYS); +} + +int +cap_getmode(struct thread *td, struct cap_getmode_args *uap) +{ + + return (ENOSYS); +} + +int +cap_new(struct thread *td, struct cap_new_args *uap) +{ + + return (ENOSYS); +} + +int +cap_getrights(struct thread *td, struct cap_getrights_args *uap) +{ + + return (ENOSYS); +} + +#endif /* CAPABILITIES */ diff -aurN -x '*.orig' src-clean/sys/kern/sys_generic.c src/sys/kern/sys_generic.c --- src-clean/sys/kern/sys_generic.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/sys_generic.c 2010-08-25 10:24:35.000000000 +0200 @@ -37,12 +37,14 @@ #include __FBSDID("$FreeBSD: src/sys/kern/sys_generic.c,v 1.175.2.6.2.1 2010/06/14 02:09:06 kensmith Exp $"); +#include "opt_capabilities.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include +#include #include #include #include @@ -231,7 +233,7 @@ struct file *fp; int error; - error = fget_read(td, fd, &fp); + error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp); if (error) return (error); error = dofileread(td, fd, fp, auio, (off_t)-1, 0); @@ -274,7 +276,7 @@ struct file *fp; int error; - error = fget_read(td, fd, &fp); + error = fget_read(td, fd, CAP_READ, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) @@ -440,7 +442,7 @@ struct file *fp; int error; - error = fget_write(td, fd, &fp); + error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp); if (error) return (error); error = dofilewrite(td, fd, fp, auio, 
(off_t)-1, 0); @@ -483,7 +485,7 @@ struct file *fp; int error; - error = fget_write(td, fd, &fp); + error = fget_write(td, fd, CAP_WRITE, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) @@ -565,7 +567,7 @@ AUDIT_ARG_FD(fd); if (length < 0) return (EINVAL); - error = fget(td, fd, &fp); + error = fget(td, fd, CAP_FTRUNCATE, &fp); if (error) return (error); AUDIT_ARG_FILE(td->td_proc, fp); @@ -695,7 +697,7 @@ AUDIT_ARG_FD(fd); AUDIT_ARG_CMD(com); - if ((error = fget(td, fd, &fp)) != 0) + if ((error = fget(td, fd, CAP_IOCTL, &fp)) != 0) return (error); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { fdrop(fp, td); @@ -1053,6 +1055,37 @@ return (n); } +static __inline int +getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp) +{ + struct file *fp; +#ifdef CAPABILITIES + struct file *fp_fromcap; + int error; +#endif + + if ((fp = fget_unlocked(fdp, fd)) == NULL) + return (EBADF); +#ifdef CAPABILITIES + /* + * If the file descriptor is for a capability, test rights and use + * the file descriptor referenced by the capability. + */ + error = cap_fextract(fp, CAP_EVENT, &fp_fromcap); + if (error) { + fdrop(fp, curthread); + return (error); + } + if (fp != fp_fromcap) { + fhold(fp_fromcap); + fdrop(fp, curthread); + fp = fp_fromcap; + } +#endif /* CAPABILITIES */ + *fpp = fp; + return (0); +} + /* * Traverse the list of fds attached to this thread's seltd and check for * completion. @@ -1068,6 +1101,7 @@ struct file *fp; fd_mask bit; int fd, ev, n, idx; + int error; fdp = td->td_proc->p_fd; stp = td->td_sel; @@ -1079,8 +1113,9 @@ /* If the selinfo wasn't cleared the event didn't fire.
*/ if (si != NULL) continue; - if ((fp = fget_unlocked(fdp, fd)) == NULL) - return (EBADF); + error = getselfd_cap(fdp, fd, &fp); + if (error) + return (error); idx = fd / NFDBITS; bit = (fd_mask)1 << (fd % NFDBITS); ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); @@ -1108,6 +1143,7 @@ fd_mask bit; int ev, flags, end, fd; int n, idx; + int error; fdp = td->td_proc->p_fd; n = 0; @@ -1118,8 +1154,9 @@ flags = selflags(ibits, idx, bit); if (flags == 0) continue; - if ((fp = fget_unlocked(fdp, fd)) == NULL) - return (EBADF); + error = getselfd_cap(fdp, fd, &fp); + if (error) + return (error); selfdalloc(td, (void *)(uintptr_t)fd); ev = fo_poll(fp, flags, td->td_ucred, td); fdrop(fp, td); diff -aurN -x '*.orig' src-clean/sys/kern/sys_procdesc.c src/sys/kern/sys_procdesc.c --- src-clean/sys/kern/sys_procdesc.c 1970-01-01 01:00:00.000000000 +0100 +++ src/sys/kern/sys_procdesc.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,483 @@ +/*- + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/*- + * FreeBSD process descriptor facility. + * + * Some processes are represented by a file descriptor, which will be used in + * preference to signaling and pids for the purposes of process management, + * and is, in effect, a form of capability. When a process descriptor is + * used with a process, it ceases to be visible to certain traditional UNIX + * process facilities, such as waitpid(2). + * + * Some semantics: + * + * - At most one process descriptor will exist for any process, although + * references to that descriptor may be held from many processes (or even + * be in flight between processes over a local domain socket). + * - Last close on the process descriptor will terminate the process using + * SIGKILL and reparent it to init so that there's a process to reap it + * when it's done exiting. + * - If the process exits before the descriptor is closed, it will not + * generate SIGCHLD on termination, or be picked up by waitpid(). + * - The pdkill(2) system call may be used to deliver a signal to the process + * using its process descriptor. + * - The pdwait(2) system call may be used to block (or not) on a process + * descriptor to collect termination information. 
+ * + * Open questions: + * + * - How to handle ptrace(2)? + * - Will we want to add a pidtoprocdesc(2) system call to allow process + * descriptors to be created for processes without pfork(2)? + */ + +#include +__FBSDID("$P4"); + +#include "opt_procdesc.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifdef PROCDESC + +static uma_zone_t procdesc_zone; + +static fo_rdwr_t procdesc_read; +static fo_rdwr_t procdesc_write; +static fo_truncate_t procdesc_truncate; +static fo_ioctl_t procdesc_ioctl; +static fo_poll_t procdesc_poll; +static fo_kqfilter_t procdesc_kqfilter; +static fo_stat_t procdesc_stat; +static fo_close_t procdesc_close; + +static struct fileops procdesc_ops = { + .fo_read = procdesc_read, + .fo_write = procdesc_write, + .fo_truncate = procdesc_truncate, + .fo_ioctl = procdesc_ioctl, + .fo_poll = procdesc_poll, + .fo_kqfilter = procdesc_kqfilter, + .fo_stat = procdesc_stat, + .fo_close = procdesc_close, + .fo_flags = DFLAG_PASSABLE, +}; + +/* + * We don't currently have any MIB entries for sysctls, but we do expose + * kern.procdesc so that it's easy to tell if options PROCDESC is compiled + * into the kernel. + */ +SYSCTL_NODE(_kern, OID_AUTO, procdesc, CTLFLAG_RW, 0, + "TrustedBSD process descriptor controls"); + +/* + * Initialize with VFS so that process descriptors are available along with + * other file descriptor types. As long as it runs before init(8) starts, + * there shouldn't be a problem. + */ +static void +procdesc_init(void *dummy __unused) +{ + + procdesc_zone = uma_zcreate("procdesc", sizeof(struct procdesc), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + if (procdesc_zone == NULL) + panic("procdesc_init: procdesc_zone not initialized"); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, procdesc_init, NULL); + +/* + * Return a locked process given a process descriptor, or ESRCH if it has + * died. 
+ */ +int +procdesc_find(struct thread *td, int fd, cap_rights_t rights, + struct proc **p) +{ + struct procdesc *pd; + struct file *fp; + int error; + + error = fget(td, fd, rights, &fp); + if (error) + return (error); + if (fp->f_type != DTYPE_PROCDESC) { + error = EBADF; + goto out; + } + pd = fp->f_data; + sx_slock(&proctree_lock); + if (td->td_proc != NULL) { + *p = pd->pd_proc; + PROC_LOCK(*p); + } else + error = ESRCH; + sx_sunlock(&proctree_lock); +out: + fdrop(fp, td); + return (error); +} + +/* + * Function to be used by procstat(1) sysctls when returning procdesc + * information. + */ +pid_t +procdesc_pid(struct file *fp_procdesc) +{ + struct procdesc *pd; + + KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC, + ("procdesc_pid: !procdesc")); + + pd = fp_procdesc->f_data; + return (pd->pd_pid); +} + +/* + * System call to return the pid of a process given its process descriptor. + */ +int +pdgetpid(struct thread *td, struct pdgetpid_args *uap) +{ + struct procdesc *pd; + struct file *fp; + pid_t pid; + int error; + + AUDIT_ARG_FD(uap->fd); + error = fget(td, uap->fd, CAP_PDGETPID, &fp); + if (error) + return (error); + if (fp->f_type == DTYPE_PROCDESC) { + pd = fp->f_data; + pid = pd->pd_pid; + AUDIT_ARG_PID(pid); + } else + error = EBADF; + fdrop(fp, td); + if (error == 0) + error = copyout(&pid, uap->pidp, sizeof(pid)); + return (error); +} + +/* + * When a new process is forked by pdfork(), a file descriptor is allocated + * by the fork code first, then the process is forked, and then we get a + * chance to set up the process descriptor. Failure is not permitted at this + * point, so procdesc_new() must succeed. 
+ */ +void +procdesc_new(struct proc *p, struct file *fp_procdesc) +{ + struct procdesc *pd; + + pd = uma_zalloc(procdesc_zone, M_WAITOK | M_ZERO); + pd->pd_proc = p; + pd->pd_pid = p->p_pid; + p->p_procdesc = pd; + PROCDESC_LOCK_INIT(pd); + + /* + * Process descriptors start out with two references: one from their + * struct file, and the other from their struct proc. + */ + refcount_init(&pd->pd_refcount, 2); + + /* XXXRW: Why these flags? */ + finit(fp_procdesc, FREAD | FWRITE, DTYPE_PROCDESC, pd, + &procdesc_ops); +} + +static void +procdesc_free(struct procdesc *pd) +{ + + /* + * When the last reference is released, we assert that the descriptor + * has been closed, but not that the process has exited, as we will + * detach the descriptor before the process dies if the descriptor is + * closed, as we can't wait synchronously. + */ + if (refcount_release(&pd->pd_refcount)) { + KASSERT(pd->pd_proc == NULL, + ("procdesc_free: pd_proc != NULL")); + KASSERT((pd->pd_flags & PD_CLOSED), + ("procdesc_free: !PD_CLOSED")); + + PROCDESC_LOCK_DESTROY(pd); + uma_zfree(procdesc_zone, pd); + } +} + +/* + * procdesc_exit() - notify a process descriptor that its process is exiting. + * We use the proctree_lock to ensure that process exit either happens + * strictly before or strictly after a concurrent call to procdesc_close(). + */ +int +procdesc_exit(struct proc *p) +{ + struct procdesc *pd; + + sx_assert(&proctree_lock, SA_XLOCKED); + PROC_LOCK_ASSERT(p, MA_OWNED); + KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL")); + + pd = p->p_procdesc; + + PROCDESC_LOCK(pd); + KASSERT((pd->pd_flags & PD_CLOSED) == 0 || p->p_pptr == initproc, + ("procdesc_exit: closed && parent not init")); + + pd->pd_flags |= PD_EXITED; + + /* + * If the process descriptor has been closed, then we have nothing + * to do; return 1 so that init will get SIGCHLD and do the reaping. + * Clean up the procdesc now rather than letting it happen during + * that reap.
+ */ + if (pd->pd_flags & PD_CLOSED) { + PROCDESC_UNLOCK(pd); + pd->pd_proc = NULL; + p->p_procdesc = NULL; + procdesc_free(pd); + return (1); + } + if (pd->pd_flags & PD_SELECTED) { + pd->pd_flags &= ~PD_SELECTED; + selwakeup(&pd->pd_selinfo); + } + PROCDESC_UNLOCK(pd); + return (0); +} + +/* + * When a process descriptor is reaped, perhaps as a result of close() or + * pdwait4(), release the process's reference on the process descriptor. + */ +void +procdesc_reap(struct proc *p) +{ + struct procdesc *pd; + + sx_assert(&proctree_lock, SA_XLOCKED); + KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL")); + + pd = p->p_procdesc; + pd->pd_proc = NULL; + procdesc_free(pd); +} + +/* + * procdesc_close() - last close on a process descriptor. If the process is + * still running, terminate with SIGKILL and let init(8) clean up the mess; + * if not, we have to clean up the zombie ourselves. + */ +static int +procdesc_close(struct file *fp, struct thread *td) +{ + struct procdesc *pd; + struct proc *p; + + KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc")); + + pd = fp->f_data; + fp->f_ops = &badfileops; + fp->f_data = NULL; + + sx_xlock(&proctree_lock); + PROCDESC_LOCK(pd); + pd->pd_flags |= PD_CLOSED; + PROCDESC_UNLOCK(pd); + p = pd->pd_proc; + PROC_LOCK(p); + if (p->p_state == PRS_ZOMBIE) { + /* + * If the process is already dead and just awaiting reaping, + * do that now. This will release the process's reference to + * the process descriptor when it calls back into + * procdesc_reap(). + */ + PROC_SLOCK(p); + proc_reap(curthread, p, NULL, 0, NULL); + } else { + /* + * If the process is not yet dead, we need to kill it, but we + * can't wait around synchronously for it to go away, as that + * path leads to madness (and deadlocks). First, detach the + * process from its descriptor so that its exit status will + * be reported normally. 
+ */ + pd->pd_proc = NULL; + p->p_procdesc = NULL; + procdesc_free(pd); + + /* + * Next, reparent it to init(8) so that there's someone to + * pick up the pieces; finally, terminate with prejudice. + */ + p->p_sigparent = SIGCHLD; + proc_reparent(p, initproc); + psignal(p, SIGKILL); + PROC_UNLOCK(p); + sx_xunlock(&proctree_lock); + } + + /* + * Release the file descriptor's reference on the process descriptor. + */ + procdesc_free(pd); + return (0); +} + +static int +procdesc_read(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + return (EOPNOTSUPP); +} + +static int +procdesc_write(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + return (EOPNOTSUPP); +} + +static int +procdesc_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) +{ + + return (EOPNOTSUPP); +} + +static int +procdesc_ioctl(struct file *fp, u_long com, void *data, + struct ucred *active_cred, struct thread *td) +{ + + return (EOPNOTSUPP); +} + +static int +procdesc_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + struct procdesc *pd; + int revents; + + revents = 0; + pd = fp->f_data; + PROCDESC_LOCK(pd); + if (pd->pd_flags & PD_EXITED) + revents |= POLLHUP; + if (revents == 0) { + selrecord(td, &pd->pd_selinfo); + pd->pd_flags |= PD_SELECTED; + } + PROCDESC_UNLOCK(pd); + return (revents); +} + +static int +procdesc_kqfilter(struct file *fp, struct knote *kn) +{ + + return (EOPNOTSUPP); +} + +static int +procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, + struct thread *td) +{ + struct procdesc *pd; + + /* + * XXXRW: Perhaps we should cache some more information from the + * process so that we can return it reliably here even after it has + * died. For example, caching its credential data. 
+ */ + bzero(sb, sizeof(*sb)); + pd = fp->f_data; + sx_slock(&proctree_lock); + if (pd->pd_proc != NULL) { + PROC_LOCK(pd->pd_proc); + if (pd->pd_proc->p_state != PRS_ZOMBIE) + sb->st_mode = S_IFREG | S_IRWXU; + else + sb->st_mode = S_IFREG; + sb->st_uid = pd->pd_proc->p_ucred->cr_ruid; + sb->st_gid = pd->pd_proc->p_ucred->cr_rgid; + PROC_UNLOCK(pd->pd_proc); + } else + sb->st_mode = S_IFREG; + sx_sunlock(&proctree_lock); + return (0); +} + +#else /* !PROCDESC */ + +int +pdgetpid(struct thread *td, struct pdgetpid_args *uap) +{ + + return (ENOSYS); +} + +#endif /* PROCDESC */ diff -aurN -x '*.orig' src-clean/sys/kern/syscalls.c src/sys/kern/syscalls.c --- src-clean/sys/kern/syscalls.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/syscalls.c 2010-08-25 10:24:35.000000000 +0200 @@ -521,13 +521,13 @@ "msgctl", /* 511 = msgctl */ "shmctl", /* 512 = shmctl */ "lpathconf", /* 513 = lpathconf */ - "#514", /* 514 = cap_new */ - "#515", /* 515 = cap_getrights */ - "#516", /* 516 = cap_enter */ - "#517", /* 517 = cap_getmode */ - "#518", /* 518 = pdfork */ - "#519", /* 519 = pdkill */ - "#520", /* 520 = pdgetpid */ - "#521", /* 521 = pdwait */ + "cap_new", /* 514 = cap_new */ + "cap_getrights", /* 515 = cap_getrights */ + "cap_enter", /* 516 = cap_enter */ + "cap_getmode", /* 517 = cap_getmode */ + "pdfork", /* 518 = pdfork */ + "pdkill", /* 519 = pdkill */ + "pdgetpid", /* 520 = pdgetpid */ + "pdwait", /* 521 = pdwait */ "pselect", /* 522 = pselect */ }; diff -aurN -x '*.orig' src-clean/sys/kern/syscalls.master src/sys/kern/syscalls.master --- src-clean/sys/kern/syscalls.master 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/syscalls.master 2010-08-25 10:24:35.000000000 +0200 @@ -911,14 +911,25 @@ 512 AUE_SHMCTL NOSTD { int shmctl(int shmid, int cmd, \ struct shmid_ds *buf); } 513 AUE_LPATHCONF STD { int lpathconf(char *path, int name); } -514 AUE_CAP_NEW UNIMPL cap_new -515 AUE_CAP_GETRIGHTS UNIMPL cap_getrights -516 AUE_CAP_ENTER UNIMPL cap_enter -517 
AUE_CAP_GETMODE UNIMPL cap_getmode -518 AUE_PDFORK UNIMPL pdfork -519 AUE_PDKILL UNIMPL pdkill -520 AUE_PDGETPID UNIMPL pdgetpid -521 AUE_PDWAIT UNIMPL pdwait + +; +; Capability system calls. +; +514 AUE_CAP_NEW STD { int cap_new(int fd, u_int64_t rights); } +515 AUE_CAP_GETRIGHTS STD { int cap_getrights(int fd, \ + u_int64_t *rightsp); } +516 AUE_CAP_ENTER STD { int cap_enter(void); } +517 AUE_CAP_GETMODE STD { int cap_getmode(u_int *modep); } + +; +; Process descriptor system calls. These need audit event identifiers. +; +518 AUE_NULL STD { int pdfork(int *fdp); } +519 AUE_NULL STD { int pdkill(int fd, int signum); } +520 AUE_NULL STD { int pdgetpid(int fd, pid_t *pidp); } +521 AUE_NULL STD { int pdwait(int fd, int *status, \ + int options, struct rusage *rusage); } + 522 AUE_SELECT STD { int pselect(int nd, fd_set *in, \ fd_set *ou, fd_set *ex, \ const struct timespec *ts, \ diff -aurN -x '*.orig' src-clean/sys/kern/systrace_args.c src/sys/kern/systrace_args.c --- src-clean/sys/kern/systrace_args.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/systrace_args.c 2010-08-25 10:24:35.000000000 +0200 @@ -3072,6 +3072,67 @@ *n_args = 2; break; } + /* cap_new */ + case 514: { + struct cap_new_args *p = params; + iarg[0] = p->fd; /* int */ + uarg[1] = p->rights; /* u_int64_t */ + *n_args = 2; + break; + } + /* cap_getrights */ + case 515: { + struct cap_getrights_args *p = params; + iarg[0] = p->fd; /* int */ + uarg[1] = (intptr_t) p->rightsp; /* u_int64_t * */ + *n_args = 2; + break; + } + /* cap_enter */ + case 516: { + *n_args = 0; + break; + } + /* cap_getmode */ + case 517: { + struct cap_getmode_args *p = params; + uarg[0] = (intptr_t) p->modep; /* u_int * */ + *n_args = 1; + break; + } + /* pdfork */ + case 518: { + struct pdfork_args *p = params; + uarg[0] = (intptr_t) p->fdp; /* int * */ + *n_args = 1; + break; + } + /* pdkill */ + case 519: { + struct pdkill_args *p = params; + iarg[0] = p->fd; /* int */ + iarg[1] = p->signum; /* int */ + *n_args = 2; + 
break; + } + /* pdgetpid */ + case 520: { + struct pdgetpid_args *p = params; + iarg[0] = p->fd; /* int */ + uarg[1] = (intptr_t) p->pidp; /* pid_t * */ + *n_args = 2; + break; + } + /* pdwait */ + case 521: { + struct pdwait_args *p = params; + iarg[0] = p->fd; /* int */ + uarg[1] = (intptr_t) p->status; /* int * */ + iarg[2] = p->options; /* int */ + uarg[3] = (intptr_t) p->rusage; /* struct rusage * */ + *n_args = 4; + break; + } /* pselect */ case 522: { struct pselect_args *p = params; @@ -8166,6 +8227,100 @@ break; }; break; + /* cap_new */ + case 514: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "u_int64_t"; + break; + default: + break; + }; + break; + /* cap_getrights */ + case 515: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "u_int64_t *"; + break; + default: + break; + }; + break; + /* cap_enter */ + case 516: + break; + /* cap_getmode */ + case 517: + switch(ndx) { + case 0: + p = "u_int *"; + break; + default: + break; + }; + break; + /* pdfork */ + case 518: + switch(ndx) { + case 0: + p = "int *"; + break; + default: + break; + }; + break; + /* pdkill */ + case 519: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "int"; + break; + default: + break; + }; + break; + /* pdgetpid */ + case 520: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "pid_t *"; + break; + default: + break; + }; + break; + /* pdwait */ + case 521: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "int *"; + break; + case 2: + p = "int"; + break; + case 3: + p = "struct rusage *"; + break; + default: + break; + }; + break; /* pselect */ case 522: switch(ndx) { diff -aurN -x '*.orig' src-clean/sys/kern/tty.c src/sys/kern/tty.c --- src-clean/sys/kern/tty.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/tty.c 2010-08-25 10:24:35.000000000 +0200 @@ -30,9 +30,11 @@ #include __FBSDID("$FreeBSD: src/sys/kern/tty.c,v 1.328.2.4.2.1 2010/06/14 02:09:06 kensmith Exp $"); +#include "opt_capabilities.h" 
#include "opt_compat.h" #include +#include #include #include #include @@ -1775,20 +1777,12 @@ struct file *fp; struct cdev *dev; struct cdevsw *cdp; - struct filedesc *fdp; int error; /* Validate the file descriptor. */ - if ((fdp = p->p_fd) == NULL) - return (EBADF); - - fp = fget_unlocked(fdp, fd); - if (fp == NULL) - return (EBADF); - if (fp->f_ops == &badfileops) { - error = EBADF; - goto done1; - } + error = fget(curthread, fd, CAP_TTYHOOK, &fp); + if (error) + return (error); /* * Make sure the vnode is bound to a character device. diff -aurN -x '*.orig' src-clean/sys/kern/uipc_mqueue.c src/sys/kern/uipc_mqueue.c --- src-clean/sys/kern/uipc_mqueue.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/uipc_mqueue.c 2010-08-25 10:24:35.000000000 +0200 @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -2071,19 +2072,19 @@ return (error); } -typedef int (*_fgetf)(struct thread *, int, struct file **); +typedef int (*_fgetf)(struct thread *, int, cap_rights_t, struct file **); /* * Get message queue by giving file slot */ static int -_getmq(struct thread *td, int fd, _fgetf func, +_getmq(struct thread *td, int fd, cap_rights_t rights, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; - error = func(td, fd, fpp); + error = func(td, fd, rights, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { @@ -2102,21 +2103,21 @@ getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { - return _getmq(td, fd, fget, fpp, ppn, pmq); + return _getmq(td, fd, CAP_EVENT, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { - return _getmq(td, fd, fget_read, fpp, ppn, pmq); + return _getmq(td, fd, CAP_READ, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, 
struct mqueue **pmq) { - return _getmq(td, fd, fget_write, fpp, ppn, pmq); + return _getmq(td, fd, CAP_WRITE, fget_write, fpp, ppn, pmq); } static int @@ -2227,7 +2228,7 @@ struct filedesc *fdp; struct proc *p; struct mqueue *mq; - struct file *fp; + struct file *fp, *fp2; struct mqueue_notifier *nt, *newnt = NULL; int error; @@ -2251,8 +2252,13 @@ return (error); again: FILEDESC_SLOCK(fdp); - if (fget_locked(fdp, uap->mqd) != fp) { + error = cap_fextract(fget_locked(fdp, uap->mqd), CAP_EVENT, &fp2); + if (error) { FILEDESC_SUNLOCK(fdp); + goto out; + } + if (fp2 != fp) { + FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } diff -aurN -x '*.orig' src-clean/sys/kern/uipc_sem.c src/sys/kern/uipc_sem.c --- src-clean/sys/kern/uipc_sem.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/uipc_sem.c 2010-08-25 10:24:35.000000000 +0200 @@ -38,6 +38,7 @@ #include "opt_posix.h" #include +#include #include #include #include @@ -116,7 +117,8 @@ semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32); static void ksem_drop(struct ksem *ks); -static int ksem_get(struct thread *td, semid_t id, struct file **fpp); +static int ksem_get(struct thread *td, semid_t id, cap_rights_t rights, + struct file **fpp); static struct ksem *ksem_hold(struct ksem *ks); static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks); static struct ksem *ksem_lookup(char *path, Fnv32_t fnv); @@ -525,13 +527,14 @@ } static int -ksem_get(struct thread *td, semid_t id, struct file **fpp) +ksem_get(struct thread *td, semid_t id, cap_rights_t rights, + struct file **fpp) { struct ksem *ks; struct file *fp; int error; - error = fget(td, id, &fp); + error = fget(td, id, rights, &fp); if (error) return (EINVAL); if (fp->f_type != DTYPE_SEM) { @@ -623,7 +626,8 @@ struct file *fp; int error; - error = ksem_get(td, uap->id, &fp); + /* XXXRW: No capability required here. 
*/ + error = ksem_get(td, uap->id, 0, &fp); if (error) return (error); ks = fp->f_data; @@ -648,7 +652,7 @@ struct ksem *ks; int error; - error = ksem_get(td, uap->id, &fp); + error = ksem_get(td, uap->id, CAP_SEM_POST, &fp); if (error) return (error); ks = fp->f_data; @@ -738,7 +742,7 @@ int error; DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid)); - error = ksem_get(td, id, &fp); + error = ksem_get(td, id, CAP_SEM_WAIT, &fp); if (error) return (error); ks = fp->f_data; @@ -804,7 +808,7 @@ struct ksem *ks; int error, val; - error = ksem_get(td, uap->id, &fp); + error = ksem_get(td, uap->id, CAP_SEM_GETVALUE, &fp); if (error) return (error); ks = fp->f_data; @@ -838,7 +842,8 @@ struct ksem *ks; int error; - error = ksem_get(td, uap->id, &fp); + /* XXXRW: No capability required since basically a close wrapper? */ + error = ksem_get(td, uap->id, 0, &fp); if (error) return (error); ks = fp->f_data; diff -aurN -x '*.orig' src-clean/sys/kern/uipc_shm.c src/sys/kern/uipc_shm.c --- src-clean/sys/kern/uipc_shm.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/uipc_shm.c 2010-08-25 10:24:35.000000000 +0200 @@ -488,6 +488,14 @@ mode_t cmode; int fd, error; + /* + * shm_open(2) of anonymous objects is allowed in capability mode, + * but naming of globally scoped objects is not. 
+ */ + if ((td->td_ucred->cr_flags & CRED_FLAG_CAPMODE) && + (uap->path != SHM_ANON)) + return (ENOSYS); + if ((uap->flags & O_ACCMODE) != O_RDONLY && (uap->flags & O_ACCMODE) != O_RDWR) return (EINVAL); diff -aurN -x '*.orig' src-clean/sys/kern/uipc_syscalls.c src/sys/kern/uipc_syscalls.c --- src-clean/sys/kern/uipc_syscalls.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/uipc_syscalls.c 2010-08-25 10:24:35.000000000 +0200 @@ -38,10 +38,12 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" +#include "opt_capabilities.h" #include "opt_compat.h" #include "opt_ktrace.h" #include +#include #include #include #include @@ -118,33 +120,47 @@ "Number of sendfile(2) sf_bufs in use"); /* - * Convert a user file descriptor to a kernel file entry. A reference on the - * file entry is held upon returning. This is lighter weight than - * fgetsock(), which bumps the socket reference drops the file reference - * count instead, as this approach avoids several additional mutex operations - * associated with the additional reference count. If requested, return the - * open file flags. + * Convert a user file descriptor to a kernel file entry and check that, if + * it is a capability, the right rights are present. A reference on the file + * entry is held upon returning. */ static int -getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp) +getsock_cap(struct filedesc *fdp, int fd, cap_rights_t rights, + struct file **fpp, u_int *fflagp) { struct file *fp; +#ifdef CAPABILITIES + struct file *fp_fromcap; int error; +#endif fp = NULL; - if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL) { - error = EBADF; - } else if (fp->f_type != DTYPE_SOCKET) { + if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL) + return (EBADF); +#ifdef CAPABILITIES + /* + * If the file descriptor is for a capability, test rights and use + * the file descriptor referenced by the capability. 
+ */ + error = cap_fextract(fp, rights, &fp_fromcap); + if (error) { fdrop(fp, curthread); - fp = NULL; - error = ENOTSOCK; - } else { - if (fflagp != NULL) - *fflagp = fp->f_flag; - error = 0; + return (error); + } + if (fp != fp_fromcap) { + fhold(fp_fromcap); + fdrop(fp, curthread); + fp = fp_fromcap; } +#endif /* CAPABILITIES */ + if (fp->f_type != DTYPE_SOCKET) { + fdrop(fp, curthread); + return (ENOTSOCK); + } + if (fflagp != NULL) + *fflagp = fp->f_flag; *fpp = fp; - return (error); + return (0); } /* @@ -224,7 +240,7 @@ int error; AUDIT_ARG_FD(fd); - error = getsock(td->td_proc->p_fd, fd, &fp, NULL); + error = getsock_cap(td->td_proc->p_fd, fd, CAP_BIND, &fp, NULL); if (error) return (error); so = fp->f_data; @@ -255,7 +271,8 @@ int error; AUDIT_ARG_FD(uap->s); - error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL); + error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_LISTEN, &fp, + NULL); if (error == 0) { so = fp->f_data; #ifdef MAC @@ -350,7 +367,7 @@ AUDIT_ARG_FD(s); fdp = td->td_proc->p_fd; - error = getsock(fdp, s, &headfp, &fflag); + error = getsock_cap(fdp, s, CAP_ACCEPT, &headfp, &fflag); if (error) return (error); head = headfp->f_data; @@ -540,7 +557,7 @@ int interrupted = 0; AUDIT_ARG_FD(fd); - error = getsock(td->td_proc->p_fd, fd, &fp, NULL); + error = getsock_cap(td->td_proc->p_fd, fd, CAP_CONNECT, &fp, NULL); if (error) return (error); so = fp->f_data; @@ -682,6 +699,12 @@ struct sockaddr *to; int error; +#ifdef CAPABILITIES + if ((td->td_ucred->cr_flags & CRED_FLAG_CAPMODE) && + (mp->msg_name != NULL)) + return (ENOSYS); +#endif + if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) { @@ -744,12 +767,16 @@ struct socket *so; int i; int len, error; + cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif AUDIT_ARG_FD(s); - error = getsock(td->td_proc->p_fd, s, &fp, NULL); + rights = CAP_WRITE; + if (mp->msg_name != NULL) + rights |= CAP_CONNECT; + error = 
getsock_cap(td->td_proc->p_fd, s, rights, &fp, NULL); if (error) return (error); so = (struct socket *)fp->f_data; @@ -949,7 +976,7 @@ *controlp = 0; AUDIT_ARG_FD(s); - error = getsock(td->td_proc->p_fd, s, &fp, NULL); + error = getsock_cap(td->td_proc->p_fd, s, CAP_READ, &fp, NULL); if (error) return (error); so = fp->f_data; @@ -1265,7 +1292,8 @@ int error; AUDIT_ARG_FD(uap->s); - error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL); + error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_SHUTDOWN, &fp, + NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, uap->how); @@ -1328,7 +1356,7 @@ } AUDIT_ARG_FD(s); - error = getsock(td->td_proc->p_fd, s, &fp, NULL); + error = getsock_cap(td->td_proc->p_fd, s, CAP_SHUTDOWN, &fp, NULL); if (error == 0) { so = fp->f_data; CURVNET_SET(so->so_vnet); @@ -1409,7 +1437,7 @@ } AUDIT_ARG_FD(s); - error = getsock(td->td_proc->p_fd, s, &fp, NULL); + error = getsock_cap(td->td_proc->p_fd, s, CAP_GETSOCKOPT, &fp, NULL); if (error == 0) { so = fp->f_data; CURVNET_SET(so->so_vnet); @@ -1473,7 +1501,8 @@ return (EINVAL); AUDIT_ARG_FD(fd); - error = getsock(td->td_proc->p_fd, fd, &fp, NULL); + error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETSOCKNAME, &fp, + NULL); if (error) return (error); so = fp->f_data; @@ -1573,7 +1602,8 @@ return (EINVAL); AUDIT_ARG_FD(fd); - error = getsock(td->td_proc->p_fd, fd, &fp, NULL); + error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETPEERNAME, &fp, + NULL); if (error) return (error); so = fp->f_data; @@ -1829,7 +1859,8 @@ * we send only the header/trailer and no payload data. */ AUDIT_ARG_FD(uap->fd); - if ((error = fgetvp_read(td, uap->fd, &vp)) != 0) + if ((error = fgetvp_read(td, uap->fd, CAP_READ | CAP_SEEK, &vp)) + != 0) goto out; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_SHARED | LK_RETRY); @@ -1867,8 +1898,8 @@ * The socket must be a stream socket and connected. * Remember if it a blocking or non-blocking socket. 
*/ - if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp, - NULL)) != 0) + if ((error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_WRITE, + &sock_fp, NULL)) != 0) goto out; so = sock_fp->f_data; if (so->so_type != SOCK_STREAM) { @@ -2299,7 +2330,7 @@ fdp = td->td_proc->p_fd; AUDIT_ARG_FD(uap->sd); - error = fgetsock(td, uap->sd, &head, &fflag); + error = fgetsock(td, uap->sd, CAP_PEELOFF, &head, &fflag); if (error) goto done2; error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); @@ -2393,6 +2424,7 @@ #endif struct uio auio; struct iovec iov[1]; + cap_rights_t rights; if (uap->sinfo) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); @@ -2400,16 +2432,20 @@ return (error); u_sinfo = &sinfo; } + + rights = CAP_WRITE; if (uap->tolen) { error = getsockaddr(&to, uap->to, uap->tolen); if (error) { to = NULL; goto sctp_bad2; } + rights |= CAP_CONNECT; } AUDIT_ARG_FD(uap->sd); - error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); + /* XXXRW: Is this use of rights right for SCTP? 
*/ + error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL); if (error) goto sctp_bad; #ifdef KTRACE @@ -2497,6 +2533,7 @@ #endif struct uio auio; struct iovec *iov, *tiov; + cap_rights_t rights; if (uap->sinfo) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); @@ -2504,16 +2541,18 @@ return (error); u_sinfo = &sinfo; } + rights = CAP_WRITE; if (uap->tolen) { error = getsockaddr(&to, uap->to, uap->tolen); if (error) { to = NULL; goto sctp_bad2; } + rights |= CAP_CONNECT; } AUDIT_ARG_FD(uap->sd); - error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); + error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL); if (error) goto sctp_bad1; @@ -2621,7 +2660,7 @@ #endif AUDIT_ARG_FD(uap->sd); - error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); + error = getsock_cap(td->td_proc->p_fd, uap->sd, CAP_READ, &fp, NULL); if (error) { return (error); } diff -aurN -x '*.orig' src-clean/sys/kern/uipc_usrreq.c src/sys/kern/uipc_usrreq.c --- src-clean/sys/kern/uipc_usrreq.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/uipc_usrreq.c 2010-08-25 10:24:35.000000000 +0200 @@ -50,7 +50,8 @@ * garbage collector to find and tear down cycles of disconnected sockets. * * TODO: - * SEQPACKET, RDM + * RDM + * distinguish datagram size limits from flow control limits in SEQPACKET * rethink name space problems * need a proper out-of-band */ @@ -112,6 +113,7 @@ static int unp_rights; /* (g) File descriptors in flight. */ static struct unp_head unp_shead; /* (l) List of stream sockets. */ static struct unp_head unp_dhead; /* (l) List of datagram sockets. */ +static struct unp_head unp_sphead; /* (l) List of seqpacket sockets. 
*/ static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; @@ -139,10 +141,14 @@ static u_long unpst_recvspace = PIPSIZ; static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ static u_long unpdg_recvspace = 4*1024; +static u_long unpsp_sendspace = PIPSIZ; /* really max datagram size */ +static u_long unpsp_recvspace = PIPSIZ; SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); +SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket, CTLFLAG_RW, 0, + "SOCK_SEQPACKET"); SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, &unpst_sendspace, 0, "Default stream send space."); @@ -152,6 +158,10 @@ &unpdg_sendspace, 0, "Default datagram send space."); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, &unpdg_recvspace, 0, "Default datagram receive space."); +SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW, + &unpsp_sendspace, 0, "Default seqpacket send space."); +SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW, + &unpsp_recvspace, 0, "Default seqpacket receive space."); SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "File descriptors in flight."); @@ -257,6 +267,7 @@ */ static struct domain localdomain; static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream; +static struct pr_usrreqs uipc_usrreqs_seqpacket; static struct protosw localsw[] = { { .pr_type = SOCK_STREAM, @@ -271,6 +282,19 @@ .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS, .pr_usrreqs = &uipc_usrreqs_dgram }, +{ + .pr_type = SOCK_SEQPACKET, + .pr_domain = &localdomain, + + /* + * XXXRW: For now, PR_ADDR because soreceive will bump into them + * due to our use of sbappendaddr. A new sbappend variant is needed + * that supports both atomic record writes and control data. 
+ */ + .pr_flags = PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|PR_WANTRCVD| + PR_RIGHTS, + .pr_usrreqs = &uipc_usrreqs_seqpacket, +}, }; static struct domain localdomain = { @@ -353,6 +377,11 @@ recvspace = unpdg_recvspace; break; + case SOCK_SEQPACKET: + sendspace = unpsp_sendspace; + recvspace = unpsp_recvspace; + break; + default: panic("uipc_attach"); } @@ -372,8 +401,22 @@ UNP_LIST_LOCK(); unp->unp_gencnt = ++unp_gencnt; unp_count++; - LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead, - unp, unp_link); + switch (so->so_type) { + case SOCK_STREAM: + LIST_INSERT_HEAD(&unp_shead, unp, unp_link); + break; + + case SOCK_DGRAM: + LIST_INSERT_HEAD(&unp_dhead, unp, unp_link); + break; + + case SOCK_SEQPACKET: + LIST_INSERT_HEAD(&unp_sphead, unp, unp_link); + break; + + default: + panic("uipc_attach"); + } UNP_LIST_UNLOCK(); return (0); @@ -705,11 +748,8 @@ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL")); - if (so->so_type == SOCK_DGRAM) - panic("uipc_rcvd DGRAM?"); - - if (so->so_type != SOCK_STREAM) - panic("uipc_rcvd unknown socktype"); + if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET) + panic("uipc_rcvd socktype %d", so->so_type); /* * Adjust backpressure on sender and wakeup any waiting to write. @@ -824,6 +864,7 @@ break; } + case SOCK_SEQPACKET: case SOCK_STREAM: if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { @@ -875,11 +916,33 @@ * Send to paired receive port, and then reduce send buffer * hiwater marks to maintain backpressure. Wake up readers. 
*/ - if (control != NULL) { - if (sbappendcontrol_locked(&so2->so_rcv, m, control)) + switch (so->so_type) { + case SOCK_STREAM: + if (control != NULL) { + if (sbappendcontrol_locked(&so2->so_rcv, m, + control)) + control = NULL; + } else + sbappend_locked(&so2->so_rcv, m); + break; + + case SOCK_SEQPACKET: { + const struct sockaddr *from; + + from = &sun_noname; + if (sbappendaddr_locked(&so2->so_rcv, from, m, + control)) control = NULL; - } else - sbappend_locked(&so2->so_rcv, m); + break; + } + } + + /* + * XXXRW: While fine for SOCK_STREAM, this conflates maximum + * datagram size and back-pressure for SOCK_SEQPACKET, which + * can lead to undesired return of EMSGSIZE on send instead + * of more desirable blocking. + */ mbcnt_delta = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt; unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt; sbcc = so2->so_rcv.sb_cc; @@ -939,7 +1002,8 @@ UNP_LINK_RLOCK(); UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; - if (so->so_type == SOCK_STREAM && unp2 != NULL) { + if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) && + unp2 != NULL) { so2 = unp2->unp_socket; sb->st_blksize += so2->so_rcv.sb_cc; } @@ -1009,6 +1073,26 @@ .pru_close = uipc_close, }; +static struct pr_usrreqs uipc_usrreqs_seqpacket = { + .pru_abort = uipc_abort, + .pru_accept = uipc_accept, + .pru_attach = uipc_attach, + .pru_bind = uipc_bind, + .pru_connect = uipc_connect, + .pru_connect2 = uipc_connect2, + .pru_detach = uipc_detach, + .pru_disconnect = uipc_disconnect, + .pru_listen = uipc_listen, + .pru_peeraddr = uipc_peeraddr, + .pru_rcvd = uipc_rcvd, + .pru_send = uipc_send, + .pru_sense = uipc_sense, + .pru_shutdown = uipc_shutdown, + .pru_sockaddr = uipc_sockaddr, + .pru_soreceive = soreceive_generic, /* XXX: or...? 
*/ + .pru_close = uipc_close, +}; + static struct pr_usrreqs uipc_usrreqs_stream = { .pru_abort = uipc_abort, .pru_accept = uipc_accept, @@ -1306,6 +1390,7 @@ break; case SOCK_STREAM: + case SOCK_SEQPACKET: unp2->unp_conn = unp; if (req == PRU_CONNECT && ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) @@ -1343,6 +1428,7 @@ break; case SOCK_STREAM: + case SOCK_SEQPACKET: soisdisconnected(unp->unp_socket); unp2->unp_conn = NULL; soisdisconnected(unp2->unp_socket); @@ -1368,7 +1454,22 @@ struct unp_head *head; struct xunpcb *xu; - head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); + switch ((intptr_t)arg1) { + case SOCK_STREAM: + head = &unp_shead; + break; + + case SOCK_DGRAM: + head = &unp_dhead; + break; + + case SOCK_SEQPACKET: + head = &unp_sphead; + break; + + default: + panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1); + } /* * The process of preparing the PCB list is too time-consuming and @@ -1481,6 +1582,9 @@ SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); +SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist, CTLFLAG_RD, + (caddr_t)(long)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb", + "List of active local seqpacket sockets"); static void unp_shutdown(struct unpcb *unp) @@ -1492,7 +1596,8 @@ UNP_PCB_LOCK_ASSERT(unp); unp2 = unp->unp_conn; - if (unp->unp_socket->so_type == SOCK_STREAM && unp2 != NULL) { + if ((unp->unp_socket->so_type == SOCK_STREAM || + (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) { so = unp2->unp_socket; if (so != NULL) socantrcvmore(so); @@ -1658,6 +1763,7 @@ NULL, EVENTHANDLER_PRI_ANY); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); + LIST_INIT(&unp_sphead); TASK_INIT(&unp_gc_task, 0, unp_gc, NULL); UNP_LINK_LOCK_INIT(); UNP_LIST_LOCK_INIT(); @@ -1934,6 +2040,11 @@ * Check for a socket potentially in a cycle. 
It must be in a * queue as indicated by msgcount, and this must equal the file * reference count. Note that when msgcount is 0 the file is NULL. + * + * XXXRW: This will need to change to also look at the capabilities + * referencing the file descriptor. It must: (a) subtract the number + * of capability references, and (b) add the non-message references + * to the capabilities themselves. */ if ((unp->unp_gcflag & UNPGC_REF) == 0 && fp && unp->unp_msgcount != 0 && fp->f_count == unp->unp_msgcount) { @@ -1974,7 +2085,8 @@ static void unp_gc(__unused void *arg, int pending) { - struct unp_head *heads[] = { &unp_dhead, &unp_shead, NULL }; + struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead, + NULL }; struct unp_head **head; struct file **unref; struct unpcb *unp; diff -aurN -x '*.orig' src-clean/sys/kern/vfs_acl.c src/sys/kern/vfs_acl.c --- src-clean/sys/kern/vfs_acl.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/vfs_acl.c 2010-08-25 10:24:35.000000000 +0200 @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -408,7 +409,8 @@ struct file *fp; int vfslocked, error; - error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); + error = getvnode_cap(td->td_proc->p_fd, uap->filedes, CAP_ACL_GET, + &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp); @@ -427,7 +429,8 @@ struct file *fp; int vfslocked, error; - error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); + error = getvnode_cap(td->td_proc->p_fd, uap->filedes, CAP_ACL_SET, + &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp); @@ -486,7 +489,8 @@ struct file *fp; int vfslocked, error; - error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); + error = getvnode_cap(td->td_proc->p_fd, uap->filedes, CAP_ACL_DELETE, + &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = 
vacl_delete(td, fp->f_vnode, uap->type); @@ -545,7 +549,8 @@ struct file *fp; int vfslocked, error; - error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); + error = getvnode_cap(td->td_proc->p_fd, uap->filedes, CAP_ACL_CHECK, + &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp); diff -aurN -x '*.orig' src-clean/sys/kern/vfs_aio.c src/sys/kern/vfs_aio.c --- src-clean/sys/kern/vfs_aio.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/vfs_aio.c 2010-08-25 10:24:35.000000000 +0200 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1565,17 +1566,30 @@ aiocbe->uaiocb.aio_lio_opcode = type; opcode = aiocbe->uaiocb.aio_lio_opcode; - /* Fetch the file object for the specified file descriptor. */ + /* + * Validate the opcode and fetch the file object for the specified + * file descriptor. + * + * XXXRW: Moved the opcode validation up here so that we don't + * retrieve a file descriptor without knowing what the capability + * should be. + */ fd = aiocbe->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: - error = fget_write(td, fd, &fp); + error = fget_write(td, fd, CAP_WRITE, &fp); break; case LIO_READ: - error = fget_read(td, fd, &fp); + error = fget_read(td, fd, CAP_READ, &fp); + break; + case LIO_SYNC: + error = fget(td, fd, CAP_FSYNC, &fp); + break; + case LIO_NOP: + error = fget(td, fd, 0, &fp); break; default: - error = fget(td, fd, &fp); + error = EINVAL; } if (error) { uma_zfree(aiocb_zone, aiocbe); @@ -1611,11 +1625,6 @@ uma_zfree(aiocb_zone, aiocbe); return (0); } - if ((opcode != LIO_READ) && (opcode != LIO_WRITE) && - (opcode != LIO_SYNC)) { - error = EINVAL; - goto aqueue_fail; - } if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; @@ -1959,7 +1968,7 @@ struct vnode *vp; /* Lookup file object. 
*/ - error = fget(td, uap->fd, &fp); + error = fget(td, uap->fd, 0, &fp); if (error) return (error); diff -aurN -x '*.orig' src-clean/sys/kern/vfs_extattr.c src/sys/kern/vfs_extattr.c --- src-clean/sys/kern/vfs_extattr.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/vfs_extattr.c 2010-08-25 10:24:35.000000000 +0200 @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -230,7 +231,8 @@ return (error); AUDIT_ARG_TEXT(attrname); - error = getvnode(td->td_proc->p_fd, uap->fd, &fp); + error = getvnode_cap(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_SET, + &fp); if (error) return (error); @@ -410,7 +412,8 @@ return (error); AUDIT_ARG_TEXT(attrname); - error = getvnode(td->td_proc->p_fd, uap->fd, &fp); + error = getvnode_cap(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_GET, + &fp); if (error) return (error); @@ -560,7 +563,8 @@ return (error); AUDIT_ARG_TEXT(attrname); - error = getvnode(td->td_proc->p_fd, uap->fd, &fp); + error = getvnode_cap(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_DELETE, + &fp); if (error) return (error); @@ -719,7 +723,8 @@ AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); - error = getvnode(td->td_proc->p_fd, uap->fd, &fp); + error = getvnode_cap(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_LIST, + &fp); if (error) return (error); diff -aurN -x '*.orig' src-clean/sys/kern/vfs_lookup.c src/sys/kern/vfs_lookup.c --- src-clean/sys/kern/vfs_lookup.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/vfs_lookup.c 2010-08-25 10:24:35.000000000 +0200 @@ -37,12 +37,15 @@ #include __FBSDID("$FreeBSD: src/sys/kern/vfs_lookup.c,v 1.132.2.3.2.1 2010/06/14 02:09:06 kensmith Exp $"); +#include "opt_capabilities.h" +#include "opt_kdb.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" #include #include #include +#include #include #include #include @@ -55,6 +58,9 @@ #include #include #include +#ifdef KDB +#include +#endif #ifdef KTRACE #include #endif @@ -196,6 +202,7 @@ ktrnamei(cnp->cn_pnbuf); } #endif + /* * Get starting point for the 
translation. */ @@ -204,7 +211,18 @@ ndp->ni_topdir = fdp->fd_jdir; dp = NULL; +#ifdef CAPABILITIES + /* + * in capability mode, lookups must be performed relative to a real file + * descriptor, not the pseudo-descriptor AT_FDCWD + */ + if (IN_CAPABILITY_MODE(td) && (ndp->ni_dirfd == AT_FDCWD)) { + error = EOPNOTSUPP; + } else { +#else /* !CAPABILITIES */ + /* this optimisation doesn't apply if we have capabilities */ if (cnp->cn_pnbuf[0] != '/') { +#endif if (ndp->ni_startdir != NULL) { dp = ndp->ni_startdir; error = 0; @@ -213,26 +231,39 @@ AUDIT_ARG_ATFD1(ndp->ni_dirfd); if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_ATFD2(ndp->ni_dirfd); - error = fgetvp(td, ndp->ni_dirfd, &dp); + error = fgetvp_rights(td, ndp->ni_dirfd, + ndp->ni_rightsneeded | CAP_LOOKUP, + &(ndp->ni_baserights), &dp); + +#ifdef CAPABILITIES + /* + * only set ni_basedir if base was a capability or we are + * in capability mode + */ + if ((ndp->ni_baserights != -1) || (IN_CAPABILITY_MODE(td))) + ndp->ni_basedir = dp; +#endif } - if (error != 0 || dp != NULL) { - FILEDESC_SUNLOCK(fdp); - if (error == 0 && dp->v_type != VDIR) { - vfslocked = VFS_LOCK_GIANT(dp->v_mount); - vrele(dp); - VFS_UNLOCK_GIANT(vfslocked); - error = ENOTDIR; - } + } + if (error != 0 || dp != NULL) { + FILEDESC_SUNLOCK(fdp); + if (error == 0 && dp->v_type != VDIR) { + vfslocked = VFS_LOCK_GIANT(dp->v_mount); + vrele(dp); + VFS_UNLOCK_GIANT(vfslocked); + error = ENOTDIR; } - if (error) { - uma_zfree(namei_zone, cnp->cn_pnbuf); + } + + if (error) { + uma_zfree(namei_zone, cnp->cn_pnbuf); #ifdef DIAGNOSTIC - cnp->cn_pnbuf = NULL; - cnp->cn_nameptr = NULL; + cnp->cn_pnbuf = NULL; + cnp->cn_nameptr = NULL; #endif - return (error); - } + return (error); } + if (dp == NULL) { dp = fdp->fd_cdir; VREF(dp); @@ -250,6 +281,8 @@ /* * Check if root directory should replace current directory. * Done at start of translation and after symbolic link. 
+ * This is illegal if looking up relative to a capability unless + * that capability is for '/' and has CAP_ABSOLUTEPATH. */ cnp->cn_nameptr = cnp->cn_pnbuf; if (*(cnp->cn_nameptr) == '/') { @@ -259,6 +292,13 @@ cnp->cn_nameptr++; ndp->ni_pathlen--; } +#ifdef CAPABILITIES + if (ndp->ni_basedir + && !((ndp->ni_basedir == ndp->ni_rootdir) + && (ndp->ni_baserights & CAP_ABSOLUTEPATH))) + return (ENOTCAPABLE); +#endif + dp = ndp->ni_rootdir; vfslocked = VFS_LOCK_GIANT(dp->v_mount); VREF(dp); @@ -470,7 +510,7 @@ int dvfslocked; /* VFS Giant state for parent */ int tvfslocked; int lkflags_save; - + /* * Setup: break out flag bits into variables. */ @@ -598,17 +638,18 @@ } /* - * Handle "..": four special cases. + * Handle "..": five special cases. * 1. Return an error if this is the last component of * the name and the operation is DELETE or RENAME. - * 2. If at root directory (e.g. after chroot) + * 2. If at the base of a capability *at call, return ENOTCAPABLE. + * 3. If at root directory (e.g. after chroot) * or at absolute root directory * then ignore it so can't get out. - * 3. If this vnode is the root of a mounted + * 4. If this vnode is the root of a mounted * filesystem, then replace it with the * vnode which was mounted on so we take the * .. in the other filesystem. - * 4. If the vnode is the top directory of + * 5. If the vnode is the top directory of * the jail or chroot, don't let them out. */ if (cnp->cn_flags & ISDOTDOT) { @@ -618,6 +659,17 @@ goto bad; } for (;;) { +#ifdef CAPABILITIES + /* + * Attempting to wander out of the *at root; whether or + * not this is allowed is a capability option on the + * '/' capability. 
+ */ + if (dp == ndp->ni_basedir) { + error = ENOTCAPABLE; + goto bad; + } +#endif for (pr = cnp->cn_cred->cr_prison; pr != NULL; pr = pr->pr_parent) if (dp == pr->pr_root) diff -aurN -x '*.orig' src-clean/sys/kern/vfs_syscalls.c src/sys/kern/vfs_syscalls.c --- src-clean/sys/kern/vfs_syscalls.c 2010-08-25 10:09:59.000000000 +0200 +++ src/sys/kern/vfs_syscalls.c 2010-08-25 10:24:35.000000000 +0200 @@ -37,6 +37,7 @@ #include __FBSDID("$FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.487.2.8.2.1 2010/06/14 02:09:06 kensmith Exp $"); +#include "opt_capabilities.h" #include "opt_compat.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" @@ -45,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -116,6 +118,49 @@ #endif /* + * Convert a user file descriptor to a kernel file entry and check that, if + * it is a capability, the right rights are present. A reference on the file + * entry is held upon returning. + */ +int +getvnode_cap(struct filedesc *fdp, int fd, cap_rights_t rights, + struct file **fpp) +{ + struct file *fp; +#ifdef CAPABILITIES + struct file *fp_fromcap; +#endif + int error; + + error = 0; + fp = NULL; + if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL) + return (EBADF); +#ifdef CAPABILITIES + /* + * If the file descriptor is for a capability, test rights and use + * the file descriptor referenced by the capability. + */ + error = cap_fextract(fp, rights, &fp_fromcap); + if (error) { + fdrop(fp, curthread); + return (error); + } + if (fp != fp_fromcap) { + fhold(fp_fromcap); + fdrop(fp, curthread); + fp = fp_fromcap; + } +#endif /* CAPABILITIES */ + if (fp->f_vnode == NULL) { + fdrop(fp, curthread); + return (EINVAL); + } + *fpp = fp; + return (0); +} + +/* * Sync each mounted filesystem. 
*/ #ifndef _SYS_SYSPROTO_H_ @@ -373,7 +418,7 @@ int error; AUDIT_ARG_FD(fd); - error = getvnode(td->td_proc->p_fd, fd, &fp); + error = getvnode_cap(td->td_proc->p_fd, fd, CAP_FSTATFS, &fp); if (error) return (error); vp = fp->f_vnode; @@ -746,7 +791,7 @@ int error; AUDIT_ARG_FD(uap->fd); - if ((error = getvnode(fdp, uap->fd, &fp)) != 0) + if ((error = getvnode_cap(fdp, uap->fd, CAP_FCHDIR, &fp)) != 0) return (error); vp = fp->f_vnode; VREF(vp); @@ -1049,10 +1094,11 @@ struct vnode *vp; int cmode; struct file *nfp; - int type, indx, error; + int type, indx = -1, error; struct flock lf; struct nameidata nd; int vfslocked; + cap_rights_t baserights = CAP_ATBASE; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); @@ -1062,6 +1108,7 @@ * be specified. */ if (flags & O_EXEC) { + baserights |= CAP_FEXECVE; if (flags & O_ACCMODE) return (EINVAL); } else if ((flags & O_ACCMODE) == O_ACCMODE) @@ -1069,16 +1116,25 @@ else flags = FFLAGS(flags); - error = falloc(td, &nfp, &indx); +#ifdef CAPABILITIES + if (flags & FREAD) baserights |= CAP_READ; + if (flags & FWRITE) baserights |= CAP_WRITE; +#endif + + /* + * allocate the file descriptor, but don't install a descriptor yet + */ + error = falloc_noinstall(td, &nfp); if (error) return (error); + /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; /* Set the flags early so the finit in devfs can pick them up. */ fp->f_flag = flags & FMASK; cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; - NDINIT_AT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, fd, - td); + NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, + path, fd, baserights, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open(&nd, &flags, cmode, fp); if (error) { @@ -1087,25 +1143,31 @@ * wonderous happened deep below and we just pass it up * pretending we know what we do. 
*/ - if (error == ENXIO && fp->f_ops != &badfileops) { - fdrop(fp, td); - td->td_retval[0] = indx; - return (0); - } + if (error == ENXIO && fp->f_ops != &badfileops) + goto success; /* * handle special fdopen() case. bleh. dupfdopen() is * responsible for dropping the old contents of ofiles[indx] * if it succeeds. + * + * Don't do this for relative (capability) lookups; we don't + * understand exactly what would happen, and we don't think that + * it ever should. */ - if ((error == ENODEV || error == ENXIO) && - td->td_dupfd >= 0 && /* XXX from fdopen */ - (error = - dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { - td->td_retval[0] = indx; - fdrop(fp, td); - return (0); + if (!nd.ni_basedir && (error == ENODEV || error == ENXIO) && + td->td_dupfd >= 0) { + /* XXX from fdopen */ + int olderror = error; + + if ((error = finstall(td, fp, &indx)) != 0) + goto bad_unlocked; + + if ((error = dupfdopen(td, fdp, indx, td->td_dupfd, + flags, olderror)) == 0) + goto success; } + /* * Clean up the descriptor, but only if another thread hadn't * replaced or closed it. @@ -1161,6 +1223,24 @@ goto bad; } VFS_UNLOCK_GIANT(vfslocked); + +success: +#ifdef CAPABILITIES + if (nd.ni_baserights != -1) { + /* wrap the result in a capability */ + struct file *cap; + + error = kern_capwrap(td, fp, nd.ni_baserights, &cap, &indx); + if (error) + goto bad_unlocked; + } + else +#endif + /* if we haven't already installed the FD (for dupfdopen), do so now */ + if (indx == -1) + if((error = finstall(td, fp, &indx)) != 0) + goto bad_unlocked; + /* * Release our private reference, leaving the one associated with * the descriptor table intact. 
@@ -1170,6 +1250,7 @@ return (0); bad: VFS_UNLOCK_GIANT(vfslocked); +bad_unlocked: fdclose(fdp, fp, indx, td); fdrop(fp, td); return (error); @@ -1282,7 +1363,12 @@ if (error) return (error); restart: + if (IN_CAPABILITY_MODE(td)) + /* only mkfifoat(2) allowed in capability mode */ + return (EOPNOTSUPP); + bwillwrite(); + NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) @@ -1408,8 +1494,8 @@ AUDIT_ARG_MODE(mode); restart: bwillwrite(); - NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, - pathseg, path, fd, td); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, + pathseg, path, fd, CAP_MKFIFO, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); @@ -1912,7 +1998,7 @@ int vfslocked; AUDIT_ARG_FD(uap->fd); - if ((error = fget(td, uap->fd, &fp)) != 0) + if ((error = fget(td, uap->fd, CAP_SEEK, &fp)) != 0) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) { fdrop(fp, td); @@ -2129,8 +2215,9 @@ } else cred = tmpcred = td->td_ucred; AUDIT_ARG_VALUE(mode); - NDINIT_AT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | - AUDITVNODE1, pathseg, path, fd, td); + + NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | + AUDITVNODE1, pathseg, path, fd, CAP_FSTAT, td); if ((error = namei(&nd)) != 0) goto out1; vfslocked = NDHASGIANT(&nd); @@ -2339,9 +2426,9 @@ if (flag & ~AT_SYMLINK_NOFOLLOW) return (EINVAL); - NDINIT_AT(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : + NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? 
NOFOLLOW : FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg, - path, fd, td); + path, fd, CAP_FSTAT | CAP_ATBASE, td); if ((error = namei(&nd)) != 0) return (error); @@ -2769,7 +2856,8 @@ AUDIT_ARG_FD(uap->fd); AUDIT_ARG_FFLAGS(uap->flags); - if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + if ((error = getvnode_cap(td->td_proc->p_fd, uap->fd, CAP_FCHFLAGS, + &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT @@ -2895,10 +2983,13 @@ AUDIT_ARG_MODE(mode); follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; - NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg, path, - fd, td); - if ((error = namei(&nd)) != 0) + + NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg, + path, fd, CAP_FCHMOD, td); + error = namei(&nd); + if (error) return (error); + vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, mode); @@ -2930,7 +3021,8 @@ AUDIT_ARG_FD(uap->fd); AUDIT_ARG_MODE(uap->mode); - if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + if ((error = getvnode_cap(td->td_proc->p_fd, uap->fd, CAP_FCHMOD, + &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT @@ -3037,8 +3129,8 @@ AUDIT_ARG_OWNER(uid, gid); follow = (flag & AT_SYMLINK_NOFOLLOW) ? 
NOFOLLOW : FOLLOW; - NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg, path, - fd, td); + NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg, path, + fd, CAP_FCHOWN, td); if ((error = namei(&nd)) != 0) return (error); @@ -3107,7 +3199,8 @@ AUDIT_ARG_FD(uap->fd); AUDIT_ARG_OWNER(uap->uid, uap->gid); - if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + if ((error = getvnode_cap(td->td_proc->p_fd, uap->fd, CAP_FCHOWN, + &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT @@ -3252,8 +3345,8 @@ if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); - NDINIT_AT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, - fd, td); + NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, + fd, CAP_FUTIMES, td); if ((error = namei(&nd)) != 0) return (error); @@ -3342,7 +3435,8 @@ AUDIT_ARG_FD(fd); if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); - if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0) + if ((error = getvnode_cap(td->td_proc->p_fd, fd, CAP_FUTIMES, &fp)) + != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT @@ -3494,7 +3588,8 @@ int error, lock_flags; AUDIT_ARG_FD(uap->fd); - if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + if ((error = getvnode_cap(td->td_proc->p_fd, uap->fd, CAP_FSYNC, + &fp)) != 0) return (error); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); @@ -3581,11 +3676,11 @@ bwillwrite(); #ifdef MAC - NDINIT_AT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE | - AUDITVNODE1, pathseg, old, oldfd, td); + NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | + MPSAFE | AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td); #else - NDINIT_AT(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE | - AUDITVNODE1, pathseg, old, oldfd, td); + NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE | + AUDITVNODE1, pathseg, old, oldfd, 
CAP_DELETE, td); #endif if ((error = namei(&fromnd)) != 0) @@ -3608,8 +3703,8 @@ vrele(fvp); goto out1; } - NDINIT_AT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | - MPSAFE | AUDITVNODE2, pathseg, new, newfd, td); + NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | + SAVESTART | MPSAFE | AUDITVNODE2, pathseg, new, newfd, CAP_CREATE, td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { @@ -3735,8 +3830,8 @@ AUDIT_ARG_MODE(mode); restart: bwillwrite(); - NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, - segflg, path, fd, td); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, + segflg, path, fd, CAP_MKDIR, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); @@ -3824,8 +3919,8 @@ restart: bwillwrite(); - NDINIT_AT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1, - pathseg, path, fd, td); + NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1, + pathseg, path, fd, CAP_RMDIR, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); @@ -3913,7 +4008,8 @@ /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); - if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + if ((error = getvnode_cap(td->td_proc->p_fd, uap->fd, CAP_READ, &fp)) + != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); @@ -4072,7 +4168,8 @@ AUDIT_ARG_FD(fd); if (count > INT_MAX) return (EINVAL); - if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0) + if ((error = getvnode_cap(td->td_proc->p_fd, fd, CAP_READ, &fp)) + != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); @@ -4235,31 +4332,6 @@ } /* - * Convert a user file descriptor to a kernel file entry. - * A reference on the file entry is held upon returning. 
- */ -int -getvnode(fdp, fd, fpp) - struct filedesc *fdp; - int fd; - struct file **fpp; -{ - int error; - struct file *fp; - - error = 0; - fp = NULL; - if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL) - error = EBADF; - else if (fp->f_vnode == NULL) { - error = EINVAL; - fdrop(fp, curthread); - } - *fpp = fp; - return (error); -} - -/* * Get an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ diff -aurN -x '*.orig' src-clean/sys/netgraph/ng_socket.c src/sys/netgraph/ng_socket.c --- src-clean/sys/netgraph/ng_socket.c 2010-08-25 10:10:03.000000000 +0200 +++ src/sys/netgraph/ng_socket.c 2010-08-25 10:24:35.000000000 +0200 @@ -693,9 +693,12 @@ } /* Check that the FD given is legit. and change it to a pointer to a - * struct file. */ + * struct file. + * + * XXXRW: For now, no capability right required to pass an fd. + */ fd = CMSG_DATA(cm); - if ((error = fget(td, fd, &fp)) != 0) + if ((error = fget(td, fd, 0, &fp)) != 0) return (error); /* Depending on what kind of resource it is, act differently. 
For diff -aurN -x '*.orig' src-clean/sys/nfsserver/nfs_srvkrpc.c src/sys/nfsserver/nfs_srvkrpc.c --- src-clean/sys/nfsserver/nfs_srvkrpc.c 2010-08-25 10:10:03.000000000 +0200 +++ src/sys/nfsserver/nfs_srvkrpc.c 2010-08-25 10:24:35.000000000 +0200 @@ -39,6 +39,7 @@ #include "opt_kgssapi.h" #include +#include #include #include #include @@ -173,7 +174,8 @@ sizeof(addsockarg)); if (error) return (error); - if ((error = fget(td, addsockarg.sock, &fp)) != 0) + if ((error = fget(td, addsockarg.sock, CAP_SOCK_ALL, &fp)) + != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); diff -aurN -x '*.orig' src-clean/sys/security/audit/audit.h src/sys/security/audit/audit.h --- src-clean/sys/security/audit/audit.h 2010-08-25 10:10:05.000000000 +0200 +++ src/sys/security/audit/audit.h 2010-08-25 10:24:35.000000000 +0200 @@ -114,6 +114,7 @@ void audit_arg_file(struct proc *p, struct file *fp); void audit_arg_argv(char *argv, int argc, int length); void audit_arg_envv(char *envv, int envc, int length); +void audit_arg_rights(cap_rights_t rights); void audit_sysclose(struct thread *td, int fd); void audit_cred_copy(struct ucred *src, struct ucred *dest); void audit_cred_destroy(struct ucred *cred); @@ -235,6 +236,11 @@ audit_arg_rgid((rgid)); \ } while (0) +#define AUDIT_ARG_RIGHTS(rights) do { \ + if (AUDITING_TD(curthread)) \ + audit_arg_rights((rights)); \ +} while (0) + #define AUDIT_ARG_RUID(ruid) do { \ if (AUDITING_TD(curthread)) \ audit_arg_ruid((ruid)); \ @@ -342,6 +348,7 @@ #define AUDIT_ARG_PID(pid) #define AUDIT_ARG_PROCESS(p) #define AUDIT_ARG_RGID(rgid) +#define AUDIT_ARG_RIGHTS(rights) #define AUDIT_ARG_RUID(ruid) #define AUDIT_ARG_SIGNUM(signum) #define AUDIT_ARG_SGID(sgid) diff -aurN -x '*.orig' src-clean/sys/security/audit/audit_arg.c src/sys/security/audit/audit_arg.c --- src-clean/sys/security/audit/audit_arg.c 2010-08-25 10:10:05.000000000 +0200 +++ src/sys/security/audit/audit_arg.c 2010-08-25 10:24:35.000000000 +0200 @@ -865,6 +865,19 @@ 
ARG_SET_VALID(ar, ARG_ENVV); } +void +audit_arg_rights(cap_rights_t rights) +{ + struct kaudit_record *ar; + + ar = currecord(); + if (ar == NULL) + return; + + ar->k_ar.ar_arg_rights = rights; + ARG_SET_VALID(ar, ARG_RIGHTS); +} + /* * The close() system call uses it's own audit call to capture the path/vnode * information because those pieces are not easily obtained within the system @@ -886,7 +899,7 @@ audit_arg_fd(fd); - if (getvnode(td->td_proc->p_fd, fd, &fp) != 0) + if (getvnode_cap(td->td_proc->p_fd, fd, 0, &fp) != 0) return; vp = fp->f_vnode; diff -aurN -x '*.orig' src-clean/sys/security/audit/audit_bsm.c src/sys/security/audit/audit_bsm.c --- src-clean/sys/security/audit/audit_bsm.c 2010-08-25 10:10:05.000000000 +0200 +++ src/sys/security/audit/audit_bsm.c 2010-08-25 10:24:35.000000000 +0200 @@ -1581,6 +1581,28 @@ } break; + case AUE_CAP_NEW: + /* + * XXXRW: Would be nice to audit socket/etc information also. + */ + FD_VNODE1_TOKENS; + if (ARG_IS_VALID(kar, ARG_RIGHTS)) { + tok = au_to_arg64(2, "rights", ar->ar_arg_rights); + kau_write(rec, tok); + } + break; + + case AUE_CAP_GETRIGHTS: + if (ARG_IS_VALID(kar, ARG_FD)) { + tok = au_to_arg32(1, "fd", ar->ar_arg_fd); + kau_write(rec, tok); + } + break; + + case AUE_CAP_ENTER: + case AUE_CAP_GETMODE: + break; + case AUE_NULL: default: printf("BSM conversion requested for unknown event %d\n", diff -aurN -x '*.orig' src-clean/sys/security/audit/audit_bsm_errno.c src/sys/security/audit/audit_bsm_errno.c --- src-clean/sys/security/audit/audit_bsm_errno.c 2010-08-25 10:10:05.000000000 +0200 +++ src/sys/security/audit/audit_bsm_errno.c 2010-08-25 10:24:35.000000000 +0200 @@ -686,6 +686,13 @@ ERRNO_NO_LOCAL_MAPPING, #endif ES("Key was rejected by service") }, + { BSM_ERRNO_ENOTCAPABLE, +#ifdef ENOTCAPABLE + ENOTCAPABLE, +#else + ERRNO_NO_LOCAL_MAPPING, +#endif + ES("Capabilities insufficient") }, }; static const int bsm_errnos_count = sizeof(bsm_errnos) / sizeof(bsm_errnos[0]); diff -aurN -x '*.orig' 
src-clean/sys/security/audit/audit_private.h src/sys/security/audit/audit_private.h --- src-clean/sys/security/audit/audit_private.h 2010-08-25 10:10:05.000000000 +0200 +++ src/sys/security/audit/audit_private.h 2010-08-25 10:24:35.000000000 +0200 @@ -229,6 +229,7 @@ int ar_arg_exitstatus; int ar_arg_exitretval; struct sockaddr_storage ar_arg_sockaddr; + cap_rights_t ar_arg_rights; }; /* @@ -288,6 +289,7 @@ #define ARG_ENVV 0x0002000000000000ULL #define ARG_ATFD1 0x0004000000000000ULL #define ARG_ATFD2 0x0008000000000000ULL +#define ARG_RIGHTS 0x0010000000000000ULL #define ARG_NONE 0x0000000000000000ULL #define ARG_ALL 0xFFFFFFFFFFFFFFFFULL diff -aurN -x '*.orig' src-clean/sys/security/mac/mac_syscalls.c src/sys/security/mac/mac_syscalls.c --- src-clean/sys/security/mac/mac_syscalls.c 2010-08-25 10:10:05.000000000 +0200 +++ src/sys/security/mac/mac_syscalls.c 2010-08-25 10:24:35.000000000 +0200 @@ -48,6 +48,7 @@ #include "opt_mac.h" #include +#include #include #include #include @@ -244,7 +245,7 @@ } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); - error = fget(td, uap->fd, &fp); + error = fget(td, uap->fd, CAP_MAC_GET, &fp); if (error) goto out; @@ -439,7 +440,7 @@ return (error); } - error = fget(td, uap->fd, &fp); + error = fget(td, uap->fd, CAP_MAC_SET, &fp); if (error) goto out; diff -aurN -x '*.orig' src-clean/sys/sparc64/sparc64/sys_machdep.c src/sys/sparc64/sparc64/sys_machdep.c --- src-clean/sys/sparc64/sparc64/sys_machdep.c 2010-08-25 10:10:05.000000000 +0200 +++ src/sys/sparc64/sparc64/sys_machdep.c 2010-08-25 10:24:35.000000000 +0200 @@ -53,6 +53,10 @@ { int error; + /* + * XXXRW: As new operations are added here, check that they are safe + * in capability mode. 
+ */ mtx_lock(&Giant); switch (uap->op) { case SPARC_SIGTRAMP_INSTALL: diff -aurN -x '*.orig' src-clean/sys/sys/_types.h src/sys/sys/_types.h --- src-clean/sys/sys/_types.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/_types.h 2010-08-25 10:24:35.000000000 +0200 @@ -38,6 +38,7 @@ typedef __uint32_t __blksize_t; /* file block size */ typedef __int64_t __blkcnt_t; /* file block count */ typedef __int32_t __clockid_t; /* clock_gettime()... */ +typedef __uint64_t __cap_rights_t; /* capability rights */ typedef __uint32_t __fflags_t; /* file flags */ typedef __uint64_t __fsblkcnt_t; typedef __uint64_t __fsfilcnt_t; diff -aurN -x '*.orig' src-clean/sys/sys/capability.h src/sys/sys/capability.h --- src-clean/sys/sys/capability.h 1970-01-01 01:00:00.000000000 +0100 +++ src/sys/sys/capability.h 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,209 @@ +/*- + * Copyright (c) 2008-2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $P4: //depot/projects/trustedbsd/capabilities/src/sys/sys/capability.h#29 $ + */ + +/* + * Definitions for FreeBSD capabilities facility. + */ +#ifndef _SYS_CAPABILITY_H_ +#define _SYS_CAPABILITY_H_ + +#include +#include + +/* + * Possible rights on capabilities.
+ */ +#define CAP_READ 0x0000000000000001ULL /* read/recv */ +#define CAP_WRITE 0x0000000000000002ULL /* write/send */ +#define CAP_SEEK 0x0000000000000004ULL /* lseek, various io */ +#define CAP_GETPEERNAME 0x0000000000000008ULL /* getpeername */ +#define CAP_GETSOCKNAME 0x0000000000000010ULL /* getsockname */ +#define CAP_FCHFLAGS 0x0000000000000020ULL /* fchflags */ +#define CAP_IOCTL 0x0000000000000040ULL /* ioctl */ +#define CAP_FSTAT 0x0000000000000080ULL /* fstat, faccessat */ +#define CAP_MMAP 0x0000000000000100ULL /* mmap */ +#define CAP_FCNTL 0x0000000000000200ULL /* fcntl */ +#define CAP_EVENT 0x0000000000000400ULL /* select/poll */ +#define CAP_FSYNC 0x0000000000000800ULL /* fsync */ +#define CAP_FCHOWN 0x0000000000001000ULL /* fchown */ +#define CAP_FCHMOD 0x0000000000002000ULL /* fchmod */ +#define CAP_FTRUNCATE 0x0000000000004000ULL /* ftruncate */ +#define CAP_FLOCK 0x0000000000008000ULL /* flock */ +#define CAP_FSTATFS 0x0000000000010000ULL /* fstatfs */ +#define CAP_REVOKE 0x0000000000020000ULL /* revoke */ +#define CAP_FEXECVE 0x0000000000040000ULL /* fexecve */ +#define CAP_FPATHCONF 0x0000000000080000ULL /* fpathconf */ +#define CAP_FUTIMES 0x0000000000100000ULL /* futimes */ +#define CAP_ACL_GET 0x0000000000200000ULL /* acl_get_fd */ +#define CAP_ACL_SET 0x0000000000400000ULL /* acl_set_fd */ +#define CAP_ACL_DELETE 0x0000000000800000ULL /* acl_delete_fd */ +#define CAP_ACL_CHECK 0x0000000001000000ULL /* acl_list_fd */ +#define CAP_EXTATTR_GET 0x0000000002000000ULL /* extattr_get_fd */ +#define CAP_EXTATTR_SET 0x0000000004000000ULL /* extattr_set_fd */ +#define CAP_EXTATTR_DELETE 0x0000000008000000ULL /* extattr_delete_fd */ +#define CAP_EXTATTR_LIST 0x0000000010000000ULL /* extattr_list_fd */ +#define CAP_MAC_GET 0x0000000020000000ULL /* mac_get_fd */ +#define CAP_MAC_SET 0x0000000040000000ULL /* mac_set_fd */ +#define CAP_ACCEPT 0x0000000080000000ULL /* accept */ +#define CAP_CONNECT 0x0000000100000000ULL /* connect/sendto */ +#define 
CAP_BIND 0x0000000200000000ULL /* bind */ +#define CAP_GETSOCKOPT 0x0000000400000000ULL /* getsockopt */ +#define CAP_SETSOCKOPT 0x0000000800000000ULL /* setsockopt */ +#define CAP_LISTEN 0x0000001000000000ULL /* listen */ +#define CAP_SHUTDOWN 0x0000002000000000ULL /* shutdown */ +#define CAP_PEELOFF 0x0000004000000000ULL /* sctp_peeloff */ +#define CAP_LOOKUP 0x0000008000000000ULL /* _at(2) lookup */ +#define CAP_SEM_POST 0x0000010000000000ULL /* ksem_post */ +#define CAP_SEM_WAIT 0x0000020000000000ULL /* ksem_wait */ +#define CAP_SEM_GETVALUE 0x0000040000000000ULL /* ksem_getvalue */ +#define CAP_KEVENT 0x0000080000000000ULL /* kevent(2) */ +#define CAP_PDGETPID 0x0000100000000000ULL /* pdgetpid(2) */ +#define CAP_PDWAIT 0x0000200000000000ULL /* pdwait(2) */ +#define CAP_PDKILL 0x0000400000000000ULL /* pdkill(2) */ +#define CAP_MAPEXEC 0x0000800000000000ULL /* mmap(2) as exec */ +#define CAP_TTYHOOK 0x0001000000000000ULL /* register tty hook */ +#define CAP_FCHDIR 0x0002000000000000ULL /* fchdir(2) */ +#define CAP_FSCK 0x0004000000000000ULL /* sysctl_ffs_fsck */ +#define CAP_ATBASE 0x0008000000000000ULL /* openat(2), etc. */ +#define CAP_ABSOLUTEPATH 0x0010000000000000ULL /* abs. lookup from '/' */ +#define CAP_CREATE 0x0020000000000000ULL /* open, rename, etc. */ +#define CAP_DELETE 0x0040000000000000ULL /* rename, remove, etc. */ +#define CAP_MKDIR 0x0080000000000000ULL /* mkdirat(2), mknodat(2) */ +#define CAP_RMDIR 0x0100000000000000ULL /* rmdirat(2) */ +#define CAP_MKFIFO 0x0200000000000000ULL /* mkfifoat(2) */ +#define CAP_MASK_VALID 0x03ffffffffffffffULL + +/* + * Notes: + * + * Some system calls don't require a capability in order to perform an + * operation on an fd. These include: close, dup, dup2. + * + * CAP_SEEK is used alone for lseek, but along-side CAP_READ and CAP_WRITE + * for various I/O calls, such as read/write/send/receive. + * + * pread and pwrite will not use CAP_SEEK. 
+ * + * CAP_EVENT covers select, poll, and kqueue registration for a capability; + * CAP_KEVENT controls the use of a kqueue(2) description. + * + * sendfile is authorized using CAP_READ on the file and CAP_WRITE on the + * socket. + * + * sendto should check CAP_CONNECT as well as CAP_WRITE if an address is + * specified. + * + * mmap() and aio*() system calls will need special attention as they may + * involve reads or writes depending a great deal on context. + * + * Socket checks don't generally pass CAP_SEEK but perhaps should? + */ + +/* + * A mask of multiple capabilities useful for situations where a socket will + * be used in a general-purpose way by the kernel, such as a socket used by + * the NFS server. + */ +#define CAP_SOCK_ALL (CAP_READ | CAP_WRITE | CAP_SEEK | CAP_GETPEERNAME | \ + CAP_GETSOCKNAME | CAP_IOCTL | CAP_FSTAT | \ + CAP_FCNTL | CAP_EVENT | CAP_ACCEPT | \ + CAP_CONNECT | CAP_BIND | CAP_GETSOCKOPT | \ + CAP_SETSOCKOPT | CAP_LISTEN | CAP_SHUTDOWN | \ + CAP_PEELOFF) + +#ifdef _KERNEL +struct file; +struct thread; + +/* Nonzero iff td's credential has CRED_FLAG_CAPMODE set. */ +#define IN_CAPABILITY_MODE(td) ((td)->td_ucred->cr_flags & CRED_FLAG_CAPMODE) + +/* + * Create a capability to wrap a file object. + */ +int kern_capwrap(struct thread *td, struct file *fp, cap_rights_t rights, + struct file **cap, int *capfd); + +/* + * Given a file descriptor that may be a capability, check the requested + * rights and extract the underlying object. Assumes a valid reference is + * held to fp_cap, and returns a pointer via fpp under that assumption. The + * caller invokes fhold(*fpp) if required. + */ +int cap_fextract(struct file *fp_cap, cap_rights_t rights, + struct file **fpp); +int cap_fextract_mmap(struct file *fp_cap, cap_rights_t rights, + u_char *maxprotp, struct file **fpp); + +/* + * For the purposes of procstat(1) and similar tools, allow kern_descrip.c to + * extract the rights from a capability.
However, this should not be used by + * kernel code generally, instead cap_fextract() should be used in order to + * keep all access control in one place. + */ +cap_rights_t cap_rights(struct file *fp_cap); + +#else /* !_KERNEL */ + +__BEGIN_DECLS +/* + * cap_enter(): Cause the process to enter capability mode, which will + * prevent it from directly accessing global namespaces. System calls will + * be limited to process-local, process-inherited, or file descriptor + * operations. If already in capability mode, a no-op. + * + * Currently, process-inherited operations are not properly handled -- in + * particular, we're interested in things like waitpid(2), kill(2), etc, + * being properly constrained. One possible solution is to introduce process + * descriptors. + */ +int cap_enter(void); + +/* + * cap_new(): Create a new capability derived from an existing file + * descriptor with the specified rights. If the existing file descriptor is + * a capability, then the new rights must be a subset of the existing rights. + */ +int cap_new(int fd, cap_rights_t rights); + +/* + * cap_getrights(): Query the rights on a capability. + */ +int cap_getrights(int fd, cap_rights_t *rightsp); +__END_DECLS + +#endif /* !_KERNEL */ + +#endif /* !_SYS_CAPABILITY_H_ */ diff -aurN -x '*.orig' src-clean/sys/sys/file.h src/sys/sys/file.h --- src-clean/sys/sys/file.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/file.h 2010-08-25 10:24:35.000000000 +0200 @@ -36,6 +36,7 @@ #ifndef _KERNEL #include /* XXX */ #include +#include #include #else #include @@ -63,6 +64,8 @@ #define DTYPE_SHM 8 /* swap-backed shared memory */ #define DTYPE_SEM 9 /* posix semaphore */ #define DTYPE_PTS 10 /* pseudo teletype master device */ +#define DTYPE_CAPABILITY 11 /* capability */ +#define DTYPE_PROCDESC 12 /* process descriptor */ #ifdef _KERNEL @@ -136,6 +139,8 @@ * Mandatory Access control information. */ void *f_label; /* Place-holder for MAC label. 
*/ + LIST_HEAD(, capability) f_caps; /* (f) List of capabilities for file. */ + u_int f_capcount; /* (f) Number of capabilities. */ }; #define FOFFSET_LOCKED 0x1 @@ -174,9 +179,14 @@ extern int maxfilesperproc; /* per process limit on number of open files */ extern volatile int openfiles; /* actual number of open files */ -int fget(struct thread *td, int fd, struct file **fpp); -int fget_read(struct thread *td, int fd, struct file **fpp); -int fget_write(struct thread *td, int fd, struct file **fpp); +int fget(struct thread *td, int fd, cap_rights_t rights, struct file **fpp); +int fget_mmap(struct thread *td, int fd, cap_rights_t rights, + u_char *maxprotp, struct file **fpp); +int fget_read(struct thread *td, int fd, cap_rights_t rights, + struct file **fpp); +int fget_write(struct thread *td, int fd, cap_rights_t rights, + struct file **fpp); +int fgetcap(struct thread *td, int fd, struct file **fpp); int _fdrop(struct file *fp, struct thread *td); /* @@ -194,11 +204,17 @@ fo_close_t soo_close; void finit(struct file *, u_int, short, void *, struct fileops *); -int fgetvp(struct thread *td, int fd, struct vnode **vpp); -int fgetvp_read(struct thread *td, int fd, struct vnode **vpp); -int fgetvp_write(struct thread *td, int fd, struct vnode **vpp); +int fgetvp(struct thread *td, int fd, cap_rights_t rights, + struct vnode **vpp); +int fgetvp_rights(struct thread *td, int fd, cap_rights_t need, cap_rights_t *have, + struct vnode **vpp); +int fgetvp_read(struct thread *td, int fd, cap_rights_t rights, + struct vnode **vpp); +int fgetvp_write(struct thread *td, int fd, cap_rights_t rights, + struct vnode **vpp); -int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp); +int fgetsock(struct thread *td, int fd, cap_rights_t rights, + struct socket **spp, u_int *fflagp); void fputsock(struct socket *sp); #define fhold(fp) \ diff -aurN -x '*.orig' src-clean/sys/sys/filedesc.h src/sys/sys/filedesc.h --- src-clean/sys/sys/filedesc.h 2010-08-25 
10:10:06.000000000 +0200 +++ src/sys/sys/filedesc.h 2010-08-25 10:24:35.000000000 +0200 @@ -112,6 +112,8 @@ int dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error); int falloc(struct thread *td, struct file **resultfp, int *resultfd); +int falloc_noinstall(struct thread *td, struct file **resultfp); +int finstall(struct thread *td, struct file *fp, int *resultfp); int fdalloc(struct thread *td, int minfd, int *result); int fdavail(struct thread *td, int n); int fdcheckstd(struct thread *td); @@ -126,6 +128,10 @@ filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader); int getvnode(struct filedesc *fdp, int fd, struct file **fpp); +int getvnode_cap(struct filedesc *fdp, int fd, cap_rights_t rights, + struct file **fpp); +int fgetbase(struct thread *td, int fd, cap_rights_t rights, + struct vnode **base); void mountcheckdirs(struct vnode *olddp, struct vnode *newdp); void setugidsafety(struct thread *td); diff -aurN -x '*.orig' src-clean/sys/sys/namei.h src/sys/sys/namei.h --- src-clean/sys/sys/namei.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/namei.h 2010-08-25 10:24:35.000000000 +0200 @@ -63,6 +63,7 @@ */ const char *ni_dirp; /* pathname pointer */ enum uio_seg ni_segflg; /* location of pathname */ + cap_rights_t ni_rightsneeded; /* rights required to look up vnode */ /* * Arguments to lookup. */ @@ -70,6 +71,11 @@ struct vnode *ni_rootdir; /* logical root directory */ struct vnode *ni_topdir; /* logical top directory */ int ni_dirfd; /* starting directory for *at functions */ + struct vnode *ni_basedir; /* root for capability-mode *at */ + /* + * Results: returned from namei + */ + cap_rights_t ni_baserights; /* rights that the *at base has (or -1) */ /* * Results: returned from/manipulated by lookup */ @@ -151,11 +157,13 @@ * Initialization of a nameidata structure. 
*/ #define NDINIT(ndp, op, flags, segflg, namep, td) \ - NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, NULL, td) -#define NDINIT_AT(ndp, op, flags, segflg, namep, dirfd, td) \ - NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, td) -#define NDINIT_ATVP(ndp, op, flags, segflg, namep, vp, td) \ - NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, vp, td) + NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, NULL, NULL, 0, td) +#define NDINIT_AT(ndp, op, flags, segflg, namep, dirfd, td) \ + NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, NULL, 0, td) +#define NDINIT_ATRIGHTS(ndp, op, flags, segflg, namep, dirfd, rights, td) \ + NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, NULL, rights, td) +#define NDINIT_ATVP(ndp, op, flags, segflg, namep, vp, td) \ + NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, vp, NULL, 0, td) static __inline void NDINIT_ALL(struct nameidata *ndp, @@ -164,6 +172,8 @@ const char *namep, int dirfd, struct vnode *startdir, + struct vnode *basedir, + cap_rights_t rights, struct thread *td) { ndp->ni_cnd.cn_nameiop = op; @@ -172,6 +182,9 @@ ndp->ni_dirp = namep; ndp->ni_dirfd = dirfd; ndp->ni_startdir = startdir; + ndp->ni_basedir = basedir; + ndp->ni_rightsneeded = rights; + ndp->ni_baserights = -1; ndp->ni_cnd.cn_thread = td; } diff -aurN -x '*.orig' src-clean/sys/sys/proc.h src/sys/sys/proc.h --- src-clean/sys/sys/proc.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/proc.h 2010-08-25 10:24:35.000000000 +0200 @@ -162,6 +162,7 @@ struct kaioinfo; struct p_sched; struct proc; +struct procdesc; struct sleepqueue; struct thread; struct trapframe; @@ -517,6 +518,7 @@ int p_boundary_count;/* (c) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ + struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */ /* End area that is zeroed on creation. 
*/ #define p_endzero p_magic @@ -818,6 +820,8 @@ void procinit(void); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); +void proc_reap(struct thread *td, struct proc *p, int *status, + int options, struct rusage *rusage); void proc_reparent(struct proc *child, struct proc *newparent); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); diff -aurN -x '*.orig' src-clean/sys/sys/procdesc.h src/sys/sys/procdesc.h --- src-clean/sys/sys/procdesc.h 1970-01-01 01:00:00.000000000 +0100 +++ src/sys/sys/procdesc.h 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _SYS_PROCDESC_H_ +#define _SYS_PROCDESC_H_ + +#ifdef _KERNEL +#include /* struct selinfo */ +#include +#include + +/*- + * struct procdesc describes a process descriptor, and essentially consists + * of two pointers -- one to the file descriptor, and one to the process. + * When both become NULL, the process descriptor will be freed. An important + * invariant is that there is only ever one process descriptor for a process, + * so a single file pointer will suffice. + * + * Locking key: + * (c) - Constant after initial setup. + * (p) - Protected by the process descriptor mutex. + * (r) - Atomic reference count. + * (s) - Protected by selinfo. + * (t) - Protected by the proctree_lock + */ +struct proc; +struct sigio; +struct procdesc { + /* + * Basic process descriptor state: the process, a cache of its pid to + * satisfy queries after the process exits, and process descriptor + * refcount. + */ + struct proc *pd_proc; /* (t) Process. */ + pid_t pd_pid; /* (c) Cached pid. */ + u_int pd_refcount; /* (r) Reference count. */ + + /* + * In-flight data and notification of events. + */ + int pd_flags; /* (p) PD_ flags. */ + struct selinfo pd_selinfo; /* (p) Event notification. */ + struct mtx pd_lock; /* Protect data + events. */ +}; + +/* + * Locking macros for the procdesc itself.
+ */ +#define PROCDESC_LOCK_DESTROY(pd) mtx_destroy(&(pd)->pd_lock) +#define PROCDESC_LOCK_INIT(pd) mtx_init(&(pd)->pd_lock, "procdesc", NULL, \ + MTX_DEF) +#define PROCDESC_LOCK(pd) mtx_lock(&(pd)->pd_lock) +#define PROCDESC_UNLOCK(pd) mtx_unlock(&(pd)->pd_lock) + +/* + * Flags for the pd_flags field. + */ +#define PD_CLOSED 0x00000001 /* Descriptor has closed. */ +#define PD_SELECTED 0x00000002 /* Issue selwakeup(). */ +#define PD_EXITED 0x00000004 /* Process exited. */ + +/* + * In-kernel interfaces to process descriptors. + */ +int procdesc_exit(struct proc *p); +int procdesc_find(struct thread *td, int fd, cap_rights_t rights, + struct proc **p); +void procdesc_new(struct proc *p, struct file *fp_procdesc); +pid_t procdesc_pid(struct file *fp_procdesc); +void procdesc_reap(struct proc *p); + +#else /* !_KERNEL */ + +/* + * Process descriptor system calls. + */ +struct rusage; +int pdfork(int *fdp); +int pdkill(int fd, int signum); +int pdgetpid(int fd, pid_t *pidp); +int pdwait4(int fd, int *status, int options, struct rusage *rusage); + +#endif /* _KERNEL */ + +#endif /* !_SYS_PROCDESC_H_ */ diff -aurN -x '*.orig' src-clean/sys/sys/syscall.h src/sys/sys/syscall.h --- src-clean/sys/sys/syscall.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/syscall.h 2010-08-25 10:24:35.000000000 +0200 @@ -428,5 +428,13 @@ #define SYS_msgctl 511 #define SYS_shmctl 512 #define SYS_lpathconf 513 +#define SYS_cap_new 514 +#define SYS_cap_getrights 515 +#define SYS_cap_enter 516 +#define SYS_cap_getmode 517 +#define SYS_pdfork 518 +#define SYS_pdkill 519 +#define SYS_pdgetpid 520 +#define SYS_pdwait 521 #define SYS_pselect 522 #define SYS_MAXSYSCALL 523 diff -aurN -x '*.orig' src-clean/sys/sys/syscall.mk src/sys/sys/syscall.mk --- src-clean/sys/sys/syscall.mk 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/syscall.mk 2010-08-25 10:24:35.000000000 +0200 @@ -377,4 +377,12 @@ msgctl.o \ shmctl.o \ lpathconf.o \ + cap_new.o \ + cap_getrights.o \ + cap_enter.o \ + 
cap_getmode.o \ + pdfork.o \ + pdkill.o \ + pdgetpid.o \ + pdwait.o \ pselect.o diff -aurN -x '*.orig' src-clean/sys/sys/sysctl.h src/sys/sys/sysctl.h --- src-clean/sys/sys/sysctl.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/sysctl.h 2010-08-25 10:24:35.000000000 +0200 @@ -87,6 +87,9 @@ #define CTLFLAG_MPSAFE 0x00040000 /* Handler is MP safe */ #define CTLFLAG_VNET 0x00020000 /* Prisons with vnet can fiddle */ #define CTLFLAG_RDTUN (CTLFLAG_RD|CTLFLAG_TUN) +#define CTLFLAG_CAPRD 0x00010000 /* Can be read in capability mode */ +#define CTLFLAG_CAPWR 0x00008000 /* Can be written in capability mode */ +#define CTLFLAG_CAPRW (CTLFLAG_CAPRD|CTLFLAG_CAPWR) /* * Secure level. Note that CTLFLAG_SECURE == CTLFLAG_SECURE1. @@ -341,7 +344,8 @@ * kernel features. */ #define FEATURE(name, desc) \ - SYSCTL_INT(_kern_features, OID_AUTO, name, CTLFLAG_RD, 0, 1, desc) + SYSCTL_INT(_kern_features, OID_AUTO, name, CTLFLAG_RD | \ + CTLFLAG_CAPRD, 0, 1, desc) #endif /* _KERNEL */ diff -aurN -x '*.orig' src-clean/sys/sys/sysent.h src/sys/sys/sysent.h --- src-clean/sys/sys/sysent.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/sysent.h 2010-08-25 10:24:35.000000000 +0200 @@ -63,6 +63,11 @@ u_int32_t sy_flags; /* General flags for system calls. 
*/ }; +/* + * struct sysent flags + */ +#define SYF_CAPENABLED 0x00000001 + struct image_params; struct __sigset; struct trapframe; diff -aurN -x '*.orig' src-clean/sys/sys/sysproto.h src/sys/sys/sysproto.h --- src-clean/sys/sys/sysproto.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/sysproto.h 2010-08-25 10:24:35.000000000 +0200 @@ -1641,6 +1641,37 @@ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)]; char name_l_[PADL_(int)]; int name; char name_r_[PADR_(int)]; }; +struct cap_new_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char rights_l_[PADL_(u_int64_t)]; u_int64_t rights; char rights_r_[PADR_(u_int64_t)]; +}; +struct cap_getrights_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char rightsp_l_[PADL_(u_int64_t *)]; u_int64_t * rightsp; char rightsp_r_[PADR_(u_int64_t *)]; +}; +struct cap_enter_args { + register_t dummy; +}; +struct cap_getmode_args { + char modep_l_[PADL_(u_int *)]; u_int * modep; char modep_r_[PADR_(u_int *)]; +}; +struct pdfork_args { + char fdp_l_[PADL_(int *)]; int * fdp; char fdp_r_[PADR_(int *)]; +}; +struct pdkill_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char signum_l_[PADL_(int)]; int signum; char signum_r_[PADR_(int)]; +}; +struct pdgetpid_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char pidp_l_[PADL_(pid_t *)]; pid_t * pidp; char pidp_r_[PADR_(pid_t *)]; +}; +struct pdwait_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char status_l_[PADL_(int *)]; int * status; char status_r_[PADR_(int *)]; + char options_l_[PADL_(int)]; int options; char options_r_[PADR_(int)]; + char rusage_l_[PADL_(struct rusage *)]; struct rusage * rusage; char rusage_r_[PADR_(struct rusage *)]; +}; struct pselect_args { char nd_l_[PADL_(int)]; int nd; char nd_r_[PADR_(int)]; char in_l_[PADL_(fd_set *)]; fd_set * in; char in_r_[PADR_(fd_set *)]; @@ -2007,6 +2038,14 @@ int msgctl(struct thread *, struct msgctl_args *); int 
shmctl(struct thread *, struct shmctl_args *); int lpathconf(struct thread *, struct lpathconf_args *); +int cap_new(struct thread *, struct cap_new_args *); +int cap_getrights(struct thread *, struct cap_getrights_args *); +int cap_enter(struct thread *, struct cap_enter_args *); +int cap_getmode(struct thread *, struct cap_getmode_args *); +int pdfork(struct thread *, struct pdfork_args *); +int pdkill(struct thread *, struct pdkill_args *); +int pdgetpid(struct thread *, struct pdgetpid_args *); +int pdwait(struct thread *, struct pdwait_args *); int pselect(struct thread *, struct pselect_args *); #ifdef COMPAT_43 @@ -2680,6 +2719,14 @@ #define SYS_AUE_msgctl AUE_MSGCTL #define SYS_AUE_shmctl AUE_SHMCTL #define SYS_AUE_lpathconf AUE_LPATHCONF +#define SYS_AUE_cap_new AUE_CAP_NEW +#define SYS_AUE_cap_getrights AUE_CAP_GETRIGHTS +#define SYS_AUE_cap_enter AUE_CAP_ENTER +#define SYS_AUE_cap_getmode AUE_CAP_GETMODE +#define SYS_AUE_pdfork AUE_NULL +#define SYS_AUE_pdkill AUE_NULL +#define SYS_AUE_pdgetpid AUE_NULL +#define SYS_AUE_pdwait AUE_NULL #define SYS_AUE_pselect AUE_SELECT #undef PAD_ diff -aurN -x '*.orig' src-clean/sys/sys/types.h src/sys/sys/types.h --- src-clean/sys/sys/types.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/types.h 2010-08-25 10:24:35.000000000 +0200 @@ -133,6 +133,8 @@ #define _BLKCNT_T_DECLARED #endif +typedef __cap_rights_t cap_rights_t; + #ifndef _CLOCK_T_DECLARED typedef __clock_t clock_t; #define _CLOCK_T_DECLARED diff -aurN -x '*.orig' src-clean/sys/sys/ucred.h src/sys/sys/ucred.h --- src-clean/sys/sys/ucred.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/ucred.h 2010-08-25 10:24:35.000000000 +0200 @@ -68,6 +68,10 @@ #endif /* _KERNEL || _WANT_UCRED */ #define XU_NGROUPS 16 +/* + * Flags for cr_flags. + */ +#define CRED_FLAG_CAPMODE 0x00000001 /* In capability mode. */ /* * This is the external representation of struct ucred. 
diff -aurN -x '*.orig' src-clean/sys/sys/unistd.h src/sys/sys/unistd.h --- src-clean/sys/sys/unistd.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/unistd.h 2010-08-25 10:24:35.000000000 +0200 @@ -180,8 +180,9 @@ #define RFLINUXTHPN (1<<16) /* do linux clone exit parent notification */ #define RFSTOPPED (1<<17) /* leave child in a stopped state */ #define RFHIGHPID (1<<18) /* use a pid higher than 10 (idleproc) */ +#define RFPROCDESC (1<<19) /* return a process descriptor */ #define RFPPWAIT (1<<31) /* parent sleeps until child exits (vfork) */ -#define RFKERNELONLY (RFSTOPPED | RFHIGHPID | RFPPWAIT) +#define RFKERNELONLY (RFSTOPPED | RFHIGHPID | RFPPWAIT | RFPROCDESC) #endif /* __BSD_VISIBLE */ diff -aurN -x '*.orig' src-clean/sys/sys/user.h src/sys/sys/user.h --- src-clean/sys/sys/user.h 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/sys/user.h 2010-08-25 10:24:35.000000000 +0200 @@ -239,6 +239,7 @@ #define KF_TYPE_SHM 8 #define KF_TYPE_SEM 9 #define KF_TYPE_PTS 10 +#define KF_TYPE_PROCDESC 12 #define KF_TYPE_UNKNOWN 255 #define KF_VTYPE_VNON 0 @@ -264,6 +265,7 @@ #define KF_FLAG_NONBLOCK 0x00000020 #define KF_FLAG_DIRECT 0x00000040 #define KF_FLAG_HASLOCK 0x00000080 +#define KF_FLAG_CAPABILITY 0x00000100 /* * Old format. Has variable hidden padding due to alignment. @@ -311,7 +313,10 @@ int kf_sock_protocol; /* Socket protocol. */ struct sockaddr_storage kf_sa_local; /* Socket address. */ struct sockaddr_storage kf_sa_peer; /* Peer address. */ - int _kf_ispare[16]; /* Space for more stuff. */ + pid_t kf_pid; /* Process identifier. */ + int _kf_ispare0; /* Space for more stuff. */ + cap_rights_t kf_cap_rights; /* Capability rights. */ + int _kf_ispare[12]; /* Space for more stuff. */ /* Truncated before copyout in sysctl */ char kf_path[PATH_MAX]; /* Path to file, if any. 
*/ }; diff -aurN -x '*.orig' src-clean/sys/ufs/ffs/ffs_alloc.c src/sys/ufs/ffs/ffs_alloc.c --- src-clean/sys/ufs/ffs/ffs_alloc.c 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/ufs/ffs/ffs_alloc.c 2010-08-25 10:24:35.000000000 +0200 @@ -62,9 +62,11 @@ #include __FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_alloc.c,v 1.153.2.3.2.1 2010/06/14 02:09:06 kensmith Exp $"); +#include "opt_capabilities.h" #include "opt_quota.h" #include +#include #include #include #include @@ -2419,7 +2421,8 @@ return (error); if (cmd.version != FFS_CMD_VERSION) return (ERPCMISMATCH); - if ((error = getvnode(curproc->p_fd, cmd.handle, &fp)) != 0) + if ((error = getvnode_cap(curproc->p_fd, cmd.handle, CAP_FSCK, &fp)) + != 0) return (error); vn_start_write(fp->f_data, &mp, V_WAIT); if (mp == 0 || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { diff -aurN -x '*.orig' src-clean/sys/vm/vm_mmap.c src/sys/vm/vm_mmap.c --- src-clean/sys/vm/vm_mmap.c 2010-08-25 10:10:06.000000000 +0200 +++ src/sys/vm/vm_mmap.c 2010-08-25 10:24:35.000000000 +0200 @@ -48,6 +48,7 @@ #include #include +#include #include #include #include @@ -215,12 +216,13 @@ struct vnode *vp; vm_offset_t addr; vm_size_t size, pageoff; - vm_prot_t prot, maxprot; + vm_prot_t cap_maxprot, prot, maxprot; void *handle; objtype_t handle_type; int flags, error; off_t pos; struct vmspace *vms = td->td_proc->p_vmspace; + cap_rights_t rights; addr = (vm_offset_t) uap->addr; size = uap->len; @@ -300,10 +302,22 @@ maxprot = VM_PROT_ALL; } else { /* - * Mapping file, get fp for validation and - * don't let the descriptor disappear on us if we block. - */ - if ((error = fget(td, uap->fd, &fp)) != 0) + * Mapping file, get fp for validation and don't let the + * descriptor disappear on us if we block. Check capability + * rights, but also return the maximum rights to be combined + * with maxprot later. 
+ */ + rights = CAP_MMAP; + if (prot & PROT_READ) + rights |= CAP_READ; + if ((flags & MAP_SHARED) != 0) { + if (prot & PROT_WRITE) + rights |= CAP_WRITE; + } + if (prot & PROT_EXEC) + rights |= CAP_MAPEXEC; + if ((error = fget_mmap(td, uap->fd, rights, &cap_maxprot, + &fp)) != 0) goto done; if (fp->f_type == DTYPE_SHM) { handle = fp->f_data; @@ -370,6 +384,7 @@ } } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { maxprot |= VM_PROT_WRITE; + cap_maxprot |= VM_PROT_WRITE; } handle = (void *)vp; handle_type = OBJT_VNODE; @@ -388,6 +403,7 @@ } td->td_fpop = fp; + maxprot &= cap_maxprot; error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, flags, handle_type, handle, pos); td->td_fpop = NULL; diff -aurN -x '*.orig' src-clean/tools/cap/sandbox_world/Makefile src/tools/cap/sandbox_world/Makefile --- src-clean/tools/cap/sandbox_world/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ src/tools/cap/sandbox_world/Makefile 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,8 @@ +PROG=sandbox_world +NO_MAN= +CFLAGS=-Wall -g + +DPADD= ${LIBCAPABILITY} ${LIBSBUF} +LDADD= -lcapsicum -lsbuf + +.include diff -aurN -x '*.orig' src-clean/tools/cap/sandbox_world/sandbox_world.c src/tools/cap/sandbox_world/sandbox_world.c --- src-clean/tools/cap/sandbox_world/sandbox_world.c 1970-01-01 01:00:00.000000000 +0100 +++ src/tools/cap/sandbox_world/sandbox_world.c 2010-08-25 10:24:35.000000000 +0200 @@ -0,0 +1,152 @@ +/*- + * Copyright (c) 2009-2010 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* + * Almost your standard "hello world" application, only we run the printf in + * a sandbox, and we use a 1-byte synchronous RPC to make sure that the host + * doesn't exit until the sandbox is done. + */ + +#define MYNAME "sandbox_world" /* Binary to run in sandbox. */ + +int ld_insandbox(void); +int sandbox(void); + +/* + * Unsandboxed host process with full user rights. + */ +int +main(int argc, char *argv[]) +{ + struct lc_sandbox *lcsp; + char *sandbox_argv[3] = { MYNAME, "nested", NULL }; /* argv[1] is NULL when argc == 1, so name the binary explicitly */ + struct iovec iov; + size_t len; + char ch; + + if (ld_insandbox()) return sandbox(); + + if (argc != 1) + errx(-1, "usage: sandbox_world"); + + /* + * Create a sandbox, do permit access to stdout and stderr. 
+ */ + if (lch_start(MYNAME, sandbox_argv, LCH_PERMIT_STDERR | + LCH_PERMIT_STDOUT, NULL, &lcsp) < 0) + err(-1, "lch_start %s", MYNAME); + + /* + * Send a one-byte message to the sandbox and wait for a one-byte + * reply. + */ + ch = 'X'; + iov.iov_base = &ch; + iov.iov_len = sizeof(ch); + if (lch_rpc(lcsp, 0, &iov, 1, &iov, 1, &len) < 0) + err(-1, "lch_rpc"); + if (len != sizeof(ch)) + errx(-1, "lch_rpc returned size %zu not %zu", len, sizeof(ch)); + if (ch != 'X') + errx(-1, "lch_recv: expected %d and got %d", 'X', ch); + + /* + * Terminate the sandbox when done. + */ + lch_stop(lcsp); +} + +/* + * Sandboxed process implementing a 'printf hello world' RPC. + */ +int +sandbox() +{ + struct lc_host *lchp; + u_int32_t opno, seqno; + struct iovec iov; + u_char *buffer; + size_t len; + + if (lcs_get(&lchp) < 0) + err(-1, "lcs_get"); + + /* + * Serve RPCs from the host until the sandbox is killed. + */ + while (1) { + /* + * Receive a one-byte RPC from the host. + */ + if (lcs_recvrpc(lchp, &opno, &seqno, &buffer, &len) < 0) { + if (errno != EPIPE) + err(-6, "lcs_recvrpc"); + else + exit(-6); + } + if (len != 1) + errx(-7, "lcs_recvrpc len"); + printf("Hello world!\n"); + fflush(stdout); + + /* + * Reply with the same message. Remember to free the message + * when done. 
+ */ + iov.iov_base = buffer; + iov.iov_len = 1; + if (lcs_sendrpc(lchp, opno, seqno, &iov, 1) < 0) { + if (errno != EPIPE) + err(-8, "lcs_sendrpc"); + else + exit(-8); + } + free(buffer); + } + + return 0; +} diff -aurN -x '*.orig' src-clean/tools/regression/sockets/unix_gc/unix_gc.c src/tools/regression/sockets/unix_gc/unix_gc.c --- src-clean/tools/regression/sockets/unix_gc/unix_gc.c 2010-08-25 10:09:33.000000000 +0200 +++ src/tools/regression/sockets/unix_gc/unix_gc.c 2010-08-25 10:24:35.000000000 +0200 @@ -256,8 +256,8 @@ after_openfiles = getopenfiles(); if (after_openfiles != before_openfiles) - warnx("%s: before: %d, after: %d", test, before_openfiles, - after_openfiles); + warnx("%s: before openfiles: %d, after openfiles: %d", + test, before_openfiles, after_openfiles); } static void diff -aurN -x '*.orig' src-clean/tools/tools/syscall_timing/Makefile src/tools/tools/syscall_timing/Makefile --- src-clean/tools/tools/syscall_timing/Makefile 2010-08-25 10:09:34.000000000 +0200 +++ src/tools/tools/syscall_timing/Makefile 2010-08-25 10:24:35.000000000 +0200 @@ -3,7 +3,8 @@ # PROG= syscall_timing -CFLAGS+= -static -O +CFLAGS+= -O -Wall -rdynamic NO_MAN= +LDADD= -lcapsicum -lsbuf .include diff -aurN -x '*.orig' src-clean/tools/tools/syscall_timing/syscall_timing.c src/tools/tools/syscall_timing/syscall_timing.c --- src-clean/tools/tools/syscall_timing/syscall_timing.c 2010-08-25 10:09:34.000000000 +0200 +++ src/tools/tools/syscall_timing/syscall_timing.c 2010-08-25 10:24:35.000000000 +0200 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2003-2004 Robert N. M. Watson + * Copyright (c) 2003-2004, 2010 Robert N. M. Watson * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -27,26 +27,53 @@ */ #include +#include +#include +#include +#include #include +#include #include +#include #include +#include +#include +#include +#include +#include #include #include #include #include +static struct timespec ts_start, ts_end; + #define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) + do { \ + (vvp)->tv_sec -= (uvp)->tv_sec; \ + (vvp)->tv_nsec -= (uvp)->tv_nsec; \ + if ((vvp)->tv_nsec < 0) { \ + (vvp)->tv_sec--; \ + (vvp)->tv_nsec += 1000000000; \ + } \ + } while (0) + +static void +benchmark_start(void) +{ + + assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0); +} + +static void +benchmark_stop(void) +{ + + assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0); +} -inline void +void test_getuid(int num) { int i; @@ -55,11 +82,13 @@ * Thread-local data should require no locking if system * call is MPSAFE. */ + benchmark_start(); for (i = 0; i < num; i++) getuid(); + benchmark_stop(); } -inline void +void test_getppid(int num) { int i; @@ -68,28 +97,28 @@ * This is process-local, but can change, so will require a * lock. */ + benchmark_start(); for (i = 0; i < num; i++) getppid(); + benchmark_stop(); } -inline void +void test_clock_gettime(int num) { struct timespec ts; int i; - for (i = 0; i < num; i++) { - if (clock_gettime(CLOCK_REALTIME, &ts) == -1) { - perror("clock_gettime"); - exit(-1); - } - } + benchmark_start(); + for (i = 0; i < num; i++) + (void)clock_gettime(CLOCK_REALTIME, &ts); + benchmark_stop(); } -inline void +void test_pipe(int num) { - int i; + int fd[2], i; /* * pipe creation is expensive, as it will allocate a new file @@ -97,153 +126,761 @@ * Destroying is also expensive, as we now have to free up * the file descriptors and return the pipe. 
*/ + if (pipe(fd) < 0) + err(-1, "test_pipe: pipe"); + close(fd[0]); + close(fd[1]); + benchmark_start(); for (i = 0; i < num; i++) { - int fd[2]; - if (pipe(fd) == -1) { - perror("pipe"); - exit(-1); - } - + if (pipe(fd) == -1) + err(-1, "test_pipe: pipe"); close(fd[0]); close(fd[1]); } + benchmark_stop(); } -inline void +void test_socket_stream(int num) { int i, so; + so = socket(PF_LOCAL, SOCK_STREAM, 0); + if (so < 0) + err(-1, "test_socket_stream: socket"); + close(so); + benchmark_start(); for (i = 0; i < num; i++) { so = socket(PF_LOCAL, SOCK_STREAM, 0); - if (so == -1) { - perror("socket_stream"); - exit(-1); - } + if (so == -1) + err(-1, "test_socket_stream: socket"); close(so); } + benchmark_stop(); } -inline void +void test_socket_dgram(int num) { int i, so; + so = socket(PF_LOCAL, SOCK_DGRAM, 0); + if (so < 0) + err(-1, "test_socket_dgram: socket"); + close(so); + benchmark_start(); for (i = 0; i < num; i++) { so = socket(PF_LOCAL, SOCK_DGRAM, 0); - if (so == -1) { - perror("socket_dgram"); - exit(-1); - } + if (so == -1) + err(-1, "test_socket_dgram: socket"); close(so); } + benchmark_stop(); } -inline void +void test_socketpair_stream(int num) { int i, so[2]; + if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) == -1) + err(-1, "test_socketpair_stream: socketpair"); + close(so[0]); + close(so[1]); + benchmark_start(); for (i = 0; i < num; i++) { - if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) == -1) { - perror("socketpair_stream"); - exit(-1); - } + if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) == -1) + err(-1, "test_socketpair_stream: socketpair"); close(so[0]); close(so[1]); } + benchmark_stop(); } -inline void +void test_socketpair_dgram(int num) { int i, so[2]; + if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, so) == -1) + err(-1, "test_socketpair_dgram: socketpair"); + close(so[0]); + close(so[1]); + benchmark_start(); + for (i = 0; i < num; i++) { + if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, so) == -1) + err(-1, "test_socketpair_dgram: socketpair"); + 
close(so[0]); + close(so[1]); + } + benchmark_stop(); +} + +void +test_dup(int num) +{ + int fd, i, shmfd; + + shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); + if (shmfd < 0) + err(-1, "test_dup: shm_open"); + fd = dup(shmfd); + if (fd >= 0) + close(fd); + benchmark_start(); + for (i = 0; i < num; i++) { + fd = dup(shmfd); + if (fd >= 0) + close(fd); + } + benchmark_stop(); + close(shmfd); +} + +void +test_cap_new(int num) +{ + int fd, i, shmfd; + + shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); + if (shmfd < 0) + err(-1, "test_cap_new: shm_open"); + fd = cap_new(shmfd, 0); + if (fd >= 0) + close(fd); + benchmark_start(); + for (i = 0; i < num; i++) { + fd = cap_new(shmfd, 0); + if (fd >= 0) + close(fd); + } + benchmark_stop(); + close(shmfd); +} + +void +test_shmfd(int num) +{ + int i, shmfd; + + shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); + if (shmfd < 0) + err(-1, "test_shmfd: shm_open"); + close(shmfd); + benchmark_start(); + for (i = 0; i < num; i++) { + shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); + if (shmfd < 0) + err(-1, "test_shmfd: shm_open"); + close(shmfd); + } + benchmark_stop(); +} + +void +test_cap_shmfd(int num) +{ + int fd, i, shmfd; + + shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); + if (shmfd < 0) + err(-1, "test_cap_shmfd: shm_open"); + fd = cap_new(shmfd, 0); + if (fd < 0) + err(-1, "test_cap_shmfd: cap_new"); + close(fd); + close(shmfd); + benchmark_start(); + for (i = 0; i < num; i++) { + shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); + if (shmfd < 0) + err(-1, "test_cap_shmfd: shm_open"); + fd = cap_new(shmfd, 0); + if (fd < 0) + err(-1, "test_cap_shmfd: cap_new"); + close(fd); + close(shmfd); + } + benchmark_stop(); +} + +void +test_fstat_shmfd(int num) +{ + struct stat sb; + int i, shmfd; + + shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); + if (shmfd < 0) + err(-1, "test_fstat_shmfd: shm_open"); + if (fstat(shmfd, &sb) < 0) + err(-1, "test_fstat_shmfd: fstat"); + benchmark_start(); + for (i = 0; i 
< num; i++) + (void)fstat(shmfd, &sb); + benchmark_stop(); + close(shmfd); +} + +void +test_fstat_cap_shmfd(int num) +{ + struct stat sb; + int fd, i, shmfd; + + shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); + if (shmfd < 0) + err(-1, "test_fstat_cap_shmfd: shm_open"); + fd = cap_new(shmfd, CAP_FSTAT); /* only CAP_FSTAT is requested */ + if (fd < 0) + err(-1, "test_fstat_cap_shmfd: cap_new"); + if (fstat(fd, &sb) < 0) + err(-1, "test_fstat_cap_shmfd: fstat"); + benchmark_start(); + for (i = 0; i < num; i++) + (void)fstat(fd, &sb); + benchmark_stop(); + close(fd); + close(shmfd); +} + +void +test_cap_enter(int num) +{ + int i; + + /* XXXRW: Note that some tests will fail after this test. */ + + if (cap_enter() < 0) + err(-1, "test_cap_enter: cap_enter"); + benchmark_start(); for (i = 0; i < num; i++) { - if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, so) == -1) { - perror("socketpair_dgram"); - exit(-1); + if (cap_enter() < 0) + err(-1, "test_cap_enter: cap_enter"); + } + benchmark_stop(); +} + +void +test_fork(int num) +{ + pid_t pid; + int i; + + pid = fork(); + if (pid < 0) + err(-1, "test_fork: fork"); + if (pid == 0) + _exit(0); + if (waitpid(pid, NULL, 0) < 0) + err(-1, "test_fork: waitpid"); + benchmark_start(); + for (i = 0; i < num; i++) { + pid = fork(); + if (pid < 0) + err(-1, "test_fork: fork"); + if (pid == 0) + _exit(0); + if (waitpid(pid, NULL, 0) < 0) + err(-1, "test_fork: waitpid"); + } + benchmark_stop(); +} + +void +test_vfork(int num) +{ + pid_t pid; + int i; + + pid = vfork(); + if (pid < 0) + err(-1, "test_vfork: vfork"); + if (pid == 0) + _exit(0); + if (waitpid(pid, NULL, 0) < 0) + err(-1, "test_vfork: waitpid"); + benchmark_start(); + for (i = 0; i < num; i++) { + pid = vfork(); + if (pid < 0) + err(-1, "test_vfork: vfork"); + if (pid == 0) + _exit(0); + if (waitpid(pid, NULL, 0) < 0) + err(-1, "test_vfork: waitpid"); + } + benchmark_stop(); +} + +void +test_pdfork(int num) +{ + struct pollfd pollfd; + pid_t pid; + int fd, i, n; + + pid = pdfork(&fd); + if (pid < 0) + 
err(-1, "test_pdfork: pdfork"); + if (pid == 0) + _exit(0); + pollfd.fd = fd; + pollfd.events = POLLHUP; + pollfd.revents = 0; + n = poll(&pollfd, 1, INFTIM); + if (n < 0) + err(-1, "test_pdfork: poll"); + if (n != 1) + errx(-1, "test_pdfork: poll returned %d", n); + close(fd); + + benchmark_start(); + for (i = 0; i < num; i++) { + pid = pdfork(&fd); + if (pid < 0) + err(-1, "test_pdfork: pdfork"); + if (pid == 0) + _exit(0); + pollfd.fd = fd; + pollfd.events = POLLHUP; + pollfd.revents = 0; + n = poll(&pollfd, 1, INFTIM); + if (n < 0) + err(-1, "test_pdfork: poll"); + if (n != 1) + errx(-1, "test_pdfork: poll returned %d", n); + close(fd); + } + benchmark_stop(); +} + +#define USR_BIN_TRUE "/usr/bin/true" +static char *execve_args[] = { USR_BIN_TRUE, NULL}; +extern char **environ; + +void +test_fork_exec(int num) +{ + pid_t pid; + int i; + + pid = fork(); + if (pid < 0) + err(-1, "test_fork_exec: fork"); + if (pid == 0) { + (void)execve(USR_BIN_TRUE, execve_args, environ); + err(-1, "test_fork_exec: execve"); /* execve() returned, so it failed */ + } + if (waitpid(pid, NULL, 0) < 0) + err(-1, "test_fork_exec: waitpid"); + benchmark_start(); + for (i = 0; i < num; i++) { + pid = fork(); + if (pid < 0) + err(-1, "test_fork_exec: fork"); + if (pid == 0) { + (void)execve(USR_BIN_TRUE, execve_args, environ); + err(-1, "test_fork_exec: execve"); + } + if (waitpid(pid, NULL, 0) < 0) + err(-1, "test_fork_exec: waitpid"); + } + benchmark_stop(); +} + +void +test_vfork_exec(int num) +{ + pid_t pid; + int i; + + pid = vfork(); + if (pid < 0) + err(-1, "test_vfork_exec: vfork"); + if (pid == 0) { + (void)execve(USR_BIN_TRUE, execve_args, environ); + err(-1, "test_vfork_exec: execve"); + } + if (waitpid(pid, NULL, 0) < 0) + err(-1, "test_vfork_exec: waitpid"); + benchmark_start(); + for (i = 0; i < num; i++) { + pid = vfork(); + if (pid < 0) + err(-1, "test_vfork_exec: vfork"); + if (pid == 0) { + (void)execve(USR_BIN_TRUE, execve_args, environ); + err(-1, "test_vfork_exec: execve"); } + if (waitpid(pid, NULL, 0) < 0) + err(-1, "test_vfork_exec: 
waitpid"); + } + benchmark_stop(); +} + +void +test_pdfork_exec(int num) +{ + struct pollfd pollfd; + pid_t pid; + int fd, i, n; + + pid = pdfork(&fd); + if (pid < 0) + err(-1, "test_pdfork_exec: pdfork"); + if (pid == 0) { + (void)execve(USR_BIN_TRUE, execve_args, environ); + err(-1, "test_pdfork_exec: execve"); + } + pollfd.fd = fd; + pollfd.events = POLLHUP; + pollfd.revents = 0; + n = poll(&pollfd, 1, INFTIM); + if (n < 0) + err(-1, "test_pdfork_exec: poll"); + if (n != 1) + errx(-1, "test_pdfork_exec: poll returned %d", n); + close(fd); + + benchmark_start(); + for (i = 0; i < num; i++) { + pid = pdfork(&fd); + if (pid < 0) + err(-1, "test_pdfork_exec: pdfork"); + if (pid == 0) { + (void)execve(USR_BIN_TRUE, execve_args, environ); + err(-1, "test_pdfork_exec: execve"); + } + pollfd.fd = fd; + pollfd.events = POLLHUP; + pollfd.revents = 0; + n = poll(&pollfd, 1, INFTIM); + if (n < 0) + err(-1, "test_pdfork_exec: poll"); + if (n != 1) + errx(-1, "test_pdfork_exec: poll returned %d", n); + close(fd); + } + benchmark_stop(); +} + +void +test_chroot(int num) +{ + int i; + + if (chroot("/") < 0) + err(-1, "test_chroot: chroot"); + benchmark_start(); + for (i = 0; i < num; i++) { + if (chroot("/") < 0) + err(-1, "test_chroot: chroot"); + } + benchmark_stop(); +} + +void +test_setuid(int num) +{ + uid_t uid; + int i; + + uid = getuid(); + if (setuid(uid) < 0) + err(-1, "test_setuid: setuid"); + benchmark_start(); + for (i = 0; i < num; i++) { + if (setuid(uid) < 0) + err(-1, "test_setuid: setuid"); + } + benchmark_stop(); +} + +/* + * A bit like sandbox, in that a process is forked, IPC ping-pong is done, + * but with none of the sandboxing goo. 
+ */ +void +test_pingpong(int num) +{ + char ch; + int so[2]; + pid_t pid; + ssize_t len; + int i; + + if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) < 0) + err(-1, "test_pingpong: socketpair"); + pid = fork(); + if (pid < 0) + err(-1, "test_pingpong: fork"); + if (pid == 0) { close(so[0]); + len = recv(so[1], &ch, sizeof(ch), 0); + if (len < 0) + err(-1, "test_pingpong: child: recv"); + if (len != 1) + errx(-1, "test_pingpong: child: recv %d", (int)len); + len = send(so[1], &ch, sizeof(ch), 0); + if (len < 0) + err(-1, "test_pingpong: child: send"); + if (len != 1) + errx(-1, "test_pingpong: child: send %d", (int)len); + _exit(0); + } + close(so[1]); + len = send(so[0], &ch, sizeof(ch), 0); + if (len < 0) + err(-1, "test_pingpong: parent: send"); + if (len != 1) + errx(-1, "test_pingpong: parent: send %d", (int)len); + len = recv(so[0], &ch, sizeof(ch), 0); + if (len < 0) + err(-1, "test_pingpong: parent: recv"); + if (len != 1) + errx(-1, "test_pingpong: parent: recv %d", (int)len); + close(so[0]); + if (waitpid(pid, NULL, 0) < 0) + err(-1, "test_pingpong: waitpid"); + + benchmark_start(); + for (i = 0; i < num; i++) { + if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) < 0) + err(-1, "test_pingpong: socketpair"); + pid = fork(); + if (pid < 0) + err(-1, "test_pingpong: fork"); + if (pid == 0) { + close(so[0]); + len = recv(so[1], &ch, sizeof(ch), 0); + if (len < 0) + err(-1, "test_pingpong: child: recv"); + if (len != 1) + errx(-1, "test_pingpong: child: recv %d", + (int)len); + len = send(so[1], &ch, sizeof(ch), 0); + if (len < 0) + err(-1, "test_pingpong: child: send"); + if (len != 1) + errx(-1, "test_pingpong: child: send %d", + (int)len); + _exit(0); + } close(so[1]); + len = send(so[0], &ch, sizeof(ch), 0); + if (len < 0) + err(-1, "test_pingpong: parent: send"); + if (len != 1) + errx(-1, "test_pingpong: parent: send %d", (int)len); + len = recv(so[0], &ch, sizeof(ch), 0); + if (len < 0) + err(-1, "test_pingpong: parent: recv"); + if (len != 1) + errx(-1, 
"test_pingpong: parent: recv %d", (int)len); + close(so[0]); + if (waitpid(pid, NULL, 0) < 0) + err(-1, "test_pingpong: waitpid"); + } + benchmark_stop(); +} + +#define MYNAME "./syscall_timing" /* Binary to run in sandbox. */ + +/* + * Unsandboxed host process with full user rights. + */ +void +test_sandbox(int num) +{ + struct lc_sandbox *lcsp; + char *sandbox_argv[2] = { MYNAME, NULL }; + struct iovec iov; + size_t len; + char ch; + int i; + + if (lch_start(MYNAME, sandbox_argv, LCH_PERMIT_STDERR | + LCH_PERMIT_STDOUT, NULL, &lcsp) < 0) + err(-1, "lch_start %s", MYNAME); + ch = 'X'; + iov.iov_base = &ch; + iov.iov_len = sizeof(ch); + if (lch_rpc(lcsp, 0, &iov, 1, &iov, 1, &len) < 0) + err(-1, "lch_rpc"); + if (len != sizeof(ch)) + errx(-1, "lch_rpc returned size %zd not %zd", len, sizeof(ch)); + if (ch != 'X') + errx(-1, "lch_recv: expected %d and got %d", 'X', ch); + lch_stop(lcsp); + + benchmark_start(); + for (i = 0; i < num; i++) { + if (lch_start(MYNAME, sandbox_argv, LCH_PERMIT_STDERR | + LCH_PERMIT_STDOUT, NULL, &lcsp) < 0) + err(-1, "lch_start %s", MYNAME); + ch = 'X'; + iov.iov_base = &ch; + iov.iov_len = sizeof(ch); + if (lch_rpc(lcsp, 0, &iov, 1, &iov, 1, &len) < 0) + err(-1, "lch_rpc"); + if (len != sizeof(ch)) + errx(-1, "lch_rpc returned size %zd not %zd", len, + sizeof(ch)); + if (ch != 'X') + errx(-1, "lch_recv: expected %d and got %d", 'X', ch); + lch_stop(lcsp); + } + benchmark_stop(); +} + +int +cap_main(int argc, char *argv[]) +{ + struct lc_host *lchp; + u_int32_t opno, seqno; + struct iovec iov; + u_char *buffer; + size_t len; + + if (lcs_get(&lchp) < 0) + err(-1, "lcs_get"); + + /* + * Serve RPCs from the host until the sandbox is killed. + */ + while (1) { + /* + * Receive a one-byte RPC from the host. + */ + if (lcs_recvrpc(lchp, &opno, &seqno, &buffer, &len) < 0) { + if (errno != EPIPE) + err(-6, "lcs_recvrpc"); + else + exit(-6); + } + if (len != 1) + errx(-7, "lcs_recvrpc len"); + + /* + * Reply with the same message. 
Remember to free the message + * when done. + */ + iov.iov_base = buffer; + iov.iov_len = 1; + if (lcs_sendrpc(lchp, opno, seqno, &iov, 1) < 0) { + if (errno != EPIPE) + err(-8, "lcs_sendrpc"); + else + exit(-8); + } + free(buffer); } } +struct test { + const char *t_name; + void (*t_func)(int); +}; + +static const struct test tests[] = { + { "getuid", test_getuid }, + { "getppid", test_getppid }, + { "clock_gettime", test_clock_gettime }, + { "pipe", test_pipe }, + { "socket_stream", test_socket_stream }, + { "socket_dgram", test_socket_dgram }, + { "socketpair_stream", test_socketpair_stream }, + { "socketpair_dgram", test_socketpair_dgram }, + { "dup", test_dup }, + { "cap_new", test_cap_new }, + { "shmfd", test_shmfd }, + { "cap_shmfd", test_cap_shmfd }, + { "fstat_shmfd", test_fstat_shmfd }, + { "fstat_cap_shmfd", test_fstat_cap_shmfd }, + { "cap_enter", test_cap_enter }, + { "fork", test_fork }, + { "vfork", test_vfork }, + { "pdfork", test_pdfork }, + { "fork_exec", test_fork_exec }, + { "vfork_exec", test_vfork_exec }, + { "pdfork_exec", test_pdfork_exec }, + { "chroot", test_chroot }, + { "setuid", test_setuid }, + { "pingpong", test_pingpong }, + { "sandbox", test_sandbox }, +}; +static const int tests_count = sizeof(tests) / sizeof(tests[0]); + static void usage(void) { + int i; - fprintf(stderr, "syscall_timing [iterations] [test]\n"); - fprintf(stderr, - "supported tests: getuid getppid clock_gettime pipe\n" - "socket_stream socket_dgram socketpair_stream\n" - "socketpair_dgram\n"); + fprintf(stderr, "syscall_timing [iterations] [loops] [test]\n"); + for (i = 0; i < tests_count; i++) + fprintf(stderr, " %s\n", tests[i].t_name); exit(-1); } int main(int argc, char *argv[]) { - struct timespec ts_start, ts_end, ts_res; - int count; + struct timespec ts_res; + const struct test *the_test; + long long ll; + char *endp; + int i, j, k; + int iterations, loops; - if (argc != 3) + if (argc < 4) usage(); - count = atoi(argv[1]); - 
assert(clock_getres(CLOCK_REALTIME, &ts_res) == 0); - printf("Clock resolution: %d.%09lu\n", ts_res.tv_sec, ts_res.tv_nsec); - - if (strcmp(argv[2], "getuid") == 0) { - assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0); - test_getuid(count); - assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0); - } else if (strcmp(argv[2], "getppid") == 0) { - assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0); - test_getppid(count); - assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0); - } else if (strcmp(argv[2], "clock_gettime") == 0) { - assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0); - test_clock_gettime(count); - assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0); - } else if (strcmp(argv[2], "pipe") == 0) { - assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0); - test_pipe(count); - assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0); - } else if (strcmp(argv[2], "socket_stream") == 0) { - assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0); - test_socket_stream(count); - assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0); - } else if (strcmp(argv[2], "socket_dgram") == 0) { - assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0); - test_socket_dgram(count); - assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0); - } else if (strcmp(argv[2], "socketpair_stream") == 0) { - assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0); - test_socketpair_stream(count); - assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0); - } else if (strcmp(argv[2], "socketpair_dgram") == 0) { - assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0); - test_socketpair_dgram(count); - assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0); - } else + ll = strtoll(argv[1], &endp, 10); + if (*endp != 0 || ll < 1 || ll > 100000) /* divisor later: reject 0 */ usage(); + iterations = ll; - timespecsub(&ts_end, &ts_start); - - printf("test: %s\n", argv[2]); + ll = strtoll(argv[2], &endp, 10); + if (*endp != 0 || ll < 0 || ll > 100000) + usage(); + loops = ll; - printf("%d.%09lu for %d iterations\n", ts_end.tv_sec, - 
ts_end.tv_nsec, count); + assert(clock_getres(CLOCK_REALTIME, &ts_res) == 0); + printf("Clock resolution: %ju.%09ju\n", (uintmax_t)ts_res.tv_sec, + (uintmax_t)ts_res.tv_nsec); /* nsec zero-padded to 9 digits */ + printf("test\tloop\ttotal\titerations\tperiteration\n"); + + for (j = 3; j < argc; j++) { + the_test = NULL; + for (i = 0; i < tests_count; i++) { + if (strcmp(argv[j], tests[i].t_name) == 0) + the_test = &tests[i]; + } + if (the_test == NULL) + usage(); - /* - * Note. This assumes that each iteration takes less than - * a second, and that our total nanoseconds doesn't exceed - * the room in our arithmetic unit. Fine for system calls, - * but not for long things. - */ - ts_end.tv_sec *= 1000000000 / count; - printf("0.%09lu per/iteration\n", - ts_end.tv_sec + ts_end.tv_nsec / count); + /* + * Run one warmup, then do the real thing (loops) times. + */ + the_test->t_func(iterations); + for (k = 0; k < loops; k++) { + the_test->t_func(iterations); + timespecsub(&ts_end, &ts_start); + printf("%s\t%d\t", the_test->t_name, k); + printf("%ju.%09ju\t%d\t", (uintmax_t)ts_end.tv_sec, + (uintmax_t)ts_end.tv_nsec, iterations); + + /* + * Note. This assumes that each iteration takes less than + * a second, and that our total nanoseconds doesn't exceed + * the room in our arithmetic unit. Fine for system calls, + * but not for long things. 
+ */ + ts_end.tv_sec *= 1000000000 / iterations; + printf("0.%09ju\n", (uintmax_t)(ts_end.tv_sec + + ts_end.tv_nsec / iterations)); + } + } return (0); } diff -aurN -x '*.orig' src-clean/usr.bin/gzip/Makefile src/usr.bin/gzip/Makefile --- src-clean/usr.bin/gzip/Makefile 2010-08-25 10:09:40.000000000 +0200 +++ src/usr.bin/gzip/Makefile 2010-08-26 21:42:20.000000000 +0200 @@ -5,10 +5,12 @@ PROG= gzip MAN= gzip.1 gzexe.1 zdiff.1 zforce.1 zmore.1 znew.1 +SRCS= gzip.c gzsandbox.c -DPADD= ${LIBZ} -LDADD= -lz +DPADD= ${LIBZ} ${LIBCAPSICUM} +LDADD= -lz -lcapsicum WARNS?= 6 +CFLAGS+= -rdynamic # For sandbox cap_main .if ${MK_BZIP2_SUPPORT} != "no" DPADD+= ${LIBBZ2} @@ -17,6 +19,10 @@ CFLAGS+= -DNO_BZIP2_SUPPORT .endif +.if defined(RESCUE) +CFLAGS+= -DNO_SANDBOX_SUPPORT +.endif + SCRIPTS= gzexe zdiff zforce zmore znew MLINKS+= gzip.1 gunzip.1 \ @@ -27,6 +33,7 @@ LINKS+= ${BINDIR}/gzip ${BINDIR}/gunzip \ ${BINDIR}/gzip ${BINDIR}/gzcat \ ${BINDIR}/gzip ${BINDIR}/zcat \ - ${BINDIR}/zdiff ${BINDIR}/zcmp + ${BINDIR}/zdiff ${BINDIR}/zcmp \ + ${BINDIR}/gzip ${BINDIR}/gzip_sandbox .include diff -aurN -x '*.orig' src-clean/usr.bin/gzip/gzip.c src/usr.bin/gzip/gzip.c --- src-clean/usr.bin/gzip/gzip.c 2010-08-25 10:09:40.000000000 +0200 +++ src/usr.bin/gzip/gzip.c 2010-08-26 21:50:49.000000000 +0200 @@ -64,6 +64,7 @@ #include #include #include +#include "gzip.h" /* what type of file are we dealing with */ enum filetype { @@ -178,7 +179,7 @@ static int cflag; /* stdout mode */ static int dflag; /* decompress mode */ static int lflag; /* list mode */ -static int numflag = 6; /* gzip -1..-9 value */ +int numflag = 6; /* gzip -1..-9 value */ #ifndef SMALL static int fflag; /* force mode */ @@ -213,8 +214,6 @@ #ifdef SMALL #define gz_compress(if, of, sz, fn, tm) gz_compress(if, of, sz) #endif -static off_t gz_compress(int, int, off_t *, const char *, uint32_t); -static off_t gz_uncompress(int, int, char *, size_t, off_t *, const char *); static off_t file_compress(char *, char *, size_t); 
static off_t file_uncompress(char *, char *, size_t); static void handle_pathname(char *); @@ -244,10 +243,6 @@ static int check_outfile(const char *outfile); #endif -#ifndef NO_BZIP2_SUPPORT -static off_t unbzip2(int, int, char *, size_t, off_t *); -#endif - #ifndef NO_COMPRESS_SUPPORT static FILE *zdopen(int); static off_t zuncompress(FILE *, FILE *, char *, size_t, off_t *); @@ -259,6 +254,11 @@ int main(int, char **p); +#ifndef NO_SANDBOX_SUPPORT +int ld_insandbox(void); +int gzsandbox(void); +#endif + #ifdef SMALL #define getopt_long(a,b,c,d,e) getopt(a,b,c) #else @@ -539,7 +539,7 @@ #endif /* compress input to output. Return bytes read, -1 on error */ -static off_t +off_t gz_compress(int in, int out, off_t *gsizep, const char *origname, uint32_t mtime) { z_stream z; @@ -706,7 +706,7 @@ * uncompressed size written, and put the compressed sized read * into `*gsizep'. */ -static off_t +off_t gz_uncompress(int in, int out, char *pre, size_t prelen, off_t *gsizep, const char *filename) { @@ -1274,7 +1274,8 @@ } else out = STDOUT_FILENO; - insize = gz_compress(in, out, &size, basename(file), (uint32_t)isb.st_mtime); + insize = gz_compress_wrapper(in, out, &size, basename(file), + (uint32_t)isb.st_mtime); (void)close(in); @@ -1453,7 +1454,7 @@ goto lose; } - size = unbzip2(fd, zfd, NULL, 0, NULL); + size = unbzip2_wrapper(fd, zfd, NULL, 0, NULL); } else #endif @@ -1521,7 +1522,7 @@ return -1; /* XXX */ } - size = gz_uncompress(fd, zfd, NULL, 0, NULL, file); + size = gz_uncompress_wrapper(fd, zfd, NULL, 0, NULL, file); } if (close(fd) != 0) @@ -1678,12 +1679,12 @@ break; #endif case FT_GZIP: - usize = gz_uncompress(STDIN_FILENO, STDOUT_FILENO, - (char *)header1, sizeof header1, &gsize, "(stdin)"); + usize = gz_uncompress_wrapper(STDIN_FILENO, STDOUT_FILENO, + (char *)header1, sizeof header1, &gsize, "(stdin)"); break; #ifndef NO_BZIP2_SUPPORT case FT_BZIP2: - usize = unbzip2(STDIN_FILENO, STDOUT_FILENO, + usize = unbzip2_wrapper(STDIN_FILENO, STDOUT_FILENO, (char 
*)header1, sizeof header1, &gsize); break; #endif @@ -1752,8 +1753,9 @@ #endif mtime = (uint32_t)systime; } - - usize = gz_compress(STDIN_FILENO, STDOUT_FILENO, &gsize, "", mtime); + + usize = gz_compress_wrapper(STDIN_FILENO, STDOUT_FILENO, &gsize, "", + mtime); #ifndef SMALL if (vflag && !tflag && usize != -1 && gsize != -1) print_verbage(NULL, NULL, usize, gsize); diff -aurN -x '*.orig' src-clean/usr.bin/gzip/gzip.h src/usr.bin/gzip/gzip.h --- src-clean/usr.bin/gzip/gzip.h 1970-01-01 01:00:00.000000000 +0100 +++ src/usr.bin/gzip/gzip.h 2010-08-26 21:42:20.000000000 +0200 @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 2009-2010 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _GZIP_H_ +#define _GZIP_H_ + +/* + * We need to forward the global variable 'numflag' to the sandbox as well as + * function arguments. + */ +extern int numflag; + +off_t gz_compress(int in, int out, off_t *gsizep, const char *origname, + uint32_t mtime); +off_t gz_compress_wrapper(int in, int out, off_t *gsizep, + const char *origname, uint32_t mtime); +off_t gz_uncompress(int in, int out, char *pre, size_t prelen, + off_t *gsizep, const char *filename); +off_t gz_uncompress_wrapper(int in, int out, char *pre, size_t prelen, + off_t *gsizep, const char *filename); +off_t unbzip2(int in, int out, char *pre, size_t prelen, off_t *bytes_in); +off_t unbzip2_wrapper(int in, int out, char *pre, size_t prelen, + off_t *bytes_in); + +#endif /* !_GZIP_H_ */ diff -aurN -x '*.orig' src-clean/usr.bin/gzip/gzsandbox.c src/usr.bin/gzip/gzsandbox.c --- src-clean/usr.bin/gzip/gzsandbox.c 1970-01-01 01:00:00.000000000 +0100 +++ src/usr.bin/gzip/gzsandbox.c 2010-08-26 21:42:20.000000000 +0200 @@ -0,0 +1,410 @@ +/*- + * Copyright (c) 2009-2010 Robert N. M. Watson + * All rights reserved. + * + * WARNING: THIS IS EXPERIMENTAL SECURITY SOFTWARE THAT MUST NOT BE RELIED + * ON IN PRODUCTION SYSTEMS. IT WILL BREAK YOUR SOFTWARE IN NEW AND + * UNEXPECTED WAYS. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gzip.h" + +#define LC_USR_BIN_GZIP_SANDBOX "/usr/bin/gzip" + +#ifndef NO_SANDBOX_SUPPORT + +int gzsandbox(void); + +static char *lc_sandbox_argv[] = { __DECONST(char *, LC_USR_BIN_GZIP_SANDBOX), + NULL }; + +#define PROXIED_GZ_COMPRESS 1 +#define PROXIED_GZ_UNCOMPRESS 2 +#define PROXIED_UNBZIP2 3 + +static struct lc_sandbox *lcsp; +static int gzsandbox_initialized; +static int gzsandbox_enabled; + +static void +gzsandbox_initialize(void) +{ + + if (gzsandbox_initialized) + return; + gzsandbox_enabled = lch_autosandbox_isenabled("gzip"); + gzsandbox_initialized = 1; + if (!gzsandbox_enabled) + return; + + if (lch_start(LC_USR_BIN_GZIP_SANDBOX, lc_sandbox_argv, + LCH_PERMIT_STDERR, NULL, &lcsp) < 0) + err(-1, "lch_start %s", LC_USR_BIN_GZIP_SANDBOX); +} + +struct host_gz_compress_req { + char hgc_req_origname[PATH_MAX]; + int hgc_req_numflag; + uint32_t hgc_req_mtime; +} __packed; + +struct host_gz_compress_rep { + off_t hgc_rep_gsize; + off_t hgc_rep_retval; +} __packed; + +static off_t +gz_compress_insandbox(int in, int out, off_t *gsizep, const char *origname, + uint32_t mtime) +{ + struct host_gz_compress_req req; + struct host_gz_compress_rep rep; + struct iovec iov_req, iov_rep; + int fdarray[2]; + size_t len; + + bzero(&req, sizeof(req)); + strlcpy(req.hgc_req_origname, origname, + sizeof(req.hgc_req_origname)); + req.hgc_req_numflag = numflag; + req.hgc_req_mtime = mtime; + iov_req.iov_base = &req; + iov_req.iov_len = sizeof(req); + iov_rep.iov_base = &rep; + iov_rep.iov_len = sizeof(rep); + fdarray[0] = cap_new(in, CAP_FSTAT | CAP_READ | CAP_SEEK); + fdarray[1] = cap_new(out, CAP_FSTAT | CAP_WRITE | CAP_SEEK); + if (fdarray[0] == -1 || fdarray[1] == -1) + err(-1, "cap_new"); + if (lch_rpc_rights(lcsp, PROXIED_GZ_COMPRESS, &iov_req, 1, + fdarray, 2, &iov_rep, 1, &len, NULL, NULL) < 0) + err(-1, 
"lch_rpc_rights"); + if (len != sizeof(rep)) + errx(-1, "lch_rpc_rights len %zu", len); + if (gsizep != NULL) + *gsizep = rep.hgc_rep_gsize; + close(fdarray[0]); + close(fdarray[1]); + return (rep.hgc_rep_retval); +} + +static void +sandbox_gz_compress_buffer(struct lc_host *lchp, uint32_t opno, + uint32_t seqno, char *buffer, size_t len, int fd_in, int fd_out) +{ + struct host_gz_compress_req req; + struct host_gz_compress_rep rep; + struct iovec iov; + + if (len != sizeof(req)) /* protocol error; errno is not meaningful */ + errx(-1, "sandbox_gz_compress_buffer: len %zu", len); + + bcopy(buffer, &req, sizeof(req)); + bzero(&rep, sizeof(rep)); + numflag = req.hgc_req_numflag; + rep.hgc_rep_retval = gz_compress(fd_in, fd_out, &rep.hgc_rep_gsize, + req.hgc_req_origname, req.hgc_req_mtime); + iov.iov_base = &rep; + iov.iov_len = sizeof(rep); + if (lcs_sendrpc(lchp, opno, seqno, &iov, 1) < 0) + err(-1, "lcs_sendrpc"); +} + +off_t +gz_compress_wrapper(int in, int out, off_t *gsizep, const char *origname, + uint32_t mtime) +{ + + gzsandbox_initialize(); + if (gzsandbox_enabled) + return (gz_compress_insandbox(in, out, gsizep, origname, + mtime)); + else + return (gz_compress(in, out, gsizep, origname, mtime)); +} + +struct host_gz_uncompress_req { + size_t hgu_req_prelen; + char hgu_req_filename[PATH_MAX]; + /* ... followed by data ... 
*/ +}; + +struct host_gz_uncompress_rep { + off_t hgu_rep_gsize; + off_t hgu_rep_retval; +}; + +static off_t +gz_uncompress_insandbox(int in, int out, char *pre, size_t prelen, + off_t *gsizep, const char *filename) +{ + struct host_gz_uncompress_req req; + struct host_gz_uncompress_rep rep; + struct iovec iov_req[2], iov_rep; + int fdarray[2]; + size_t len; + + bzero(&req, sizeof(req)); + req.hgu_req_prelen = prelen; + strlcpy(req.hgu_req_filename, filename, + sizeof(req.hgu_req_filename)); + iov_req[0].iov_base = &req; + iov_req[0].iov_len = sizeof(req); + iov_req[1].iov_base = pre; + iov_req[1].iov_len = prelen; /* sent only when prelen > 0 */ + iov_rep.iov_base = &rep; + iov_rep.iov_len = sizeof(rep); + fdarray[0] = cap_new(in, CAP_FSTAT | CAP_READ | CAP_SEEK); + fdarray[1] = cap_new(out, CAP_FSTAT | CAP_WRITE | CAP_SEEK); + if (fdarray[0] == -1 || fdarray[1] == -1) + err(-1, "cap_new"); + if (lch_rpc_rights(lcsp, PROXIED_GZ_UNCOMPRESS, iov_req, + prelen > 0 ? 2 : 1, fdarray, 2, &iov_rep, 1, &len, NULL, NULL) < 0) + err(-1, "lch_rpc_rights"); + if (len != sizeof(rep)) + errx(-1, "lch_rpc_rights len %zu", len); + if (gsizep != NULL) + *gsizep = rep.hgu_rep_gsize; + close(fdarray[0]); + close(fdarray[1]); + return (rep.hgu_rep_retval); +} + +static void +sandbox_gz_uncompress_buffer(struct lc_host *lchp, uint32_t opno, + uint32_t seqno, char *buffer, size_t len, int fd_in, int fd_out) +{ + struct host_gz_uncompress_req req; + struct host_gz_uncompress_rep rep; + struct iovec iov; + + if (len < sizeof(req)) /* must hold the fixed header */ + errx(-1, "sandbox_gz_uncompress_buffer: len %zu", len); + bcopy(buffer, &req, sizeof(req)); + if (len - sizeof(req) != req.hgu_req_prelen) /* and the prefix */ + errx(-1, "sandbox_gz_uncompress_buffer: len %zu", len); + bzero(&rep, sizeof(rep)); + rep.hgu_rep_retval = gz_uncompress(fd_in, fd_out, + buffer + sizeof(req), req.hgu_req_prelen, &rep.hgu_rep_gsize, + req.hgu_req_filename); + iov.iov_base = &rep; + iov.iov_len = sizeof(rep); + if (lcs_sendrpc(lchp, opno, seqno, &iov, 1) < 0) + err(-1, "lcs_sendrpc"); +} + +off_t +gz_uncompress_wrapper(int in, int out, char *pre, size_t prelen, + off_t *gsizep, 
const char *filename) +{ + + gzsandbox_initialize(); + if (gzsandbox_enabled) + return (gz_uncompress_insandbox(in, out, pre, prelen, + gsizep, filename)); + else + return (gz_uncompress(in, out, pre, prelen, gsizep, + filename)); +} + +struct host_unbzip2_req { + size_t hub_req_prelen; + /* ... followed by data ... */ +}; + +struct host_unbzip2_rep { + off_t hub_rep_bytes_in; + off_t hub_rep_retval; +}; + +static off_t +unbzip2_insandbox(int in, int out, char *pre, size_t prelen, off_t *bytes_in) +{ + struct host_unbzip2_req req; + struct host_unbzip2_rep rep; + struct iovec iov_req[2], iov_rep; + int fdarray[2]; + size_t len; + + bzero(&req, sizeof(req)); + req.hub_req_prelen = prelen; + iov_req[0].iov_base = &req; + iov_req[0].iov_len = sizeof(req); + iov_req[1].iov_base = pre; + iov_req[1].iov_len = prelen; /* sent only when prelen > 0 */ + iov_rep.iov_base = &rep; + iov_rep.iov_len = sizeof(rep); + fdarray[0] = cap_new(in, CAP_FSTAT | CAP_READ | CAP_SEEK); + fdarray[1] = cap_new(out, CAP_FSTAT | CAP_WRITE | CAP_SEEK); + if (fdarray[0] == -1 || fdarray[1] == -1) + err(-1, "cap_new"); + if (lch_rpc_rights(lcsp, PROXIED_UNBZIP2, iov_req, prelen > 0 ? 2 : 1, + fdarray, 2, &iov_rep, 1, &len, NULL, NULL) < 0) + err(-1, "lch_rpc_rights"); + if (len != sizeof(rep)) + errx(-1, "lch_rpc_rights len %zu", len); + if (bytes_in != NULL) + *bytes_in = rep.hub_rep_bytes_in; + close(fdarray[0]); + close(fdarray[1]); + return (rep.hub_rep_retval); +} + +static void +sandbox_unbzip2_buffer(struct lc_host *lchp, uint32_t opno, + uint32_t seqno, char *buffer, size_t len, int fd_in, int fd_out) +{ + struct host_unbzip2_req req; + struct host_unbzip2_rep rep; + struct iovec iov; + + if (len < sizeof(req)) /* must hold the fixed header */ + errx(-1, "sandbox_unbzip2_buffer: len %zu", len); + + bcopy(buffer, &req, sizeof(req)); + if (len - sizeof(req) != req.hub_req_prelen) /* and the prefix */ + errx(-1, "sandbox_unbzip2_buffer: len %zu", len); + bzero(&rep, sizeof(rep)); + rep.hub_rep_retval = unbzip2(fd_in, fd_out, buffer + sizeof(req), + req.hub_req_prelen, &rep.hub_rep_bytes_in); + iov.iov_base = &rep; + iov.iov_len = sizeof(rep); + if 
(lcs_sendrpc(lchp, opno, seqno, &iov, 1) < 0) + err(-1, "lcs_sendrpc"); +} + +off_t +unbzip2_wrapper(int in, int out, char *pre, size_t prelen, off_t *bytes_in) +{ + + gzsandbox_initialize(); + if (gzsandbox_enabled) + return (unbzip2_insandbox(in, out, pre, prelen, bytes_in)); + else + return (unbzip2(in, out, pre, prelen, bytes_in)); +} + +/* + * Main entry point for capability-mode + */ +int gzsandbox(void) +{ + int fdarray[2], fdcount; + struct lc_host *lchp; + uint32_t opno, seqno; + u_char *buffer; + size_t len; + + if (lcs_get(&lchp) < 0) + errx(-1, "libcapsicum sandbox binary"); + + while (1) { + fdcount = 2; + if (lcs_recvrpc_rights(lchp, &opno, &seqno, &buffer, &len, + fdarray, &fdcount) < 0) { + if (errno == EPIPE) + exit(-1); + else + err(-1, "lcs_recvrpc_rights"); + } + switch (opno) { + case PROXIED_GZ_COMPRESS: + if (fdcount != 2) + errx(-1, "sandbox_workloop: %d fds", fdcount); + sandbox_gz_compress_buffer(lchp, opno, seqno, buffer, + len, fdarray[0], fdarray[1]); + close(fdarray[0]); + close(fdarray[1]); + break; + + case PROXIED_GZ_UNCOMPRESS: + if (fdcount != 2) + errx(-1, "sandbox_workloop: %d fds", fdcount); + sandbox_gz_uncompress_buffer(lchp, opno, seqno, + buffer, len, fdarray[0], fdarray[1]); + close(fdarray[0]); + close(fdarray[1]); + break; + + case PROXIED_UNBZIP2: + if (fdcount != 2) + errx(-1, "sandbox_workloop: %d fds", fdcount); + sandbox_unbzip2_buffer(lchp, opno, seqno, buffer, len, + fdarray[0], fdarray[1]); + close(fdarray[0]); + close(fdarray[1]); + break; + + default: + errx(-1, "sandbox_workloop: unknown op %d", opno); + } + free(buffer); + } +} + +#else /* NO_SANDBOX_SUPPORT */ + +off_t +gz_compress_wrapper(int in, int out, off_t *gsizep, const char *origname, + uint32_t mtime) +{ + + return (gz_compress(in, out, gsizep, origname, mtime)); +} + +off_t +gz_uncompress_wrapper(int in, int out, char *pre, size_t prelen, + off_t *gsizep, const char *filename) +{ + + return (gz_uncompress(in, out, pre, prelen, gsizep, filename)); 
+} + +off_t +unbzip2_wrapper(int in, int out, char *pre, size_t prelen, off_t *bytes_in) +{ + + return (unbzip2(in, out, pre, prelen, bytes_in)); +} + +#endif /* !NO_SANDBOX_SUPPORT */ diff -aurN -x '*.orig' src-clean/usr.bin/gzip/unbzip2.c src/usr.bin/gzip/unbzip2.c --- src-clean/usr.bin/gzip/unbzip2.c 2010-08-25 10:09:40.000000000 +0200 +++ src/usr.bin/gzip/unbzip2.c 2010-08-26 21:42:20.000000000 +0200 @@ -33,7 +33,7 @@ /* This file is #included by gzip.c */ -static off_t +off_t unbzip2(int in, int out, char *pre, size_t prelen, off_t *bytes_in) { int ret, end_of_file, cold = 0; diff -aurN -x '*.orig' src-clean/usr.bin/netstat/main.c src/usr.bin/netstat/main.c --- src-clean/usr.bin/netstat/main.c 2010-08-25 10:09:40.000000000 +0200 +++ src/usr.bin/netstat/main.c 2010-08-25 12:45:04.000000000 +0200 @@ -186,7 +186,9 @@ { .n_name = "_mfctablesize" }, #define N_ARPSTAT 55 { .n_name = "_arpstat" }, - { .n_name = NULL }, +#define N_UNP_SPHEAD 56 + { .n_name = "_unp_sphead" }, + { .n_name = NULL } }; struct protox { @@ -607,7 +609,8 @@ #endif /* NETGRAPH */ if ((af == AF_UNIX || af == AF_UNSPEC) && !sflag) unixpr(nl[N_UNP_COUNT].n_value, nl[N_UNP_GENCNT].n_value, - nl[N_UNP_DHEAD].n_value, nl[N_UNP_SHEAD].n_value); + nl[N_UNP_DHEAD].n_value, nl[N_UNP_SHEAD].n_value, + nl[N_UNP_SPHEAD].n_value); exit(0); } diff -aurN -x '*.orig' src-clean/usr.bin/netstat/netstat.h src/usr.bin/netstat/netstat.h --- src-clean/usr.bin/netstat/netstat.h 2010-08-25 10:09:40.000000000 +0200 +++ src/usr.bin/netstat/netstat.h 2010-08-25 10:24:35.000000000 +0200 @@ -151,7 +151,7 @@ void netgraphprotopr(u_long, const char *, int, int); #endif -void unixpr(u_long, u_long, u_long, u_long); +void unixpr(u_long, u_long, u_long, u_long, u_long); void esis_stats(u_long, const char *, int, int); void clnp_stats(u_long, const char *, int, int); diff -aurN -x '*.orig' src-clean/usr.bin/netstat/unix.c src/usr.bin/netstat/unix.c --- src-clean/usr.bin/netstat/unix.c 2010-08-25 10:09:40.000000000 +0200 +++ 
src/usr.bin/netstat/unix.c 2010-08-25 10:24:35.000000000 +0200 @@ -193,21 +193,37 @@ } void -unixpr(u_long count_off, u_long gencnt_off, u_long dhead_off, u_long shead_off) +unixpr(u_long count_off, u_long gencnt_off, u_long dhead_off, u_long shead_off, + u_long sphead_off) { char *buf; int ret, type; struct xsocket *so; struct xunpgen *xug, *oxug; struct xunpcb *xunp; + u_long head_off; for (type = SOCK_STREAM; type <= SOCK_SEQPACKET; type++) { if (live) ret = pcblist_sysctl(type, &buf); - else - ret = pcblist_kvm(count_off, gencnt_off, - type == SOCK_STREAM ? shead_off : - (type == SOCK_DGRAM ? dhead_off : 0), &buf); + else { + head_off = 0; + switch (type) { + case SOCK_STREAM: + head_off = shead_off; + break; + + case SOCK_DGRAM: + head_off = dhead_off; + break; + + case SOCK_SEQPACKET: + head_off = sphead_off; + break; + } + ret = pcblist_kvm(count_off, gencnt_off, head_off, + &buf); + } if (ret == -1) continue; if (ret < 0) diff -aurN -x '*.orig' src-clean/usr.bin/procstat/procstat.1 src/usr.bin/procstat/procstat.1 --- src-clean/usr.bin/procstat/procstat.1 2010-08-25 10:09:40.000000000 +0200 +++ src/usr.bin/procstat/procstat.1 2010-08-25 10:24:35.000000000 +0200 @@ -1,5 +1,5 @@ .\"- -.\" Copyright (c) 2007-2008 Robert N. M. Watson +.\" Copyright (c) 2007-2009 Robert N. M. Watson .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without @@ -33,7 +33,7 @@ .Nd get detailed process information .Sh SYNOPSIS .Nm -.Op Fl h +.Op Fl hC .Op Fl n .Op Fl w Ar interval .Op Fl b | c | f | i | j | k | s | t | v @@ -88,6 +88,11 @@ .Fl w flag is not specified, the output will not repeat. .Pp +The +.Fl C +flag requests the printing of additional capability information in the file +descriptor view. +.Pp Some information, such as VM and file descriptor information, is available only to the owner of a process or the superuser. 
.Ss Binary Information @@ -116,7 +121,8 @@ Display detailed information about each file descriptor referenced by a process, including the process ID, command, file descriptor number, and per-file descriptor object information, such as object type and file system -path: +path. +By default, the following information will be printed: .Pp .Bl -tag -width indent -compact .It PID @@ -138,14 +144,18 @@ .It PRO network protocol .It NAME -file path or socket addresses (if available) +file path, process ID, or socket addresses (if available) .El .Pp The following file descriptor types may be displayed: .Pp .Bl -tag -width X -compact +.It a +capability .It c crypto +.It e +semaphore .It f fifo .It h @@ -156,6 +166,8 @@ message queue .It p pipe +.It P +process descriptor .It s socket .It t @@ -202,7 +214,17 @@ direct I/O .It l lock held +.It C +descriptor is a capability .El +.Pp +If the +.Fl C +flag is specified, the vnode type, reference count, and offset fields will be +omitted, and a new capabilities field will be included listing capabilities, +as described in +.Xr cap_new 2 , +present for each capability descriptor. 
.Ss Signal Disposition Information Display signal pending and disposition for a process: .Pp @@ -396,6 +418,7 @@ .Xr fstat 1 , .Xr ps 1 , .Xr sockstat 1 , +.Xr cap_new 2 , .Xr ddb 4 , .Xr stack 9 .Sh AUTHORS diff -aurN -x '*.orig' src-clean/usr.bin/procstat/procstat.c src/usr.bin/procstat/procstat.c --- src-clean/usr.bin/procstat/procstat.c 2010-08-25 10:09:40.000000000 +0200 +++ src/usr.bin/procstat/procstat.c 2010-08-25 10:24:35.000000000 +0200 @@ -39,13 +39,13 @@ #include "procstat.h" static int aflag, bflag, cflag, fflag, iflag, jflag, kflag, sflag, tflag, vflag; -int hflag, nflag; +int hflag, nflag, Cflag; static void usage(void) { - fprintf(stderr, "usage: procstat [-h] [-n] [-w interval] [-b | -c | -f | " + fprintf(stderr, "usage: procstat [-hC] [-n] [-w interval] [-b | -c | -f | " "-i | -j | -k | -s | -t | -v]\n"); fprintf(stderr, " [-a | pid ...]\n"); exit(EX_USAGE); @@ -113,7 +113,7 @@ char *dummy; interval = 0; - while ((ch = getopt(argc, argv, "abcfijknhstvw:")) != -1) { + while ((ch = getopt(argc, argv, "abcfijknhstvw:C")) != -1) { switch (ch) { case 'a': aflag++; @@ -172,6 +172,10 @@ interval = l; break; + case 'C': + Cflag++; + break; + case '?': default: usage(); @@ -194,6 +198,10 @@ if (!(aflag == 1 && argc == 0) && !(aflag == 0 && argc > 0)) usage(); + /* Only allow -C with -f. 
*/ + if (Cflag && !fflag) + usage(); + do { if (aflag) { name[0] = CTL_KERN; diff -aurN -x '*.orig' src-clean/usr.bin/procstat/procstat.h src/usr.bin/procstat/procstat.h --- src-clean/usr.bin/procstat/procstat.h 2010-08-25 10:09:40.000000000 +0200 +++ src/usr.bin/procstat/procstat.h 2010-08-25 10:24:35.000000000 +0200 @@ -29,7 +29,7 @@ #ifndef PROCSTAT_H #define PROCSTAT_H -extern int hflag, nflag; +extern int hflag, nflag, Cflag; struct kinfo_proc; void kinfo_proc_sort(struct kinfo_proc *kipp, int count); diff -aurN -x '*.orig' src-clean/usr.bin/procstat/procstat_cred.c src/usr.bin/procstat/procstat_cred.c --- src-clean/usr.bin/procstat/procstat_cred.c 2010-08-25 10:09:40.000000000 +0200 +++ src/usr.bin/procstat/procstat_cred.c 2010-08-25 10:24:35.000000000 +0200 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2007 Robert N. M. Watson + * Copyright (c) 2007-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -47,9 +47,9 @@ gid_t *groups = NULL; if (!hflag) - printf("%5s %-16s %5s %5s %5s %5s %5s %5s %-20s\n", "PID", + printf("%5s %-16s %5s %5s %5s %5s %5s %5s %4s %-15s\n", "PID", "COMM", "EUID", "RUID", "SVUID", "EGID", "RGID", "SVGID", - "GROUPS"); + "FLAG", "GROUPS"); printf("%5d ", pid); printf("%-16s ", kipp->ki_comm); @@ -59,6 +59,8 @@ printf("%5d ", kipp->ki_groups[0]); printf("%5d ", kipp->ki_rgid); printf("%5d ", kipp->ki_svgid); + printf("%s", kipp->ki_cr_flags & CRED_FLAG_CAPMODE ? "C" : "-"); + printf(" "); /* * We may have too many groups to fit in kinfo_proc's statically diff -aurN -x '*.orig' src-clean/usr.bin/procstat/procstat_files.c src/usr.bin/procstat/procstat_files.c --- src-clean/usr.bin/procstat/procstat_files.c 2010-08-25 10:09:40.000000000 +0200 +++ src/usr.bin/procstat/procstat_files.c 2010-08-25 10:24:35.000000000 +0200 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2007 Robert N. M. Watson + * Copyright (c) 2007-2010 Robert N. M. Watson * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -27,6 +27,7 @@ */ #include +#include #include #include #include @@ -131,24 +132,158 @@ printf("%s", addr); } +static struct cap_desc { /* one capability right (cd_right) paired with its two-letter label (cd_desc) */ + cap_rights_t cd_right; + const char *cd_desc; +} cap_desc[] = { /* scanned in array order by width_capability() and print_capability() */ + { CAP_READ, "rd" }, + { CAP_WRITE, "wr" }, + { CAP_SEEK, "se" }, + { CAP_GETPEERNAME, "pn" }, + { CAP_GETSOCKNAME, "sn" }, + { CAP_FCHFLAGS, "cf" }, + { CAP_IOCTL, "io" }, + { CAP_FSTAT, "fs" }, + { CAP_MMAP, "mm" }, + { CAP_FCNTL, "fc" }, + { CAP_EVENT, "ev" }, + { CAP_FSYNC, "fy" }, + { CAP_FCHOWN, "cn" }, + { CAP_FCHMOD, "cm" }, + { CAP_FTRUNCATE, "ft" }, + { CAP_FLOCK, "fl" }, + { CAP_FSTATFS, "sf" }, + { CAP_REVOKE, "rv" }, + { CAP_FEXECVE, "fe" }, + { CAP_FPATHCONF, "fp" }, + { CAP_FUTIMES, "fu" }, + { CAP_ACL_GET, "ag" }, + { CAP_ACL_SET, "as" }, + { CAP_ACL_DELETE, "ad" }, + { CAP_ACL_CHECK, "ac" }, + { CAP_EXTATTR_GET, "eg" }, + { CAP_EXTATTR_SET, "es" }, + { CAP_EXTATTR_DELETE, "ed" }, + { CAP_EXTATTR_LIST, "el" }, + { CAP_MAC_GET, "mg" }, + { CAP_MAC_SET, "ms" }, + { CAP_ACCEPT, "at" }, + { CAP_CONNECT, "co" }, + { CAP_BIND, "bd" }, + { CAP_GETSOCKOPT, "gs" }, + { CAP_SETSOCKOPT, "ss" }, + { CAP_LISTEN, "ln" }, + { CAP_SHUTDOWN, "sh" }, + { CAP_PEELOFF, "pf" }, + { CAP_LOOKUP, "lo" }, + { CAP_SEM_POST, "sp" }, + { CAP_SEM_WAIT, "sw" }, + { CAP_SEM_GETVALUE, "sg" }, + { CAP_KEVENT, "ke" }, + { CAP_PDGETPID, "pg" }, + { CAP_PDWAIT, "pw" }, + { CAP_PDKILL, "pk" }, + { CAP_MAPEXEC, "me" }, + { CAP_TTYHOOK, "th" }, + { CAP_FCHDIR, "cd" }, + { CAP_FSCK, "fk" }, + { CAP_ATBASE, "ab" }, + { CAP_ABSOLUTEPATH, "ap" }, + { CAP_CREATE, "cr" }, + { CAP_DELETE, "de" }, + { CAP_MKDIR, "md" }, + { CAP_RMDIR, "rm" }, + { CAP_MKFIFO, "mf" }, +}; +static const u_int cap_desc_count = sizeof(cap_desc) / + sizeof(cap_desc[0]); /* number of entries in cap_desc[] */ + +static u_int +width_capability(cap_rights_t rights) +{ /* Return the column count print_capability() emits for 'rights': the label length of every cap_desc[] entry whose cd_right intersects 'rights', plus one per separator between labels; 0 when nothing matches. */ + u_int count, i, width; + + count = 0; + width = 0; + for (i = 0; i < cap_desc_count; i++) { + if (rights & 
cap_desc[i].cd_right) { + width += strlen(cap_desc[i].cd_desc); + if (count) + width++; /* one column for the separator before every label but the first */ + count++; + } + } + return (width); +} + +static void +print_capability(cap_rights_t rights, u_int capwidth) +{ /* Print the labels for 'rights' as a comma-separated list, left-padded to 'capwidth' columns; when 'rights' is zero the last pad column is "-" rather than a space. */ + u_int count, i, width; + + count = 0; + width = 0; /* NOTE(review): width is updated in the loop below but never read in this function */ + for (i = width_capability(rights); i < capwidth; i++) { /* emit the left padding */ + if (rights || i != (capwidth - 1)) + printf(" "); + else + printf("-"); + } + for (i = 0; i < cap_desc_count; i++) { + if (rights & cap_desc[i].cd_right) { + printf("%s%s", count ? "," : "", cap_desc[i].cd_desc); + width += strlen(cap_desc[i].cd_desc); + if (count) + width++; + count++; + } + } +} + void procstat_files(pid_t pid, struct kinfo_proc *kipp) { struct kinfo_file *freep, *kif; + u_int capwidth, width; int i, cnt; const char *str; - if (!hflag) - printf("%5s %-16s %4s %1s %1s %-8s %3s %7s %-3s %-12s\n", - "PID", "COMM", "FD", "T", "V", "FLAGS", "REF", "OFFSET", - "PRO", "NAME"); - + /* + * To print the header in capability mode, we need to know the width + * of the widest capability string. Even if we get no processes + * back, we will print the header, so we defer aborting due to a lack + * of processes until after the header logic. 
+ */ + capwidth = 0; freep = kinfo_getfile(pid, &cnt); + if (freep != NULL && Cflag) { + for (i = 0; i < cnt; i++) { + kif = &freep[i]; + width = width_capability(kif->kf_cap_rights); + if (width > capwidth) + capwidth = width; + } + if (capwidth < strlen("CAPABILITIES")) + capwidth = strlen("CAPABILITIES"); + } + + if (!hflag) { + if (Cflag) + printf("%5s %-16s %4s %1s %9s %-*s " + "%-3s %-12s\n", "PID", "COMM", "FD", "T", + "FLAGS", capwidth, "CAPABILITIES", "PRO", + "NAME"); + else + printf("%5s %-16s %4s %1s %1s %-9s %3s %7s %-3s " + "%-12s\n", "PID", "COMM", "FD", "T", "V", + "FLAGS", "REF", "OFFSET", "PRO", "NAME"); + } + if (freep == NULL) return; for (i = 0; i < cnt; i++) { kif = &freep[i]; - + printf("%5d ", pid); printf("%-16s ", kipp->ki_comm); switch (kif->kf_fd) { @@ -209,6 +344,10 @@ str = "e"; break; + case KF_TYPE_PROCDESC: + str = "P"; + break; + case KF_TYPE_NONE: case KF_TYPE_UNKNOWN: default: @@ -216,49 +355,51 @@ break; } printf("%1s ", str); - str = "-"; - if (kif->kf_type == KF_TYPE_VNODE) { - switch (kif->kf_vnode_type) { - case KF_VTYPE_VREG: - str = "r"; - break; - - case KF_VTYPE_VDIR: - str = "d"; - break; - - case KF_VTYPE_VBLK: - str = "b"; - break; - - case KF_VTYPE_VCHR: - str = "c"; - break; - - case KF_VTYPE_VLNK: - str = "l"; - break; - - case KF_VTYPE_VSOCK: - str = "s"; - break; - - case KF_VTYPE_VFIFO: - str = "f"; - break; - - case KF_VTYPE_VBAD: - str = "x"; - break; - - case KF_VTYPE_VNON: - case KF_VTYPE_UNKNOWN: - default: - str = "?"; - break; + if (!Cflag) { + str = "-"; + if (kif->kf_type == KF_TYPE_VNODE) { + switch (kif->kf_vnode_type) { + case KF_VTYPE_VREG: + str = "r"; + break; + + case KF_VTYPE_VDIR: + str = "d"; + break; + + case KF_VTYPE_VBLK: + str = "b"; + break; + + case KF_VTYPE_VCHR: + str = "c"; + break; + + case KF_VTYPE_VLNK: + str = "l"; + break; + + case KF_VTYPE_VSOCK: + str = "s"; + break; + + case KF_VTYPE_VFIFO: + str = "f"; + break; + + case KF_VTYPE_VBAD: + str = "x"; + break; + + case 
KF_VTYPE_VNON: + case KF_VTYPE_UNKNOWN: + default: + str = "?"; + break; + } } + printf("%1s ", str); } - printf("%1s ", str); printf("%s", kif->kf_flags & KF_FLAG_READ ? "r" : "-"); printf("%s", kif->kf_flags & KF_FLAG_WRITE ? "w" : "-"); printf("%s", kif->kf_flags & KF_FLAG_APPEND ? "a" : "-"); @@ -266,28 +407,42 @@ printf("%s", kif->kf_flags & KF_FLAG_FSYNC ? "f" : "-"); printf("%s", kif->kf_flags & KF_FLAG_NONBLOCK ? "n" : "-"); printf("%s", kif->kf_flags & KF_FLAG_DIRECT ? "d" : "-"); - printf("%s ", kif->kf_flags & KF_FLAG_HASLOCK ? "l" : "-"); - if (kif->kf_ref_count > -1) - printf("%3d ", kif->kf_ref_count); - else - printf("%3c ", '-'); - if (kif->kf_offset > -1) - printf("%7jd ", (intmax_t)kif->kf_offset); - else - printf("%7c ", '-'); + printf("%s", kif->kf_flags & KF_FLAG_HASLOCK ? "l" : "-"); + printf("%s ", kif->kf_flags & KF_FLAG_CAPABILITY ? "c" : "-"); + if (!Cflag) { + if (kif->kf_ref_count > -1) + printf("%3d ", kif->kf_ref_count); + else + printf("%3c ", '-'); + if (kif->kf_offset > -1) + printf("%7jd ", (intmax_t)kif->kf_offset); + else + printf("%7c ", '-'); + } + + if (Cflag) { + print_capability(kif->kf_cap_rights, capwidth); + printf(" "); + } + + switch (kif->kf_type) { + case KF_TYPE_SOCKET: + printf("%-3s ", + protocol_to_string(kif->kf_sock_domain, + kif->kf_sock_type, kif->kf_sock_protocol)); + break; + + default: + printf("%-3s ", "-"); + } switch (kif->kf_type) { case KF_TYPE_VNODE: case KF_TYPE_FIFO: case KF_TYPE_PTS: - printf("%-3s ", "-"); printf("%-18s", kif->kf_path); - break; case KF_TYPE_SOCKET: - printf("%-3s ", - protocol_to_string(kif->kf_sock_domain, - kif->kf_sock_type, kif->kf_sock_protocol)); /* * While generally we like to print two addresses, * local and peer, for sockets, it turns out to be @@ -310,8 +465,11 @@ } break; + case KF_TYPE_PROCDESC: + printf("%d", kif->kf_pid); + break; + default: - printf("%-3s ", "-"); printf("%-18s", "-"); } diff -aurN -x '*.orig' src-clean/usr.sbin/tcpdump/tcpdump/Makefile 
src/usr.sbin/tcpdump/tcpdump/Makefile --- src-clean/usr.sbin/tcpdump/tcpdump/Makefile 2010-08-25 10:09:38.000000000 +0200 +++ src/usr.sbin/tcpdump/tcpdump/Makefile 2010-08-25 10:24:35.000000000 +0200 @@ -50,8 +50,8 @@ CFLAGS+= -DLBL_ALIGN .endif -DPADD= ${LIBL} ${LIBPCAP} -LDADD= -ll -lpcap +DPADD= ${LIBL} ${LIBPCAP} ${LIBCAPSICUM} +LDADD= -ll -lpcap -lcapsicum .if ${MK_OPENSSL} != "no" && !defined(RELEASE_CRUNCH) DPADD+= ${LIBCRYPTO} LDADD+= -lcrypto