diff -urNp current/TODO hrl/TODO --- current/TODO 1970-01-01 01:00:00.000000000 +0100 +++ hrl/TODO 2011-01-31 19:39:02.342933108 +0100 @@ -0,0 +1,84 @@ +TODO: + + - RCTL accesses resource usage information without holding container_lock. + + - Fix cpu throttling. + + - Consider replacing proc pointer with thread pointer in rusage_add(9) et al. + In most cases caller uses 'td->td_proc' anyway, and passing thread would + allow the HRL code to send a signal to the offending thread instead of the + offending process. It would also mean that these functions would + no longer need to be called with proc lock held, because we could use + td->td_ucred, which is stable across the syscall. + + - Get rid of container_lock. Atomic instructions would be nice, but we really + need 64 bits (per-process counters could be 32 bit, I guess, but the higher + level containers could overflow), and atomic(9) doesn't support 64 bit values + on 32 bit platforms. Code ported from SunOS seems to use (and implement) + 64 bit atomics. + + - The jailstat(1)/userstat(1) tool seems to use lots of CPU time. Rewriting + it in C could make sense. + + - Currently limit enforcement involves walking over the list of rules applicable + to the process. I expect the number of such rules to be small (about five), + but some optimisation could be in order. Maybe a list of pointers to rules, + one entry per resource? + +Issues: + + - Setting RSS limit too low can make the system thrash to death. + + - We enforce limits when a process allocates a resource, and when it forks. + We don't enforce limits when a process changes its credentials, though. This + might be either a bug or a feature, depending on point of view. 
+ + - In the long term, the goal is to get rid of lim_get(9), chgproccnt(9) etc, + turning this: + + limit = lim_get(...); + if (value > limit) + return (EWHATEVER); + + [ do stuff ] + + into this: + + if (rusage_add(...)) + return (EWHATEVER); + + [ do stuff ] + + if (some error) + rusage_sub(...); + + However, this requires per-process limit enforcement to be hooked into + container mechanism. This is a part of HRL, and it's outside the scope + of the containers project. This means the code is full of "#ifdef CONTAINERS". + + - RUSAGE_NOFILE accounts for size of file descriptor table, rather than the number + of file descriptors. This shouldn't be a problem, but might be worth keeping + in mind. + + - We should have a limit for the number of files that were mmapped and then closed, + and remain mapped in memory. + + - There are several SysV-related global system limits that are not reflected + in the containers structure - RUSAGE_SHMMNI, for example. This needs + to be investigated; there are many of them and it wouldn't be user-friendly + to add ten new resource counters. + + - What about RLIMIT_RTPRIO and RLIMIT_RTTIME? Linux seems to have these. + +RCTL-specific issues: + + - Reconsider bringing back setrlimit(2) handling. + + - Reconsider bringing back per-group limits. + + - Some things need to be accounted for per-euid, and some per-ruid. Geez. + + - In maxproc limit, make sure the 'p' argument is a child process. Otherwise, + if one adds rule with 'sig*' action, the signal will be sent to the parent + instead of the child. 
+ diff -urNp current/etc/rc.d/Makefile hrl/etc/rc.d/Makefile --- current/etc/rc.d/Makefile 2011-01-31 20:37:06.283230269 +0100 +++ hrl/etc/rc.d/Makefile 2011-01-31 19:48:59.522960738 +0100 @@ -29,7 +29,7 @@ FILES= DAEMON FILESYSTEMS LOGIN NETWORKI pf pflog pfsync \ powerd power_profile ppp pppoed pwcheck \ quota \ - random rarpd resolv rfcomm_pppd_server root \ + random rarpd rctl resolv rfcomm_pppd_server root \ route6d routed routing rpcbind rtadvd rtsold rwho \ savecore sdpd securelevel sendmail \ serial sppp statd static_arp stf swap1 \ diff -urNp current/etc/rc.d/rctl hrl/etc/rc.d/rctl --- current/etc/rc.d/rctl 1970-01-01 01:00:00.000000000 +0100 +++ hrl/etc/rc.d/rctl 2011-01-31 19:49:00.872972261 +0100 @@ -0,0 +1,39 @@ +#!/bin/sh +# +# $FreeBSD$ +# + +# PROVIDE: rctl +# BEFORE: LOGIN +# KEYWORD: nojail + +. /etc/rc.subr + +name="rctl" +start_cmd="rctl_start" +stop_cmd="rctl_stop" + +rctl_start() +{ + if [ -f /etc/rctl.conf ]; then + while read var comments + do + case ${var} in + \#*|'') + ;; + *) + rctl -a "${var}" + ;; + esac + done < /etc/rctl.conf + fi +} + +rctl_stop() +{ + + rctl -r :: +} + +load_rc_config $name +run_rc_command "$1" diff -urNp current/include/unistd.h hrl/include/unistd.h --- current/include/unistd.h 2011-01-31 20:37:21.743005464 +0100 +++ hrl/include/unistd.h 2011-01-31 19:49:46.681070497 +0100 @@ -500,6 +500,7 @@ int feature_present(const char *); char *fflagstostr(u_long); int getdomainname(char *, int); int getgrouplist(const char *, gid_t, gid_t *, int *); +int getloginclass(char *, size_t); mode_t getmode(const void *, mode_t); int getosreldate(void); int getpeereid(int, uid_t *, gid_t *); @@ -560,6 +561,7 @@ int setkey(const char *); #define _SETKEY_DECLARED #endif int setlogin(const char *); +int setloginclass(const char *); void *setmode(const char *); void setproctitle(const char *_fmt, ...) 
__printf0like(1, 2); int setresgid(gid_t, gid_t, gid_t); diff -urNp current/lib/libc/sys/Symbol.map hrl/lib/libc/sys/Symbol.map --- current/lib/libc/sys/Symbol.map 2011-01-31 20:37:37.492906080 +0100 +++ hrl/lib/libc/sys/Symbol.map 2011-01-31 19:50:31.653000170 +0100 @@ -342,6 +342,7 @@ FBSD_1.1 { fexecve; fstatat; futimesat; + getloginclass; jail_get; jail_set; jail_remove; @@ -355,9 +356,15 @@ FBSD_1.1 { readlinkat; renameat; setfib; + setloginclass; shmctl; symlinkat; unlinkat; + rctl_get_usage; + rctl_get_rules; + rctl_get_limits; + rctl_add_rule; + rctl_remove_rule; }; FBSDprivate_1.0 { diff -urNp current/lib/libutil/login_cap.h hrl/lib/libutil/login_cap.h --- current/lib/libutil/login_cap.h 2011-01-31 20:37:47.923022198 +0100 +++ hrl/lib/libutil/login_cap.h 2011-01-31 19:51:17.282980796 +0100 @@ -49,7 +49,8 @@ #define LOGIN_SETENV 0x0080 /* set user environment */ #define LOGIN_SETMAC 0x0100 /* set user default MAC label */ #define LOGIN_SETCPUMASK 0x0200 /* set user cpumask */ -#define LOGIN_SETALL 0x03ff /* set everything */ +#define LOGIN_SETLOGINCLASS 0x0400 /* set login class in the kernel */ +#define LOGIN_SETALL 0x07ff /* set everything */ #define BI_AUTH "authorize" /* accepted authentication */ #define BI_REJECT "reject" /* rejected authentication */ diff -urNp current/lib/libutil/login_class.c hrl/lib/libutil/login_class.c --- current/lib/libutil/login_class.c 2011-01-31 20:37:47.923022198 +0100 +++ hrl/lib/libutil/login_class.c 2011-01-31 19:51:17.282980796 +0100 @@ -425,6 +425,7 @@ setusercontext(login_cap_t *lc, const st quad_t p; mode_t mymask; login_cap_t *llc = NULL; + sig_t prevsig; struct rtprio rtp; int error; @@ -512,6 +513,24 @@ setusercontext(login_cap_t *lc, const st return (-1); } + if (lc != NULL && lc->lc_class != NULL && (flags & LOGIN_SETLOGINCLASS)) { + /* Inform the kernel about current login class */ + /* + * XXX: This is a workaround to fail gracefully in case the kernel + * does not support setloginclass(2). 
+ */ + prevsig = signal(SIGSYS, SIG_IGN); + error = setloginclass(lc->lc_class); + signal(SIGSYS, prevsig); + if (error != 0) { + syslog(LOG_ERR, "setloginclass(%s): %m", lc->lc_class); +#ifdef notyet + login_close(llc); + return (-1); +#endif + } + } + mymask = (flags & LOGIN_SETUMASK) ? umask(LOGIN_DEFUMASK) : 0; mymask = setlogincontext(lc, pwd, mymask, flags); login_close(llc); diff -urNp current/share/man/man9/Makefile hrl/share/man/man9/Makefile --- current/share/man/man9/Makefile 2011-01-31 20:38:29.932962567 +0100 +++ hrl/share/man/man9/Makefile 2011-01-31 19:53:19.433054897 +0100 @@ -46,6 +46,7 @@ MAN= accept_filter.9 \ cd.9 \ condvar.9 \ config_intrhook.9 \ + container_create.9 \ contigmalloc.9 \ copy.9 \ cr_cansee.9 \ @@ -223,6 +224,8 @@ MAN= accept_filter.9 \ rtalloc.9 \ rtentry.9 \ runqueue.9 \ + rusage_add.9 \ + rusage_get_limit.9 \ rwlock.9 \ sbuf.9 \ scheduler.9 \ @@ -544,6 +547,9 @@ MLINKS+=condvar.9 cv_broadcast.9 \ condvar.9 cv_wmesg.9 MLINKS+=config_intrhook.9 config_intrhook_disestablish.9 \ config_intrhook.9 config_intrhook_establish.9 +MLINKS+=container_create.9 container_destroy.9 \ + container_create.9 container_join.9 \ + container_create.9 container_leave.9 MLINKS+=contigmalloc.9 contigfree.9 MLINKS+=copy.9 copyin.9 \ copy.9 copyinstr.9 \ @@ -1002,6 +1008,8 @@ MLINKS+=runqueue.9 choosethread.9 \ runqueue.9 procrunnable.9 \ runqueue.9 remrunqueue.9 \ runqueue.9 setrunqueue.9 +MLINKS+=rusage_add.9 rusage_set.9 \ + rusage_add.9 rusage_sub.9 MLINKS+=rwlock.9 rw_assert.9 \ rwlock.9 rw_destroy.9 \ rwlock.9 rw_downgrade.9 \ diff -urNp current/share/man/man9/container_create.9 hrl/share/man/man9/container_create.9 --- current/share/man/man9/container_create.9 1970-01-01 01:00:00.000000000 +0100 +++ hrl/share/man/man9/container_create.9 2011-01-31 19:53:20.482995602 +0100 @@ -0,0 +1,122 @@ +.\"- +.\" Copyright (c) 2010 The FreeBSD Foundation +.\" All rights reserved. 
+.\" +.\" This software was developed by Edward Tomasz Napierala under sponsorship +.\" from the FreeBSD Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD$ +.\" +.Dd August 20, 2010 +.Dt CONTAINER_CREATE 9 +.Os +.Sh NAME +.Nm container_create , +.Nm container_destroy , +.Nm container_join , +.Nm container_leave +.Nd resource container hierarchy manipulation routines +.Sh SYNOPSIS +.In sys/container.h +.Ft void +.Fn container_create "struct container *container" +.Ft void +.Fn container_destroy "struct container *container" +.Ft int +.Fn container_join "struct container *child" "struct container *parent" +.Ft int +.Fn container_leave "struct container *child" "struct container *parent" +.Sh DESCRIPTION +The +.Fn container_create +function initializes a new, empty container. +The caller is responsible for providing zeroed storage. +.Pp +The +.Fn container_destroy +function destroys the container. +It automatically leaves its parent containers. +It is an error to destroy a container with child containers. +It is an error to destroy a container with non-zero resource utilisation +for reclaimable resources. +.Pp +The +.Fn container_join +function makes the container pointed to by +.Fa child +a child (subcontainer) of container pointed to by +.Fa parent . +Each container might have +.Dv 0 +or more parent containers. +When joining, the child resource usage accounting information gets propagated +to the parent. +Should it make the parent exceed its resource limits, a non-zero value is returned +and the container hierarchy and resource accounting information are rolled back to the +state before the call to +.Fn container_join . +If the container +.Dv P +is a parent container for +.Dv C , +it is an error to call +.Fn container_join "C" "P" +again without leaving +.Dv P +first. +.Pp +The +.Fn container_leave +function removes the container pointed to by +.Fa child +from the set of child containers for +.Fa parent . +The parent's resource usage accounting information is updated accordingly. +It is an error to call +.Fn container_leave "C" "P" +if +.Dv P +is not a parent container for +.Dv C . 
+.Pp +The +.Fn container_create +routine never sleeps. +The +.Fn container_destroy , +.Fn container_join , +and +.Fn container_leave +routines may perform bounded sleep. +See +.Xr locking 9 +for details. +.Sh SEE ALSO +.Xr locking 9 , +.Xr rusage_add 9 , +.Xr rusage_get_limit 9 +.Sh AUTHORS +Container manipulation routines were added by +.An Edward Tomasz Napierala Aq trasz@FreeBSD.org +under sponsorship from the FreeBSD Foundation. diff -urNp current/share/man/man9/rusage_add.9 hrl/share/man/man9/rusage_add.9 --- current/share/man/man9/rusage_add.9 1970-01-01 01:00:00.000000000 +0100 +++ hrl/share/man/man9/rusage_add.9 2011-01-31 19:53:22.993000645 +0100 @@ -0,0 +1,93 @@ +.\"- +.\" Copyright (c) 2010 The FreeBSD Foundation +.\" All rights reserved. +.\" +.\" This software was developed by Edward Tomasz Napierala under sponsorship +.\" from the FreeBSD Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd August 20, 2010 +.Dt RUSAGE_ADD 9 +.Os +.Sh NAME +.Nm rusage_add , +.Nm rusage_set , +.Nm rusage_sub +.Nd process resource usage accounting +.Sh SYNOPSIS +.In sys/container.h +.Ft int +.Fn rusage_add "struct proc *p" "int resource" "uint64_t amount" +.Ft int +.Fn rusage_set "struct proc *p" "int resource" "uint64_t amount" +.Ft void +.Fn rusage_sub "struct proc *p" "int resource" "uint64_t amount" +.Sh DESCRIPTION +The +.Fn rusage_add +function checks whether resource limits allow process +.Fa p +to allocate +.Fa amount +of +.Fa resource , +in addition to the amount already allocated at the time of the call. +If they do, resource usage counters are increased and +.Dv 0 +is returned; otherwise, non-zero value is returned. +.Pp +The +.Fn rusage_set +function checks whether resource limits allow process +.Fa p +to have +.Fa amount +of +.Fa resource +allocated, total. +If they do, resource usage counters are updated accordingly and +.Dv 0 +is returned; otherwise, non-zero value is returned. +.Pp +The +.Fn rusage_sub +function decreases resource usage information for process +.Fa p +by +.Fa amount . +.Pp +All three routines may perform bounded sleep, see +.Xr locking 9 +for details. +.Sh SEE ALSO +.Xr container_create 9 , +.Xr locking 9 , +.Xr rusage_get_limit 9 +.Sh AUTHORS +The +.Fn rusage_add +family of functions was added by +.An Edward Tomasz Napierala Aq trasz@FreeBSD.org +under sponsorship from the FreeBSD Foundation. 
diff -urNp current/share/man/man9/rusage_get_limit.9 hrl/share/man/man9/rusage_get_limit.9 --- current/share/man/man9/rusage_get_limit.9 1970-01-01 01:00:00.000000000 +0100 +++ hrl/share/man/man9/rusage_get_limit.9 2011-01-31 19:53:22.993000645 +0100 @@ -0,0 +1,64 @@ +.\"- +.\" Copyright (c) 2010 The FreeBSD Foundation +.\" All rights reserved. +.\" +.\" This software was developed by Edward Tomasz Napierala under sponsorship +.\" from the FreeBSD Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD$ +.\" +.Dd August 20, 2010 +.Dt RUSAGE_GET_LIMIT 9 +.Os +.Sh NAME +.Nm rusage_get_limit +.Nd retrieve process resource limit +.Sh SYNOPSIS +.In sys/container.h +.Ft uint64_t +.Fn rusage_get_limit "struct proc *p" "int resource" +.Sh DESCRIPTION +The +.Fn rusage_get_limit +function returns the total amount of +.Fa resource +the process +.Fa p +may have allocated. +It doesn't take into account the currently allocated amount. +The +.Fn rusage_get_limit +routine may perform bounded sleep, see +.Xr locking 9 +for details. +.Sh SEE ALSO +.Xr container_create 9 , +.Xr locking 9 , +.Xr rusage_add 9 +.Sh AUTHORS +The +.Fn rusage_get_limit +function was added by +.An Edward Tomasz Napierala Aq trasz@FreeBSD.org +under sponsorship from the FreeBSD Foundation. diff -urNp current/sys/amd64/conf/GENERIC hrl/sys/amd64/conf/GENERIC --- current/sys/amd64/conf/GENERIC 2011-01-31 20:38:37.609621752 +0100 +++ hrl/sys/amd64/conf/GENERIC 2011-01-31 19:53:45.143024461 +0100 @@ -62,8 +62,8 @@ options HWPMC_HOOKS # Necessary kernel options AUDIT # Security event auditing options MAC # TrustedBSD MAC Framework options FLOWTABLE # per-cpu routing cache -#options KDTRACE_FRAME # Ensure frames are compiled in -#options KDTRACE_HOOKS # Kernel DTrace hooks +options KDTRACE_FRAME # Ensure frames are compiled in +options KDTRACE_HOOKS # Kernel DTrace hooks options INCLUDE_CONFIG_FILE # Include this file in kernel # Debugging for use in -current @@ -77,6 +77,9 @@ options WITNESS # Enable checks to de options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones +options CONTAINERS +options HRL + # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel diff -urNp current/sys/compat/freebsd32/freebsd32_proto.h hrl/sys/compat/freebsd32/freebsd32_proto.h --- current/sys/compat/freebsd32/freebsd32_proto.h 2011-01-31 20:38:51.383053570 +0100 +++ hrl/sys/compat/freebsd32/freebsd32_proto.h 
2011-01-31 19:54:41.462947523 +0100 @@ -2,8 +2,8 @@ * System call prototypes. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/compat/freebsd32/freebsd32_proto.h,v 1.116 2010/06/28 18:17:21 kib Exp $ - * created from FreeBSD: head/sys/compat/freebsd32/syscalls.master 209579 2010-06-28 18:06:46Z kib + * $FreeBSD$ + * created from FreeBSD: src/sys/compat/freebsd32/syscalls.master,v 1.132 2010/06/28 18:06:46 kib Exp */ #ifndef _FREEBSD32_SYSPROTO_H_ diff -urNp current/sys/compat/freebsd32/freebsd32_syscall.h hrl/sys/compat/freebsd32/freebsd32_syscall.h --- current/sys/compat/freebsd32/freebsd32_syscall.h 2011-01-31 20:38:51.383053570 +0100 +++ hrl/sys/compat/freebsd32/freebsd32_syscall.h 2011-01-31 19:54:41.613048444 +0100 @@ -2,8 +2,8 @@ * System call numbers. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/compat/freebsd32/freebsd32_syscall.h,v 1.112 2010/06/28 18:17:21 kib Exp $ - * created from FreeBSD: head/sys/compat/freebsd32/syscalls.master 209579 2010-06-28 18:06:46Z kib + * $FreeBSD$ + * created from FreeBSD: src/sys/compat/freebsd32/syscalls.master,v 1.132 2010/06/28 18:06:46 kib Exp */ #define FREEBSD32_SYS_syscall 0 @@ -410,4 +410,11 @@ #define FREEBSD32_SYS_freebsd32_shmctl 512 #define FREEBSD32_SYS_lpathconf 513 #define FREEBSD32_SYS_freebsd32_pselect 522 -#define FREEBSD32_SYS_MAXSYSCALL 523 +#define FREEBSD32_SYS_getloginclass 523 +#define FREEBSD32_SYS_setloginclass 524 +#define FREEBSD32_SYS_rctl_get_usage 525 +#define FREEBSD32_SYS_rctl_get_rules 526 +#define FREEBSD32_SYS_rctl_get_limits 527 +#define FREEBSD32_SYS_rctl_add_rule 528 +#define FREEBSD32_SYS_rctl_remove_rule 529 +#define FREEBSD32_SYS_MAXSYSCALL 530 diff -urNp current/sys/compat/freebsd32/freebsd32_syscalls.c hrl/sys/compat/freebsd32/freebsd32_syscalls.c --- current/sys/compat/freebsd32/freebsd32_syscalls.c 2011-01-31 20:38:51.383053570 +0100 +++ hrl/sys/compat/freebsd32/freebsd32_syscalls.c 2011-01-31 19:54:41.642927940 +0100 
@@ -2,8 +2,8 @@ * System call names. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/compat/freebsd32/freebsd32_syscalls.c,v 1.103 2010/06/28 18:17:21 kib Exp $ - * created from FreeBSD: head/sys/compat/freebsd32/syscalls.master 209579 2010-06-28 18:06:46Z kib + * $FreeBSD$ + * created from FreeBSD: src/sys/compat/freebsd32/syscalls.master,v 1.132 2010/06/28 18:06:46 kib Exp */ const char *freebsd32_syscallnames[] = { @@ -546,4 +546,11 @@ const char *freebsd32_syscallnames[] = { "#520", /* 520 = pdgetpid */ "#521", /* 521 = pdwait */ "freebsd32_pselect", /* 522 = freebsd32_pselect */ + "getloginclass", /* 523 = getloginclass */ + "setloginclass", /* 524 = setloginclass */ + "rctl_get_usage", /* 525 = rctl_get_usage */ + "rctl_get_rules", /* 526 = rctl_get_rules */ + "rctl_get_limits", /* 527 = rctl_get_limits */ + "rctl_add_rule", /* 528 = rctl_add_rule */ + "rctl_remove_rule", /* 529 = rctl_remove_rule */ }; diff -urNp current/sys/compat/freebsd32/freebsd32_sysent.c hrl/sys/compat/freebsd32/freebsd32_sysent.c --- current/sys/compat/freebsd32/freebsd32_sysent.c 2011-01-31 20:38:51.437334771 +0100 +++ hrl/sys/compat/freebsd32/freebsd32_sysent.c 2011-01-31 19:54:41.663148666 +0100 @@ -2,8 +2,8 @@ * System call switch table. * * DO NOT EDIT-- this file is automatically generated. 
- * $FreeBSD: src/sys/compat/freebsd32/freebsd32_sysent.c,v 1.114 2010/06/28 18:17:21 kib Exp $ - * created from FreeBSD: head/sys/compat/freebsd32/syscalls.master 209579 2010-06-28 18:06:46Z kib + * $FreeBSD$ + * created from FreeBSD: src/sys/compat/freebsd32/syscalls.master,v 1.132 2010/06/28 18:06:46 kib Exp */ #include "opt_compat.h" @@ -583,4 +583,11 @@ struct sysent freebsd32_sysent[] = { { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 520 = pdgetpid */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 521 = pdwait */ { AS(freebsd32_pselect_args), (sy_call_t *)freebsd32_pselect, AUE_SELECT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 522 = freebsd32_pselect */ + { AS(getloginclass_args), (sy_call_t *)getloginclass, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 523 = getloginclass */ + { AS(setloginclass_args), (sy_call_t *)setloginclass, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 524 = setloginclass */ + { AS(rctl_get_usage_args), (sy_call_t *)rctl_get_usage, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 525 = rctl_get_usage */ + { AS(rctl_get_rules_args), (sy_call_t *)rctl_get_rules, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 526 = rctl_get_rules */ + { AS(rctl_get_limits_args), (sy_call_t *)rctl_get_limits, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 527 = rctl_get_limits */ + { AS(rctl_add_rule_args), (sy_call_t *)rctl_add_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 528 = rctl_add_rule */ + { AS(rctl_remove_rule_args), (sy_call_t *)rctl_remove_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 529 = rctl_remove_rule */ }; diff -urNp current/sys/compat/freebsd32/syscalls.master hrl/sys/compat/freebsd32/syscalls.master --- current/sys/compat/freebsd32/syscalls.master 2011-01-31 20:38:51.443071533 +0100 +++ hrl/sys/compat/freebsd32/syscalls.master 2011-01-31 19:54:41.703050386 +0100 @@ -962,3 +962,11 @@ fd_set *ou, fd_set *ex, \ const struct timespec32 *ts, \ const sigset_t *sm); } +523 AUE_NULL NOPROTO { int 
getloginclass(char *namebuf, size_t \ + namelen); } +524 AUE_NULL NOPROTO { int setloginclass(const char *namebuf); } +525 AUE_NULL NOPROTO { int rctl_get_usage(const void *inbufp, size_t inbuflen, void *outbufp, size_t outbuflen); } +526 AUE_NULL NOPROTO { int rctl_get_rules(const void *inbufp, size_t inbuflen, void *outbufp, size_t outbuflen); } +527 AUE_NULL NOPROTO { int rctl_get_limits(const void *inbufp, size_t inbuflen, void *outbufp, size_t outbuflen); } +528 AUE_NULL NOPROTO { int rctl_add_rule(const void *inbufp, size_t inbuflen, void *outbufp, size_t outbuflen); } +529 AUE_NULL NOPROTO { int rctl_remove_rule(const void *inbufp, size_t inbuflen, void *outbufp, size_t outbuflen); } diff -urNp current/sys/compat/linux/linux_misc.c hrl/sys/compat/linux/linux_misc.c --- current/sys/compat/linux/linux_misc.c 2011-01-31 20:38:51.783137939 +0100 +++ hrl/sys/compat/linux/linux_misc.c 2011-01-31 19:54:42.212949549 +0100 @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD: src/sys/compat/linux #include #include #include +#include #if defined(__i386__) #include #endif @@ -360,7 +361,9 @@ linux_uselib(struct thread *td, struct l */ PROC_LOCK(td->td_proc); if (a_out->a_text > maxtsiz || - a_out->a_data + bss_size > lim_cur(td->td_proc, RLIMIT_DATA)) { + a_out->a_data + bss_size > lim_cur(td->td_proc, RLIMIT_DATA) || + rusage_set(td->td_proc, RUSAGE_DATA, a_out->a_data + + bss_size) != 0) { PROC_UNLOCK(td->td_proc); error = ENOMEM; goto cleanup; diff -urNp current/sys/compat/svr4/imgact_svr4.c hrl/sys/compat/svr4/imgact_svr4.c --- current/sys/compat/svr4/imgact_svr4.c 2011-01-31 20:38:52.643054492 +0100 +++ hrl/sys/compat/svr4/imgact_svr4.c 2011-01-31 19:54:43.642886035 +0100 @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD: src/sys/compat/svr4/ #include #include #include +#include #include #include #include @@ -108,7 +109,8 @@ exec_svr4_imgact(imgp) */ PROC_LOCK(imgp->proc); if (a_out->a_text > maxtsiz || - a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) { + a_out->a_data + 
bss_size > lim_cur(imgp->proc, RLIMIT_DATA) || + rusage_set(imgp->proc, RUSAGE_DATA, a_out->a_data + bss_size) != 0) {; PROC_UNLOCK(imgp->proc); return (ENOMEM); } diff -urNp current/sys/conf/NOTES hrl/sys/conf/NOTES --- current/sys/conf/NOTES 2011-01-31 20:38:53.243039965 +0100 +++ hrl/sys/conf/NOTES 2011-01-31 19:54:45.722936991 +0100 @@ -1157,6 +1157,12 @@ options MAC_SEEOTHERUIDS options MAC_STUB options MAC_TEST +# Resource Containers +options CONTAINERS + +# Resource Limits +options RCTL + ##################################################################### # CLOCK OPTIONS diff -urNp current/sys/conf/files hrl/sys/conf/files --- current/sys/conf/files 2011-01-31 20:38:53.273009696 +0100 +++ hrl/sys/conf/files 2011-01-31 20:19:45.103072456 +0100 @@ -2146,6 +2146,7 @@ kern/kern_clock.c standard kern/kern_condvar.c standard kern/kern_conf.c standard kern/kern_cons.c standard +kern/kern_container.c standard kern/kern_cpu.c standard kern/kern_cpuset.c standard kern/kern_context.c standard @@ -2171,6 +2172,7 @@ kern/kern_linker.c standard kern/kern_lock.c standard kern/kern_lockf.c standard kern/kern_lockstat.c optional kdtrace_hooks +kern/kern_loginclass.c standard kern/kern_malloc.c standard kern/kern_mbuf.c standard kern/kern_mib.c standard @@ -2185,6 +2187,7 @@ kern/kern_poll.c optional device_pollin kern/kern_priv.c standard kern/kern_proc.c standard kern/kern_prot.c standard +kern/kern_rctl.c standard kern/kern_resource.c standard kern/kern_rmlock.c standard kern/kern_rwlock.c standard diff -urNp current/sys/conf/options hrl/sys/conf/options --- current/sys/conf/options 2011-01-31 20:38:53.352993046 +0100 +++ hrl/sys/conf/options 2011-01-31 19:54:46.953154519 +0100 @@ -856,3 +856,9 @@ X86BIOS # Flattened device tree options FDT opt_platform.h FDT_DTB_STATIC opt_platform.h + +# Resource Containers +CONTAINERS opt_global.h + +# Resource Limits +RCTL opt_global.h diff -urNp current/sys/fs/fdescfs/fdesc_vfsops.c hrl/sys/fs/fdescfs/fdesc_vfsops.c --- 
current/sys/fs/fdescfs/fdesc_vfsops.c 2011-01-31 20:41:01.963023092 +0100 +++ hrl/sys/fs/fdescfs/fdesc_vfsops.c 2011-01-31 19:57:27.352954773 +0100 @@ -38,6 +38,7 @@ * /dev/fd Filesystem */ +#include #include #include #include @@ -186,6 +187,7 @@ fdesc_statfs(mp, sbp) int i; int last; int freefd; + uint64_t limit; td = curthread; @@ -200,6 +202,9 @@ fdesc_statfs(mp, sbp) PROC_UNLOCK(td->td_proc); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); + limit = rusage_get_limit(td->td_proc, RUSAGE_NOFILE); + if (lim > limit) + lim = limit; last = min(fdp->fd_nfiles, lim); freefd = 0; for (i = fdp->fd_freefile; i < last; i++) diff -urNp current/sys/i386/linux/imgact_linux.c hrl/sys/i386/linux/imgact_linux.c --- current/sys/i386/linux/imgact_linux.c 2011-01-31 20:41:11.582988199 +0100 +++ hrl/sys/i386/linux/imgact_linux.c 2011-01-31 19:57:36.842926264 +0100 @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD: src/sys/i386/linux/i #include #include +#include #include #include #include @@ -107,7 +108,8 @@ exec_linux_imgact(struct image_params *i */ PROC_LOCK(imgp->proc); if (a_out->a_text > maxtsiz || - a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) { + a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA) || + rusage_set(imgp->proc, RUSAGE_DATA, a_out->a_data + bss_size) != 0) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } diff -urNp current/sys/kern/imgact_aout.c hrl/sys/kern/imgact_aout.c --- current/sys/kern/imgact_aout.c 2011-01-31 20:41:13.488564466 +0100 +++ hrl/sys/kern/imgact_aout.c 2011-01-31 19:57:38.523045106 +0100 @@ -29,6 +29,7 @@ __FBSDID("$FreeBSD: src/sys/kern/imgact_ #include #include +#include #include #include #include @@ -189,7 +190,9 @@ exec_aout_imgact(imgp) a_out->a_text > maxtsiz || /* data + bss can't exceed rlimit */ - a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) { + a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA) || + rusage_set(imgp->proc, RUSAGE_DATA, + a_out->a_data + bss_size) != 0) { PROC_UNLOCK(imgp->proc); return 
(ENOMEM); } diff -urNp current/sys/kern/imgact_elf.c hrl/sys/kern/imgact_elf.c --- current/sys/kern/imgact_elf.c 2011-01-31 20:41:13.512994666 +0100 +++ hrl/sys/kern/imgact_elf.c 2011-01-31 19:57:38.553081046 +0100 @@ -35,7 +35,9 @@ __FBSDID("$FreeBSD: src/sys/kern/imgact_ #include "opt_core.h" #include +#include #include +#include #include #include #include @@ -874,7 +876,9 @@ __CONCAT(exec_, __elfN(imgact))(struct i PROC_LOCK(imgp->proc); if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) || text_size > maxtsiz || - total_size > lim_cur(imgp->proc, RLIMIT_VMEM)) { + total_size > lim_cur(imgp->proc, RLIMIT_VMEM) || + rusage_set(imgp->proc, RUSAGE_DATA, data_size) != 0 || + rusage_set(imgp->proc, RUSAGE_VMEM, total_size) != 0) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } @@ -1101,6 +1105,13 @@ __elfN(coredump)(struct thread *td, stru hdrsize = 0; __elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count); + PROC_LOCK(td->td_proc); + error = rusage_add(td->td_proc, RUSAGE_CORE, hdrsize + seginfo.size); + PROC_UNLOCK(td->td_proc); + if (error != 0) { + error = EFAULT; + goto done; + } if (hdrsize + seginfo.size >= limit) { error = EFAULT; goto done; diff -urNp current/sys/kern/imgact_gzip.c hrl/sys/kern/imgact_gzip.c --- current/sys/kern/imgact_gzip.c 2011-01-31 20:41:13.533251151 +0100 +++ hrl/sys/kern/imgact_gzip.c 2011-01-31 19:57:38.562927549 +0100 @@ -216,7 +216,9 @@ do_aout_hdr(struct imgact_gzip * gz) /* data + bss can't exceed rlimit */ gz->a_out.a_data + gz->bss_size > - lim_cur(gz->ip->proc, RLIMIT_DATA)) { + lim_cur(gz->ip->proc, RLIMIT_DATA) || + rusage_set(gz->ip->proc, RUSAGE_DATA, + gz->a_out.a_data + gz->bss_size) != 0) { PROC_UNLOCK(gz->ip->proc); gz->where = __LINE__; return (ENOMEM); diff -urNp current/sys/kern/init_main.c hrl/sys/kern/init_main.c --- current/sys/kern/init_main.c 2011-01-31 20:41:13.592975502 +0100 +++ hrl/sys/kern/init_main.c 2011-01-31 19:57:38.712986285 +0100 @@ -49,12 +49,14 @@ __FBSDID("$FreeBSD: src/sys/kern/init_ma #include 
#include +#include #include #include #include #include #include #include +#include #include #include #include @@ -484,6 +486,7 @@ proc0_init(void *dummy __unused) p->p_ucred->cr_uidinfo = uifind(0); p->p_ucred->cr_ruidinfo = uifind(0); p->p_ucred->cr_prison = &prison0; + p->p_ucred->cr_loginclass = loginclass_find("default"); #ifdef AUDIT audit_cred_kproc0(p->p_ucred); #endif @@ -523,6 +526,9 @@ proc0_init(void *dummy __unused) p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; + /* Initialize resource accounting structures. */ + container_create(&p->p_container); + p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ @@ -550,6 +556,9 @@ proc0_init(void *dummy __unused) * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); + PROC_LOCK(p); + rusage_add_force(p, RUSAGE_NPROC, 1); + PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); diff -urNp current/sys/kern/init_sysent.c hrl/sys/kern/init_sysent.c --- current/sys/kern/init_sysent.c 2011-01-31 20:41:13.653232322 +0100 +++ hrl/sys/kern/init_sysent.c 2011-01-31 19:57:38.742980042 +0100 @@ -2,8 +2,8 @@ * System call switch table. * * DO NOT EDIT-- this file is automatically generated. 
- * $FreeBSD: src/sys/kern/init_sysent.c,v 1.257 2010/08/30 14:26:02 kib Exp $ - * created from FreeBSD: head/sys/kern/syscalls.master 211998 2010-08-30 14:24:44Z kib + * $FreeBSD$ + * created from FreeBSD: src/sys/kern/syscalls.master,v 1.265 2010/08/30 14:24:44 kib Exp */ #include "opt_compat.h" @@ -557,4 +557,11 @@ struct sysent sysent[] = { { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 520 = pdgetpid */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 521 = pdwait */ { AS(pselect_args), (sy_call_t *)pselect, AUE_SELECT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 522 = pselect */ + { AS(getloginclass_args), (sy_call_t *)getloginclass, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 523 = getloginclass */ + { AS(setloginclass_args), (sy_call_t *)setloginclass, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 524 = setloginclass */ + { AS(rctl_get_usage_args), (sy_call_t *)rctl_get_usage, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 525 = rctl_get_usage */ + { AS(rctl_get_rules_args), (sy_call_t *)rctl_get_rules, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 526 = rctl_get_rules */ + { AS(rctl_get_limits_args), (sy_call_t *)rctl_get_limits, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 527 = rctl_get_limits */ + { AS(rctl_add_rule_args), (sy_call_t *)rctl_add_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 528 = rctl_add_rule */ + { AS(rctl_remove_rule_args), (sy_call_t *)rctl_remove_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 529 = rctl_remove_rule */ }; diff -urNp current/sys/kern/kern_container.c hrl/sys/kern/kern_container.c --- current/sys/kern/kern_container.c 1970-01-01 01:00:00.000000000 +0100 +++ hrl/sys/kern/kern_container.c 2011-01-31 19:57:38.843017616 +0100 @@ -0,0 +1,884 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Edward Tomasz Napierala under sponsorship + * from the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_kdtrace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RCTL +#include +#endif + +#ifdef CONTAINERS + +FEATURE(containers, "Resource Containers"); + +static struct mtx container_lock; +MTX_SYSINIT(container_lock, &container_lock, "container lock", MTX_DEF); + +static uma_zone_t container_zone; + +static void container_sub(struct container *dest, const struct container *src); +static void rusage_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount); +static void rusage_add_cred_locked(struct ucred *cred, int resource, uint64_t amount); + +SDT_PROVIDER_DEFINE(container); +SDT_PROBE_DEFINE3(container, kernel, rusage, add, add, "struct proc *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(container, kernel, rusage, add_failure, add-failure, "struct proc *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(container, kernel, rusage, add_cred, add-cred, "struct ucred *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(container, kernel, rusage, add_force, add-force, "struct proc *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(container, kernel, rusage, set, set, "struct proc *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(container, kernel, rusage, set_failure, set-failure, "struct proc *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(container, kernel, rusage, sub, sub, "struct proc *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(container, kernel, rusage, sub_cred, sub-cred, "struct ucred *", "int", "uint64_t"); +SDT_PROBE_DEFINE1(container, kernel, container, create, create, "struct container *"); +SDT_PROBE_DEFINE1(container, kernel, container, destroy, destroy, "struct container *"); +SDT_PROBE_DEFINE2(container, kernel, container, join, join, "struct container *", "struct container *"); +SDT_PROBE_DEFINE2(container, kernel, container, join_failure, 
join-failure, "struct container *", "struct container *"); +SDT_PROBE_DEFINE2(container, kernel, container, leave, leave, "struct container *", "struct container *"); + +int rusage_types[] = { + [RUSAGE_CPU] = RUSAGE_IN_THOUSANDS, + [RUSAGE_FSIZE] = RUSAGE_RECLAIMABLE | RUSAGE_INHERITABLE | RUSAGE_DENIABLE, + [RUSAGE_DATA] = RUSAGE_RECLAIMABLE | RUSAGE_INHERITABLE | RUSAGE_DENIABLE, + [RUSAGE_STACK] = RUSAGE_RECLAIMABLE | RUSAGE_INHERITABLE | RUSAGE_DENIABLE, + [RUSAGE_CORE] = RUSAGE_DENIABLE, + [RUSAGE_RSS] = RUSAGE_RECLAIMABLE | RUSAGE_INHERITABLE, + [RUSAGE_MEMLOCK] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE, + [RUSAGE_NPROC] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE, + [RUSAGE_NOFILE] = RUSAGE_RECLAIMABLE | RUSAGE_INHERITABLE | RUSAGE_DENIABLE, + [RUSAGE_SBSIZE] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE | RUSAGE_SLOPPY, + [RUSAGE_VMEM] = RUSAGE_RECLAIMABLE | RUSAGE_INHERITABLE | RUSAGE_DENIABLE, + [RUSAGE_NPTS] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE | RUSAGE_SLOPPY, + [RUSAGE_SWAP] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE | RUSAGE_SLOPPY, + [RUSAGE_NTHR] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE, + [RUSAGE_MSGQQUEUED] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE | RUSAGE_SLOPPY, + [RUSAGE_MSGQSIZE] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE | RUSAGE_SLOPPY, + [RUSAGE_NMSGQ] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE | RUSAGE_SLOPPY, + [RUSAGE_NSEM] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE | RUSAGE_SLOPPY, + [RUSAGE_NSEMOP] = RUSAGE_RECLAIMABLE | RUSAGE_INHERITABLE | RUSAGE_DENIABLE, + [RUSAGE_NSHM] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE | RUSAGE_SLOPPY, + [RUSAGE_SHMSIZE] = RUSAGE_RECLAIMABLE | RUSAGE_DENIABLE | RUSAGE_SLOPPY, + [RUSAGE_WALLCLOCK] = RUSAGE_IN_THOUSANDS, + [RUSAGE_PCTCPU] = RUSAGE_IN_THOUSANDS | RUSAGE_RECLAIMABLE | RUSAGE_DAMPENED }; + +static void +container_add(struct container *dest, const struct container *src) +{ + int i; + + mtx_assert(&container_lock, MA_OWNED); + + /* + * Update resource usage in dest. 
+ */ + for (i = 0; i <= RUSAGE_MAX; i++) { + KASSERT(dest->c_resources[i] >= 0, + ("resource usage propagation meltdown: dest < 0")); + KASSERT(src->c_resources[i] >= 0, + ("resource usage propagation meltdown: src < 0")); + dest->c_resources[i] += src->c_resources[i]; + } +} + +static void +container_sub(struct container *dest, const struct container *src) +{ + int i; + + mtx_assert(&container_lock, MA_OWNED); + + /* + * Update resource usage in dest. + */ + for (i = 0; i <= RUSAGE_MAX; i++) { + if (!rusage_is_sloppy(i) && + !rusage_is_dampened(i)) { + KASSERT(dest->c_resources[i] >= 0, + ("resource usage propagation meltdown: dest < 0")); + KASSERT(src->c_resources[i] >= 0, + ("resource usage propagation meltdown: src < 0")); + KASSERT(src->c_resources[i] <= dest->c_resources[i], + ("resource usage propagation meltdown: src > dest")); + } + if (rusage_is_reclaimable(i)) { + dest->c_resources[i] -= src->c_resources[i]; + if (dest->c_resources[i] < 0) { + KASSERT(rusage_is_sloppy(i) || + rusage_is_dampened(i), + ("container_sub: usage < 0")); + dest->c_resources[i] = 0; + } + } + } +} + +void +container_create(struct container **containerp) +{ + + SDT_PROBE(container, kernel, container, create, containerp, 0, 0, 0, 0); + + KASSERT(*containerp == NULL, ("container already allocated")); + + *containerp = uma_zalloc(container_zone, M_WAITOK | M_ZERO); +} + +static void +container_destroy_locked(struct container **containerp) +{ + int i; + struct container *container; + + SDT_PROBE(container, kernel, container, destroy, containerp, 0, 0, 0, 0); + + mtx_assert(&container_lock, MA_OWNED); + KASSERT(containerp != NULL, ("NULL containerp")); + KASSERT(*containerp != NULL, ("NULL container")); + + container = *containerp; + + for (i = 0; i <= RUSAGE_MAX; i++) { + if (rusage_is_sloppy(i)) + continue; + if (!rusage_is_reclaimable(i)) + continue; + if (rusage_is_dampened(i)) + continue; + KASSERT(container->c_resources[i] == 0, + ("destroying non-empty container: " + "%ju 
allocated for resource %d\n", + container->c_resources[i], i)); + } + uma_zfree(container_zone, container); + *containerp = NULL; +} + +void +container_destroy(struct container **container) +{ + + mtx_lock(&container_lock); + container_destroy_locked(container); + mtx_unlock(&container_lock); +} + +/* + * Increase consumption of 'resource' by 'amount' for 'container' only; + * parent containers are not touched here. Unlike in other functions, + * 'amount' may carry a negative delta (callers pass a negated value). + */ +static void +container_alloc_resource(struct container *container, int resource, + uint64_t amount) +{ + + mtx_assert(&container_lock, MA_OWNED); + KASSERT(container != NULL, ("NULL container")); + + container->c_resources[resource] += amount; + if (container->c_resources[resource] < 0) { + KASSERT(rusage_is_sloppy(resource) || + rusage_is_dampened(resource), + ("container_alloc_resource: usage < 0")); + container->c_resources[resource] = 0; + } +} + +/* + * Increase allocation of 'resource' by 'amount' for process 'p'. + * Return 0 if it's below limits, or errno, if it's not. + */ +int +rusage_add(struct proc *p, int resource, uint64_t amount) +{ +#ifdef RCTL + int error; +#endif + + if (p->p_flag & P_SYSTEM) + return (0); + + SDT_PROBE(container, kernel, rusage, add, p, resource, amount, 0, 0); + + /* + * We need proc lock to dereference p->p_ucred.
+ */ + PROC_LOCK_ASSERT(p, MA_OWNED); + KASSERT(amount >= 0, ("rusage_add: invalid amount for resource %d: %ju", + resource, amount)); + + mtx_lock(&container_lock); +#ifdef RCTL + error = rctl_enforce(p, resource, amount); + if (error && rusage_is_deniable(resource)) { + SDT_PROBE(container, kernel, rusage, add_failure, p, resource, amount, 0, 0); + mtx_unlock(&container_lock); + return (error); + } +#endif + container_alloc_resource(p->p_container, resource, amount); + rusage_add_cred_locked(p->p_ucred, resource, amount); + mtx_unlock(&container_lock); + + return (0); +} + +static void +rusage_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) +{ + struct prison *pr; + + SDT_PROBE(container, kernel, rusage, add_cred, cred, resource, amount, 0, 0); + + KASSERT(amount >= 0, ("rusage_add_cred: invalid amount for resource %d: %ju", + resource, amount)); + + container_alloc_resource(cred->cr_ruidinfo->ui_container, resource, amount); + for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) + container_alloc_resource(pr->pr_container, resource, amount); + container_alloc_resource(cred->cr_loginclass->lc_container, resource, amount); +} + +/* + * Increase allocation of 'resource' by 'amount' for credential 'cred'. Doesn't + * check for limits and never fails. + * + * XXX: Shouldn't this ever return an error? + */ +void +rusage_add_cred(struct ucred *cred, int resource, uint64_t amount) +{ + + mtx_lock(&container_lock); + rusage_add_cred_locked(cred, resource, amount); + mtx_unlock(&container_lock); +} + +/* + * Increase allocation of 'resource' by 'amount' for process 'p'. Doesn't check + * for limits and never fails. + */ +void +rusage_add_force(struct proc *p, int resource, uint64_t amount) +{ + + if (p->p_flag & P_SYSTEM) + return; + + SDT_PROBE(container, kernel, rusage, add_force, p, resource, amount, 0, 0); + + /* + * We need proc lock to dereference p->p_ucred. 
+ */ + PROC_LOCK_ASSERT(p, MA_OWNED); + KASSERT(amount >= 0, ("rusage_add_force: invalid amount for resource %d: %ju", + resource, amount)); + + mtx_lock(&container_lock); + container_alloc_resource(p->p_container, resource, amount); + mtx_unlock(&container_lock); + rusage_add_cred(p->p_ucred, resource, amount); +} + +static int +rusage_set_locked(struct proc *p, int resource, uint64_t amount) +{ + int64_t diff; +#ifdef RCTL + int error; +#endif + + if (p->p_flag & P_SYSTEM) + return (0); + + SDT_PROBE(container, kernel, rusage, set, p, resource, amount, 0, 0); + + /* + * We need proc lock to dereference p->p_ucred. + */ + PROC_LOCK_ASSERT(p, MA_OWNED); + KASSERT(amount >= 0, ("rusage_set: invalid amount for resource %d: %ju", + resource, amount)); + + diff = amount - p->p_container->c_resources[resource]; +#ifdef notyet + KASSERT(diff >= 0 || rusage_is_reclaimable(resource), + ("rusage_set: usage of non-reclaimable resource %d dropping", + resource)); +#endif +#ifdef RCTL + if (diff > 0) { + error = rctl_enforce(p, resource, diff); + if (error && rusage_is_deniable(resource)) { + SDT_PROBE(container, kernel, rusage, set_failure, p, resource, amount, 0, 0); + return (error); + } + } +#endif + container_alloc_resource(p->p_container, resource, diff); + if (diff > 0) + rusage_add_cred_locked(p->p_ucred, resource, diff); + else if (diff < 0) + rusage_sub_cred_locked(p->p_ucred, resource, -diff); + + return (0); +} + +/* + * Set allocation of 'resource' to 'amount' for process 'p'. + * Return 0 if it's below limits, or errno, if it's not. + * + * Note that decreasing the allocation always returns 0, + * even if it's above the limit. 
+ */ +int +rusage_set(struct proc *p, int resource, uint64_t amount) +{ + int error; + + mtx_lock(&container_lock); + error = rusage_set_locked(p, resource, amount); + mtx_unlock(&container_lock); + return (error); +} + +void +rusage_set_force(struct proc *p, int resource, uint64_t amount) +{ + int64_t diff; + + if (p->p_flag & P_SYSTEM) + return; + + SDT_PROBE(container, kernel, rusage, set, p, resource, amount, 0, 0); + + /* + * We need proc lock to dereference p->p_ucred. + */ + PROC_LOCK_ASSERT(p, MA_OWNED); + KASSERT(amount >= 0, ("rusage_set_force: invalid amount for resource %d: %ju", + resource, amount)); + + mtx_lock(&container_lock); + diff = amount - p->p_container->c_resources[resource]; + container_alloc_resource(p->p_container, resource, diff); + if (diff > 0) + rusage_add_cred_locked(p->p_ucred, resource, diff); + else if (diff < 0) + rusage_sub_cred_locked(p->p_ucred, resource, -diff); + mtx_unlock(&container_lock); +} + +/* + * Returns amount of 'resource' the process 'p' can keep allocated. + * Allocating more than that would be denied, unless the resource + * is marked undeniable. Amount of already allocated resource does + * not matter. + */ +uint64_t +rusage_get_limit(struct proc *p, int resource) +{ + +#ifdef RCTL + return (rctl_get_limit(p, resource)); +#else + return (UINT64_MAX); +#endif +} + +/* + * Returns amount of 'resource' the process 'p' can keep allocated. + * Allocating more than that would be denied, unless the resource + * is marked undeniable. Amount of already allocated resource does + * matter. + */ +uint64_t +rusage_get_available(struct proc *p, int resource) +{ + +#ifdef RCTL + return (rctl_get_available(p, resource)); +#else + return (UINT64_MAX); +#endif +} + +/* + * Decrease allocation of 'resource' by 'amount' for process 'p'. 
+ */ +void +rusage_sub(struct proc *p, int resource, uint64_t amount) +{ + + if (p->p_flag & P_SYSTEM) + return; + + SDT_PROBE(container, kernel, rusage, sub, p, resource, amount, 0, 0); + + /* + * We need proc lock to dereference p->p_ucred. + */ + PROC_LOCK_ASSERT(p, MA_OWNED); + KASSERT(amount >= 0, ("rusage_sub: invalid amount for resource %d: %ju", + resource, amount)); + KASSERT(rusage_is_reclaimable(resource), + ("rusage_sub: called for non-reclaimable resource %d", resource)); + + mtx_lock(&container_lock); + KASSERT(amount <= p->p_container->c_resources[resource], + ("rusage_sub: freeing %ju of resource %d, which is more than allocated " + "%jd for %s (pid %d)", amount, resource, + (intmax_t)p->p_container->c_resources[resource], p->p_comm, p->p_pid)); + + container_alloc_resource(p->p_container, resource, -amount); + rusage_sub_cred_locked(p->p_ucred, resource, amount); + mtx_unlock(&container_lock); +} + +static void +rusage_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) +{ + struct prison *pr; + + SDT_PROBE(container, kernel, rusage, sub_cred, cred, resource, amount, 0, 0); + + KASSERT(amount >= 0, ("rusage_sub_cred: invalid amount for resource %d: %ju", + resource, amount)); +#ifdef notyet + KASSERT(rusage_is_reclaimable(resource), + ("rusage_sub_cred: called for non-reclaimable resource %d", resource)); +#endif + + container_alloc_resource(cred->cr_ruidinfo->ui_container, resource, -amount); + for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) + container_alloc_resource(pr->pr_container, resource, -amount); + container_alloc_resource(cred->cr_loginclass->lc_container, resource, -amount); +} + +/* + * Decrease allocation of 'resource' by 'amount' for credential 'cred'. 
+ */ +void +rusage_sub_cred(struct ucred *cred, int resource, uint64_t amount) +{ + + mtx_lock(&container_lock); + rusage_sub_cred_locked(cred, resource, amount); + mtx_unlock(&container_lock); +} + +/* + * Inherit resource usage information and containing containers + * from the parent process. + */ +int +container_proc_fork(struct proc *parent, struct proc *child) +{ + int i, error = 0; + + /* + * Create container for the child process. + */ + container_create(&child->p_container); + + /* + * No resource accounting for kernel processes. + */ + if (child->p_flag & P_SYSTEM) + return (0); + + PROC_LOCK(parent); + PROC_LOCK(child); + mtx_lock(&container_lock); + + /* + * Inherit resource usage. + */ + for (i = 0; i <= RUSAGE_MAX; i++) { + if (parent->p_container->c_resources[i] == 0 || + !rusage_is_inheritable(i)) + continue; + + error = rusage_set_locked(child, i, parent->p_container->c_resources[i]); + if (error != 0) { + /* + * XXX: The only purpose of these two lines is to avoid + * tripping the checks in container_destroy_locked(). + */ + for (i = 0; i <= RUSAGE_MAX; i++) + rusage_set_locked(child, i, 0); + goto out; + } + } + +#ifdef RCTL + error = rctl_proc_fork(parent, child); + if (error != 0) { + /* + * XXX: The only purpose of these two lines is to avoid + * tripping the checks in container_destroy_locked(). + */ + for (i = 0; i <= RUSAGE_MAX; i++) + rusage_set_locked(child, i, 0); + } +#endif + +out: + if (error != 0) + container_destroy_locked(&child->p_container); + mtx_unlock(&container_lock); + PROC_UNLOCK(child); + PROC_UNLOCK(parent); + + return (error); +} + +void +container_proc_exit(struct proc *p) +{ + uint64_t runtime, pctcpu; + + PROC_LOCK(p); + /* + * We don't need to calculate rux, proc_reap() has already done this.
+ */ + runtime = cputick2usec(p->p_rux.rux_runtime); +#ifdef notyet + KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); +#else + if (runtime < p->p_prev_runtime) + runtime = p->p_prev_runtime; +#endif + pctcpu = (runtime - p->p_prev_runtime) / 10; + rusage_set(p, RUSAGE_CPU, runtime); + rusage_add(p, RUSAGE_PCTCPU, pctcpu); + + /* + * XXX: Free this some other way. + */ + rusage_set(p, RUSAGE_FSIZE, 0); + rusage_set(p, RUSAGE_NPTS, 0); + rusage_set(p, RUSAGE_NTHR, 0); + rusage_set(p, RUSAGE_RSS, 0); + PROC_UNLOCK(p); + +#ifdef RCTL + rctl_proc_exit(p); +#endif + container_destroy(&p->p_container); +} + +/* + * Called after credentials change, to move resource utilisation + * between containers. + */ +void +container_proc_ucred_changed(struct proc *p, struct ucred *oldcred, + struct ucred *newcred) +{ + struct uidinfo *olduip, *newuip; + struct loginclass *oldlc, *newlc; + struct prison *oldpr, *newpr, *pr; + + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + + newuip = newcred->cr_ruidinfo; + olduip = oldcred->cr_ruidinfo; + newlc = newcred->cr_loginclass; + oldlc = oldcred->cr_loginclass; + newpr = newcred->cr_prison; + oldpr = oldcred->cr_prison; + + mtx_lock(&container_lock); + if (newuip != olduip) { + container_sub(olduip->ui_container, p->p_container); + container_add(newuip->ui_container, p->p_container); + } + if (newlc != oldlc) { + container_sub(oldlc->lc_container, p->p_container); + container_add(newlc->lc_container, p->p_container); + } + if (newpr != oldpr) { + for (pr = oldpr; pr != NULL; pr = pr->pr_parent) + container_sub(pr->pr_container, p->p_container); + for (pr = newpr; pr != NULL; pr = pr->pr_parent) + container_add(pr->pr_container, p->p_container); + } + mtx_unlock(&container_lock); + +#ifdef RCTL + rctl_proc_ucred_changed(p, newcred); +#endif +} + +static void +rusage_throttle(struct proc *p, int throttle) +{ + struct thread *td; + u_char oldpri; + u_char newpri; + int type; + + PROC_LOCK_ASSERT(p, MA_OWNED); + + if (throttle) { + 
p->p_throttle++; + newpri = PRI_MIN_IDLE; + type = RTP_PRIO_IDLE; + } else if (p->p_throttle > 0) { + p->p_throttle--; + newpri = PRI_MIN_TIMESHARE; + type = RTP_PRIO_NORMAL; + } else + return; + + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + /* Mostly copied from rtp_to_pri(). */ + sched_class(td, type); /* XXX fix */ + oldpri = td->td_user_pri; + sched_user_prio(td, newpri); + if (TD_IS_RUNNING(td) || TD_CAN_RUN(td)) + sched_prio(td, td->td_user_pri); /* XXX dubious */ + if (TD_ON_UPILOCK(td) && oldpri != newpri) + umtx_pi_adjust(td, oldpri); + thread_unlock(td); + } +} + +/* + * %CPU is special. Each second we zero out RUSAGE_PCTCPU for all + * the processes and other containers before calculating %CPU. The reason + * for this is that we also need to update %CPU when a process exits, + * and that would cause the %CPU for per-user or per-jail containers + * to grow indefinitely. + */ +static void +container_dampen_callback(struct container *container, void *arg2, void *arg3) +{ + + mtx_lock(&container_lock); + container->c_resources[RUSAGE_PCTCPU] = 0; + mtx_unlock(&container_lock); +} + +static void +containerd(void) +{ + struct thread *td; + struct proc *p; + struct timeval wallclock; + uint64_t pctcpu, pctcpu_limit, runtime; + + for (;;) { + sx_slock(&allproc_lock); + /* + * XXX: There is a window between zeroing the stats and setting + * them to a proper value.
+ */ + loginclass_container_foreach(container_dampen_callback, NULL, + NULL); + ui_container_foreach(container_dampen_callback, NULL, NULL); + prison_container_foreach(container_dampen_callback, NULL, + NULL); + + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_flag & P_SYSTEM) + continue; + + microuptime(&wallclock); + timevalsub(&wallclock, &p->p_stats->p_start); + pctcpu_limit = rusage_get_available(p, RUSAGE_PCTCPU); + PROC_LOCK(p); + PROC_SLOCK(p); + FOREACH_THREAD_IN_PROC(p, td) { + ruxagg(p, td); + thread_lock(td); + thread_unlock(td); + } + runtime = cputick2usec(p->p_rux.rux_runtime); + PROC_SUNLOCK(p); +#ifdef notyet + KASSERT(runtime >= p->p_prev_runtime, + ("runtime < p_prev_runtime")); +#else + if (runtime < p->p_prev_runtime) + runtime = p->p_prev_runtime; +#endif + pctcpu = (runtime - p->p_prev_runtime) / 10; + p->p_prev_runtime = runtime; + if (pctcpu > pctcpu_limit) + rusage_throttle(p, 1); + else + rusage_throttle(p, 0); + mtx_lock(&container_lock); + rusage_set_locked(p, RUSAGE_CPU, runtime); + p->p_container->c_resources[RUSAGE_PCTCPU] = 0; + rusage_set_locked(p, RUSAGE_PCTCPU, pctcpu); + rusage_set_locked(p, RUSAGE_WALLCLOCK, + wallclock.tv_sec * 1000000 + wallclock.tv_usec); + mtx_unlock(&container_lock); + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + pause("-", hz); + } +} + +static struct kproc_desc containerd_kp = { + "containerd", + containerd, + NULL +}; +SYSINIT(containerd, SI_SUB_CONTAINERD, SI_ORDER_FIRST, kproc_start, &containerd_kp); + +static void +container_proc_fork_sched(void *arg __unused, struct proc *p1, + struct proc *newproc, int flags) +{ + uint64_t pctcpu_limit; + + /* + * Newly created process may already be over the %CPU limit. Throttle + * it immediately after fork instead of waiting for containerd. 
+ */ + pctcpu_limit = rusage_get_limit(newproc, RUSAGE_PCTCPU); + if (pctcpu_limit == 0) + rusage_throttle(newproc, 1); +} + +static void +container_init(void) +{ + + container_zone = uma_zcreate("container", sizeof(struct container), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + EVENTHANDLER_REGISTER(process_fork, container_proc_fork_sched, NULL, + EVENTHANDLER_PRI_ANY); + /* + * XXX: Move this somewhere. + */ + container_create(&prison0.pr_container); +} +SYSINIT(container, SI_SUB_CONTAINER, SI_ORDER_FIRST, container_init, NULL); + +#else /* !CONTAINERS */ + +int +rusage_add(struct proc *p, int resource, uint64_t amount) +{ + + return (0); +} + +void +rusage_add_cred(struct ucred *cred, int resource, uint64_t amount) +{ +} + +void +rusage_add_force(struct proc *p, int resource, uint64_t amount) +{ + + return; +} + +int +rusage_set(struct proc *p, int resource, uint64_t amount) +{ + + return (0); +} + +void +rusage_sub(struct proc *p, int resource, uint64_t amount) +{ +} + +void +rusage_sub_cred(struct ucred *cred, int resource, uint64_t amount) +{ +} + +uint64_t +rusage_get_limit(struct proc *p, int resource) +{ + + return (UINT64_MAX); +} + +void +container_create(struct container **containerp) +{ +} + +void +container_destroy(struct container **containerp) +{ +} + +int +container_proc_fork(struct proc *parent, struct proc *child) +{ + + return (0); +} + +void +container_proc_exit(struct proc *p) +{ +} + +#endif /* !CONTAINERS */ diff -urNp current/sys/kern/kern_descrip.c hrl/sys/kern/kern_descrip.c --- current/sys/kern/kern_descrip.c 2011-01-31 20:41:14.032966897 +0100 +++ hrl/sys/kern/kern_descrip.c 2011-01-31 19:57:39.033002097 +0100 @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_de #include #include +#include #include #include #include @@ -274,11 +275,15 @@ int getdtablesize(struct thread *td, struct getdtablesize_args *uap) { struct proc *p = td->td_proc; + uint64_t lim; PROC_LOCK(p); td->td_retval[0] = min((int)lim_cur(p,
RLIMIT_NOFILE), maxfilesperproc); + lim = rusage_get_limit(td->td_proc, RUSAGE_NOFILE); PROC_UNLOCK(p); + if (lim < td->td_retval[0]) + td->td_retval[0] = lim; return (0); } @@ -791,8 +796,25 @@ do_dup(struct thread *td, int flags, int * out for a race. */ if (flags & DUP_FIXED) { - if (new >= fdp->fd_nfiles) + if (new >= fdp->fd_nfiles) { + /* + * The resource limits are here instead of e.g. fdalloc(), + * because the file descriptor table may be shared between + * processes, so we can't really use rusage_add()/rusage_sub(). + * Instead of counting the number of actually allocated + * descriptors, just put the limit on the size of the file + * descriptor table. + */ + PROC_LOCK(p); + error = rusage_set(p, RUSAGE_NOFILE, new + 1); + PROC_UNLOCK(p); + if (error != 0) { + FILEDESC_XUNLOCK(fdp); + fdrop(fp, td); + return (EMFILE); + } fdgrowtable(fdp, new + 1); + } if (fdp->fd_ofiles[new] == NULL) fdused(fdp, new); } else { @@ -1438,7 +1460,7 @@ fdalloc(struct thread *td, int minfd, in { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; - int fd = -1, maxfd; + int fd = -1, maxfd, error; FILEDESC_XLOCK_ASSERT(fdp); @@ -1461,6 +1483,11 @@ fdalloc(struct thread *td, int minfd, in return (EMFILE); if (fd < fdp->fd_nfiles) break; + PROC_LOCK(p); + error = rusage_set(p, RUSAGE_NOFILE, min(fdp->fd_nfiles * 2, maxfd)); + PROC_UNLOCK(p); + if (error != 0) + return (EMFILE); fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd)); } @@ -1492,6 +1519,11 @@ fdavail(struct thread *td, int n) FILEDESC_LOCK_ASSERT(fdp); + /* + * XXX: This is only called from uipc_usrreq.c:unp_externalize(); + * call rusage_add() from there instead of dealing with containers + * here. 
+ */ PROC_LOCK(p); lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); @@ -1738,6 +1770,10 @@ fdfree(struct thread *td) if (fdp == NULL) return; + PROC_LOCK(td->td_proc); + rusage_set(td->td_proc, RUSAGE_NOFILE, 0); + PROC_UNLOCK(td->td_proc); + /* Check for special need to clear POSIX style locks */ fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { diff -urNp current/sys/kern/kern_exit.c hrl/sys/kern/kern_exit.c --- current/sys/kern/kern_exit.c 2011-01-31 20:41:14.232940078 +0100 +++ hrl/sys/kern/kern_exit.c 2011-01-31 19:57:39.302868240 +0100 @@ -66,6 +66,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_ex #include #include /* for acct_process() function prototype */ #include +#include #include #include #include @@ -176,6 +177,7 @@ exit1(struct thread *td, int rv) } KASSERT(p->p_numthreads == 1, ("exit1: proc %p exiting with %d threads", p, p->p_numthreads)); + rusage_sub(p, RUSAGE_NTHR, 1); /* * Wakeup anyone in procfs' PIOCWAIT. They should have a hold * on our vmspace, so we should block below until they have @@ -741,6 +743,14 @@ proc_reap(struct thread *td, struct proc (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* + * Destroy resource container associated with the process. + */ + container_proc_exit(p); + PROC_LOCK(p->p_pptr); + rusage_sub(p->p_pptr, RUSAGE_NPROC, 1); + PROC_UNLOCK(p->p_pptr); + + /* * Free credentials, arguments, and sigacts. 
*/ crfree(p->p_ucred); @@ -899,7 +909,11 @@ proc_reparent(struct proc *child, struct if (child->p_pptr == parent) return; + PROC_LOCK(parent); + rusage_add_force(parent, RUSAGE_NPROC, 1); + PROC_UNLOCK(parent); PROC_LOCK(child->p_pptr); + rusage_sub(child->p_pptr, RUSAGE_NPROC, 1); sigqueue_take(child->p_ksi); PROC_UNLOCK(child->p_pptr); LIST_REMOVE(child, p_sibling); diff -urNp current/sys/kern/kern_fork.c hrl/sys/kern/kern_fork.c --- current/sys/kern/kern_fork.c 2011-01-31 20:41:14.272952426 +0100 +++ hrl/sys/kern/kern_fork.c 2011-01-31 19:57:39.343017756 +0100 @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_fo #include #include #include +#include #include #include #include @@ -735,6 +736,12 @@ fork1(struct thread *td, int flags, int return (fork_norfproc(td, flags)); } + PROC_LOCK(p1); + error = rusage_add(p1, RUSAGE_NPROC, 1); + PROC_UNLOCK(p1); + if (error != 0) + return (error); + mem_charged = 0; vm2 = NULL; if (pages == 0) @@ -785,6 +792,21 @@ fork1(struct thread *td, int flags, int knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx); STAILQ_INIT(&newproc->p_ktr); + /* + * XXX: This is ugly; when we copy resource usage, we need to bump + * per-cred resource counters. + */ + newproc->p_ucred = p1->p_ucred; + + /* + * Initialize resource container for the child process. + */ + error = container_proc_fork(p1, newproc); + if (error != 0) { + error = EAGAIN; + goto fail1; + } + /* We have to lock the process tree while we look for a pid. */ sx_slock(&proctree_lock); @@ -803,6 +825,17 @@ fork1(struct thread *td, int flags, int } /* + * After fork, there is exactly one thread running. + */ + PROC_LOCK(newproc); + error = rusage_set(newproc, RUSAGE_NTHR, 1); + PROC_UNLOCK(newproc); + if (error != 0) { + error = EAGAIN; + goto fail; + } + + /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. 
* @@ -829,6 +862,7 @@ fork1(struct thread *td, int flags, int error = EAGAIN; fail: + container_proc_exit(newproc); sx_sunlock(&proctree_lock); if (ppsratecheck(&lastfail, &curfail, 1)) printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n", @@ -842,6 +876,9 @@ fail1: vmspace_free(vm2); uma_zfree(proc_zone, newproc); pause("fork", hz / 2); + PROC_LOCK(p1); + rusage_sub(p1, RUSAGE_NPROC, 1); + PROC_UNLOCK(p1); return (error); } diff -urNp current/sys/kern/kern_jail.c hrl/sys/kern/kern_jail.c --- current/sys/kern/kern_jail.c 2011-01-31 20:41:14.452947649 +0100 +++ hrl/sys/kern/kern_jail.c 2011-01-31 19:57:39.442884918 +0100 @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_ja #include #include #include +#include #include #include #include @@ -1195,6 +1196,7 @@ kern_jail_set(struct thread *td, struct root = mypr->pr_root; vref(root); } + container_create(&pr->pr_container); strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); pr->pr_flags |= PR_HOST; #if defined(INET) || defined(INET6) @@ -2295,6 +2297,9 @@ do_jail_attach(struct thread *td, struct newcred->cr_prison = pr; p->p_ucred = newcred; PROC_UNLOCK(p); +#ifdef CONTAINERS + container_proc_ucred_changed(p, oldcred, newcred); +#endif crfree(oldcred); prison_deref(ppr, PD_DEREF | PD_DEUREF); return (0); @@ -2527,6 +2532,7 @@ prison_deref(struct prison *pr, int flag if (pr->pr_cpuset != NULL) cpuset_rel(pr->pr_cpuset); osd_jail_exit(pr); + container_destroy(&pr->pr_container); free(pr, M_PRISON); /* Removing a prison frees a reference on its parent. */ @@ -3874,6 +3880,12 @@ prison_priv_check(struct ucred *cred, in case PRIV_NETINET_GETCRED: return (0); + /* + * Allow jailed root to set loginclass. + */ + case PRIV_PROC_SETLOGINCLASS: + return (0); + default: /* * In all remaining cases, deny the privilege request. 
This @@ -4257,6 +4269,17 @@ SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYP SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); +void +prison_container_foreach(void (*callback)(struct container *container, + void *arg2, void *arg3), void *arg2, void *arg3) +{ + struct prison *pr; + + sx_slock(&allprison_lock); + TAILQ_FOREACH(pr, &allprison, pr_list) + (callback)(pr->pr_container, arg2, arg3); + sx_sunlock(&allprison_lock); +} #ifdef DDB diff -urNp current/sys/kern/kern_loginclass.c hrl/sys/kern/kern_loginclass.c --- current/sys/kern/kern_loginclass.c 1970-01-01 01:00:00.000000000 +0100 +++ hrl/sys/kern/kern_loginclass.c 2011-01-31 19:57:39.873065569 +0100 @@ -0,0 +1,236 @@ +/*- + * Copyright (c) 2009 Edward Tomasz Napierała + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Processes may set login class name using setloginclass(2). This + * is usually done through call to setusercontext(3), by programs + * such as login(1), based on information from master.passwd(5). Kernel + * uses this information to enforce per-class resource limits. Current + * login class can be determined using id(1). Login class is inherited + * from the parent process during fork(2). If not set, it defaults + * to "default". + * + * Code in this file implements setloginclass(2) and getloginclass(2) + * system calls, and maintains class name storage and retrieval. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * XXX: Review locking. + */ + +/* XXX: Use UMA instead? */ +static MALLOC_DEFINE(M_LOGINCLASS, "loginclass", "loginclass structures"); + +LIST_HEAD(, loginclass) loginclasses; + +/* + * Lock protecting loginclasses list. 
+ */ +static struct mtx loginclasses_lock; + +static void lc_init(void); +SYSINIT(loginclass, SI_SUB_CPU, SI_ORDER_FIRST, lc_init, NULL); + +void +loginclass_acquire(struct loginclass *lc) +{ + + refcount_acquire(&lc->lc_refcount); +} + +void +loginclass_release(struct loginclass *lc) +{ + int old; + + old = lc->lc_refcount; + if (old > 1 && atomic_cmpset_int(&lc->lc_refcount, old, old - 1)) + return; + + mtx_lock(&loginclasses_lock); + if (refcount_release(&lc->lc_refcount)) { + container_destroy(&lc->lc_container); + LIST_REMOVE(lc, lc_next); + mtx_unlock(&loginclasses_lock); + free(lc, M_LOGINCLASS); + + return; + } + mtx_unlock(&loginclasses_lock); +} + +/* + * Return loginclass structure with a corresponding name. Not + * performance critical, as it's used mainly by setloginclass(2), + * which happens once per login session. Caller has to use + * loginclass_release() on the returned value when it's no longer + * needed. + */ +struct loginclass * +loginclass_find(const char *name) +{ + struct loginclass *lc, *newlc; + + KASSERT(strlen(name) <= MAXLOGNAME - 1, + ("loginclass_find: got too long name")); + + newlc = malloc(sizeof(*newlc), M_LOGINCLASS, M_ZERO | M_WAITOK); + container_create(&newlc->lc_container); + + mtx_lock(&loginclasses_lock); + LIST_FOREACH(lc, &loginclasses, lc_next) { + if (strcmp(name, lc->lc_name) != 0) + continue; + + /* Found loginclass with a matching name? */ + loginclass_acquire(lc); + mtx_unlock(&loginclasses_lock); + container_destroy(&newlc->lc_container); + free(newlc, M_LOGINCLASS); + return (lc); + } + + /* Add new loginclass; bounded copy, as the KASSERT above is compiled out without INVARIANTS. */ + strlcpy(newlc->lc_name, name, sizeof(newlc->lc_name)); + refcount_init(&newlc->lc_refcount, 1); + LIST_INSERT_HEAD(&loginclasses, newlc, lc_next); + mtx_unlock(&loginclasses_lock); + + return (newlc); +} + +/* + * Get login class name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getloginclass_args { + char *namebuf; + size_t namelen; +}; +#endif +/* ARGSUSED */ +int +getloginclass(struct thread *td, struct getloginclass_args *uap) +{ + int error = 0; + size_t lcnamelen; + struct proc *p = td->td_proc; + struct loginclass *lc; + + PROC_LOCK(p); + lc = p->p_ucred->cr_loginclass; + loginclass_acquire(lc); + PROC_UNLOCK(p); + + lcnamelen = strlen(lc->lc_name) + 1; + if (lcnamelen > uap->namelen) + error = ERANGE; + if (error == 0) + error = copyout(lc->lc_name, uap->namebuf, lcnamelen); + loginclass_release(lc); + return (error); +} + +/* + * Set login class name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct setloginclass_args { + const char *namebuf; +}; +#endif +/* ARGSUSED */ +int +setloginclass(struct thread *td, struct setloginclass_args *uap) +{ + struct proc *p = td->td_proc; + int error; + char lcname[MAXLOGNAME]; + struct loginclass *newlc; + struct ucred *newcred, *oldcred; + + error = priv_check(td, PRIV_PROC_SETLOGINCLASS); + if (error != 0) + return (error); + error = copyinstr(uap->namebuf, lcname, sizeof(lcname), NULL); + if (error == ENAMETOOLONG) + return (EINVAL); + + newcred = crget(); + newlc = loginclass_find(lcname); + + PROC_LOCK(p); + oldcred = crcopysafe(p, newcred); + newcred->cr_loginclass = newlc; + p->p_ucred = newcred; + PROC_UNLOCK(p); +#ifdef CONTAINERS + container_proc_ucred_changed(p, oldcred, newcred); +#endif + + loginclass_release(oldcred->cr_loginclass); + crfree(oldcred); + + return (0); +} + +void +loginclass_container_foreach(void (*callback)(struct container *container, + void *arg2, void *arg3), void *arg2, void *arg3) +{ + struct loginclass *lc; + + mtx_lock(&loginclasses_lock); + LIST_FOREACH(lc, &loginclasses, lc_next) + (callback)(lc->lc_container, arg2, arg3); + mtx_unlock(&loginclasses_lock); +} + +static void +lc_init(void) +{ + + mtx_init(&loginclasses_lock, "loginclasses lock", NULL, MTX_DEF); +} diff -urNp current/sys/kern/kern_prot.c hrl/sys/kern/kern_prot.c 
--- current/sys/kern/kern_prot.c 2011-01-31 20:41:14.992957343 +0100 +++ hrl/sys/kern/kern_prot.c 2011-01-31 19:57:41.092969496 +0100 @@ -51,9 +51,11 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_pr #include #include #include +#include #include #include #include +#include #include #include #include @@ -578,6 +580,9 @@ setuid(struct thread *td, struct setuid_ } p->p_ucred = newcred; PROC_UNLOCK(p); +#ifdef CONTAINERS + container_proc_ucred_changed(p, oldcred, newcred); +#endif uifree(uip); crfree(oldcred); return (0); @@ -916,6 +921,9 @@ setreuid(register struct thread *td, str } p->p_ucred = newcred; PROC_UNLOCK(p); +#ifdef CONTAINERS + container_proc_ucred_changed(p, oldcred, newcred); +#endif uifree(ruip); uifree(euip); crfree(oldcred); @@ -1054,6 +1062,9 @@ setresuid(register struct thread *td, st } p->p_ucred = newcred; PROC_UNLOCK(p); +#ifdef CONTAINERS + container_proc_ucred_changed(p, oldcred, newcred); +#endif uifree(ruip); uifree(euip); crfree(oldcred); @@ -1837,6 +1848,7 @@ crfree(struct ucred *cr) */ if (cr->cr_prison != NULL) prison_free(cr->cr_prison); + loginclass_release(cr->cr_loginclass); #ifdef AUDIT audit_cred_destroy(cr); #endif @@ -1873,6 +1885,7 @@ crcopy(struct ucred *dest, struct ucred uihold(dest->cr_uidinfo); uihold(dest->cr_ruidinfo); prison_hold(dest->cr_prison); + loginclass_acquire(dest->cr_loginclass); #ifdef AUDIT audit_cred_copy(src, dest); #endif diff -urNp current/sys/kern/kern_rctl.c hrl/sys/kern/kern_rctl.c --- current/sys/kern/kern_rctl.c 1970-01-01 01:00:00.000000000 +0100 +++ hrl/sys/kern/kern_rctl.c 2011-01-31 19:57:41.132958097 +0100 @@ -0,0 +1,1779 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Edward Tomasz Napierala under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RCTL +#ifndef CONTAINERS +#error "The RCTL option requires the CONTAINERS option" +#endif + +FEATURE(rctl, "Resource Limits"); + +#define HRF_DEFAULT 0 +#define HRF_DONT_INHERIT 1 +#define HRF_DONT_ACCUMULATE 2 + +/* Default buffer size for rctl_get_rules(2). */ +#define RCTL_DEFAULT_BUFSIZE 4096 +#define RCTL_LOG_BUFSIZE 128 + +/* + * 'rctl_rule_link' connects a rule with every container it's related to. 
+ * For example, rule 'user:X:openfiles:deny=N/process' is linked + * with uidinfo for user X, and to each process of that user. + */ +struct rctl_rule_link { + LIST_ENTRY(rctl_rule_link) rrl_next; + struct rctl_rule *rrl_rule; +}; + +struct dict { + const char *d_name; + int d_value; +}; + +static struct dict subjectnames[] = { + { "process", RCTL_SUBJECT_TYPE_PROCESS }, + { "user", RCTL_SUBJECT_TYPE_USER }, + { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS }, + { "jail", RCTL_SUBJECT_TYPE_JAIL }, + { NULL, -1 }}; + +static struct dict resourcenames[] = { + { "cpu", RUSAGE_CPU }, + { "fsize", RUSAGE_FSIZE }, + { "data", RUSAGE_DATA }, + { "stack", RUSAGE_STACK }, + { "core", RUSAGE_CORE }, + { "rss", RUSAGE_RSS }, + { "memlock", RUSAGE_MEMLOCK }, + { "nproc", RUSAGE_NPROC }, + { "nofile", RUSAGE_NOFILE }, + { "sbsize", RUSAGE_SBSIZE }, + { "vmem", RUSAGE_VMEM }, + { "npts", RUSAGE_NPTS }, + { "swap", RUSAGE_SWAP }, + { "nthr", RUSAGE_NTHR }, + { "msgqqueued", RUSAGE_MSGQQUEUED }, + { "msgqsize", RUSAGE_MSGQSIZE }, + { "nmsgq", RUSAGE_NMSGQ }, + { "nsem", RUSAGE_NSEM }, + { "nsemop", RUSAGE_NSEMOP }, + { "nshm", RUSAGE_NSHM }, + { "shmsize", RUSAGE_SHMSIZE }, + { "wallclock", RUSAGE_WALLCLOCK }, + { "pctcpu", RUSAGE_PCTCPU }, + { NULL, -1 }}; + +static struct dict actionnames[] = { + { "sighup", RCTL_ACTION_SIGHUP }, + { "sigint", RCTL_ACTION_SIGINT }, + { "sigquit", RCTL_ACTION_SIGQUIT }, + { "sigill", RCTL_ACTION_SIGILL }, + { "sigtrap", RCTL_ACTION_SIGTRAP }, + { "sigabrt", RCTL_ACTION_SIGABRT }, + { "sigemt", RCTL_ACTION_SIGEMT }, + { "sigfpe", RCTL_ACTION_SIGFPE }, + { "sigkill", RCTL_ACTION_SIGKILL }, + { "sigbus", RCTL_ACTION_SIGBUS }, + { "sigsegv", RCTL_ACTION_SIGSEGV }, + { "sigsys", RCTL_ACTION_SIGSYS }, + { "sigpipe", RCTL_ACTION_SIGPIPE }, + { "sigalrm", RCTL_ACTION_SIGALRM }, + { "sigterm", RCTL_ACTION_SIGTERM }, + { "sigurg", RCTL_ACTION_SIGURG }, + { "sigstop", RCTL_ACTION_SIGSTOP }, + { "sigtstp", RCTL_ACTION_SIGTSTP }, + { "sigchld", 
RCTL_ACTION_SIGCHLD }, + { "sigttin", RCTL_ACTION_SIGTTIN }, + { "sigttou", RCTL_ACTION_SIGTTOU }, + { "sigio", RCTL_ACTION_SIGIO }, + { "sigxcpu", RCTL_ACTION_SIGXCPU }, + { "sigxfsz", RCTL_ACTION_SIGXFSZ }, + { "sigvtalrm", RCTL_ACTION_SIGVTALRM }, + { "sigprof", RCTL_ACTION_SIGPROF }, + { "sigwinch", RCTL_ACTION_SIGWINCH }, + { "siginfo", RCTL_ACTION_SIGINFO }, + { "sigusr1", RCTL_ACTION_SIGUSR1 }, + { "sigusr2", RCTL_ACTION_SIGUSR2 }, + { "sigthr", RCTL_ACTION_SIGTHR }, + { "deny", RCTL_ACTION_DENY }, + { "log", RCTL_ACTION_LOG }, + { NULL, -1 }}; + +static void rctl_init(void); +SYSINIT(rctl, SI_SUB_CONTAINER, SI_ORDER_FIRST, rctl_init, NULL); + +static uma_zone_t rctl_rule_link_zone; +static uma_zone_t rctl_rule_zone; +static struct rwlock rctl_lock; +RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock"); + +static int rctl_rule_fully_specified(const struct rctl_rule *rule); +static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule); + +MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits"); + +static const char * +rctl_subject_type_name(int subject) +{ + int i; + + for (i = 0; subjectnames[i].d_name != NULL; i++) { + if (subjectnames[i].d_value == subject) + return (subjectnames[i].d_name); + } + + panic("rctl_subject_type_name: unknown subject type %d", subject); +} + +static const char * +rctl_action_name(int action) +{ + int i; + + for (i = 0; actionnames[i].d_name != NULL; i++) { + if (actionnames[i].d_value == action) + return (actionnames[i].d_name); + } + + panic("rctl_action_name: unknown action %d", action); +} + +const char * +rctl_resource_name(int resource) +{ + int i; + + for (i = 0; resourcenames[i].d_name != NULL; i++) { + if (resourcenames[i].d_value == resource) + return (resourcenames[i].d_name); + } + + panic("rctl_resource_name: unknown resource %d", resource); +} + +/* + * Return the amount of resource that can be allocated by 'p' before + * hitting 'rule'. 
+ */ +static int64_t +rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) +{ + int resource; + int64_t available = INT64_MAX; + struct ucred *cred = p->p_ucred; + + rw_assert(&rctl_lock, RA_LOCKED); + + resource = rule->rr_resource; + switch (rule->rr_per) { + case RCTL_SUBJECT_TYPE_PROCESS: + available = rule->rr_amount - + p->p_container->c_resources[resource]; + break; + case RCTL_SUBJECT_TYPE_USER: + available = rule->rr_amount - + cred->cr_ruidinfo->ui_container->c_resources[resource]; + break; + case RCTL_SUBJECT_TYPE_LOGINCLASS: + available = rule->rr_amount - + cred->cr_loginclass->lc_container->c_resources[resource]; + break; + case RCTL_SUBJECT_TYPE_JAIL: + available = rule->rr_amount - + cred->cr_prison->pr_container->c_resources[resource]; + break; + default: + panic("rctl_available_resource: unknown per %d", + rule->rr_per); + } + + return (available); +} + +/* + * Return non-zero if allocating 'amount' by proc 'p' would exceed + * resource limit specified by 'rule'. + */ +static int +rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule, + int64_t amount) +{ + int64_t available; + + rw_assert(&rctl_lock, RA_LOCKED); + + available = rctl_available_resource(p, rule); + if (available >= amount) + return (0); + + /* + * We've already exceeded that one. + */ + if (available < 0) { +#ifdef notyet + KASSERT(rule->rr_action != RCTL_ACTION_DENY || + !rusage_is_deniable(rule->rr_resource), + ("rctl_would_exceed: deny rule already exceeded")); +#endif + return (0); + } + + return (1); +} + +/* + * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition + * to what it keeps allocated now. Returns non-zero if the allocation should + * be denied, 0 otherwise. 
+ */ +int +rctl_enforce(struct proc *p, int resource, uint64_t amount) +{ + struct rctl_rule *rule; + struct rctl_rule_link *link; + struct sbuf sb; + int should_deny = 0; + char *buf; + static int curtime = 0; + static struct timeval lasttime; + + rw_rlock(&rctl_lock); + + /* + * There may be more than one matching rule; go through all of them. + * Denial should be done last, after logging and sending signals. + */ + LIST_FOREACH(link, &p->p_container->c_rule_links, rrl_next) { + rule = link->rrl_rule; + if (rule->rr_resource != resource) + continue; + if (!rctl_would_exceed(p, rule, amount)) + continue; + + switch (rule->rr_action) { + case RCTL_ACTION_DENY: + should_deny = 1; + continue; + case RCTL_ACTION_LOG: + if (!ppsratecheck(&lasttime, &curtime, 10)) + continue; + + buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); + if (buf == NULL) { + printf("rctl_enforce: out of memory\n"); + continue; + } + sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); + rctl_rule_to_sbuf(&sb, rule); + sbuf_finish(&sb); + printf("resource limit \"%s\" exceeded by process %d " + "(%s), uid %d\n", sbuf_data(&sb), p->p_pid, + p->p_comm, p->p_ucred->cr_uid); + sbuf_delete(&sb); + free(buf, M_RCTL); + continue; + default: + KASSERT(rule->rr_action > 0 && + rule->rr_action <= RCTL_ACTION_SIGNAL_MAX, + ("rctl_enforce: unknown action %d", + rule->rr_action)); + + /* + * We're using the fact that RCTL_ACTION_SIG* values + * are equal to their counterparts from sys/signal.h. + */ + psignal(p, rule->rr_action); + continue; + } + } + + rw_runlock(&rctl_lock); + + if (should_deny) { + /* + * Return fake error code; the caller should change it + * into one proper for the situation - EFSIZ, ENOMEM etc. 
+ */ + return (EDOOFUS); + } + + return (0); +} + +uint64_t +rctl_get_limit(struct proc *p, int resource) +{ + struct rctl_rule *rule; + struct rctl_rule_link *link; + uint64_t amount = UINT64_MAX; + + rw_rlock(&rctl_lock); + + /* + * There may be more than one matching rule; go through all of them. + * Only "deny" rules count; the smallest rr_amount among them wins. + */ + LIST_FOREACH(link, &p->p_container->c_rule_links, rrl_next) { + rule = link->rrl_rule; + if (rule->rr_resource != resource) + continue; + if (rule->rr_action != RCTL_ACTION_DENY) + continue; + if (rule->rr_amount < amount) + amount = rule->rr_amount; + } + + rw_runlock(&rctl_lock); + + return (amount); +} + +uint64_t +rctl_get_available(struct proc *p, int resource) +{ + struct rctl_rule *rule; + struct rctl_rule_link *link; + int64_t available, minavailable, allocated; + + minavailable = INT64_MAX; + + rw_rlock(&rctl_lock); + + /* + * There may be more than one matching rule; go through all of them. + * Only "deny" rules count; keep the smallest available amount. + */ + LIST_FOREACH(link, &p->p_container->c_rule_links, rrl_next) { + rule = link->rrl_rule; + if (rule->rr_resource != resource) + continue; + if (rule->rr_action != RCTL_ACTION_DENY) + continue; + available = rctl_available_resource(p, rule); + if (available < minavailable) + minavailable = available; + } + + rw_runlock(&rctl_lock); + + /* + * XXX: Think about this _hard_. 
+ */ + allocated = p->p_container->c_resources[resource]; + if (minavailable < INT64_MAX - allocated) + minavailable += allocated; + if (minavailable < 0) + minavailable = 0; + return (minavailable); +} + +static int +rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter) +{ + + if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) { + if (rule->rr_subject_type != filter->rr_subject_type) + return (0); + + switch (filter->rr_subject_type) { + case RCTL_SUBJECT_TYPE_PROCESS: + if (filter->rr_subject.rs_proc != NULL && + rule->rr_subject.rs_proc != + filter->rr_subject.rs_proc) + return (0); + break; + case RCTL_SUBJECT_TYPE_USER: + if (filter->rr_subject.rs_uip != NULL && + rule->rr_subject.rs_uip != + filter->rr_subject.rs_uip) + return (0); + break; + case RCTL_SUBJECT_TYPE_LOGINCLASS: + if (filter->rr_subject.hr_loginclass != NULL && + rule->rr_subject.hr_loginclass != + filter->rr_subject.hr_loginclass) + return (0); + break; + case RCTL_SUBJECT_TYPE_JAIL: + if (filter->rr_subject.rs_prison != NULL && + rule->rr_subject.rs_prison != + filter->rr_subject.rs_prison) + return (0); + break; + default: + panic("rctl_rule_matches: unknown subject type %d", + filter->rr_subject_type); + } + } + + if (filter->rr_resource != RUSAGE_UNDEFINED) { + if (rule->rr_resource != filter->rr_resource) + return (0); + } + + if (filter->rr_action != RCTL_ACTION_UNDEFINED) { + if (rule->rr_action != filter->rr_action) + return (0); + } + + if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) { + if (rule->rr_amount != filter->rr_amount) + return (0); + } + + if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) { + if (rule->rr_per != filter->rr_per) + return (0); + } + + return (1); +} + +static int +str2value(const char *str, int *value, struct dict *table) +{ + int i; + + if (value == NULL) + return (EINVAL); + + for (i = 0; table[i].d_name != NULL; i++) { + if (strcasecmp(table[i].d_name, str) == 0) { + *value = table[i].d_value; + return (0); + } + } + + 
return (EINVAL); +} + +static int +str2id(const char *str, id_t *value) +{ + char *end; + + if (str == NULL) + return (EINVAL); + + *value = strtoul(str, &end, 10); + if ((size_t)(end - str) != strlen(str)) + return (EINVAL); + + return (0); +} + +static int +str2int64(const char *str, int64_t *value) +{ + char *end; + + if (str == NULL) + return (EINVAL); + + *value = strtouq(str, &end, 10); /* strtoul() is only 32 bits wide on ILP32 platforms */ + if ((size_t)(end - str) != strlen(str)) + return (EINVAL); + + return (0); +} + +/* + * Connect the rule to the container, increasing refcount for the rule. + */ +static void +rctl_container_add_rule(struct container *container, struct rctl_rule *rule) +{ + struct rctl_rule_link *link; + + KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); + + rctl_rule_acquire(rule); + link = uma_zalloc(rctl_rule_link_zone, M_WAITOK); + link->rrl_rule = rule; + + rw_wlock(&rctl_lock); + LIST_INSERT_HEAD(&container->c_rule_links, link, rrl_next); + rw_wunlock(&rctl_lock); +} + +static int +rctl_container_add_rule_locked(struct container *container, struct rctl_rule *rule) +{ + struct rctl_rule_link *link; + + KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); + rw_assert(&rctl_lock, RA_WLOCKED); + + link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT); + if (link == NULL) + return (ENOMEM); + rctl_rule_acquire(rule); + link->rrl_rule = rule; + + LIST_INSERT_HEAD(&container->c_rule_links, link, rrl_next); + return (0); +} + +/* + * Remove limits for rules matching the filter and release + * the refcounts for the rules, possibly freeing them. Returns + * the number of limit structures removed. 
+ */ +static int +rctl_container_remove_rules(struct container *container, + const struct rctl_rule *filter) +{ + int removed = 0; + struct rctl_rule_link *link, *linktmp; + + rw_assert(&rctl_lock, RA_WLOCKED); + + LIST_FOREACH_SAFE(link, &container->c_rule_links, rrl_next, linktmp) { + if (!rctl_rule_matches(link->rrl_rule, filter)) + continue; + + LIST_REMOVE(link, rrl_next); + rctl_rule_release(link->rrl_rule); + uma_zfree(rctl_rule_link_zone, link); + removed++; + } + return (removed); +} + +static void +rctl_rule_acquire_subject(struct rctl_rule *rule) +{ + + switch (rule->rr_subject_type) { + case RCTL_SUBJECT_TYPE_UNDEFINED: + case RCTL_SUBJECT_TYPE_PROCESS: + break; + case RCTL_SUBJECT_TYPE_USER: + if (rule->rr_subject.rs_uip != NULL) + uihold(rule->rr_subject.rs_uip); + break; + case RCTL_SUBJECT_TYPE_LOGINCLASS: + if (rule->rr_subject.hr_loginclass != NULL) + loginclass_acquire(rule->rr_subject.hr_loginclass); + break; + case RCTL_SUBJECT_TYPE_JAIL: /* test the member that is held, mirroring rctl_rule_release_subject() */ + if (rule->rr_subject.rs_prison != NULL) + prison_hold(rule->rr_subject.rs_prison); + break; + default: + panic("rctl_rule_acquire_subject: unknown subject type %d", + rule->rr_subject_type); + } +} + +static void +rctl_rule_release_subject(struct rctl_rule *rule) +{ + + switch (rule->rr_subject_type) { + case RCTL_SUBJECT_TYPE_UNDEFINED: + case RCTL_SUBJECT_TYPE_PROCESS: + break; + case RCTL_SUBJECT_TYPE_USER: + if (rule->rr_subject.rs_uip != NULL) + uifree(rule->rr_subject.rs_uip); + break; + case RCTL_SUBJECT_TYPE_LOGINCLASS: + if (rule->rr_subject.hr_loginclass != NULL) + loginclass_release(rule->rr_subject.hr_loginclass); + break; + case RCTL_SUBJECT_TYPE_JAIL: + if (rule->rr_subject.rs_prison != NULL) + prison_free(rule->rr_subject.rs_prison); + break; + default: + panic("rctl_rule_release_subject: unknown subject type %d", + rule->rr_subject_type); + } +} + +struct rctl_rule * +rctl_rule_alloc(int flags) +{ + struct rctl_rule *rule; + + rule = uma_zalloc(rctl_rule_zone, flags); + if (rule == NULL) 
+ return (NULL); + rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; + rule->rr_subject.rs_proc = NULL; + rule->rr_subject.rs_uip = NULL; + rule->rr_subject.hr_loginclass = NULL; + rule->rr_subject.rs_prison = NULL; + rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; + rule->rr_resource = RUSAGE_UNDEFINED; + rule->rr_action = RCTL_ACTION_UNDEFINED; + rule->rr_amount = RCTL_AMOUNT_UNDEFINED; + refcount_init(&rule->rr_refcount, 1); + + return (rule); +} + +struct rctl_rule * +rctl_rule_duplicate(const struct rctl_rule *rule, int flags) +{ + struct rctl_rule *copy; + + copy = uma_zalloc(rctl_rule_zone, flags); + if (copy == NULL) + return (NULL); + copy->rr_subject_type = rule->rr_subject_type; + copy->rr_subject.rs_proc = rule->rr_subject.rs_proc; + copy->rr_subject.rs_uip = rule->rr_subject.rs_uip; + copy->rr_subject.hr_loginclass = rule->rr_subject.hr_loginclass; + copy->rr_subject.rs_prison = rule->rr_subject.rs_prison; + copy->rr_per = rule->rr_per; + copy->rr_resource = rule->rr_resource; + copy->rr_action = rule->rr_action; + copy->rr_amount = rule->rr_amount; + refcount_init(&copy->rr_refcount, 1); + rctl_rule_acquire_subject(copy); + + return (copy); +} + +void +rctl_rule_acquire(struct rctl_rule *rule) +{ + + KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); + + refcount_acquire(&rule->rr_refcount); +} + +static void +rctl_rule_free(void *context, int pending) +{ + struct rctl_rule *rule; + + rule = (struct rctl_rule *)context; + + KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0")); + + /* + * We don't need locking here; rule is guaranteed to be inaccessible. + */ + + rctl_rule_release_subject(rule); + uma_zfree(rctl_rule_zone, rule); +} + +void +rctl_rule_release(struct rctl_rule *rule) +{ + + KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); + + if (refcount_release(&rule->rr_refcount)) { + /* + * rctl_rule_release() is often called when iterating + * over all the uidinfo structures in the system, + * holding uihashtbl_lock. 
Since rctl_rule_free() + * might end up calling uifree(), this would lead + * to lock recursion. Use taskqueue to avoid this. + */ + TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule); + taskqueue_enqueue(taskqueue_thread, &rule->rr_task); + } +} + +static int +rctl_rule_fully_specified(const struct rctl_rule *rule) +{ + + switch (rule->rr_subject_type) { + case RCTL_SUBJECT_TYPE_UNDEFINED: + return (0); + case RCTL_SUBJECT_TYPE_PROCESS: + if (rule->rr_subject.rs_proc == NULL) + return (0); + break; + case RCTL_SUBJECT_TYPE_USER: + if (rule->rr_subject.rs_uip == NULL) + return (0); + break; + case RCTL_SUBJECT_TYPE_LOGINCLASS: + if (rule->rr_subject.hr_loginclass == NULL) + return (0); + break; + case RCTL_SUBJECT_TYPE_JAIL: + if (rule->rr_subject.rs_prison == NULL) + return (0); + break; + default: + panic("rctl_rule_fully_specified: unknown subject type %d", + rule->rr_subject_type); + } + if (rule->rr_resource == RUSAGE_UNDEFINED) + return (0); + if (rule->rr_action == RCTL_ACTION_UNDEFINED) + return (0); + if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED) + return (0); + if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED) + return (0); + + return (1); +} + +static struct rctl_rule * +rctl_rule_from_string(char *rulestr) +{ + int error = 0; + char *subjectstr, *subject_idstr, *resourcestr, *actionstr, + *amountstr, *perstr; + struct rctl_rule *rule; + id_t id; + + rule = rctl_rule_alloc(M_WAITOK); + + subjectstr = strsep(&rulestr, ":"); + subject_idstr = strsep(&rulestr, ":"); + resourcestr = strsep(&rulestr, ":"); + actionstr = strsep(&rulestr, "=/"); + amountstr = strsep(&rulestr, "/"); + perstr = rulestr; + + if (subjectstr == NULL || subjectstr[0] == '\0') + rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; + else { + error = str2value(subjectstr, &rule->rr_subject_type, subjectnames); + if (error != 0) + goto out; + } + + if (subject_idstr == NULL || subject_idstr[0] == '\0') { + rule->rr_subject.rs_proc = NULL; + rule->rr_subject.rs_uip = NULL; + 
rule->rr_subject.hr_loginclass = NULL; + rule->rr_subject.rs_prison = NULL; + } else { + + /* + * Loginclasses don't have any numerical ID's. + */ + if (rule->rr_subject_type != RCTL_SUBJECT_TYPE_LOGINCLASS) { + error = str2id(subject_idstr, &id); + if (error != 0) + goto out; + } + switch (rule->rr_subject_type) { + case RCTL_SUBJECT_TYPE_UNDEFINED: + error = EINVAL; + goto out; + case RCTL_SUBJECT_TYPE_PROCESS: + sx_assert(&allproc_lock, SA_LOCKED); + rule->rr_subject.rs_proc = pfind(id); + if (rule->rr_subject.rs_proc == NULL) { + error = ESRCH; + goto out; + } + PROC_UNLOCK(rule->rr_subject.rs_proc); + break; + case RCTL_SUBJECT_TYPE_USER: + rule->rr_subject.rs_uip = uifind(id); + break; + case RCTL_SUBJECT_TYPE_LOGINCLASS: + rule->rr_subject.hr_loginclass = loginclass_find(subject_idstr); + break; + case RCTL_SUBJECT_TYPE_JAIL: + sx_slock(&allprison_lock); + rule->rr_subject.rs_prison = prison_find(id); + if (rule->rr_subject.rs_prison == NULL) { + sx_sunlock(&allprison_lock); + error = ESRCH; + goto out; + } + prison_hold_locked(rule->rr_subject.rs_prison); + /* prison_find() returns with mutex held. 
*/ + mtx_unlock(&rule->rr_subject.rs_prison->pr_mtx); + sx_sunlock(&allprison_lock); + break; + default: + panic("rctl_rule_from_string: unknown subject type %d", + rule->rr_subject_type); + } + } + + if (resourcestr == NULL || resourcestr[0] == '\0') + rule->rr_resource = RUSAGE_UNDEFINED; + else { + error = str2value(resourcestr, &rule->rr_resource, + resourcenames); + if (error != 0) + goto out; + } + + if (actionstr == NULL || actionstr[0] == '\0') + rule->rr_action = RCTL_ACTION_UNDEFINED; + else { + error = str2value(actionstr, &rule->rr_action, actionnames); + if (error != 0) + goto out; + } + + if (amountstr == NULL || amountstr[0] == '\0') + rule->rr_amount = RCTL_AMOUNT_UNDEFINED; + else { + error = str2int64(amountstr, &rule->rr_amount); + if (error != 0) + goto out; + if (rusage_is_in_thousands(rule->rr_resource)) + rule->rr_amount *= 1000; + } + + if (perstr == NULL || perstr[0] == '\0') + rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; + else { + error = str2value(perstr, &rule->rr_per, subjectnames); + if (error != 0) + goto out; + } + +out: + if (error != 0) { + rctl_rule_release(rule); + return (NULL); + } + + return (rule); +} + +/* + * Link a rule with all the subjects it applies to. + */ +int +rctl_rule_add(struct rctl_rule *rule) +{ + struct proc *p; + struct ucred *cred; + struct uidinfo *uip; + struct prison *pr; + struct loginclass *lc; + struct rctl_rule *rule2; + int match; + + KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); + + /* + * Some rules just don't make sense. Note that the one below + * cannot be rewritten using rusage_is_deniable(); the RUSAGE_PCTCPU, + * for example, is not deniable in the containers sense, but the + * limit is enforced in a different way, so "deny" rules for %CPU + * do make sense. 
+ */ + if (rule->rr_action == RCTL_ACTION_DENY && + (rule->rr_resource == RUSAGE_CPU || + rule->rr_resource == RUSAGE_WALLCLOCK)) + return (EINVAL); + + if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && + rusage_is_sloppy(rule->rr_resource)) + return (EINVAL); + + /* + * Make sure there are no duplicated rules. Also, for the "deny" + * rules, remove ones differing only by "amount". + */ + if (rule->rr_action == RCTL_ACTION_DENY) { + rule2 = rctl_rule_duplicate(rule, M_WAITOK); + rule2->rr_amount = RCTL_AMOUNT_UNDEFINED; + rctl_rule_remove(rule2); + rctl_rule_release(rule2); + } else + rctl_rule_remove(rule); + + switch (rule->rr_subject_type) { + case RCTL_SUBJECT_TYPE_PROCESS: + p = rule->rr_subject.rs_proc; + KASSERT(p != NULL, ("rctl_rule_add: NULL proc")); + /* + * No resource limits for system processes. + */ + if (p->p_flag & P_SYSTEM) + return (EINVAL); + + rctl_container_add_rule(p->p_container, rule); + /* + * In case of per-process rule, we don't have anything more + * to do. + */ + return (0); + + case RCTL_SUBJECT_TYPE_USER: + uip = rule->rr_subject.rs_uip; + KASSERT(uip != NULL, ("rctl_rule_add: NULL uip")); + rctl_container_add_rule(uip->ui_container, rule); + break; + + case RCTL_SUBJECT_TYPE_LOGINCLASS: + lc = rule->rr_subject.hr_loginclass; + KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass")); + rctl_container_add_rule(lc->lc_container, rule); + break; + + case RCTL_SUBJECT_TYPE_JAIL: + pr = rule->rr_subject.rs_prison; + KASSERT(pr != NULL, ("rctl_rule_add: NULL pr")); + rctl_container_add_rule(pr->pr_container, rule); + break; + + default: + panic("rctl_rule_add: unknown subject type %d", + rule->rr_subject_type); + } + + /* + * Now go through all the processes and add the new rule to the ones + * it applies to. 
+ */ + sx_assert(&allproc_lock, SA_LOCKED); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_flag & P_SYSTEM) + continue; + cred = p->p_ucred; + switch (rule->rr_subject_type) { + case RCTL_SUBJECT_TYPE_USER: + if (cred->cr_uidinfo == rule->rr_subject.rs_uip || + cred->cr_ruidinfo == rule->rr_subject.rs_uip) + break; + continue; + case RCTL_SUBJECT_TYPE_LOGINCLASS: + if (cred->cr_loginclass == rule->rr_subject.hr_loginclass) + break; + continue; + case RCTL_SUBJECT_TYPE_JAIL: + match = 0; + for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { + if (pr == rule->rr_subject.rs_prison) { + match = 1; + break; + } + } + if (match) + break; + continue; + default: + panic("rctl_rule_add: unknown subject type %d", + rule->rr_subject_type); + } + + rctl_container_add_rule(p->p_container, rule); + } + + return (0); +} + +static void +rctl_rule_remove_callback(struct container *container, void *arg2, void *arg3) +{ + struct rctl_rule *filter = (struct rctl_rule *)arg2; + int found = 0; + + rw_wlock(&rctl_lock); + found += rctl_container_remove_rules(container, filter); + rw_wunlock(&rctl_lock); + + *((int *)arg3) += found; +} + +/* + * Remove all rules that match the filter. 
+ */ +int +rctl_rule_remove(struct rctl_rule *filter) +{ + int found = 0; + struct proc *p; + + if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS && + filter->rr_subject.rs_proc != NULL) { + p = filter->rr_subject.rs_proc; + rw_wlock(&rctl_lock); + found = rctl_container_remove_rules(p->p_container, filter); + rw_wunlock(&rctl_lock); + if (found) + return (0); + return (ESRCH); + } + + loginclass_container_foreach(rctl_rule_remove_callback, filter, + (void *)&found); + ui_container_foreach(rctl_rule_remove_callback, filter, + (void *)&found); + prison_container_foreach(rctl_rule_remove_callback, filter, + (void *)&found); + + sx_assert(&allproc_lock, SA_LOCKED); + rw_wlock(&rctl_lock); + FOREACH_PROC_IN_SYSTEM(p) { + found += rctl_container_remove_rules(p->p_container, filter); + } + rw_wunlock(&rctl_lock); + + if (found) + return (0); + return (ESRCH); +} + +/* + * Appends a rule to the sbuf. + */ +static void +rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) +{ + int64_t amount; + + sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type)); + + switch (rule->rr_subject_type) { + case RCTL_SUBJECT_TYPE_PROCESS: + if (rule->rr_subject.rs_proc == NULL) + sbuf_printf(sb, ":"); + else + sbuf_printf(sb, "%d:", rule->rr_subject.rs_proc->p_pid); + break; + case RCTL_SUBJECT_TYPE_USER: + if (rule->rr_subject.rs_uip == NULL) + sbuf_printf(sb, ":"); + else + sbuf_printf(sb, "%d:", rule->rr_subject.rs_uip->ui_uid); + break; + case RCTL_SUBJECT_TYPE_LOGINCLASS: + if (rule->rr_subject.hr_loginclass == NULL) + sbuf_printf(sb, ":"); + else + sbuf_printf(sb, "%s:", rule->rr_subject.hr_loginclass->lc_name); + break; + case RCTL_SUBJECT_TYPE_JAIL: + if (rule->rr_subject.rs_prison == NULL) + sbuf_printf(sb, ":"); + else + sbuf_printf(sb, "%d:", rule->rr_subject.rs_prison->pr_id); + break; + default: + panic("rctl_rule_to_sbuf: unknown subject type %d", + rule->rr_subject_type); + } + + amount = rule->rr_amount; + if (amount != RCTL_AMOUNT_UNDEFINED 
&& + rusage_is_in_thousands(rule->rr_resource)) + amount /= 1000; + + sbuf_printf(sb, "%s:%s=%jd", + rctl_resource_name(rule->rr_resource), + rctl_action_name(rule->rr_action), + amount); + + if (rule->rr_per != rule->rr_subject_type) + sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per)); +} + +/* + * Routine used by RCTL syscalls to read in input string. + */ +static int +rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen) +{ + int error; + char *str; + + if (inbuflen <= 0) + return (EINVAL); + + str = malloc(inbuflen + 1, M_RCTL, M_WAITOK); + error = copyinstr(inbufp, str, inbuflen, NULL); + if (error != 0) { + free(str, M_RCTL); + return (error); + } + + *inputstr = str; + + return (0); +} + +/* + * Routine used by RCTL syscalls to write out output string. + */ +static int +rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen) +{ + int error; + + if (outputsbuf == NULL) + return (0); + + sbuf_finish(outputsbuf); + if (outbuflen < sbuf_len(outputsbuf) + 1) { + sbuf_delete(outputsbuf); + return (ERANGE); + } + error = copyout(sbuf_data(outputsbuf), outbufp, + sbuf_len(outputsbuf) + 1); + sbuf_delete(outputsbuf); + return (error); +} + +static struct sbuf * +rctl_container_to_sbuf(struct container *container, int sloppy) +{ + int i; + int64_t amount; + struct sbuf *sb; + + sb = sbuf_new_auto(); + for (i = 0; i <= RUSAGE_MAX; i++) { + if (sloppy == 0 && rusage_is_sloppy(i)) + continue; + amount = container->c_resources[i]; + if (rusage_is_in_thousands(i)) + amount /= 1000; + sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); + } + sbuf_setpos(sb, sbuf_len(sb) - 1); + return (sb); +} + +int +rctl_get_usage(struct thread *td, struct rctl_get_usage_args *uap) +{ + int error; + char *inputstr; + struct rctl_rule *filter; + struct sbuf *outputsbuf = NULL; + struct proc *p; + struct uidinfo *uip; + struct loginclass *lc; + struct prison *pr; + + error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); + if 
(error != 0) + return (error); + + sx_slock(&allproc_lock); + filter = rctl_rule_from_string(inputstr); + free(inputstr, M_RCTL); + if (filter == NULL) { + sx_sunlock(&allproc_lock); + return (EINVAL); + } + + switch (filter->rr_subject_type) { + case RCTL_SUBJECT_TYPE_PROCESS: + p = filter->rr_subject.rs_proc; + if (p == NULL) { + error = EINVAL; + goto out; + } + if (p->p_flag & P_SYSTEM) { + error = EINVAL; + goto out; + } + outputsbuf = rctl_container_to_sbuf(p->p_container, 0); + break; + case RCTL_SUBJECT_TYPE_USER: + uip = filter->rr_subject.rs_uip; + if (uip == NULL) { + error = EINVAL; + goto out; + } + outputsbuf = rctl_container_to_sbuf(uip->ui_container, 1); + break; + case RCTL_SUBJECT_TYPE_LOGINCLASS: + lc = filter->rr_subject.hr_loginclass; + if (lc == NULL) { + error = EINVAL; + goto out; + } + outputsbuf = rctl_container_to_sbuf(lc->lc_container, 1); + break; + case RCTL_SUBJECT_TYPE_JAIL: + pr = filter->rr_subject.rs_prison; + if (pr == NULL) { + error = EINVAL; + goto out; + } + outputsbuf = rctl_container_to_sbuf(pr->pr_container, 1); + break; + default: + error = EINVAL; + } +out: + rctl_rule_release(filter); + sx_sunlock(&allproc_lock); + if (error != 0) + return (error); + + error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen); + + return (error); +} + +static void +rctl_get_rules_callback(struct container *container, void *arg2, void *arg3) +{ + struct rctl_rule *filter = (struct rctl_rule *)arg2; + struct rctl_rule_link *link; + struct sbuf *sb = (struct sbuf *)arg3; + + rw_rlock(&rctl_lock); + LIST_FOREACH(link, &container->c_rule_links, rrl_next) { + if (!rctl_rule_matches(link->rrl_rule, filter)) + continue; + rctl_rule_to_sbuf(sb, link->rrl_rule); + sbuf_printf(sb, ","); + } + rw_runlock(&rctl_lock); +} + +int +rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) +{ + int error; + size_t bufsize = RCTL_DEFAULT_BUFSIZE; + char *inputstr, *buf; + struct sbuf *sb; + struct rctl_rule *filter; + struct 
rctl_rule_link *link; + struct proc *p; + + error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); + if (error != 0) + return (error); + + sx_slock(&allproc_lock); + filter = rctl_rule_from_string(inputstr); + free(inputstr, M_RCTL); + if (filter == NULL) { + sx_sunlock(&allproc_lock); + return (EINVAL); + } + +again: + buf = malloc(bufsize, M_RCTL, M_WAITOK); + sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); + KASSERT(sb != NULL, ("sbuf_new failed")); + + sx_assert(&allproc_lock, SA_LOCKED); + FOREACH_PROC_IN_SYSTEM(p) { + rw_rlock(&rctl_lock); + LIST_FOREACH(link, &p->p_container->c_rule_links, rrl_next) { + /* + * Non-process rules will be added to the buffer later. + * Adding them here would result in duplicated output. + */ + if (link->rrl_rule->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) + continue; + if (!rctl_rule_matches(link->rrl_rule, filter)) + continue; + rctl_rule_to_sbuf(sb, link->rrl_rule); + sbuf_printf(sb, ","); + } + rw_runlock(&rctl_lock); + } + + loginclass_container_foreach(rctl_get_rules_callback, filter, sb); + ui_container_foreach(rctl_get_rules_callback, filter, sb); + prison_container_foreach(rctl_get_rules_callback, filter, sb); + if (sbuf_error(sb) == ENOMEM) { + sbuf_delete(sb); + free(buf, M_RCTL); + bufsize *= 4; + goto again; + } + + /* + * Remove trailing ",". 
+ */ + if (sbuf_len(sb) > 0) + sbuf_setpos(sb, sbuf_len(sb) - 1); + + error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); + + rctl_rule_release(filter); + sx_sunlock(&allproc_lock); + free(buf, M_RCTL); + return (error); +} + +int +rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) +{ + int error; + size_t bufsize = RCTL_DEFAULT_BUFSIZE; + char *inputstr, *buf; + struct sbuf *sb; + struct rctl_rule *filter; + struct rctl_rule_link *link; + + error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); + if (error != 0) + return (error); + + sx_slock(&allproc_lock); + filter = rctl_rule_from_string(inputstr); + free(inputstr, M_RCTL); + if (filter == NULL) { + sx_sunlock(&allproc_lock); + return (EINVAL); + } + + if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { + rctl_rule_release(filter); + sx_sunlock(&allproc_lock); + return (EINVAL); + } + if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { + rctl_rule_release(filter); + sx_sunlock(&allproc_lock); + return (EOPNOTSUPP); + } + if (filter->rr_subject.rs_proc == NULL) { + rctl_rule_release(filter); + sx_sunlock(&allproc_lock); + return (EINVAL); + } + +again: + buf = malloc(bufsize, M_RCTL, M_WAITOK); + sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); + KASSERT(sb != NULL, ("sbuf_new failed")); + + rw_rlock(&rctl_lock); + LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_container->c_rule_links, rrl_next) { + rctl_rule_to_sbuf(sb, link->rrl_rule); + sbuf_printf(sb, ","); + } + rw_runlock(&rctl_lock); + if (sbuf_error(sb) == ENOMEM) { + sbuf_delete(sb); + free(buf, M_RCTL); + bufsize *= 4; + goto again; + } + + /* + * Remove trailing ",". 
+ */ + if (sbuf_len(sb) > 0) + sbuf_setpos(sb, sbuf_len(sb) - 1); + + error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); + rctl_rule_release(filter); + sx_sunlock(&allproc_lock); + free(buf, M_RCTL); + return (error); +} + +int +rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) +{ + int error; + struct rctl_rule *rule; + char *inputstr; + + error = priv_check(td, PRIV_RCTL_SET); + if (error != 0) + return (error); + + error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); + if (error != 0) + return (error); + + sx_slock(&allproc_lock); + rule = rctl_rule_from_string(inputstr); + free(inputstr, M_RCTL); + if (rule == NULL) { + sx_sunlock(&allproc_lock); + return (EINVAL); + } + /* + * The 'per' part of a rule is optional. + */ + if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED && + rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) + rule->rr_per = rule->rr_subject_type; + + if (!rctl_rule_fully_specified(rule)) { + error = EINVAL; + goto out; + } + + error = rctl_rule_add(rule); + +out: + rctl_rule_release(rule); + sx_sunlock(&allproc_lock); + return (error); +} + +int +rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) +{ + int error; + struct rctl_rule *filter; + char *inputstr; + + error = priv_check(td, PRIV_RCTL_SET); + if (error != 0) + return (error); + + error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); + if (error != 0) + return (error); + + sx_slock(&allproc_lock); + filter = rctl_rule_from_string(inputstr); + free(inputstr, M_RCTL); + if (filter == NULL) { + sx_sunlock(&allproc_lock); + return (EINVAL); + } + + error = rctl_rule_remove(filter); + rctl_rule_release(filter); + sx_sunlock(&allproc_lock); + + return (error); +} + +/* + * Update RCTL rule list after credential change. 
/*
 * Update RCTL rule list after credential change.
 *
 * Rebuilds the rule list of process 'p' so that it consists of:
 *   - the process-subject rules already linked to 'p', and
 *   - every rule linked to the containers of the new credential's
 *     real-uid uidinfo, loginclass and prison.
 *
 * The new list entries are allocated with M_WAITOK, which cannot be done
 * while holding rctl_lock.  The function therefore counts the rules under
 * the read lock, drops the lock to allocate, then re-takes the lock for
 * writing and fills the entries in.  If the number of applicable rules
 * changed in between, the temporary list is thrown away and the whole
 * procedure is retried.
 *
 * NOTE(review): only newcred->cr_ruidinfo and newcred->cr_prison itself
 * are consulted here, whereas rctl_rule_add() also matches cr_uidinfo and
 * walks the pr_parent chain - confirm the difference is intentional.
 */
void
rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
{
	int rulecnt, i;
	struct rctl_rule_link *link, *newlink;
	struct uidinfo *newuip;
	struct loginclass *newlc;
	struct prison *newpr;
	LIST_HEAD(, rctl_rule_link) newrules;

	newuip = newcred->cr_ruidinfo;
	newlc = newcred->cr_loginclass;
	newpr = newcred->cr_prison;

	LIST_INIT(&newrules);

again:
	/*
	 * First, count the rules that apply to the process with new
	 * credentials.
	 */
	rulecnt = 0;
	rw_rlock(&rctl_lock);
	LIST_FOREACH(link, &p->p_container->c_rule_links, rrl_next) {
		/* Of the process' own list, only per-process rules survive. */
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS)
			rulecnt++;
	}
	LIST_FOREACH(link, &newuip->ui_container->c_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newlc->lc_container->c_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newpr->pr_container->c_rule_links, rrl_next)
		rulecnt++;
	rw_runlock(&rctl_lock);

	/*
	 * Create temporary list.  We've dropped the rctl_lock in order
	 * to use M_WAITOK.
	 */
	for (i = 0; i < rulecnt; i++) {
		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
		/* NULL marks an entry that has not been assigned a rule yet. */
		newlink->rrl_rule = NULL;
		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
	}

	/* Cursor over the preallocated entries; NULL once they run out. */
	newlink = LIST_FIRST(&newrules);

	/*
	 * Assign rules to the newly allocated list entries.
	 *
	 * 'rulecnt' is decremented for every rule placed, so it reaches
	 * zero exactly when the number of rules found now equals the number
	 * counted above.  Running out of entries (newlink == NULL) means
	 * rules were added in the meantime.
	 */
	rw_wlock(&rctl_lock);
	LIST_FOREACH(link, &p->p_container->c_rule_links, rrl_next) {
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS) {
			if (newlink == NULL)
				goto goaround;
			rctl_rule_acquire(link->rrl_rule);
			newlink->rrl_rule = link->rrl_rule;
			newlink = LIST_NEXT(newlink, rrl_next);
			rulecnt--;
		}
	}

	LIST_FOREACH(link, &newuip->ui_container->c_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newlc->lc_container->c_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newpr->pr_container->c_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	/* Every preallocated entry got a rule: the count was still accurate. */
	if (rulecnt == 0) {
		/*
		 * Free the old rule list.
		 */
		while (!LIST_EMPTY(&p->p_container->c_rule_links)) {
			link = LIST_FIRST(&p->p_container->c_rule_links);
			LIST_REMOVE(link, rrl_next);
			rctl_rule_release(link->rrl_rule);
			uma_zfree(rctl_rule_link_zone, link);
		}

		/*
		 * Replace lists and we're done.
		 *
		 * XXX: Is there any way to switch list heads instead
		 *      of iterating here?
		 */
		while (!LIST_EMPTY(&newrules)) {
			newlink = LIST_FIRST(&newrules);
			LIST_REMOVE(newlink, rrl_next);
			LIST_INSERT_HEAD(&p->p_container->c_rule_links,
			    newlink, rrl_next);
		}

		rw_wunlock(&rctl_lock);

		return;
	}

goaround:
	rw_wunlock(&rctl_lock);

	/*
	 * Rule list changed while we were not holding the rctl_lock.
	 * Free the new list and try again.
	 */
	while (!LIST_EMPTY(&newrules)) {
		newlink = LIST_FIRST(&newrules);
		LIST_REMOVE(newlink, rrl_next);
		/* Drop the reference taken above for entries already filled. */
		if (newlink->rrl_rule != NULL)
			rctl_rule_release(newlink->rrl_rule);
		uma_zfree(rctl_rule_link_zone, newlink);
	}

	goto again;
}
+ */ + LIST_FOREACH(link, &parent->p_container->c_rule_links, rrl_next) { + if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { + rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT); + if (rule == NULL) + goto fail; + KASSERT(rule->rr_subject.rs_proc == parent, + ("rule->rr_subject.rs_proc != parent")); + rule->rr_subject.rs_proc = child; + error = rctl_container_add_rule_locked(child->p_container, rule); + rctl_rule_release(rule); + if (error != 0) + goto fail; + } else { + error = rctl_container_add_rule_locked(child->p_container, link->rrl_rule); + if (error != 0) + goto fail; + } + } + + rw_wunlock(&rctl_lock); + return (0); + +fail: + while (!LIST_EMPTY(&child->p_container->c_rule_links)) { + link = LIST_FIRST(&child->p_container->c_rule_links); + LIST_REMOVE(link, rrl_next); + rctl_rule_release(link->rrl_rule); + uma_zfree(rctl_rule_link_zone, link); + } + rw_wunlock(&rctl_lock); + return (EAGAIN); +} + +/* + * Go through the process' limits, freeing them. + */ +void +rctl_proc_exit(struct proc *p) +{ + struct rctl_rule_link *link; + + rw_wlock(&rctl_lock); + while (!LIST_EMPTY(&p->p_container->c_rule_links)) { + link = LIST_FIRST(&p->p_container->c_rule_links); + LIST_REMOVE(link, rrl_next); + rctl_rule_release(link->rrl_rule); + uma_zfree(rctl_rule_link_zone, link); + } + rw_wunlock(&rctl_lock); +} + +static void +rctl_init(void) +{ + + rctl_rule_link_zone = uma_zcreate("rctl_rule_link", sizeof(struct rctl_rule_link), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); +} + +#else /* !RCTL */ + +int +rctl_get_usage(struct thread *td, struct rctl_get_usage_args *uap) +{ + + return (EOPNOTSUPP); +} + +int +rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) +{ + + return (EOPNOTSUPP); +} + +int +rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) +{ + + return (EOPNOTSUPP); +} 
+ +int +rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) +{ + + return (EOPNOTSUPP); +} + +int +rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) +{ + + return (EOPNOTSUPP); +} + +#endif /* !RCTL */ diff -urNp current/sys/kern/kern_resource.c hrl/sys/kern/kern_resource.c --- current/sys/kern/kern_resource.c 2011-01-31 20:41:15.002939617 +0100 +++ hrl/sys/kern/kern_resource.c 2011-01-31 19:57:41.332910605 +0100 @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_re #include #include #include +#include #include #include #include @@ -56,6 +57,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_re #include #include #include +#include #include #include @@ -1203,6 +1205,7 @@ uifind(uid) if (uip == NULL) { rw_runlock(&uihashtbl_lock); uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO); + container_create(&uip->ui_container); rw_wlock(&uihashtbl_lock); /* * There's a chance someone created our uidinfo while we @@ -1211,6 +1214,7 @@ uifind(uid) */ if ((old_uip = uilookup(uid)) != NULL) { /* Someone else beat us to it. */ + container_destroy(&uip->ui_container); free(uip, M_UIDINFO); uip = old_uip; } else { @@ -1266,6 +1270,7 @@ uifree(uip) /* Prepare for suboptimal case. */ rw_wlock(&uihashtbl_lock); if (refcount_release(&uip->ui_ref)) { + container_destroy(&uip->ui_container); LIST_REMOVE(uip, ui_hash); rw_wunlock(&uihashtbl_lock); if (uip->ui_sbsize != 0) @@ -1288,6 +1293,22 @@ uifree(uip) rw_wunlock(&uihashtbl_lock); } +void +ui_container_foreach(void (*callback)(struct container *container, + void *arg2, void *arg3), void *arg2, void *arg3) +{ + struct uidinfo *uip; + struct uihashhead *uih; + + rw_rlock(&uihashtbl_lock); + for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) { + LIST_FOREACH(uip, uih, ui_hash) { + (callback)(uip->ui_container, arg2, arg3); + } + } + rw_runlock(&uihashtbl_lock); +} + /* * Change the count associated with number of processes * a given user is using. 
When 'max' is 0, don't enforce a limit diff -urNp current/sys/kern/kern_sig.c hrl/sys/kern/kern_sig.c --- current/sys/kern/kern_sig.c 2011-01-31 20:41:15.182959983 +0100 +++ hrl/sys/kern/kern_sig.c 2011-01-31 19:57:41.602923694 +0100 @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_si #include #include #include +#include #include #include #include @@ -3172,14 +3173,15 @@ coredump(struct thread *td) * if it is larger than the limit. */ limit = (off_t)lim_cur(p, RLIMIT_CORE); - PROC_UNLOCK(p); - if (limit == 0) { + if (limit == 0 && rusage_get_available(p, RUSAGE_CORE) == 0) { + PROC_UNLOCK(p); #ifdef AUDIT audit_proc_coredump(td, name, EFBIG); #endif free(name, M_TEMP); return (EFBIG); } + PROC_UNLOCK(p); restart: NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td); diff -urNp current/sys/kern/kern_thr.c hrl/sys/kern/kern_thr.c --- current/sys/kern/kern_thr.c 2011-01-31 20:41:15.302944506 +0100 +++ hrl/sys/kern/kern_thr.c 2011-01-31 19:57:41.862972108 +0100 @@ -29,6 +29,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_th #include "opt_compat.h" #include "opt_posix.h" +#include #include #include #include @@ -176,10 +177,18 @@ create_thread(struct thread *td, mcontex } } + PROC_LOCK(td->td_proc); + error = rusage_add(p, RUSAGE_NTHR, 1); + PROC_UNLOCK(td->td_proc); + if (error != 0) + return (EPROCLIM); + /* Initialize our td */ newtd = thread_alloc(0); - if (newtd == NULL) - return (ENOMEM); + if (newtd == NULL) { + error = ENOMEM; + goto fail; + } /* * Try the copyout as soon as we allocate the td so we don't @@ -195,7 +204,8 @@ create_thread(struct thread *td, mcontex (parent_tid != NULL && suword_lwpid(parent_tid, newtd->td_tid))) { thread_free(newtd); - return (EFAULT); + error = EFAULT; + goto fail; } bzero(&newtd->td_startzero, @@ -212,7 +222,7 @@ create_thread(struct thread *td, mcontex if (error != 0) { thread_free(newtd); crfree(td->td_ucred); - return (error); + goto fail; } } else { /* Set up our machine context. 
*/ @@ -225,7 +235,7 @@ create_thread(struct thread *td, mcontex if (error != 0) { thread_free(newtd); crfree(td->td_ucred); - return (error); + goto fail; } } @@ -257,6 +267,12 @@ create_thread(struct thread *td, mcontex thread_unlock(newtd); return (0); + +fail: + PROC_LOCK(p); + rusage_sub(p, RUSAGE_NTHR, 1); + PROC_UNLOCK(p); + return (error); } int @@ -286,7 +302,10 @@ thr_exit(struct thread *td, struct thr_e } rw_wlock(&tidhash_lock); + PROC_LOCK(p); + rusage_sub(p, RUSAGE_NTHR, 1); + /* * Shutting down last thread in the proc. This will actually * call exit() in the trampoline when it returns. diff -urNp current/sys/kern/syscalls.c hrl/sys/kern/syscalls.c --- current/sys/kern/syscalls.c 2011-01-31 20:41:16.803493878 +0100 +++ hrl/sys/kern/syscalls.c 2011-01-31 19:57:44.632967456 +0100 @@ -2,8 +2,8 @@ * System call names. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/kern/syscalls.c,v 1.240 2010/08/30 14:26:02 kib Exp $ - * created from FreeBSD: head/sys/kern/syscalls.master 211998 2010-08-30 14:24:44Z kib + * $FreeBSD$ + * created from FreeBSD: src/sys/kern/syscalls.master,v 1.265 2010/08/30 14:24:44 kib Exp */ const char *syscallnames[] = { @@ -530,4 +530,11 @@ const char *syscallnames[] = { "#520", /* 520 = pdgetpid */ "#521", /* 521 = pdwait */ "pselect", /* 522 = pselect */ + "getloginclass", /* 523 = getloginclass */ + "setloginclass", /* 524 = setloginclass */ + "rctl_get_usage", /* 525 = rctl_get_usage */ + "rctl_get_rules", /* 526 = rctl_get_rules */ + "rctl_get_limits", /* 527 = rctl_get_limits */ + "rctl_add_rule", /* 528 = rctl_add_rule */ + "rctl_remove_rule", /* 529 = rctl_remove_rule */ }; diff -urNp current/sys/kern/syscalls.master hrl/sys/kern/syscalls.master --- current/sys/kern/syscalls.master 2011-01-31 20:41:16.812947035 +0100 +++ hrl/sys/kern/syscalls.master 2011-01-31 19:57:44.662903943 +0100 @@ -926,5 +926,13 @@ fd_set *ou, fd_set *ex, \ const struct timespec *ts, \ const sigset_t *sm); } +523 AUE_NULL 
STD { int getloginclass(char *namebuf, size_t \ + namelen); } +524 AUE_NULL STD { int setloginclass(const char *namebuf); } +525 AUE_NULL STD { int rctl_get_usage(const void *inbufp, size_t inbuflen, void *outbufp, size_t outbuflen); } +526 AUE_NULL STD { int rctl_get_rules(const void *inbufp, size_t inbuflen, void *outbufp, size_t outbuflen); } +527 AUE_NULL STD { int rctl_get_limits(const void *inbufp, size_t inbuflen, void *outbufp, size_t outbuflen); } +528 AUE_NULL STD { int rctl_add_rule(const void *inbufp, size_t inbuflen, void *outbufp, size_t outbuflen); } +529 AUE_NULL STD { int rctl_remove_rule(const void *inbufp, size_t inbuflen, void *outbufp, size_t outbuflen); } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master diff -urNp current/sys/kern/systrace_args.c hrl/sys/kern/systrace_args.c --- current/sys/kern/systrace_args.c 2011-01-31 20:41:17.012935022 +0100 +++ hrl/sys/kern/systrace_args.c 2011-01-31 19:57:44.922875810 +0100 @@ -2,7 +2,7 @@ * System call argument to DTrace register array converstion. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/kern/systrace_args.c,v 1.36 2010/08/30 14:26:02 kib Exp $ + * $FreeBSD$ * This file is part of the DTrace syscall provider. 
*/ @@ -3108,6 +3108,71 @@ systrace_args(int sysnum, void *params, *n_args = 6; break; } + /* getloginclass */ + case 523: { + struct getloginclass_args *p = params; + uarg[0] = (intptr_t) p->namebuf; /* char * */ + uarg[1] = p->namelen; /* size_t */ + *n_args = 2; + break; + } + /* setloginclass */ + case 524: { + struct setloginclass_args *p = params; + uarg[0] = (intptr_t) p->namebuf; /* const char * */ + *n_args = 1; + break; + } + /* rctl_get_usage */ + case 525: { + struct rctl_get_usage_args *p = params; + uarg[0] = (intptr_t) p->inbufp; /* const void * */ + uarg[1] = p->inbuflen; /* size_t */ + uarg[2] = (intptr_t) p->outbufp; /* void * */ + uarg[3] = p->outbuflen; /* size_t */ + *n_args = 4; + break; + } + /* rctl_get_rules */ + case 526: { + struct rctl_get_rules_args *p = params; + uarg[0] = (intptr_t) p->inbufp; /* const void * */ + uarg[1] = p->inbuflen; /* size_t */ + uarg[2] = (intptr_t) p->outbufp; /* void * */ + uarg[3] = p->outbuflen; /* size_t */ + *n_args = 4; + break; + } + /* rctl_get_limits */ + case 527: { + struct rctl_get_limits_args *p = params; + uarg[0] = (intptr_t) p->inbufp; /* const void * */ + uarg[1] = p->inbuflen; /* size_t */ + uarg[2] = (intptr_t) p->outbufp; /* void * */ + uarg[3] = p->outbuflen; /* size_t */ + *n_args = 4; + break; + } + /* rctl_add_rule */ + case 528: { + struct rctl_add_rule_args *p = params; + uarg[0] = (intptr_t) p->inbufp; /* const void * */ + uarg[1] = p->inbuflen; /* size_t */ + uarg[2] = (intptr_t) p->outbufp; /* void * */ + uarg[3] = p->outbuflen; /* size_t */ + *n_args = 4; + break; + } + /* rctl_remove_rule */ + case 529: { + struct rctl_remove_rule_args *p = params; + uarg[0] = (intptr_t) p->inbufp; /* const void * */ + uarg[1] = p->inbuflen; /* size_t */ + uarg[2] = (intptr_t) p->outbufp; /* void * */ + uarg[3] = p->outbuflen; /* size_t */ + *n_args = 4; + break; + } default: *n_args = 0; break; @@ -8265,6 +8330,124 @@ systrace_setargdesc(int sysnum, int ndx, break; }; break; + /* getloginclass */ 
+ case 523: + switch(ndx) { + case 0: + p = "char *"; + break; + case 1: + p = "size_t"; + break; + default: + break; + }; + break; + /* setloginclass */ + case 524: + switch(ndx) { + case 0: + p = "const char *"; + break; + default: + break; + }; + break; + /* rctl_get_usage */ + case 525: + switch(ndx) { + case 0: + p = "const void *"; + break; + case 1: + p = "size_t"; + break; + case 2: + p = "void *"; + break; + case 3: + p = "size_t"; + break; + default: + break; + }; + break; + /* rctl_get_rules */ + case 526: + switch(ndx) { + case 0: + p = "const void *"; + break; + case 1: + p = "size_t"; + break; + case 2: + p = "void *"; + break; + case 3: + p = "size_t"; + break; + default: + break; + }; + break; + /* rctl_get_limits */ + case 527: + switch(ndx) { + case 0: + p = "const void *"; + break; + case 1: + p = "size_t"; + break; + case 2: + p = "void *"; + break; + case 3: + p = "size_t"; + break; + default: + break; + }; + break; + /* rctl_add_rule */ + case 528: + switch(ndx) { + case 0: + p = "const void *"; + break; + case 1: + p = "size_t"; + break; + case 2: + p = "void *"; + break; + case 3: + p = "size_t"; + break; + default: + break; + }; + break; + /* rctl_remove_rule */ + case 529: + switch(ndx) { + case 0: + p = "const void *"; + break; + case 1: + p = "size_t"; + break; + case 2: + p = "void *"; + break; + case 3: + p = "size_t"; + break; + default: + break; + }; + break; default: break; }; diff -urNp current/sys/kern/sysv_msg.c hrl/sys/kern/sysv_msg.c --- current/sys/kern/sysv_msg.c 2011-01-31 20:41:17.070132794 +0100 +++ hrl/sys/kern/sysv_msg.c 2011-01-31 19:57:44.942983115 +0100 @@ -56,6 +56,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sysv_ms #include #include #include +#include #include #include #include @@ -464,6 +465,12 @@ kern_msgctl(td, msqid, cmd, msqbuf) } #endif + rusage_sub_cred(msqkptr->cred, RUSAGE_NMSGQ, 1); + rusage_sub_cred(msqkptr->cred, RUSAGE_MSGQQUEUED, msqkptr->u.msg_qnum); + rusage_sub_cred(msqkptr->cred, RUSAGE_MSGQSIZE, 
msqkptr->u.msg_cbytes); + crfree(msqkptr->cred); + msqkptr->cred = NULL; + /* Free the message headers */ msghdr = msqkptr->u.msg_first; while (msghdr != NULL) { @@ -611,6 +618,13 @@ msgget(td, uap) error = ENOSPC; goto done2; } + PROC_LOCK(td->td_proc); + error = rusage_add(td->td_proc, RUSAGE_NMSGQ, 1); + PROC_UNLOCK(td->td_proc); + if (error != 0) { + error = ENOSPC; + goto done2; + } DPRINTF(("msqid %d is available\n", msqid)); msqkptr->u.msg_perm.key = key; msqkptr->u.msg_perm.cuid = cred->cr_uid; @@ -618,6 +632,8 @@ msgget(td, uap) msqkptr->u.msg_perm.cgid = cred->cr_gid; msqkptr->u.msg_perm.gid = cred->cr_gid; msqkptr->u.msg_perm.mode = (msgflg & 0777); + crhold(cred); + msqkptr->cred = cred; /* Make sure that the returned msqid is unique */ msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff; msqkptr->u.msg_first = NULL; @@ -668,6 +684,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgf register struct msqid_kernel *msqkptr; register struct msg *msghdr; short next; + size_t saved_msgsz; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); @@ -705,6 +722,21 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgf goto done2; #endif + PROC_LOCK(td->td_proc); + if (rusage_add(td->td_proc, RUSAGE_MSGQQUEUED, 1)) { + PROC_UNLOCK(td->td_proc); + error = EAGAIN; + goto done2; + } + saved_msgsz = msgsz; + if (rusage_add(td->td_proc, RUSAGE_MSGQSIZE, msgsz)) { + rusage_sub(td->td_proc, RUSAGE_MSGQQUEUED, 1); + PROC_UNLOCK(td->td_proc); + error = EAGAIN; + goto done2; + } + PROC_UNLOCK(td->td_proc); + segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, segs_needed)); @@ -719,7 +751,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgf if (msgsz > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n")); error = EINVAL; - goto done2; + goto done3; } if (msqkptr->u.msg_perm.mode & MSG_LOCKED) { @@ -746,7 +778,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgf DPRINTF(("need more 
resources but caller " "doesn't want to wait\n")); error = EAGAIN; - goto done2; + goto done3; } if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) { @@ -772,7 +804,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgf if (error != 0) { DPRINTF(("msgsnd: interrupted system call\n")); error = EINTR; - goto done2; + goto done3; } /* @@ -782,7 +814,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgf if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("msqid deleted\n")); error = EIDRM; - goto done2; + goto done3; } } else { @@ -864,7 +896,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgf wakeup(msqkptr); DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type)); error = EINVAL; - goto done2; + goto done3; } /* @@ -891,7 +923,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgf msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); - goto done2; + goto done3; } mtx_lock(&msq_mtx); msgsz -= tlen; @@ -915,7 +947,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgf msg_freehdr(msghdr); wakeup(msqkptr); error = EIDRM; - goto done2; + goto done3; } #ifdef MAC @@ -934,7 +966,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgf if (error != 0) { msg_freehdr(msghdr); wakeup(msqkptr); - goto done2; + goto done3; } #endif @@ -957,6 +989,13 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgf wakeup(msqkptr); td->td_retval[0] = 0; +done3: + if (error != 0) { + PROC_LOCK(td->td_proc); + rusage_sub(td->td_proc, RUSAGE_MSGQQUEUED, 1); + rusage_sub(td->td_proc, RUSAGE_MSGQSIZE, saved_msgsz); + PROC_UNLOCK(td->td_proc); + } done2: mtx_unlock(&msq_mtx); return (error); @@ -1190,6 +1229,9 @@ kern_msgrcv(td, msqid, msgp, msgsz, msgt msqkptr->u.msg_lrpid = td->td_proc->p_pid; msqkptr->u.msg_rtime = time_second; + rusage_sub_cred(msqkptr->cred, RUSAGE_MSGQQUEUED, 1); + rusage_sub_cred(msqkptr->cred, RUSAGE_MSGQSIZE, msghdr->msg_ts); + /* * Make msgsz the actual amount that we'll be returning. 
* Note that this effectively truncates the message if it is too long diff -urNp current/sys/kern/sysv_sem.c hrl/sys/kern/sysv_sem.c --- current/sys/kern/sysv_sem.c 2011-01-31 20:41:17.082898942 +0100 +++ hrl/sys/kern/sysv_sem.c 2011-01-31 19:57:45.143118886 +0100 @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sysv_se #include #include #include +#include #include #include #include @@ -654,6 +655,9 @@ kern_semctl(struct thread *td, int semid semakptr->u.sem_perm.cuid = cred->cr_uid; semakptr->u.sem_perm.uid = cred->cr_uid; semakptr->u.sem_perm.mode = 0; + rusage_sub_cred(semakptr->cred, RUSAGE_NSEM, semakptr->u.sem_nsems); + crfree(semakptr->cred); + semakptr->cred = NULL; SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); @@ -925,6 +929,13 @@ semget(struct thread *td, struct semget_ error = ENOSPC; goto done2; } + PROC_LOCK(td->td_proc); + error = rusage_add(td->td_proc, RUSAGE_NSEM, nsems); + PROC_UNLOCK(td->td_proc); + if (error != 0) { + error = ENOSPC; + goto done2; + } DPRINTF(("semid %d is available\n", semid)); mtx_lock(&sema_mtx[semid]); KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0, @@ -935,6 +946,8 @@ semget(struct thread *td, struct semget_ sema[semid].u.sem_perm.cgid = cred->cr_gid; sema[semid].u.sem_perm.gid = cred->cr_gid; sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC; + crhold(cred); + sema[semid].cred = cred; sema[semid].u.sem_perm.seq = (sema[semid].u.sem_perm.seq + 1) & 0x7fff; sema[semid].u.sem_nsems = nsems; @@ -1004,12 +1017,19 @@ semop(struct thread *td, struct semop_ar /* Allocate memory for sem_ops */ if (nsops <= SMALL_SOPS) sops = small_sops; - else if (nsops <= seminfo.semopm) - sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); - else { + else if (nsops > seminfo.semopm) { DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm, nsops)); return (E2BIG); + } else { + PROC_LOCK(td->td_proc); + if (nsops > rusage_get_available(td->td_proc, RUSAGE_NSEMOP)) { + PROC_UNLOCK(td->td_proc); + return 
(E2BIG); + } + PROC_UNLOCK(td->td_proc); + + sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); } if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) { DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error, diff -urNp current/sys/kern/sysv_shm.c hrl/sys/kern/sysv_shm.c --- current/sys/kern/sysv_shm.c 2011-01-31 20:41:17.093111413 +0100 +++ hrl/sys/kern/sysv_shm.c 2011-01-31 19:57:45.162952692 +0100 @@ -67,6 +67,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sysv_sh #include #include +#include #include #include #include @@ -244,6 +245,10 @@ shm_deallocate_segment(shmseg) #ifdef MAC mac_sysvshm_cleanup(shmseg); #endif + rusage_sub_cred(shmseg->cred, RUSAGE_NSHM, 1); + rusage_sub_cred(shmseg->cred, RUSAGE_SHMSIZE, size); + crfree(shmseg->cred); + shmseg->cred = NULL; } static int @@ -665,6 +670,17 @@ shmget_allocate_segment(td, uap, mode) shm_last_free = -1; } shmseg = &shmsegs[segnum]; + PROC_LOCK(td->td_proc); + if (rusage_add(td->td_proc, RUSAGE_NSHM, 1)) { + PROC_UNLOCK(td->td_proc); + return (ENOSPC); + } + if (rusage_add(td->td_proc, RUSAGE_SHMSIZE, size)) { + rusage_sub(td->td_proc, RUSAGE_NSHM, 1); + PROC_UNLOCK(td->td_proc); + return (ENOMEM); + } + PROC_UNLOCK(td->td_proc); /* * In case we sleep in malloc(), mark the segment present but deleted * so that noone else tries to create the same key. @@ -680,8 +696,13 @@ shmget_allocate_segment(td, uap, mode) */ shm_object = vm_pager_allocate(shm_use_phys ? 
OBJT_PHYS : OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0, cred); - if (shm_object == NULL) + if (shm_object == NULL) { + PROC_LOCK(td->td_proc); + rusage_sub(td->td_proc, RUSAGE_NSHM, 1); + rusage_sub(td->td_proc, RUSAGE_SHMSIZE, size); + PROC_UNLOCK(td->td_proc); return (ENOMEM); + } VM_OBJECT_LOCK(shm_object); vm_object_clear_flag(shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shm_object, OBJ_NOSPLIT); @@ -692,6 +713,8 @@ shmget_allocate_segment(td, uap, mode) shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid; shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & SHMSEG_WANTED) | (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; + crhold(cred); + shmseg->cred = cred; shmseg->u.shm_segsz = uap->size; shmseg->u.shm_cpid = td->td_proc->p_pid; shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0; diff -urNp current/sys/kern/tty_pts.c hrl/sys/kern/tty_pts.c --- current/sys/kern/tty_pts.c 2011-01-31 20:41:17.253083420 +0100 +++ hrl/sys/kern/tty_pts.c 2011-01-31 19:57:45.402987948 +0100 @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD: src/sys/kern/tty_pts #include #include #include +#include #include #include #include @@ -97,7 +98,7 @@ struct pts_softc { struct cdev *pts_cdev; /* (c) Master device node. */ #endif /* PTS_EXTERNAL */ - struct uidinfo *pts_uidinfo; /* (c) Resource limit. */ + struct ucred *pts_cred; /* (c) Resource limit. 
*/ }; /* @@ -681,8 +682,9 @@ ptsdrv_free(void *softc) if (psc->pts_unit >= 0) free_unr(pts_pool, psc->pts_unit); - chgptscnt(psc->pts_uidinfo, -1, 0); - uifree(psc->pts_uidinfo); + chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0); + rusage_sub_cred(psc->pts_cred, RUSAGE_NPTS, 1); + crfree(psc->pts_cred); knlist_destroy(&psc->pts_inpoll.si_note); knlist_destroy(&psc->pts_outpoll.si_note); @@ -712,23 +714,32 @@ static int pts_alloc(int fflags, struct thread *td, struct file *fp) { - int unit, ok; + int unit, ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; - struct uidinfo *uid = td->td_ucred->cr_ruidinfo; + struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); - ok = chgptscnt(uid, 1, lim_cur(p, RLIMIT_NPTS)); - PROC_UNLOCK(p); - if (!ok) + error = rusage_add(p, RUSAGE_NPTS, 1); + if (error != 0) { + PROC_UNLOCK(p); + return (EAGAIN); + } + ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS)); + if (!ok) { + rusage_sub(p, RUSAGE_NPTS, 1); + PROC_UNLOCK(p); return (EAGAIN); + } + PROC_UNLOCK(p); /* Try to allocate a new pts unit number. */ unit = alloc_unr(pts_pool); if (unit < 0) { - chgptscnt(uid, -1, 0); + rusage_sub(p, RUSAGE_NPTS, 1); + chgptscnt(cred->cr_ruidinfo, -1, 0); return (EAGAIN); } @@ -738,8 +749,7 @@ pts_alloc(int fflags, struct thread *td, cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = unit; - psc->pts_uidinfo = uid; - uihold(uid); + psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); @@ -758,18 +768,26 @@ int pts_alloc_external(int fflags, struct thread *td, struct file *fp, struct cdev *dev, const char *name) { - int ok; + int ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; - struct uidinfo *uid = td->td_ucred->cr_ruidinfo; + struct ucred *cred = td->td_ucred; /* Resource limiting. 
*/ PROC_LOCK(p); - ok = chgptscnt(uid, 1, lim_cur(p, RLIMIT_NPTS)); - PROC_UNLOCK(p); - if (!ok) + error = rusage_add(p, RUSAGE_NPTS, 1); + if (error != 0) { + PROC_UNLOCK(p); + return (EAGAIN); + } + ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS)); + if (!ok) { + rusage_sub(p, RUSAGE_NPTS, 1); + PROC_UNLOCK(p); return (EAGAIN); + } + PROC_UNLOCK(p); /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); @@ -778,8 +796,7 @@ pts_alloc_external(int fflags, struct th psc->pts_unit = -1; psc->pts_cdev = dev; - psc->pts_uidinfo = uid; - uihold(uid); + psc->pts_cred = crhold(cred);; tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); diff -urNp current/sys/kern/uipc_sockbuf.c hrl/sys/kern/uipc_sockbuf.c --- current/sys/kern/uipc_sockbuf.c 2011-01-31 20:41:17.552855598 +0100 +++ hrl/sys/kern/uipc_sockbuf.c 2011-01-31 19:57:45.932985755 +0100 @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD: src/sys/kern/uipc_so #include #include /* for aio_swake proto */ +#include /* for aio_swake proto */ #include #include #include @@ -290,6 +291,7 @@ sbreserve_locked(struct sockbuf *sb, u_l struct thread *td) { rlim_t sbsize_limit; + int error; SOCKBUF_LOCK_ASSERT(sb); @@ -305,12 +307,19 @@ sbreserve_locked(struct sockbuf *sb, u_l if (td != NULL) { PROC_LOCK(td->td_proc); sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE); + error = rusage_add(td->td_proc, RUSAGE_SBSIZE, cc); PROC_UNLOCK(td->td_proc); - } else + if (error != 0) + return (0); + } else { sbsize_limit = RLIM_INFINITY; + rusage_add_cred(so->so_cred, RUSAGE_SBSIZE, cc); + } if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, - sbsize_limit)) + sbsize_limit)) { + rusage_sub_cred(so->so_cred, RUSAGE_SBSIZE, cc); return (0); + } sb->sb_mbmax = min(cc * sb_efficiency, sb_max); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; @@ -337,6 +346,7 @@ sbrelease_internal(struct sockbuf *sb, s { sbflush_internal(sb); + 
rusage_sub_cred(so->so_cred, RUSAGE_SBSIZE, sb->sb_hiwat); (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY); sb->sb_mbmax = 0; diff -urNp current/sys/kern/uipc_socket.c hrl/sys/kern/uipc_socket.c --- current/sys/kern/uipc_socket.c 2011-01-31 20:41:17.604347338 +0100 +++ hrl/sys/kern/uipc_socket.c 2011-01-31 19:57:46.153006176 +0100 @@ -104,6 +104,7 @@ __FBSDID("$FreeBSD: src/sys/kern/uipc_so #include #include +#include #include #include #include @@ -321,12 +322,18 @@ sodealloc(struct socket *so) so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); - if (so->so_rcv.sb_hiwat) + if (so->so_rcv.sb_hiwat) { + rusage_sub_cred(so->so_cred, RUSAGE_SBSIZE, + so->so_rcv.sb_hiwat); (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); - if (so->so_snd.sb_hiwat) + } + if (so->so_snd.sb_hiwat) { + rusage_sub_cred(so->so_cred, RUSAGE_SBSIZE, + so->so_snd.sb_hiwat); (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); + } #ifdef INET /* remove acccept filter if one is present. 
*/ if (so->so_accf != NULL) diff -urNp current/sys/kern/uipc_usrreq.c hrl/sys/kern/uipc_usrreq.c --- current/sys/kern/uipc_usrreq.c 2011-01-31 20:41:17.743104638 +0100 +++ hrl/sys/kern/uipc_usrreq.c 2011-01-31 19:57:46.312954158 +0100 @@ -62,6 +62,7 @@ __FBSDID("$FreeBSD: src/sys/kern/uipc_us #include "opt_ddb.h" #include +#include #include #include #include /* XXX must be before */ @@ -800,6 +801,7 @@ uipc_rcvd(struct socket *so, int flags) SOCKBUF_LOCK(&so2->so_snd); so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt; newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc; + rusage_add_cred(so2->so_cred, RUSAGE_SBSIZE, newhiwat - so2->so_snd.sb_hiwat); (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); sowwakeup_locked(so2); @@ -975,6 +977,7 @@ uipc_send(struct socket *so, int flags, SOCKBUF_LOCK(&so->so_snd); newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc); + rusage_add_cred(so->so_cred, RUSAGE_SBSIZE, newhiwat - so->so_snd.sb_hiwat); (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); so->so_snd.sb_mbmax -= mbcnt_delta; diff -urNp current/sys/kern/vfs_vnops.c hrl/sys/kern/vfs_vnops.c --- current/sys/kern/vfs_vnops.c 2011-01-31 20:41:18.573000896 +0100 +++ hrl/sys/kern/vfs_vnops.c 2011-01-31 19:57:47.662990825 +0100 @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD: src/sys/kern/vfs_vno #include #include +#include #include #include #include @@ -1344,16 +1345,21 @@ int vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, const struct thread *td) { - + uoff_t fsize; + if (vp->v_type != VREG || td == NULL) return (0); + + fsize = (uoff_t)uio->uio_offset + uio->uio_resid; + PROC_LOCK(td->td_proc); - if ((uoff_t)uio->uio_offset + uio->uio_resid > - lim_cur(td->td_proc, RLIMIT_FSIZE)) { + if (fsize > lim_cur(td->td_proc, RLIMIT_FSIZE) || + rusage_set(td->td_proc, RUSAGE_FSIZE, fsize) != 0) { psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } PROC_UNLOCK(td->td_proc); + 
return (0); } diff -urNp current/sys/sys/container.h hrl/sys/sys/container.h --- current/sys/sys/container.h 1970-01-01 01:00:00.000000000 +0100 +++ hrl/sys/sys/container.h 2011-01-31 19:58:23.903002614 +0100 @@ -0,0 +1,164 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Edward Tomasz Napierala under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _CONTAINER_H_ +#define _CONTAINER_H_ + +#include +#include +#include + +struct proc; +struct rctl_rule_link; +struct ucred; + +/* + * Resource containers. 
+ */ + +#define RUSAGE_UNDEFINED -1 +#define RUSAGE_CPU 0 +#define RUSAGE_FSIZE 1 +#define RUSAGE_DATA 2 +#define RUSAGE_STACK 3 +#define RUSAGE_CORE 4 +#define RUSAGE_RSS 5 +#define RUSAGE_MEMLOCK 6 +#define RUSAGE_NPROC 7 +#define RUSAGE_NOFILE 8 +#define RUSAGE_SBSIZE 9 +#define RUSAGE_VMEM 10 +#define RUSAGE_NPTS 11 +#define RUSAGE_SWAP 12 +#define RUSAGE_NTHR 13 +#define RUSAGE_MSGQQUEUED 14 +#define RUSAGE_MSGQSIZE 15 +#define RUSAGE_NMSGQ 16 +#define RUSAGE_NSEM 17 +#define RUSAGE_NSEMOP 18 +#define RUSAGE_NSHM 19 +#define RUSAGE_SHMSIZE 20 +#define RUSAGE_WALLCLOCK 21 +#define RUSAGE_PCTCPU 22 +#define RUSAGE_MAX RUSAGE_PCTCPU + +/* + * Resource types. + */ +#define RUSAGE_IN_THOUSANDS 0x01 +#define RUSAGE_RECLAIMABLE 0x02 +#define RUSAGE_INHERITABLE 0x04 +#define RUSAGE_DENIABLE 0x08 +#define RUSAGE_SLOPPY 0x10 +#define RUSAGE_DAMPENED 0x20 + +extern int rusage_types[]; + +/* + * Amount stored in c_resources[] is thousand times bigger than what's + * visible to the userland. It gets fixed up when retrieving resource + * usage or adding rules. + */ +#define rusage_is_in_thousands(X) (rusage_types[X] & RUSAGE_IN_THOUSANDS) + +/* + * Resource usage can drop, as opposed to only grow. + */ +#define rusage_is_reclaimable(X) (rusage_types[X] & RUSAGE_RECLAIMABLE) + +/* + * Children inherit resource usage. + */ +#define rusage_is_inheritable(X) (rusage_types[X] & RUSAGE_INHERITABLE) + +/* + * rusage_{add,set}(9) can actually return an error and not update resource + * usage counters. Note that even when resource is not deniable, allocating + * resource might cause signals to be sent by RCTL code. + */ +#define rusage_is_deniable(X) (rusage_types[X] & RUSAGE_DENIABLE) + +/* + * Per-process resource usage information makes no sense, but per-credential + * one does. This kind of resources are usually allocated for process, but + * freed using credentials. + */ +#define rusage_is_sloppy(X) (rusage_types[X] & RUSAGE_SLOPPY) + +/* + * XXX: Explain somehow. 
+ */ +#define rusage_is_dampened(X) (rusage_types[X] & RUSAGE_DAMPENED) + +/* + * 'container' defines resource consumption for a particular + * subject, such as process or jail. Containers form a graph - each + * container has zero or more subcontainers and zero or more + * "containing" containers (parents). For example, container for + * an uidinfo can have several subcontainers for processes of that + * user. On the other hand, each process can have several containing + * containers - one for jail the process is in, one for the user, + * one for every group this process belongs to (note that per-group + * limits are not implemented yet). + * + * Every process has exactly one container assigned to it. Containers + * for other objects are initialized when there is a rule which requires + * it. For example, uidinfo will have container assigned only if there + * is a rule this uidinfo is subject to, and 'rr_per' for this rule + * is RCTL_SUBJECT_TYPE_USER. + * + * This structure must be filled with zeroes initially. 
+ */ +struct container { + int64_t c_resources[RUSAGE_MAX + 1]; + LIST_HEAD(, rctl_rule_link) c_rule_links; +}; + +int rusage_add(struct proc *p, int resource, uint64_t amount); +void rusage_add_cred(struct ucred *cred, int resource, uint64_t amount); +void rusage_add_force(struct proc *p, int resource, uint64_t amount); +int rusage_set(struct proc *p, int resource, uint64_t amount); +void rusage_set_force(struct proc *p, int resource, uint64_t amount); +void rusage_sub(struct proc *p, int resource, uint64_t amount); +void rusage_sub_cred(struct ucred *cred, int resource, uint64_t amount); +uint64_t rusage_get_limit(struct proc *p, int resource); +uint64_t rusage_get_available(struct proc *p, int resource); + +void container_create(struct container **containerp); +void container_destroy(struct container **containerp); + +int container_proc_fork(struct proc *parent, struct proc *child); +void container_proc_exit(struct proc *p); + +void container_proc_ucred_changed(struct proc *p, struct ucred *oldcred, + struct ucred *newcred); + +#endif /* !_CONTAINER_H_ */ diff -urNp current/sys/sys/jail.h hrl/sys/sys/jail.h --- current/sys/sys/jail.h 2011-01-31 20:41:51.203153052 +0100 +++ hrl/sys/sys/jail.h 2011-01-31 19:58:24.393022715 +0100 @@ -135,6 +135,8 @@ MALLOC_DECLARE(M_PRISON); #define HOSTUUIDLEN 64 +struct container; + /* * This structure describes a prison. It is pointed to by all struct * ucreds's of the inmates. 
pr_ref keeps track of them and is used to @@ -179,6 +181,7 @@ struct prison { char pr_hostname[MAXHOSTNAMELEN]; /* (p) jail hostname */ char pr_domainname[MAXHOSTNAMELEN]; /* (p) jail domainname */ char pr_hostuuid[HOSTUUIDLEN]; /* (p) jail hostuuid */ + struct container *pr_container; /* (c) resource accounting */ }; #endif /* _KERNEL || _WANT_PRISON */ @@ -338,6 +341,7 @@ struct ucred; struct mount; struct sockaddr; struct statfs; +struct container; int jailed(struct ucred *cred); int jailed_without_vnet(struct ucred *); void getcredhostname(struct ucred *, char *, size_t); @@ -380,6 +384,8 @@ int prison_if(struct ucred *cred, struct char *prison_name(struct prison *, struct prison *); int prison_priv_check(struct ucred *cred, int priv); int sysctl_jail_param(struct sysctl_oid *, void *, int , struct sysctl_req *); +void prison_container_foreach(void (*callback)(struct container *container, + void *arg2, void *arg3), void *arg2, void *arg3); #endif /* _KERNEL */ #endif /* !_SYS_JAIL_H_ */ diff -urNp current/sys/sys/kernel.h hrl/sys/sys/kernel.h --- current/sys/sys/kernel.h 2011-01-31 20:41:51.212983072 +0100 +++ hrl/sys/sys/kernel.h 2011-01-31 19:58:24.412863225 +0100 @@ -109,6 +109,7 @@ enum sysinit_sub_id { SI_SUB_VNET_PRELINK = 0x1E00000, /* vnet init before modules */ SI_SUB_KLD = 0x2000000, /* KLD and module setup */ SI_SUB_CPU = 0x2100000, /* CPU resource(s)*/ + SI_SUB_CONTAINER = 0x2110000, /* resource accounting */ SI_SUB_RANDOM = 0x2120000, /* random number generator */ SI_SUB_KDTRACE = 0x2140000, /* Kernel dtrace hooks */ SI_SUB_MAC = 0x2180000, /* TrustedBSD MAC subsystem */ @@ -169,6 +170,7 @@ enum sysinit_sub_id { SI_SUB_KTHREAD_UPDATE = 0xec00000, /* update daemon*/ SI_SUB_KTHREAD_IDLE = 0xee00000, /* idle procs*/ SI_SUB_SMP = 0xf000000, /* start the APs*/ + SI_SUB_CONTAINERD = 0xf100000, /* start containerd*/ SI_SUB_RUN_SCHEDULER = 0xfffffff /* scheduler*/ }; diff -urNp current/sys/sys/loginclass.h hrl/sys/sys/loginclass.h --- 
current/sys/sys/loginclass.h 1970-01-01 01:00:00.000000000 +0100 +++ hrl/sys/sys/loginclass.h 2011-01-31 19:58:24.512891301 +0100 @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2009 Edward Tomasz Napierała + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _SYS_LOGINCLASS_H_ +#define _SYS_LOGINCLASS_H_ + +struct container; + +/* + * Exactly one of these structures exists per login class. 
+ */ +struct loginclass { + LIST_ENTRY(loginclass) lc_next; + char lc_name[MAXLOGNAME]; + u_int lc_refcount; + struct container *lc_container; +}; + +void loginclass_acquire(struct loginclass *lc); +void loginclass_release(struct loginclass *lc); +struct loginclass *loginclass_find(const char *name); +void loginclass_container_foreach(void (*callback)(struct container + *container, void *arg2, void *arg3), void *arg2, void *arg3); + +#endif /* !_SYS_LOGINCLASS_H_ */ + diff -urNp current/sys/sys/msg.h hrl/sys/sys/msg.h --- current/sys/sys/msg.h 2011-01-31 20:41:51.683021862 +0100 +++ hrl/sys/sys/msg.h 2011-01-31 19:58:24.692985699 +0100 @@ -160,6 +160,7 @@ struct msqid_kernel { * Kernel-private components of the message queue. */ struct label *label; /* MAC label */ + struct ucred *cred; /* creator's credentials */ }; #else /* !_KERNEL */ diff -urNp current/sys/sys/priv.h hrl/sys/sys/priv.h --- current/sys/sys/priv.h 2011-01-31 20:41:51.912963657 +0100 +++ hrl/sys/sys/priv.h 2011-01-31 19:58:24.972967931 +0100 @@ -156,6 +156,7 @@ #define PRIV_PROC_LIMIT 160 /* Exceed user process limit. */ #define PRIV_PROC_SETLOGIN 161 /* Can call setlogin. */ #define PRIV_PROC_SETRLIMIT 162 /* Can raise resources limits. */ +#define PRIV_PROC_SETLOGINCLASS 163 /* Can call setloginclass(2). */ /* System V IPC privileges. */ @@ -483,9 +484,15 @@ #define PRIV_AFS_DAEMON 661 /* Can become the AFS daemon. */ /* + * Resource Limits privileges. + */ +#define PRIV_RCTL_SET 670 +#define PRIV_RCTL_GET 671 + +/* * Track end of privilege list. */ -#define _PRIV_HIGHEST 662 +#define _PRIV_HIGHEST 672 /* * Validate that a named privilege is known by the privilege system. Invalid diff -urNp current/sys/sys/proc.h hrl/sys/sys/proc.h --- current/sys/sys/proc.h 2011-01-31 20:41:51.952926836 +0100 +++ hrl/sys/sys/proc.h 2011-01-31 19:58:24.992970194 +0100 @@ -157,6 +157,7 @@ struct pargs { * either lock is sufficient for read access, but both locks must be held * for write access. 
*/ +struct container; struct kaudit_record; struct td_sched; struct nlminfo; @@ -545,6 +546,7 @@ struct proc { rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ + u_int p_throttle; /* (c) Resource usage accounting. */ /* End area that is copied on creation. */ #define p_endcopy p_xstat @@ -565,6 +567,8 @@ struct proc { struct cv p_pwait; /* (*) wait cv for exit/exec. */ struct cv p_dbgwait; /* (*) wait cv for debugger attach after fork. */ + uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ + struct container *p_container; /* (b) Resource usage accounting. */ }; #define p_session p_pgrp->pg_session diff -urNp current/sys/sys/rctl.h hrl/sys/sys/rctl.h --- current/sys/sys/rctl.h 1970-01-01 01:00:00.000000000 +0100 +++ hrl/sys/sys/rctl.h 2011-01-31 19:58:25.052872779 +0100 @@ -0,0 +1,165 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Edward Tomasz Napierala under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _RCTL_H_ +#define _RCTL_H_ + +#include +#include +#include +#include + +struct proc; +struct uidinfo; +struct loginclass; +struct prison; +struct ucred; +struct rctl_rule_link; + +/* + * Resource Limits. + */ + +#ifdef _KERNEL + +/* + * 'rctl_rule' describes a single limit configured by the system + * administrator or a temporary limit set using setrlimit(2). + * The difference between 'subject' and 'per' is best described + * by example: to specify that every process of user with uid 1984 + * can consume 1gb of virtual memory, the 'rr_subject_type' would be + * RCTL_SUBJECT_TYPE_USER, 'rr_subject.rs_uip' would point to + * 'struct uidinfo' for uid 1984, and 'rr_per' would be equal to + * RCTL_SUBJECT_TYPE_PROCESS. + * + * 'rr_refcount' is equal to the number of rctl_rule_link structures + * pointing to the rule. + * + * This structure must never change after being added, via rctl_rule_link + * structures, to subjects. In order to change a limit, add a new + * rule and remove the previous one. 
+ */ +struct rctl_rule { + int rr_subject_type; + union { + struct proc *rs_proc; + struct uidinfo *rs_uip; + struct loginclass *hr_loginclass; + struct prison *rs_prison; + } rr_subject; + int rr_per; + int rr_resource; + int rr_action; + int64_t rr_amount; + u_int rr_refcount; + struct task rr_task; +}; + +#define RCTL_SUBJECT_TYPE_UNDEFINED -1 +#define RCTL_SUBJECT_TYPE_PROCESS 0x0000 +#define RCTL_SUBJECT_TYPE_USER 0x0001 +#define RCTL_SUBJECT_TYPE_LOGINCLASS 0x0003 +#define RCTL_SUBJECT_TYPE_JAIL 0x0004 +#define RCTL_SUBJECT_TYPE_MAX RCTL_SUBJECT_TYPE_JAIL + +/* + * 'rr_per' takes the same flags as 'rr_subject_type'. + */ + +#define RCTL_ACTION_UNDEFINED -1 +#define RCTL_ACTION_SIGHUP SIGHUP +#define RCTL_ACTION_SIGINT SIGINT +#define RCTL_ACTION_SIGQUIT SIGQUIT +#define RCTL_ACTION_SIGILL SIGILL +#define RCTL_ACTION_SIGTRAP SIGTRAP +#define RCTL_ACTION_SIGABRT SIGABRT +#define RCTL_ACTION_SIGEMT SIGEMT +#define RCTL_ACTION_SIGFPE SIGFPE +#define RCTL_ACTION_SIGKILL SIGKILL +#define RCTL_ACTION_SIGBUS SIGBUS +#define RCTL_ACTION_SIGSEGV SIGSEGV +#define RCTL_ACTION_SIGSYS SIGSYS +#define RCTL_ACTION_SIGPIPE SIGPIPE +#define RCTL_ACTION_SIGALRM SIGALRM +#define RCTL_ACTION_SIGTERM SIGTERM +#define RCTL_ACTION_SIGURG SIGURG +#define RCTL_ACTION_SIGSTOP SIGSTOP +#define RCTL_ACTION_SIGTSTP SIGTSTP +#define RCTL_ACTION_SIGCHLD SIGCHLD +#define RCTL_ACTION_SIGTTIN SIGTTIN +#define RCTL_ACTION_SIGTTOU SIGTTOU +#define RCTL_ACTION_SIGIO SIGIO +#define RCTL_ACTION_SIGXCPU SIGXCPU +#define RCTL_ACTION_SIGXFSZ SIGXFSZ +#define RCTL_ACTION_SIGVTALRM SIGVTALRM +#define RCTL_ACTION_SIGPROF SIGPROF +#define RCTL_ACTION_SIGWINCH SIGWINCH +#define RCTL_ACTION_SIGINFO SIGINFO +#define RCTL_ACTION_SIGUSR1 SIGUSR1 +#define RCTL_ACTION_SIGUSR2 SIGUSR2 +#define RCTL_ACTION_SIGTHR SIGTHR +#define RCTL_ACTION_SIGNAL_MAX RCTL_ACTION_SIGTHR +#define RCTL_ACTION_DENY (RCTL_ACTION_SIGNAL_MAX + 1) +#define RCTL_ACTION_LOG (RCTL_ACTION_SIGNAL_MAX + 2) +#define RCTL_ACTION_MAX 
RCTL_ACTION_LOG + +#define RCTL_AMOUNT_UNDEFINED -1 + +void rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred); +struct rctl_rule *rctl_rule_alloc(int flags); +struct rctl_rule *rctl_rule_duplicate(const struct rctl_rule *rule, int flags); +void rctl_rule_acquire(struct rctl_rule *rule); +void rctl_rule_release(struct rctl_rule *rule); +int rctl_rule_add(struct rctl_rule *rule); +int rctl_rule_remove(struct rctl_rule *filter); + +int rctl_enforce(struct proc *p, int resource, uint64_t amount); +uint64_t rctl_get_limit(struct proc *p, int resource); +uint64_t rctl_get_available(struct proc *p, int resource); +const char *rctl_resource_name(int resource); +int rctl_proc_fork(struct proc *parent, struct proc *child); +void rctl_proc_exit(struct proc *p); +#else /* !_KERNEL */ + +/* + * Syscall interface. + */ +__BEGIN_DECLS +int rctl_get_usage(const char *inbufp, size_t inbuflen, char *outbufp, size_t outbuflen); +int rctl_get_rules(const char *inbufp, size_t inbuflen, char *outbufp, size_t outbuflen); +int rctl_get_limits(const char *inbufp, size_t inbuflen, char *outbufp, size_t outbuflen); +int rctl_add_rule(const char *inbufp, size_t inbuflen, char *outbufp, size_t outbuflen); +int rctl_remove_rule(const char *inbufp, size_t inbuflen, char *outbufp, size_t outbuflen); +__END_DECLS + +#endif /* !_KERNEL */ + +#endif /* !_RCTL_H_ */ diff -urNp current/sys/sys/resourcevar.h hrl/sys/sys/resourcevar.h --- current/sys/sys/resourcevar.h 2011-01-31 20:41:52.013100685 +0100 +++ hrl/sys/sys/resourcevar.h 2011-01-31 19:58:25.153011205 +0100 @@ -79,6 +79,8 @@ struct plimit { int pl_refcnt; /* number of references */ }; +struct container; + /*- * Per uid resource consumption. 
This structure is used to track * the total resource consumption (process count, socket buffer size, @@ -99,6 +101,7 @@ struct uidinfo { long ui_ptscnt; /* (b) number of pseudo-terminals */ uid_t ui_uid; /* (a) uid */ u_int ui_ref; /* (b) reference count */ + struct container *ui_container; /* (a) resource usage accounting */ }; #define UIDINFO_VMSIZE_LOCK(ui) mtx_lock(&((ui)->ui_vmsize_mtx)) @@ -140,6 +143,8 @@ struct uidinfo void uifree(struct uidinfo *uip); void uihashinit(void); void uihold(struct uidinfo *uip); +void ui_container_foreach(void (*callback)(struct container *container, + void *arg2, void *arg3), void *arg2, void *arg3); #endif /* _KERNEL */ #endif /* !_SYS_RESOURCEVAR_H_ */ diff -urNp current/sys/sys/sem.h hrl/sys/sys/sem.h --- current/sys/sys/sem.h 2011-01-31 20:41:52.122986439 +0100 +++ hrl/sys/sys/sem.h 2011-01-31 19:58:25.183091005 +0100 @@ -126,6 +126,7 @@ extern struct seminfo seminfo; struct semid_kernel { struct semid_ds u; struct label *label; /* MAC framework label */ + struct ucred *cred; /* creator's credentials */ }; /* internal "mode" bits */ diff -urNp current/sys/sys/shm.h hrl/sys/sys/shm.h --- current/sys/sys/shm.h 2011-01-31 20:41:52.122986439 +0100 +++ hrl/sys/sys/shm.h 2011-01-31 19:58:25.202927046 +0100 @@ -124,6 +124,7 @@ struct shmid_kernel { struct shmid_ds u; vm_object_t object; struct label *label; /* MAC label */ + struct ucred *cred; /* creator's credentials */ }; extern struct shminfo shminfo; diff -urNp current/sys/sys/syscall.h hrl/sys/sys/syscall.h --- current/sys/sys/syscall.h 2011-01-31 20:41:52.492918930 +0100 +++ hrl/sys/sys/syscall.h 2011-01-31 19:58:25.512996622 +0100 @@ -2,8 +2,8 @@ * System call numbers. * * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/sys/syscall.h,v 1.237 2010/08/30 14:26:02 kib Exp $ - * created from FreeBSD: head/sys/kern/syscalls.master 211998 2010-08-30 14:24:44Z kib + * $FreeBSD$ + * created from FreeBSD: src/sys/kern/syscalls.master,v 1.265 2010/08/30 14:24:44 kib Exp */ #define SYS_syscall 0 @@ -431,4 +431,11 @@ #define SYS_shmctl 512 #define SYS_lpathconf 513 #define SYS_pselect 522 -#define SYS_MAXSYSCALL 523 +#define SYS_getloginclass 523 +#define SYS_setloginclass 524 +#define SYS_rctl_get_usage 525 +#define SYS_rctl_get_rules 526 +#define SYS_rctl_get_limits 527 +#define SYS_rctl_add_rule 528 +#define SYS_rctl_remove_rule 529 +#define SYS_MAXSYSCALL 530 diff -urNp current/sys/sys/syscall.mk hrl/sys/sys/syscall.mk --- current/sys/sys/syscall.mk 2011-01-31 20:41:52.492918930 +0100 +++ hrl/sys/sys/syscall.mk 2011-01-31 19:58:25.512996622 +0100 @@ -1,7 +1,7 @@ # FreeBSD system call names. # DO NOT EDIT-- this file is automatically generated. -# $FreeBSD: src/sys/sys/syscall.mk,v 1.192 2010/08/30 14:26:02 kib Exp $ -# created from FreeBSD: head/sys/kern/syscalls.master 211998 2010-08-30 14:24:44Z kib +# $FreeBSD$ +# created from FreeBSD: src/sys/kern/syscalls.master,v 1.265 2010/08/30 14:24:44 kib Exp MIASM = \ syscall.o \ exit.o \ @@ -379,4 +379,11 @@ MIASM = \ msgctl.o \ shmctl.o \ lpathconf.o \ - pselect.o + pselect.o \ + getloginclass.o \ + setloginclass.o \ + rctl_get_usage.o \ + rctl_get_rules.o \ + rctl_get_limits.o \ + rctl_add_rule.o \ + rctl_remove_rule.o diff -urNp current/sys/sys/sysproto.h hrl/sys/sys/sysproto.h --- current/sys/sys/sysproto.h 2011-01-31 20:41:52.682934980 +0100 +++ hrl/sys/sys/sysproto.h 2011-01-31 19:58:25.693077610 +0100 @@ -2,8 +2,8 @@ * System call prototypes. * * DO NOT EDIT-- this file is automatically generated. 
- * $FreeBSD: src/sys/sys/sysproto.h,v 1.244 2010/08/30 14:26:02 kib Exp $ - * created from FreeBSD: head/sys/kern/syscalls.master 211998 2010-08-30 14:24:44Z kib + * $FreeBSD$ + * created from FreeBSD: src/sys/kern/syscalls.master,v 1.265 2010/08/30 14:24:44 kib Exp */ #ifndef _SYS_SYSPROTO_H_ @@ -1665,6 +1665,43 @@ struct pselect_args { char ts_l_[PADL_(const struct timespec *)]; const struct timespec * ts; char ts_r_[PADR_(const struct timespec *)]; char sm_l_[PADL_(const sigset_t *)]; const sigset_t * sm; char sm_r_[PADR_(const sigset_t *)]; }; +struct getloginclass_args { + char namebuf_l_[PADL_(char *)]; char * namebuf; char namebuf_r_[PADR_(char *)]; + char namelen_l_[PADL_(size_t)]; size_t namelen; char namelen_r_[PADR_(size_t)]; +}; +struct setloginclass_args { + char namebuf_l_[PADL_(const char *)]; const char * namebuf; char namebuf_r_[PADR_(const char *)]; +}; +struct rctl_get_usage_args { + char inbufp_l_[PADL_(const void *)]; const void * inbufp; char inbufp_r_[PADR_(const void *)]; + char inbuflen_l_[PADL_(size_t)]; size_t inbuflen; char inbuflen_r_[PADR_(size_t)]; + char outbufp_l_[PADL_(void *)]; void * outbufp; char outbufp_r_[PADR_(void *)]; + char outbuflen_l_[PADL_(size_t)]; size_t outbuflen; char outbuflen_r_[PADR_(size_t)]; +}; +struct rctl_get_rules_args { + char inbufp_l_[PADL_(const void *)]; const void * inbufp; char inbufp_r_[PADR_(const void *)]; + char inbuflen_l_[PADL_(size_t)]; size_t inbuflen; char inbuflen_r_[PADR_(size_t)]; + char outbufp_l_[PADL_(void *)]; void * outbufp; char outbufp_r_[PADR_(void *)]; + char outbuflen_l_[PADL_(size_t)]; size_t outbuflen; char outbuflen_r_[PADR_(size_t)]; +}; +struct rctl_get_limits_args { + char inbufp_l_[PADL_(const void *)]; const void * inbufp; char inbufp_r_[PADR_(const void *)]; + char inbuflen_l_[PADL_(size_t)]; size_t inbuflen; char inbuflen_r_[PADR_(size_t)]; + char outbufp_l_[PADL_(void *)]; void * outbufp; char outbufp_r_[PADR_(void *)]; + char outbuflen_l_[PADL_(size_t)]; size_t 
outbuflen; char outbuflen_r_[PADR_(size_t)]; +}; +struct rctl_add_rule_args { + char inbufp_l_[PADL_(const void *)]; const void * inbufp; char inbufp_r_[PADR_(const void *)]; + char inbuflen_l_[PADL_(size_t)]; size_t inbuflen; char inbuflen_r_[PADR_(size_t)]; + char outbufp_l_[PADL_(void *)]; void * outbufp; char outbufp_r_[PADR_(void *)]; + char outbuflen_l_[PADL_(size_t)]; size_t outbuflen; char outbuflen_r_[PADR_(size_t)]; +}; +struct rctl_remove_rule_args { + char inbufp_l_[PADL_(const void *)]; const void * inbufp; char inbufp_r_[PADR_(const void *)]; + char inbuflen_l_[PADL_(size_t)]; size_t inbuflen; char inbuflen_r_[PADR_(size_t)]; + char outbufp_l_[PADL_(void *)]; void * outbufp; char outbufp_r_[PADR_(void *)]; + char outbuflen_l_[PADL_(size_t)]; size_t outbuflen; char outbuflen_r_[PADR_(size_t)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_exit(struct thread *, struct sys_exit_args *); int fork(struct thread *, struct fork_args *); @@ -2026,6 +2063,13 @@ int msgctl(struct thread *, struct msgct int shmctl(struct thread *, struct shmctl_args *); int lpathconf(struct thread *, struct lpathconf_args *); int pselect(struct thread *, struct pselect_args *); +int getloginclass(struct thread *, struct getloginclass_args *); +int setloginclass(struct thread *, struct setloginclass_args *); +int rctl_get_usage(struct thread *, struct rctl_get_usage_args *); +int rctl_get_rules(struct thread *, struct rctl_get_rules_args *); +int rctl_get_limits(struct thread *, struct rctl_get_limits_args *); +int rctl_add_rule(struct thread *, struct rctl_add_rule_args *); +int rctl_remove_rule(struct thread *, struct rctl_remove_rule_args *); #ifdef COMPAT_43 @@ -2701,6 +2745,13 @@ int freebsd7_shmctl(struct thread *, str #define SYS_AUE_shmctl AUE_SHMCTL #define SYS_AUE_lpathconf AUE_LPATHCONF #define SYS_AUE_pselect AUE_SELECT +#define SYS_AUE_getloginclass AUE_NULL +#define SYS_AUE_setloginclass AUE_NULL +#define SYS_AUE_rctl_get_usage AUE_NULL +#define 
SYS_AUE_rctl_get_rules AUE_NULL +#define SYS_AUE_rctl_get_limits AUE_NULL +#define SYS_AUE_rctl_add_rule AUE_NULL +#define SYS_AUE_rctl_remove_rule AUE_NULL #undef PAD_ #undef PADL_ diff -urNp current/sys/sys/ucred.h hrl/sys/sys/ucred.h --- current/sys/sys/ucred.h 2011-01-31 20:41:52.932921221 +0100 +++ hrl/sys/sys/ucred.h 2011-01-31 19:58:25.792898677 +0100 @@ -35,6 +35,8 @@ #include +struct loginclass; + /* * Credentials. * @@ -54,7 +56,7 @@ struct ucred { struct uidinfo *cr_uidinfo; /* per euid resource consumption */ struct uidinfo *cr_ruidinfo; /* per ruid resource consumption */ struct prison *cr_prison; /* jail(2) */ - void *cr_pspare; /* general use */ + struct loginclass *cr_loginclass; /* login class */ u_int cr_flags; /* credential flags */ void *cr_pspare2[2]; /* general use 2 */ #define cr_endcopy cr_label @@ -87,6 +89,7 @@ struct xucred { #ifdef _KERNEL struct proc; struct thread; +struct proc; void change_egid(struct ucred *newcred, gid_t egid); void change_euid(struct ucred *newcred, struct uidinfo *euip); diff -urNp current/sys/vm/swap_pager.c hrl/sys/vm/swap_pager.c --- current/sys/vm/swap_pager.c 2011-01-31 20:41:54.732971227 +0100 +++ hrl/sys/vm/swap_pager.c 2011-01-31 19:58:27.932962847 +0100 @@ -75,6 +75,7 @@ __FBSDID("$FreeBSD: src/sys/vm/swap_page #include #include #include +#include #include #include #include @@ -192,6 +193,12 @@ swap_reserve_by_cred(vm_ooffset_t incr, if (incr & PAGE_MASK) panic("swap_reserve: & PAGE_MASK"); + PROC_LOCK(curproc); + error = rusage_add(curproc, RUSAGE_SWAP, incr); + PROC_UNLOCK(curproc); + if (error != 0) + return (0); + res = 0; mtx_lock(&sw_dev_mtx); r = swap_reserved + incr; @@ -230,6 +237,12 @@ swap_reserve_by_cred(vm_ooffset_t incr, curproc->p_pid, uip->ui_uid, incr); } + if (!res) { + PROC_LOCK(curproc); + rusage_sub(curproc, RUSAGE_SWAP, incr); + PROC_UNLOCK(curproc); + } + return (res); } @@ -242,6 +255,10 @@ swap_reserve_force(vm_ooffset_t incr) swap_reserved += incr; mtx_unlock(&sw_dev_mtx); + 
PROC_LOCK(curproc); + rusage_add_force(curproc, RUSAGE_SWAP, incr); + PROC_UNLOCK(curproc); + uip = curthread->td_ucred->cr_ruidinfo; PROC_LOCK(curproc); UIDINFO_VMSIZE_LOCK(uip); @@ -282,6 +299,8 @@ swap_release_by_cred(vm_ooffset_t decr, printf("negative vmsize for uid = %d\n", uip->ui_uid); uip->ui_vmsize -= decr; UIDINFO_VMSIZE_UNLOCK(uip); + + rusage_sub_cred(cred, RUSAGE_SWAP, decr); } static void swapdev_strategy(struct buf *, struct swdevt *sw); diff -urNp current/sys/vm/vm_glue.c hrl/sys/vm/vm_glue.c --- current/sys/vm/vm_glue.c 2011-01-31 20:41:55.002947998 +0100 +++ hrl/sys/vm/vm_glue.c 2011-01-31 19:58:28.012886692 +0100 @@ -65,6 +65,7 @@ __FBSDID("$FreeBSD: src/sys/vm/vm_glue.c #include #include +#include #include #include #include @@ -182,6 +183,7 @@ int vslock(void *addr, size_t len) { vm_offset_t end, last, start; + unsigned long nsize; vm_size_t npages; int error; @@ -194,9 +196,13 @@ vslock(void *addr, size_t len) if (npages > vm_page_max_wired) return (ENOMEM); PROC_LOCK(curproc); - if (ptoa(npages + - pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map))) > - lim_cur(curproc, RLIMIT_MEMLOCK)) { + nsize = ptoa(npages + + pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map))); + if (nsize > lim_cur(curproc, RLIMIT_MEMLOCK)) { + PROC_UNLOCK(curproc); + return (ENOMEM); + } + if (rusage_set(curproc, RUSAGE_MEMLOCK, nsize)) { PROC_UNLOCK(curproc); return (ENOMEM); } @@ -216,6 +222,12 @@ vslock(void *addr, size_t len) #endif error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); + if (error != KERN_SUCCESS) { + PROC_LOCK(curproc); + rusage_set(curproc, RUSAGE_MEMLOCK, + ptoa(pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map)))); + PROC_UNLOCK(curproc); + } /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. 
@@ -231,6 +243,11 @@ vsunlock(void *addr, size_t len) (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); + + PROC_LOCK(curproc); + rusage_set(curproc, RUSAGE_MEMLOCK, + ptoa(pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map)))); + PROC_UNLOCK(curproc); } /* diff -urNp current/sys/vm/vm_map.c hrl/sys/vm/vm_map.c --- current/sys/vm/vm_map.c 2011-01-31 20:41:55.063122685 +0100 +++ hrl/sys/vm/vm_map.c 2011-01-31 20:19:46.842995826 +0100 @@ -68,6 +68,7 @@ __FBSDID("$FreeBSD: src/sys/vm/vm_map.c, #include #include #include +#include #include #include #include @@ -313,6 +314,19 @@ vm_init2(void) vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } +static void +vmspace_container_reset(struct proc *p) +{ + + PROC_LOCK(p); + rusage_set(p, RUSAGE_DATA, 0); + rusage_set(p, RUSAGE_STACK, 0); + rusage_set(p, RUSAGE_RSS, 0); + rusage_set(p, RUSAGE_MEMLOCK, 0); + rusage_set(p, RUSAGE_VMEM, 0); + PROC_UNLOCK(p); +} + static inline void vmspace_dofree(struct vmspace *vm) { @@ -410,6 +424,7 @@ vmspace_exit(struct thread *td) pmap_activate(td); vmspace_dofree(vm); } + vmspace_container_reset(p); } /* Acquire reference to vmspace owned by another process. 
*/ @@ -3278,6 +3293,10 @@ vm_map_growstack(struct proc *p, vm_offs rlim_t stacklim, vmemlim; int is_procstack, rv; struct ucred *cred; +#ifdef notyet + uint64_t limit; +#endif + int error; Retry: PROC_LOCK(p); @@ -3376,6 +3395,14 @@ Retry: vm_map_unlock_read(map); return (KERN_NO_SPACE); } + PROC_LOCK(p); + if (is_procstack && + rusage_set(p, RUSAGE_STACK, ctob(vm->vm_ssize) + grow_amount)) { + PROC_UNLOCK(p); + vm_map_unlock_read(map); + return (KERN_NO_SPACE); + } + PROC_UNLOCK(p); /* Round up the grow amount modulo SGROWSIZ */ grow_amount = roundup (grow_amount, sgrowsiz); @@ -3385,12 +3412,28 @@ Retry: grow_amount = trunc_page((vm_size_t)stacklim) - ctob(vm->vm_ssize); } +#ifdef notyet + PROC_LOCK(p); + limit = rusage_get_available(p, RUSAGE_STACK); + PROC_UNLOCK(p); + if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit)) + grow_amount = limit - ctob(vm->vm_ssize); +#endif /* If we would blow our VMEM resource limit, no go */ if (map->size + grow_amount > vmemlim) { vm_map_unlock_read(map); - return (KERN_NO_SPACE); + rv = KERN_NO_SPACE; + goto out; } + PROC_LOCK(p); + if (rusage_set(p, RUSAGE_VMEM, map->size + grow_amount)) { + PROC_UNLOCK(p); + vm_map_unlock_read(map); + rv = KERN_NO_SPACE; + goto out; + } + PROC_UNLOCK(p); if (vm_map_lock_upgrade(map)) goto Retry; @@ -3489,6 +3532,16 @@ Retry: : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); } +out: + if (rv != KERN_SUCCESS) { + PROC_LOCK(p); + error = rusage_set(p, RUSAGE_VMEM, map->size); + KASSERT(error == 0, ("decreasing RUSAGE_VMEM failed")); + error = rusage_set(p, RUSAGE_STACK, ctob(vm->vm_ssize)); + KASSERT(error == 0, ("decreasing RUSAGE_STACK failed")); + PROC_UNLOCK(p); + } + return (rv); } diff -urNp current/sys/vm/vm_mmap.c hrl/sys/vm/vm_mmap.c --- current/sys/vm/vm_mmap.c 2011-01-31 20:41:55.172934966 +0100 +++ hrl/sys/vm/vm_mmap.c 2011-01-31 19:58:28.722856534 +0100 @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD: src/sys/vm/vm_mmap.c #include "opt_compat.h" #include "opt_hwpmc_hooks.h" +#include 
#include #include #include @@ -1030,6 +1031,7 @@ mlock(td, uap) struct proc *proc; vm_offset_t addr, end, last, start; vm_size_t npages, size; + unsigned long nsize; int error; error = priv_check(td, PRIV_VM_MLOCK); @@ -1047,17 +1049,28 @@ mlock(td, uap) return (ENOMEM); proc = td->td_proc; PROC_LOCK(proc); - if (ptoa(npages + - pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > - lim_cur(proc, RLIMIT_MEMLOCK)) { + nsize = ptoa(npages + + pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))); + if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); return (ENOMEM); } PROC_UNLOCK(proc); if (npages + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); + PROC_LOCK(proc); + error = rusage_set(proc, RUSAGE_MEMLOCK, nsize); + PROC_UNLOCK(proc); + if (error != 0) + return (ENOMEM); error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (error != KERN_SUCCESS) { + PROC_LOCK(proc); + rusage_set(proc, RUSAGE_MEMLOCK, + ptoa(pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map)))); + PROC_UNLOCK(proc); + } return (error == KERN_SUCCESS ? 0 : ENOMEM); } @@ -1100,6 +1113,11 @@ mlockall(td, uap) if (error) return (error); #endif + PROC_LOCK(td->td_proc); + error = rusage_set(td->td_proc, RUSAGE_MEMLOCK, map->size); + PROC_UNLOCK(td->td_proc); + if (error != 0) + return (ENOMEM); if (uap->how & MCL_FUTURE) { vm_map_lock(map); @@ -1119,6 +1137,12 @@ mlockall(td, uap) VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); error = (error == KERN_SUCCESS ? 0 : EAGAIN); } + if (error != KERN_SUCCESS) { + PROC_LOCK(td->td_proc); + rusage_set(td->td_proc, RUSAGE_MEMLOCK, + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map)))); + PROC_UNLOCK(td->td_proc); + } return (error); } @@ -1153,6 +1177,11 @@ munlockall(td, uap) /* Forcibly unwire all pages. 
*/ error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); + if (error == KERN_SUCCESS) { + PROC_LOCK(td->td_proc); + rusage_set(td->td_proc, RUSAGE_MEMLOCK, 0); + PROC_UNLOCK(td->td_proc); + } return (error); } @@ -1187,6 +1216,11 @@ munlock(td, uap) return (EINVAL); error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (error == KERN_SUCCESS) { + PROC_LOCK(td->td_proc); + rusage_sub(td->td_proc, RUSAGE_MEMLOCK, ptoa(end - start)); + PROC_UNLOCK(td->td_proc); + } return (error == KERN_SUCCESS ? 0 : ENOMEM); } @@ -1419,6 +1453,11 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, PROC_UNLOCK(td->td_proc); return(ENOMEM); } + if (rusage_set(td->td_proc, RUSAGE_VMEM, + td->td_proc->p_vmspace->vm_map.size + size)) { + PROC_UNLOCK(td->td_proc); + return (ENOMEM); + } PROC_UNLOCK(td->td_proc); /* diff -urNp current/sys/vm/vm_pageout.c hrl/sys/vm/vm_pageout.c --- current/sys/vm/vm_pageout.c 2011-01-31 20:41:55.463087485 +0100 +++ hrl/sys/vm/vm_pageout.c 2011-01-31 19:58:29.062968197 +0100 @@ -79,6 +79,7 @@ __FBSDID("$FreeBSD: src/sys/vm/vm_pageou #include #include #include +#include #include #include #include @@ -1630,11 +1631,16 @@ vm_daemon() struct proc *p; struct thread *td; struct vmspace *vm; - int breakout, swapout_flags; + int breakout, swapout_flags, tryagain; + uint64_t rsize, ravailable; while (TRUE) { mtx_lock(&vm_daemon_mtx); +#ifdef CONTAINERS + msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", hz); +#else msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 0); +#endif swapout_flags = vm_pageout_req_swapout; vm_pageout_req_swapout = 0; mtx_unlock(&vm_daemon_mtx); @@ -1645,6 +1651,8 @@ vm_daemon() * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ +again: + tryagain = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { vm_pindex_t limit, size; @@ -1702,9 +1710,44 @@ vm_daemon() 
vm_pageout_map_deactivate_pages( &vm->vm_map, limit); } + rsize = IDX_TO_OFF(size); + PROC_LOCK(p); + rusage_set(p, RUSAGE_RSS, rsize); + ravailable = rusage_get_available(p, RUSAGE_RSS); + PROC_UNLOCK(p); + if (rsize > ravailable) { + /* + * Don't be overly aggressive; this might be + * an innocent process, and the limit could've + * been exceeded by some memory hog. Don't + * try to deactivate more than 1/4th of process' + * resident set size. + * + * XXX: Reconsider. + */ + if (ravailable < rsize - (rsize / 4)) + ravailable = rsize - (rsize / 4); + vm_pageout_map_deactivate_pages( + &vm->vm_map, OFF_TO_IDX(ravailable)); + /* Update RSS usage after paging out. */ + size = vmspace_resident_count(vm); + rsize = IDX_TO_OFF(size); + PROC_LOCK(p); + rusage_set(p, RUSAGE_RSS, rsize); + PROC_UNLOCK(p); + if (rsize > ravailable) + tryagain++; + if (tryagain > 20) { + printf("still too much: rsize = %ju, ravailable = %ju\n", + (uintmax_t)rsize, (uintmax_t)ravailable); + tryagain = 0; + } + } vmspace_free(vm); } sx_sunlock(&allproc_lock); + if (tryagain != 0) + goto again; } } #endif /* !defined(NO_SWAPPING) */ diff -urNp current/sys/vm/vm_unix.c hrl/sys/vm/vm_unix.c --- current/sys/vm/vm_unix.c 2011-01-31 20:41:55.512986565 +0100 +++ hrl/sys/vm/vm_unix.c 2011-01-31 19:58:29.272942369 +0100 @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD: src/sys/vm/vm_unix.c,v 1.49 2009/04/11 22:34:08 alc Exp $"); #include +#include #include #include #include @@ -116,9 +117,29 @@ obreak(td, uap) error = ENOMEM; goto done; } + PROC_LOCK(td->td_proc); + error = rusage_set(td->td_proc, RUSAGE_DATA, new - base); + if (error != 0) { + PROC_UNLOCK(td->td_proc); + error = ENOMEM; + goto done; + } + error = rusage_set(td->td_proc, RUSAGE_VMEM, + vm->vm_map.size + (new - old)); + if (error != 0) { + rusage_set_force(td->td_proc, RUSAGE_DATA, old - base); + PROC_UNLOCK(td->td_proc); + error = ENOMEM; + goto done; + } + PROC_UNLOCK(td->td_proc); rv = vm_map_insert(&vm->vm_map, NULL, 0, old, new, VM_PROT_RW, 
VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { + PROC_LOCK(td->td_proc); + rusage_set_force(td->td_proc, RUSAGE_DATA, old - base); + rusage_set_force(td->td_proc, RUSAGE_VMEM, vm->vm_map.size); + PROC_UNLOCK(td->td_proc); error = ENOMEM; goto done; } @@ -144,6 +165,10 @@ obreak(td, uap) goto done; } vm->vm_dsize -= btoc(old - new); + PROC_LOCK(td->td_proc); + rusage_set_force(td->td_proc, RUSAGE_DATA, new - base); + rusage_set_force(td->td_proc, RUSAGE_VMEM, vm->vm_map.size); + PROC_UNLOCK(td->td_proc); } done: vm_map_unlock(&vm->vm_map); diff -urNp current/usr.bin/Makefile hrl/usr.bin/Makefile --- current/usr.bin/Makefile 2011-01-31 20:42:05.182964732 +0100 +++ hrl/usr.bin/Makefile 2011-01-31 19:58:37.173008718 +0100 @@ -64,6 +64,7 @@ SUBDIR= alias \ id \ ipcrm \ ipcs \ + jailstat \ join \ jot \ kdump \ @@ -123,6 +124,7 @@ SUBDIR= alias \ printenv \ printf \ procstat \ + rctl \ renice \ rev \ revoke \ diff -urNp current/usr.bin/id/id.1 hrl/usr.bin/id/id.1 --- current/usr.bin/id/id.1 2011-01-31 20:42:08.732897196 +0100 +++ hrl/usr.bin/id/id.1 2011-01-31 19:58:42.952927395 +0100 @@ -51,6 +51,8 @@ .Fl P .Op Ar user .Nm +.Fl c +.Nm .Fl g Op Fl nr .Op Ar user .Nm @@ -89,6 +91,8 @@ Display the id as a password file entry. Ignored for compatibility with other .Nm implementations. +.It Fl c +Display current login class. .It Fl g Display the effective group ID as a number. .It Fl n diff -urNp current/usr.bin/id/id.c hrl/usr.bin/id/id.c --- current/usr.bin/id/id.c 2011-01-31 20:42:08.732897196 +0100 +++ hrl/usr.bin/id/id.c 2011-01-31 19:58:42.952927395 +0100 @@ -74,11 +74,13 @@ main(int argc, char *argv[]) struct group *gr; struct passwd *pw; int Gflag, Mflag, Pflag, ch, gflag, id, nflag, pflag, rflag, uflag; - int Aflag; + int Aflag, cflag; + int error; const char *myname; + char loginclass[MAXLOGNAME]; Gflag = Mflag = Pflag = gflag = nflag = pflag = rflag = uflag = 0; - Aflag = 0; + Aflag = cflag = 0; myname = strrchr(argv[0], '/'); myname = (myname != NULL) ? 
myname + 1 : argv[0]; @@ -92,7 +94,7 @@ main(int argc, char *argv[]) } while ((ch = getopt(argc, argv, - (isgroups || iswhoami) ? "" : "APGMagnpru")) != -1) + (isgroups || iswhoami) ? "" : "APGMacgnpru")) != -1) switch(ch) { #ifdef USE_BSM_AUDIT case 'A': @@ -110,6 +112,9 @@ main(int argc, char *argv[]) break; case 'a': break; + case 'c': + cflag = 1; + break; case 'g': gflag = 1; break; @@ -158,6 +163,14 @@ main(int argc, char *argv[]) } #endif + if (cflag) { + error = getloginclass(loginclass, sizeof(loginclass)); + if (error != 0) + err(1, "loginclass"); + (void)printf("%s\n", loginclass); + exit(0); + } + if (gflag) { id = pw ? pw->pw_gid : rflag ? getgid() : getegid(); if (nflag && (gr = getgrgid(id))) @@ -467,7 +480,7 @@ usage(void) else if (iswhoami) (void)fprintf(stderr, "usage: whoami\n"); else - (void)fprintf(stderr, "%s\n%s%s\n%s\n%s\n%s\n%s\n%s\n", + (void)fprintf(stderr, "%s\n%s%s\n%s\n%s\n%s\n%s\n%s\n%s\n", "usage: id [user]", #ifdef USE_BSM_AUDIT " id -A\n", @@ -477,6 +490,7 @@ usage(void) " id -G [-n] [user]", " id -M", " id -P [user]", + " id -c", " id -g [-nr] [user]", " id -p [user]", " id -u [-nr] [user]"); diff -urNp current/usr.bin/jailstat/Makefile hrl/usr.bin/jailstat/Makefile --- current/usr.bin/jailstat/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ hrl/usr.bin/jailstat/Makefile 2011-01-31 19:58:43.193186702 +0100 @@ -0,0 +1,9 @@ +# $FreeBSD$ + +SCRIPTS= jailstat.sh +LINKS= ${BINDIR}/jailstat ${BINDIR}/userstat + +MAN= jailstat.8 +MLINKS= jailstat.8 userstat.8 + +.include diff -urNp current/usr.bin/jailstat/jailstat.8 hrl/usr.bin/jailstat/jailstat.8 --- current/usr.bin/jailstat/jailstat.8 1970-01-01 01:00:00.000000000 +0100 +++ hrl/usr.bin/jailstat/jailstat.8 2011-01-31 19:58:43.193186702 +0100 @@ -0,0 +1,76 @@ +.\"- +.\" Copyright (c) 2009 Edward Tomasz Napierala +.\" All rights reserved. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR THE VOICES IN HIS HEAD BE +.\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd February 13, 2011 +.Dt JAILSTAT 8 +.Os +.Sh NAME +.Nm jailstat , +.Nm userstat +.Nd display resource utilisation for jails or users +.Sh SYNOPSIS +.Nm +.Op Fl s +.Op Ar time +.Op Ar count +.Sh DESCRIPTION +The +.Nm +command displays current per-user or per-jail resource utilisation +and limits. +It reports percentage of CPU time, resident set size (i.e. the amount +of physical memory currently used), allocated virtual memory and amount +of allocated swap space. +The options are as follows: +.Bl -tag -width indent +.It Fl s +Display raw values instead of humanized ones. +.It Ar time +Repeat display every +.Ar time +seconds.
+.It Ar count +Exit after +.Ar count +iterations. +.El +.Pp +.Sh EXIT STATUS +.Ex -std +.Sh SEE ALSO +.Xr rctl 8 +.Sh HISTORY +The +.Nm +command appeared in +.Fx 9.0. +.Sh AUTHORS +.An -nosplit +The +.Nm +command was written by +.An Edward Tomasz Napierala Aq trasz@FreeBSD.org . diff -urNp current/usr.bin/jailstat/jailstat.sh hrl/usr.bin/jailstat/jailstat.sh --- current/usr.bin/jailstat/jailstat.sh 1970-01-01 01:00:00.000000000 +0100 +++ hrl/usr.bin/jailstat/jailstat.sh 2011-01-31 19:58:43.202987389 +0100 @@ -0,0 +1,113 @@ +#!/bin/sh +# +# Copyright (c) 2010 The FreeBSD Foundation +# All rights reserved. +# +# This software was developed by Edward Tomasz Napierala under sponsorship +# from the FreeBSD Foundation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# $FreeBSD$ + +usage() { + echo "usage: `basename $0` [-s] [wait [count]]" + exit 1 +} + +get_limit() { + rule=`rctl $hflag $1` + if [ -z "$rule" ]; then + echo "-" + else + amount="${rule##*=}" + echo "$amount" + fi +} + +hflag="-h" + +while getopts 's' cmd_arg; do + case "${cmd_arg}" in + s) hflag="" ;; + *) usage ;; + esac +done + +shift $(($OPTIND - 1)) + +wait="$1" +count="$2" + +[ "$wait" ">" 0 ] || wait="0" +[ "$count" ">" 0 ] || count="0" + +sysctl kern.features.rctl > /dev/null 2>&1 +if [ $? -ne 0 ]; then + echo "no RCTL support in the kernel" 2>&1 + exit 1 +fi + +n=0 +while :; do + if [ "`basename $0`" = "jailstat" ]; then + jails="`ps ax -o jid= | sort -u | sed 1d`" + printf "JID\t%%CPU\tLIMIT\tRSS\tLIMIT\tVMEM\tLIMIT\tSWAP\tLIMIT\n" + for jail in $jails; do + # Put resource=value pairs into environment variables. + eval `rctl $hflag -u j:$jail` + + pctcpulimit=`get_limit j:$jail:pctcpu:deny=/jail` + rsslimit=`get_limit j:$jail:rss:deny=/jail` + vmemlimit=`get_limit j:$jail:vmem:deny=/jail` + swaplimit=`get_limit j:$jail:swap:deny=/jail` + + printf "%s\t\%s\t\%s\t\%s\t\%s\t\%s\t\%s\t\%s\t\%s\n" "$jail" "$pctcpu" "$pctcpulimit" "$rss" "$rsslimit" "$vmem" "$vmemlimit" "$swap" "$swaplimit" + done + + else + users="`ps ax -o user= | sort -u`" + printf "USER\t%%CPU\tLIMIT\tRSS\tLIMIT\tVMEM\tLIMIT\tSWAP\tLIMIT\n" + for user in $users; do + # Put resource=value pairs into environment variables. 
+ eval `rctl $hflag -u u:$user` + + pctcpulimit=`get_limit u:$user:pctcpu:deny=/user` + rsslimit=`get_limit u:$user:rss:deny=/user` + vmemlimit=`get_limit u:$user:vmem:deny=/user` + swaplimit=`get_limit u:$user:swap:deny=/user` + + printf "%s\t\%s\t\%s\t\%s\t\%s\t\%s\t\%s\t\%s\t\%s\n" "$user" "$pctcpu" "$pctcpulimit" "$rss" "$rsslimit" "$vmem" "$vmemlimit" "$swap" "$swaplimit" + done + fi + + n=$(($n + 1)) + + if [ "$wait" -eq 0 ]; then + break + else + echo + fi + + [ "$count" -gt 0 -a "$n" -ge "$count" ] && break + sleep "$wait" +done diff -urNp current/usr.bin/rctl/Makefile hrl/usr.bin/rctl/Makefile --- current/usr.bin/rctl/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ hrl/usr.bin/rctl/Makefile 2011-01-31 19:58:46.883011247 +0100 @@ -0,0 +1,11 @@ +# $FreeBSD$ + +PROG= rctl +MAN= rctl.8 + +DPADD= ${LIBUTIL} +LDADD= -lutil + +WARNS?= 6 + +.include diff -urNp current/usr.bin/rctl/rctl.8 hrl/usr.bin/rctl/rctl.8 --- current/usr.bin/rctl/rctl.8 1970-01-01 01:00:00.000000000 +0100 +++ hrl/usr.bin/rctl/rctl.8 2011-01-31 19:58:46.883011247 +0100 @@ -0,0 +1,162 @@ +.\"- +.\" Copyright (c) 2009 Edward Tomasz Napierala +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR THE VOICES IN HIS HEAD BE +.\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd February 13, 2011 +.Dt RCTL 8 +.Os +.Sh NAME +.Nm rctl +.Nd display and update RCTL database +.Sh SYNOPSIS +.Nm +.Op Fl h +.Op Ar filter +.Nm +.Fl a +.Ar rule +.Nm +.Op Fl h +.Fl l +.Ar filter +.Nm +.Fl r +.Ar filter +.Nm +.Op Fl h +.Fl u +.Ar filter +.Sh DESCRIPTION +When called without options, the +.Nm +command writes currently defined RCTL rules to standard output. +.Pp +If a +.Ar filter +argument is specified, only rules matching the filter are displayed. +The options are as follows: +.Bl -tag -width indent +.It Fl a Ar rule +Add +.Ar rule +to the RCTL database. +.It Fl l Ar filter +Display rules applicable to the process defined by +.Ar filter . +.It Fl r Ar filter +Remove rules matching +.Ar filter +from the RCTL database. +.It Fl u Ar filter +Display resource usage for a subject (process, user, login class +or jail) matching the +.Ar filter . +.It Fl h +"Human-readable" output. +Use unit suffixes: Byte, Kilobyte, Megabyte, +Gigabyte, Terabyte and Petabyte. +.El +.Sh RULE SYNTAX +Syntax for a rule is subject:subject-id:resource:action=amount/per. +.Pp +Subject defines the kind of entity the rule applies to. +It can be either process, user, login class, or jail. +.Pp +Subject ID identifies the subject. It can be a user name, +a login class name, a numerical UID, or a JID. +.Pp +Resource identifies the resource the rule controls.
+.Pp +Action defines what will happen when a process exceeds the allowed amount. +It can be one of deny, delay, log, sighup, sigint, sigkill, sigsegv, sigxcpu, +or sigxfsz. +.Pp +Amount defines how much of the resource a process can use before +the defined action triggers. +.Pp +The per field defines the entity for which the limit is accounted. +For example, rule "loginclass:users:vmem:deny=100M/process" means +that each process of any user belonging to login class "users" may allocate +up to 100MB of virtual memory. +Rule "loginclass:users:vmem:deny=100M/user" would mean that for each +user belonging to the login class "users", the sum of virtual memory allocated +by all the processes of that user will not exceed 100MB. +Rule "loginclass:users:vmem:deny=100M/loginclass" would mean that the sum of +virtual memory allocated by all processes of all users belonging to that login +class will not exceed 100MB. +.Pp +A valid rule has all those fields specified, except for per, which defaults +to the value of subject. +.Pp +A filter is a rule for which one or more fields other than per are left empty. +For example, a filter that matches every rule could be written as ":::=/", +or, in short, ":". A filter that matches all the login classes would be +"loginclass:". A filter that matches all defined limits for maxprocesses +resource would be "::maxprocesses".
+.Pp +.Sh RESOURCES +.Bl -column -offset 3n "msgqqueued" +.It cpu CPU time, in microseconds +.It fsize maximum file size, in megabytes +.It data data size, in megabytes +.It stack stack size, in megabytes +.It core core dump size, in megabytes +.It rss resident set size, in megabytes +.It memlock locked memory, in megabytes +.It nproc number of processes +.It nofile file descriptor table size +.It sbsize memory consumed by socket buffers, in megabytes +.It vmem address space limit, in megabytes +.It npts number of PTYs +.It swap swap usage, in megabytes +.It nthr number of threads +.It msgqqueued number of queued SysV messages +.It msgqsize SysV message queue size, in megabytes +.It nmsgq number of SysV message queues +.It nsem number of SysV semaphores +.It nsemop number of SysV semaphores modified in a single semop(2) call +.It nshm number of SysV shared memory segments +.It shmsize SysV shared memory size, in megabytes +.It wallclock wallclock time +.It pctcpu %cpu time +.El +.Pp +.Sh EXIT STATUS +.Ex -std +.Sh SEE ALSO +.Xr jailstat 8 , +.Xr userstat 8 +.Sh HISTORY +The +.Nm +command appeared in +.Fx 9.0. +.Sh AUTHORS +.An -nosplit +The +.Nm +command was written by +.An Edward Tomasz Napierala Aq trasz@FreeBSD.org . diff -urNp current/usr.bin/rctl/rctl.c hrl/usr.bin/rctl/rctl.c --- current/usr.bin/rctl/rctl.c 1970-01-01 01:00:00.000000000 +0100 +++ hrl/usr.bin/rctl/rctl.c 2011-01-31 19:58:46.913000254 +0100 @@ -0,0 +1,525 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Edward Tomasz Napierala under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * NOTE(review): the header names were missing from every #include line
+ * in this copy of the patch.  They were restored from the symbols used
+ * in this file -- confirm against the original change.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/rctl.h>
+#include <assert.h>
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <getopt.h>
+#include <grp.h>
+#include <libutil.h>
+#include <pwd.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define	RCTL_DEFAULT_BUFSIZE	4096
+
+/*
+ * Convert a string of decimal digits into an id_t.
+ *
+ * The caller has already checked that s starts with a digit.  The
+ * program is terminated through errx(3) when anything follows the
+ * number or when the number does not fit.
+ */
+static id_t
+parse_numeric_id(const char *s)
+{
+	unsigned long id;
+	char *end;
+
+	/*
+	 * Ids are integers: strtod(3) would also accept "1e3", "0x10"
+	 * or "1.5", so parse with strtoul(3), base 10.
+	 */
+	errno = 0;
+	id = strtoul(s, &end, 10);
+	if (*end != '\0')
+		errx(1, "trailing characters after numerical id");
+	if (errno == ERANGE)
+		errx(1, "numerical id '%s' out of range", s);
+
+	return ((id_t)id);
+}
+
+/*
+ * Translate a user, given either by name or by numerical UID, into
+ * an id.
+ *
+ * The string is looked up as a user name first; only when getpwnam(3)
+ * does not know it is it parsed as a number.  Terminates the program
+ * through errx(3) when the string is neither.
+ */
+static id_t
+parse_user(const char *s)
+{
+	struct passwd *pwd;
+
+	pwd = getpwnam(s);
+	if (pwd != NULL)
+		return (pwd->pw_uid);
+
+	/* <ctype.h> functions must not be handed a negative char value. */
+	if (!isdigit((unsigned char)s[0]))
+		errx(1, "unknown user '%s'", s);
+
+	return (parse_numeric_id(s));
+}
+
+/*
+ * Translate a group, given either by name or by numerical GID, into
+ * an id.  Works like parse_user(), using getgrnam(3) for the lookup.
+ */
+static id_t
+parse_group(const char *s)
+{
+	struct group *grp;
+
+	grp = getgrnam(s);
+	if (grp != NULL)
+		return (grp->gr_gid);
+
+	if (!isdigit((unsigned char)s[0]))
+		errx(1, "unknown group '%s'", s);
+
+	return (parse_numeric_id(s));
+}
+
+/*
+ * This routine replaces user/group name with
numeric id.
+ *
+ * The rule is split in place at its first two colons into subject,
+ * subject id and the remainder.  One-letter subject abbreviations (and
+ * "class") are expanded to the full subject name.  For "user" and
+ * "group" subjects with a non-empty id, the id is run through
+ * parse_user()/parse_group() and written back as a number.
+ *
+ * Returns a newly allocated string.  The buffer passed in is modified
+ * but not freed.  Terminates the program when the rule contains no
+ * colon at all or when memory runs out.
+ */
+static char *
+resolve_ids(char *rule)
+{
+	id_t id;
+	const char *subject, *textid, *rest;
+	char *resolved;
+	int len;
+
+	subject = strsep(&rule, ":");
+	textid = strsep(&rule, ":");
+	if (textid == NULL)
+		errx(1, "error in rule specification -- no subject");
+	/* strsep(3) leaves NULL behind when nothing follows the id. */
+	rest = (rule != NULL) ? rule : "";
+
+	if (strcasecmp(subject, "u") == 0)
+		subject = "user";
+	else if (strcasecmp(subject, "g") == 0)
+		subject = "group";
+	else if (strcasecmp(subject, "p") == 0)
+		subject = "process";
+	else if (strcasecmp(subject, "l") == 0 ||
+	    strcasecmp(subject, "c") == 0 ||
+	    strcasecmp(subject, "class") == 0)
+		subject = "loginclass";
+	else if (strcasecmp(subject, "j") == 0)
+		subject = "jail";
+
+	/*
+	 * Print ids through uintmax_t: going through int would turn ids
+	 * above INT_MAX into negative numbers.
+	 */
+	if (strcasecmp(subject, "user") == 0 && textid[0] != '\0') {
+		id = parse_user(textid);
+		len = asprintf(&resolved, "%s:%ju:%s", subject,
+		    (uintmax_t)id, rest);
+	} else if (strcasecmp(subject, "group") == 0 && textid[0] != '\0') {
+		id = parse_group(textid);
+		len = asprintf(&resolved, "%s:%ju:%s", subject,
+		    (uintmax_t)id, rest);
+	} else
+		len = asprintf(&resolved, "%s:%s:%s", subject, textid, rest);
+
+	/* Only the return value reliably signals an asprintf(3) failure. */
+	if (len < 0)
+		err(1, "asprintf");
+
+	return (resolved);
+}
+
+/*
+ * This routine replaces "human-readable" number with its expanded form.
+ *
+ * The rule is parsed on a private copy as
+ * subject:subject-id:resource:action=amount/per.  When it carries no
+ * amount, rule itself is returned.  Otherwise the amount is converted
+ * with expand_number(3) and a newly allocated rule with the plain
+ * number is returned; the string passed in is neither modified nor
+ * freed.
+ */
+static char *
+expand_amount(char *rule)
+{
+	uint64_t num;
+	const char *subject, *subject_id, *resource, *action, *amount, *per;
+	char *copy, *cursor, *expanded;
+	int len;
+
+	copy = strdup(rule);
+	if (copy == NULL)
+		err(1, "strdup");
+
+	/*
+	 * strsep(3) advances the pointer it is given, so parse through
+	 * a cursor and keep copy pointing at the allocation for free(3).
+	 */
+	cursor = copy;
+	subject = strsep(&cursor, ":");
+	subject_id = strsep(&cursor, ":");
+	resource = strsep(&cursor, ":");
+	action = strsep(&cursor, "=/");
+	amount = strsep(&cursor, "/");
+	per = cursor;
+
+	if (amount == NULL || amount[0] == '\0') {
+		free(copy);
+		return (rule);
+	}
+
+	assert(subject != NULL);
+	assert(subject_id != NULL);
+	assert(resource != NULL);
+	assert(action != NULL);
+
+	if (expand_number(amount, &num))
+		err(1, "expand_number");
+
+	if (per == NULL)
+		len = asprintf(&expanded, "%s:%s:%s:%s=%ju", subject,
+		    subject_id, resource, action, (uintmax_t)num);
+	else
+		len = asprintf(&expanded, "%s:%s:%s:%s=%ju/%s", subject,
+		    subject_id, resource, action, (uintmax_t)num, per);
+
+	if (len < 0)
+		err(1, "asprintf");
+
+	free(copy);
+	return (expanded);
+}
+
+/*
+ * Replace the numerical user or group id of a rule with the matching
+ * name, when the password or group database knows one.
+ *
+ * The rule is split in place at its first two colons.  Returns a newly
+ * allocated string; terminates the program when the rule contains no
+ * colon at all.
+ */
+static char *
+humanize_ids(char *rule)
+{
+	id_t id;
+	struct passwd *pwd;
+	struct group *grp;
+	const char *subject, *textid, *rest;
+	char *humanized;
+	int len;
+
+	subject = strsep(&rule, ":");
+	textid = strsep(&rule, ":");
+	if (textid == NULL)
+		errx(1, "rule passed from the kernel didn't contain subject");
+	rest = (rule != NULL) ? rule : "";
+
+	/* Replace numerical user and group ids with names. */
+	if (strcasecmp(subject, "user") == 0) {
+		id = parse_user(textid);
+		pwd = getpwuid(id);
+		if (pwd != NULL)
+			textid = pwd->pw_name;
+	} else if (strcasecmp(subject, "group") == 0) {
+		id = parse_group(textid);
+		grp = getgrgid(id);
+		if (grp != NULL)
+			textid = grp->gr_name;
+	}
+
+	len = asprintf(&humanized, "%s:%s:%s", subject, textid, rest);
+	if (len < 0)
+		err(1, "asprintf");
+
+	return (humanized);
+}
+
+/*
+ * Parse a complete string as a decimal 64-bit signed integer.
+ *
+ * Returns 0 and stores the number in *value on success.  Returns EINVAL
+ * when str is NULL, empty or followed by anything but the number, and
+ * ERANGE when the number does not fit; *value is left alone then.
+ */
+static int
+str2int64(const char *str, int64_t *value)
+{
+	char *end;
+	long long parsed;
+
+	if (str == NULL || str[0] == '\0')
+		return (EINVAL);
+
+	/*
+	 * strtoll(3) rather than strtoul(3): unsigned long is only 32
+	 * bits wide on ILP32 platforms and negative input would wrap.
+	 */
+	errno = 0;
+	parsed = strtoll(str, &end, 10);
+	if (*end != '\0')
+		return (EINVAL);
+	if (errno == ERANGE)
+		return (ERANGE);
+
+	*value = parsed;
+	return (0);
+}
+
+/*
+ * Replace the amount of a rule with its "human-readable" form.
+ *
+ * The rule is parsed on a private copy.  When it carries no amount, or
+ * the amount is not a plain number, rule itself is returned.  Otherwise
+ * a newly allocated rule is returned, with the amount rewritten by
+ * humanize_number(3); the string passed in is neither modified nor
+ * freed.
+ */
+static char *
+humanize_amount(char *rule)
+{
+	int64_t num;
+	const char *subject, *subject_id, *resource, *action, *amount, *per;
+	char *copy, *cursor, *humanized, buf[6];
+	int len;
+
+	copy = strdup(rule);
+	if (copy == NULL)
+		err(1, "strdup");
+
+	/* Parse through a cursor; copy must stay valid for free(3). */
+	cursor = copy;
+	subject = strsep(&cursor, ":");
+	subject_id = strsep(&cursor, ":");
+	resource = strsep(&cursor, ":");
+	action = strsep(&cursor, "=/");
+	amount = strsep(&cursor, "/");
+	per = cursor;
+
+	if (amount == NULL || amount[0] == '\0' ||
+	    str2int64(amount, &num) != 0) {
+		free(copy);
+		return (rule);
+	}
+
+	assert(subject != NULL);
+	assert(subject_id != NULL);
+	assert(resource != NULL);
+	assert(action != NULL);
+
+	if (humanize_number(buf, sizeof(buf), num, "", HN_AUTOSCALE,
+	    HN_DECIMAL | HN_NOSPACE) == -1)
+		err(1, "humanize_number");
+
+	if (per == NULL)
+		len = asprintf(&humanized, "%s:%s:%s:%s=%s", subject,
+		    subject_id, resource, action, buf);
+	else
+		len = asprintf(&humanized, "%s:%s:%s:%s=%s/%s", subject,
+		    subject_id, resource, action, buf, per);
+
+	if (len < 0)
+		err(1, "asprintf");
+
+	free(copy);
+	return (humanized);
+}
+
+/*
+ * Print rules, one per line.
+ *
+ * rules is a comma-separated list; it is split in place.  Ids are always
+ * replaced with names; with hflag set, amounts are printed in their
+ * "human-readable" form as well.  Processing stops at the first empty
+ * list element.
+ */
+static void
+print_rules(char *rules, int hflag)
+{
+	char *rule, *named, *shown;
+
+	while ((rule = strsep(&rules, ",")) != NULL) {
+		if (rule[0] == '\0')
+			break; /* XXX */
+		named = humanize_ids(rule);
+		shown = hflag ? humanize_amount(named) : named;
+		printf("%s\n", shown);
+		/*
+		 * humanize_amount() returns either its argument or a new
+		 * string; release both without freeing anything twice.
+		 */
+		if (shown != named)
+			free(shown);
+		free(named);
+	}
+}
+
+/*
+ * Add a rule to the RCTL database and release the rule string.
+ * Terminates the program when rctl_add_rule() fails.
+ */
+static void
+add_rule(char *rule)
+{
+	int error;
+
+	error = rctl_add_rule(rule, strlen(rule) + 1, NULL, 0);
+	if (error != 0)
+		err(1, "rctl_add_rule");
+	free(rule);
+}
+
+/*
+ * Query rctl_get_limits() with the given filter, print the rules it
+ * returns and release the filter string.
+ */
+static void
+show_limits(char *filter, int hflag)
+{
+	int error;
+	char *outbuf = NULL;
+	size_t outbuflen = RCTL_DEFAULT_BUFSIZE / 4;
+
+	/*
+	 * Start with RCTL_DEFAULT_BUFSIZE bytes and retry with a four
+	 * times larger buffer for as long as the call fails with ERANGE.
+	 */
+	do {
+		outbuflen *= 4;
+		outbuf = realloc(outbuf, outbuflen);
+		if (outbuf == NULL)
+			err(1, "realloc");
+
+		error = rctl_get_limits(filter, strlen(filter) + 1, outbuf,
+		    outbuflen);
+		if (error && errno != ERANGE)
+			err(1, "rctl_get_limits");
+	} while (error && errno == ERANGE);
+
+	print_rules(outbuf, hflag);
+	free(filter);
+	free(outbuf);
+}
+
+/*
+ * Remove the rules matching filter from the RCTL database and release
+ * the filter string.  Terminates the program when rctl_remove_rule()
+ * fails.
+ */
+static void
+remove_rule(char *filter)
+{
+	int error;
+
+	error = rctl_remove_rule(filter, strlen(filter) + 1, NULL, 0);
+	if (error != 0)
+		err(1, "rctl_remove_rule");
+	free(filter);
+}
+
+/*
+ * Replace the amount of a "resource=amount" usage entry with its
+ * "human-readable" form.
+ *
+ * Returns usage itself when the entry has no '=', when the amount is
+ * not a plain number or when humanize_number(3) fails.  Otherwise a
+ * newly allocated string is returned; the string passed in is neither
+ * modified nor freed.
+ */
+static char *
+humanize_usage_amount(char *usage)
+{
+	int64_t num;
+	const char *resource, *amount;
+	char *copy, *cursor, *humanized, buf[6];
+	int len;
+
+	copy = strdup(usage);
+	if (copy == NULL)
+		err(1, "strdup");
+
+	/* Parse through a cursor; copy must stay valid for free(3). */
+	cursor = copy;
+	resource = strsep(&cursor, "=");
+	amount = cursor;
+
+	assert(resource != NULL);
+
+	if (amount == NULL || str2int64(amount, &num) != 0 ||
+	    humanize_number(buf, sizeof(buf), num, "", HN_AUTOSCALE,
+	    HN_DECIMAL | HN_NOSPACE) == -1) {
+		free(copy);
+		return (usage);
+	}
+
+	len = asprintf(&humanized, "%s=%s", resource, buf);
+	if (len < 0)
+		err(1, "asprintf");
+
+	free(copy);
+	return (humanized);
+}
+
+/*
+ * Query the kernel about a resource usage and print it out.
+ *
+ * The reply of rctl_get_usage() is a comma-separated list that is
+ * printed one entry per line, with amounts in "human-readable" form
+ * when hflag is set.  The filter string is released.
+ */
+static void
+show_usage(char *filter, int hflag)
+{
+	int error;
+	char *outbuf = NULL, *cursor, *item, *shown;
+	size_t outbuflen = RCTL_DEFAULT_BUFSIZE / 4;
+
+	/*
+	 * Start with RCTL_DEFAULT_BUFSIZE bytes and retry with a four
+	 * times larger buffer for as long as the call fails with ERANGE.
+	 */
+	do {
+		outbuflen *= 4;
+		outbuf = realloc(outbuf, outbuflen);
+		if (outbuf == NULL)
+			err(1, "realloc");
+
+		error = rctl_get_usage(filter, strlen(filter) + 1, outbuf,
+		    outbuflen);
+		if (error && errno != ERANGE)
+			err(1, "rctl_get_usage");
+	} while (error && errno == ERANGE);
+
+	/*
+	 * strsep(3) advances the pointer it is given; walk a cursor so
+	 * that outbuf still points at the allocation for free(3).
+	 */
+	cursor = outbuf;
+	while ((item = strsep(&cursor, ",")) != NULL) {
+		if (item[0] == '\0')
+			break; /* XXX */
+
+		shown = hflag ? humanize_usage_amount(item) : item;
+		printf("%s\n", shown);
+		/* Only a string that replaced the entry was allocated. */
+		if (shown != item)
+			free(shown);
+	}
+
+	free(filter);
+	free(outbuf);
+}
+
+/*
+ * Query the kernel about resource limit rules and print them out.
+ *
+ * A NULL filter is passed to rctl_get_rules() with a length of zero.
+ * The filter string is released.
+ */
+static void
+show_rules(char *filter, int hflag)
+{
+	int error;
+	char *outbuf = NULL;
+	size_t filterlen, outbuflen = RCTL_DEFAULT_BUFSIZE / 4;
+
+	if (filter != NULL)
+		filterlen = strlen(filter) + 1;
+	else
+		filterlen = 0;
+
+	do {
+		outbuflen *= 4;
+		outbuf = realloc(outbuf, outbuflen);
+		if (outbuf == NULL)
+			err(1, "realloc");
+
+		error = rctl_get_rules(filter, filterlen, outbuf, outbuflen);
+		if (error && errno != ERANGE)
+			err(1, "rctl_get_rules");
+	} while (error && errno == ERANGE);
+
+	print_rules(outbuf, hflag);
+	/* Release the filter like show_limits() and show_usage() do. */
+	free(filter);
+	free(outbuf);
+}
+
+/*
+ * Print the synopsis to standard error and exit with status 1.
+ */
+static void
+usage(void)
+{
+
+	fprintf(stderr, "usage: rctl [ -h ] [-a rule | -l filter | -r filter | -u filter | filter]\n");
+	exit(1);
+}
+
+/*
+ * Replace *rulep with a copy of s, releasing what it held before, so
+ * that a repeated option does not leak the earlier argument.
+ * Terminates the program when memory runs out.
+ */
+static void
+set_rule(char **rulep, const char *s)
+{
+
+	free(*rulep);
+	*rulep = strdup(s);
+	if (*rulep == NULL)
+		err(1, "strdup");
+}
+
+/*
+ * Parse the command line and run the single requested operation:
+ * -a adds a rule, -l shows limits, -r removes rules, -u shows usage;
+ * without any of these the rules matching the optional filter operand
+ * (default "::") are listed.  -h asks for "human-readable" amounts.
+ */
+int
+main(int argc, char **argv)
+{
+	int ch, aflag = 0, hflag = 0, lflag = 0, rflag = 0, uflag = 0;
+	char *rule = NULL;
+
+	while ((ch = getopt(argc, argv, "a:hl:r:u:")) != -1) {
+		switch (ch) {
+		case 'a':
+			aflag = 1;
+			set_rule(&rule, optarg);
+			break;
+		case 'h':
+			hflag = 1;
+			break;
+		case 'l':
+			lflag = 1;
+			set_rule(&rule, optarg);
+			break;
+		case 'r':
+			rflag = 1;
+			set_rule(&rule, optarg);
+			break;
+		case 'u':
+			uflag = 1;
+			set_rule(&rule, optarg);
+			break;
+
+		case '?':
+		default:
+			usage();
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc > 1)
+		usage();
+
+	if (rule == NULL)
+		set_rule(&rule, argc == 1 ? argv[0] : "::");
+
+	/* argc is the number of remaining operands here: zero or one. */
+	if (aflag + lflag + rflag + uflag + argc > 1)
+		errx(1, "only one flag or argument may be specified "
+		    "at the same time");
+
+	rule = resolve_ids(rule);
+	rule = expand_amount(rule);
+
+	if (aflag) {
+		add_rule(rule);
+		return (0);
+	}
+
+	if (lflag) {
+		show_limits(rule, hflag);
+		return (0);
+	}
+
+	if (rflag) {
+		remove_rule(rule);
+		return (0);
+	}
+
+	if (uflag) {
+		show_usage(rule, hflag);
+		return (0);
+	}
+
+	show_rules(rule, hflag);
+	return (0);
+}