diff --git a/bin/sh/miscbltin.c b/bin/sh/miscbltin.c index 5fbecde..14a3677 100644 --- a/bin/sh/miscbltin.c +++ b/bin/sh/miscbltin.c @@ -358,7 +358,7 @@ ulimitcmd(int argc __unused, char **argv __unused) struct rlimit limit; what = 'f'; - while ((optc = nextopt("HSatfdsmcnuvlb")) != '\0') + while ((optc = nextopt("HSatfdsmcnuvlbw")) != '\0') switch (optc) { case 'H': how = HARD; diff --git a/contrib/tcsh/sh.func.c b/contrib/tcsh/sh.func.c index 92e2447..a5c263c 100644 @@ -1796,6 +1796,10 @@ struct limits limits[] = { RLIMIT_SBSIZE, "sbsize", 1, "" }, # endif /* RLIMIT_SBSIZE */ +# ifdef RLIMIT_SWAP + { RLIMIT_SWAP, "swapreserved", 1024, "kbytes" }, +# endif /* RLIMIT_SWAP */ + { -1, NULL, 0, NULL } }; diff --git a/contrib/tcsh/tcsh.man b/contrib/tcsh/tcsh.man index 0c9c3b6..bab06ae 100644 --- a/contrib/tcsh/tcsh.man +++ b/contrib/tcsh/tcsh.man @@ -2921,6 +2921,9 @@ the maximum number of simultaneous processes for this user id .TP \fIsbsize\fR the maximum size of socket buffer usage for this user +.TP +\fIswapreserved\fR +the maximum amount of swap space reserved for this user .PP \fImaximum-use\fR may be given as a (floating point or integer) number followed by a scale factor. For all limits diff --git a/etc/login.conf b/etc/login.conf index 847407a..b759eb9 100644 --- a/etc/login.conf +++ b/etc/login.conf @@ -40,6 +40,7 @@ default:\ :maxproc=unlimited:\ :sbsize=unlimited:\ :vmemoryuse=unlimited:\ + :swapuse=unlimited:\ :priority=0:\ :ignoretime@:\ :umask=022: diff --git a/lib/libc/sys/getrlimit.2 b/lib/libc/sys/getrlimit.2 index 9384f79..bb5eaac 100644 --- a/lib/libc/sys/getrlimit.2 +++ b/lib/libc/sys/getrlimit.2 @@ -97,6 +97,13 @@ mbufs, that this user may hold at any time. The maximum size (in bytes) of the stack segment for a process; this defines how far a program's stack segment may be extended. Stack extension is performed automatically by the system. +.It Dv RLIMIT_SWAP +The maximum size (in bytes) of the reserved swap space for this user +id. +To enforce the limit, bit 1 of the +.Va vm.overcommit +sysctl +variable must be set (e.g., sysctl vm.overcommit=2). .El .Pp A resource limit is specified as a soft limit and a hard limit. diff --git a/lib/libutil/login_class.c b/lib/libutil/login_class.c index 93a8e5f..b620656 100644 --- a/lib/libutil/login_class.c +++ b/lib/libutil/login_class.c @@ -59,6 +59,7 @@ static struct login_res { { "coredumpsize", login_getcapsize, RLIMIT_CORE }, { "sbsize", login_getcapsize, RLIMIT_SBSIZE }, { "vmemoryuse", login_getcapsize, RLIMIT_VMEM }, + { "swapuse", login_getcapsize, RLIMIT_SWAP }, { NULL, 0, 0 } }; diff --git a/share/man/man7/tuning.7 b/share/man/man7/tuning.7 index 4102370..6c1bd04 100644 --- a/share/man/man7/tuning.7 +++ b/share/man/man7/tuning.7 @@ -402,6 +402,36 @@ In this document we will only cover the ones that have the greatest effect on the system. .Pp The +.Va vm.overcommit +sysctl defines the overcommit behaviour of the vm subsystem. +The system always accounts for swap space reservation, both +in total and per-user. The corresponding values are available +through the sysctls +.Va vm.swap_total, +which gives the total number of bytes available for swapping, and +.Va vm.swap_reserved, +which gives the number of bytes that may be needed to back all +currently allocated anonymous memory. Note that both vm.swap_total and +vm.swap_reserved are exported as long by the sysctl interface, but both +are actually long long.
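The relationship between the two counters is easy to check from userland. A minimal sketch, assuming only the sysctl names added by this patch (per the caveat above, the values are exported as long, so this is only exact on LP64 platforms):

	#include <sys/types.h>
	#include <sys/sysctl.h>

	#include <err.h>
	#include <stdio.h>

	int
	main(void)
	{
		long total, reserved;
		size_t len;

		/* vm.swap_total: bytes of swap configured. */
		len = sizeof(total);
		if (sysctlbyname("vm.swap_total", &total, &len, NULL, 0) == -1)
			err(1, "vm.swap_total");
		/* vm.swap_reserved: bytes needed to back all anonymous memory. */
		len = sizeof(reserved);
		if (sysctlbyname("vm.swap_reserved", &reserved, &len, NULL, 0) == -1)
			err(1, "vm.swap_reserved");
		printf("%ld of %ld bytes reserved\n", reserved, total);
		return (0);
	}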
+.Pp +Setting bit 0 of vm.overcommit causes the system to return failure +to the process when an allocation of memory causes vm.swap_reserved +to exceed vm.swap_total. +Bit 1 of the sysctl enforces the RLIMIT_SWAP limit +(see +.Xr getrlimit 2 +). +Root is exempt from this limit. +Bit 2 allows counting all physical +memory as allocatable, except for wired pages and the reserved free pages +(available as +.Va vm.stats.vm.v_free_target +and +.Va vm.stats.vm.v_wire_count +, respectively). +.Pp +The .Va kern.ipc.shm_use_phys sysctl defaults to 0 (off) and may be set to 0 (off) or 1 (on). Setting diff --git a/share/man/man9/vm_map.9 b/share/man/man9/vm_map.9 index 8e6acb3..5c35da7 100644 --- a/share/man/man9/vm_map.9 +++ b/share/man/man9/vm_map.9 @@ -146,6 +146,11 @@ Do not include the mapping in a core dump. .It Dv MAP_PREFAULT_MADVISE Specify that the request is from a user process calling .Xr madvise 2 . +.It Dv MAP_ACC_CHARGED +The region is already charged to the requestor by some means. +.It Dv MAP_ACC_NO_CHARGE +Do not charge for the allocated region even if +the entry would otherwise be charged. .El .Pp The diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c index e041789..bd2a205 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -78,6 +78,7 @@ #include #include #include +#include #include @@ -1010,6 +1011,7 @@ static int mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td) { vm_ooffset_t npage; + struct uidinfo *uip; int error; /* @@ -1030,20 +1032,36 @@ mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td) sc->fwsectors = mdio->md_fwsectors; if (mdio->md_fwheads != 0) sc->fwheads = mdio->md_fwheads; + if (!swap_reserve_by_uid(PAGE_SIZE * npage, td->td_ucred->cr_ruidinfo)) + return (ENOMEM); sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage, VM_PROT_DEFAULT, 0); - if (sc->object == NULL) + if (sc->object == NULL) { + swap_release_by_uid(PAGE_SIZE * npage, td->td_ucred->cr_ruidinfo); return (ENOMEM); + } + uip = td->td_ucred->cr_ruidinfo; + uihold(uip); + VM_OBJECT_LOCK(sc->object); + sc->object->uip = uip; + sc->object->charge = ptoa(sc->object->size); + VM_OBJECT_UNLOCK(sc->object); sc->flags = mdio->md_options & MD_FORCE; if (mdio->md_options & MD_RESERVE) { if (swap_pager_reserve(sc->object, 0, npage) < 0) { - vm_object_deallocate(sc->object); - sc->object = NULL; - return (EDOM); + error = EDOM; + goto finish; } } error = mdsetcred(sc, td->td_ucred); + finish: if (error != 0) { + swap_release_by_uid(sc->object->charge, uip); + VM_OBJECT_LOCK(sc->object); + sc->object->charge = 0; + sc->object->uip = NULL; + VM_OBJECT_UNLOCK(sc->object); + uifree(uip); vm_object_deallocate(sc->object); sc->object = NULL; } diff --git a/sys/fs/procfs/procfs_map.c b/sys/fs/procfs/procfs_map.c index 9f97593..16fc79a 100644 --- a/sys/fs/procfs/procfs_map.c +++ b/sys/fs/procfs/procfs_map.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -88,6 +89,7 @@ procfs_doprocmap(PFS_FILL_ARGS) struct vnode *vp; char mebuffer[MEBUFFERSIZE]; char *fullpath, *freepath; + struct uidinfo *uip; unsigned int last_timestamp; #ifdef COMPAT_IA32 int wrap32 = 0; #endif @@ -133,6 +135,7 @@ if (obj->shadow_count == 1) privateresident = obj->resident_page_count; } + uip = (entry->uip) ? entry->uip : (obj ? obj->uip : NULL); resident = 0; addr = entry->start; @@ -196,10 +199,11 @@ /* * format: - * start, end, resident, private resident, cow, access, type. 
+ * start, end, resident, private resident, cow, access, type, + * charged, charged uid. */ snprintf(mebuffer, sizeof mebuffer, - "0x%lx 0x%lx %d %d %p %s%s%s %d %d 0x%x %s %s %s %s\n", + "0x%lx 0x%lx %d %d %p %s%s%s %d %d 0x%x %s %s %s %s %s %d\n", (u_long)entry->start, (u_long)entry->end, resident, privateresident, #ifdef COMPAT_IA32 @@ -213,7 +217,8 @@ ref_count, shadow_count, flags, (entry->eflags & MAP_ENTRY_COW)?"COW":"NCOW", (entry->eflags & MAP_ENTRY_NEEDS_COPY)?"NC":"NNC", - type, fullpath); + type, fullpath, + uip ? "CH":"NCH", uip ? uip->ui_uid : -1); if (freepath != NULL) free(freepath, M_TEMP); diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 3c5d40c..e6d9680 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -196,6 +196,7 @@ fork1(td, flags, pages, procp) struct thread *td2; struct sigacts *newsigacts; struct vmspace *vm2; + vm_pindex_t mem_charged; int error; /* Can't copy and clear. */ @@ -255,7 +256,8 @@ norfproc_fail: * We did have single-threading code here * however it proved un-needed and caused problems */ - + mem_charged = 0; + vm2 = NULL; /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); if (TAILQ_EMPTY(&newproc->p_threads)) { @@ -277,11 +279,22 @@ norfproc_fail: } } if ((flags & RFMEM) == 0) { - vm2 = vmspace_fork(p1->p_vmspace); + vm2 = vmspace_fork(p1->p_vmspace, &mem_charged); if (vm2 == NULL) { error = ENOMEM; goto fail1; } + if (!swap_reserve(mem_charged)) { + /* + * The swap reservation failed. The accounting + * from the entries of the copied vm2 will be + * subtracted in vmspace_free(), so force the + * reservation there. + */ + swap_reserve_force(mem_charged); + error = ENOMEM; + goto fail1; + } } else vm2 = NULL; #ifdef MAC @@ -715,6 +728,9 @@ again: KNOTE_LOCKED(&p1->p_klist, NOTE_FORK | p2->p_pid); PROC_UNLOCK(p1); +#ifdef INVARIANTS + swap_check_reserve(); +#endif /* * Preserve synchronization semantics of vfork. 
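The fork1() hunk above is the intended calling pattern for the new vmspace_fork() interface: the fork itself only tallies the would-be charge, and the caller makes a single reservation for the whole copied address space. A condensed sketch (fork_vmspace is a hypothetical wrapper; all other names are from the patch):

	static int
	fork_vmspace(struct proc *p1, struct vmspace **vm2p)
	{
		vm_pindex_t mem_charged;
		struct vmspace *vm2;

		mem_charged = 0;
		/* vmspace_fork() accumulates, but does not reserve, the charge. */
		vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
		if (vm2 == NULL)
			return (ENOMEM);
		if (!swap_reserve(mem_charged)) {
			/*
			 * Force the reservation anyway; vmspace_free()
			 * releases the per-entry charges of the copy,
			 * balancing the books.
			 */
			swap_reserve_force(mem_charged);
			vmspace_free(vm2);
			return (ENOMEM);
		}
		*vm2p = vm2;
		return (0);
	}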
If waiting for @@ -741,6 +757,8 @@ fail: mac_proc_destroy(newproc); #endif fail1: + if (vm2 != NULL) + vmspace_free(vm2); uma_zfree(proc_zone, newproc); pause("fork", hz / 2); return (error); diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index ebc6c0c..1c2fb0e 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -1224,6 +1224,8 @@ uifind(uid) } else { refcount_init(&uip->ui_ref, 0); uip->ui_uid = uid; + mtx_init(&uip->ui_vmsize_mtx, "ui_vmsize", NULL, + MTX_DEF); LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash); } uihold(uip); @@ -1281,6 +1283,10 @@ uifree(uip) if (uip->ui_proccnt != 0) printf("freeing uidinfo: uid = %d, proccnt = %ld\n", uip->ui_uid, uip->ui_proccnt); + if (uip->ui_vmsize != 0) + printf("freeing uidinfo: uid = %d, swapuse = %lld\n", + uip->ui_uid, (unsigned long long)uip->ui_vmsize); + mtx_destroy(&uip->ui_vmsize_mtx); FREE(uip, M_UIDINFO); return; } diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index 6b6523f..59848e4 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -58,6 +58,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sys_process.c,v 1.147 2008/03/12 10:11:59 jeff #include #include #include +#include #ifdef COMPAT_IA32 #include @@ -271,7 +272,10 @@ proc_rwmem(struct proc *p, struct uio *uio) */ error = vm_fault(map, pageno, reqprot, fault_flags); if (error) { - error = EFAULT; + if (error == KERN_RESOURCE_SHORTAGE) + error = ENOMEM; + else + error = EFAULT; break; } diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index 1d28f43..80e2fa2 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -422,7 +422,8 @@ kern_shmat(td, shmid, shmaddr, shmflg) vm_object_reference(shmseg->u.shm_internal); rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->u.shm_internal, - 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0); + 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, + MAP_ACC_NO_CHARGE); if (rv != KERN_SUCCESS) { vm_object_deallocate(shmseg->u.shm_internal); error = ENOMEM; @@ -736,6 +737,7 @@ shmget_allocate_segment(td, uap, mode) struct ucred *cred = td->td_ucred; struct shmid_kernel *shmseg; vm_object_t shm_object; + struct uidinfo *uip; GIANT_REQUIRED; @@ -759,6 +761,16 @@ shmget_allocate_segment(td, uap, mode) shm_last_free = -1; } shmseg = &shmsegs[segnum]; + if (shm_use_phys) + uip = NULL; + else { + uip = cred->cr_ruidinfo; + uihold(uip); + if (!swap_reserve_by_uid(size, uip)) { + uifree(uip); + return (ENOMEM); + } + } /* * In case we sleep in malloc(), mark the segment present but deleted * so that noone else tries to create the same key. 
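The md(4) and sysv_shm hunks above follow the same charge life cycle: reserve against the creator's uidinfo, create the object, then hang the uidinfo reference and the byte count off the object so that vm_object_terminate() (see the vm_object.c hunk below) can release them. A condensed sketch (alloc_charged_object is a hypothetical helper; locking as in the patch):

	static int
	alloc_charged_object(struct ucred *cred, vm_ooffset_t size,
	    vm_object_t *objp)
	{
		struct uidinfo *uip;
		vm_object_t obj;

		uip = cred->cr_ruidinfo;
		uihold(uip);
		if (!swap_reserve_by_uid(size, uip)) {
			uifree(uip);
			return (ENOMEM);
		}
		obj = vm_pager_allocate(OBJT_SWAP, NULL, size,
		    VM_PROT_DEFAULT, 0);
		if (obj == NULL) {
			swap_release_by_uid(size, uip);
			uifree(uip);
			return (ENOMEM);
		}
		VM_OBJECT_LOCK(obj);
		obj->uip = uip;		/* the object now owns the uidinfo ref */
		obj->charge = size;
		VM_OBJECT_UNLOCK(obj);
		*objp = obj;
		return (0);
	}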
@@ -782,6 +794,10 @@ shmget_allocate_segment(td, uap, mode) VM_OBJECT_LOCK(shm_object); vm_object_clear_flag(shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shm_object, OBJ_NOSPLIT); + if (uip != NULL) { + shm_object->uip = uip; + shm_object->charge = size; + } VM_OBJECT_UNLOCK(shm_object); shmseg->u.shm_internal = shm_object; diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 029db09..2b5d595 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -108,7 +108,7 @@ static struct shmfd *shm_hold(struct shmfd *shmfd); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); -static void shm_dotruncate(struct shmfd *shmfd, off_t length); +static int shm_dotruncate(struct shmfd *shmfd, off_t length); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; @@ -165,8 +165,7 @@ shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, if (error) return (error); #endif - shm_dotruncate(shmfd, length); - return (0); + return (shm_dotruncate(shmfd, length)); } static int @@ -240,23 +239,25 @@ shm_close(struct file *fp, struct thread *td) return (0); } -static void +static int shm_dotruncate(struct shmfd *shmfd, off_t length) { vm_object_t object; vm_page_t m; - vm_pindex_t nobjsize; + vm_pindex_t nobjsize, delta; object = shmfd->shm_object; VM_OBJECT_LOCK(object); if (length == shmfd->shm_size) { VM_OBJECT_UNLOCK(object); - return; + return (0); } nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { + delta = object->size - nobjsize; + /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, @@ -264,8 +265,11 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) /* Toss pages from swap. 
*/ if (object->type == OBJT_SWAP) - swap_pager_freespace(object, nobjsize, - object->size - nobjsize); + swap_pager_freespace(object, nobjsize, delta); + + /* Free the swap accounted for shm */ + swap_release_by_uid(ptoa(delta), object->uip); + object->charge -= ptoa(delta); /* * If the last page is partially mapped, then zero out @@ -292,6 +296,15 @@ vm_page_cache_free(object, OFF_TO_IDX(length), nobjsize); } + } else { + + /* Attempt to reserve the swap */ + delta = nobjsize - object->size; + if (!swap_reserve_by_uid(ptoa(delta), object->uip)) { + VM_OBJECT_UNLOCK(object); + return (ENOMEM); + } + object->charge += ptoa(delta); } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); @@ -300,6 +313,7 @@ mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; VM_OBJECT_UNLOCK(object); + return (0); } /* @@ -310,15 +324,21 @@ static struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode) { struct shmfd *shmfd; + struct uidinfo *uip; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; + uip = ucred->cr_ruidinfo; + uihold(uip); shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0); KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); + VM_OBJECT_LOCK(shmfd->shm_object); + shmfd->shm_object->uip = uip; + VM_OBJECT_UNLOCK(shmfd->shm_object); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 5314a80..afce4eb 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -2008,7 +2008,8 @@ restart: if (addr) { vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, - VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + VM_PROT_ALL, VM_PROT_ALL, + MAP_NOFAULT | MAP_ACC_NO_CHARGE); bp->b_kvabase = (caddr_t) addr; bp->b_kvasize = maxsize; diff --git a/sys/security/mac_biba/mac_biba.c b/sys/security/mac_biba/mac_biba.c index 9ad1fcf..09fb895 100644 --- a/sys/security/mac_biba/mac_biba.c +++ b/sys/security/mac_biba/mac_biba.c @@ -1714,6 +1714,8 @@ biba_priv_check(struct ucred *cred, int priv) case PRIV_VM_MADV_PROTECT: case PRIV_VM_MLOCK: case PRIV_VM_MUNLOCK: + case PRIV_VM_SWAP_NOQUOTA: + case PRIV_VM_SWAP_NORLIMIT: /* * Allow some but not all network privileges. In general, dont allow diff --git a/sys/security/mac_lomac/mac_lomac.c b/sys/security/mac_lomac/mac_lomac.c index 87b6595..9092a49 100644 --- a/sys/security/mac_lomac/mac_lomac.c +++ b/sys/security/mac_lomac/mac_lomac.c @@ -1737,6 +1737,8 @@ lomac_priv_check(struct ucred *cred, int priv) case PRIV_VM_MADV_PROTECT: case PRIV_VM_MLOCK: case PRIV_VM_MUNLOCK: + case PRIV_VM_SWAP_NOQUOTA: + case PRIV_VM_SWAP_NORLIMIT: /* * Allow some but not all network privileges. In general, dont allow diff --git a/sys/sys/priv.h b/sys/sys/priv.h index 01ec727..4a727a6 100644 --- a/sys/sys/priv.h +++ b/sys/sys/priv.h @@ -285,6 +285,14 @@ #define PRIV_VM_MADV_PROTECT 360 /* Can set MADV_PROTECT. */ #define PRIV_VM_MLOCK 361 /* Can mlock(), mlockall(). */ #define PRIV_VM_MUNLOCK 362 /* Can munlock(), munlockall(). */ +#define PRIV_VM_SWAP_NOQUOTA 363 /* + * Can override the global + * swap reservation limits. + */ +#define PRIV_VM_SWAP_NORLIMIT 364 /* + * Can override the per-uid + * swap reservation limits. + */ /* * Device file system privileges. 
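The two privileges above exempt a process from the global and the per-uid checks, respectively; everyone else is bounded by the new RLIMIT_SWAP resource defined in the next hunk. Hypothetical userland usage (only effective once bit 1 of vm.overcommit is set):

	#include <sys/types.h>
	#include <sys/resource.h>

	#include <err.h>

	int
	main(void)
	{
		struct rlimit rl;

		/* Cap this uid's swap reservation at 64MB. */
		rl.rlim_cur = rl.rlim_max = 64UL * 1024 * 1024;
		if (setrlimit(RLIMIT_SWAP, &rl) == -1)
			err(1, "setrlimit(RLIMIT_SWAP)");
		/* Anonymous allocations beyond the cap now fail with ENOMEM. */
		return (0);
	}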
diff --git a/sys/sys/resource.h b/sys/sys/resource.h index a98daea..2a6a7aa 100644 --- a/sys/sys/resource.h +++ b/sys/sys/resource.h @@ -93,8 +93,9 @@ struct rusage { #define RLIMIT_SBSIZE 9 /* maximum size of all socket buffers */ #define RLIMIT_VMEM 10 /* virtual process size (inclusive of mmap) */ #define RLIMIT_AS RLIMIT_VMEM /* standard name for RLIMIT_VMEM */ +#define RLIMIT_SWAP 11 /* swap used */ -#define RLIM_NLIMITS 11 /* number of resource limits */ +#define RLIM_NLIMITS 12 /* number of resource limits */ #define RLIM_INFINITY ((rlim_t)(((uint64_t)1 << 63) - 1)) /* XXX Missing: RLIM_SAVED_MAX, RLIM_SAVED_CUR */ @@ -117,6 +118,7 @@ static char *rlimit_ident[] = { "nofile", "sbsize", "vmem", + "swap", }; #endif diff --git a/sys/sys/resourcevar.h b/sys/sys/resourcevar.h index 45ac20a..20f1b5a 100644 --- a/sys/sys/resourcevar.h +++ b/sys/sys/resourcevar.h @@ -86,15 +86,21 @@ struct plimit { * (a) Constant from inception * (b) Lockless, updated using atomics * (c) Locked by global uihashtbl_mtx + * (d) Locked by the ui_vmsize_mtx */ struct uidinfo { LIST_ENTRY(uidinfo) ui_hash; /* (c) hash chain of uidinfos */ + struct mtx ui_vmsize_mtx; + vm_pindex_t ui_vmsize; /* (d) swap reservation by uid */ long ui_sbsize; /* (b) socket buffer space consumed */ long ui_proccnt; /* (b) number of processes */ uid_t ui_uid; /* (a) uid */ u_int ui_ref; /* (b) reference count */ }; +#define UIDINFO_VMSIZE_LOCK(ui) mtx_lock(&((ui)->ui_vmsize_mtx)) +#define UIDINFO_VMSIZE_UNLOCK(ui) mtx_unlock(&((ui)->ui_vmsize_mtx)) + struct proc; struct rusage_ext; struct thread; diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 15a46aa..daf5f95 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -93,6 +93,8 @@ __FBSDID("$FreeBSD: src/sys/vm/swap_pager.c,v 1.300 2008/01/13 14:44:15 attilio #include #include #include +#include +#include #include @@ -153,6 +155,203 @@ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static int swdev_syscall_active = 0; /* serialize swap(on|off) */ +static vm_pindex_t swap_total; +SYSCTL_LONG(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0, ""); +static vm_pindex_t swap_reserved; +SYSCTL_LONG(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, ""); +static int overcommit = 0; +SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0, ""); + +/* bits from overcommit */ +#define SWAP_RESERVE_FORCE_ON (1 << 0) +#define SWAP_RESERVE_RLIMIT_ON (1 << 1) +#define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) + +int +swap_reserve(vm_pindex_t incr) +{ + + return (swap_reserve_by_uid(incr, curthread->td_ucred->cr_ruidinfo)); +} + +int +swap_reserve_by_uid(vm_pindex_t incr, struct uidinfo *uip) +{ + vm_pindex_t r, s, max; + int res, error; + static int curfail; + static struct timeval lastfail; + + if (incr & PAGE_MASK) + panic("swap_reserve: & PAGE_MASK"); + + res = 0; + error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA); + mtx_lock(&sw_dev_mtx); + r = swap_reserved + incr; + if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { + s = cnt.v_page_count - cnt.v_free_reserved - cnt.v_wire_count; + s *= PAGE_SIZE; + } else + s = 0; + s += swap_total; + if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || + (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { + res = 1; + swap_reserved = r; + } + mtx_unlock(&sw_dev_mtx); + + if (res) { + PROC_LOCK(curproc); + UIDINFO_VMSIZE_LOCK(uip); + error = priv_check(curthread, PRIV_VM_SWAP_NORLIMIT); + max = (error != 0) ? 
lim_cur(curproc, RLIMIT_SWAP) : 0; + if (max != 0 && uip->ui_vmsize + incr > max && + (overcommit & SWAP_RESERVE_RLIMIT_ON) != 0) + res = 0; + else + uip->ui_vmsize += incr; + UIDINFO_VMSIZE_UNLOCK(uip); + PROC_UNLOCK(curproc); + if (!res) { + mtx_lock(&sw_dev_mtx); + swap_reserved -= incr; + mtx_unlock(&sw_dev_mtx); + } + } + if (!res && ppsratecheck(&lastfail, &curfail, 1)) { + printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", + uip->ui_uid, curproc->p_pid, (intmax_t)incr); + } + + return (res); +} + +void +swap_reserve_force(vm_pindex_t incr) +{ + struct uidinfo *uip; + + mtx_lock(&sw_dev_mtx); + swap_reserved += incr; + mtx_unlock(&sw_dev_mtx); + + uip = curthread->td_ucred->cr_ruidinfo; + PROC_LOCK(curproc); + UIDINFO_VMSIZE_LOCK(uip); + uip->ui_vmsize += incr; + UIDINFO_VMSIZE_UNLOCK(uip); + PROC_UNLOCK(curproc); +} + +void +swap_release(vm_pindex_t decr) +{ + struct uidinfo *uip; + + PROC_LOCK(curproc); + uip = curthread->td_ucred->cr_ruidinfo; + swap_release_by_uid(decr, uip); + PROC_UNLOCK(curproc); +} + +void +swap_release_by_uid(vm_pindex_t decr, struct uidinfo *uip) +{ + + if (decr & PAGE_MASK) + panic("swap_release: & PAGE_MASK"); + + mtx_lock(&sw_dev_mtx); + if (swap_reserved < decr) + panic("swap_reserved < decr"); + swap_reserved -= decr; + mtx_unlock(&sw_dev_mtx); + + UIDINFO_VMSIZE_LOCK(uip); + if (uip->ui_vmsize < decr) + printf("negative vmsize for uid = %d\n", uip->ui_uid); + uip->ui_vmsize -= decr; + UIDINFO_VMSIZE_UNLOCK(uip); +} + +#ifdef INVARIANTS +int allow_swap_reserve_check = 0; + +static void +swap_check_reserve_map(struct vm_map *m, vm_pindex_t *s) +{ + vm_map_entry_t e; + int l; + + if (!sx_xlocked(&m->lock)) { + vm_map_lock_read(m); + l = 1; + } else + l = 0; + for (e = m->header.next; e != &m->header; e = e->next) { + if (e->uip) + *s += e->end - e->start; + } + if (l) + vm_map_unlock_read(m); +} + +void +swap_check_reserve(void) +{ + struct proc *p; + struct vmspace *vm; + struct vm_map *m; + struct vm_object *obj; + vm_pindex_t e_allocated; + vm_pindex_t obj_allocated; + static int warn; + + if (!allow_swap_reserve_check || proc0.p_vmspace == NULL || + mtx_owned(&kernel_map->system_mtx) || + mtx_owned(&kmem_map->system_mtx)) + return; + e_allocated = obj_allocated = 0; + + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + vm = vmspace_acquire_ref(p); + if (vm->vm_refcnt > 2 && !warn) { + vmspace_free(vm); + sx_sunlock(&allproc_lock); + printf("swap_check_reserve: rfork(~RFMEM) detected\n"); + warn = 1; + return; + } + m = &p->p_vmspace->vm_map; + if (m == NULL || m->system_map) { + vmspace_free(vm); + continue; + } + swap_check_reserve_map(m, &e_allocated); + vmspace_free(vm); + } + sx_sunlock(&allproc_lock); + + swap_check_reserve_map(exec_map, &e_allocated); + + mtx_lock(&vm_object_list_mtx); + TAILQ_FOREACH(obj, &vm_object_list, object_list) { + VM_OBJECT_LOCK(obj); + if (obj->uip) + obj_allocated += obj->charge; + VM_OBJECT_UNLOCK(obj); + } + mtx_unlock(&vm_object_list_mtx); + + KASSERT(obj_allocated + e_allocated == swap_reserved, + ("swap_check_reserve swp %llx obj %llx ent %llx", + swap_reserved, obj_allocated, e_allocated)); +} +#endif + static void swapdev_strategy(struct buf *, struct swdevt *sw); #define SWM_FREE 0x02 /* free, period */ @@ -2042,6 +2241,7 @@ swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strateg TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks; + swap_total += nblks * PAGE_SIZE; swp_sizecheck(); mtx_unlock(&sw_dev_mtx); } @@ -2146,6 +2346,7 @@
swapoff_one(struct swdevt *sp, struct ucred *cred) swap_pager_avail -= blist_fill(sp->sw_blist, dvbase, dmmax); } + swap_total -= nblks * PAGE_SIZE; mtx_unlock(&sw_dev_mtx); /* diff --git a/sys/vm/vm.h b/sys/vm/vm.h index c995488..e517e3c 100644 --- a/sys/vm/vm.h +++ b/sys/vm/vm.h @@ -133,5 +133,15 @@ struct kva_md_info { extern struct kva_md_info kmi; extern void vm_ksubmap_init(struct kva_md_info *); +struct uidinfo; +int swap_reserve(vm_pindex_t incr); +int swap_reserve_by_uid(vm_pindex_t incr, struct uidinfo *uip); +void swap_reserve_force(vm_pindex_t incr); +void swap_release(vm_pindex_t decr); +void swap_release_by_uid(vm_pindex_t decr, struct uidinfo *uip); +#ifdef INVARIANTS +void swap_check_reserve(void); +#endif + #endif /* VM_H */ diff --git a/sys/vm/vm_contig.c b/sys/vm/vm_contig.c index ae6129a..1b31785 100644 --- a/sys/vm/vm_contig.c +++ b/sys/vm/vm_contig.c @@ -209,7 +209,7 @@ contigmapping(vm_page_t m, vm_pindex_t npages, int flags) } vm_object_reference(object); vm_map_insert(map, object, addr - VM_MIN_KERNEL_ADDRESS, - addr, addr + (npages << PAGE_SHIFT), VM_PROT_ALL, VM_PROT_ALL, 0); + addr, addr + (npages << PAGE_SHIFT), VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_NO_CHARGE); vm_map_unlock(map); tmp_addr = addr; VM_OBJECT_LOCK(object); diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h index a7cde76..163af00 100644 --- a/sys/vm/vm_extern.h +++ b/sys/vm/vm_extern.h @@ -75,7 +75,7 @@ void vm_waitproc(struct proc *); int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t); void vm_set_page_size(void); struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t); -struct vmspace *vmspace_fork(struct vmspace *); +struct vmspace *vmspace_fork(struct vmspace *, vm_pindex_t *); int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t); int vmspace_unshare(struct proc *); void vmspace_exit(struct thread *); diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 3f4f4f6..1a99024 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1124,7 +1124,11 @@ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) VM_OBJECT_LOCK(dst_object); dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; - + if (dst_entry->uip != NULL) { + dst_object->uip = dst_entry->uip; + dst_object->charge = dst_entry->end - dst_entry->start; + dst_entry->uip = NULL; + } prot = dst_entry->max_protection; /* diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 7af3e5d..fa6580d 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -110,7 +110,7 @@ kmem_alloc_nofault(map, size) size = round_page(size); addr = vm_map_min(map); result = vm_map_find(map, NULL, 0, - &addr, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + &addr, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT | MAP_ACC_NO_CHARGE); if (result != KERN_SUCCESS) { return (0); } @@ -150,7 +150,7 @@ kmem_alloc(map, size) offset = addr - VM_MIN_KERNEL_ADDRESS; vm_object_reference(kernel_object); vm_map_insert(map, kernel_object, offset, addr, addr + size, - VM_PROT_ALL, VM_PROT_ALL, 0); + VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_NO_CHARGE); vm_map_unlock(map); /* @@ -235,7 +235,7 @@ kmem_suballoc(parent, min, max, size) *min = (vm_offset_t) vm_map_min(parent); ret = vm_map_find(parent, NULL, (vm_offset_t) 0, - min, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); + min, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_NO_CHARGE); if (ret != KERN_SUCCESS) { printf("kmem_suballoc: bad status return of %d.\n", ret); panic("kmem_suballoc"); @@ -319,7 +319,7 @@ kmem_malloc(map, size, flags) offset = addr - 
VM_MIN_KERNEL_ADDRESS; vm_object_reference(kmem_object); vm_map_insert(map, kmem_object, offset, addr, addr + size, - VM_PROT_ALL, VM_PROT_ALL, 0); + VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_NO_CHARGE); /* * Note: if M_NOWAIT specified alone, allocate from @@ -439,6 +439,8 @@ kmem_alloc_wait(map, size) vm_offset_t addr; size = round_page(size); + if (!swap_reserve(size)) + return (0); for (;;) { /* @@ -451,12 +453,14 @@ kmem_alloc_wait(map, size) /* no space now; see if we can ever get space */ if (vm_map_max(map) - vm_map_min(map) < size) { vm_map_unlock(map); + swap_release(size); return (0); } map->needs_wakeup = TRUE; vm_map_unlock_and_wait(map, 0); } - vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0); + vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL, + VM_PROT_ALL, MAP_ACC_CHARGED); vm_map_unlock(map); return (addr); } @@ -504,7 +508,7 @@ kmem_init(start, end) kernel_map = m; (void) vm_map_insert(m, NULL, (vm_ooffset_t) 0, VM_MIN_KERNEL_ADDRESS, start, VM_PROT_ALL, VM_PROT_ALL, - MAP_NOFAULT); + MAP_NOFAULT | MAP_ACC_NO_CHARGE); /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); } diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 7d760e3..79cd2e5 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -148,6 +148,10 @@ static void vm_map_zdtor(void *mem, int size, void *arg); static void vmspace_zdtor(void *mem, int size, void *arg); #endif +#define ENTRY_CHARGED(e) ((e)->uip != NULL || \ + ((e)->object.vm_object != NULL && (e)->object.vm_object->uip != NULL && \ + !((e)->eflags & MAP_ENTRY_NEEDS_COPY))) + /* * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type * stable. @@ -331,6 +335,9 @@ vmspace_dofree(struct vmspace *vm) */ /* pmap_release(vmspace_pmap(vm)); */ uma_zfree(vmspace_zone, vm); +#ifdef INVARIANTS + swap_check_reserve(); +#endif } void @@ -938,6 +945,7 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; vm_eflags_t protoeflags; + struct uidinfo *uip; /* * Check that the start and end points are not bogus. @@ -978,6 +986,22 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, if (cow & MAP_DISABLE_COREDUMP) protoeflags |= MAP_ENTRY_NOCOREDUMP; + uip = NULL; + if (!(cow & MAP_ACC_NO_CHARGE)) { + if ((cow & MAP_ACC_CHARGED) || + ((prot & VM_PROT_WRITE) && + ((protoeflags & MAP_ENTRY_NEEDS_COPY) || + object == NULL))) { + if (!(cow & MAP_ACC_CHARGED) && + !swap_reserve(end - start)) + return (KERN_RESOURCE_SHORTAGE); + KASSERT((object == NULL) || (cow & MAP_ENTRY_NEEDS_COPY) || (object->uip == NULL), + ("OVERCOMMIT: vm_map_insert o %p", object)); + uip = curthread->td_ucred->cr_ruidinfo; + uihold(uip); + } + } + if (object != NULL) { /* * OBJ_ONEMAPPING must be cleared unless this mapping @@ -995,11 +1019,13 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, (prev_entry->eflags == protoeflags) && (prev_entry->end == start) && (prev_entry->wired_count == 0) && - ((prev_entry->object.vm_object == NULL) || - vm_object_coalesce(prev_entry->object.vm_object, - prev_entry->offset, - (vm_size_t)(prev_entry->end - prev_entry->start), - (vm_size_t)(end - prev_entry->end)))) { + (prev_entry->uip == uip || + (prev_entry->object.vm_object && + (prev_entry->object.vm_object->uip == uip))) && + vm_object_coalesce(prev_entry->object.vm_object, + prev_entry->offset, + (vm_size_t)(prev_entry->end - prev_entry->start), + (vm_size_t)(end - prev_entry->end))) { /* * We were able to extend the object. 
Determine if we * can extend the previous map entry to include the @@ -1012,6 +1038,12 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, prev_entry->end = end; vm_map_entry_resize_free(map, prev_entry); vm_map_simplify_entry(map, prev_entry); + if (uip) + uifree(uip); +#ifdef INVARIANTS + if (uip != NULL) + swap_check_reserve(); +#endif return (KERN_SUCCESS); } @@ -1025,6 +1057,12 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, offset = prev_entry->offset + (prev_entry->end - prev_entry->start); vm_object_reference(object); + if (uip && object && object->uip && + !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { + /* Object already accounts for this uid. */ + uifree(uip); + uip = NULL; + } } /* @@ -1039,6 +1077,7 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; + new_entry->uip = NULL; new_entry->eflags = protoeflags; new_entry->object.vm_object = object; @@ -1050,6 +1089,10 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, new_entry->max_protection = max; new_entry->wired_count = 0; + KASSERT(uip == NULL || !ENTRY_CHARGED(new_entry), + ("OVERCOMMIT: vm_map_insert leaks vm_map %p", new_entry)); + new_entry->uip = uip; + /* * Insert the new entry into the list */ @@ -1074,6 +1117,10 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, cow & MAP_PREFAULT_PARTIAL); } +#ifdef INVARIANTS + if (uip != NULL) + swap_check_reserve(); +#endif return (KERN_SUCCESS); } @@ -1254,7 +1301,8 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) (prev->protection == entry->protection) && (prev->max_protection == entry->max_protection) && (prev->inheritance == entry->inheritance) && - (prev->wired_count == entry->wired_count)) { + (prev->wired_count == entry->wired_count) && + (prev->uip == entry->uip)) { vm_map_entry_unlink(map, prev); entry->start = prev->start; entry->offset = prev->offset; @@ -1262,6 +1310,8 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) vm_map_entry_resize_free(map, entry->prev); if (prev->object.vm_object) vm_object_deallocate(prev->object.vm_object); + if (prev->uip) + uifree(prev->uip); vm_map_entry_dispose(map, prev); } } @@ -1277,15 +1327,21 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) (next->protection == entry->protection) && (next->max_protection == entry->max_protection) && (next->inheritance == entry->inheritance) && - (next->wired_count == entry->wired_count)) { + (next->wired_count == entry->wired_count) && + (next->uip == entry->uip)) { vm_map_entry_unlink(map, next); entry->end = next->end; vm_map_entry_resize_free(map, entry); if (next->object.vm_object) vm_object_deallocate(next->object.vm_object); + if (next->uip) + uifree(next->uip); vm_map_entry_dispose(map, next); } } +#ifdef INVARIANTS + swap_check_reserve(); +#endif } /* * vm_map_clip_start: [ internal use only ] @@ -1329,6 +1385,17 @@ _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; + object->uip = entry->uip; + object->charge = entry->end - entry->start; + entry->uip = NULL; + } else if (entry->object.vm_object && + ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && entry->uip) + { + KASSERT(!entry->object.vm_object->uip, + ("OVERCOMMIT: vm_entry_clip_start: both uip e %p", entry)); + entry->object.vm_object->uip = entry->uip; + entry->object.vm_object->charge = entry->end - 
entry->start; + entry->uip = NULL; } new_entry = vm_map_entry_create(map); @@ -1337,12 +1404,18 @@ _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; + if (new_entry->uip) + uihold(entry->uip); vm_map_entry_link(map, entry->prev, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } + +#ifdef INVARIANTS + swap_check_reserve(); +#endif } /* @@ -1380,6 +1453,17 @@ _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; + object->uip = entry->uip; + object->charge = entry->end - entry->start; + entry->uip = NULL; + } else if (entry->object.vm_object && + ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && entry->uip) + { + KASSERT(!entry->object.vm_object->uip, + ("OVERCOMMIT: vm_entry_clip_end: both uip e %p", entry)); + entry->object.vm_object->uip = entry->uip; + entry->object.vm_object->charge = entry->end - entry->start; + entry->uip = NULL; } /* @@ -1390,12 +1474,18 @@ _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); + if (new_entry->uip) + uihold(entry->uip); vm_map_entry_link(map, entry, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } + +#ifdef INVARIANTS + swap_check_reserve(); +#endif } /* @@ -1565,6 +1655,7 @@ vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, { vm_map_entry_t current; vm_map_entry_t entry; + vm_object_t obj; vm_map_lock(map); @@ -1592,6 +1683,46 @@ vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, current = current->next; } + + /* Do an accounting pass for private read-only mappings that + now will do cow due to allowed write (e.g. debugger sets + breakpoint on text segment) */ + current = entry; + while ((current != &map->header) && (current->start < end)) { + + vm_map_clip_end(map, current, end); + + if (!set_max && + ((new_prot & ~(current->protection)) & VM_PROT_WRITE) && + !ENTRY_CHARGED(current)) { + struct uidinfo *uip; + if (!swap_reserve(current->end - current->start)) { + vm_map_unlock(map); + return (KERN_RESOURCE_SHORTAGE); + } + uip = curthread->td_ucred->cr_ruidinfo; + uihold(uip); + if ((current->eflags & MAP_ENTRY_NEEDS_COPY) || + (current->object.vm_object == NULL)) { + current->uip = uip; + } else { + obj = current->object.vm_object; + VM_OBJECT_LOCK(obj); + obj->uip = uip; + obj->charge = current->end - current->start; + KASSERT(ptoa(obj->size) == obj->charge, + ("OVERCOMMIT: vm_map_protect size e %p o %p", + current, obj)); + VM_OBJECT_UNLOCK(obj); + } + } + current = current->next; + } + +#ifdef INVARIANTS + swap_check_reserve(); +#endif + /* * Go back and fix up protections. [Note that clipping is not * necessary the second time.] 
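The effect of the accounting pass above is visible from userland: adding write access to a previously read-only private mapping now reserves swap and may fail, which the vm_mmap.c hunk below maps to ENOMEM. A sketch:

	#include <sys/mman.h>

	#include <err.h>
	#include <stddef.h>

	int
	main(void)
	{
		size_t len = 1024 * 1024;
		void *p;

		/* Read-only private anonymous memory is not charged up front. */
		p = mmap(NULL, len, PROT_READ, MAP_ANON | MAP_PRIVATE, -1, 0);
		if (p == MAP_FAILED)
			err(1, "mmap");
		/* Enabling write triggers the reservation; ENOMEM is possible. */
		if (mprotect(p, len, PROT_READ | PROT_WRITE) == -1)
			err(1, "mprotect");
		return (0);
	}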
@@ -1600,8 +1731,6 @@ while ((current != &map->header) && (current->start < end)) { vm_prot_t old_prot; - vm_map_clip_end(map, current, end); - old_prot = current->protection; if (set_max) current->protection = @@ -2289,14 +2418,27 @@ static void vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) { vm_object_t object; - vm_pindex_t offidxstart, offidxend, count; + vm_pindex_t offidxstart, offidxend, count, size, size1; +#ifdef INVARIANTS + swap_check_reserve(); +#endif vm_map_entry_unlink(map, entry); - map->size -= entry->end - entry->start; + object = entry->object.vm_object; + size = entry->end - entry->start; + map->size -= size; + + if (entry->uip != NULL) { + swap_release_by_uid(size, entry->uip); + uifree(entry->uip); + } if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && - (object = entry->object.vm_object) != NULL) { - count = OFF_TO_IDX(entry->end - entry->start); + (object != NULL)) { + KASSERT(!entry->uip || !object->uip || + (entry->eflags & MAP_ENTRY_NEEDS_COPY), + ("OVERCOMMIT vm_map_entry_delete: both uip %p", entry)); + count = OFF_TO_IDX(size); offidxstart = OFF_TO_IDX(entry->offset); offidxend = offidxstart + count; VM_OBJECT_LOCK(object); @@ -2308,14 +2450,27 @@ vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) if (object->type == OBJT_SWAP) swap_pager_freespace(object, offidxstart, count); if (offidxend >= object->size && - offidxstart < object->size) + offidxstart < object->size) { + size1 = object->size; object->size = offidxstart; + if (object->uip) { + size1 -= object->size; + KASSERT(object->charge >= ptoa(size1), + ("vm_map_entry_delete: object->charge < 0")); + swap_release_by_uid(ptoa(size1), object->uip); + object->charge -= ptoa(size1); + } } } VM_OBJECT_UNLOCK(object); vm_object_deallocate(object); } vm_map_entry_dispose(map, entry); + +#ifdef INVARIANTS + swap_check_reserve(); +#endif } /* @@ -2481,9 +2635,13 @@ vm_map_copy_entry( vm_map_t src_map, vm_map_t dst_map, vm_map_entry_t src_entry, - vm_map_entry_t dst_entry) + vm_map_entry_t dst_entry, + vm_pindex_t *fork_charge) { vm_object_t src_object; + vm_offset_t size; + struct uidinfo *uip; + int charged; if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) return; @@ -2504,8 +2662,10 @@ vm_map_copy_entry( /* * Make a copy of the object. 
*/ + size = src_entry->end - src_entry->start; if ((src_object = src_entry->object.vm_object) != NULL) { VM_OBJECT_LOCK(src_object); + charged = ENTRY_CHARGED(src_entry); if ((src_object->handle == NULL) && (src_object->type == OBJT_DEFAULT || src_object->type == OBJT_SWAP)) { @@ -2517,14 +2677,39 @@ vm_map_copy_entry( } vm_object_reference_locked(src_object); vm_object_clear_flag(src_object, OBJ_ONEMAPPING); + if (src_entry->uip != NULL && + !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { + KASSERT(src_object->uip == NULL, + ("OVERCOMMIT: vm_map_copy_entry: uip %p", + src_object)); + src_object->uip = src_entry->uip; + src_object->charge = size; + } VM_OBJECT_UNLOCK(src_object); dst_entry->object.vm_object = src_object; + if (charged) { + uip = curthread->td_ucred->cr_ruidinfo; + uihold(uip); + dst_entry->uip = uip; + *fork_charge += size; + if (!(src_entry->eflags & + MAP_ENTRY_NEEDS_COPY)) { + uihold(uip); + src_entry->uip = uip; + *fork_charge += size; + } + } src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->offset = src_entry->offset; } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; + if (src_entry->uip) { + dst_entry->uip = curthread->td_ucred->cr_ruidinfo; + uihold(dst_entry->uip); + *fork_charge += size; + } } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, @@ -2581,7 +2766,7 @@ vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, * The source map must not be locked. */ struct vmspace * -vmspace_fork(struct vmspace *vm1) +vmspace_fork(struct vmspace *vm1, vm_pindex_t *fork_charge) { struct vmspace *vm2; vm_map_t old_map = &vm1->vm_map; @@ -2591,7 +2776,6 @@ vmspace_fork(struct vmspace *vm1) vm_object_t object; vm_map_lock(old_map); - vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset); if (vm2 == NULL) goto unlock_and_return; @@ -2621,6 +2805,9 @@ vmspace_fork(struct vmspace *vm1) atop(old_entry->end - old_entry->start)); old_entry->object.vm_object = object; old_entry->offset = 0; + object->uip = old_entry->uip; + object->charge = old_entry->end - old_entry->start; + old_entry->uip = NULL; } /* @@ -2641,6 +2828,12 @@ vmspace_fork(struct vmspace *vm1) } VM_OBJECT_LOCK(object); vm_object_clear_flag(object, OBJ_ONEMAPPING); + if (old_entry->uip) { + KASSERT(object->uip == NULL, ("vmspace_fork both uip")); + object->uip = old_entry->uip; + object->charge = old_entry->end - old_entry->start; + old_entry->uip = NULL; + } VM_OBJECT_UNLOCK(object); /* @@ -2681,7 +2874,7 @@ vmspace_fork(struct vmspace *vm1) new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, - new_entry); + new_entry, fork_charge); break; } old_entry = old_entry->next; @@ -2807,6 +3000,7 @@ vm_map_growstack(struct proc *p, vm_offset_t addr) size_t grow_amount, max_grow; rlim_t stacklim, vmemlim; int is_procstack, rv; + struct uidinfo *uip; Retry: PROC_LOCK(p); @@ -2972,13 +3166,17 @@ Retry: } grow_amount = addr - stack_entry->end; - + uip = stack_entry->uip; + if (uip == NULL && stack_entry->object.vm_object != NULL) + uip = stack_entry->object.vm_object->uip; + if (uip && !swap_reserve_by_uid(grow_amount, uip)) + rv = KERN_NO_SPACE; /* Grow the underlying object if applicable. 
*/ - if (stack_entry->object.vm_object == NULL || - vm_object_coalesce(stack_entry->object.vm_object, - stack_entry->offset, - (vm_size_t)(stack_entry->end - stack_entry->start), - (vm_size_t)grow_amount)) { + else if (stack_entry->object.vm_object == NULL || + vm_object_coalesce(stack_entry->object.vm_object, + stack_entry->offset, + (vm_size_t)(stack_entry->end - stack_entry->start), + (vm_size_t)grow_amount)) { map->size += (addr - stack_entry->end); /* Update the current entry. */ stack_entry->end = addr; @@ -3051,12 +3249,18 @@ vmspace_unshare(struct proc *p) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; + vm_pindex_t fork_charge; if (oldvmspace->vm_refcnt == 1) return (0); - newvmspace = vmspace_fork(oldvmspace); + fork_charge = 0; + newvmspace = vmspace_fork(oldvmspace, &fork_charge); if (newvmspace == NULL) return (ENOMEM); + if (!swap_reserve_by_uid(fork_charge, p->p_ucred->cr_ruidinfo)) { + vmspace_free(newvmspace); + return (ENOMEM); + } PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); @@ -3102,6 +3306,9 @@ vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; + vm_object_t eobject; + struct uidinfo *uip; + vm_pindex_t size; RetryLookup:; /* @@ -3172,7 +3379,7 @@ RetryLookup:; *wired = (entry->wired_count != 0); if (*wired) prot = fault_type = entry->protection; - + size = entry->end - entry->start; /* * If the entry was copy-on-write, we either ... */ @@ -3194,11 +3401,46 @@ RetryLookup:; if (vm_map_lock_upgrade(map)) goto RetryLookup; +#ifdef INVARIANTS + swap_check_reserve(); +#endif + if (entry->uip == NULL) { + /* + * The debugger owner is charged for + * the memory. + */ + uip = curthread->td_ucred->cr_ruidinfo; + uihold(uip); + if (!swap_reserve_by_uid(size, uip)) { + uifree(uip); + vm_map_unlock(map); + return (KERN_RESOURCE_SHORTAGE); + } + entry->uip = uip; + } vm_object_shadow( &entry->object.vm_object, &entry->offset, - atop(entry->end - entry->start)); + atop(size)); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; + eobject = entry->object.vm_object; + if (eobject->uip != NULL) { + /* + * The object was not shadowed. + */ + swap_release_by_uid(size, entry->uip); + uifree(entry->uip); + entry->uip = NULL; + } else { + VM_OBJECT_LOCK(eobject); + eobject->uip = entry->uip; + eobject->charge = size; + VM_OBJECT_UNLOCK(eobject); + entry->uip = NULL; + } +#ifdef INVARIANTS + swap_check_reserve(); +#endif vm_map_lock_downgrade(map); } else { @@ -3218,10 +3460,20 @@ RetryLookup:; if (vm_map_lock_upgrade(map)) goto RetryLookup; entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, - atop(entry->end - entry->start)); + atop(size)); entry->offset = 0; + if (entry->uip != NULL) { + VM_OBJECT_LOCK(entry->object.vm_object); + entry->object.vm_object->uip = entry->uip; + entry->object.vm_object->charge = size; + VM_OBJECT_UNLOCK(entry->object.vm_object); + entry->uip = NULL; + } vm_map_lock_downgrade(map); } +#ifdef INVARIANTS + swap_check_reserve(); +#endif /* * Return the object/offset from this entry. 
If the entry was @@ -3411,9 +3663,15 @@ DB_SHOW_COMMAND(map, vm_map_print) db_indent -= 2; } } else { + if (entry->uip) + db_printf(", uip %d", entry->uip->ui_uid); db_printf(", object=%p, offset=0x%jx", (void *)entry->object.vm_object, (uintmax_t)entry->offset); + if (entry->object.vm_object && entry->object.vm_object->uip) + db_printf(", obj uip %d charge %llx", + entry->object.vm_object->uip->ui_uid, + entry->object.vm_object->charge); if (entry->eflags & MAP_ENTRY_COW) db_printf(", copy (%s)", (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index 86efc6b..f704852 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -115,6 +115,7 @@ struct vm_map_entry { vm_inherit_t inheritance; /* inheritance */ int wired_count; /* can be paged if = 0 */ vm_pindex_t lastr; /* last read */ + struct uidinfo *uip; /* tmp storage for creator ref */ }; #define MAP_ENTRY_NOSYNC 0x0001 @@ -314,6 +315,8 @@ long vmspace_wired_count(struct vmspace *vmspace); #define MAP_PREFAULT_MADVISE 0x0200 /* from (user) madvise request */ #define MAP_STACK_GROWS_DOWN 0x1000 #define MAP_STACK_GROWS_UP 0x2000 +#define MAP_ACC_CHARGED 0x4000 +#define MAP_ACC_NO_CHARGE 0x8000 /* * vm_fault option flags diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index ad9b34e..44e3dee 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -646,6 +646,8 @@ mprotect(td, uap) return (0); case KERN_PROTECTION_FAILURE: return (EACCES); + case KERN_RESOURCE_SHORTAGE: + return (ENOMEM); } return (EINVAL); } diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 21c2fc9..fb977cc 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -80,6 +80,7 @@ __FBSDID("$FreeBSD: src/sys/vm/vm_object.c,v 1.392 2008/02/26 17:16:48 alc Exp $ #include #include #include +#include #include #include @@ -222,6 +223,8 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) object->generation = 1; object->ref_count = 1; object->flags = 0; + object->uip = NULL; + object->charge = 0; if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) object->flags = OBJ_ONEMAPPING; object->pg_color = 0; @@ -695,6 +698,19 @@ vm_object_terminate(vm_object_t object) mtx_unlock(&vm_object_list_mtx); /* + * Release the allocation charge. + */ + if (object->uip != NULL) { + KASSERT(object->type == OBJT_DEFAULT || + object->type == OBJT_SWAP, + ("vm_object_terminate: non-swap obj %p has uip", + object)); + swap_release_by_uid(object->charge, object->uip); + object->charge = 0; + uifree(object->uip); + object->uip = NULL; + } + /* * Free the space for the object. */ uma_zfree(obj_zone, object); @@ -1351,6 +1367,14 @@ vm_object_split(vm_map_entry_t entry) orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } + if (orig_object->uip != NULL) { + new_object->uip = orig_object->uip; + uihold(orig_object->uip); + new_object->charge = ptoa(size); + KASSERT(orig_object->charge >= ptoa(size), + ("orig_object->charge < 0")); + orig_object->charge -= ptoa(size); + } new_object->flags |= orig_object->flags & OBJ_NEEDGIANT; retry: if ((m = TAILQ_FIRST(&orig_object->memq)) != NULL) { @@ -1762,6 +1786,13 @@ vm_object_collapse(vm_object_t object) * and no object references within it, all that is * necessary is to dispose of it. 
*/ + if (backing_object->uip) { + swap_release_by_uid(backing_object->charge, + backing_object->uip); + backing_object->charge = 0; + uifree(backing_object->uip); + backing_object->uip = NULL; + } KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object)); VM_OBJECT_UNLOCK(backing_object); @@ -2003,13 +2034,20 @@ vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); - } +/* if (prev_object->uip) { + KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), + ("prev_object->charge < 0")); + prev_object->charge -= ptoa(prev_object->size - next_pindex); + } +*/ } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; + if (prev_object->uip) + prev_object->charge += ptoa(next_size); VM_OBJECT_UNLOCK(prev_object); return (TRUE); @@ -2152,9 +2190,10 @@ DB_SHOW_COMMAND(object, vm_object_print_static) return; db_iprintf( - "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x\n", + "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x uip %d charge %llx\n", object, (int)object->type, (uintmax_t)object->size, - object->resident_page_count, object->ref_count, object->flags); + object->resident_page_count, object->ref_count, object->flags, + object->uip ? object->uip->ui_uid : -1, object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 7f54a1f..2957f49 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -133,6 +133,8 @@ struct vm_object { int swp_bcount; } swp; } un_pager; + struct uidinfo *uip; + vm_pindex_t charge; }; /* diff --git a/usr.bin/limits/limits.c b/usr.bin/limits/limits.c index 82d18e5..6c31c0c 100644 --- a/usr.bin/limits/limits.c +++ b/usr.bin/limits/limits.c @@ -86,7 +86,8 @@ static struct { { " maxprocesses%-4s %8s", "\n", 1 }, { " openfiles%-4s %8s", "\n", 1 }, { " sbsize%-4s %8s", " bytes\n", 1 }, - { " vmemoryuse%-4s %8s", " kB\n", 1024 } + { " vmemoryuse%-4s %8s", " kB\n", 1024 }, + { " swapuse%-4s %8s", " kB\n", 1024 } } }, { "sh", "unlimited", "", " -H", " -S", "", @@ -101,7 +102,8 @@ static struct { { "ulimit%s -u %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, - { "ulimit%s -v %s", ";\n", 1024 } + { "ulimit%s -v %s", ";\n", 1024 }, + { "ulimit%s -w %s", ";\n", 1024 } } }, { "csh", "unlimited", "", " -h", "", NULL, @@ -116,7 +118,8 @@ static struct { { "limit%s maxproc %s", ";\n", 1 }, { "limit%s openfiles %s", ";\n", 1 }, { "limit%s sbsize %s", ";\n", 1 }, - { "limit%s vmemoryuse %s", ";\n", 1024 } + { "limit%s vmemoryuse %s", ";\n", 1024 }, + { "limit%s swapuse %s", ";\n", 1024 } } }, { "bash|bash2", "unlimited", "", " -H", " -S", "", @@ -131,7 +134,8 @@ static struct { { "ulimit%s -u %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, - { "ulimit%s -v %s", ";\n", 1024 } + { "ulimit%s -v %s", ";\n", 1024 }, + { "ulimit%s -w %s", ";\n", 1024 } } }, { "tcsh", "unlimited", "", " -h", "", NULL, @@ -146,7 +150,8 @@ static struct { { "limit%s maxproc %s", ";\n", 1 }, { "limit%s descriptors %s", ";\n", 1 }, { "limit%s sbsize %s", ";\n", 1 }, - { "limit%s vmemoryuse %s", ";\n", 1024 } + { "limit%s vmemoryuse %s", ";\n", 1024 }, + { "limit%s swapuse %s", ";\n", 1024 } } }, { 
"ksh|pdksh", "unlimited", "", " -H", " -S", "", @@ -161,7 +166,8 @@ static struct { { "ulimit%s -p %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, - { "ulimit%s -v %s", ";\n", 1024 } + { "ulimit%s -v %s", ";\n", 1024 }, + { "ulimit%s -w %s", ";\n", 1024 } } }, { "zsh", "unlimited", "", " -H", " -S", "", @@ -176,7 +182,8 @@ static struct { { "ulimit%s -u %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, - { "ulimit%s -v %s", ";\n", 1024 } + { "ulimit%s -v %s", ";\n", 1024 }, + { "ulimit%s -w %s", ";\n", 1024 } } }, { "rc|es", "unlimited", "", " -h", "", NULL, @@ -191,7 +198,8 @@ static struct { { "limit%s processes %s", ";\n", 1 }, { "limit%s descriptors %s", ";\n", 1 }, { "limit%s sbsize %s", ";\n", 1 }, - { "limit%s vmemoryuse %s", ";\n", 1024 } + { "limit%s vmemoryuse %s", ";\n", 1024 }, + { "limit%s swapuse %s", ";\n", 1024 } } }, { NULL, NULL, NULL, NULL, NULL, NULL, @@ -212,8 +220,9 @@ static struct { { "memorylocked", login_getcapsize }, { "maxproc", login_getcapnum }, { "openfiles", login_getcapnum }, - { "sbsize", login_getcapsize }, - { "vmemoryuse", login_getcapsize } + { "sbsize", login_getcapsize }, + { "vmemoryuse", login_getcapsize }, + { "swapuse", login_getcapsize } }; /* @@ -224,7 +233,7 @@ static struct { * to be modified accordingly! */ -#define RCS_STRING "tfdscmlunbv" +#define RCS_STRING "tfdscmlunbvw" static rlim_t resource_num(int which, int ch, const char *str); static void usage(void); @@ -261,7 +270,7 @@ main(int argc, char *argv[]) } optarg = NULL; - while ((ch = getopt(argc, argv, ":EeC:U:BSHab:c:d:f:l:m:n:s:t:u:v:")) != -1) { + while ((ch = getopt(argc, argv, ":EeC:U:BSHab:c:d:f:l:m:n:s:t:u:v:w:")) != -1) { switch(ch) { case 'a': doall = 1; @@ -475,7 +484,7 @@ static void usage(void) { (void)fprintf(stderr, -"usage: limits [-C class|-U user] [-eaSHBE] [-bcdflmnstuv [val]] [[name=val ...] cmd]\n"); +"usage: limits [-C class|-U user] [-eaSHBE] [-bcdflmnstuvw [val]] [[name=val ...] cmd]\n"); exit(EXIT_FAILURE); } @@ -547,6 +556,7 @@ resource_num(int which, int ch, const char *str) case RLIMIT_MEMLOCK: case RLIMIT_SBSIZE: case RLIMIT_VMEM: + case RLIMIT_SWAP: errno = 0; res = 0; while (*s) {