diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index fee3caf..8390526 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -742,7 +742,7 @@ trap_pfault(frame, usermode) PROC_UNLOCK(p); /* Fault in the user page: */ - rv = vm_fault(map, va, ftype, + rv = vm_fault(map, eva, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); @@ -754,7 +754,7 @@ trap_pfault(frame, usermode) * Don't have to worry about process locking or stacks in the * kernel. */ - rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); + rv = vm_fault(map, eva, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); diff --git a/sys/amd64/amd64/uio_machdep.c b/sys/amd64/amd64/uio_machdep.c index d3897cf..ccbec27 100644 --- a/sys/amd64/amd64/uio_machdep.c +++ b/sys/amd64/amd64/uio_machdep.c @@ -94,8 +94,10 @@ uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio) error = copyout(cp, iov->iov_base, cnt); else error = copyin(iov->iov_base, cp, cnt); - if (error) + if (error) { + error = uiorestart(error, uio, cnt); goto out; + } break; case UIO_SYSSPACE: if (uio->uio_rw == UIO_READ) @@ -106,10 +108,7 @@ uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio) case UIO_NOCOPY: break; } - iov->iov_base = (char *)iov->iov_base + cnt; - iov->iov_len -= cnt; - uio->uio_resid -= cnt; - uio->uio_offset += cnt; + fwduio(uio, cnt); offset += cnt; n -= cnt; } diff --git a/sys/conf/files b/sys/conf/files index 6500775..bfbe428 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1933,6 +1933,7 @@ kern/kern_poll.c optional device_polling kern/kern_priv.c standard kern/kern_proc.c standard kern/kern_prot.c standard +kern/kern_rangelock.c standard kern/kern_resource.c standard kern/kern_rmlock.c standard kern/kern_rwlock.c standard diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c index 76237fb..0d0ef86 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c @@ -91,7 +91,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, diff --git a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c index a54598c..1c3953d 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c @@ -90,7 +90,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #define MAX_SCHEDULE_TIMEOUT 300 diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.c b/sys/dev/cxgb/ulp/tom/cxgb_vm.c deleted file mode 100644 index e7a3893..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_vm.c +++ /dev/null @@ -1,166 +0,0 @@ -/************************************************************************** - -Copyright (c) 2007-2008, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -***************************************************************************/ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -/* - * This routine takes a user's map, array of pages, number of pages, and flags - * and then does the following: - * - validate that the user has access to those pages (flags indicates read - * or write) - if not fail - * - validate that count is enough to hold range number of pages - if not fail - * - fault in any non-resident pages - * - if the user is doing a read force a write fault for any COWed pages - * - if the user is doing a read mark all pages as dirty - * - hold all pages - */ -int -vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp, - int count, vm_prot_t prot) -{ - vm_offset_t end, va; - int faults, rv; - pmap_t pmap; - vm_page_t m, *pages; - - pmap = vm_map_pmap(map); - pages = mp; - addr &= ~PAGE_MASK; - /* - * Check that virtual address range is legal - * This check is somewhat bogus as on some architectures kernel - * and user do not share VA - however, it appears that all FreeBSD - * architectures define it - */ - end = addr + (count * PAGE_SIZE); - if (end > VM_MAXUSER_ADDRESS) { - log(LOG_WARNING, "bad address passed to vm_fault_hold_user_pages"); - return (EFAULT); - } - - /* - * First optimistically assume that all pages are resident - * (and R/W if for write) if so just mark pages as held (and - * dirty if for write) and return - */ - vm_page_lock_queues(); - for (pages = mp, faults = 0, va = addr; va < end; - va += PAGE_SIZE, pages++) { - /* - * page queue mutex is recursable so this is OK - * it would be really nice if we had an unlocked - * version of this so we were only acquiring the - * pmap lock 1 time as opposed to potentially - * many dozens of times - */ - *pages = m = pmap_extract_and_hold(pmap, va, prot); - if (m == NULL) { - faults++; - continue; - } - /* - * Preemptively mark dirty - the pages - * will never have the modified bit set if - * they are only changed via DMA - */ - if (prot & VM_PROT_WRITE) - vm_page_dirty(m); - - } - vm_page_unlock_queues(); - - if (faults == 0) - return (0); - - /* - * Pages either have insufficient permissions or are not present - * trigger a fault where neccessary - * - */ - rv = 0; - for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) { - /* - * Account for a very narrow race where the page may be - * taken away from us before it is held - */ - while (*pages == NULL) { - rv = vm_fault(map, va, prot, - (prot & VM_PROT_WRITE) ? 
VM_FAULT_DIRTY : VM_FAULT_NORMAL); - if (rv) - goto error; - *pages = pmap_extract_and_hold(pmap, va, prot); - } - } - return (0); -error: - log(LOG_WARNING, - "vm_fault bad return rv=%d va=0x%zx\n", rv, va); - vm_page_lock_queues(); - for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) - if (*pages) { - vm_page_unhold(*pages); - *pages = NULL; - } - vm_page_unlock_queues(); - return (EFAULT); -} - -void -vm_fault_unhold_pages(vm_page_t *mp, int count) -{ - - KASSERT(count >= 0, ("negative count %d", count)); - vm_page_lock_queues(); - while (count--) { - vm_page_unhold(*mp); - mp++; - } - vm_page_unlock_queues(); -} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.h b/sys/dev/cxgb/ulp/tom/cxgb_vm.h deleted file mode 100644 index 7532e20..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_vm.h +++ /dev/null @@ -1,39 +0,0 @@ -/************************************************************************** - -Copyright (c) 2007-2008, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -$FreeBSD$ - -***************************************************************************/ -#ifndef CXGB_VM_H_ -#define CXGB_VM_H_ - -int vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, - vm_page_t *mp, int count, vm_prot_t prot); -void vm_fault_unhold_pages(vm_page_t *mp, int count); - -#endif diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index e967104..153b7da 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -820,7 +820,7 @@ trap_pfault(frame, usermode, eva) PROC_UNLOCK(p); /* Fault in the user page: */ - rv = vm_fault(map, va, ftype, + rv = vm_fault(map, eva, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); @@ -832,7 +832,7 @@ trap_pfault(frame, usermode, eva) * Don't have to worry about process locking or stacks in the * kernel. 
*/ - rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); + rv = vm_fault(map, eva, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); diff --git a/sys/i386/i386/uio_machdep.c b/sys/i386/i386/uio_machdep.c index 3558ec0..a316b2d 100644 --- a/sys/i386/i386/uio_machdep.c +++ b/sys/i386/i386/uio_machdep.c @@ -97,6 +97,7 @@ uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio) else error = copyin(iov->iov_base, cp, cnt); if (error) { + error = uiorestart(error, uio, cnt); sf_buf_free(sf); sched_unpin(); goto out; @@ -113,10 +114,7 @@ uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio) } sf_buf_free(sf); sched_unpin(); - iov->iov_base = (char *)iov->iov_base + cnt; - iov->iov_len -= cnt; - uio->uio_resid -= cnt; - uio->uio_offset += cnt; + fwduio(uio, cnt); offset += cnt; n -= cnt; } diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c new file mode 100644 index 0000000..8401ce5 --- /dev/null +++ b/sys/kern/kern_rangelock.c @@ -0,0 +1,165 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +uma_zone_t rl_entry_zone; + +static void +rangelock_sys_init(void) +{ + + rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, rangelock_sys_init, NULL); + +void +rangelock_init(struct rangelock *lock) +{ + + TAILQ_INIT(&lock->rl_waiters); + lock->rl_currdep = NULL; +} + +void +rangelock_destroy(struct rangelock *lock) +{ + + KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters")); +} + +static int +rangelock_incompatible(const struct rl_q_entry *e1, const struct rl_q_entry *e2) +{ + + if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ && + (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ) + return (0); +#define IN_RANGE(a, e) (a <= e->rl_q_start && a < e->rl_q_end) + if (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) || + IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1)) + return (1); +#undef IN_RANGE + return (0); +} + +static void +rangelock_calc_block(struct rangelock *lock) +{ + struct rl_q_entry *entry, *entry1, *whead; + + if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) && + lock->rl_currdep != NULL) + lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link); + for (entry = lock->rl_currdep; entry; + entry = TAILQ_NEXT(entry, rl_q_link)) { + TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) { + if (rangelock_incompatible(entry, entry1)) + goto out; + if (entry1 == entry) + break; + } + } +out: + lock->rl_currdep = entry; + TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) { + if (whead == lock->rl_currdep) + break; + if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) { + whead->rl_q_flags |= RL_LOCK_GRANTED; + wakeup(whead); + } + } +} + +static void +rangelock_unlock_vp_locked(struct vnode *vp, struct rl_q_entry *entry) +{ + + ASSERT_VI_LOCKED(vp, "rangelock"); + KASSERT(entry != vp->v_rl.rl_currdep, ("stuck currdep")); + TAILQ_REMOVE(&vp->v_rl.rl_waiters, entry, rl_q_link); + rangelock_calc_block(&vp->v_rl); + VI_UNLOCK(vp); + uma_zfree(rl_entry_zone, entry); +} + +void +rangelock_unlock(struct vnode *vp, void *cookie) +{ + struct rl_q_entry *entry; + + entry = cookie; + VI_LOCK(vp); + rangelock_unlock_vp_locked(vp, entry); +} + +void * +rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base, size_t len) +{ + struct rl_q_entry *entry; + + entry = cookie; + 
VI_LOCK(vp); + KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("XXX")); + KASSERT(entry->rl_q_start == base, ("XXX")); + KASSERT(entry->rl_q_end >= base + len, ("XXX")); + if (entry->rl_q_end == base + len) { + rangelock_unlock_vp_locked(vp, cookie); + return (NULL); + } + entry->rl_q_end = base + len; + rangelock_calc_block(&vp->v_rl); + VI_UNLOCK(vp); + return (cookie); +} + +static void * +rangelock_enqueue(struct vnode *vp, struct rl_q_entry *entry) +{ + + VI_LOCK(vp); + TAILQ_INSERT_TAIL(&vp->v_rl.rl_waiters, entry, rl_q_link); + if (vp->v_rl.rl_currdep == NULL) + vp->v_rl.rl_currdep = entry; + rangelock_calc_block(&vp->v_rl); + while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) + msleep(entry, &vp->v_interlock, 0, "range", 0); + VI_UNLOCK(vp); + return (entry); +} + +void * +rangelock_rlock(struct vnode *vp, off_t base, size_t len) +{ + struct rl_q_entry *entry; + + entry = uma_zalloc(rl_entry_zone, M_WAITOK); + entry->rl_q_flags = RL_LOCK_READ; + entry->rl_q_start = base; + entry->rl_q_end = base + len; + return (rangelock_enqueue(vp, entry)); +} + +void * +rangelock_wlock(struct vnode *vp, off_t base, size_t len) +{ + struct rl_q_entry *entry; + + entry = uma_zalloc(rl_entry_zone, M_WAITOK); + entry->rl_q_flags = RL_LOCK_WRITE; + entry->rl_q_start = base; + entry->rl_q_end = base + len; + return (rangelock_enqueue(vp, entry)); +} diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index ce1afd2..8996771 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #ifdef ZERO_COPY_SOCKETS #include #include @@ -138,7 +139,8 @@ uiomove(void *cp, int n, struct uio *uio) int error = 0; int save = 0; - KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE || + uio->uio_rw == UIO_NOCOPY, ("uiomove: mode")); KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, ("uiomove proc")); @@ -164,12 +166,18 @@ uiomove(void *cp, int n, struct uio *uio) case UIO_USERSPACE: if (ticks - PCPU_GET(switchticks) >= hogticks) uio_yield(); + if (td->td_pflags & TDP_VMUIODEADLK) { + td->td_iov_base = (uintptr_t)iov->iov_base; + td->td_iov_len = iov->iov_len; + } if (uio->uio_rw == UIO_READ) error = copyout(cp, iov->iov_base, cnt); else error = copyin(iov->iov_base, cp, cnt); - if (error) + if (error) { + error = uiorestart(error, uio, cnt); goto out; + } break; case UIO_SYSSPACE: @@ -181,10 +189,7 @@ uiomove(void *cp, int n, struct uio *uio) case UIO_NOCOPY: break; } - iov->iov_base = (char *)iov->iov_base + cnt; - iov->iov_len -= cnt; - uio->uio_resid -= cnt; - uio->uio_offset += cnt; + fwduio(uio, cnt); cp = (char *)cp + cnt; n -= cnt; } @@ -194,6 +199,28 @@ out: return (error); } +int +uiorestart(int error, struct uio *uio, u_int cnt) +{ + struct thread *td; + struct iovec *iov; + + td = uio->uio_td; + iov = uio->uio_iov; + + if (error == EFAULT && td->td_faultaddr != 0 && + (td->td_pflags & TDP_VMUIODEADLK)) { + KASSERT(td->td_faultaddr >= (uintptr_t)iov->iov_base && + td->td_faultaddr < (uintptr_t)iov->iov_base + cnt, + ("faultaddr %jx outside region %p %d\n", + (uintmax_t)td->td_faultaddr, + iov->iov_base, iov->iov_len)); + error = ERESTART; + fwduio(uio, td->td_faultaddr - (uintptr_t)iov->iov_base); + } + return (error); +} + /* * Wrapper for uiomove() that validates the arguments against a known-good * kernel buffer. 
Currently, uiomove accepts a signed (n) argument, which @@ -544,6 +571,7 @@ copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop) uio->uio_segflg = UIO_USERSPACE; uio->uio_offset = -1; uio->uio_resid = 0; + uio->uio_flags = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - uio->uio_resid) { free(uio, M_IOV); @@ -569,3 +597,25 @@ cloneuio(struct uio *uiop) bcopy(uiop->uio_iov, uio->uio_iov, iovlen); return (uio); } + +void +fwduio(struct uio *uio, int cnt) +{ + + uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + cnt; + uio->uio_iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; +} + +void +copyuio(struct uio *dst, struct uio *src) +{ + struct iovec *dst_iovec; + + dst_iovec = dst->uio_iov; + *dst = *src; + dst->uio_iov = dst_iovec; + bcopy(src->uio_iov, dst->uio_iov, src->uio_iovcnt * + sizeof(struct iovec)); +} diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index fb6b35d..1b59cc5 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -768,6 +768,8 @@ pipe_build_write_buffer(wpipe, uio) * should not be performed outside of this loop. */ race: + KASSERT(!(curthread->td_pflags & TDP_VMUIODEADLK), + ("pipe_build_write_buffer: TDP_VMUIODEADLK")); if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) { vm_page_lock_queues(); for (j = 0; j < i; j++) diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index 0ed7ce4..29bf927 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -211,11 +211,14 @@ proc_sstep(struct thread *td) int proc_rwmem(struct proc *p, struct uio *uio) { + struct vmspace *vm; vm_map_t map; - vm_object_t backing_object, object = NULL; - vm_offset_t pageno = 0; /* page number */ + vm_page_t m; + struct thread *td; + vm_offset_t pageno, uva; vm_prot_t reqprot; - int error, fault_flags, writing; + int error, fault_flags, page_offset, writing, save_vmuiodeadlk; + u_int len; /* * Assert that someone has locked this vmspace. (Should be @@ -228,30 +231,22 @@ proc_rwmem(struct proc *p, struct uio *uio) /* * The map we want... */ - map = &p->p_vmspace->vm_map; + vm = vmspace_acquire_ref(p); + map = &vm->vm_map; + td = curthread; writing = uio->uio_rw == UIO_WRITE; reqprot = writing ? (VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE) : VM_PROT_READ; fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL; + save_vmuiodeadlk = td->td_pflags & TDP_VMUIODEADLK; + td->td_pflags &= ~TDP_VMUIODEADLK; /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? */ do { - vm_map_t tmap; - vm_offset_t uva; - int page_offset; /* offset into page */ - vm_map_entry_t out_entry; - vm_prot_t out_prot; - boolean_t wired; - vm_pindex_t pindex; - u_int len; - vm_page_t m; - - object = NULL; - uva = (vm_offset_t)uio->uio_offset; /* @@ -268,57 +263,13 @@ proc_rwmem(struct proc *p, struct uio *uio) /* * Fault the page on behalf of the process */ - error = vm_fault(map, pageno, reqprot, fault_flags); + error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m); if (error) { error = EFAULT; break; } /* - * Now we need to get the page. out_entry, out_prot, wired, - * and single_use aren't used. One would think the vm code - * would be a *bit* nicer... We use tmap because - * vm_map_lookup() can change the map argument. 
- */ - tmap = map; - error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry, - &object, &pindex, &out_prot, &wired); - if (error) { - error = EFAULT; - break; - } - VM_OBJECT_LOCK(object); - while ((m = vm_page_lookup(object, pindex)) == NULL && - !writing && - (backing_object = object->backing_object) != NULL) { - /* - * Allow fallback to backing objects if we are reading. - */ - VM_OBJECT_LOCK(backing_object); - pindex += OFF_TO_IDX(object->backing_object_offset); - VM_OBJECT_UNLOCK(object); - object = backing_object; - } - VM_OBJECT_UNLOCK(object); - if (m == NULL) { - vm_map_lookup_done(tmap, out_entry); - error = EFAULT; - break; - } - - /* - * Hold the page in memory. - */ - vm_page_lock_queues(); - vm_page_hold(m); - vm_page_unlock_queues(); - - /* - * We're done with tmap now. - */ - vm_map_lookup_done(tmap, out_entry); - - /* * Now do the i/o move. */ error = uiomove_fromphys(&m, page_offset, len, uio); @@ -331,6 +282,8 @@ proc_rwmem(struct proc *p, struct uio *uio) vm_page_unlock_queues(); } while (error == 0 && uio->uio_resid > 0); + vmspace_free(vm); + td->td_pflags |= save_vmuiodeadlk; return (error); } diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index a3c3e97..8d2c106 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1112,6 +1112,7 @@ bundirty(struct buf *bp) * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; + bp->b_xflags &= ~BX_NEWALLOC; } /* @@ -1440,6 +1441,7 @@ brelse(struct buf *bp) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); + bp->b_xflags &= ~BX_NEWALLOC; if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ @@ -1529,6 +1531,7 @@ bqrelse(struct buf *bp) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + bp->b_xflags &= ~BX_NEWALLOC; if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); /* unlock */ @@ -3885,6 +3888,8 @@ vmapbuf(struct buf *bp) * to work for the kernland address space (see: sparc64 port). */ retry: + KASSERT(!(curthread->td_pflags & TDP_VMUIODEADLK), + ("vmapbuf: TDP_VMUIODEADLK")); if (vm_fault_quick(addr >= bp->b_data ? addr : bp->b_data, prot) < 0) { vm_page_lock_queues(); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 8c26b13..e3867d6 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -871,6 +871,7 @@ vdestroy(struct vnode *vp) /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. 
*/ vp->v_op = NULL; #endif + rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); mtx_destroy(BO_MTX(bo)); @@ -1025,6 +1026,7 @@ alloc: if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } + rangelock_init(&vp->v_rl); *vpp = vp; return (0); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 3cc6f22..21fead6 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -60,8 +60,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include +#include +#include + #include static fo_rdwr_t vn_read; @@ -363,37 +367,64 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, int *aresid; struct thread *td; { - struct uio auio; - struct iovec aiov; + struct uio auio, auio_clone; + struct iovec aiov, aiov_clone; struct mount *mp; struct ucred *cred; - int error; + vm_page_t *m_hold; + void *rl_cookie; + int wired_pages, error; VFS_ASSERT_GIANT(vp->v_mount); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + error = 0; + + if ((ioflg & IO_NODELOCKED) == 0) { + if (rw == UIO_READ) + rl_cookie = rangelock_rlock(vp, offset, len); + else + rl_cookie = rangelock_wlock(vp, offset, len); + } else + rl_cookie = NULL; + + m_hold = NULL; + if (segflg == UIO_USERSPACE) { + m_hold = malloc(sizeof(vm_page_t) * (btoc(len) + 1), M_IOV, + M_WAITOK); + aiov_clone = aiov; + auio_clone = auio; + auio_clone.uio_iov = &aiov_clone; + error = vm_wireuio(&auio, m_hold, + round_page((vm_offset_t)base + len) - + trunc_page((vm_offset_t)base), &wired_pages); + if (error) { + free(m_hold, M_IOV); + goto out; + } + } + if ((ioflg & IO_NODELOCKED) == 0) { mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - return (error); + goto out_unwire; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else vn_lock(vp, LK_SHARED | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = base; - aiov.iov_len = len; - auio.uio_resid = len; - auio.uio_offset = offset; - auio.uio_segflg = segflg; - auio.uio_rw = rw; - auio.uio_td = td; - error = 0; #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) @@ -424,6 +455,14 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, vn_finished_write(mp); VOP_UNLOCK(vp, 0); } +out_unwire: + if (segflg == UIO_USERSPACE) { + vm_unwireuio(&auio_clone, m_hold, wired_pages); + free(m_hold, M_IOV); + } +out: + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); return (error); } @@ -485,68 +524,214 @@ vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred, return (error); } +static int uio_hold_pages = 12; +SYSCTL_INT(_vfs, OID_AUTO, uio_hold_pages, CTLFLAG_RW, &uio_hold_pages, 0, + "The max amount of held pages for one i/o chunk"); +static int uio_short = 128; +SYSCTL_INT(_vfs, OID_AUTO, uio_short, CTLFLAG_RW, &uio_short, 0, + "The length of the short i/o"); + +typedef int (*vn_chunk_func_t)(struct file *, struct uio *, struct ucred *, + int, int, struct thread *); + +static int +do_vn_rw_chunked(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, int ioflag, struct thread *td, vm_page_t *m_hold, + vn_chunk_func_t vn_chunk_func) +{ + struct uio *uio_clone; + int error, wire_bytes, io_chunk, total_cnt, cnt; + int first_chunk, 
wired_pages; + + if (uio->uio_segflg != UIO_USERSPACE || fp->f_vnode->v_type != VREG) + return (vn_chunk_func(fp, uio, active_cred, flags, ioflag, td)); + + uio_clone = cloneuio(uio); + KASSERT(!(td->td_pflags & TDP_VMUIODEADLK), + ("Nested TDP_VMUIODEADLK")); + td->td_pflags |= TDP_VMUIODEADLK; + td->td_faultaddr = 0; + error = vn_chunk_func(fp, uio, active_cred, flags, ioflag, td); + td->td_pflags &= ~TDP_VMUIODEADLK; + if (error != ERESTART || td->td_faultaddr == 0) + goto out; + + first_chunk = 1; + if (uio->uio_flags & UIO_ROLLBACK) { + cnt = uio_clone->uio_resid - uio->uio_resid; + copyuio(uio, uio_clone); + if (cnt > 0) { + uio->uio_rw = UIO_NOCOPY; + uiomove(NULL, cnt, uio); + uio->uio_rw = uio_clone->uio_rw; + first_chunk = 0; + } + } + while (uio->uio_resid > 0) { + io_chunk = min(uio_hold_pages * PAGE_SIZE, uio->uio_resid); /* XXXKIB */ + wire_bytes = round_page(io_chunk); + error = vm_wireuio(uio, m_hold, wire_bytes, &wired_pages); + if (error != 0) { + if (!first_chunk) + error = 0; + break; + } + copyuio(uio_clone, uio); + total_cnt = uio->uio_resid; + uio->uio_resid = io_chunk; + error = vn_chunk_func(fp, uio, active_cred, flags, ioflag, td); + vm_unwireuio(uio_clone, m_hold, wired_pages); + cnt = io_chunk - uio->uio_resid; + uio->uio_resid = total_cnt - cnt; + if (error != 0) { + if (!first_chunk) + error = 0; + break; + } + if (cnt == 0) + break; + first_chunk = 0; + } + out: + free(uio_clone, M_IOV); + return (error); +} + +static struct mtx * +vn_lock_foffset(struct file *fp) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + while (fp->f_vnread_flags & FOFFSET_LOCKED) { + fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vnread offlock", 0); + } + fp->f_vnread_flags |= FOFFSET_LOCKED; + mtx_unlock(mtxp); + return (mtxp); +} + +static void +vn_unlock_foffset(struct file *fp, struct mtx *mtxp) +{ + + mtx_lock(mtxp); + if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) + wakeup(&fp->f_vnread_flags); + fp->f_vnread_flags = 0; + mtx_unlock(mtxp); +} + +static inline int +vn_read_wired_chunk(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, int ioflag, struct thread *td) +{ + struct vnode *vp; + int error, vfslocked; + + vp = fp->f_vnode; + + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vn_lock(vp, LK_SHARED | LK_RETRY); + + ioflag |= sequential_heuristic(uio, fp); + +#ifdef MAC + error = mac_vnode_check_read(active_cred, fp->f_cred, vp); + if (error == 0) +#endif + error = VOP_READ(vp, uio, ioflag, fp->f_cred); + fp->f_nextoff = uio->uio_offset; + VOP_UNLOCK(vp, 0); + VFS_UNLOCK_GIANT(vfslocked); + return (error); +} + /* * File table vnode read routine. */ static int -vn_read(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; +vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; - int error, ioflag; + vm_page_t m_hold[uio_hold_pages]; struct mtx *mtxp; - int vfslocked; + void *rl_cookie; + int ioflag; + int error; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); - mtxp = NULL; - vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vp = fp->f_vnode; + /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. 
*/ if ((flags & FOF_OFFSET) == 0) { - mtxp = mtx_pool_find(mtxpool_sleep, fp); - mtx_lock(mtxp); - while(fp->f_vnread_flags & FOFFSET_LOCKED) { - fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags, mtxp, PUSER -1, - "vnread offlock", 0); - } - fp->f_vnread_flags |= FOFFSET_LOCKED; - mtx_unlock(mtxp); - vn_lock(vp, LK_SHARED | LK_RETRY); + mtxp = vn_lock_foffset(fp); uio->uio_offset = fp->f_offset; } else - vn_lock(vp, LK_SHARED | LK_RETRY); + mtxp = NULL; /* gcc */ + if (vp->v_type == VREG) + rl_cookie = rangelock_rlock(vp, uio->uio_offset, + uio->uio_resid); + else + rl_cookie = NULL; + error = do_vn_rw_chunked(fp, uio, active_cred, flags, ioflag, td, + m_hold, vn_read_wired_chunk); + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); + if ((flags & FOF_OFFSET) == 0) { + fp->f_offset = uio->uio_offset; + vn_unlock_foffset(fp, mtxp); + } + return (error); +} - ioflag |= sequential_heuristic(uio, fp); +static inline int +vn_write_wired_chunk(struct file *fp, struct uio *uio, + struct ucred *active_cred, int flags, int ioflag, struct thread *td) +{ + struct mount *mp; + struct vnode *vp; + int error, vfslocked; + mp = NULL; + vp = fp->f_vnode; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + if (vp->v_type == VREG) + bwillwrite(); + if (vp->v_type != VCHR) { + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + goto unlock; + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; + ioflag |= sequential_heuristic(uio, fp); #ifdef MAC - error = mac_vnode_check_read(active_cred, fp->f_cred, vp); + error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif - error = VOP_READ(vp, uio, ioflag, fp->f_cred); - if ((flags & FOF_OFFSET) == 0) { + error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); + if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; - mtx_lock(mtxp); - if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) - wakeup(&fp->f_vnread_flags); - fp->f_vnread_flags = 0; - mtx_unlock(mtxp); - } fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); + if (vp->v_type != VCHR) + vn_finished_write(mp); +unlock: VFS_UNLOCK_GIANT(vfslocked); return (error); } @@ -555,24 +740,17 @@ vn_read(fp, uio, active_cred, flags, td) * File table vnode write routine. 
*/ static int -vn_write(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; +vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; - struct mount *mp; - int error, ioflag; - int vfslocked; + vm_page_t m_hold[uio_hold_pages]; + void *rl_cookie; + int ioflag, error; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); vp = fp->f_vnode; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); - if (vp->v_type == VREG) - bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; @@ -583,27 +761,22 @@ vn_write(fp, uio, active_cred, flags, td) if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; - mp = NULL; - if (vp->v_type != VCHR && - (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - goto unlock; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if ((flags & FOF_OFFSET) == 0) - uio->uio_offset = fp->f_offset; - ioflag |= sequential_heuristic(uio, fp); -#ifdef MAC - error = mac_vnode_check_write(active_cred, fp->f_cred, vp); - if (error == 0) -#endif - error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); - if ((flags & FOF_OFFSET) == 0) - fp->f_offset = uio->uio_offset; - fp->f_nextoff = uio->uio_offset; - VOP_UNLOCK(vp, 0); - if (vp->v_type != VCHR) - vn_finished_write(mp); -unlock: - VFS_UNLOCK_GIANT(vfslocked); + if (vp->v_type == VREG) { + if ((ioflag & IO_APPEND) || !(flags & FOF_OFFSET)) + /* + * For appenders, punt and lock the whole + * range. It also protects f_offset. + */ + rl_cookie = rangelock_wlock(vp, 0, (size_t)-1); + else + rl_cookie = rangelock_wlock(vp, uio->uio_offset, + uio->uio_resid); + } else + rl_cookie = NULL; + error = do_vn_rw_chunked(fp, uio, active_cred, flags, ioflag, td, + m_hold, vn_write_wired_chunk); + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); return (error); } diff --git a/sys/modules/cxgb/tom/Makefile b/sys/modules/cxgb/tom/Makefile index 2b8750a..d2566fd 100644 --- a/sys/modules/cxgb/tom/Makefile +++ b/sys/modules/cxgb/tom/Makefile @@ -5,7 +5,7 @@ CXGB = ${.CURDIR}/../../../dev/cxgb KMOD= tom SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c -SRCS+= cxgb_ddp.c cxgb_vm.c cxgb_l2t.c cxgb_tcp_offload.c +SRCS+= cxgb_ddp.c cxgb_l2t.c cxgb_tcp_offload.c SRCS+= opt_compat.h opt_inet.h opt_inet6.h opt_ipsec.h opt_mac.h SRCS+= opt_tcpdebug.h opt_ddb.h opt_sched.h opt_global.h opt_ktr.h SRCS+= device_if.h bus_if.h pci_if.h diff --git a/sys/sys/buf.h b/sys/sys/buf.h index fb0d804..6623ccf 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -235,6 +235,7 @@ struct buf { */ #define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ #define BX_VNCLEAN 0x00000002 /* On vnode clean list */ +#define BX_NEWALLOC 0x00000004 /* Balloc allocated the buffer */ #define BX_BKGRDWRITE 0x00000010 /* Do writes in background */ #define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */ #define BX_ALTDATA 0x00000040 /* Holds extended data */ diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 96f811d..9ce96da 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -236,6 +236,9 @@ struct thread { struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ struct osd td_osd; /* (k) Object specific data. 
*/ + vm_offset_t td_faultaddr; /* (k) fault address for TDP_VMUIODEADLK */ + vm_offset_t td_iov_base; /* (k) the region where VMUIODEADLK ... */ + size_t td_iov_len; /* (k) ... is handled */ #define td_endzero td_base_pri /* Copied during fork1() or thread_sched_upcall(). */ @@ -353,7 +356,7 @@ do { \ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock treatment. */ -#define TDP_UNUSED80 0x00000080 /* available. */ +#define TDP_VMUIODEADLK 0x00000080 /* Non-blocking vm_fault required. */ #define TDP_NOSLEEPING 0x00000100 /* Thread is not allowed to sleep on a sq. */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h new file mode 100644 index 0000000..5ec6433 --- /dev/null +++ b/sys/sys/rangelock.h @@ -0,0 +1,48 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * $FreeBSD$ + */ + +#ifndef _SYS_RANGELOCK_H +#define _SYS_RANGELOCK_H + +#include +#include +#include +#include +#include + +#ifdef _KERNEL + +struct vnode; + +struct rl_q_entry +{ + TAILQ_ENTRY(rl_q_entry) rl_q_link; + size_t rl_q_start, rl_q_end; + int rl_q_flags; +}; + +#define RL_LOCK_READ 0x0001 +#define RL_LOCK_WRITE 0x0002 +#define RL_LOCK_TYPE_MASK 0x0003 +#define RL_LOCK_GRANTED 0x0004 + +struct rangelock +{ + TAILQ_HEAD(, rl_q_entry) rl_waiters; + struct rl_q_entry *rl_currdep; +}; + +void rangelock_init(struct rangelock *lock); +void rangelock_destroy(struct rangelock *lock); +void rangelock_unlock(struct vnode *vp, void *cookie); +void *rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base, + size_t len); +void *rangelock_rlock(struct vnode *vp, off_t base, size_t len); +void *rangelock_wlock(struct vnode *vp, off_t base, size_t len); +#endif + +#endif diff --git a/sys/sys/uio.h b/sys/sys/uio.h index 871f93a..c2117a9 100644 --- a/sys/sys/uio.h +++ b/sys/sys/uio.h @@ -68,8 +68,11 @@ struct uio { enum uio_seg uio_segflg; /* address space */ enum uio_rw uio_rw; /* operation */ struct thread *uio_td; /* owner */ + int uio_flags; }; +#define UIO_ROLLBACK 0x0001 + /* * Limits * @@ -100,6 +103,9 @@ int uiomove_frombuf(void *buf, int buflen, struct uio *uio); int uiomove_fromphys(struct vm_page *ma[], vm_offset_t offset, int n, struct uio *uio); int uiomoveco(void *cp, int n, struct uio *uio, int disposable); +void fwduio(struct uio *uip, int cnt); +int uiorestart(int error, struct uio *uio, u_int cnt); +void copyuio(struct uio *dst, struct uio *src); #else /* !_KERNEL */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 0a3d1dc..af760a5 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -168,7 +169,8 @@ struct vnode { */ struct vpollinfo *v_pollinfo; /* G Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ - struct lockf *v_lockf; /* Byte-level lock list */ + struct lockf *v_lockf; /* Byte-level adv lock list */ + struct rangelock v_rl; /* Byte-range lock */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index a12f96e..730d0b4 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -209,6 +209,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, bp->b_blkno = fsbtodb(fs, newb); if 
(flags & BA_CLRBUF) vfs_bio_clrbuf(bp); + else + bp->b_xflags |= BX_NEWALLOC; if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, lbn, newb, 0, nsize, 0, bp); @@ -360,6 +362,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, nbp->b_blkno = fsbtodb(fs, nb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); + else + bp->b_xflags |= BX_NEWALLOC; if (DOINGSOFTDEP(vp)) softdep_setup_allocindir_page(ip, lbn, bp, indirs[i].in_off, nb, 0, nbp); @@ -616,6 +620,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, bp->b_xflags |= BX_ALTDATA; if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); + else + bp->b_xflags |= BX_NEWALLOC; if (DOINGSOFTDEP(vp)) softdep_setup_allocext(ip, lbn, newb, 0, nsize, 0, bp); @@ -715,6 +721,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, bp->b_blkno = fsbtodb(fs, newb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); + else + bp->b_xflags |= BX_NEWALLOC; if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, lbn, newb, 0, nsize, 0, bp); @@ -866,6 +874,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, nbp->b_blkno = fsbtodb(fs, nb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); + else + bp->b_xflags |= BX_NEWALLOC; if (DOINGSOFTDEP(vp)) softdep_setup_allocindir_page(ip, lbn, bp, indirs[i].in_off, nb, 0, nbp); diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 1abb994..907221b 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -613,6 +613,32 @@ ffs_read(ap) return (error); } +static int +ffs_erestart(struct vnode *vp, int error, struct buf *bp, off_t s_size, + int s_resid, int ioflag, struct uio *uio, struct ucred *ucred) +{ + if (!(bp->b_xflags & BX_NEWALLOC) || (bp->b_flags & B_CACHE)) + return (0); + + /* + * When uiomove() failed due to vm_fault cowardly refused to + * process a dangerous page-in, and the previous content of + * the buffer is garbage, e.g. because supposed transfer + * length was big enough to cover the whole buffer, clear the + * buffer. + */ + vfs_bio_clrbuf(bp); + if (ioflag & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + ffs_truncate(vp, s_size, ioflag, ucred, uio->uio_td); + uio->uio_offset -= s_resid - uio->uio_resid; + uio->uio_resid = s_resid; + uio->uio_flags |= UIO_ROLLBACK; + return (1); +} + /* * Vnode op for writing. */ @@ -632,8 +658,8 @@ ffs_write(ap) struct buf *bp; struct thread *td; ufs_lbn_t lbn; - off_t osize; - int seqcount; + off_t osize, s_size; + int seqcount, s_resid; int blkoffset, error, flags, ioflag, resid, size, xfersize; vp = ap->a_vp; @@ -707,6 +733,7 @@ ffs_write(ap) lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; + s_size = ip->i_size; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) @@ -734,8 +761,10 @@ ffs_write(ap) * the prior contents of the pages exposed to a userland * mmap(). XXX deal with uiomove() errors a better way. 
*/ - if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) + if ((bp->b_flags & B_CACHE) == 0 /*&& fs->fs_bsize <= xfersize*/) { vfs_bio_clrbuf(bp); + flags |= BA_CLRBUF; + } if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) @@ -750,6 +779,7 @@ ffs_write(ap) if (size < xfersize) xfersize = size; + s_resid = uio->uio_resid; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if ((ioflag & (IO_VMIO|IO_DIRECT)) && @@ -757,6 +787,10 @@ ffs_write(ap) bp->b_flags |= B_RELBUF; } + if (error == ERESTART && ffs_erestart(vp, error, bp, s_size, + s_resid, IO_NORMAL | (ioflag & IO_SYNC), uio, ap->a_cred)) + break; + /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer @@ -808,6 +842,8 @@ ffs_write(ap) ap->a_cred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; + if (error == ERESTART) + uio->uio_flags |= UIO_ROLLBACK; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); @@ -1020,8 +1056,8 @@ ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) struct fs *fs; struct buf *bp; ufs_lbn_t lbn; - off_t osize; - int blkoffset, error, flags, resid, size, xfersize; + off_t osize, s_size; + int blkoffset, error, flags, resid, s_resid, size, xfersize; ip = VTOI(vp); fs = ip->i_fs; @@ -1052,6 +1088,7 @@ ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; + s_size = dp->di_extsize; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; @@ -1086,6 +1123,7 @@ ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) if (size < xfersize) xfersize = size; + s_resid = uio->uio_resid; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if ((ioflag & (IO_VMIO|IO_DIRECT)) && @@ -1093,6 +1131,10 @@ ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) bp->b_flags |= B_RELBUF; } + if (error == ERESTART && ffs_erestart(vp, error, bp, s_size, + s_resid, IO_EXT | (ioflag & IO_SYNC), uio, ucred)) + break; + /* * If IO_SYNC each buffer is written synchronously. 
Otherwise * if we have a severe page deficiency write the buffer diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h index 475a20e..b0e982d 100644 --- a/sys/vm/vm_extern.h +++ b/sys/vm/vm_extern.h @@ -34,11 +34,13 @@ #define _VM_EXTERN_H_ struct buf; +struct iovec; struct proc; struct vmspace; struct vmtotal; struct mount; struct vnode; +struct uio; #ifdef _KERNEL @@ -55,7 +57,12 @@ vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t, void swapout_procs(int); int useracc(void *, int, int); int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); +int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, + int fault_flags, struct vm_page **m_hold); void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t); +int vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, + vm_page_t *mp, int count, vm_prot_t prot); +void vm_fault_unhold_pages(vm_page_t *mp, int count); void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t); int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t); int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int); @@ -84,5 +91,9 @@ int vm_thread_new(struct thread *td, int pages); int vm_thread_new_altkstack(struct thread *td, int pages); void vm_thread_swapin(struct thread *td); void vm_thread_swapout(struct thread *td); +int vm_wireuio(struct uio *uiop, struct vm_page *m_hold[], int wire_bytes, + int *wired_pages); +void vm_unwireuio(struct uio *, struct vm_page *m_hold[], int wired_pages); + #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 3a21616..6b1158a 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1,4 +1,30 @@ /*- + +Copyright (c) 2007-2008, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson @@ -207,8 +233,8 @@ unlock_and_deallocate(struct faultstate *fs) * Caller may hold no locks. 
*/ int -vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, - int fault_flags) +vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, + int fault_flags, struct vm_page **m_hold) { vm_prot_t prot; int is_first_object_locked, result; @@ -220,8 +246,20 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int faultcount, ahead, behind; struct faultstate fs; struct vnode *vp; + struct thread *td; int locked, error; + td = curthread; + if (td->td_pflags & TDP_VMUIODEADLK) { + KASSERT(td->td_iov_base <= vaddr && + vaddr < td->td_iov_base + td->td_iov_len, + ("uiomove EFAULT %jx %jx %d\n", (uintmax_t)vaddr, + (uintmax_t)td->td_iov_base, td->td_iov_len)); + td->td_faultaddr = vaddr; + return (KERN_VMUIODEADLOCK); + } + vaddr = trunc_page(vaddr); + hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); @@ -949,6 +987,10 @@ vnode_locked: } else { vm_page_activate(fs.m); } + if (m_hold != NULL) { + *m_hold = fs.m; + vm_page_hold(fs.m); + } vm_page_unlock_queues(); vm_page_wakeup(fs.m); @@ -964,6 +1006,14 @@ vnode_locked: return (KERN_SUCCESS); } +int +vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, + int fault_flags) +{ + + return (vm_fault_hold(map, vaddr, fault_type, fault_flags, NULL)); +} + /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" @@ -1360,3 +1410,108 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) /* return number of pages */ return i; } + +/* + * This routine takes a user's map, array of pages, number of pages, and flags + * and then does the following: + * - validate that the user has access to those pages (flags indicates read + * or write) - if not fail + * - validate that count is enough to hold range number of pages - if not fail + * - fault in any non-resident pages + * - if the user is doing a read force a write fault for any COWed pages + * - if the user is doing a read mark all pages as dirty + * - hold all pages + */ +int +vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp, + int count, vm_prot_t prot) +{ + vm_offset_t end, va; + int faults, rv; + pmap_t pmap; + vm_page_t m, *pages; + + pmap = vm_map_pmap(map); + pages = mp; + addr &= ~PAGE_MASK; + + /* + * Check that virtual address range is legal. + * This check is somewhat bogus as on some architectures kernel + * and user do not share VA - however, it appears that all FreeBSD + * architectures define it + */ + end = addr + (count * PAGE_SIZE); + if (end > VM_MAXUSER_ADDRESS) + return (EFAULT); + + /* + * First optimistically assume that all pages are resident + * (and R/W if for write) if so just mark pages as held (and + * dirty if for write) and return. + */ + vm_page_lock_queues(); + for (pages = mp, faults = 0, va = addr; va < end; + va += PAGE_SIZE, pages++) { + /* + * Page queue mutex is recursable so this is OK. + * It would be really nice if we had an unlocked + * version of this so we were only acquiring the + * pmap lock 1 time as opposed to potentially + * many dozens of times. + */ + *pages = m = pmap_extract_and_hold(pmap, va, prot); + if (m == NULL) { + faults++; + continue; + } + + /* + * Preemptively mark dirty - the pages will never have + * the modified bit set if they are only changed via + * DMA. + */ + if (prot & VM_PROT_WRITE) + vm_page_dirty(m); + } + vm_page_unlock_queues(); + + if (faults == 0) + return (0); + + /* + * Pages either have insufficient permissions or are not present + * trigger a fault where neccessary. 
+ */ + rv = 0; + for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) { + rv = vm_fault_hold(map, va, prot, (prot & VM_PROT_WRITE) ? + VM_FAULT_DIRTY : VM_FAULT_NORMAL, pages); + if (rv) + goto error; + } + return (0); + +error: + vm_page_lock_queues(); + for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) + if (*pages) { + vm_page_unhold(*pages); + *pages = NULL; + } + vm_page_unlock_queues(); + return (EFAULT); +} + +void +vm_fault_unhold_pages(vm_page_t *mp, int count) +{ + + KASSERT(count >= 0, ("negative count %d", count)); + vm_page_lock_queues(); + while (count--) { + vm_page_unhold(*mp); + mp++; + } + vm_page_unlock_queues(); +} diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 2c5821c..3f19a49 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -1374,6 +1374,7 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; + new_entry->pin_count = 0; /* * Insert the new entry into the list @@ -1596,7 +1597,8 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) (prev->protection == entry->protection) && (prev->max_protection == entry->max_protection) && (prev->inheritance == entry->inheritance) && - (prev->wired_count == entry->wired_count)) { + (prev->wired_count == entry->wired_count) && + (prev->pin_count == entry->pin_count)) { vm_map_entry_unlink(map, prev); entry->start = prev->start; entry->offset = prev->offset; @@ -1622,7 +1624,8 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) (next->protection == entry->protection) && (next->max_protection == entry->max_protection) && (next->inheritance == entry->inheritance) && - (next->wired_count == entry->wired_count)) { + (next->wired_count == entry->wired_count) && + (next->pin_count == entry->pin_count)) { vm_map_entry_unlink(map, next); entry->end = next->end; vm_map_entry_resize_free(map, entry); @@ -2796,7 +2799,8 @@ reclip_start: */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || (vm_map_pmap(map) != kernel_pmap && - vm_map_entry_system_wired_count(entry) != 0)) { + vm_map_entry_system_wired_count(entry) != 0) || + entry->pin_count != 0) { entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; @@ -2816,6 +2820,7 @@ reclip_start: holder_entry->max_protection = VM_PROT_NONE; holder_entry->inheritance = VM_INHERIT_NONE; holder_entry->wired_count = 0; + holder_entry->pin_count = 0; vm_map_entry_link(map, entry->prev, holder_entry); } (void) vm_map_unlock_and_wait(map, 0); @@ -3154,6 +3159,7 @@ vmspace_fork(struct vmspace *vm1) new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); new_entry->wired_count = 0; + new_entry->pin_count = 0; /* * Insert the entry into the new map -- we know we're @@ -3181,6 +3187,7 @@ vmspace_fork(struct vmspace *vm1) new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); new_entry->wired_count = 0; + new_entry->pin_count = 0; new_entry->object.vm_object = NULL; vm_map_entry_link(new_map, new_map->header.prev, new_entry); @@ -3865,6 +3872,167 @@ vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) vm_map_unlock_read(map); } +static boolean_t +vm_map_unpin_entries(vm_map_t map, struct uio *uiop, int pinned_entries) +{ + vm_offset_t start; + struct iovec *iov; + vm_map_entry_t entry; + int i, acc, wire; + boolean_t need_wakeup; + + iov = uiop->uio_iov; + need_wakeup = FALSE; + + for (i = 0, acc = 0; acc < pinned_entries; iov++, i++) { + KASSERT(i < uiop->uio_iovcnt, ("wireio: iovcnt overflow %d %d 
%d", + i, uiop->uio_iovcnt, pinned_entries)); + wire = round_page(iov->iov_len); + if (acc + wire > pinned_entries) + wire = pinned_entries - acc; + start = trunc_page((vm_offset_t)iov->iov_base); + for (;;) { + if (!vm_map_lookup_entry(map, start, &entry)) { +#ifdef INVARIANTS + panic("vm_unwireuio: hole"); +#endif + } + KASSERT(entry->pin_count > 0, ("pin_count %p", entry)); + if (--entry->pin_count == 0 && + (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP)) { + entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; + need_wakeup = TRUE; + } + if (entry->end >= start + wire) { + acc += wire; + break; + } else { + acc += entry->end - start; + wire -= entry->end - start; + start = entry->end; + } + } + } + return (need_wakeup); +} + +/* + * vm_wireuio + * + * Given userspace struct uio, we set up vm state such that after the + * successfull return there will be no page faults during uiomove with + * this uio until vm_unwireuio is called. At most wire_bytes bytes of + * the user address space are held. + * + * Function performs this by first pinning all map entries that will + * be referenced. This guarantees that our ranges of user address + * space cannot be remmapped during the operation. Then, all accessed + * pages are faulted in and held. + */ +int +vm_wireuio(struct uio *uiop, struct vm_page *m_hold[], int wire_bytes, + int *wired_pages) +{ + vm_map_t map; + vm_offset_t start, start1; + struct iovec *iov; + vm_map_entry_t entry; + struct vm_page **m_hold1; + int i, acc, wire, wire_pages, pinned_entries, rv, prot; + int error; + boolean_t need_wakeup; + + KASSERT(round_page(wire_bytes) == wire_bytes, + ("wireuio: wire_bytes is not page-size aligned")); + KASSERT(uiop->uio_segflg == UIO_USERSPACE, + ("wireuio: !UIO_USERSPACE")); + + error = 0; + prot = uiop->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; + m_hold1 = m_hold; + pinned_entries = 0; + *wired_pages = 0; + map = &uiop->uio_td->td_proc->p_vmspace->vm_map; + iov = uiop->uio_iov; + + /* + * Do the pass over iov. + */ + for (i = 0, acc = 0; acc < wire_bytes; iov++, i++) { + wire = round_page(iov->iov_len); + if (acc + wire > wire_bytes) + wire = wire_bytes - acc; + acc += wire; + wire_pages = btoc(wire); + start1 = start = trunc_page((vm_offset_t)iov->iov_base); + if (start < vm_map_min(map) || start + wire > vm_map_max(map) || + start > start + wire) { + error = EINVAL; + goto fault; + } + vm_map_lock(map); + + /* + * Pin each entry referenced by addresses in iov. + */ + for (;;) { + if (!vm_map_lookup_entry(map, start1, &entry) || + (entry->eflags & MAP_ENTRY_IS_HOLDER)) + goto fault_map_locked; + entry->pin_count++; + if (entry->end >= start1 + wire) { + pinned_entries += wire; + break; + } else { + pinned_entries += entry->end - start1; + wire -= entry->end - start1; + start1 = entry->end; + } + } + vm_map_unlock(map); + + /* + * If entries are successfully pinned, the + * corresponding pages are faulted in and held. 
+		 */
+		rv = vm_fault_hold_user_pages(map, start, m_hold1, wire_pages,
+		    prot);
+		if (rv != KERN_SUCCESS) {
+			error = EFAULT;
+			goto fault;
+		}
+		*wired_pages += wire_pages;
+		m_hold1 += wire_pages;
+	}
+	return (0);
+ fault:
+	vm_map_lock(map);
+ fault_map_locked:
+	need_wakeup = vm_map_unpin_entries(map, uiop, pinned_entries);
+	vm_map_unlock(map);
+	vm_fault_unhold_pages(m_hold, *wired_pages);
+	if (need_wakeup)
+		vm_map_wakeup(map);
+	return (error);
+}
+
+void
+vm_unwireuio(struct uio *uiop, struct vm_page *m_hold[], int wired_pages)
+{
+	vm_map_t map;
+	boolean_t need_wakeup;
+
+	map = &uiop->uio_td->td_proc->p_vmspace->vm_map;
+
+	vm_fault_unhold_pages(m_hold, wired_pages);
+
+	vm_map_lock(map);
+	need_wakeup = vm_map_unpin_entries(map, uiop, ctob(wired_pages));
+	vm_map_unlock(map);
+	if (need_wakeup)
+		vm_map_wakeup(map);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index f2c4fd3..9310718 100644
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -114,6 +114,7 @@ struct vm_map_entry {
 	vm_inherit_t inheritance;	/* inheritance */
 	int wired_count;		/* can be paged if = 0 */
 	vm_pindex_t lastr;		/* last read */
+	unsigned pin_count;		/* non-exclusive pin count */
 };
 
 #define MAP_ENTRY_NOSYNC	0x0001
@@ -383,5 +384,6 @@ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
 int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags);
 int vmspace_swap_count (struct vmspace *vmspace);
+
 #endif /* _KERNEL */
 #endif /* _VM_MAP_ */
diff --git a/sys/vm/vm_param.h b/sys/vm/vm_param.h
index 2ff2603..d866925 100644
--- a/sys/vm/vm_param.h
+++ b/sys/vm/vm_param.h
@@ -125,6 +125,7 @@ struct xswdev {
 #define KERN_RESOURCE_SHORTAGE	6
 #define KERN_NOT_RECEIVER	7
 #define KERN_NO_ACCESS		8
+#define KERN_VMUIODEADLOCK	9
 
 #ifndef ASSEMBLER
 #ifdef _KERNEL
diff --git a/tools/regression/file/uio/uio.c b/tools/regression/file/uio/uio.c
new file mode 100644
index 0000000..d857605
--- /dev/null
+++ b/tools/regression/file/uio/uio.c
@@ -0,0 +1,116 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+int chunk_cnt = 1024;
+int chunk_size = 1024;
+
+int
+main(int argc, char *argv[])
+{
+	struct iovec *wiov, *riov;
+	char **wdata, **rdata;
+	int fd, i;
+	ssize_t io_error;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: uio file [chunk count [chunk size]]\n");
+		return (2);
+	}
+	fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+	if (fd == -1) {
+		fprintf(stderr, "Failed to create %s: %s\n",
+		    argv[1], strerror(errno));
+		return (1);
+	}
+
+	if (argc > 2)
+		chunk_cnt = atoi(argv[2]);
+	if (argc > 3)
+		chunk_size = atoi(argv[3]);
+
+	wiov = calloc(chunk_cnt, sizeof(*wiov));
+	wdata = calloc(chunk_cnt, sizeof(*wdata));
+
+	riov = calloc(chunk_cnt, sizeof(*riov));
+	rdata = calloc(chunk_cnt, sizeof(*rdata));
+
+	for (i = 0; i < chunk_cnt; i++) {
+		rdata[i] = malloc(chunk_size);
+		riov[i].iov_base = rdata[i];
+		riov[i].iov_len = chunk_size;
+
+		wdata[i] = malloc(chunk_size);
+		memset(wdata[i], i, chunk_size);
+		wiov[i].iov_base = wdata[i];
+		wiov[i].iov_len = chunk_size;
+	}
+
+	io_error = writev(fd, wiov, chunk_cnt);
+	if (io_error == -1) {
+		fprintf(stderr, "write failed: %s\n", strerror(errno));
+		return (1);
+	} else if (io_error != chunk_cnt * chunk_size) {
+		fprintf(stderr, "truncated write: %zd %d\n",
+		    io_error, chunk_cnt * chunk_size);
+		return (1);
+	}
+
+	if (lseek(fd, 0, SEEK_SET) == -1) {
+		fprintf(stderr, "lseek failed: %s\n", strerror(errno));
+		return (1);
+	}
+
+	io_error = readv(fd, riov, chunk_cnt);
+	if (io_error == -1) {
+		fprintf(stderr, "read failed: %s\n", strerror(errno));
+		return (1);
+	} else if (io_error != chunk_cnt * chunk_size) {
+		fprintf(stderr, "truncated read: %zd %d\n",
+		    io_error, chunk_cnt * chunk_size);
+		return (1);
+	}
+
+	for (i = 0; i < chunk_cnt; i++) {
+		if (memcmp(rdata[i], wdata[i], chunk_size) != 0) {
+			fprintf(stderr, "chunk %d differs\n", i);
+			return (1);
+		}
+	}
+
+	return (0);
+}
diff --git a/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c
new file mode 100644
index 0000000..9376648
--- /dev/null
+++ b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c
@@ -0,0 +1,129 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static const int blks = 2;
+
+static void
+flush_buffers(int fd)
+{
+	struct stat st;
+	char *addr;
+	int error;
+
+	printf("Flushing buffers\n");
+	error = fstat(fd, &st);
+	if (error == -1)
+		err(2, "stat");
+	fsync(fd);
+	addr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (addr == MAP_FAILED)
+		err(2, "mmap");
+	error = msync(addr, st.st_size, MS_SYNC | MS_INVALIDATE);
+	if (error == -1)
+		err(2, "msync");
+	munmap(addr, st.st_size);
+}
+
+int
+main(int argc, char *argv[])
+{
+	struct statfs fst;
+	char *data, *vrfy;
+	size_t sz;
+	int fd, i, error, ret;
+
+	if (argc < 2)
+		errx(2, "Usage: ba_clrbuf file");
+
+	fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+	if (fd == -1)
+		err(2, "Failed to create %s", argv[1]);
+
+	if (fstatfs(fd, &fst) == -1)
+		err(2, "stat");
+
+	sz = fst.f_iosize * blks;
+	data = malloc(sz);
+	if (data == NULL)
+		err(2, "malloc");
+	vrfy = malloc(sz);
+	if (vrfy == NULL)
+		err(2, "malloc");
+	for (i = 0; i < (int)sz; i++)
+		data[i] = i;
+	error = write(fd, data, sz);
+	if (error == -1)
+		err(2, "write");
+	else if (error != (int)sz)
+		errx(2, "Short write %d %zu", error, sz);
+
+	flush_buffers(fd);
+
+	error = lseek(fd, 0, SEEK_SET);
+	if (error == -1)
+		err(2, "lseek 0");
+	else if (error != 0)
+		errx(2, "lseek 0 returned %d", error);
+	error = write(fd, NULL, fst.f_iosize);
+	printf("faulty write, error %s\n", strerror(errno));
+
+	error = lseek(fd, 0, SEEK_SET);
+	if (error == -1)
+		err(2, "lseek 0/2");
+	else if (error != 0)
+		errx(2, "lseek 0/2 returned %d", error);
+	error = read(fd, vrfy, sz);
+	if (error == -1)
+		err(2, "read");
+	else if (error != (int)sz)
+		errx(2, "short read %d %zu", error, sz);
+
+	if (memcmp(data, vrfy, fst.f_iosize) != 0) {
+		printf("Zero block corrupted, byte at 0 is %x\n",
+		    (unsigned char)vrfy[0]);
+		ret = 1;
+	} else {
+		printf("No corruption\n");
+		ret = 0;
+	}
+
+	return (ret);
+}
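
Usage note: this excerpt adds vm_wireuio()/vm_unwireuio() but does not show a caller. The fragment below is a hypothetical sketch, not part of the patch: the helper name vn_write_prefault(), its callback parameter, and the MAXPHYS-sized cap are all invented here for illustration, using only the signatures introduced above.

/*
 * Hypothetical illustration only -- not part of the patch above.
 * Wires at most one MAXPHYS-sized, page-aligned chunk of the user
 * buffer described by uio, runs the supplied I/O callback while page
 * faults on that range are impossible, then drops the page holds.
 */
static int
vn_write_prefault(struct uio *uio, int (*iofn)(void *, struct uio *), void *arg)
{
	struct vm_page *pages[MAXPHYS / PAGE_SIZE];
	int error, wire_bytes, wired_pages;

	/*
	 * vm_wireuio() asserts a page-aligned byte count and a
	 * UIO_USERSPACE uio; cap the request at MAXPHYS so that
	 * pages[] is large enough for the held pages.
	 */
	wire_bytes = MIN(round_page(uio->uio_resid), MAXPHYS);
	error = vm_wireuio(uio, pages, wire_bytes, &wired_pages);
	if (error != 0)
		return (error);

	/*
	 * The wired range cannot fault, so the callback may call
	 * uiomove() while holding non-sleepable locks.
	 */
	error = iofn(arg, uio);

	vm_unwireuio(uio, pages, wired_pages);
	return (error);
}

Capping the wired region at one MAXPHYS-sized chunk mirrors the per-call wire_bytes limit that vm_wireuio() itself enforces and keeps the on-stack page array small; a real caller would loop over the remaining uio_resid.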