diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index fee3caf..8390526 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -742,7 +742,7 @@ trap_pfault(frame, usermode) PROC_UNLOCK(p); /* Fault in the user page: */ - rv = vm_fault(map, va, ftype, + rv = vm_fault(map, eva, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); @@ -754,7 +754,7 @@ trap_pfault(frame, usermode) * Don't have to worry about process locking or stacks in the * kernel. */ - rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); + rv = vm_fault(map, eva, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); diff --git a/sys/conf/files b/sys/conf/files index a3bd42f..3b9fca5 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1931,6 +1931,7 @@ kern/kern_poll.c optional device_polling kern/kern_priv.c standard kern/kern_proc.c standard kern/kern_prot.c standard +kern/kern_rangelock.c standard kern/kern_resource.c standard kern/kern_rmlock.c standard kern/kern_rwlock.c standard diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c index 76237fb..0d0ef86 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c @@ -91,7 +91,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, diff --git a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c index a54598c..1c3953d 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c @@ -90,7 +90,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #define MAX_SCHEDULE_TIMEOUT 300 diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.c b/sys/dev/cxgb/ulp/tom/cxgb_vm.c deleted file mode 100644 index e7a3893..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_vm.c +++ /dev/null @@ -1,166 +0,0 @@ -/************************************************************************** - -Copyright (c) 2007-2008, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- -***************************************************************************/ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -/* - * This routine takes a user's map, array of pages, number of pages, and flags - * and then does the following: - * - validate that the user has access to those pages (flags indicates read - * or write) - if not fail - * - validate that count is enough to hold range number of pages - if not fail - * - fault in any non-resident pages - * - if the user is doing a read force a write fault for any COWed pages - * - if the user is doing a read mark all pages as dirty - * - hold all pages - */ -int -vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp, - int count, vm_prot_t prot) -{ - vm_offset_t end, va; - int faults, rv; - pmap_t pmap; - vm_page_t m, *pages; - - pmap = vm_map_pmap(map); - pages = mp; - addr &= ~PAGE_MASK; - /* - * Check that virtual address range is legal - * This check is somewhat bogus as on some architectures kernel - * and user do not share VA - however, it appears that all FreeBSD - * architectures define it - */ - end = addr + (count * PAGE_SIZE); - if (end > VM_MAXUSER_ADDRESS) { - log(LOG_WARNING, "bad address passed to vm_fault_hold_user_pages"); - return (EFAULT); - } - - /* - * First optimistically assume that all pages are resident - * (and R/W if for write) if so just mark pages as held (and - * dirty if for write) and return - */ - vm_page_lock_queues(); - for (pages = mp, faults = 0, va = addr; va < end; - va += PAGE_SIZE, pages++) { - /* - * page queue mutex is recursable so this is OK - * it would be really nice if we had an unlocked - * version of this so we were only acquiring the - * pmap lock 1 time as opposed to potentially - * many dozens of times - */ - *pages = m = pmap_extract_and_hold(pmap, va, prot); - if (m == NULL) { - faults++; - continue; - } - /* - * Preemptively mark dirty - the pages - * will never have the modified bit set if - * they are only changed via DMA - */ - if (prot & VM_PROT_WRITE) - vm_page_dirty(m); - - } - vm_page_unlock_queues(); - - if (faults == 0) - return (0); - - /* - * Pages either have insufficient permissions or are not present - * trigger a fault where neccessary - * - */ - rv = 0; - for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) { - /* - * Account for a very narrow race where the page may be - * taken away from us before it is held - */ - while (*pages == NULL) { - rv = vm_fault(map, va, prot, - (prot & VM_PROT_WRITE) ? 
VM_FAULT_DIRTY : VM_FAULT_NORMAL); - if (rv) - goto error; - *pages = pmap_extract_and_hold(pmap, va, prot); - } - } - return (0); -error: - log(LOG_WARNING, - "vm_fault bad return rv=%d va=0x%zx\n", rv, va); - vm_page_lock_queues(); - for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) - if (*pages) { - vm_page_unhold(*pages); - *pages = NULL; - } - vm_page_unlock_queues(); - return (EFAULT); -} - -void -vm_fault_unhold_pages(vm_page_t *mp, int count) -{ - - KASSERT(count >= 0, ("negative count %d", count)); - vm_page_lock_queues(); - while (count--) { - vm_page_unhold(*mp); - mp++; - } - vm_page_unlock_queues(); -} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.h b/sys/dev/cxgb/ulp/tom/cxgb_vm.h deleted file mode 100644 index 7532e20..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_vm.h +++ /dev/null @@ -1,39 +0,0 @@ -/************************************************************************** - -Copyright (c) 2007-2008, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -$FreeBSD$ - -***************************************************************************/ -#ifndef CXGB_VM_H_ -#define CXGB_VM_H_ - -int vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, - vm_page_t *mp, int count, vm_prot_t prot); -void vm_fault_unhold_pages(vm_page_t *mp, int count); - -#endif diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index e967104..153b7da 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -820,7 +820,7 @@ trap_pfault(frame, usermode, eva) PROC_UNLOCK(p); /* Fault in the user page: */ - rv = vm_fault(map, va, ftype, + rv = vm_fault(map, eva, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); @@ -832,7 +832,7 @@ trap_pfault(frame, usermode, eva) * Don't have to worry about process locking or stacks in the * kernel. */ - rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); + rv = vm_fault(map, eva, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c new file mode 100644 index 0000000..7a45c13 --- /dev/null +++ b/sys/kern/kern_rangelock.c @@ -0,0 +1,151 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +void +rangelock_init(struct rangelock *lock) +{ + + TAILQ_INIT(&lock->rl_waiters); + lock->rl_currdep = NULL; +} + +void +rangelock_destroy(struct rangelock *lock) +{ + + KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters")); +} + +static int +rangelock_incompatible(const struct rl_q_entry *e1, const struct rl_q_entry *e2) +{ + + if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ && + (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ) + return (0); +#define IN_RANGE(a, e) (a <= e->rl_q_start && a < e->rl_q_end) + if (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) || + IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1)) + return (1); +#undef IN_RANGE + return (0); +} + +static void +rangelock_calc_block(struct rangelock *lock) +{ + struct rl_q_entry *entry, *entry1, *whead; + + if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) && + lock->rl_currdep != NULL) + lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link); + for (entry = lock->rl_currdep; entry; + entry = TAILQ_NEXT(entry, rl_q_link)) { + TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) { + if (rangelock_incompatible(entry, entry1)) + goto out; + if (entry1 == entry) + break; + } + } +out: + lock->rl_currdep = entry; + TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) { + if (whead == lock->rl_currdep) + break; + if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) { + whead->rl_q_flags |= RL_LOCK_GRANTED; + wakeup(whead); + } + } +} + +static void +rangelock_unlock_vp_locked(struct vnode *vp, struct rl_q_entry *entry) +{ + + ASSERT_VI_LOCKED(vp, "rangelock"); + KASSERT(entry != vp->v_rl.rl_currdep, ("stuck currdep")); + TAILQ_REMOVE(&vp->v_rl.rl_waiters, entry, rl_q_link); + rangelock_calc_block(&vp->v_rl); + VI_UNLOCK(vp); +} + +void +rangelock_unlock(struct vnode *vp, void *cookie) +{ + struct rl_q_entry *entry; + + entry = cookie; + VI_LOCK(vp); + rangelock_unlock_vp_locked(vp, entry); +} + +void * +rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base, size_t len) +{ + struct rl_q_entry *entry; + + entry = cookie; + VI_LOCK(vp); + KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("XXX")); + KASSERT(entry->rl_q_start == base, ("XXX")); + KASSERT(entry->rl_q_end >= base + len, ("XXX")); + if (entry->rl_q_end == base + len) { + rangelock_unlock_vp_locked(vp, cookie); + return (NULL); + } + entry->rl_q_end = base + len; + rangelock_calc_block(&vp->v_rl); + VI_UNLOCK(vp); + return (cookie); +} + +static void * +rangelock_enqueue(struct vnode *vp, struct rl_q_entry *entry) +{ + + VI_LOCK(vp); + TAILQ_INSERT_TAIL(&vp->v_rl.rl_waiters, entry, rl_q_link); + if (vp->v_rl.rl_currdep == NULL) + vp->v_rl.rl_currdep = entry; + rangelock_calc_block(&vp->v_rl); + while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) + msleep(entry, &vp->v_interlock, 0, "range", 0); + VI_UNLOCK(vp); + return (entry); +} + +void * +rangelock_rlock(struct vnode *vp, struct rl_q_entry *entry, off_t base, + size_t len) +{ + + entry->rl_q_flags = RL_LOCK_READ; + entry->rl_q_start = base; + entry->rl_q_end = base + len; + return (rangelock_enqueue(vp, entry)); +} + +void * +rangelock_wlock(struct vnode *vp, struct rl_q_entry *entry, off_t base, + size_t len) +{ + + entry->rl_q_flags = RL_LOCK_WRITE; + entry->rl_q_start = base; + entry->rl_q_end = base + len; + return (rangelock_enqueue(vp, entry)); +} diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index ce1afd2..e2cd1a9 100644 --- a/sys/kern/kern_subr.c +++ 
b/sys/kern/kern_subr.c @@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #ifdef ZERO_COPY_SOCKETS #include #include @@ -138,7 +139,8 @@ uiomove(void *cp, int n, struct uio *uio) int error = 0; int save = 0; - KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE || + uio->uio_rw == UIO_NOCOPY, ("uiomove: mode")); KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, ("uiomove proc")); @@ -164,10 +166,25 @@ uiomove(void *cp, int n, struct uio *uio) case UIO_USERSPACE: if (ticks - PCPU_GET(switchticks) >= hogticks) uio_yield(); + if (td->td_pflags & TDP_VMUIODEADLK) { + td->td_iov_base = (uintptr_t)iov->iov_base; + td->td_iov_len = iov->iov_len; + } if (uio->uio_rw == UIO_READ) error = copyout(cp, iov->iov_base, cnt); else error = copyin(iov->iov_base, cp, cnt); + if (error == EFAULT && td->td_faultaddr != 0 && + (td->td_pflags & TDP_VMUIODEADLK)) { + KASSERT(td->td_faultaddr >= (uintptr_t)iov->iov_base && + td->td_faultaddr < (uintptr_t)iov->iov_base + cnt, + ("faultaddr %jx outside region %p %d\n", + (uintmax_t)td->td_faultaddr, + iov->iov_base, iov->iov_len)); + error = ERESTART; + fwduio(uio, td->td_faultaddr - (uintptr_t) + iov->iov_base); + } if (error) goto out; break; @@ -181,10 +198,7 @@ uiomove(void *cp, int n, struct uio *uio) case UIO_NOCOPY: break; } - iov->iov_base = (char *)iov->iov_base + cnt; - iov->iov_len -= cnt; - uio->uio_resid -= cnt; - uio->uio_offset += cnt; + fwduio(uio, cnt); cp = (char *)cp + cnt; n -= cnt; } @@ -544,6 +558,7 @@ copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop) uio->uio_segflg = UIO_USERSPACE; uio->uio_offset = -1; uio->uio_resid = 0; + uio->uio_flags = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - uio->uio_resid) { free(uio, M_IOV); @@ -569,3 +584,25 @@ cloneuio(struct uio *uiop) bcopy(uiop->uio_iov, uio->uio_iov, iovlen); return (uio); } + +void +fwduio(struct uio *uio, int cnt) +{ + + uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + cnt; + uio->uio_iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; +} + +void +copyuio(struct uio *dst, struct uio *src) +{ + struct iovec *dst_iovec; + + dst_iovec = dst->uio_iov; + *dst = *src; + dst->uio_iov = dst_iovec; + bcopy(src->uio_iov, dst->uio_iov, src->uio_iovcnt * + sizeof(struct iovec)); +} diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 8c26b13..e3867d6 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -871,6 +871,7 @@ vdestroy(struct vnode *vp) /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. 
*/ vp->v_op = NULL; #endif + rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); mtx_destroy(BO_MTX(bo)); @@ -1025,6 +1026,7 @@ alloc: if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } + rangelock_init(&vp->v_rl); *vpp = vp; return (0); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 3cc6f22..9e94b58 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -60,8 +60,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include +#include +#include + #include static fo_rdwr_t vn_read; @@ -363,37 +367,68 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, int *aresid; struct thread *td; { - struct uio auio; - struct iovec aiov; + struct uio auio, auio_clone; + struct iovec aiov, aiov_clone; struct mount *mp; struct ucred *cred; - int error; + vm_page_t *m_hold; + struct rl_q_entry rl_entry; + void *rl_cookie; + int wired_pages, error; VFS_ASSERT_GIANT(vp->v_mount); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + error = 0; + + if ((ioflg & IO_NODELOCKED) == 0) { + if (rw == UIO_READ) + rl_cookie = rangelock_rlock(vp, &rl_entry, + offset, len); + else + rl_cookie = rangelock_wlock(vp, &rl_entry, + offset, len); + } else + rl_cookie = NULL; + + m_hold = NULL; + if (segflg == UIO_USERSPACE) { + m_hold = malloc(sizeof(vm_page_t) * (btoc(len) + 1), M_IOV, + M_WAITOK); + aiov_clone = aiov; + auio_clone = auio; + auio_clone.uio_iov = &aiov_clone; + error = vm_wireuio(&auio, m_hold, + round_page((vm_offset_t)base + len) - + trunc_page((vm_offset_t)base), + &wired_pages); + if (error) { + free(m_hold, M_IOV); + goto out; + } + } + if ((ioflg & IO_NODELOCKED) == 0) { mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - return (error); + goto out_unwire; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else vn_lock(vp, LK_SHARED | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = base; - aiov.iov_len = len; - auio.uio_resid = len; - auio.uio_offset = offset; - auio.uio_segflg = segflg; - auio.uio_rw = rw; - auio.uio_td = td; - error = 0; #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) @@ -424,6 +459,14 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, vn_finished_write(mp); VOP_UNLOCK(vp, 0); } +out_unwire: + if (segflg == UIO_USERSPACE) { + vm_unwireuio(&auio_clone, m_hold, wired_pages); + free(m_hold, M_IOV); + } +out: + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); return (error); } @@ -485,68 +528,215 @@ vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred, return (error); } +static int uio_hold_pages = 12; +SYSCTL_INT(_vfs, OID_AUTO, uio_hold_pages, CTLFLAG_RW, &uio_hold_pages, 0, + "The max amount of held pages for one i/o chunk"); +static int uio_short = 128; +SYSCTL_INT(_vfs, OID_AUTO, uio_short, CTLFLAG_RW, &uio_short, 0, + "The length of the short i/o"); + +typedef int (*vn_chunk_func_t)(struct file *, struct uio *, struct ucred *, + int, int, struct thread *); + +static int +do_vn_rw_chunked(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, int ioflag, struct thread *td, vm_page_t *m_hold, + vn_chunk_func_t vn_chunk_func) +{ + struct uio *uio_clone; + int error, 
wire_bytes, io_chunk, total_cnt, cnt; + int first_chunk, wired_pages; + + if (uio->uio_segflg != UIO_USERSPACE || fp->f_vnode->v_type != VREG) + return (vn_chunk_func(fp, uio, active_cred, flags, ioflag, td)); + + uio_clone = cloneuio(uio); + KASSERT(!(td->td_pflags & TDP_VMUIODEADLK), + ("Nested TDP_VMUIODEADLK")); + td->td_pflags |= TDP_VMUIODEADLK; + td->td_faultaddr = 0; + error = vn_chunk_func(fp, uio, active_cred, flags, ioflag, td); + td->td_pflags &= ~TDP_VMUIODEADLK; + if (error != ERESTART || td->td_faultaddr == 0) + goto out; + + first_chunk = 1; + if (uio->uio_flags & UIO_ROLLBACK) { + cnt = uio_clone->uio_resid - uio->uio_resid; + copyuio(uio, uio_clone); + if (cnt > 0) { + uio->uio_rw = UIO_NOCOPY; + uiomove(NULL, cnt, uio); + uio->uio_rw = uio_clone->uio_rw; + first_chunk = 0; + } + } + while (uio->uio_resid > 0) { + io_chunk = min(uio_hold_pages * PAGE_SIZE, uio->uio_resid); /* XXXKIB */ + wire_bytes = round_page(io_chunk); + error = vm_wireuio(uio, m_hold, wire_bytes, &wired_pages); + if (error != 0) { + if (!first_chunk) + error = 0; + break; + } + copyuio(uio_clone, uio); + total_cnt = uio->uio_resid; + uio->uio_resid = io_chunk; + error = vn_chunk_func(fp, uio, active_cred, flags, ioflag, td); + vm_unwireuio(uio_clone, m_hold, wired_pages); + cnt = io_chunk - uio->uio_resid; + uio->uio_resid = total_cnt - cnt; + if (error != 0) { + if (!first_chunk) + error = 0; + break; + } + if (cnt == 0) + break; + first_chunk = 0; + } + out: + free(uio_clone, M_IOV); + return (error); +} + +static struct mtx * +vn_lock_foffset(struct file *fp) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + while (fp->f_vnread_flags & FOFFSET_LOCKED) { + fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vnread offlock", 0); + } + fp->f_vnread_flags |= FOFFSET_LOCKED; + mtx_unlock(mtxp); + return (mtxp); +} + +static void +vn_unlock_foffset(struct file *fp, struct mtx *mtxp) +{ + + mtx_lock(mtxp); + if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) + wakeup(&fp->f_vnread_flags); + fp->f_vnread_flags = 0; + mtx_unlock(mtxp); +} + +static inline int +vn_read_wired_chunk(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, int ioflag, struct thread *td) +{ + struct vnode *vp; + int error, vfslocked; + + vp = fp->f_vnode; + + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vn_lock(vp, LK_SHARED | LK_RETRY); + + ioflag |= sequential_heuristic(uio, fp); + +#ifdef MAC + error = mac_vnode_check_read(active_cred, fp->f_cred, vp); + if (error == 0) +#endif + error = VOP_READ(vp, uio, ioflag, fp->f_cred); + fp->f_nextoff = uio->uio_offset; + VOP_UNLOCK(vp, 0); + VFS_UNLOCK_GIANT(vfslocked); + return (error); +} + /* * File table vnode read routine. 
*/ static int -vn_read(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; +vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; - int error, ioflag; + vm_page_t m_hold[uio_hold_pages]; struct mtx *mtxp; - int vfslocked; + struct rl_q_entry rl_entry; + void *rl_cookie; + int ioflag; + int error; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); - mtxp = NULL; - vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vp = fp->f_vnode; + /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ if ((flags & FOF_OFFSET) == 0) { - mtxp = mtx_pool_find(mtxpool_sleep, fp); - mtx_lock(mtxp); - while(fp->f_vnread_flags & FOFFSET_LOCKED) { - fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags, mtxp, PUSER -1, - "vnread offlock", 0); - } - fp->f_vnread_flags |= FOFFSET_LOCKED; - mtx_unlock(mtxp); - vn_lock(vp, LK_SHARED | LK_RETRY); + mtxp = vn_lock_foffset(fp); uio->uio_offset = fp->f_offset; } else - vn_lock(vp, LK_SHARED | LK_RETRY); + mtxp = NULL; /* gcc */ + if (vp->v_type == VREG) + rl_cookie = rangelock_rlock(vp, &rl_entry, uio->uio_offset, + uio->uio_resid); + else + rl_cookie = NULL; + error = do_vn_rw_chunked(fp, uio, active_cred, flags, ioflag, td, + m_hold, vn_read_wired_chunk); + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); + if ((flags & FOF_OFFSET) == 0) { + fp->f_offset = uio->uio_offset; + vn_unlock_foffset(fp, mtxp); + } + return (error); +} - ioflag |= sequential_heuristic(uio, fp); +static inline int +vn_write_wired_chunk(struct file *fp, struct uio *uio, + struct ucred *active_cred, int flags, int ioflag, struct thread *td) +{ + struct mount *mp; + struct vnode *vp; + int error, vfslocked; + mp = NULL; + vp = fp->f_vnode; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + if (vp->v_type == VREG) + bwillwrite(); + if (vp->v_type != VCHR) { + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + goto unlock; + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; + ioflag |= sequential_heuristic(uio, fp); #ifdef MAC - error = mac_vnode_check_read(active_cred, fp->f_cred, vp); + error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif - error = VOP_READ(vp, uio, ioflag, fp->f_cred); - if ((flags & FOF_OFFSET) == 0) { + error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); + if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; - mtx_lock(mtxp); - if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) - wakeup(&fp->f_vnread_flags); - fp->f_vnread_flags = 0; - mtx_unlock(mtxp); - } fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); + if (vp->v_type != VCHR) + vn_finished_write(mp); +unlock: VFS_UNLOCK_GIANT(vfslocked); return (error); } @@ -555,24 +745,18 @@ vn_read(fp, uio, active_cred, flags, td) * File table vnode write routine. 
*/ static int -vn_write(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; +vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; - struct mount *mp; - int error, ioflag; - int vfslocked; + vm_page_t m_hold[uio_hold_pages]; + struct rl_q_entry rl_entry; + void *rl_cookie; + int ioflag, error; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); vp = fp->f_vnode; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); - if (vp->v_type == VREG) - bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; @@ -583,27 +767,23 @@ vn_write(fp, uio, active_cred, flags, td) if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; - mp = NULL; - if (vp->v_type != VCHR && - (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - goto unlock; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if ((flags & FOF_OFFSET) == 0) - uio->uio_offset = fp->f_offset; - ioflag |= sequential_heuristic(uio, fp); -#ifdef MAC - error = mac_vnode_check_write(active_cred, fp->f_cred, vp); - if (error == 0) -#endif - error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); - if ((flags & FOF_OFFSET) == 0) - fp->f_offset = uio->uio_offset; - fp->f_nextoff = uio->uio_offset; - VOP_UNLOCK(vp, 0); - if (vp->v_type != VCHR) - vn_finished_write(mp); -unlock: - VFS_UNLOCK_GIANT(vfslocked); + if (vp->v_type == VREG) { + if ((ioflag & IO_APPEND) || !(flags & FOF_OFFSET)) + /* + * For appenders, punt and lock the whole + * range. It also protects f_offset. + */ + rl_cookie = rangelock_wlock(vp, &rl_entry, + 0, (size_t)-1); + else + rl_cookie = rangelock_wlock(vp, &rl_entry, + uio->uio_offset, uio->uio_resid); + } else + rl_cookie = NULL; + error = do_vn_rw_chunked(fp, uio, active_cred, flags, ioflag, td, + m_hold, vn_write_wired_chunk); + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); return (error); } diff --git a/sys/modules/cxgb/tom/Makefile b/sys/modules/cxgb/tom/Makefile index 2b8750a..d2566fd 100644 --- a/sys/modules/cxgb/tom/Makefile +++ b/sys/modules/cxgb/tom/Makefile @@ -5,7 +5,7 @@ CXGB = ${.CURDIR}/../../../dev/cxgb KMOD= tom SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c -SRCS+= cxgb_ddp.c cxgb_vm.c cxgb_l2t.c cxgb_tcp_offload.c +SRCS+= cxgb_ddp.c cxgb_l2t.c cxgb_tcp_offload.c SRCS+= opt_compat.h opt_inet.h opt_inet6.h opt_ipsec.h opt_mac.h SRCS+= opt_tcpdebug.h opt_ddb.h opt_sched.h opt_global.h opt_ktr.h SRCS+= device_if.h bus_if.h pci_if.h diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 96f811d..9ce96da 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -236,6 +236,9 @@ struct thread { struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ struct osd td_osd; /* (k) Object specific data. */ + vm_offset_t td_faultaddr; /* (k) fault address for TDP_VMUIODEADLK */ + vm_offset_t td_iov_base; /* (k) the region where VMUIODEADLK ... */ + size_t td_iov_len; /* (k) ... is handled */ #define td_endzero td_base_pri /* Copied during fork1() or thread_sched_upcall(). */ @@ -353,7 +356,7 @@ do { \ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock treatment. */ -#define TDP_UNUSED80 0x00000080 /* available. 
*/ +#define TDP_VMUIODEADLK 0x00000080 /* Non-blocking vm_fault required. */ #define TDP_NOSLEEPING 0x00000100 /* Thread is not allowed to sleep on a sq. */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h new file mode 100644 index 0000000..4a5ac1e --- /dev/null +++ b/sys/sys/rangelock.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * $FreeBSD$ + */ + +#ifndef _SYS_RANGELOCK_H +#define _SYS_RANGELOCK_H + +#include +#include +#include +#include +#include + +#ifdef _KERNEL + +struct vnode; + +struct rl_q_entry +{ + TAILQ_ENTRY(rl_q_entry) rl_q_link; + size_t rl_q_start, rl_q_end; + int rl_q_flags; +}; + +#define RL_LOCK_READ 0x0001 +#define RL_LOCK_WRITE 0x0002 +#define RL_LOCK_TYPE_MASK 0x0003 +#define RL_LOCK_GRANTED 0x0004 + +struct rangelock +{ + TAILQ_HEAD(, rl_q_entry) rl_waiters; + struct rl_q_entry *rl_currdep; +}; + +void rangelock_init(struct rangelock *lock); +void rangelock_destroy(struct rangelock *lock); +void rangelock_unlock(struct vnode *vp, void *cookie); +void *rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base, + size_t len); +void *rangelock_rlock(struct vnode *vp, struct rl_q_entry *entry, + off_t base, size_t len); +void *rangelock_wlock(struct vnode *vp, struct rl_q_entry *entry, + off_t base, size_t len); +#endif + +#endif diff --git a/sys/sys/uio.h b/sys/sys/uio.h index 871f93a..6a7cce7 100644 --- a/sys/sys/uio.h +++ b/sys/sys/uio.h @@ -68,8 +68,11 @@ struct uio { enum uio_seg uio_segflg; /* address space */ enum uio_rw uio_rw; /* operation */ struct thread *uio_td; /* owner */ + int uio_flags; }; +#define UIO_ROLLBACK 0x0001 + /* * Limits * @@ -100,6 +103,8 @@ int uiomove_frombuf(void *buf, int buflen, struct uio *uio); int uiomove_fromphys(struct vm_page *ma[], vm_offset_t offset, int n, struct uio *uio); int uiomoveco(void *cp, int n, struct uio *uio, int disposable); +void fwduio(struct uio *uip, int cnt); +void copyuio(struct uio *dst, struct uio *src); #else /* !_KERNEL */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 0a3d1dc..af760a5 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -168,7 +169,8 @@ struct vnode { */ struct vpollinfo *v_pollinfo; /* G Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ - struct lockf *v_lockf; /* Byte-level lock list */ + struct lockf *v_lockf; /* Byte-level adv lock list */ + struct rangelock v_rl; /* Byte-range lock */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 1abb994..34d75e5 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -632,8 +632,8 @@ ffs_write(ap) struct buf *bp; struct thread *td; ufs_lbn_t lbn; - off_t osize; - int seqcount; + off_t osize, s_size; + int seqcount, s_resid; int blkoffset, error, flags, ioflag, resid, size, xfersize; vp = ap->a_vp; @@ -707,6 +707,7 @@ ffs_write(ap) lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; + s_size = ip->i_size; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) @@ -734,8 +735,10 @@ ffs_write(ap) * the prior contents of the pages exposed to a userland * mmap(). XXX deal with uiomove() errors a better way. 
*/ - if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) + if ((bp->b_flags & B_CACHE) == 0 /*&& fs->fs_bsize <= xfersize*/) { vfs_bio_clrbuf(bp); + flags |= BA_CLRBUF; + } if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) @@ -750,6 +753,7 @@ ffs_write(ap) if (size < xfersize) xfersize = size; + s_resid = uio->uio_resid; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if ((ioflag & (IO_VMIO|IO_DIRECT)) && @@ -757,6 +761,41 @@ ffs_write(ap) bp->b_flags |= B_RELBUF; } + if (error == ERESTART && !(flags & BA_CLRBUF)) { + /* + * When uiomove() failed due to vm_fault + * cowardly refused to process a dangerous + * page-in, and the previous content of the + * buffer is garbage, e.g. because supposed + * transfer length was big enough to cover the + * whole buffer, discard it. + */ + if (LIST_EMPTY(&bp->b_dep)) { + bp->b_flags |= B_RELBUF | B_NOCACHE | B_INVAL; + brelse(bp); + } else { + /* + * But cannot discard the buffer with + * dependencies. Since the buffer is + * newly allocated, fill it with + * zeros. If the buffer extended the + * file, truncate. + */ + vfs_bio_clrbuf(bp); + if (ioflag & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + ffs_truncate(vp, s_size, + IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred, + uio->uio_td); + } + uio->uio_offset -= s_resid - uio->uio_resid; + uio->uio_resid = s_resid; + uio->uio_flags |= UIO_ROLLBACK; + break; + } + /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer @@ -808,6 +847,8 @@ ffs_write(ap) ap->a_cred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; + if (error == ERESTART) + uio->uio_flags |= UIO_ROLLBACK; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h index 475a20e..f2b4d16 100644 --- a/sys/vm/vm_extern.h +++ b/sys/vm/vm_extern.h @@ -34,11 +34,13 @@ #define _VM_EXTERN_H_ struct buf; +struct iovec; struct proc; struct vmspace; struct vmtotal; struct mount; struct vnode; +struct uio; #ifdef _KERNEL @@ -56,6 +58,9 @@ void swapout_procs(int); int useracc(void *, int, int); int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t); +int vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, + vm_page_t *mp, int count, vm_prot_t prot); +void vm_fault_unhold_pages(vm_page_t *mp, int count); void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t); int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t); int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int); @@ -84,5 +89,9 @@ int vm_thread_new(struct thread *td, int pages); int vm_thread_new_altkstack(struct thread *td, int pages); void vm_thread_swapin(struct thread *td); void vm_thread_swapout(struct thread *td); +int vm_wireuio(struct uio *uiop, struct vm_page *m_hold[], int wire_bytes, + int *wired_pages); +void vm_unwireuio(struct uio *, struct vm_page *m_hold[], int wired_pages); + #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 3a21616..d354c92 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1,4 +1,30 @@ /*- + +Copyright (c) 2007-2008, Chelsio Inc. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson @@ -206,9 +232,9 @@ unlock_and_deallocate(struct faultstate *fs) * The map in question must be referenced, and remains so. * Caller may hold no locks. */ -int -vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, - int fault_flags) +static int +vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, + int fault_flags, struct vm_page **m_hold) { vm_prot_t prot; int is_first_object_locked, result; @@ -220,8 +246,20 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int faultcount, ahead, behind; struct faultstate fs; struct vnode *vp; + struct thread *td; int locked, error; + td = curthread; + if (td->td_pflags & TDP_VMUIODEADLK) { + KASSERT(td->td_iov_base <= vaddr && + vaddr < td->td_iov_base + td->td_iov_len, + ("uiomove EFAULT %jx %jx %d\n", (uintmax_t)vaddr, + (uintmax_t)td->td_iov_base, td->td_iov_len)); + td->td_faultaddr = vaddr; + return (KERN_VMUIODEADLOCK); + } + vaddr = trunc_page(vaddr); + hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); @@ -949,6 +987,10 @@ vnode_locked: } else { vm_page_activate(fs.m); } + if (m_hold != NULL) { + *m_hold = fs.m; + vm_page_hold(fs.m); + } vm_page_unlock_queues(); vm_page_wakeup(fs.m); @@ -964,6 +1006,14 @@ vnode_locked: return (KERN_SUCCESS); } +int +vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, + int fault_flags) +{ + + return (vm_fault_hold(map, vaddr, fault_type, fault_flags, NULL)); +} + /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. 
It is a "cousin" @@ -1360,3 +1410,108 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) /* return number of pages */ return i; } + +/* + * This routine takes a user's map, array of pages, number of pages, and flags + * and then does the following: + * - validate that the user has access to those pages (flags indicates read + * or write) - if not fail + * - validate that count is enough to hold range number of pages - if not fail + * - fault in any non-resident pages + * - if the user is doing a read force a write fault for any COWed pages + * - if the user is doing a read mark all pages as dirty + * - hold all pages + */ +int +vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp, + int count, vm_prot_t prot) +{ + vm_offset_t end, va; + int faults, rv; + pmap_t pmap; + vm_page_t m, *pages; + + pmap = vm_map_pmap(map); + pages = mp; + addr &= ~PAGE_MASK; + + /* + * Check that virtual address range is legal. + * This check is somewhat bogus as on some architectures kernel + * and user do not share VA - however, it appears that all FreeBSD + * architectures define it + */ + end = addr + (count * PAGE_SIZE); + if (end > VM_MAXUSER_ADDRESS) + return (EFAULT); + + /* + * First optimistically assume that all pages are resident + * (and R/W if for write) if so just mark pages as held (and + * dirty if for write) and return. + */ + vm_page_lock_queues(); + for (pages = mp, faults = 0, va = addr; va < end; + va += PAGE_SIZE, pages++) { + /* + * Page queue mutex is recursable so this is OK. + * It would be really nice if we had an unlocked + * version of this so we were only acquiring the + * pmap lock 1 time as opposed to potentially + * many dozens of times. + */ + *pages = m = pmap_extract_and_hold(pmap, va, prot); + if (m == NULL) { + faults++; + continue; + } + + /* + * Preemptively mark dirty - the pages will never have + * the modified bit set if they are only changed via + * DMA. + */ + if (prot & VM_PROT_WRITE) + vm_page_dirty(m); + } + vm_page_unlock_queues(); + + if (faults == 0) + return (0); + + /* + * Pages either have insufficient permissions or are not present + * trigger a fault where neccessary. + */ + rv = 0; + for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) { + rv = vm_fault_hold(map, va, prot, (prot & VM_PROT_WRITE) ? 
+ VM_FAULT_DIRTY : VM_FAULT_NORMAL, pages); + if (rv) + goto error; + } + return (0); + +error: + vm_page_lock_queues(); + for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) + if (*pages) { + vm_page_unhold(*pages); + *pages = NULL; + } + vm_page_unlock_queues(); + return (EFAULT); +} + +void +vm_fault_unhold_pages(vm_page_t *mp, int count) +{ + + KASSERT(count >= 0, ("negative count %d", count)); + vm_page_lock_queues(); + while (count--) { + vm_page_unhold(*mp); + mp++; + } + vm_page_unlock_queues(); +} diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 2c5821c..3f19a49 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -1374,6 +1374,7 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; + new_entry->pin_count = 0; /* * Insert the new entry into the list @@ -1596,7 +1597,8 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) (prev->protection == entry->protection) && (prev->max_protection == entry->max_protection) && (prev->inheritance == entry->inheritance) && - (prev->wired_count == entry->wired_count)) { + (prev->wired_count == entry->wired_count) && + (prev->pin_count == entry->pin_count)) { vm_map_entry_unlink(map, prev); entry->start = prev->start; entry->offset = prev->offset; @@ -1622,7 +1624,8 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) (next->protection == entry->protection) && (next->max_protection == entry->max_protection) && (next->inheritance == entry->inheritance) && - (next->wired_count == entry->wired_count)) { + (next->wired_count == entry->wired_count) && + (next->pin_count == entry->pin_count)) { vm_map_entry_unlink(map, next); entry->end = next->end; vm_map_entry_resize_free(map, entry); @@ -2796,7 +2799,8 @@ reclip_start: */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || (vm_map_pmap(map) != kernel_pmap && - vm_map_entry_system_wired_count(entry) != 0)) { + vm_map_entry_system_wired_count(entry) != 0) || + entry->pin_count != 0) { entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; @@ -2816,6 +2820,7 @@ reclip_start: holder_entry->max_protection = VM_PROT_NONE; holder_entry->inheritance = VM_INHERIT_NONE; holder_entry->wired_count = 0; + holder_entry->pin_count = 0; vm_map_entry_link(map, entry->prev, holder_entry); } (void) vm_map_unlock_and_wait(map, 0); @@ -3154,6 +3159,7 @@ vmspace_fork(struct vmspace *vm1) new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); new_entry->wired_count = 0; + new_entry->pin_count = 0; /* * Insert the entry into the new map -- we know we're @@ -3181,6 +3187,7 @@ vmspace_fork(struct vmspace *vm1) new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); new_entry->wired_count = 0; + new_entry->pin_count = 0; new_entry->object.vm_object = NULL; vm_map_entry_link(new_map, new_map->header.prev, new_entry); @@ -3865,6 +3872,167 @@ vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) vm_map_unlock_read(map); } +static boolean_t +vm_map_unpin_entries(vm_map_t map, struct uio *uiop, int pinned_entries) +{ + vm_offset_t start; + struct iovec *iov; + vm_map_entry_t entry; + int i, acc, wire; + boolean_t need_wakeup; + + iov = uiop->uio_iov; + need_wakeup = FALSE; + + for (i = 0, acc = 0; acc < pinned_entries; iov++, i++) { + KASSERT(i < uiop->uio_iovcnt, ("wireio: iovcnt overflow %d %d %d", + i, uiop->uio_iovcnt, pinned_entries)); + wire = round_page(iov->iov_len); + if (acc + wire > pinned_entries) + wire = pinned_entries - acc; 
+ start = trunc_page((vm_offset_t)iov->iov_base); + for (;;) { + if (!vm_map_lookup_entry(map, start, &entry)) { +#ifdef INVARIANTS + panic("vm_unwireuio: hole"); +#endif + } + KASSERT(entry->pin_count > 0, ("pin_count %p", entry)); + if (--entry->pin_count == 0 && + (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP)) { + entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; + need_wakeup = TRUE; + } + if (entry->end >= start + wire) { + acc += wire; + break; + } else { + acc += entry->end - start; + wire -= entry->end - start; + start = entry->end; + } + } + } + return (need_wakeup); +} + +/* + * vm_wireuio + * + * Given userspace struct uio, we set up vm state such that after the + * successfull return there will be no page faults during uiomove with + * this uio until vm_unwireuio is called. At most wire_bytes bytes of + * the user address space are held. + * + * Function performs this by first pinning all map entries that will + * be referenced. This guarantees that our ranges of user address + * space cannot be remmapped during the operation. Then, all accessed + * pages are faulted in and held. + */ +int +vm_wireuio(struct uio *uiop, struct vm_page *m_hold[], int wire_bytes, + int *wired_pages) +{ + vm_map_t map; + vm_offset_t start, start1; + struct iovec *iov; + vm_map_entry_t entry; + struct vm_page **m_hold1; + int i, acc, wire, wire_pages, pinned_entries, rv, prot; + int error; + boolean_t need_wakeup; + + KASSERT(round_page(wire_bytes) == wire_bytes, + ("wireuio: wire_bytes is not page-size aligned")); + KASSERT(uiop->uio_segflg == UIO_USERSPACE, + ("wireuio: !UIO_USERSPACE")); + + error = 0; + prot = uiop->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; + m_hold1 = m_hold; + pinned_entries = 0; + *wired_pages = 0; + map = &uiop->uio_td->td_proc->p_vmspace->vm_map; + iov = uiop->uio_iov; + + /* + * Do the pass over iov. + */ + for (i = 0, acc = 0; acc < wire_bytes; iov++, i++) { + wire = round_page(iov->iov_len); + if (acc + wire > wire_bytes) + wire = wire_bytes - acc; + acc += wire; + wire_pages = btoc(wire); + start1 = start = trunc_page((vm_offset_t)iov->iov_base); + if (start < vm_map_min(map) || start + wire > vm_map_max(map) || + start > start + wire) { + error = EINVAL; + goto fault; + } + vm_map_lock(map); + + /* + * Pin each entry referenced by addresses in iov. + */ + for (;;) { + if (!vm_map_lookup_entry(map, start1, &entry) || + (entry->eflags & MAP_ENTRY_IS_HOLDER)) + goto fault_map_locked; + entry->pin_count++; + if (entry->end >= start1 + wire) { + pinned_entries += wire; + break; + } else { + pinned_entries += entry->end - start1; + wire -= entry->end - start1; + start1 = entry->end; + } + } + vm_map_unlock(map); + + /* + * If entries are successfully pinned, the + * corresponding pages are faulted in and held. 
+ */ + rv = vm_fault_hold_user_pages(map, start, m_hold1, wire_pages, + prot); + if (rv != KERN_SUCCESS) { + error = EFAULT; + goto fault; + } + *wired_pages += wire_pages; + m_hold1 += wire_pages; + } + return (0); + fault: + vm_map_lock(map); + fault_map_locked: + need_wakeup = vm_map_unpin_entries(map, uiop, pinned_entries); + vm_map_unlock(map); + vm_fault_unhold_pages(m_hold, *wired_pages); + if (need_wakeup) + vm_map_wakeup(map); + return (error); +} + +void +vm_unwireuio(struct uio *uiop, struct vm_page *m_hold[], int wired_pages) +{ + vm_map_t map; + boolean_t need_wakeup; + + map = &uiop->uio_td->td_proc->p_vmspace->vm_map; + + vm_fault_unhold_pages(m_hold, wired_pages); + + vm_map_lock(map); + need_wakeup = vm_map_unpin_entries(map, uiop, ctob(wired_pages)); + vm_map_unlock(map); + if (need_wakeup) + vm_map_wakeup(map); +} + #include "opt_ddb.h" #ifdef DDB #include diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index f2c4fd3..9310718 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -114,6 +114,7 @@ struct vm_map_entry { vm_inherit_t inheritance; /* inheritance */ int wired_count; /* can be paged if = 0 */ vm_pindex_t lastr; /* last read */ + unsigned pin_count; /* non-exclusive pin count */ }; #define MAP_ENTRY_NOSYNC 0x0001 @@ -383,5 +384,6 @@ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); int vmspace_swap_count (struct vmspace *vmspace); + #endif /* _KERNEL */ #endif /* _VM_MAP_ */ diff --git a/sys/vm/vm_param.h b/sys/vm/vm_param.h index 2ff2603..d866925 100644 --- a/sys/vm/vm_param.h +++ b/sys/vm/vm_param.h @@ -125,6 +125,7 @@ struct xswdev { #define KERN_RESOURCE_SHORTAGE 6 #define KERN_NOT_RECEIVER 7 #define KERN_NO_ACCESS 8 +#define KERN_VMUIODEADLOCK 9 #ifndef ASSEMBLER #ifdef _KERNEL diff --git a/tools/regression/file/uio/uio.c b/tools/regression/file/uio/uio.c new file mode 100644 index 0000000..d857605 --- /dev/null +++ b/tools/regression/file/uio/uio.c @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int chunk_cnt = 1024; +int chunk_size = 1024; + +int +main(int argc, char *argv[]) +{ + struct iovec *wiov, *riov; + char **wdata, **rdata; + int fd, i; + ssize_t io_error; + + if (argc < 2) { + fprintf(stderr, "Usage: uio file [chunk count [chunk size]]\n"); + return (2); + } + fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (fd == -1) { + fprintf(stderr, "Failed to create %s: %s\n", + argv[1], strerror(errno)); + return (1); + } + + if (argc > 2) + chunk_cnt = atoi(argv[2]); + if (argc > 3) + chunk_size = atoi(argv[3]); + + wiov = calloc(chunk_cnt, sizeof(*wiov)); + wdata = calloc(chunk_cnt, sizeof(*wdata)); + + riov = calloc(chunk_cnt, sizeof(*riov)); + rdata = calloc(chunk_cnt, sizeof(*rdata)); + + for (i = 0; i < chunk_cnt; i++) { + rdata[i] = malloc(chunk_size); + riov[i].iov_base = rdata[i]; + riov[i].iov_len = chunk_size; + + wdata[i] = malloc(chunk_size); + memset(wdata[i], i, chunk_size); + wiov[i].iov_base = wdata[i]; + wiov[i].iov_len = chunk_size; + } + + io_error = writev(fd, wiov, chunk_cnt); + if (io_error == -1) { + fprintf(stderr, "write failed: %s\n", strerror(errno)); + return (1); + } else if (io_error != chunk_cnt * chunk_size) { + fprintf(stderr, "truncated write: %d %d\n", + io_error, chunk_cnt * chunk_size); + return (1); + } + + if (lseek(fd, 0, SEEK_SET) == -1) { + fprintf(stderr, "lseek failed: %s\n", strerror(errno)); + return (1); + } + + io_error = readv(fd, riov, chunk_cnt); + if (io_error == -1) { + fprintf(stderr, "read failed: %s\n", strerror(errno)); + return (1); + } else if (io_error != chunk_cnt * chunk_size) { + fprintf(stderr, "truncated read: %d %d\n", + io_error, chunk_cnt * chunk_size); + return (1); + } + + for (i = 0; i < chunk_cnt; i++) { + if (memcmp(rdata[i], wdata[i], chunk_size) != 0) { + fprintf(stderr, "chunk %d differs\n", i); + return (1); + } + } + + return (0); +} diff --git a/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c new file mode 100644 index 0000000..9376648 --- /dev/null +++ b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const int blks = 2; + +static void +flush_buffers(int fd) +{ + struct stat st; + char *addr; + int error; + + printf("Flushing buffers\n"); + error = fstat(fd, &st); + if (error == -1) + err(2, "stat"); + fsync(fd); + addr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (addr == (char *)-1) + err(2, "mmap"); + error = msync(addr, st.st_size, MS_SYNC | MS_INVALIDATE); + if (error == -1) + err(2, "msync"); + munmap(addr, st.st_size); +} + +int +main(int argc, char *argv[]) +{ + struct statfs fst; + char *data, *vrfy; + size_t sz; + int fd, i, error, ret; + + if (argc < 2) + errx(2, "Usage: ba_clrbuf file"); + + fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (fd == -1) + err(2, "Failed to create %s", argv[1]); + + if (fstatfs(fd, &fst) == -1) + err(2, "stat"); + + sz = fst.f_iosize * blks; + data = malloc(sz); + if (data == NULL) + err(2, "malloc"); + vrfy = malloc(sz); + if (vrfy == NULL) + err(2, "malloc"); + for (i = 0; i < (int)sz; i++) + data[i] = i; + error = write(fd, data, sz); + if (error == -1) + err(2, "write"); + else if (error != (int)sz) + errx(2, "Short write %d %d", error, sz); + + flush_buffers(fd); + + error = lseek(fd, 0, SEEK_SET); + if (error == -1) + err(2, "lseek 0"); + else if (error != 0) + errx(2, "lseek 0 returned %d", error); + error = write(fd, NULL, fst.f_iosize); + printf("faulty write, error %s\n", strerror(errno)); + + error = lseek(fd, 0, SEEK_SET); + if (error == -1) + err(2, "lseek 0/2"); + else if (error != 0) + errx(2, "lseek 0/2 returned %d", error); + error = read(fd, vrfy, sz); + if (error == -1) + err(2, "read"); + else if (error != (int)sz) + errx(2, "short read %d %d", error, sz); + + if (memcmp(data, vrfy, fst.f_iosize) != 0) { + printf("Zero block corrupted, byte at 0 is %x\n", + (unsigned char)vrfy[0]); + ret = 1; + } else { + printf("No corruption\n"); + ret = 0; + } + + return (ret); +}
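
---

Reviewer note (not part of the patch): the diff never states the scenario it defends against, so here is a hedged userspace sketch of the classic shape of the problem -- write(2) on a file using a buffer that is an mmap(2) of the same file range, so uiomove() must fault the buffer in while the write path already holds the vnode locked. The file name and sizes below are arbitrary; whether an unpatched kernel actually wedges depends on filesystem and page-residency details, so treat this as an illustration rather than a guaranteed reproducer.

/*
 * Hedged sketch: write a file's own not-yet-resident mapping back into
 * the same file, forcing copyin() inside uiomove() to take hard faults
 * on pages backed by the vnode being written.
 */
#include <sys/types.h>
#include <sys/mman.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "uio_deadlock.scratch";	/* scratch file, assumption */
	size_t sz = 1024 * 1024;
	char *map;
	int fd;

	fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd == -1)
		err(1, "open");
	if (ftruncate(fd, (off_t)sz) == -1)
		err(1, "ftruncate");

	/*
	 * Map the file but do not touch the pages, so the copyin() done
	 * for the write below faults on a mapping of the same vnode.
	 */
	map = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		err(1, "mmap");

	/* File offset is 0, so this writes the mapped range onto itself. */
	if (write(fd, map, sz) != (ssize_t)sz)
		err(1, "write");
	printf("write of the file's own mapping completed\n");

	munmap(map, sz);
	close(fd);
	unlink(path);
	return (0);
}

Rather than letting that fault happen under the vnode lock, the patch wires the user pages up front (vm_wireuio()), or, when a fault would still be needed, fails the copy with ERESTART under TDP_VMUIODEADLK and retries the transfer in bounded, pre-wired chunks via do_vn_rw_chunked().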
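Reviewer note (not part of the patch): kern_rangelock.c grants a queued request only when it does not conflict with any earlier entry on rl_waiters -- two readers are always compatible, anything else conflicts once the byte ranges overlap. Below is a minimal userspace model of that compatibility rule; the names are invented for the sketch, and it uses the standard half-open overlap test, which is the intent behind the patch's IN_RANGE() check.

/*
 * Hedged sketch: the read/write range compatibility rule of the new
 * range lock, modelled in plain userspace C.  The kernel keys the same
 * decision off rl_q_flags and rl_q_start/rl_q_end in struct rl_q_entry.
 */
#include <sys/types.h>

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

enum rl_type { RL_READ, RL_WRITE };

struct range_req {
	enum rl_type	type;
	off_t		start;	/* first byte covered */
	off_t		end;	/* one past the last byte covered */
};

/* Half-open intervals [start, end) overlap iff each starts before the other ends. */
static bool
ranges_overlap(const struct range_req *a, const struct range_req *b)
{

	return (a->start < b->end && b->start < a->end);
}

/* Two requests conflict unless both are reads or their ranges are disjoint. */
static bool
ranges_conflict(const struct range_req *a, const struct range_req *b)
{

	if (a->type == RL_READ && b->type == RL_READ)
		return (false);
	return (ranges_overlap(a, b));
}

int
main(void)
{
	struct range_req r1 = { RL_READ,  0,    4096  };
	struct range_req r2 = { RL_READ,  100,  200   };
	struct range_req w1 = { RL_WRITE, 0,    4096  };
	struct range_req w2 = { RL_WRITE, 8192, 12288 };

	assert(!ranges_conflict(&r1, &r2));	/* readers never block readers */
	assert(ranges_conflict(&r1, &w1));	/* overlapping writer blocks */
	assert(!ranges_conflict(&w1, &w2));	/* disjoint writers proceed */
	printf("range compatibility checks passed\n");
	return (0);
}

In the patch, rangelock_calc_block() walks rl_waiters in queue order and grants everything in front of rl_currdep, the first entry that conflicts with an earlier waiter; that keeps grants FIFO and prevents a stream of readers from starving a queued writer.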
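Reviewer note (not part of the patch): the uiomove()/ffs_write() changes lean on simple bookkeeping -- fwduio() advances iov_base, iov_len, uio_resid, and uio_offset together, and when a chunk must be abandoned ffs_write() rewinds uio_offset by the bytes consumed and restores uio_resid so the caller can retry. The sketch below models that arithmetic for a single-iovec transfer; the struct is a stripped-down stand-in, not the kernel's struct uio.

/*
 * Hedged sketch: the advance/rewind bookkeeping behind fwduio() and the
 * UIO_ROLLBACK retry path, for a single-iovec transfer.
 */
#include <sys/types.h>

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct mini_uio {
	char	*base;		/* iov_base of the single iovec */
	size_t	 len;		/* iov_len */
	size_t	 resid;		/* bytes still to transfer */
	off_t	 offset;	/* file offset of the next byte */
};

/* Mirror of fwduio(): consume cnt bytes from the front of the transfer. */
static void
fwd_uio(struct mini_uio *uio, size_t cnt)
{

	uio->base += cnt;
	uio->len -= cnt;
	uio->resid -= cnt;
	uio->offset += (off_t)cnt;
}

/* Give back whatever the failed chunk consumed since the snapshot. */
static void
rollback_uio(struct mini_uio *uio, size_t saved_resid)
{
	size_t done = saved_resid - uio->resid;

	uio->offset -= (off_t)done;
	uio->resid = saved_resid;
	uio->base -= done;
	uio->len += done;
}

int
main(void)
{
	char buf[8192];
	struct mini_uio uio = { buf, sizeof(buf), sizeof(buf), 1024 };
	size_t s_resid;

	fwd_uio(&uio, 4096);		/* first chunk completed */
	assert(uio.offset == 1024 + 4096 && uio.resid == 4096);

	s_resid = uio.resid;		/* snapshot before the risky chunk */
	fwd_uio(&uio, 1000);		/* chunk partially done, then faults */
	rollback_uio(&uio, s_resid);	/* rewind to the snapshot */
	assert(uio.offset == 1024 + 4096 && uio.resid == 4096 &&
	    uio.base == buf + 4096);
	printf("uio advance/rollback arithmetic checks out\n");
	return (0);
}

In the patch itself the iovec position is re-derived by copyuio() plus a UIO_NOCOPY uiomove() in do_vn_rw_chunked(); the sketch folds that step into rollback_uio() for brevity.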