sys/conf/files | 1 + sys/fs/nfsclient/nfs_clbio.c | 62 +++++-- sys/fs/nfsclient/nfs_clvfsops.c | 3 +- sys/kern/kern_rangelock.c | 246 ++++++++++++++++++++++++++++ sys/kern/kern_thread.c | 3 + sys/kern/subr_syscall.c | 6 + sys/kern/vfs_subr.c | 2 + sys/kern/vfs_vnops.c | 338 ++++++++++++++++++++++++++++++++++----- sys/sys/mount.h | 3 + sys/sys/proc.h | 6 +- sys/sys/rangelock.h | 78 +++++++++ sys/sys/vnode.h | 16 +- sys/ufs/ffs/ffs_vfsops.c | 2 +- sys/ufs/ffs/ffs_vnops.c | 6 +- 14 files changed, 709 insertions(+), 63 deletions(-) diff --git a/sys/conf/files b/sys/conf/files index 3c84cf6..843f2dd 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2523,6 +2523,7 @@ kern/kern_priv.c standard kern/kern_proc.c standard kern/kern_prot.c standard kern/kern_racct.c standard +kern/kern_rangelock.c standard kern/kern_rctl.c standard kern/kern_resource.c standard kern/kern_rmlock.c standard diff --git a/sys/fs/nfsclient/nfs_clbio.c b/sys/fs/nfsclient/nfs_clbio.c index fa9636b..df9677c 100644 --- a/sys/fs/nfsclient/nfs_clbio.c +++ b/sys/fs/nfsclient/nfs_clbio.c @@ -722,7 +722,7 @@ ncl_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) }; if (n > 0) { - error = uiomove(bp->b_data + on, (int)n, uio); + error = vn_io_fault_uiomove(bp->b_data + on, (int)n, uio); } if (vp->v_type == VLNK) n = 0; @@ -897,8 +897,9 @@ ncl_write(struct vop_write_args *ap) struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; int bcount; - int n, on, error = 0; - off_t tmp_off; + int bp_cached, n, on, error = 0; + size_t orig_resid, local_resid; + off_t orig_size, tmp_off; KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, @@ -950,6 +951,11 @@ flush_and_restart: mtx_unlock(&np->n_mtx); } + orig_resid = uio->uio_resid; + mtx_lock(&np->n_mtx); + orig_size = np->n_size; + mtx_unlock(&np->n_mtx); + /* * If IO_APPEND then load uio_offset. We restart here if we cannot * get the append lock. 
@@ -1127,7 +1133,10 @@ again: * normally. */ + bp_cached = 1; if (on == 0 && n == bcount) { + if ((bp->b_flags & B_CACHE) == 0) + bp_cached = 0; bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; @@ -1178,7 +1187,7 @@ again: * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. * - * as an optimization we could theoretically maintain + * As an optimization we could theoretically maintain * a linked list of discontinuous areas, but we would still * have to commit them separately so there isn't much * advantage to it except perhaps a bit of asynchronization. @@ -1193,7 +1202,23 @@ again: goto again; } - error = uiomove((char *)bp->b_data + on, n, uio); + local_resid = uio->uio_resid; + error = vn_io_fault_uiomove((char *)bp->b_data + on, n, uio); + + if (error != 0 && !bp_cached) { + /* + * This block has no other content than what + * possibly was written by faulty uiomove. + * Release it, forgetting the data pages, to + * prevent the leak of uninitialized data to + * usermode. + */ + bp->b_ioflags |= BIO_ERROR; + brelse(bp); + uio->uio_offset -= local_resid - uio->uio_resid; + uio->uio_resid = local_resid; + break; + } /* * Since this block is being modified, it must be written @@ -1203,17 +1228,18 @@ again: */ bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); - if (error) { - bp->b_ioflags |= BIO_ERROR; - brelse(bp); - break; - } + /* + * Get the partial update on the progress made from + * uiomove, if error occurred. + */ + if (error != 0) + n = local_resid - uio->uio_resid; /* * Only update dirtyoff/dirtyend if not a degenerate * condition. 
*/ - if (n) { + if (n > 0) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = min(on, bp->b_dirtyoff); bp->b_dirtyend = max((on + n), bp->b_dirtyend); @@ -1242,8 +1268,22 @@ again: } else { bdwrite(bp); } + + if (error != 0) + break; } while (uio->uio_resid > 0 && n > 0); + if (error != 0) { + if (ioflag & IO_UNIT) { + VATTR_NULL(&vattr); + vattr.va_size = orig_size; + /* IO_SYNC is handled implicitly */ + (void)VOP_SETATTR(vp, &vattr, cred); + uio->uio_offset -= orig_resid - uio->uio_resid; + uio->uio_resid = orig_resid; + } + } + return (error); } diff --git a/sys/fs/nfsclient/nfs_clvfsops.c b/sys/fs/nfsclient/nfs_clvfsops.c index af0e33b..966688f 100644 --- a/sys/fs/nfsclient/nfs_clvfsops.c +++ b/sys/fs/nfsclient/nfs_clvfsops.c @@ -1136,7 +1136,8 @@ nfs_mount(struct mount *mp) out: if (!error) { MNT_ILOCK(mp); - mp->mnt_kern_flag |= (MNTK_MPSAFE|MNTK_LOOKUP_SHARED); + mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED | + MNTK_NO_IOPF; MNT_IUNLOCK(mp); } return (error); } diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c new file mode 100644 index 0000000..5e02717 --- /dev/null +++ b/sys/kern/kern_rangelock.c @@ -0,0 +1,246 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include + +struct rl_q_entry { + TAILQ_ENTRY(rl_q_entry) rl_q_link; + off_t rl_q_start, rl_q_end; + int rl_q_flags; +}; + +static uma_zone_t rl_entry_zone; + +static void +rangelock_sys_init(void) +{ + + rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); +} +SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL); + +static struct rl_q_entry * +rlqentry_alloc(void) +{ + + return (uma_zalloc(rl_entry_zone, M_WAITOK)); +} + +void +rlqentry_free(struct rl_q_entry *rleq) +{ + + uma_zfree(rl_entry_zone, rleq); +} + +void +rangelock_init(struct rangelock *lock) +{ + + TAILQ_INIT(&lock->rl_waiters); + lock->rl_currdep = NULL; +} + +void +rangelock_destroy(struct rangelock *lock) +{ + + KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters")); +} + +/* + * Verifies the supplied rl_q_entries for compatibility. Returns true + * if rangelock queue entries are not compatible, false if they are. + * + * Two entries are compatible if their ranges do not overlap, or both + * entries are for read. 
+ */ +static int +rangelock_incompatible(const struct rl_q_entry *e1, + const struct rl_q_entry *e2) +{ + + if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ && + (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ) + return (0); + if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start) + return (1); + return (0); +} + +/* + * Recalculate the lock->rl_currdep after an unlock. + */ +static void +rangelock_calc_block(struct rangelock *lock) +{ + struct rl_q_entry *entry, *entry1, *whead; + + if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) && + lock->rl_currdep != NULL) + lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link); + for (entry = lock->rl_currdep; entry != NULL; + entry = TAILQ_NEXT(entry, rl_q_link)) { + TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) { + if (rangelock_incompatible(entry, entry1)) + goto out; + if (entry1 == entry) + break; + } + } +out: + lock->rl_currdep = entry; + TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) { + if (whead == lock->rl_currdep) + break; + if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) { + whead->rl_q_flags |= RL_LOCK_GRANTED; + wakeup(whead); + } + } +} + +static void +rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry, + struct mtx *ilk) +{ + + MPASS(lock != NULL && entry != NULL && ilk != NULL); + mtx_assert(ilk, MA_OWNED); + KASSERT(entry != lock->rl_currdep, ("stuck currdep")); + + TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link); + rangelock_calc_block(lock); + mtx_unlock(ilk); + if (curthread->td_rlqe == NULL) + curthread->td_rlqe = entry; + else + rlqentry_free(entry); +} + +void +rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk) +{ + + MPASS(lock != NULL && cookie != NULL && ilk != NULL); + + mtx_lock(ilk); + rangelock_unlock_locked(lock, cookie, ilk); +} + +/* + * Unlock the sub-range of granted lock. 
+ */ +void * +rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start, + off_t end, struct mtx *ilk) +{ + struct rl_q_entry *entry; + + MPASS(lock != NULL && cookie != NULL && ilk != NULL); + entry = cookie; + KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, + ("Unlocking non-granted lock")); + KASSERT(entry->rl_q_start == start, ("wrong start")); + KASSERT(entry->rl_q_end >= end, ("wrong end")); + + mtx_lock(ilk); + if (entry->rl_q_end == end) { + rangelock_unlock_locked(lock, cookie, ilk); + return (NULL); + } + entry->rl_q_end = end; + rangelock_calc_block(lock); + mtx_unlock(ilk); + return (cookie); +} + +/* + * Add the lock request to the queue of the pending requests for + * rangelock. Sleeps until the request can be granted. + */ +static void * +rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode, + struct mtx *ilk) +{ + struct rl_q_entry *entry; + struct thread *td; + + MPASS(lock != NULL && ilk != NULL); + + td = curthread; + if (td->td_rlqe != NULL) { + entry = td->td_rlqe; + td->td_rlqe = NULL; + } else + entry = rlqentry_alloc(); + MPASS(entry != NULL); + entry->rl_q_flags = mode; + entry->rl_q_start = start; + entry->rl_q_end = end; + + mtx_lock(ilk); + /* + * XXXKIB TODO. Check that thread does not try to enqueue a + * lock which is incompatible with other request from the same + * thread. 
+ */ + + TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link); + if (lock->rl_currdep == NULL) + lock->rl_currdep = entry; + rangelock_calc_block(lock); + while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) + msleep(entry, ilk, 0, "range", 0); + mtx_unlock(ilk); + return (entry); +} + +void * +rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk) +{ + + return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk)); +} + +void * +rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk) +{ + + return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk)); +} diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index d4c5c4c..8116c15 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -199,6 +200,7 @@ thread_init(void *mem, int size, int flags) td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); + td->td_rlqe = NULL; EVENTHANDLER_INVOKE(thread_init, td); td->td_sched = (struct td_sched *)&td[1]; umtx_thread_init(td); @@ -216,6 +218,7 @@ thread_fini(void *mem, int size) td = (struct thread *)mem; EVENTHANDLER_INVOKE(thread_fini, td); + rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c index 5b48595..5aee684 100644 --- a/sys/kern/subr_syscall.c +++ b/sys/kern/subr_syscall.c @@ -182,6 +182,12 @@ syscallret(struct thread *td, int error, struct syscall_args *sa __unused) KASSERT(td->td_locks == 0, ("System call %s returning with %d locks held", syscallname(p, sa->code), td->td_locks)); + KASSERT((td->td_pflags & TDP_NOFAULTING) == 0, + ("System call %s returning with pagefaults disabled", + syscallname(p, sa->code))); + KASSERT((td->td_pflags & TDP_NOSLEEPING) == 0, + ("System call %s returning with sleep disabled", + syscallname(p, sa->code))); 
/* * Handle reschedule and other end-of-syscall issues diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index a06ba31..8d999c3 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1027,6 +1027,7 @@ alloc: if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } + rangelock_init(&vp->v_rl); *vpp = vp; return (0); @@ -2468,6 +2469,7 @@ vdropl(struct vnode *vp) /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif + rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); mtx_destroy(BO_MTX(bo)); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index d4b60f1..5b4799e 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -65,10 +65,15 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include +#include #include +#include static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; +static fo_rdwr_t vn_io_fault; static fo_truncate_t vn_truncate; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; @@ -77,8 +82,8 @@ static fo_stat_t vn_statfile; static fo_close_t vn_closefile; struct fileops vnops = { - .fo_read = vn_read, - .fo_write = vn_write, + .fo_read = vn_io_fault, + .fo_write = vn_io_fault, .fo_truncate = vn_truncate, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, @@ -367,57 +372,56 @@ sequential_heuristic(struct uio *uio, struct file *fp) * Package up an I/O request on a vnode into a uio and do it. 
*/ int -vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, - aresid, td) - enum uio_rw rw; - struct vnode *vp; - void *base; - int len; - off_t offset; - enum uio_seg segflg; - int ioflg; - struct ucred *active_cred; - struct ucred *file_cred; - ssize_t *aresid; - struct thread *td; +vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, + enum uio_seg segflg, int ioflg, struct ucred *active_cred, + struct ucred *file_cred, ssize_t *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; + void *rl_cookie; int error, lock_flags; VFS_ASSERT_GIANT(vp->v_mount); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + error = 0; + if ((ioflg & IO_NODELOCKED) == 0) { + if (rw == UIO_READ) { + rl_cookie = vn_rangelock_rlock(vp, offset, + offset + len); + } else { + rl_cookie = vn_rangelock_wlock(vp, offset, + offset + len); + } mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - return (error); + goto out; if (MNT_SHARED_WRITES(mp) || - ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) { + ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) lock_flags = LK_SHARED; - } else { + else lock_flags = LK_EXCLUSIVE; - } - vn_lock(vp, lock_flags | LK_RETRY); } else - vn_lock(vp, LK_SHARED | LK_RETRY); + lock_flags = LK_SHARED; + vn_lock(vp, lock_flags | LK_RETRY); + } else + rl_cookie = NULL; - } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = base; - aiov.iov_len = len; - auio.uio_resid = len; - auio.uio_offset = offset; - auio.uio_segflg = segflg; - auio.uio_rw = rw; - auio.uio_td = td; - error = 0; #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) @@ -429,7 +433,7 @@ 
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, } #endif if (error == 0) { - if (file_cred) + if (file_cred != NULL) cred = file_cred; else cred = active_cred; @@ -444,10 +448,13 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { - if (rw == UIO_WRITE && vp->v_type != VCHR) - vn_finished_write(mp); VOP_UNLOCK(vp, 0); + if (mp != NULL) + vn_finished_write(mp); } + out: + if (rl_cookie != NULL) + vn_rangelock_unlock(vp, rl_cookie); return (error); } @@ -688,29 +695,270 @@ unlock: return (error); } +static const int io_hold_cnt = 16; + /* - * File table truncate routine. + * The vn_io_fault() is a wrapper around vn_read() and vn_write() to + * prevent the following deadlock: + * + * Assume that the thread A reads from the vnode vp1 into userspace + * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is + * currently not resident, then system ends up with the call chain + * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> + * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) + * which establishes lock order vp1->vn_lock, then vp2->vn_lock. + * If, at the same time, thread B reads from vnode vp2 into buffer buf2 + * backed by the pages of vnode vp1, and some page in buf2 is not + * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. + * + * To prevent the lock order reversal and deadlock, vn_io_fault() does + * not allow page faults to happen during VOP_READ() or VOP_WRITE(). + * Instead, it first tries to do the whole range i/o with pagefaults + * disabled. If all pages in the i/o buffer are resident and mapped, + * VOP will succeed (ignoring the genuine filesystem errors). + * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do + * i/o in chunks, with all pages in the chunk prefaulted and held + * using vm_fault_quick_hold_pages(). 
+ * + * Filesystems using this deadlock avoidance scheme should use the + * array of the held pages from uio, saved in the curthread->td_ma, + * instead of doing uiomove(). A helper function + * vn_io_fault_uiomove() converts uiomove request into + * uiomove_fromphys() over td_ma array. + * + * Since vnode locks do not cover the whole i/o anymore , rangelocks + * make the current i/o request atomic with respect to other i/os and + * truncations. */ static int -vn_truncate(fp, length, active_cred, td) - struct file *fp; - off_t length; - struct ucred *active_cred; +vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + vm_page_t ma[io_hold_cnt + 2]; + struct uio *uio_clone, short_uio; + struct iovec short_iovec[1]; + fo_rdwr_t *doio; + struct vnode *vp; + void *rl_cookie; + struct mount *mp; + vm_page_t *prev_td_ma; + int cnt, error, save, saveheld, prev_td_ma_cnt; + vm_offset_t addr, end; + vm_prot_t prot; + size_t len, resid; + ssize_t adv; + + if (uio->uio_rw == UIO_READ) + doio = vn_read; + else + doio = vn_write; + vp = fp->f_vnode; + if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG || + ((mp = vp->v_mount) != NULL && + (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0)) + return (doio(fp, uio, active_cred, flags, td)); + + /* + * The UFS follows IO_UNIT directive and replays back both + * uio_offset and uio_resid if error encountered during the + * operation. But, since the iovec may be already advanced, + * uio is still in the inconsistent state. + * + * Cache a copy of the original uio, which is advanced to redo + * point using UIO_NOCOPY below. 
+ */ + uio_clone = cloneuio(uio); + resid = uio->uio_resid; + + short_uio.uio_segflg = UIO_USERSPACE; + short_uio.uio_rw = uio->uio_rw; + short_uio.uio_td = uio->uio_td; + + if (uio->uio_rw == UIO_READ) { + prot = VM_PROT_WRITE; + rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, + uio->uio_offset + uio->uio_resid); + } else { + prot = VM_PROT_READ; + if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0) + /* For appenders, punt and lock the whole range. */ + rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); + else + rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, + uio->uio_offset + uio->uio_resid); + } + + save = vm_fault_disable_pagefaults(); + error = doio(fp, uio, active_cred, flags, td); + if (error != EFAULT) + goto out; + + uio_clone->uio_segflg = UIO_NOCOPY; + uiomove(NULL, resid - uio->uio_resid, uio_clone); + uio_clone->uio_segflg = uio->uio_segflg; + + saveheld = curthread_pflags_set(TDP_UIOHELD); + prev_td_ma = td->td_ma; + prev_td_ma_cnt = td->td_ma_cnt; + + while (uio_clone->uio_resid != 0) { + len = uio_clone->uio_iov->iov_len; + if (len == 0) { + KASSERT(uio_clone->uio_iovcnt >= 1, + ("iovcnt underflow")); + uio_clone->uio_iov++; + uio_clone->uio_iovcnt--; + continue; + } + + addr = (vm_offset_t)uio_clone->uio_iov->iov_base; + end = round_page(addr + len); + cnt = howmany(end - trunc_page(addr), PAGE_SIZE); + /* + * Perfectly misaligned address and length could cause + * both start and end of the chunk to use partial + * page. +2 accounts for such situation. 
+ */ + if (cnt > io_hold_cnt + 2) { + len = io_hold_cnt * PAGE_SIZE; + KASSERT(howmany(round_page(addr + len) - + trunc_page(addr), PAGE_SIZE) <= io_hold_cnt + 2, + ("cnt overflow")); + } + cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, + addr, len, prot, ma, io_hold_cnt + 2); + if (cnt == -1) { + error = EFAULT; + break; + } + short_uio.uio_iov = &short_iovec[0]; + short_iovec[0].iov_base = (void *)addr; + short_uio.uio_iovcnt = 1; + short_uio.uio_resid = short_iovec[0].iov_len = len; + short_uio.uio_offset = uio_clone->uio_offset; + td->td_ma = ma; + td->td_ma_cnt = cnt; + + error = doio(fp, &short_uio, active_cred, flags, td); + vm_page_unhold_pages(ma, cnt); + adv = len - short_uio.uio_resid; + + uio_clone->uio_iov->iov_base = + (char *)uio_clone->uio_iov->iov_base + adv; + uio_clone->uio_iov->iov_len -= adv; + uio_clone->uio_resid -= adv; + uio_clone->uio_offset += adv; + + uio->uio_resid -= adv; + uio->uio_offset += adv; + + if (error != 0 || adv == 0) + break; + } + td->td_ma = prev_td_ma; + td->td_ma_cnt = prev_td_ma_cnt; + curthread_pflags_restore(saveheld); +out: + vm_fault_enable_pagefaults(save); + vn_rangelock_unlock(vp, rl_cookie); + free(uio_clone, M_IOV); + return (error); +} + +/* + * Helper function to perform the requested uiomove operation using + * the held pages for io->uio_iov[0].iov_base buffer instead of + * copyin/copyout. Access to the pages with uiomove_fromphys() + * instead of iov_base prevents page faults that could occur due to + * pmap_collect() invalidating the mapping created by + * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or + * object cleanup revoking the write access from page mappings. + * + * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() + * instead of plain uiomove(). 
+ */ +int +vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) +{ + struct uio transp_uio; + struct iovec transp_iov[1]; struct thread *td; + size_t adv; + int error, pgadv; + + td = curthread; + if ((td->td_pflags & TDP_UIOHELD) == 0 || + uio->uio_segflg != UIO_USERSPACE) + return (uiomove(data, xfersize, uio)); + + KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); + transp_iov[0].iov_base = data; + transp_uio.uio_iov = &transp_iov[0]; + transp_uio.uio_iovcnt = 1; + if (xfersize > uio->uio_resid) + xfersize = uio->uio_resid; + transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; + transp_uio.uio_offset = 0; + transp_uio.uio_segflg = UIO_SYSSPACE; + /* + * Since transp_iov points to data, and td_ma page array + * corresponds to original uio->uio_iov, we need to invert the + * direction of the i/o operation as passed to + * uiomove_fromphys(). + */ + switch (uio->uio_rw) { + case UIO_WRITE: + transp_uio.uio_rw = UIO_READ; + break; + case UIO_READ: + transp_uio.uio_rw = UIO_WRITE; + break; + } + transp_uio.uio_td = uio->uio_td; + error = uiomove_fromphys(td->td_ma, + ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, + xfersize, &transp_uio); + adv = xfersize - transp_uio.uio_resid; + pgadv = + (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - + (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); + td->td_ma += pgadv; + KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, + pgadv)); + td->td_ma_cnt -= pgadv; + uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; + uio->uio_iov->iov_len -= adv; + uio->uio_resid -= adv; + uio->uio_offset += adv; + return (error); +} + +/* + * File table truncate routine. + */ +static int +vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) { struct vattr vattr; struct mount *mp; struct vnode *vp; + void *rl_cookie; int vfslocked; int error; vp = fp->f_vnode; + + /* + * Lock the whole range for truncation. 
Otherwise splitted + * i/o might partially happen before, partially after the + * truncation. + */ + rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); - if (error) { - VFS_UNLOCK_GIANT(vfslocked); - return (error); - } + if (error) + goto out1; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) { error = EISDIR; @@ -730,7 +978,9 @@ vn_truncate(fp, length, active_cred, td) out: VOP_UNLOCK(vp, 0); vn_finished_write(mp); +out1: VFS_UNLOCK_GIANT(vfslocked); + vn_rangelock_unlock(vp, rl_cookie); return (error); } diff --git a/sys/sys/mount.h b/sys/sys/mount.h index 319e094..9bf8d08 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -369,6 +369,9 @@ void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp); #define MNTK_REFEXPIRE 0x00000020 /* refcount expiring is happening */ #define MNTK_EXTENDED_SHARED 0x00000040 /* Allow shared locking for more ops */ #define MNTK_SHARED_WRITES 0x00000080 /* Allow shared locking for writes */ +#define MNTK_NO_IOPF 0x00000100 /* Disallow page faults during reads + and writes. Filesystem shall properly + handle i/o state on EFAULT. */ #define MNTK_NOASYNC 0x00800000 /* disable async */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 0873927..0ba8162 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -213,6 +213,7 @@ struct thread { struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ + struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. 
*/ @@ -311,7 +312,9 @@ struct thread { struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ - struct proc *td_rfppwait_p; /* (k) The vforked child */ + struct proc *td_rfppwait_p; /* (k) The vforked child */ + struct vm_page **td_ma; /* (k) uio pages held */ + int td_ma_cnt; /* (k) size of *td_ma */ }; struct mtx *thread_lock_block(struct thread *); @@ -419,6 +422,7 @@ do { \ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ +#define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ /* * Reasons that the current thread can not be run yet. diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h new file mode 100644 index 0000000..bf82183 --- /dev/null +++ b/sys/sys/rangelock.h @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_RANGELOCK_H +#define _SYS_RANGELOCK_H + +#include + +#define RL_LOCK_READ 0x0001 +#define RL_LOCK_WRITE 0x0002 +#define RL_LOCK_TYPE_MASK 0x0003 +#define RL_LOCK_GRANTED 0x0004 + +struct rl_q_entry; + +/* + * The structure representing the range lock. Caller may request the + * read or write access to the range of bytes. Access is granted if + * all existing lock owners are compatible with the request. Two lock + * owners are compatible if their ranges do not overlap, or both + * owners are for read. + * + * Access to the structure itself is synchronized with the externally + * supplied mutex. + * + * rl_waiters is the queue of lock requests in the order of arrival. + * rl_currdep is the first lock request that cannot be granted now due + * to the preceding requests conflicting with it. 
+ */ +struct rangelock { + TAILQ_HEAD(, rl_q_entry) rl_waiters; + struct rl_q_entry *rl_currdep; +}; + +#ifdef _KERNEL + +struct mtx; + +void rangelock_init(struct rangelock *lock); +void rangelock_destroy(struct rangelock *lock); +void rangelock_unlock(struct rangelock *lock, void *cookie, + struct mtx *ilk); +void *rangelock_unlock_range(struct rangelock *lock, void *cookie, + off_t start, off_t end, struct mtx *ilk); +void *rangelock_rlock(struct rangelock *lock, off_t start, off_t end, + struct mtx *ilk); +void *rangelock_wlock(struct rangelock *lock, off_t start, off_t end, + struct mtx *ilk); +void rlqentry_free(struct rl_q_entry *rlqe); + +#endif /* _KERNEL */ + +#endif /* _SYS_RANGELOCK_H */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 49f6f5b..97e3e29 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -164,7 +165,8 @@ struct vnode { */ struct vpollinfo *v_pollinfo; /* i Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ - struct lockf *v_lockf; /* Byte-level lock list */ + struct lockf *v_lockf; /* Byte-level adv lock list */ + struct rangelock v_rl; /* Byte-range lock */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ @@ -676,7 +678,17 @@ int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); - +int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); + +#define vn_rangelock_unlock(vp, cookie) \ + rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) +#define vn_rangelock_unlock_range(vp, cookie, start, end) \ + rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ + VI_MTX(vp)) +#define vn_rangelock_rlock(vp, start, end) \ + rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) +#define vn_rangelock_wlock(vp, start, end) \ + rangelock_wlock(&(vp)->v_rl, (start), (end), 
VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 9aff694..fee8012 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1064,7 +1064,7 @@ ffs_mountfs(devvp, mp, td) */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED | - MNTK_EXTENDED_SHARED; + MNTK_EXTENDED_SHARED | MNTK_NO_IOPF; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 0699eef..6259911 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -567,7 +567,7 @@ ffs_read(ap) xfersize = size; } - error = uiomove((char *)bp->b_data + blkoffset, + error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; @@ -738,8 +738,8 @@ ffs_write(ap) if (size < xfersize) xfersize = size; - error = - uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); + error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, + (int)xfersize, uio); /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any