diff --git a/sys/conf/files b/sys/conf/files index 70113b3..6713e75 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2166,6 +2166,7 @@ kern/kern_poll.c optional device_polling kern/kern_priv.c standard kern/kern_proc.c standard kern/kern_prot.c standard +kern/kern_rangelock.c standard kern/kern_resource.c standard kern/kern_rmlock.c standard kern/kern_rwlock.c standard @@ -2874,6 +2875,7 @@ vm/vm_page.c standard vm/vm_pageout.c standard vm/vm_pager.c standard vm/vm_phys.c standard +vm/vm_readwrite.c standard vm/vm_reserv.c standard vm/vm_unix.c standard vm/vm_zeroidle.c standard diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c index ccd9039..a201bd0 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -85,6 +85,7 @@ #include #include #include +#include #include #include #include @@ -589,7 +590,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp) { struct sf_buf *sf; int rv, offs, len, lastend; - vm_pindex_t i, lastp; + vm_pindex_t i, firstp, lastp; vm_page_t m; u_char *p; @@ -612,18 +613,26 @@ mdstart_swap(struct md_s *sc, struct bio *bp) * we're operating on complete aligned pages). */ offs = bp->bio_offset % PAGE_SIZE; + firstp = bp->bio_offset / PAGE_SIZE; lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE; lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1; + vm_page_t ma[lastp - firstp + 1]; + rv = VM_PAGER_OK; VM_OBJECT_LOCK(sc->object); vm_object_pip_add(sc->object, 1); - for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) { + for (i = firstp; i <= lastp; i++) { len = ((i == lastp) ? lastend : PAGE_SIZE) - offs; - m = vm_page_grab(sc->object, i, - VM_ALLOC_NORMAL|VM_ALLOC_RETRY); + /* + * Write cleans pages of the buffer, give it a + * priority. + */ + m = vm_page_grab(sc->object, i, (bp->bio_cmd == BIO_WRITE ? + VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_RETRY); VM_OBJECT_UNLOCK(sc->object); + ma[i - firstp] = m; sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); VM_OBJECT_LOCK(sc->object); @@ -685,6 +694,9 @@ printf("wire_count %d busy %d flags %x hold_count %d act_count %d queue %d valid } vm_object_pip_subtract(sc->object, 1); vm_object_set_writeable_dirty(sc->object); + if (rv != VM_PAGER_ERROR && bp->bio_cmd == BIO_WRITE && + vm_page_count_severe()) + vm_pageout_flush(ma, lastp - firstp + 1, IO_SYNC, 0, NULL); VM_OBJECT_UNLOCK(sc->object); return (rv != VM_PAGER_ERROR ? 0 : ENOSPC); } diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c new file mode 100644 index 0000000..fc0ae39 --- /dev/null +++ b/sys/kern/kern_rangelock.c @@ -0,0 +1,237 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include + +struct rl_q_entry { + TAILQ_ENTRY(rl_q_entry) rl_q_link; + size_t rl_q_start, rl_q_end; + int rl_q_flags; +}; + +static uma_zone_t rl_entry_zone; + +static void +rangelock_sys_init(void) +{ + + rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + thread0.td_rlqe = rlqentry_alloc(); +} +SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL); + +struct rl_q_entry * +rlqentry_alloc() +{ + + return (uma_zalloc(rl_entry_zone, M_WAITOK)); +} + +void +rlqentry_free(struct rl_q_entry *rleq) +{ + + uma_zfree(rl_entry_zone, rleq); +} + +void +rangelock_init(struct rangelock *lock) +{ + + TAILQ_INIT(&lock->rl_waiters); + lock->rl_currdep = NULL; +} + +void +rangelock_destroy(struct rangelock *lock) +{ + + KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters")); +} + +static int +rangelock_incompatible(const struct rl_q_entry *e1, + const struct rl_q_entry *e2) +{ + + if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ && + (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ) + return (0); +#define IN_RANGE(a, e) (a >= e->rl_q_start && a < e->rl_q_end) + if (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) || + IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1)) + return (1); +#undef IN_RANGE + return (0); +} + +static void +rangelock_calc_block(struct rangelock *lock) +{ + struct rl_q_entry *entry, *entry1, *whead; + + if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) && + lock->rl_currdep != NULL) + lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link); + for (entry = lock->rl_currdep; entry; + entry = TAILQ_NEXT(entry, rl_q_link)) { + TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) { + if (rangelock_incompatible(entry, entry1)) + goto out; + if (entry1 == entry) + break; + } + } +out: + lock->rl_currdep = entry; + TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) { + if (whead == lock->rl_currdep) + break; + if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) { + whead->rl_q_flags |= RL_LOCK_GRANTED; + wakeup(whead); + } + } +} + +static void +rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry, + struct mtx *ilk) +{ + + MPASS(lock != NULL && entry != NULL && ilk != NULL); + mtx_assert(ilk, MA_OWNED); + KASSERT(entry != lock->rl_currdep, ("stuck currdep")); + + TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link); + rangelock_calc_block(lock); + mtx_unlock(ilk); + if (curthread->td_rlqe == NULL) + curthread->td_rlqe = entry; + else + rlqentry_free(entry); +} + +void +rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk) +{ + struct rl_q_entry *entry; + + MPASS(lock != NULL && cookie != NULL && ilk != NULL); + + entry = cookie; + mtx_lock(ilk); + rangelock_unlock_locked(lock, entry, ilk); +} + +void * +rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t base, + size_t len, struct mtx *ilk) +{ + struct 
rl_q_entry *entry; + + MPASS(lock != NULL && cookie != NULL && ilk != NULL); + + mtx_lock(ilk); + entry = cookie; + KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("XXX")); + KASSERT(entry->rl_q_start == base, ("XXX")); + KASSERT(entry->rl_q_end >= base + len, ("XXX")); + if (entry->rl_q_end == base + len) { + rangelock_unlock_locked(lock, cookie, ilk); + return (NULL); + } + entry->rl_q_end = base + len; + rangelock_calc_block(lock); + mtx_unlock(ilk); + return (cookie); +} + +static void * +rangelock_enqueue(struct rangelock *lock, struct rl_q_entry *entry, + struct mtx *ilk) +{ + + MPASS(lock != NULL && entry != NULL && ilk != NULL); + + mtx_lock(ilk); + TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link); + if (lock->rl_currdep == NULL) + lock->rl_currdep = entry; + rangelock_calc_block(lock); + while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) + msleep(entry, ilk, 0, "range", 0); + mtx_unlock(ilk); + return (entry); +} + +void * +rangelock_rlock(struct rangelock *lock, off_t base, size_t len, struct mtx *ilk) +{ + struct rl_q_entry *entry; + struct thread *td; + + td = curthread; + if (td->td_rlqe != NULL) { + entry = td->td_rlqe; + td->td_rlqe = NULL; + } else + entry = rlqentry_alloc(); + entry->rl_q_flags = RL_LOCK_READ; + entry->rl_q_start = base; + entry->rl_q_end = base + len; + return (rangelock_enqueue(lock, entry, ilk)); +} + +void * +rangelock_wlock(struct rangelock *lock, off_t base, size_t len, struct mtx *ilk) +{ + struct rl_q_entry *entry; + struct thread *td; + + td = curthread; + if (td->td_rlqe != NULL) { + entry = td->td_rlqe; + td->td_rlqe = NULL; + } else + entry = rlqentry_alloc(); + entry->rl_q_flags = RL_LOCK_WRITE; + entry->rl_q_start = base; + entry->rl_q_end = base + len; + return (rangelock_enqueue(lock, entry, ilk)); +} diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 7161a99..ba1869b 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -210,6 +211,7 @@ thread_init(void *mem, int size, int flags) td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); + td->td_rlqe = rlqentry_alloc(); EVENTHANDLER_INVOKE(thread_init, td); td->td_sched = (struct td_sched *)&td[1]; umtx_thread_init(td); @@ -227,6 +229,8 @@ thread_fini(void *mem, int size) td = (struct thread *)mem; EVENTHANDLER_INVOKE(thread_fini, td); + KASSERT(td->td_rlqe != NULL, ("Leaked td_rlqe")); + rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index ba331b1..de1eca9 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -71,8 +71,8 @@ static int write_behind = 1; SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); -static int read_max = 32; -SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0, +int vfs_read_max = 32; +SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &vfs_read_max, 0, "Cluster read-ahead max block count"); /* Page expended to mark partially backed buffers */ @@ -109,7 +109,7 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) */ racluster = vp->v_mount->mnt_iosize_max / size; maxra = seqcount; - maxra = min(read_max, maxra); + maxra = min(vfs_read_max, maxra); maxra = min(nbuf/8, maxra); if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) maxra = (filesize / size) - lblkno; @@ 
-803,7 +803,9 @@ cluster_wbuild(vp, size, start_lbn, len) (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || - ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { + ((bp = (vp->v_vflag & VV_MD) ? + trypbuf(&cluster_pbuf_freecnt) : + getpbuf(&cluster_pbuf_freecnt)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index 195e735..bf038cf 100644 --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -78,6 +78,8 @@ static int dirent_exists(struct vnode *vp, const char *dirname, #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4) +static int vop_stdextend(struct vop_extend_args *ap); + /* * This vnode table stores what we want to do if the filesystem doesn't * implement a particular VOP. @@ -121,6 +123,7 @@ struct vop_vector default_vnodeops = { .vop_unlock = vop_stdunlock, .vop_vptocnp = vop_stdvptocnp, .vop_vptofh = vop_stdvptofh, + .vop_extend = vop_stdextend, }; /* @@ -855,6 +858,23 @@ out: return (error); } +static int +vop_stdextend(struct vop_extend_args *ap) +{ + struct vattr vattr, oattr; + int error; + + + error = VOP_GETATTR(ap->a_vp, &oattr, ap->a_cred); + if (error != 0) + return (error); + if (oattr.va_size >= ap->a_size) + return (0); + VATTR_NULL(&vattr); + vattr.va_size = ap->a_size; + return (VOP_SETATTR(ap->a_vp, &vattr, ap->a_cred)); +} + /* * vfs default ops * used to fill the vfs function table to get reasonable default return values. diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index fc413a2..00dffe8 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -878,6 +878,7 @@ vdestroy(struct vnode *vp) /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif + rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); mtx_destroy(BO_MTX(bo)); @@ -1032,6 +1033,7 @@ alloc: if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } + rangelock_init(&vp->v_rl); *vpp = vp; return (0); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 42abf6e..855e3f8 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -37,12 +37,14 @@ #include __FBSDID("$FreeBSD$"); +#include #include #include #include #include #include #include +#include #include #include #include @@ -63,6 +65,13 @@ __FBSDID("$FreeBSD$"); #include +#include +#include + +static int vmio_enabled = 1; +SYSCTL_INT(_vfs, OID_AUTO, vmio_enabled, CTLFLAG_RW, &vmio_enabled, 0, + "Use vm pages copyin/out instead of vops for read/write"); + static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_truncate_t vn_truncate; @@ -84,6 +93,9 @@ struct fileops vnops = { .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; +static int vn_write_chunk(struct vnode *, struct uio *, struct ucred *, + struct ucred *, int); + int vn_open(ndp, flagp, cmode, fp) struct nameidata *ndp; @@ -280,17 +292,14 @@ vn_writechk(vp) * Vnode close call */ int -vn_close(vp, flags, file_cred, td) - register struct vnode *vp; - int flags; - struct ucred *file_cred; - struct thread *td; +vn_close(struct vnode *vp, int flags, struct ucred *file_cred, + struct thread *td) { - struct mount *mp; + struct mount *mp, *mp1; int error, lock_flags; - if (!(flags & FWRITE) && vp->v_mount != NULL && - vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED) + if (!(flags & FWRITE) && (mp1 = vp->v_mount) != NULL && + MNT_SHARED_WRITES(mp1)) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; @@ -338,7 +347,7 @@ 
sequential_heuristic(struct uio *uio, struct file *fp) * closely related to the best I/O size for real disks than * to any block size used by software. */ - fp->f_seqcount += howmany(uio->uio_resid, 16384); + fp->f_seqcount += howmany(uio->uio_resid, FRA_BLOCK_SZ); if (fp->f_seqcount > IO_SEQMAX) fp->f_seqcount = IO_SEQMAX; return (fp->f_seqcount << IO_SEQSHIFT); @@ -356,76 +365,71 @@ sequential_heuristic(struct uio *uio, struct file *fp) * Package up an I/O request on a vnode into a uio and do it. */ int -vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, - aresid, td) - enum uio_rw rw; - struct vnode *vp; - void *base; - int len; - off_t offset; - enum uio_seg segflg; - int ioflg; - struct ucred *active_cred; - struct ucred *file_cred; - int *aresid; - struct thread *td; +vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, + enum uio_seg segflg, int ioflg, struct ucred *active_cred, + struct ucred *file_cred, int *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; + void *rl_cookie; int error, lock_flags; VFS_ASSERT_GIANT(vp->v_mount); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + error = 0; + if ((ioflg & IO_NODELOCKED) == 0) { + if (rw == UIO_READ) + rl_cookie = vn_rangelock_rlock(vp, offset, len); + else + rl_cookie = vn_rangelock_wlock(vp, offset, len); mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - return (error); + goto out; if (MNT_SHARED_WRITES(mp) || - ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) { + ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) lock_flags = LK_SHARED; - } else { + else lock_flags = LK_EXCLUSIVE; - } - vn_lock(vp, lock_flags | LK_RETRY); } else - vn_lock(vp, LK_SHARED | LK_RETRY); + lock_flags = LK_SHARED; + vn_lock(vp, lock_flags | LK_RETRY); + } else + rl_cookie = NULL; - } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = base; - aiov.iov_len = len; - auio.uio_resid = len; - auio.uio_offset = offset; - auio.uio_segflg = segflg; - auio.uio_rw = rw; - auio.uio_td = td; - error = 0; #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { - if (rw == UIO_READ) - error = mac_vnode_check_read(active_cred, file_cred, - vp); - else + if (rw == UIO_WRITE) error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { - if (file_cred) + if (file_cred != NULL) cred = file_cred; else cred = active_cred; if (rw == UIO_READ) - error = VOP_READ(vp, &auio, ioflg, cred); + error = vn_read_chunk(vp, &auio, active_cred, cred, + ioflg | IO_NODELOCKED); else - error = VOP_WRITE(vp, &auio, ioflg, cred); + error = vn_write_chunk(vp, &auio, active_cred, cred, + ioflg | IO_NODELOCKED); } if (aresid) *aresid = auio.uio_resid; @@ -433,10 +437,13 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { - if (rw == UIO_WRITE && vp->v_type != VCHR) - vn_finished_write(mp); VOP_UNLOCK(vp, 0); + if (mp != NULL) + vn_finished_write(mp); } + out: + if (rl_cookie != NULL) + vn_rangelock_unlock(vp, rl_cookie); return (error); } @@ -498,68 +505,148 @@ vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred, return (error); } +static struct mtx * 
+vn_lock_foffset(struct file *fp) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + while (fp->f_vnread_flags & FOFFSET_LOCKED) { + fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vnread offlock", 0); + } + fp->f_vnread_flags |= FOFFSET_LOCKED; + mtx_unlock(mtxp); + return (mtxp); +} + +static void +vn_unlock_foffset(struct file *fp, struct mtx *mtxp) +{ + + mtx_lock(mtxp); + if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) + wakeup(&fp->f_vnread_flags); + fp->f_vnread_flags = 0; + mtx_unlock(mtxp); +} + +int +vn_read_chunk(struct vnode *vp, struct uio *uio, struct ucred *active_cred, + struct ucred *fcred, int ioflag) +{ + int error, vfslocked; + + error = 0; + vfslocked = 0; /* gcc */ + + if ((ioflag & IO_NODELOCKED) == 0) { + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vn_lock(vp, LK_SHARED | LK_RETRY); + } + +#ifdef MAC + if ((ioflag & IO_NOMACCHECK) == 0) + error = mac_vnode_check_read(active_cred, fcred, vp); +#endif + if (error == 0) { + if (!vmio_enabled || + (error = vnode_pager_read(vp, uio, ioflag)) == EOPNOTSUPP) + error = VOP_READ(vp, uio, ioflag, fcred); + } + if ((ioflag & IO_NODELOCKED) == 0) { + VOP_UNLOCK(vp, 0); + VFS_UNLOCK_GIANT(vfslocked); + } + return (error); +} + /* * File table vnode read routine. */ static int -vn_read(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - int flags; - struct thread *td; +vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; - int error, ioflag; struct mtx *mtxp; - int vfslocked; + void *rl_cookie; + int ioflag; + int error; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); - mtxp = NULL; - vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vp = fp->f_vnode; + /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. 
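 * The f_offset locking is factored out into vn_lock_foffset() and * vn_unlock_foffset() above so that vn_write() can serialize its f_offset * update in the same way without holding the vnode lock.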
*/ if ((flags & FOF_OFFSET) == 0) { - mtxp = mtx_pool_find(mtxpool_sleep, fp); - mtx_lock(mtxp); - while(fp->f_vnread_flags & FOFFSET_LOCKED) { - fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags, mtxp, PUSER -1, - "vnread offlock", 0); - } - fp->f_vnread_flags |= FOFFSET_LOCKED; - mtx_unlock(mtxp); - vn_lock(vp, LK_SHARED | LK_RETRY); + mtxp = vn_lock_foffset(fp); uio->uio_offset = fp->f_offset; } else - vn_lock(vp, LK_SHARED | LK_RETRY); - + mtxp = NULL; /* gcc */ + if (vp->v_type == VREG) + rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, + uio->uio_resid); + else + rl_cookie = NULL; ioflag |= sequential_heuristic(uio, fp); + error = vn_read_chunk(vp, uio, active_cred, fp->f_cred, ioflag); + fp->f_nextoff = uio->uio_offset; + if (rl_cookie != NULL) + vn_rangelock_unlock(vp, rl_cookie); + if ((flags & FOF_OFFSET) == 0) { + fp->f_offset = uio->uio_offset; + vn_unlock_foffset(fp, mtxp); + } + return (error); +} +static int +vn_write_chunk(struct vnode *vp, struct uio *uio, struct ucred *active_cred, + struct ucred *fcred, int ioflag) +{ + struct mount *mp, *mp1; + int error, lock_flags, vfslocked; + + mp = NULL; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + if (vp->v_type == VREG) + bwillwrite(); + if (vp->v_type != VCHR && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto unlock; + + if (MNT_SHARED_WRITES(mp) || + (mp == NULL && (mp1 = vp->v_mount) != NULL && + MNT_SHARED_WRITES(mp1))) + lock_flags = LK_SHARED; + else + lock_flags = LK_EXCLUSIVE; + vn_lock(vp, lock_flags | LK_RETRY); #ifdef MAC - error = mac_vnode_check_read(active_cred, fp->f_cred, vp); - if (error == 0) + error = mac_vnode_check_write(active_cred, fcred, vp); +#else + error = 0; #endif - error = VOP_READ(vp, uio, ioflag, fp->f_cred); - if ((flags & FOF_OFFSET) == 0) { - fp->f_offset = uio->uio_offset; - mtx_lock(mtxp); - if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) - wakeup(&fp->f_vnread_flags); - fp->f_vnread_flags = 0; - mtx_unlock(mtxp); + if (error == 0) { + if (!vmio_enabled || + (error = vnode_pager_write(vp, uio, ioflag)) == EOPNOTSUPP) + error = VOP_WRITE(vp, uio, ioflag, fcred); } - fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); + if (vp->v_type != VCHR) + vn_finished_write(mp); +unlock: VFS_UNLOCK_GIANT(vfslocked); return (error); } @@ -568,24 +655,17 @@ vn_read(fp, uio, active_cred, flags, td) * File table vnode write routine. 
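 * The vnode lock, vn_start_write() and the MAC write check are now handled * in vn_write_chunk(); this wrapper computes ioflag and manages f_offset * and the byte-range lock.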
*/ static int -vn_write(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - int flags; - struct thread *td; +vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; - struct mount *mp; - int error, ioflag, lock_flags; - int vfslocked; + struct mtx *mtxp; + void *rl_cookie; + int error, ioflag; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); vp = fp->f_vnode; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); - if (vp->v_type == VREG) - bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; @@ -596,36 +676,32 @@ vn_write(fp, uio, active_cred, flags, td) if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; - mp = NULL; - if (vp->v_type != VCHR && - (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - goto unlock; - - if ((MNT_SHARED_WRITES(mp) || - ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) && - (flags & FOF_OFFSET) != 0) { - lock_flags = LK_SHARED; - } else { - lock_flags = LK_EXCLUSIVE; - } - - vn_lock(vp, lock_flags | LK_RETRY); - if ((flags & FOF_OFFSET) == 0) + if ((flags & FOF_OFFSET) == 0) { + mtxp = vn_lock_foffset(fp); uio->uio_offset = fp->f_offset; + } else + mtxp = NULL; /* gcc */ ioflag |= sequential_heuristic(uio, fp); -#ifdef MAC - error = mac_vnode_check_write(active_cred, fp->f_cred, vp); - if (error == 0) -#endif - error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); - if ((flags & FOF_OFFSET) == 0) + if (vp->v_type == VREG) { + if ((ioflag & IO_APPEND) || !(flags & FOF_OFFSET)) + /* + * For appenders, punt and lock the whole + * range. It also protects f_offset. + */ + rl_cookie = vn_rangelock_wlock(vp, 0, (size_t)-1); + else + rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, + uio->uio_resid); + } else + rl_cookie = NULL; + error = vn_write_chunk(vp, uio, active_cred, fp->f_cred, ioflag); + if (rl_cookie != NULL) + vn_rangelock_unlock(vp, rl_cookie); + if ((flags & FOF_OFFSET) == 0) { fp->f_offset = uio->uio_offset; + vn_unlock_foffset(fp, mtxp); + } fp->f_nextoff = uio->uio_offset; - VOP_UNLOCK(vp, 0); - if (vp->v_type != VCHR) - vn_finished_write(mp); -unlock: - VFS_UNLOCK_GIANT(vfslocked); return (error); } @@ -633,25 +709,29 @@ unlock: * File table truncate routine. */ static int -vn_truncate(fp, length, active_cred, td) - struct file *fp; - off_t length; - struct ucred *active_cred; - struct thread *td; +vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) { struct vattr vattr; struct mount *mp; struct vnode *vp; + void *rl_cookie; int vfslocked; int error; vp = fp->f_vnode; + + /* + * Lock the range where the shortening takes place. Increasing the + * file size does not need the range lock, but it is faster to lock + * the range than to call VOP_GETATTR to get the current size and + * deal with races.
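+ * The range lock is acquired before the vnode lock and released only + * after the vnode lock has been dropped, i.e. it ranks above the + * vnode lock.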
+ */ + rl_cookie = vn_rangelock_wlock(vp, length, -1); vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); - if (error) { - VFS_UNLOCK_GIANT(vfslocked); - return (error); - } + if (error) + goto out1; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) { error = EISDIR; @@ -671,7 +751,9 @@ vn_truncate(fp, length, active_cred, td) out: VOP_UNLOCK(vp, 0); vn_finished_write(mp); +out1: VFS_UNLOCK_GIANT(vfslocked); + vn_rangelock_unlock(vp, rl_cookie); return (error); } diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index 304e009..47657d7 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -618,3 +618,12 @@ vop_vptocnp { INOUT char *buf; INOUT int *buflen; }; + +%% extend vp L L L + +vop_extend { + IN struct vnode *vp; + IN struct ucred *cred; + IN u_quad_t size; + IN int flags; +}; diff --git a/sys/sys/buf.h b/sys/sys/buf.h index f57d6ed..ffcfa80 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -258,6 +258,8 @@ extern const char *buf_wmesg; /* Default buffer lock message */ #include /* XXX for curthread */ #include +extern int vfs_read_max; + /* * Initialize a lock. */ diff --git a/sys/sys/file.h b/sys/sys/file.h index 061ce02..3e7d5c7 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -141,6 +141,8 @@ struct file { #define FOFFSET_LOCKED 0x1 #define FOFFSET_LOCK_WAITING 0x2 +#define FRA_BLOCK_SZ 16384 + #endif /* _KERNEL || _WANT_FILE */ /* diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 48ef012..91ddbe1 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -210,6 +210,7 @@ struct thread { struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ + struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ @@ -386,7 +387,7 @@ do { \ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock treatment. */ -#define TDP_UNUSED80 0x00000080 /* available. */ +#define TDP_VMIO 0x00000080 /* Busied pages for vnode_pager io. */ #define TDP_NOSLEEPING 0x00000100 /* Thread is not allowed to sleep on a sq. */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h new file mode 100644 index 0000000..a328330 --- /dev/null +++ b/sys/sys/rangelock.h @@ -0,0 +1,66 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_RANGELOCK_H +#define _SYS_RANGELOCK_H + +#include + +#ifdef _KERNEL + +#define RL_LOCK_READ 0x0001 +#define RL_LOCK_WRITE 0x0002 +#define RL_LOCK_TYPE_MASK 0x0003 +#define RL_LOCK_GRANTED 0x0004 + +struct vnode; +struct rl_q_entry; +struct mtx; + +struct rangelock { + TAILQ_HEAD(, rl_q_entry) rl_waiters; + struct rl_q_entry *rl_currdep; +}; + +void rangelock_init(struct rangelock *lock); +void rangelock_destroy(struct rangelock *lock); +void rangelock_unlock(struct rangelock *lock, void *cookie, + struct mtx *ilk); +void *rangelock_unlock_range(struct rangelock *lock, void *cookie, + off_t base, size_t len, struct mtx *ilk); +void *rangelock_rlock(struct rangelock *lock, off_t base, size_t len, + struct mtx *ilk); +void *rangelock_wlock(struct rangelock *lock, off_t base, size_t len, + struct mtx *ilk); + +struct rl_q_entry *rlqentry_alloc(void); +void rlqentry_free(struct rl_q_entry *rlqe); + +#endif /* _KERNEL */ + +#endif /* _SYS_RANGELOCK_H */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 86ff8b6..78e579e 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -168,7 +169,8 @@ struct vnode { */ struct vpollinfo *v_pollinfo; /* G Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ - struct lockf *v_lockf; /* Byte-level lock list */ + struct lockf *v_lockf; /* Byte-level adv lock list */ + struct rangelock v_rl; /* Byte-range lock */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ @@ -653,6 +655,8 @@ int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); +int vn_read_chunk(struct vnode *vp, struct uio *uio, + struct ucred *active_cred, struct ucred *f_cred, int ioflag); int vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio, const struct thread *td); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, @@ -670,6 +674,14 @@ int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); +#define vn_rangelock_unlock(vp, cookie) \ + rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) +#define vn_rangelock_unlock_range(vp, cookie, base, len) \ + rangelock_unlock_range(&(vp)->v_rl, (cookie), (base), (len), VI_MTX(vp)) +#define vn_rangelock_rlock(vp, base, len) \ + rangelock_rlock(&(vp)->v_rl, (base), (len), VI_MTX(vp)) +#define vn_rangelock_wlock(vp, base, len) \ + rangelock_wlock(&(vp)->v_rl, (base), (len), VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); diff --git 
a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 6d5f27c..5dcceee 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -641,7 +641,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, if (lastlbn < NDADDR && lastlbn < lbn) { nb = lastlbn; osize = blksize(fs, ip, nb); - if (osize < fs->fs_bsize && osize > 0) { + if (osize < fs->fs_bsize && osize > 0 && dp->di_db[nb] != 0) { UFS_LOCK(ump); error = ffs_realloccg(ip, nb, dp->di_db[nb], ffs_blkpref_ufs2(ip, lastlbn, (int)nb, @@ -708,9 +708,17 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, nsize, osize, bp); } } else { - if (ip->i_size < smalllblktosize(fs, lbn + 1)) + if (ip->i_size < smalllblktosize(fs, lbn)) nsize = fragroundup(fs, size); - else + else if (ip->i_size < smalllblktosize(fs, lbn + 1)) { + /* + * Allocate entire tail of the file. + * Write may cover subpart of the extended + * area. + */ + nsize = fragroundup(fs, max(size, + blkoff(fs, ip->i_size))); + } else nsize = fs->fs_bsize; UFS_LOCK(ump); error = ffs_alloc(ip, lbn, diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index eb14e73..ec0789a 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -4378,7 +4378,7 @@ softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ - long oldsize; /* size of new block */ + long oldsize; /* size of old block */ struct buf *bp; /* bp for allocated block */ { struct allocdirect *adp, *oldadp; @@ -4506,8 +4506,8 @@ allocdirect_merge(adphead, newadp, oldadp) if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || newadp->ad_offset >= NDADDR) - panic("%s %jd != new %jd || old size %ld != new %ld", - "allocdirect_merge: old blkno", + panic("allocdirect_merge: old blkno" + " %jd != new %jd || old size %ld != new %ld", (intmax_t)newadp->ad_oldblkno, (intmax_t)oldadp->ad_newblkno, newadp->ad_oldsize, oldadp->ad_newsize); diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 15c3f9f..9574839 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -116,7 +116,7 @@ static vop_listextattr_t ffs_listextattr; static vop_openextattr_t ffs_openextattr; static vop_setextattr_t ffs_setextattr; static vop_vptofh_t ffs_vptofh; - +static vop_extend_t ffs_extend; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops1 = { @@ -128,6 +128,7 @@ struct vop_vector ffs_vnodeops1 = { .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_vptofh = ffs_vptofh, + .vop_extend = ffs_extend, }; struct vop_vector ffs_fifoops1 = { @@ -153,6 +154,7 @@ struct vop_vector ffs_vnodeops2 = { .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, + .vop_extend = ffs_extend, }; struct vop_vector ffs_fifoops2 = { @@ -170,6 +172,18 @@ struct vop_vector ffs_fifoops2 = { .vop_vptofh = ffs_vptofh, }; +static void +ffs_drop_suid(struct inode *ip, struct ucred *cred) +{ + + if (ip->i_mode & (ISUID | ISGID)) { + if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) { + ip->i_mode &= ~(ISUID | ISGID); + DIP_SET(ip, i_mode, ip->i_mode); + } + } +} + /* * Synch an open file. */ @@ -803,13 +817,8 @@ ffs_write(ap) * we clear the setuid and setgid bits as a precaution against * tampering. 
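 * The check is factored into ffs_drop_suid() so that ffs_extend() can * apply the same policy when it grows a file.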
*/ - if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && - ap->a_cred) { - if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) { - ip->i_mode &= ~(ISUID | ISGID); - DIP_SET(ip, i_mode, ip->i_mode); - } - } + if (resid > uio->uio_resid && ap->a_cred != NULL) + ffs_drop_suid(ip, ap->a_cred); if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, @@ -1768,3 +1777,69 @@ vop_vptofh { ufhp->ufid_gen = ip->i_gen; return (0); } + +static int +ffs_extend(struct vop_extend_args *ap) +{ + struct vnode *vp; + struct inode *ip; + struct buf *bp; + struct fs *fs; + off_t osize, xosize; + u_quad_t size; + ufs_lbn_t lastlbn; + ufs2_daddr_t nb; + int error, flags; + + vp = ap->a_vp; + ip = VTOI(vp); + size = ap->a_size; + osize = ip->i_size; + if (osize >= size) + return (0); + + vnode_pager_setsize(vp, size); + fs = ip->i_fs; + flags = ap->a_flags & IO_SYNC; + if (flags != 0) + goto slow; + + lastlbn = lblkno(fs, osize); + if (lastlbn < NDADDR) { + xosize = fragroundup(fs, blkoff(fs, osize)); + if (xosize < fs->fs_bsize && xosize > 0) { + if (ip->i_ump->um_fstype == UFS1) + nb = ip->i_din1->di_db[lastlbn]; + else + nb = ip->i_din2->di_db[lastlbn]; + /* Need to extend fragment */ + if (nb != 0) + goto slow; + } + } + ip->i_size = size; + DIP_SET(ip, i_size, size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + ffs_drop_suid(ip, ap->a_cred); + return (0); + + slow: + error = UFS_BALLOC(vp, size - 1, 1, ap->a_cred, flags|BA_CLRBUF, &bp); + if (error) { + vnode_pager_setsize(vp, osize); + return (error); + } + ip->i_size = size; + DIP_SET(ip, i_size, size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + if (flags & IO_SYNC) { + bwrite(bp); + error = ffs_update(vp, 1); + } else + bawrite(bp); + if (error == 0) + ffs_drop_suid(ip, ap->a_cred); + return (error); +} diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index c396910..af8f100 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -2150,7 +2150,8 @@ ufs_readdir(ap) uio->uio_iov->iov_len = count; # if (BYTE_ORDER == LITTLE_ENDIAN) if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) { - error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); + error = vn_read_chunk(ap->a_vp, uio, ap->a_cred, + ap->a_cred, IO_NODELOCKED); } else { struct dirent *dp, *edp; struct uio auio; @@ -2166,7 +2167,8 @@ ufs_readdir(ap) aiov.iov_len = count; dirbuf = malloc(count, M_TEMP, M_WAITOK); aiov.iov_base = dirbuf; - error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); + error = vn_read_chunk(ap->a_vp, &auio, ap->a_cred, + ap->a_cred, IO_NODELOCKED); if (error == 0) { readcnt = count - auio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; @@ -2188,7 +2190,8 @@ ufs_readdir(ap) free(dirbuf, M_TEMP); } # else - error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); + error = vn_read_chunk(ap->a_vp, uio, ap->a_cred, + ap->a_cred, IO_NODELOCKED); # endif if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h index ae8e578..95f15d6 100644 --- a/sys/vm/vm_extern.h +++ b/sys/vm/vm_extern.h @@ -87,5 +87,8 @@ struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset); void vm_imgact_unmap_page(struct sf_buf *sf); void vm_thread_dispose(struct thread *td); int vm_thread_new(struct thread *td, int pages); +int vnode_pager_read(struct vnode *vp, struct uio *uio, int ioflags); +int vnode_pager_write(struct vnode *vp, struct uio *uio, int ioflags); + #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ diff --git 
a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index fefc2e7..4c8a420 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -157,18 +157,22 @@ static void unlock_and_deallocate(struct faultstate *fs) { - vm_object_pip_wakeup(fs->object); - VM_OBJECT_UNLOCK(fs->object); - if (fs->object != fs->first_object) { - VM_OBJECT_LOCK(fs->first_object); - vm_page_lock(fs->first_m); - vm_page_free(fs->first_m); - vm_page_unlock(fs->first_m); - vm_object_pip_wakeup(fs->first_object); - VM_OBJECT_UNLOCK(fs->first_object); - fs->first_m = NULL; + if (fs->object != NULL) { + vm_object_pip_wakeup(fs->object); + VM_OBJECT_UNLOCK(fs->object); + if (fs->object != fs->first_object && + fs->first_object != NULL) { + VM_OBJECT_LOCK(fs->first_object); + vm_page_lock(fs->first_m); + vm_page_free(fs->first_m); + vm_page_unlock(fs->first_m); + vm_object_pip_wakeup(fs->first_object); + VM_OBJECT_UNLOCK(fs->first_object); + fs->first_m = NULL; + } + vm_object_deallocate(fs->first_object); + fs->object = fs->first_object = NULL; } - vm_object_deallocate(fs->first_object); unlock_map(fs); if (fs->vp != NULL) { vput(fs->vp); @@ -226,14 +230,15 @@ vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int faultcount, ahead, behind, alloc_req; struct faultstate fs; struct vnode *vp; + struct thread *td; int locked, error; hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); - fs.vp = NULL; - fs.vfslocked = 0; + memset(&fs, 0, sizeof(fs)); faultcount = behind = 0; + td = curthread; RetryFault:; @@ -248,11 +253,14 @@ RetryFault:; if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); - if (result != KERN_SUCCESS) + if (result != KERN_SUCCESS) { + unlock_and_deallocate(&fs); return (KERN_FAILURE); + } growstack = FALSE; goto RetryFault; } + unlock_and_deallocate(&fs); return (result); } @@ -384,7 +392,8 @@ RetryFault:; */ vm_page_busy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL && - fs.m->object != kernel_object && fs.m->object != kmem_object) { + fs.m->object != kernel_object && + fs.m->object != kmem_object) { goto readrest; } @@ -547,7 +556,7 @@ vnode_lock: locked = LK_SHARED; /* Do not sleep for vnode lock while fs.m is busy */ error = vget(vp, locked | LK_CANRECURSE | - LK_NOWAIT, curthread); + LK_NOWAIT, td); if (error != 0) { int vfslocked; @@ -557,7 +566,7 @@ vnode_lock: release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | - LK_CANRECURSE, curthread); + LK_CANRECURSE, td); vdrop(vp); fs.vp = vp; fs.vfslocked = vfslocked; @@ -970,9 +979,9 @@ vnode_locked: */ unlock_and_deallocate(&fs); if (hardfault) - curthread->td_ru.ru_majflt++; + td->td_ru.ru_majflt++; else - curthread->td_ru.ru_minflt++; + td->td_ru.ru_minflt++; return (KERN_SUCCESS); } diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index b830202..ee50c02 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -109,9 +109,13 @@ struct vm_object { * VNode pager * * vnp_size - current size of file + * wpos - start write position for seq write detector + * off - offset from wpos for current write */ struct { off_t vnp_size; + off_t wpos; + ssize_t off; } vnp; /* diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index c11b024..4e28f8f 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -856,6 +856,8 @@ vm_page_remove(vm_page_t m) m->oflags &= ~VPO_BUSY; vm_page_flash(m); } + if (m->flags & PG_WRITEDIRTY) + vm_writedirty_cleaned(m); /* * Now remove from the object's list of backed pages. 
@@ -1384,6 +1386,19 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) return (m); } +void +vm_wait_queue_free(const char *wmsg) +{ + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + if (!vm_pages_needed) { + vm_pages_needed = 1; + wakeup(&vm_pages_needed); + } + msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, wmsg, + 0); +} + /* * Initialize a page that has been freshly dequeued from a freelist. * The caller has to drop the vnode returned, if it is not NULL. @@ -1488,14 +1503,8 @@ vm_wait(void) vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, PDROP | PSWP, "VMWait", 0); - } else { - if (!vm_pages_needed) { - vm_pages_needed = 1; - wakeup(&vm_pages_needed); - } - msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, - "vmwait", 0); - } + } else + vm_wait_queue_free("vmwait"); } /* @@ -2007,6 +2016,9 @@ vm_page_cache(vm_page_t m) if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); + if (m->flags & PG_WRITEDIRTY) + vm_writedirty_cleaned(m); + /* * Insert the page into the object's collection of cached pages * and the physical memory allocator's cache/free page queues. diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 54a15fb..edb02da 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -217,6 +217,7 @@ extern struct vpglocks pa_lock[]; #define PG_WINATCFLS 0x0004 /* flush dirty page on inactive q */ #define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */ #define PG_WRITEABLE 0x0010 /* page is mapped writeable */ +#define PG_WRITEDIRTY 0x0020 /* dirtied by vmio write */ #define PG_ZERO 0x0040 /* page is zeroed */ #define PG_REFERENCED 0x0080 /* page has been referenced */ #define PG_UNMANAGED 0x0800 /* No PV management for page */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 8a9bfe1..14d5522 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -111,7 +111,6 @@ __FBSDID("$FreeBSD$"); /* the kernel process "vm_pageout"*/ static void vm_pageout(void); -static int vm_pageout_clean(vm_page_t); static void vm_pageout_scan(int pass); struct proc *pageproc; @@ -215,7 +214,7 @@ static void vm_req_vmdaemon(int req); #endif static void vm_pageout_page_stats(void); -static void +void vm_pageout_init_marker(vm_page_t marker, u_short queue) { @@ -316,7 +315,7 @@ vm_pageout_page_lock(vm_page_t m, vm_page_t *next) * block. Note the careful timing, however, the busy bit isn't set till * late and we cannot do anything that will mess with the page. 
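 * The function is no longer static: the vmio write daemon in * vm_readwrite.c uses it to launder PG_WRITEDIRTY pages.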
*/ -static int +int vm_pageout_clean(vm_page_t m) { vm_object_t object; @@ -388,7 +387,7 @@ more: vm_page_lock(p); vm_page_test_dirty(p); if (p->dirty == 0 || - p->queue != PQ_INACTIVE || + (p->queue != PQ_INACTIVE && p->queue != PQ_ACTIVE) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); ib = 0; @@ -416,7 +415,7 @@ more: vm_page_lock(p); vm_page_test_dirty(p); if (p->dirty == 0 || - p->queue != PQ_INACTIVE || + (p->queue != PQ_INACTIVE && p->queue != PQ_ACTIVE) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); break; @@ -531,11 +530,14 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen) if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_io_finish(mt); - if (vm_page_count_severe()) { - vm_page_lock(mt); + vm_page_lock(mt); + if (mt->queue == PQ_INACTIVE && vm_page_count_severe()) vm_page_try_to_cache(mt); - vm_page_unlock(mt); - } + if ((mt->flags & PG_WRITEDIRTY) != 0 && + (pageout_status[i] == VM_PAGER_OK || + pageout_status[i] == VM_PAGER_BAD)) + vm_writedirty_cleaned(mt); + vm_page_unlock(mt); } } if (prunlen != NULL) @@ -1258,7 +1260,6 @@ unlock_and_continue: vm_pageout_oom(VM_OOM_MEM); } - void vm_pageout_oom(int shortage) { @@ -1478,12 +1479,17 @@ vm_pageout() vm_pageout_page_count = 8; /* + * Try to allow no more than 1/4 of the usable pages for write. + */ + vmio_max_writedirty = cnt.v_page_count / 4; + + /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ if (cnt.v_page_count > 1024) - cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; + cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 100; else cnt.v_free_min = 4; cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h index 53e051a..4fcf42c 100644 --- a/sys/vm/vm_pageout.h +++ b/sys/vm/vm_pageout.h @@ -77,6 +77,8 @@ extern int vm_pageout_pages_needed; extern int vm_pageout_deficit; extern int vm_pageout_page_count; +extern long vmio_max_writedirty; + /* * Swap out requests */ @@ -94,17 +96,21 @@ extern int vm_pageout_page_count; * Signal pageout-daemon and wait for it. 
*/ +#ifdef _KERNEL extern void pagedaemon_wakeup(void); #define VM_WAIT vm_wait() #define VM_WAITPFAULT vm_waitpfault() extern void vm_wait(void); extern void vm_waitpfault(void); +extern void vm_wait_queue_free(const char *); -#ifdef _KERNEL boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); int vm_pageout_flush(vm_page_t *, int, int, int, int *); -void vm_pageout_oom(int shortage); +void vm_pageout_oom(int); +int vm_pageout_clean(vm_page_t); +void vm_writedirty_cleaned(vm_page_t); boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); void vm_contig_grow_cache(int, vm_paddr_t, vm_paddr_t); +void vm_pageout_init_marker(vm_page_t marker, u_short queue); #endif #endif /* _VM_VM_PAGEOUT_H_ */ diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index 16b6747..4236721 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -883,4 +883,24 @@ DB_SHOW_COMMAND(freepages, db_show_freepages) db_printf("\n"); } } + +DB_SHOW_COMMAND(vpo_dw, vpo_dw) +{ + struct vm_phys_seg *seg; + vm_page_t m; + int segind; + long npages, i; + + for (segind = 0; segind < vm_phys_nsegs; segind++) { + seg = &vm_phys_segs[segind]; + npages = seg->end - seg->start; + npages /= PAGE_SIZE; + m = seg->first_page; + for (i = 0; i < npages; i++, m++) { + if (m->flags & PG_WRITEDIRTY) + printf("%p\n", m); + } + } +} + #endif diff --git a/sys/vm/vm_readwrite.c b/sys/vm/vm_readwrite.c new file mode 100644 index 0000000..6fc5a11 --- /dev/null +++ b/sys/vm/vm_readwrite.c @@ -0,0 +1,1109 @@ +/*- + * Copyright (c) 2008 Jeffrey Roberson + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_vm.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * XXXKIB TODO + * + * 2. VOP_REALLOCBLKS. + * 3. Unset setuid/setgid bits after write. + * 4. Filesystem full handling. 
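+ * vnode_pager_read() and vnode_pager_write() below move data directly + * between the uio and the pages of the vnode's VM object; callers fall + * back to VOP_READ()/VOP_WRITE() when EOPNOTSUPP is returned.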
+ * + */ + +static SYSCTL_NODE(_vfs, OID_AUTO, vmio, CTLFLAG_RW, 0, "VFS VMIO leaf"); + +static int vmio_run = 0; +SYSCTL_INT(_vfs_vmio, OID_AUTO, run, CTLFLAG_RW, &vmio_run, 0, + "Calculate the max sequential run for vnode_pager_read_cluster"); +static int vmio_clrbuf = 1; +SYSCTL_INT(_vfs_vmio, OID_AUTO, clrbuf, CTLFLAG_RW, &vmio_clrbuf, 0, + ""); /* Intentionally undocumented */ +static int vmio_read_pack = 16; +SYSCTL_INT(_vfs_vmio, OID_AUTO, read_pack, CTLFLAG_RW, &vmio_read_pack, 0, + "Length of the page pack for read"); +static int vmio_write_pack = 16; +SYSCTL_INT(_vfs_vmio, OID_AUTO, write_pack, CTLFLAG_RW, &vmio_write_pack, + 0, + "Length of the page pack for write"); +static int vmio_rollbacks1; +SYSCTL_INT(_vfs_vmio, OID_AUTO, rollbacks1, CTLFLAG_RD, &vmio_rollbacks1, + 0, + "Count of times vnode size has to be rolled back for writes " + "while collecting pages"); +static int vmio_rollbacks2; +SYSCTL_INT(_vfs_vmio, OID_AUTO, rollbacks2, CTLFLAG_RD, &vmio_rollbacks2, + 0, + "Count of times vnode size has to be rolled back for writes " + "while reading pages"); +static int vmio_getpages_read; +SYSCTL_INT(_vfs_vmio, OID_AUTO, getpages_read, CTLFLAG_RD, + &vmio_getpages_read, 0, + "Count of times VOP_GETPAGES called for read"); +static int vmio_getpages_write; +SYSCTL_INT(_vfs_vmio, OID_AUTO, getpages_write, CTLFLAG_RD, + &vmio_getpages_write, 0, + "Count of times VOP_GETPAGES called for write"); +static int vmio_reserv_used; +SYSCTL_INT(_vfs_vmio, OID_AUTO, reserv_used, CTLFLAG_RD, + &vmio_reserv_used, 0, + "Count of times reserved page was used by vmio"); +static int vmio_alloc_wait; +SYSCTL_INT(_vfs_vmio, OID_AUTO, alloc_wait, CTLFLAG_RD, &vmio_alloc_wait, + 0, + "Count of times vmio reserved page allocation has to wait"); +static long vmio_writedirty; +SYSCTL_LONG(_vfs_vmio, OID_AUTO, writedirty, CTLFLAG_RD, &vmio_writedirty, + 0, + "Count of pages dirtied by vnode_pager_write"); +long vmio_max_writedirty; +SYSCTL_LONG(_vfs_vmio, OID_AUTO, max_writedirty, CTLFLAG_RW, + &vmio_max_writedirty, 0, + "Maximum allowed system-wide count of pages dirtied by vnode_pager_write"); +static int vmio_writed_wakeups; +SYSCTL_INT(_vfs_vmio, OID_AUTO, writed_wakeups, CTLFLAG_RD, + &vmio_writed_wakeups, 0, + "Count of times vmio write daemon was woken up"); +static int vmio_writed_inact; +SYSCTL_INT(_vfs_vmio, OID_AUTO, writed_inact, CTLFLAG_RD, + &vmio_writed_inact, 0, + "Count of times vmio write daemon cleaned inactive queue"); +static int vmio_writed_act; +SYSCTL_INT(_vfs_vmio, OID_AUTO, writed_act, CTLFLAG_RD, &vmio_writed_act, + 0, + "Count of times vmio write daemon cleaned active queue"); + +static u_int +io_page_bits(int i, vm_offset_t off, ssize_t size) +{ + int start, chunk; + + if (i == 0) { + start = off; + chunk = min(PAGE_SIZE - off, size); + } else if (i * PAGE_SIZE < off + size) { + start = 0; + chunk = PAGE_SIZE; + } else if ((i - 1) * PAGE_SIZE < off + size) { + start = 0; + chunk = (size - off) % PAGE_SIZE; + } else + return (0); + return (vm_page_bits(start, chunk)); +} + +/* + * Blocking allocator of the reserve page. Cannot be called with vnode + * or object lock held. + */ +static void +vnode_alloc_reserv(vm_page_t *reserv) +{ + + while (*reserv == NULL) { + *reserv = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | + VM_ALLOC_NOOBJ); + if (*reserv == NULL) { + atomic_add_int(&vmio_alloc_wait, 1); + VM_WAIT; + } + } +} + +/* + * Copied from vm_pageout_scan(). 
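+ * Returns FALSE when the page has left the expected queue and the caller + * must restart its scan; otherwise TRUE, with *target decremented when + * the page could be laundered via vm_pageout_clean().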
+ */ +static boolean_t +vnode_writedirty_clean_page(vm_page_t m, int queue, int *target, + vm_page_t *next) +{ + vm_object_t object; + struct mount *mp; + struct vnode *vp; + struct vm_page marker; + int vfslocked; + + vm_page_lock_assert(m, MA_OWNED); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + + if (m->queue != queue) + return (FALSE); + *next = TAILQ_NEXT(m, pageq); + + if (m->flags & PG_MARKER) + return (TRUE); + if (m->hold_count) { + vm_page_requeue(m); + return (TRUE); + } + + object = m->object; + if (!VM_OBJECT_TRYLOCK(object) && + (!vm_pageout_fallback_object_lock(m, next) || + m->hold_count != 0)) { + VM_OBJECT_UNLOCK(object); + return (TRUE); + } + if (m->busy || (m->oflags & VPO_BUSY) || !(m->flags & PG_WRITEDIRTY)) { + VM_OBJECT_UNLOCK(object); + return (TRUE); + } + + if (m->dirty != VM_PAGE_BITS_ALL && (m->flags & PG_WRITEABLE) != 0) { + if (pmap_is_modified(m)) + vm_page_dirty(m); + else if (m->dirty == 0) + pmap_remove_all(m); + } + + KASSERT(m->valid != 0, ("VPO_WRITEDIRTY and not valid %p", m)); + if (m->dirty == 0) { + vm_page_flag_clear(m, PG_WRITEDIRTY); + vmio_writedirty--; + VM_OBJECT_UNLOCK(object); + return (TRUE); + } + if (object->flags & OBJ_DEAD) { + VM_OBJECT_UNLOCK(object); + vm_page_requeue(m); + return (TRUE); + } + KASSERT(object->type == OBJT_VNODE, ("VPO_WRITEDIRTY and not vnode")); + + vm_pageout_init_marker(&marker, queue); + TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl, m, &marker, pageq); + vp = object->handle; + vfslocked = 0; + if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { + mp = NULL; + goto unlock_and_continue; + } + KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); + vm_page_unlock_queues(); + vm_page_unlock(m); + vm_object_reference_locked(object); + VM_OBJECT_UNLOCK(object); + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + if (vget(vp, LK_EXCLUSIVE | LK_TIMELOCK, curthread)) { + VM_OBJECT_LOCK(object); + vm_page_lock(m); + vm_page_lock_queues(); + vp = NULL; + goto unlock_and_continue; + } + VM_OBJECT_LOCK(object); + vm_page_lock(m); + vm_page_lock_queues(); + if (m->queue != queue || m->object != object || + TAILQ_NEXT(m, pageq) != &marker) + goto unlock_and_continue; + if (m->busy || (m->oflags & VPO_BUSY)) + goto unlock_and_continue; + if (m->hold_count) { + vm_page_requeue(m); + goto unlock_and_continue; + } + + vm_page_unlock_queues(); + if (vm_pageout_clean(m) != 0) + (*target)--; + vm_page_lock_queues(); + unlock_and_continue: + VM_OBJECT_UNLOCK(object); + if (mp != NULL) { + vm_page_unlock_queues(); + vm_page_unlock(m); + if (vp != NULL) + vput(vp); + VFS_UNLOCK_GIANT(vfslocked); + vm_object_deallocate(object); + vn_finished_write(mp); + vm_page_lock(m); + vm_page_lock_queues(); + } + *next = TAILQ_NEXT(&marker, pageq); + TAILQ_REMOVE(&vm_page_queues[queue].pl, &marker, pageq); + return (TRUE); +} + +static void +vnode_writedirty_clean_queue(int *target, int queue) +{ + vm_page_t m, next; + boolean_t res; + + vm_page_lock_queues(); + rescan0: + for (m = TAILQ_FIRST(&vm_page_queues[queue].pl); + m != NULL && *target > 0; m = next) { + if (!vm_pageout_page_lock(m, &next)) { + vm_page_unlock(m); + continue; + } + res = vnode_writedirty_clean_page(m, queue, target, &next); + vm_page_unlock(m); + if (!res) + goto rescan0; + } + vm_page_unlock_queues(); +} + +static struct cv wd_speedup; +static struct cv wd_back; + +static void +vnode_writedirty_daemon(void) +{ + int target; + + cv_init(&wd_speedup, "writed"); + cv_init(&wd_back, "vnodeww"); + + vm_page_lock_queues(); + for (;;) { + cv_wait(&wd_speedup, 
&vm_page_queue_mtx); + target = vmio_writedirty - vmio_max_writedirty; + vm_page_unlock_queues(); + atomic_add_int(&vmio_writed_wakeups, 1); + if (target > 0) { + bwillwrite(); + atomic_add_int(&vmio_writed_inact, 1); + vnode_writedirty_clean_queue(&target, PQ_INACTIVE); + } + if (target > 0) { + bwillwrite(); + atomic_add_int(&vmio_writed_act, 1); + vnode_writedirty_clean_queue(&target, PQ_ACTIVE); + } + vm_page_lock_queues(); + cv_broadcast(&wd_back); + } +} + +void +vm_writedirty_cleaned(vm_page_t m) +{ + + vm_page_lock_queues(); + vm_page_flag_clear(m, PG_WRITEDIRTY); + vmio_writedirty--; + cv_broadcast(&wd_back); + vm_page_unlock_queues(); +} + +static struct proc *writedproc; +static struct kproc_desc writed_kp = { + .arg0 = "writed", + .func = vnode_writedirty_daemon, + .global_procpp = &writedproc +}; +SYSINIT(writed, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &writed_kp); + +/* + * Attempt to put backpressure on writes. + */ +static void +vnode_pager_wwait(void) +{ + + if (vmio_writedirty >= vmio_max_writedirty) { + vm_page_lock_queues(); + while (vmio_writedirty >= vmio_max_writedirty) { + cv_signal(&wd_speedup); + cv_wait(&wd_back, &vm_page_queue_mtx); + } + vm_page_unlock_queues(); + } +} + +#define VN_GRAB_NO_VMWAIT 0x0001 + +/* + * Grab a page, waiting until we are woken up due to the page + * changing state. We keep on waiting, if the page continues + * to be in the object. If the page doesn't exist allocate it. + * + * This routine may block, either waiting for busy vnode page, or for + * a page allocation. Later may be disabled with VN_GRAB_NO_VMWAIT + * flag, when vnode lock is held. To ensure progress, reserve page is + * used for ma[0] when wait is disabled and system cannot provide a + * page. + * + * Returns updated page run length in *wp, and filled in ma page + * array. + */ +static void +vnode_grab_pages(struct vnode *vp, vm_page_t *ma, int *wp, vm_pindex_t pindex, + int flags, vm_page_t *reserv) +{ + vm_object_t object; + vm_page_t m; + vm_pindex_t pi; + int i; + + KASSERT((flags & VN_GRAB_NO_VMWAIT) || reserv == NULL, + ("vnode_grab_pages: NO_VMWAIT and no reserve")); + + object = vp->v_object; +redo: + VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); + m = NULL; + for (i = 0, pi = pindex; i < *wp; ) { + if (i > 0) + m = vm_page_next(ma[i - 1]); + if (m == NULL) + m = vm_page_lookup(object, pi); + if (m != NULL) { + if (vm_page_sleep_if_busy(m, TRUE, "pgrnbwt")) + goto redo; + } else { + m = vm_page_alloc(object, pi, VM_ALLOC_NORMAL | + VM_ALLOC_NOBUSY); + } + if (m != NULL) { + ma[i] = m; + i++; + pi++; + continue; + } + if (flags & VN_GRAB_NO_VMWAIT) { + if (i == 0) { + m = *reserv; + *reserv = NULL; + atomic_add_int(&vmio_reserv_used, 1); + m->flags &= ~PG_UNMANAGED; + if (object->memattr != VM_MEMATTR_DEFAULT) + pmap_page_set_memattr(m, + object->memattr); + vm_page_insert(m, object, pindex); + ma[i] = m; + i++; + } + break; + } + VM_OBJECT_UNLOCK(object); + atomic_add_int(&vmio_alloc_wait, 1); + VM_WAIT; + VM_OBJECT_LOCK(object); + goto redo; + } + *wp = i; +} + +/* + * Read a cluster starting at 'ma'. Note that we need to always redo + * page grab because our caller dropped object lock while not holding + * vnode lock. 
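+ * The run length actually obtained is returned in *maxrun and may be
+ * shorter than requested, e.g. when a page in the middle of the run
+ * turns out to be already valid or the run would extend past the end
+ * of the object.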
+ */ +static int +vnode_pager_read_cluster(struct vnode *vp, vm_page_t ma[], vm_pindex_t idx, + int *maxrun, int flags, vm_page_t *reserv) +{ + vm_object_t obj; + vm_page_t m; + daddr_t blkno; + int bsize; + int error; + int run; + int i; + + obj = vp->v_object; + bsize = vp->v_mount->mnt_stat.f_iosize; + error = 0; + blkno = 0; + + if (vmio_run) { + VM_OBJECT_UNLOCK(obj); + error = VOP_BMAP(vp, IDX_TO_OFF(idx)/bsize, NULL, &blkno, &run, + NULL); + VM_OBJECT_LOCK(obj); + run = MIN(run, *maxrun); + if (error || run == 0 || blkno == -1) { +/* printf("vnode_pager_read_cluster short\n"); */ + *maxrun = 1; + vnode_grab_pages(vp, ma, maxrun, idx, + VN_GRAB_NO_VMWAIT, reserv); + error = vm_pager_get_pages(obj, ma, 1, 0); + if (error != VM_PAGER_OK) + return (EIO); + return (0); + } + run = (run + 1) * bsize / PAGE_SIZE; + run = MIN(run, vp->v_mount->mnt_iosize_max / PAGE_SIZE); + } else { + if (*maxrun == 0) + *maxrun = 1; + run = MIN(*maxrun, vp->v_mount->mnt_iosize_max / PAGE_SIZE); + } + if (IDX_TO_OFF(idx) + run * PAGE_SIZE > obj->un_pager.vnp.vnp_size) { + run = (obj->un_pager.vnp.vnp_size - IDX_TO_OFF(idx)) / + PAGE_SIZE; + } + if (run == 0) + run = 1; + vnode_grab_pages(vp, ma, &run, idx, VN_GRAB_NO_VMWAIT, reserv); + for (i = 0; i < run; i++) { + if (i > 0 && ma[i]->valid != 0) { + run = i; + break; + } + vm_page_busy(ma[i]); + } + +/* printf("vnode_pager_read_cluster %d %p %p\n", run, ma, ma[0]); */ + error = vm_pager_get_pages(obj, ma, run, 0); + if (error != VM_PAGER_OK) { + for (i = 0; i < run; i++) { + vm_page_lock(ma[i]); + vm_page_free(ma[i]); + vm_page_unlock(ma[i]); + } + return (EIO); + } + KASSERT(ma[0]->valid == VM_PAGE_BITS_ALL, + ("ma[0]->valid %x", ma[0]->valid)); + vm_page_wakeup(ma[0]); + /* ma[0] cannot be cached */ + for (i = 1; i < run; i++) { + m = vm_page_next(ma[i - 1]); + if (m == NULL || ma[i] != m || m->valid == 0) + break; +/* printf("run %d ma[%d]: obj %p %p pindex %jd p+i %jd valid %x\n", + run, i, obj, ma[i]->object, ma[i]->pindex, ma[0]->pindex + i, ma[i]->valid); */ + } + *maxrun = i; + return (0); +} + +int +vnode_pager_read(struct vnode *vp, struct uio *uio, int ioflags) +{ + vm_object_t obj; + vm_offset_t off; + vm_pindex_t idx; + vm_page_t reserv; + ssize_t size; + int error, seqcount, wpmax, wp, i; + u_int bits; + struct thread *td; + + if (ioflags & (IO_EXT|IO_DIRECT)) + return (EOPNOTSUPP); + + ASSERT_VOP_LOCKED(vp, "vnode_pager_read"); + if (vp->v_iflag & VI_DOOMED) + return (EBADF); + + /* + * Ignore non-regular files. + */ + if (vp->v_type != VREG) + return (EOPNOTSUPP); + obj = vp->v_object; + if (obj == NULL) + return (EOPNOTSUPP); + + seqcount = (ioflags >> IO_SEQSHIFT) * FRA_BLOCK_SZ / PAGE_SIZE; + seqcount = min(vfs_read_max, seqcount); + seqcount = min(vp->v_mount->mnt_iosize_max / PAGE_SIZE, seqcount); + VOP_UNLOCK(vp, 0); + + wpmax = atomic_load_acq_int(&vmio_read_pack); + vm_page_t ma[wpmax + 1]; + + while (vm_page_count_severe()) { + atomic_add_int(&vm_pageout_deficit, MIN(wpmax + 1, + (uio->uio_resid + PAGE_SIZE - 1) >> PAGE_SHIFT)); + VM_WAIT; + } + + error = 0; + reserv = NULL; + td = uio->uio_td; + /* XXXKIB This should be disallowed. 
*/ + if (td == NULL) + td = curthread; + + VM_OBJECT_LOCK(obj); + while (uio->uio_resid > 0) { + wp = wpmax; + + size = obj->un_pager.vnp.vnp_size - uio->uio_offset; + if (size <= 0) + break; + idx = OFF_TO_IDX(uio->uio_offset); + off = uio->uio_offset - IDX_TO_OFF(idx); + size = MIN(MIN(PAGE_SIZE * wp - off, uio->uio_resid), size); + + wp = (size + off + PAGE_SIZE - 1) / PAGE_SIZE; + vnode_grab_pages(vp, ma, &wp, idx, 0, NULL); + find_valid: + for (i = 0; i < wp; i++) { + bits = io_page_bits(i, off, size); + + /* + * Only do read if first page of array is not + * valid for us. We have to drop object lock + * to obtain vnode lock, that allows the pages + * to change identity or validity bits, and we + * can guarantee allocation of only one + * (reserved) page. + */ + if ((ma[i]->valid & bits) != bits) { + if (i != 0) { + wp = i; + break; + } + VM_OBJECT_UNLOCK(obj); + vnode_alloc_reserv(&reserv); + error = vn_lock(vp, LK_SHARED); + VM_OBJECT_LOCK(obj); + if (error != 0) { + error = EBADF; + break; + } + + /* + * Read page, honouring read-ahead settings + * for filedescriptor. + */ + atomic_add_int(&vmio_getpages_read, 1); + error = vnode_pager_read_cluster(vp, ma, idx, + &wp, VN_GRAB_NO_VMWAIT, &reserv); + VOP_UNLOCK(vp, 0); + if (error != 0) + break; + /* + * No need to redo size calculation. + * Despite both vnode and object locks + * were dropped, range lock and file + * descriptor reference shall keep + * file from truncation. + */ + goto find_valid; + } + } + if (error != 0) + break; + KASSERT(wp > 0, ("wp == 0")); +/* printf("vp %p wp %d size %d\n", vp, wp, size); */ + + /* + * Prevent object deallocation and pages swap-out. + */ + vm_object_pip_add(obj, 1); + for (i = 0; i < wp; i++) { + vm_page_lock(ma[i]); + vm_page_hold(ma[i]); + vm_page_unlock(ma[i]); + } + VM_OBJECT_UNLOCK(obj); + + /* + * Recalculate i/o size, since vnode_grab_pages() + * might shortened the page run. + */ + size = MIN(MIN(PAGE_SIZE * wp - off, uio->uio_resid), size); + + /* + * Access user map pages, vnode lock is dropped. + * Possible page fault is safe at this point. Vnode + * rangelock is held, protecting from parallel + * writers. 
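+		 * The pages themselves are held above, so they cannot be
+		 * freed while the copy to userspace is in progress.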
+ */ +/* printf("size %d %d %ju\n", size, uio->uio_resid, (uintmax_t)off); */ + KASSERT((td->td_pflags & TDP_VMIO) == 0, + ("Recursed vnode_pager_read")); + td->td_pflags |= TDP_VMIO; + error = uiomove_fromphys(ma, off, size, uio); + td->td_pflags &= ~TDP_VMIO; + + VM_OBJECT_LOCK(obj); + for (i = 0; i < wp; i++) { + vm_page_lock(ma[i]); + vm_page_unhold(ma[i]); + vm_page_activate(ma[i]); + vm_page_unlock(ma[i]); + } + vm_object_pip_wakeup(obj); + if (error != 0) + break; + } + VM_OBJECT_UNLOCK(obj); + if (reserv != NULL) + vm_page_free(reserv); + vn_lock(vp, LK_SHARED | LK_RETRY); + if (error == 0) + vfs_mark_atime(vp, td->td_ucred); + + return (error); +} + +int +vnode_pager_write(struct vnode *vp, struct uio *uio, int ioflags) +{ + vm_object_t obj; + vm_offset_t off; + vm_pindex_t idx, clean_start, clean_end; + vm_page_t reserv; + struct vattr vattr; + ssize_t size, size1, osize, osize1, resid, sresid, written; + int error, vn_locked, wpmax, wp, i, pflags; + u_int bits; + boolean_t vnode_locked, freed, freed1, first_extend; + struct thread *td; + + if (ioflags & (IO_EXT|IO_INVAL|IO_DIRECT)) + return (EOPNOTSUPP); + ASSERT_VOP_LOCKED(vp, "vnode_pager_write"); + if (vp->v_iflag & VI_DOOMED) + return (EBADF); + if (vp->v_type != VREG) + return (EOPNOTSUPP); + obj = vp->v_object; + if (obj == NULL) + return (EOPNOTSUPP); + vn_locked = VOP_ISLOCKED(vp); + vnode_locked = TRUE; + error = 0; + first_extend = TRUE; + + /* + * Reversed logic from vnode_generic_putpages(). + */ + if (ioflags & IO_SYNC) + pflags = VM_PAGER_PUT_SYNC; + else if (ioflags & IO_ASYNC) + pflags = 0; + else + pflags = VM_PAGER_CLUSTER_OK; + + wpmax = atomic_load_acq_int(&vmio_write_pack); + vm_page_t ma[wpmax + 1]; + + /* + * Try to ensure that enough pages is available in advance. + */ + while (vm_page_count_severe()) { + if (vnode_locked) { + VOP_UNLOCK(vp, 0); + vnode_locked = FALSE; + } + atomic_add_int(&vm_pageout_deficit, MIN(wpmax + 1, + (uio->uio_resid + PAGE_SIZE - 1) >> PAGE_SHIFT)); + VM_WAIT; + } + + /* + * Allocate first reserve page. + */ + for (reserv = NULL; reserv == NULL; ) { + reserv = vm_page_alloc(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); + if (reserv == NULL) { + if (vnode_locked) { + VOP_UNLOCK(vp, 0); + vnode_locked = FALSE; + } + atomic_add_int(&vmio_alloc_wait, 1); + VM_WAIT; + } + } + if (!vnode_locked) { + /* + * Since vnode lock was dropped, we are under low free + * pages condition, so more write trottling is due. + */ + vnode_pager_wwait(); + + vn_lock(vp, vn_locked | LK_RETRY); + if (vp->v_iflag & VI_DOOMED) { + if (reserv != NULL) + vm_page_free(reserv); + return (EBADF); + } + vnode_locked = TRUE; + } + + if (ioflags & IO_APPEND) + uio->uio_offset = obj->un_pager.vnp.vnp_size; + + clean_start = OFF_TO_IDX(uio->uio_offset); + clean_end = OFF_TO_IDX(uio->uio_offset + uio->uio_resid + + PAGE_SIZE - 1); + + td = uio->uio_td; + if (td == NULL) + td = curthread; + + error = vn_rlimit_fsize(vp, uio, td); + if (error != 0) + return (error); + osize = osize1 = obj->un_pager.vnp.vnp_size; + resid = uio->uio_resid; + +io_loop: + while (uio->uio_resid > 0) { + wp = wpmax; + size = uio->uio_resid; + idx = OFF_TO_IDX(uio->uio_offset); + off = uio->uio_offset - IDX_TO_OFF(idx); + size = MIN(PAGE_SIZE * wp - off, uio->uio_resid); + if (!vnode_locked) { + error = vn_lock(vp, LK_EXCLUSIVE); + if (error != 0) { + error = EBADF; + break; + } + vnode_locked = TRUE; + } + osize1 = obj->un_pager.vnp.vnp_size; + + /* + * Extend the file if writing past end. 
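+		 * The shared vnode lock, if held, is replaced with an
+		 * exclusive one before calling VOP_EXTEND().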
+ */ + if (osize1 < uio->uio_offset + size || first_extend) { + if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { + VOP_UNLOCK(vp, 0); + vnode_locked = FALSE; + } + if (!vnode_locked) { + error = vn_lock(vp, LK_EXCLUSIVE); + if (error != 0) { + error = EBADF; + break; + } + vnode_locked = TRUE; + } + vattr.va_size = uio->uio_offset + size; + error = VOP_EXTEND(vp, td->td_ucred, uio->uio_offset + + size, ioflags); + first_extend = FALSE; + } + if (error != 0) + break; + + wp = (size + off + PAGE_SIZE - 1) / PAGE_SIZE; + VM_OBJECT_LOCK(obj); + + /* + * Use VN_GRAB_NO_VMWAIT since vnode lock is held. + */ + vnode_grab_pages(vp, ma, &wp, idx, VN_GRAB_NO_VMWAIT, &reserv); + find_valid: + for (i = 0; i < wp; i++) { + /* + * If the page falls into the newly-extended + * range, zero it and mark as valid. There is + * nothing VOP_GETPAGES can read from file. + */ + if (IDX_TO_OFF(ma[i]->pindex) >= osize1) { + if ((ma[i]->flags & PG_ZERO) == 0) + pmap_zero_page(ma[i]); + ma[i]->valid = VM_PAGE_BITS_ALL; + } + + /* + * Pages need to be fully valid, because we + * can only hold them during uiomove later. + * + * The page fault happening in other thread + * after uiomove finished but before valid + * bits are corrected below would cause lost + * of newly written data if page is not fully + * valid. + */ + if (ma[i]->valid == VM_PAGE_BITS_ALL) + continue; + if (!vmio_clrbuf) { + bits = io_page_bits(i, off, size); + if ((ma[i]->valid & ~bits) == (~bits & + VM_PAGE_BITS_ALL)) + continue; + } + if (i != 0) { + wp = i; + break; + } + if (reserv == NULL) + reserv = vm_page_alloc(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); + if (reserv == NULL) { + VM_OBJECT_UNLOCK(obj); + + /* + * Truncate the file back to the + * original size to prevent mmap from + * seeing invalid pages. We are going + * to drop vnode lock. + */ + if (osize1 < uio->uio_offset + size) { + atomic_add_int(&vmio_rollbacks1, 1); + VATTR_NULL(&vattr); + vattr.va_size = osize1; + error = VOP_SETATTR(vp, &vattr, + td->td_ucred); + if (error != 0) + break; + } + KASSERT(vnode_locked, ("lost vnode lock 1")); + VOP_UNLOCK(vp, 0); + vnode_locked = FALSE; + vnode_pager_wwait(); + vnode_alloc_reserv(&reserv); + goto io_loop; + } + + atomic_add_int(&vmio_getpages_write, 1); + error = vnode_pager_read_cluster(vp, ma, idx, &wp, + VN_GRAB_NO_VMWAIT, &reserv); + if (error != 0) { + VM_OBJECT_UNLOCK(obj); + break; + } + goto find_valid; + } + /* Loop above is exited with unlocked obj if error != 0. */ + if (error != 0) + break; + KASSERT(wp > 0, ("wp == 0")); + + /* + * Prevent the object deallocation and hold the pages. + * Held page can be removed from object, but cannot be + * reused. Range lock taken in vn_truncate() prevents + * most typical race. + * + * XXXKIB Busying the pages there would cause deadlock + * with vm_object_page_remove() or self-lock with + * vm_fault(), but would allow to not require the + * pages to be fully valid before uiomove. + * + * The mmap could see zeroed pages that are inserted + * into extended area after we dropped object lock. + * This could be considered an application race. + */ + vm_object_pip_add(obj, 1); + for (i = 0; i < wp; i++) { + vm_page_lock(ma[i]); + vm_page_hold(ma[i]); + vm_page_unlock(ma[i]); + } + VM_OBJECT_UNLOCK(obj); + + /* + * Recalculate i/o size, since vnode_grab_pages() + * might have shortened the page run. Save previous + * resid to correctly mark written pages regions as + * dirty. 
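+		 * If the run did shrink, the file size set by VOP_EXTEND()
+		 * above is rolled back right after this.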
+ */ + sresid = uio->uio_resid; + size1 = MIN(MIN(PAGE_SIZE * wp - off, sresid), size); + + /* + * Shrunk file in case we allocated less pages then + * the estimation that was used to VOP_EXTEND. + */ + KASSERT(vnode_locked, ("lost vnode lock 2")); + if (size1 < size && osize1 < uio->uio_offset + size) { + atomic_add_int(&vmio_rollbacks2, 1); + VATTR_NULL(&vattr); + vattr.va_size = uio->uio_offset + size1; + error = VOP_SETATTR(vp, &vattr, td->td_ucred); + if (error != 0) { + VM_OBJECT_LOCK(obj); + for (i = 0; i < wp; i++) { + vm_page_lock(ma[i]); + vm_page_unhold(ma[i]); + vm_page_deactivate(ma[i]); + vm_page_unlock(ma[i]); + } + vm_object_pip_wakeup(obj); + VM_OBJECT_UNLOCK(obj); + break; + } + } + size = size1; + + VOP_UNLOCK(vp, 0); + vnode_locked = FALSE; + + KASSERT((td->td_pflags & TDP_VMIO) == 0, + ("Recursed vnode_pager_write")); +/* printf("W: vp %p off %jd %jd size %jd\n", + vp, (intmax_t)uio->uio_offset, (intmax_t)off, (intmax_t)size); */ + td->td_pflags |= TDP_VMIO; + error = uiomove_fromphys(ma, off, size, uio); + td->td_pflags &= ~TDP_VMIO; + + freed = FALSE; + VM_OBJECT_LOCK(obj); + for (i = 0; i < wp; i++) { + /* + * Note that the page is marked dirty + * regardeless of the possible error from + * uiomove. We must mark the pages that were + * touched by uiomove before fault + * occured. Since we do not record the + * progress of the uiomove till fault, just + * mark them all. + */ + ma[i]->dirty |= io_page_bits(i, off, sresid - + uio->uio_resid); + vm_page_lock_queues(); + if ((ma[i]->flags & PG_WRITEDIRTY) == 0) { + vm_page_flag_set(ma[i], PG_WRITEDIRTY); + vmio_writedirty++; + } + vm_page_unlock_queues(); + freed1 = FALSE; + if (ma[i]->queue == PQ_HOLD) + freed = freed1 = TRUE; + vm_page_lock(ma[i]); + vm_page_unhold(ma[i]); + if (!freed1) + vm_page_activate(ma[i]); + vm_page_unlock(ma[i]); + } + /* See the comment above about page dirtiness. */ + vm_object_set_writeable_dirty(obj); + + /* + * Try to cluster writes. + */ + written = sresid - uio->uio_resid; + if (obj->un_pager.vnp.wpos + obj->un_pager.vnp.off == + uio->uio_offset - written) { + /* + * Sequential writes detected, make a note and + * try to take immediate advantage of it. + */ + if (!freed && OFF_TO_IDX(uio->uio_offset) > + OFF_TO_IDX(uio->uio_offset - written) && + vn_lock(vp, vn_locked | LK_NOWAIT) == 0) { + vm_pageout_flush(ma, wp, pflags, 0, NULL); + VOP_UNLOCK(vp, 0); + } +/* printf("seq write, wpos %jd off %jd written %d\n", (intmax_t)obj->un_pager.vnp.wpos, (intmax_t)obj->un_pager.vnp.off, written); */ + obj->un_pager.vnp.off += written; + } else { + /* + * Not a sequential write situation, still + * might be good to not split large write in + * the daemons struggling under pressure. + */ + if (!freed && wp >= vm_pageout_page_count && + vn_lock(vp, vn_locked | LK_NOWAIT) == 0) { + vm_pageout_flush(ma, wp, pflags, 0, NULL); + VOP_UNLOCK(vp, 0); + } +/* printf("nonseq write, wpos %jd off %jd wp %d\n", (intmax_t)obj->un_pager.vnp.wpos, (intmax_t)obj->un_pager.vnp.off, wp); */ + obj->un_pager.vnp.wpos = uio->uio_offset; + obj->un_pager.vnp.off = 0; + } + vm_object_pip_wakeup(obj); + VM_OBJECT_UNLOCK(obj); + if (error != 0) + break; + KASSERT(!vnode_locked, ("vnode leak 3")); + + vnode_pager_wwait(); + + /* + * Re-fill reserv while vnode lock is dropped. 
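+		 * The allocation may sleep, which is safe here because
+		 * neither the vnode nor the object lock is held anymore.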
+ */ + if (uio->uio_resid != 0) + vnode_alloc_reserv(&reserv); + } + + if (!vnode_locked) + vn_lock(vp, vn_locked | LK_RETRY); + if (reserv != NULL) + vm_page_free(reserv); + if (vp->v_iflag & VI_DOOMED) { + if (error == 0) + error = EBADF; + return (error); + } + if (error == 0) { + if (((ioflags & IO_SYNC) != 0 && + (vp->v_vflag & VV_NOSYNC) == 0) || vm_page_count_severe()) { + VM_OBJECT_LOCK(obj); + vm_object_page_clean(obj, clean_start, clean_end, + OBJPC_SYNC); + VM_OBJECT_UNLOCK(obj); +#if 0 + /* + * XXXKIB The following call is commented out in + * vm_object_page_clean() in the same way. + */ + error = VOP_FSYNC(vp, MNT_WAIT); +#endif + } + } else { + /* + * Roll back on error if atomic write was requested. + */ + VATTR_NULL(&vattr); + vattr.va_size = (ioflags & IO_UNIT) ? osize : osize1; + VOP_SETATTR(vp, &vattr, td->td_ucred); + if (ioflags & IO_UNIT) { + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } + } + + return (error); +} diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index f497d41..7db9bd9 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -695,6 +695,7 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage) object = vp->v_object; count = bytecount / PAGE_SIZE; +/* printf("vpgg: %p %jd %x %d\n", vp, m[0]->pindex, count, reqpage); */ KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("vnode_pager_generic_getpages does not support devices")); @@ -1087,6 +1088,7 @@ vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount, object = vp->v_object; count = bytecount / PAGE_SIZE; +/* printf("vpgp: %p %jd %x %d\n", vp, m[0]->pindex, m[0]->dirty, count); */ for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_AGAIN; diff --git a/tools/regression/file/uio/uio.c b/tools/regression/file/uio/uio.c new file mode 100644 index 0000000..d857605 --- /dev/null +++ b/tools/regression/file/uio/uio.c @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int chunk_cnt = 1024; +int chunk_size = 1024; + +int +main(int argc, char *argv[]) +{ + struct iovec *wiov, *riov; + char **wdata, **rdata; + int fd, i; + ssize_t io_error; + + if (argc < 2) { + fprintf(stderr, "Usage: uio file [chunk count [chunk size]]\n"); + return (2); + } + fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (fd == -1) { + fprintf(stderr, "Failed to create %s: %s\n", + argv[1], strerror(errno)); + return (1); + } + + if (argc > 2) + chunk_cnt = atoi(argv[2]); + if (argc > 3) + chunk_size = atoi(argv[3]); + + wiov = calloc(chunk_cnt, sizeof(*wiov)); + wdata = calloc(chunk_cnt, sizeof(*wdata)); + + riov = calloc(chunk_cnt, sizeof(*riov)); + rdata = calloc(chunk_cnt, sizeof(*rdata)); + + for (i = 0; i < chunk_cnt; i++) { + rdata[i] = malloc(chunk_size); + riov[i].iov_base = rdata[i]; + riov[i].iov_len = chunk_size; + + wdata[i] = malloc(chunk_size); + memset(wdata[i], i, chunk_size); + wiov[i].iov_base = wdata[i]; + wiov[i].iov_len = chunk_size; + } + + io_error = writev(fd, wiov, chunk_cnt); + if (io_error == -1) { + fprintf(stderr, "write failed: %s\n", strerror(errno)); + return (1); + } else if (io_error != chunk_cnt * chunk_size) { + fprintf(stderr, "truncated write: %d %d\n", + io_error, chunk_cnt * chunk_size); + return (1); + } + + if (lseek(fd, 0, SEEK_SET) == -1) { + fprintf(stderr, "lseek failed: %s\n", strerror(errno)); + return (1); + } + + io_error = readv(fd, riov, chunk_cnt); + if (io_error == -1) { + fprintf(stderr, "read failed: %s\n", strerror(errno)); + return (1); + } else if (io_error != chunk_cnt * chunk_size) { + fprintf(stderr, "truncated read: %d %d\n", + io_error, chunk_cnt * chunk_size); + return (1); + } + + for (i = 0; i < chunk_cnt; i++) { + if (memcmp(rdata[i], wdata[i], chunk_size) != 0) { + fprintf(stderr, "chunk %d differs\n", i); + return (1); + } + } + + return (0); +} diff --git a/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c new file mode 100644 index 0000000..1b0acbe --- /dev/null +++ b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const int blks = 2; + +static void +flush_buffers(int fd) +{ + struct stat st; + char *addr; + int error; + + printf("Flushing buffers\n"); + error = fstat(fd, &st); + if (error == -1) + err(2, "stat"); + fsync(fd); + addr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (addr == (char *)-1) + err(2, "mmap"); + error = msync(addr, st.st_size, MS_SYNC | MS_INVALIDATE); + if (error == -1) + err(2, "msync"); + munmap(addr, st.st_size); +} + +int +main(int argc, char *argv[]) +{ + struct statfs fst; + char *data, *vrfy; + size_t sz; + int fd, i, error, ret; + + if (argc < 2) + errx(2, "Usage: ba_clrbuf file"); + + fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (fd == -1) + err(2, "Failed to create %s", argv[1]); + + if (fstatfs(fd, &fst) == -1) + err(2, "stat"); + + sz = fst.f_iosize * blks; + data = malloc(sz); + if (data == NULL) + err(2, "malloc"); + vrfy = malloc(sz); + if (vrfy == NULL) + err(2, "malloc"); + for (i = 0; i < (int)sz; i++) + data[i] = i; + error = write(fd, data, sz); + if (error == -1) + err(2, "write"); + else if (error != (int)sz) + errx(2, "Short write %d %d", error, sz); + + flush_buffers(fd); + + error = lseek(fd, 0, SEEK_SET); + if (error == -1) + err(2, "lseek 0"); + else if (error != 0) + errx(2, "lseek 0 returned %d", error); + error = write(fd, NULL, fst.f_iosize); + printf("faulty write, error %s\n", strerror(errno)); + + error = lseek(fd, 0, SEEK_SET); + if (error == -1) + err(2, "lseek 0/2"); + else if (error != 0) + errx(2, "lseek 0/2 returned %d", error); + error = read(fd, vrfy, sz); + if (error == -1) + err(2, "read"); + else if (error != (int)sz) + errx(2, "short read %d %d", error, sz); + + if (memcmp(data, vrfy, fst.f_iosize) != 0) { + printf("Zero block corrupted, byte at 0 is %x\n", + (unsigned char)vrfy[0]); + ret = 1; + } else { + printf("No corruption\n"); + ret = 0; + } + + return (ret); +} diff --git a/tools/tools/ufs/fragc/fragc.c b/tools/tools/ufs/fragc/fragc.c new file mode 100644 index 0000000..80ec3ff --- /dev/null +++ b/tools/tools/ufs/fragc/fragc.c @@ -0,0 +1,215 @@ +/* $Id: fragc.c,v 1.9 2010/02/07 14:32:22 kostik Exp kostik $ */ + +/* /usr/local/opt/gcc-4.4.3/bin/gcc -g -Wall -Wextra -O -o fragc fragc.c -lufs */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const int blocksz = 512; + +static int verbose; + +static void +usage(void) +{ + + fprintf(stderr, "Usage: fragc [-v] devname\n"); +} + +static ufs2_daddr_t blks_total; +static ufs2_daddr_t blks_breaks; + +static void +block_pair(struct fs *fs, ufs2_daddr_t *prev, ufs2_daddr_t curr) +{ + + blks_total++; + if (curr != 0) { + if (*prev != 0 && + (*prev) + fs->fs_bsize / fs->fs_fsize != curr) { + blks_breaks++; + if (verbose) + putchar('|'); + } + if (verbose) + printf(" %jd", (intmax_t)curr); + } + 
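+	/* Remember this block so the next call can check contiguity. */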
*prev = curr; +} + +static void +count_indir(struct uufsd *u, struct fs *fs, int level, int maxlevel, + ufs2_daddr_t ib, ufs2_daddr_t *prev) +{ + ufs2_daddr_t *b; + unsigned i; + + if (ib == 0) + return; + b = malloc(fs->fs_bsize); + if (bread(u, ib * fs->fs_fsize / blocksz, b, fs->fs_bsize) == -1) { + printf("\nRead block %jd: %s\n", (intmax_t)ib, u->d_error); + goto out; + } + for (i = 0; i < fs->fs_bsize / sizeof(ufs2_daddr_t); i++) { + if (level == maxlevel) + block_pair(fs, prev, b[i]); + else + count_indir(u, fs, level + 1, maxlevel, b[i], prev); + } + out: + free(b); +} + +static void +count_ino_ufs1(struct uufsd *u, struct fs *fs, struct ufs1_dinode *dp) +{ + ufs2_daddr_t prev; + unsigned i; + + if (dp->di_size == 0) + return; + if ((dp->di_mode & IFMT) == IFLNK && dp->di_size < + (u_int64_t)fs->fs_maxsymlinklen) + return; + + prev = 0; + for (i = 0; i < NDADDR; i++) + block_pair(fs, &prev, dp->di_db[i]); + for (i = 0; i < NIADDR; i++) { + if (0 && verbose) + printf(" [%d]", dp->di_ib[i]); + count_indir(u, fs, 0, i, dp->di_ib[i], &prev); + } +} + +static void +count_ino_ufs2(struct uufsd *u, struct fs *fs, struct ufs2_dinode *dp) +{ + ufs2_daddr_t prev; + unsigned i; + + if (dp->di_size == 0) + return; + if ((dp->di_mode & IFMT) == IFLNK && dp->di_size < + (u_int64_t)fs->fs_maxsymlinklen) + return; + + prev = 0; + for (i = 0; i < NDADDR; i++) + block_pair(fs, &prev, dp->di_db[i]); + for (i = 0; i < NIADDR; i++) { + if (0 && verbose) + printf(" [%jd]", (intmax_t)(dp->di_ib[i])); + count_indir(u, fs, 0, i, dp->di_ib[i], &prev); + } +} + +static void +frag_calc(struct uufsd *u) +{ + struct fs *fs; + struct cg *cg; + void *dino; + int32_t cgno; + uint32_t ino, inoused, cgino, next_cg_ino; + int mode; + u_int8_t *cp; + + fs = &u->d_fs; + if (verbose) + printf("%s UFS%d\n", u->d_name, u->d_ufs); + ino = 0; + for (cgno = 0; cgread(u); cgno++) { + cg = &u->d_cg; + if (u->d_ufs == 1) + inoused = fs->fs_ipg; + else + inoused = cg->cg_initediblk; + if (verbose) + printf("cg %d inodes %u\n", cgno, inoused); + cp = cg_inosused(cg); + next_cg_ino = ino + fs->fs_ipg; + for (cgino = 0; cgino < inoused; cgino++, ino++) { + if ((cp[cgino / CHAR_BIT] & (1 << (cgino % CHAR_BIT))) + != 0 && ino != 0 && ino != 1) { + if (verbose) + printf(" ino %u:", ino); + if (getino(u, &dino, ino, &mode) == -1) { + printf("\nReading ino %u: %s\n", + ino, u->d_error); + return; + } + if (mode == 0) { + printf( +"\nIno %u/%u is allocated in bitmap, but mode is 0\n", + ino, ino % fs->fs_ipg); + continue; + } + if (mode != IFDIR && mode != IFREG && + mode != IFLNK) + continue; + + if (u->d_ufs == 1) + count_ino_ufs1(u, fs, dino); + else + count_ino_ufs2(u, fs, dino); + if (verbose) + putchar('\n'); + } + } + ino = next_cg_ino; + } +} + +int +main(int argc, char *argv[]) +{ + struct uufsd ufsd; + int c; + + verbose = 0; + while ((c = getopt(argc, argv, "hv")) != -1) { + switch (c) { + case 'h': + usage(); + return (0); + case 'v': + verbose = 1; + break; + default: + usage(); + return (2); + } + } + if (optind + 1 != argc) { + usage(); + return (2); + } + + if (ufs_disk_fillout(&ufsd, argv[optind]) == -1) { + fprintf(stderr, "Fillout: %s\n", ufsd.d_error); + return (1); + } + + frag_calc(&ufsd); + + if (ufs_disk_close(&ufsd) == -1) { + fprintf(stderr, "Disk close: %s\n", ufsd.d_error); + return (1); + } + + printf("Total %jd data blocks, %jd breaks, %02.2f%% fragmentation.\n", + (intmax_t)blks_total, (intmax_t)blks_breaks, + (double)blks_breaks * 100.0 / blks_total); + + return (0); +}